/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse4_1

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse4_1_single.c
Location:	line 1033, column 5
Description:	Value stored to 'j_coord_offsetC' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H1
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
54	* Electrostatics interaction: Ewald
55	* VdW interaction: LennardJones
56	* Geometry: Water4-Particle
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int iinr,jindex,jjnr,shiftidx,*gid;
80	real rcutoff_scalar;
81	real shiftvec,fshift,x,f;
82	real fjptrA,fjptrB,fjptrC,fjptrD;
83	real scratch[4*DIM3];
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwioffset1;
88	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89	int vdwioffset2;
90	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91	int vdwioffset3;
92	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
100	real *charge;
101	int nvdwtype;
102	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103	int *vdwtype;
104	real *vdwparam;
105	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
106	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
107	__m128i ewitab;
108	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
109	real *ewtab;
110	__m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
111	real rswitch_scalar,d_scalar;
112	__m128 dummy_mask,cutoff_mask;
113	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114	__m128 one = _mm_set1_ps(1.0);
115	__m128 two = _mm_set1_ps(2.0);
116	x = xx[0];
117	f = ff[0];
118
119	nri = nlist->nri;
120	iinr = nlist->iinr;
121	jindex = nlist->jindex;
122	jjnr = nlist->jjnr;
123	shiftidx = nlist->shift;
124	gid = nlist->gid;
125	shiftvec = fr->shift_vec[0];
126	fshift = fr->fshift[0];
127	facel = _mm_set1_ps(fr->epsfac);
128	charge = mdatoms->chargeA;
129	nvdwtype = fr->ntype;
130	vdwparam = fr->nbfp;
131	vdwtype = mdatoms->typeA;
132
133	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
134	ewtab = fr->ic->tabq_coul_FDV0;
135	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
136	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
137
138	/* Setup water-specific parameters */
139	inr = nlist->iinr[0];
140	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
141	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
142	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
143	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
144
145	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
146	rcutoff_scalar = fr->rcoulomb;
147	rcutoff = _mm_set1_ps(rcutoff_scalar);
148	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
149
150	rswitch_scalar = fr->rcoulomb_switch;
151	rswitch = _mm_set1_ps(rswitch_scalar);
152	/* Setup switch parameters */
153	d_scalar = rcutoff_scalar-rswitch_scalar;
154	d = _mm_set1_ps(d_scalar);
155	swV3 = _mm_set1_ps(-10.0/(d_scalard_scalard_scalar));
156	swV4 = _mm_set1_ps( 15.0/(d_scalard_scalard_scalar*d_scalar));
157	swV5 = _mm_set1_ps( -6.0/(d_scalard_scalard_scalard_scalard_scalar));
158	swF2 = _mm_set1_ps(-30.0/(d_scalard_scalard_scalar));
159	swF3 = _mm_set1_ps( 60.0/(d_scalard_scalard_scalar*d_scalar));
160	swF4 = _mm_set1_ps(-30.0/(d_scalard_scalard_scalard_scalard_scalar));
161
162	/* Avoid stupid compiler warnings */
163	jnrA = jnrB = jnrC = jnrD = 0;
164	j_coord_offsetA = 0;
165	j_coord_offsetB = 0;
166	j_coord_offsetC = 0;
167	j_coord_offsetD = 0;
168
169	outeriter = 0;
170	inneriter = 0;
171
172	for(iidx=0;iidx<4*DIM3;iidx++)
173	{
174	scratch[iidx] = 0.0;
175	}
176
177	/* Start outer loop over neighborlists */
178	for(iidx=0; iidx<nri; iidx++)
179	{
180	/* Load shift vector for this list */
181	i_shift_offset = DIM3*shiftidx[iidx];
182
183	/* Load limits for loop over neighbors */
184	j_index_start = jindex[iidx];
185	j_index_end = jindex[iidx+1];
186
187	/* Get outer coordinate index */
188	inr = iinr[iidx];
189	i_coord_offset = DIM3*inr;
190
191	/* Load i particle coords and add shift vector */
192	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
194
195	fix0 = _mm_setzero_ps();
196	fiy0 = _mm_setzero_ps();
197	fiz0 = _mm_setzero_ps();
198	fix1 = _mm_setzero_ps();
199	fiy1 = _mm_setzero_ps();
200	fiz1 = _mm_setzero_ps();
201	fix2 = _mm_setzero_ps();
202	fiy2 = _mm_setzero_ps();
203	fiz2 = _mm_setzero_ps();
204	fix3 = _mm_setzero_ps();
205	fiy3 = _mm_setzero_ps();
206	fiz3 = _mm_setzero_ps();
207
208	/* Reset potential sums */
209	velecsum = _mm_setzero_ps();
210	vvdwsum = _mm_setzero_ps();
211
212	/* Start inner kernel loop */
213	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214	{
215
216	/* Get j neighbor index, and coordinate index */
217	jnrA = jjnr[jidx];
218	jnrB = jjnr[jidx+1];
219	jnrC = jjnr[jidx+2];
220	jnrD = jjnr[jidx+3];
221	j_coord_offsetA = DIM3*jnrA;
222	j_coord_offsetB = DIM3*jnrB;
223	j_coord_offsetC = DIM3*jnrC;
224	j_coord_offsetD = DIM3*jnrD;
225
226	/* load j atom coordinates */
227	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228	x+j_coord_offsetC,x+j_coord_offsetD,
229	&jx0,&jy0,&jz0);
230
231	/* Calculate displacement vector */
232	dx00 = _mm_sub_ps(ix0,jx0);
233	dy00 = _mm_sub_ps(iy0,jy0);
234	dz00 = _mm_sub_ps(iz0,jz0);
235	dx10 = _mm_sub_ps(ix1,jx0);
236	dy10 = _mm_sub_ps(iy1,jy0);
237	dz10 = _mm_sub_ps(iz1,jz0);
238	dx20 = _mm_sub_ps(ix2,jx0);
239	dy20 = _mm_sub_ps(iy2,jy0);
240	dz20 = _mm_sub_ps(iz2,jz0);
241	dx30 = _mm_sub_ps(ix3,jx0);
242	dy30 = _mm_sub_ps(iy3,jy0);
243	dz30 = _mm_sub_ps(iz3,jz0);
244
245	/* Calculate squared distance and things based on it */
246	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
247	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
248	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
249	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
250
251	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
252	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
253	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
254	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
255
256	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
257	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
258	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
259	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
260
261	/* Load parameters for j particles */
262	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
263	charge+jnrC+0,charge+jnrD+0);
264	vdwjidx0A = 2*vdwtype[jnrA+0];
265	vdwjidx0B = 2*vdwtype[jnrB+0];
266	vdwjidx0C = 2*vdwtype[jnrC+0];
267	vdwjidx0D = 2*vdwtype[jnrD+0];
268
269	fjx0 = _mm_setzero_ps();
270	fjy0 = _mm_setzero_ps();
271	fjz0 = _mm_setzero_ps();
272
273	/**************************
274	* CALCULATE INTERACTIONS *
275	**************************/
276
277	if (gmx_mm_any_lt(rsq00,rcutoff2))
278	{
279
280	r00 = _mm_mul_ps(rsq00,rinv00);
281
282	/* Compute parameters for interactions between i and j atoms */
283	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
284	vdwparam+vdwioffset0+vdwjidx0B,
285	vdwparam+vdwioffset0+vdwjidx0C,
286	vdwparam+vdwioffset0+vdwjidx0D,
287	&c6_00,&c12_00);
288
289	/* LENNARD-JONES DISPERSION/REPULSION */
290
291	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
292	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
293	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
294	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
295	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
296
297	d = _mm_sub_ps(r00,rswitch);
298	d = _mm_max_ps(d,_mm_setzero_ps());
299	d2 = _mm_mul_ps(d,d);
300	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
301
302	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
303
304	/* Evaluate switch function */
305	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
306	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
307	vvdw = _mm_mul_ps(vvdw,sw);
308	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
309
310	/* Update potential sum for this i atom from the interaction with this j atom. */
311	vvdw = _mm_and_ps(vvdw,cutoff_mask);
312	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
313
314	fscal = fvdw;
315
316	fscal = _mm_and_ps(fscal,cutoff_mask);
317
318	/* Calculate temporary vectorial force */
319	tx = _mm_mul_ps(fscal,dx00);
320	ty = _mm_mul_ps(fscal,dy00);
321	tz = _mm_mul_ps(fscal,dz00);
322
323	/* Update vectorial force */
324	fix0 = _mm_add_ps(fix0,tx);
325	fiy0 = _mm_add_ps(fiy0,ty);
326	fiz0 = _mm_add_ps(fiz0,tz);
327
328	fjx0 = _mm_add_ps(fjx0,tx);
329	fjy0 = _mm_add_ps(fjy0,ty);
330	fjz0 = _mm_add_ps(fjz0,tz);
331
332	}
333
334	/**************************
335	* CALCULATE INTERACTIONS *
336	**************************/
337
338	if (gmx_mm_any_lt(rsq10,rcutoff2))
339	{
340
341	r10 = _mm_mul_ps(rsq10,rinv10);
342
343	/* Compute parameters for interactions between i and j atoms */
344	qq10 = _mm_mul_ps(iq1,jq0);
345
346	/* EWALD ELECTROSTATICS */
347
348	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
349	ewrt = _mm_mul_ps(r10,ewtabscale);
350	ewitab = _mm_cvttps_epi32(ewrt);
351	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
352	ewitab = _mm_slli_epi32(ewitab,2);
353	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
354	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
355	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
356	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
357	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
358	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
359	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
360	velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
361	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
362
363	d = _mm_sub_ps(r10,rswitch);
364	d = _mm_max_ps(d,_mm_setzero_ps());
365	d2 = _mm_mul_ps(d,d);
366	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
367
368	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
369
370	/* Evaluate switch function */
371	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
372	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
373	velec = _mm_mul_ps(velec,sw);
374	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
375
376	/* Update potential sum for this i atom from the interaction with this j atom. */
377	velec = _mm_and_ps(velec,cutoff_mask);
378	velecsum = _mm_add_ps(velecsum,velec);
379
380	fscal = felec;
381
382	fscal = _mm_and_ps(fscal,cutoff_mask);
383
384	/* Calculate temporary vectorial force */
385	tx = _mm_mul_ps(fscal,dx10);
386	ty = _mm_mul_ps(fscal,dy10);
387	tz = _mm_mul_ps(fscal,dz10);
388
389	/* Update vectorial force */
390	fix1 = _mm_add_ps(fix1,tx);
391	fiy1 = _mm_add_ps(fiy1,ty);
392	fiz1 = _mm_add_ps(fiz1,tz);
393
394	fjx0 = _mm_add_ps(fjx0,tx);
395	fjy0 = _mm_add_ps(fjy0,ty);
396	fjz0 = _mm_add_ps(fjz0,tz);
397
398	}
399
400	/**************************
401	* CALCULATE INTERACTIONS *
402	**************************/
403
404	if (gmx_mm_any_lt(rsq20,rcutoff2))
405	{
406
407	r20 = _mm_mul_ps(rsq20,rinv20);
408
409	/* Compute parameters for interactions between i and j atoms */
410	qq20 = _mm_mul_ps(iq2,jq0);
411
412	/* EWALD ELECTROSTATICS */
413
414	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
415	ewrt = _mm_mul_ps(r20,ewtabscale);
416	ewitab = _mm_cvttps_epi32(ewrt);
417	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
418	ewitab = _mm_slli_epi32(ewitab,2);
419	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
420	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
421	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
422	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
423	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
424	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
425	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
426	velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
427	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
428
429	d = _mm_sub_ps(r20,rswitch);
430	d = _mm_max_ps(d,_mm_setzero_ps());
431	d2 = _mm_mul_ps(d,d);
432	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
433
434	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
435
436	/* Evaluate switch function */
437	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
438	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
439	velec = _mm_mul_ps(velec,sw);
440	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
441
442	/* Update potential sum for this i atom from the interaction with this j atom. */
443	velec = _mm_and_ps(velec,cutoff_mask);
444	velecsum = _mm_add_ps(velecsum,velec);
445
446	fscal = felec;
447
448	fscal = _mm_and_ps(fscal,cutoff_mask);
449
450	/* Calculate temporary vectorial force */
451	tx = _mm_mul_ps(fscal,dx20);
452	ty = _mm_mul_ps(fscal,dy20);
453	tz = _mm_mul_ps(fscal,dz20);
454
455	/* Update vectorial force */
456	fix2 = _mm_add_ps(fix2,tx);
457	fiy2 = _mm_add_ps(fiy2,ty);
458	fiz2 = _mm_add_ps(fiz2,tz);
459
460	fjx0 = _mm_add_ps(fjx0,tx);
461	fjy0 = _mm_add_ps(fjy0,ty);
462	fjz0 = _mm_add_ps(fjz0,tz);
463
464	}
465
466	/**************************
467	* CALCULATE INTERACTIONS *
468	**************************/
469
470	if (gmx_mm_any_lt(rsq30,rcutoff2))
471	{
472
473	r30 = _mm_mul_ps(rsq30,rinv30);
474
475	/* Compute parameters for interactions between i and j atoms */
476	qq30 = _mm_mul_ps(iq3,jq0);
477
478	/* EWALD ELECTROSTATICS */
479
480	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
481	ewrt = _mm_mul_ps(r30,ewtabscale);
482	ewitab = _mm_cvttps_epi32(ewrt);
483	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
484	ewitab = _mm_slli_epi32(ewitab,2);
485	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
486	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
487	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
488	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
489	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
490	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
491	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
492	velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
493	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
494
495	d = _mm_sub_ps(r30,rswitch);
496	d = _mm_max_ps(d,_mm_setzero_ps());
497	d2 = _mm_mul_ps(d,d);
498	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
499
500	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
501
502	/* Evaluate switch function */
503	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
504	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
505	velec = _mm_mul_ps(velec,sw);
506	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
507
508	/* Update potential sum for this i atom from the interaction with this j atom. */
509	velec = _mm_and_ps(velec,cutoff_mask);
510	velecsum = _mm_add_ps(velecsum,velec);
511
512	fscal = felec;
513
514	fscal = _mm_and_ps(fscal,cutoff_mask);
515
516	/* Calculate temporary vectorial force */
517	tx = _mm_mul_ps(fscal,dx30);
518	ty = _mm_mul_ps(fscal,dy30);
519	tz = _mm_mul_ps(fscal,dz30);
520
521	/* Update vectorial force */
522	fix3 = _mm_add_ps(fix3,tx);
523	fiy3 = _mm_add_ps(fiy3,ty);
524	fiz3 = _mm_add_ps(fiz3,tz);
525
526	fjx0 = _mm_add_ps(fjx0,tx);
527	fjy0 = _mm_add_ps(fjy0,ty);
528	fjz0 = _mm_add_ps(fjz0,tz);
529
530	}
531
532	fjptrA = f+j_coord_offsetA;
533	fjptrB = f+j_coord_offsetB;
534	fjptrC = f+j_coord_offsetC;
535	fjptrD = f+j_coord_offsetD;
536
537	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
538
539	/* Inner loop uses 254 flops */
540	}
541
542	if(jidx<j_index_end)
543	{
544
545	/* Get j neighbor index, and coordinate index */
546	jnrlistA = jjnr[jidx];
547	jnrlistB = jjnr[jidx+1];
548	jnrlistC = jjnr[jidx+2];
549	jnrlistD = jjnr[jidx+3];
550	/* Sign of each element will be negative for non-real atoms.
551	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
552	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
553	*/
554	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
555	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
556	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
557	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
558	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
559	j_coord_offsetA = DIM3*jnrA;
560	j_coord_offsetB = DIM3*jnrB;
561	j_coord_offsetC = DIM3*jnrC;
562	j_coord_offsetD = DIM3*jnrD;
563
564	/* load j atom coordinates */
565	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
566	x+j_coord_offsetC,x+j_coord_offsetD,
567	&jx0,&jy0,&jz0);
568
569	/* Calculate displacement vector */
570	dx00 = _mm_sub_ps(ix0,jx0);
571	dy00 = _mm_sub_ps(iy0,jy0);
572	dz00 = _mm_sub_ps(iz0,jz0);
573	dx10 = _mm_sub_ps(ix1,jx0);
574	dy10 = _mm_sub_ps(iy1,jy0);
575	dz10 = _mm_sub_ps(iz1,jz0);
576	dx20 = _mm_sub_ps(ix2,jx0);
577	dy20 = _mm_sub_ps(iy2,jy0);
578	dz20 = _mm_sub_ps(iz2,jz0);
579	dx30 = _mm_sub_ps(ix3,jx0);
580	dy30 = _mm_sub_ps(iy3,jy0);
581	dz30 = _mm_sub_ps(iz3,jz0);
582
583	/* Calculate squared distance and things based on it */
584	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
585	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
586	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
587	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
588
589	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
590	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
591	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
592	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
593
594	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
595	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
596	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
597	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
598
599	/* Load parameters for j particles */
600	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
601	charge+jnrC+0,charge+jnrD+0);
602	vdwjidx0A = 2*vdwtype[jnrA+0];
603	vdwjidx0B = 2*vdwtype[jnrB+0];
604	vdwjidx0C = 2*vdwtype[jnrC+0];
605	vdwjidx0D = 2*vdwtype[jnrD+0];
606
607	fjx0 = _mm_setzero_ps();
608	fjy0 = _mm_setzero_ps();
609	fjz0 = _mm_setzero_ps();
610
611	/**************************
612	* CALCULATE INTERACTIONS *
613	**************************/
614
615	if (gmx_mm_any_lt(rsq00,rcutoff2))
616	{
617
618	r00 = _mm_mul_ps(rsq00,rinv00);
619	r00 = _mm_andnot_ps(dummy_mask,r00);
620
621	/* Compute parameters for interactions between i and j atoms */
622	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
623	vdwparam+vdwioffset0+vdwjidx0B,
624	vdwparam+vdwioffset0+vdwjidx0C,
625	vdwparam+vdwioffset0+vdwjidx0D,
626	&c6_00,&c12_00);
627
628	/* LENNARD-JONES DISPERSION/REPULSION */
629
630	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
631	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
632	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
633	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
634	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
635
636	d = _mm_sub_ps(r00,rswitch);
637	d = _mm_max_ps(d,_mm_setzero_ps());
638	d2 = _mm_mul_ps(d,d);
639	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
640
641	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
642
643	/* Evaluate switch function */
644	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
645	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
646	vvdw = _mm_mul_ps(vvdw,sw);
647	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
648
649	/* Update potential sum for this i atom from the interaction with this j atom. */
650	vvdw = _mm_and_ps(vvdw,cutoff_mask);
651	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
652	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
653
654	fscal = fvdw;
655
656	fscal = _mm_and_ps(fscal,cutoff_mask);
657
658	fscal = _mm_andnot_ps(dummy_mask,fscal);
659
660	/* Calculate temporary vectorial force */
661	tx = _mm_mul_ps(fscal,dx00);
662	ty = _mm_mul_ps(fscal,dy00);
663	tz = _mm_mul_ps(fscal,dz00);
664
665	/* Update vectorial force */
666	fix0 = _mm_add_ps(fix0,tx);
667	fiy0 = _mm_add_ps(fiy0,ty);
668	fiz0 = _mm_add_ps(fiz0,tz);
669
670	fjx0 = _mm_add_ps(fjx0,tx);
671	fjy0 = _mm_add_ps(fjy0,ty);
672	fjz0 = _mm_add_ps(fjz0,tz);
673
674	}
675
676	/**************************
677	* CALCULATE INTERACTIONS *
678	**************************/
679
680	if (gmx_mm_any_lt(rsq10,rcutoff2))
681	{
682
683	r10 = _mm_mul_ps(rsq10,rinv10);
684	r10 = _mm_andnot_ps(dummy_mask,r10);
685
686	/* Compute parameters for interactions between i and j atoms */
687	qq10 = _mm_mul_ps(iq1,jq0);
688
689	/* EWALD ELECTROSTATICS */
690
691	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
692	ewrt = _mm_mul_ps(r10,ewtabscale);
693	ewitab = _mm_cvttps_epi32(ewrt);
694	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
695	ewitab = _mm_slli_epi32(ewitab,2);
696	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
697	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
698	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
699	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
700	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
701	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
702	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
703	velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
704	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
705
706	d = _mm_sub_ps(r10,rswitch);
707	d = _mm_max_ps(d,_mm_setzero_ps());
708	d2 = _mm_mul_ps(d,d);
709	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
710
711	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
712
713	/* Evaluate switch function */
714	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
715	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
716	velec = _mm_mul_ps(velec,sw);
717	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
718
719	/* Update potential sum for this i atom from the interaction with this j atom. */
720	velec = _mm_and_ps(velec,cutoff_mask);
721	velec = _mm_andnot_ps(dummy_mask,velec);
722	velecsum = _mm_add_ps(velecsum,velec);
723
724	fscal = felec;
725
726	fscal = _mm_and_ps(fscal,cutoff_mask);
727
728	fscal = _mm_andnot_ps(dummy_mask,fscal);
729
730	/* Calculate temporary vectorial force */
731	tx = _mm_mul_ps(fscal,dx10);
732	ty = _mm_mul_ps(fscal,dy10);
733	tz = _mm_mul_ps(fscal,dz10);
734
735	/* Update vectorial force */
736	fix1 = _mm_add_ps(fix1,tx);
737	fiy1 = _mm_add_ps(fiy1,ty);
738	fiz1 = _mm_add_ps(fiz1,tz);
739
740	fjx0 = _mm_add_ps(fjx0,tx);
741	fjy0 = _mm_add_ps(fjy0,ty);
742	fjz0 = _mm_add_ps(fjz0,tz);
743
744	}
745
746	/**************************
747	* CALCULATE INTERACTIONS *
748	**************************/
749
750	if (gmx_mm_any_lt(rsq20,rcutoff2))
751	{
752
753	r20 = _mm_mul_ps(rsq20,rinv20);
754	r20 = _mm_andnot_ps(dummy_mask,r20);
755
756	/* Compute parameters for interactions between i and j atoms */
757	qq20 = _mm_mul_ps(iq2,jq0);
758
759	/* EWALD ELECTROSTATICS */
760
761	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
762	ewrt = _mm_mul_ps(r20,ewtabscale);
763	ewitab = _mm_cvttps_epi32(ewrt);
764	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
765	ewitab = _mm_slli_epi32(ewitab,2);
766	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
767	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
768	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
769	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
770	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
771	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
772	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
773	velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
774	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
775
776	d = _mm_sub_ps(r20,rswitch);
777	d = _mm_max_ps(d,_mm_setzero_ps());
778	d2 = _mm_mul_ps(d,d);
779	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
780
781	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
782
783	/* Evaluate switch function */
784	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
785	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
786	velec = _mm_mul_ps(velec,sw);
787	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
788
789	/* Update potential sum for this i atom from the interaction with this j atom. */
790	velec = _mm_and_ps(velec,cutoff_mask);
791	velec = _mm_andnot_ps(dummy_mask,velec);
792	velecsum = _mm_add_ps(velecsum,velec);
793
794	fscal = felec;
795
796	fscal = _mm_and_ps(fscal,cutoff_mask);
797
798	fscal = _mm_andnot_ps(dummy_mask,fscal);
799
800	/* Calculate temporary vectorial force */
801	tx = _mm_mul_ps(fscal,dx20);
802	ty = _mm_mul_ps(fscal,dy20);
803	tz = _mm_mul_ps(fscal,dz20);
804
805	/* Update vectorial force */
806	fix2 = _mm_add_ps(fix2,tx);
807	fiy2 = _mm_add_ps(fiy2,ty);
808	fiz2 = _mm_add_ps(fiz2,tz);
809
810	fjx0 = _mm_add_ps(fjx0,tx);
811	fjy0 = _mm_add_ps(fjy0,ty);
812	fjz0 = _mm_add_ps(fjz0,tz);
813
814	}
815
816	/**************************
817	* CALCULATE INTERACTIONS *
818	**************************/
819
820	if (gmx_mm_any_lt(rsq30,rcutoff2))
821	{
822
823	r30 = _mm_mul_ps(rsq30,rinv30);
824	r30 = _mm_andnot_ps(dummy_mask,r30);
825
826	/* Compute parameters for interactions between i and j atoms */
827	qq30 = _mm_mul_ps(iq3,jq0);
828
829	/* EWALD ELECTROSTATICS */
830
831	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
832	ewrt = _mm_mul_ps(r30,ewtabscale);
833	ewitab = _mm_cvttps_epi32(ewrt);
834	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
835	ewitab = _mm_slli_epi32(ewitab,2);
836	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
837	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
838	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
839	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
840	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
841	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
842	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
843	velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
844	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
845
846	d = _mm_sub_ps(r30,rswitch);
847	d = _mm_max_ps(d,_mm_setzero_ps());
848	d2 = _mm_mul_ps(d,d);
849	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
850
851	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
852
853	/* Evaluate switch function */
854	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
855	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
856	velec = _mm_mul_ps(velec,sw);
857	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
858
859	/* Update potential sum for this i atom from the interaction with this j atom. */
860	velec = _mm_and_ps(velec,cutoff_mask);
861	velec = _mm_andnot_ps(dummy_mask,velec);
862	velecsum = _mm_add_ps(velecsum,velec);
863
864	fscal = felec;
865
866	fscal = _mm_and_ps(fscal,cutoff_mask);
867
868	fscal = _mm_andnot_ps(dummy_mask,fscal);
869
870	/* Calculate temporary vectorial force */
871	tx = _mm_mul_ps(fscal,dx30);
872	ty = _mm_mul_ps(fscal,dy30);
873	tz = _mm_mul_ps(fscal,dz30);
874
875	/* Update vectorial force */
876	fix3 = _mm_add_ps(fix3,tx);
877	fiy3 = _mm_add_ps(fiy3,ty);
878	fiz3 = _mm_add_ps(fiz3,tz);
879
880	fjx0 = _mm_add_ps(fjx0,tx);
881	fjy0 = _mm_add_ps(fjy0,ty);
882	fjz0 = _mm_add_ps(fjz0,tz);
883
884	}
885
886	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
887	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
888	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
889	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
890
891	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
892
893	/* Inner loop uses 258 flops */
894	}
895
896	/* End of innermost loop */
897
898	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
899	f+i_coord_offset,fshift+i_shift_offset);
900
901	ggid = gid[iidx];
902	/* Update potential energies */
903	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
904	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
905
906	/* Increment number of inner iterations */
907	inneriter += j_index_end - j_index_start;
908
909	/* Outer loop uses 26 flops */
910	}
911
912	/* Increment number of outer iterations */
913	outeriter += nri;
914
915	/* Update outer/inner flops */
916
917	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter26 + inneriter258)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4_VF] += outeriter26 + inneriter 258;
918	}
919	/*
920	* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
921	* Electrostatics interaction: Ewald
922	* VdW interaction: LennardJones
923	* Geometry: Water4-Particle
924	* Calculate force/pot: Force
925	*/
926	void
927	nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
928	(t_nblist * gmx_restrict nlist,
929	rvec * gmx_restrict xx,
930	rvec * gmx_restrict ff,
931	t_forcerec * gmx_restrict fr,
932	t_mdatoms * gmx_restrict mdatoms,
933	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
934	t_nrnb * gmx_restrict nrnb)
935	{
936	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
937	* just 0 for non-waters.
938	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
939	* jnr indices corresponding to data put in the four positions in the SIMD register.
940	*/
941	int i_shift_offset,i_coord_offset,outeriter,inneriter;
942	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
943	int jnrA,jnrB,jnrC,jnrD;
944	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
945	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
946	int iinr,jindex,jjnr,shiftidx,*gid;
947	real rcutoff_scalar;
948	real shiftvec,fshift,x,f;
949	real fjptrA,fjptrB,fjptrC,fjptrD;
950	real scratch[4*DIM3];
951	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
952	int vdwioffset0;
953	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
954	int vdwioffset1;
955	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
956	int vdwioffset2;
957	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
958	int vdwioffset3;
959	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
960	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
961	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
962	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
963	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
964	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
965	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
966	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
967	real *charge;
968	int nvdwtype;
969	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
970	int *vdwtype;
971	real *vdwparam;
972	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
973	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
974	__m128i ewitab;
975	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
976	real *ewtab;
977	__m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
978	real rswitch_scalar,d_scalar;
979	__m128 dummy_mask,cutoff_mask;
980	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
981	__m128 one = _mm_set1_ps(1.0);
982	__m128 two = _mm_set1_ps(2.0);
983	x = xx[0];
984	f = ff[0];
985
986	nri = nlist->nri;
987	iinr = nlist->iinr;
988	jindex = nlist->jindex;
989	jjnr = nlist->jjnr;
990	shiftidx = nlist->shift;
991	gid = nlist->gid;
992	shiftvec = fr->shift_vec[0];
993	fshift = fr->fshift[0];
994	facel = _mm_set1_ps(fr->epsfac);
995	charge = mdatoms->chargeA;
996	nvdwtype = fr->ntype;
997	vdwparam = fr->nbfp;
998	vdwtype = mdatoms->typeA;
999
1000	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1001	ewtab = fr->ic->tabq_coul_FDV0;
1002	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1003	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1004
1005	/* Setup water-specific parameters */
1006	inr = nlist->iinr[0];
1007	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1008	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1009	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1010	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
1011
1012	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1013	rcutoff_scalar = fr->rcoulomb;
1014	rcutoff = _mm_set1_ps(rcutoff_scalar);
1015	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1016
1017	rswitch_scalar = fr->rcoulomb_switch;
1018	rswitch = _mm_set1_ps(rswitch_scalar);
1019	/* Setup switch parameters */
1020	d_scalar = rcutoff_scalar-rswitch_scalar;
1021	d = _mm_set1_ps(d_scalar);
1022	swV3 = _mm_set1_ps(-10.0/(d_scalard_scalard_scalar));
1023	swV4 = _mm_set1_ps( 15.0/(d_scalard_scalard_scalar*d_scalar));
1024	swV5 = _mm_set1_ps( -6.0/(d_scalard_scalard_scalard_scalard_scalar));
1025	swF2 = _mm_set1_ps(-30.0/(d_scalard_scalard_scalar));
1026	swF3 = _mm_set1_ps( 60.0/(d_scalard_scalard_scalar*d_scalar));
1027	swF4 = _mm_set1_ps(-30.0/(d_scalard_scalard_scalard_scalard_scalar));
1028
1029	/* Avoid stupid compiler warnings */
1030	jnrA = jnrB = jnrC = jnrD = 0;
1031	j_coord_offsetA = 0;
1032	j_coord_offsetB = 0;
1033	j_coord_offsetC = 0;
	Value stored to 'j_coord_offsetC' is never read
1034	j_coord_offsetD = 0;
1035
1036	outeriter = 0;
1037	inneriter = 0;
1038
1039	for(iidx=0;iidx<4*DIM3;iidx++)
1040	{
1041	scratch[iidx] = 0.0;
1042	}
1043
1044	/* Start outer loop over neighborlists */
1045	for(iidx=0; iidx<nri; iidx++)
1046	{
1047	/* Load shift vector for this list */
1048	i_shift_offset = DIM3*shiftidx[iidx];
1049
1050	/* Load limits for loop over neighbors */
1051	j_index_start = jindex[iidx];
1052	j_index_end = jindex[iidx+1];
1053
1054	/* Get outer coordinate index */
1055	inr = iinr[iidx];
1056	i_coord_offset = DIM3*inr;
1057
1058	/* Load i particle coords and add shift vector */
1059	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1060	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1061
1062	fix0 = _mm_setzero_ps();
1063	fiy0 = _mm_setzero_ps();
1064	fiz0 = _mm_setzero_ps();
1065	fix1 = _mm_setzero_ps();
1066	fiy1 = _mm_setzero_ps();
1067	fiz1 = _mm_setzero_ps();
1068	fix2 = _mm_setzero_ps();
1069	fiy2 = _mm_setzero_ps();
1070	fiz2 = _mm_setzero_ps();
1071	fix3 = _mm_setzero_ps();
1072	fiy3 = _mm_setzero_ps();
1073	fiz3 = _mm_setzero_ps();
1074
1075	/* Start inner kernel loop */
1076	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1077	{
1078
1079	/* Get j neighbor index, and coordinate index */
1080	jnrA = jjnr[jidx];
1081	jnrB = jjnr[jidx+1];
1082	jnrC = jjnr[jidx+2];
1083	jnrD = jjnr[jidx+3];
1084	j_coord_offsetA = DIM3*jnrA;
1085	j_coord_offsetB = DIM3*jnrB;
1086	j_coord_offsetC = DIM3*jnrC;
1087	j_coord_offsetD = DIM3*jnrD;
1088
1089	/* load j atom coordinates */
1090	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1091	x+j_coord_offsetC,x+j_coord_offsetD,
1092	&jx0,&jy0,&jz0);
1093
1094	/* Calculate displacement vector */
1095	dx00 = _mm_sub_ps(ix0,jx0);
1096	dy00 = _mm_sub_ps(iy0,jy0);
1097	dz00 = _mm_sub_ps(iz0,jz0);
1098	dx10 = _mm_sub_ps(ix1,jx0);
1099	dy10 = _mm_sub_ps(iy1,jy0);
1100	dz10 = _mm_sub_ps(iz1,jz0);
1101	dx20 = _mm_sub_ps(ix2,jx0);
1102	dy20 = _mm_sub_ps(iy2,jy0);
1103	dz20 = _mm_sub_ps(iz2,jz0);
1104	dx30 = _mm_sub_ps(ix3,jx0);
1105	dy30 = _mm_sub_ps(iy3,jy0);
1106	dz30 = _mm_sub_ps(iz3,jz0);
1107
1108	/* Calculate squared distance and things based on it */
1109	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1110	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1111	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1112	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1113
1114	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1115	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1116	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1117	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
1118
1119	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1120	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1121	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1122	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1123
1124	/* Load parameters for j particles */
1125	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1126	charge+jnrC+0,charge+jnrD+0);
1127	vdwjidx0A = 2*vdwtype[jnrA+0];
1128	vdwjidx0B = 2*vdwtype[jnrB+0];
1129	vdwjidx0C = 2*vdwtype[jnrC+0];
1130	vdwjidx0D = 2*vdwtype[jnrD+0];
1131
1132	fjx0 = _mm_setzero_ps();
1133	fjy0 = _mm_setzero_ps();
1134	fjz0 = _mm_setzero_ps();
1135
1136	/**************************
1137	* CALCULATE INTERACTIONS *
1138	**************************/
1139
1140	if (gmx_mm_any_lt(rsq00,rcutoff2))
1141	{
1142
1143	r00 = _mm_mul_ps(rsq00,rinv00);
1144
1145	/* Compute parameters for interactions between i and j atoms */
1146	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1147	vdwparam+vdwioffset0+vdwjidx0B,
1148	vdwparam+vdwioffset0+vdwjidx0C,
1149	vdwparam+vdwioffset0+vdwjidx0D,
1150	&c6_00,&c12_00);
1151
1152	/* LENNARD-JONES DISPERSION/REPULSION */
1153
1154	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1155	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1156	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1157	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1158	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1159
1160	d = _mm_sub_ps(r00,rswitch);
1161	d = _mm_max_ps(d,_mm_setzero_ps());
1162	d2 = _mm_mul_ps(d,d);
1163	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1164
1165	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1166
1167	/* Evaluate switch function */
1168	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1169	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1170	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1171
1172	fscal = fvdw;
1173
1174	fscal = _mm_and_ps(fscal,cutoff_mask);
1175
1176	/* Calculate temporary vectorial force */
1177	tx = _mm_mul_ps(fscal,dx00);
1178	ty = _mm_mul_ps(fscal,dy00);
1179	tz = _mm_mul_ps(fscal,dz00);
1180
1181	/* Update vectorial force */
1182	fix0 = _mm_add_ps(fix0,tx);
1183	fiy0 = _mm_add_ps(fiy0,ty);
1184	fiz0 = _mm_add_ps(fiz0,tz);
1185
1186	fjx0 = _mm_add_ps(fjx0,tx);
1187	fjy0 = _mm_add_ps(fjy0,ty);
1188	fjz0 = _mm_add_ps(fjz0,tz);
1189
1190	}
1191
1192	/**************************
1193	* CALCULATE INTERACTIONS *
1194	**************************/
1195
1196	if (gmx_mm_any_lt(rsq10,rcutoff2))
1197	{
1198
1199	r10 = _mm_mul_ps(rsq10,rinv10);
1200
1201	/* Compute parameters for interactions between i and j atoms */
1202	qq10 = _mm_mul_ps(iq1,jq0);
1203
1204	/* EWALD ELECTROSTATICS */
1205
1206	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1207	ewrt = _mm_mul_ps(r10,ewtabscale);
1208	ewitab = _mm_cvttps_epi32(ewrt);
1209	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1210	ewitab = _mm_slli_epi32(ewitab,2);
1211	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1212	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1213	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1214	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1215	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1216	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1217	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1218	velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1219	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1220
1221	d = _mm_sub_ps(r10,rswitch);
1222	d = _mm_max_ps(d,_mm_setzero_ps());
1223	d2 = _mm_mul_ps(d,d);
1224	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1225
1226	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1227
1228	/* Evaluate switch function */
1229	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1230	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1231	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1232
1233	fscal = felec;
1234
1235	fscal = _mm_and_ps(fscal,cutoff_mask);
1236
1237	/* Calculate temporary vectorial force */
1238	tx = _mm_mul_ps(fscal,dx10);
1239	ty = _mm_mul_ps(fscal,dy10);
1240	tz = _mm_mul_ps(fscal,dz10);
1241
1242	/* Update vectorial force */
1243	fix1 = _mm_add_ps(fix1,tx);
1244	fiy1 = _mm_add_ps(fiy1,ty);
1245	fiz1 = _mm_add_ps(fiz1,tz);
1246
1247	fjx0 = _mm_add_ps(fjx0,tx);
1248	fjy0 = _mm_add_ps(fjy0,ty);
1249	fjz0 = _mm_add_ps(fjz0,tz);
1250
1251	}
1252
1253	/**************************
1254	* CALCULATE INTERACTIONS *
1255	**************************/
1256
1257	if (gmx_mm_any_lt(rsq20,rcutoff2))
1258	{
1259
1260	r20 = _mm_mul_ps(rsq20,rinv20);
1261
1262	/* Compute parameters for interactions between i and j atoms */
1263	qq20 = _mm_mul_ps(iq2,jq0);
1264
1265	/* EWALD ELECTROSTATICS */
1266
1267	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1268	ewrt = _mm_mul_ps(r20,ewtabscale);
1269	ewitab = _mm_cvttps_epi32(ewrt);
1270	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1271	ewitab = _mm_slli_epi32(ewitab,2);
1272	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1273	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1274	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1275	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1276	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1277	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1278	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1279	velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1280	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1281
1282	d = _mm_sub_ps(r20,rswitch);
1283	d = _mm_max_ps(d,_mm_setzero_ps());
1284	d2 = _mm_mul_ps(d,d);
1285	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1286
1287	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1288
1289	/* Evaluate switch function */
1290	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1291	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1292	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1293
1294	fscal = felec;
1295
1296	fscal = _mm_and_ps(fscal,cutoff_mask);
1297
1298	/* Calculate temporary vectorial force */
1299	tx = _mm_mul_ps(fscal,dx20);
1300	ty = _mm_mul_ps(fscal,dy20);
1301	tz = _mm_mul_ps(fscal,dz20);
1302
1303	/* Update vectorial force */
1304	fix2 = _mm_add_ps(fix2,tx);
1305	fiy2 = _mm_add_ps(fiy2,ty);
1306	fiz2 = _mm_add_ps(fiz2,tz);
1307
1308	fjx0 = _mm_add_ps(fjx0,tx);
1309	fjy0 = _mm_add_ps(fjy0,ty);
1310	fjz0 = _mm_add_ps(fjz0,tz);
1311
1312	}
1313
1314	/**************************
1315	* CALCULATE INTERACTIONS *
1316	**************************/
1317
1318	if (gmx_mm_any_lt(rsq30,rcutoff2))
1319	{
1320
1321	r30 = _mm_mul_ps(rsq30,rinv30);
1322
1323	/* Compute parameters for interactions between i and j atoms */
1324	qq30 = _mm_mul_ps(iq3,jq0);
1325
1326	/* EWALD ELECTROSTATICS */
1327
1328	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1329	ewrt = _mm_mul_ps(r30,ewtabscale);
1330	ewitab = _mm_cvttps_epi32(ewrt);
1331	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1332	ewitab = _mm_slli_epi32(ewitab,2);
1333	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1334	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1335	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1336	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1337	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1338	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1339	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1340	velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1341	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1342
1343	d = _mm_sub_ps(r30,rswitch);
1344	d = _mm_max_ps(d,_mm_setzero_ps());
1345	d2 = _mm_mul_ps(d,d);
1346	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1347
1348	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1349
1350	/* Evaluate switch function */
1351	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1352	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1353	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1354
1355	fscal = felec;
1356
1357	fscal = _mm_and_ps(fscal,cutoff_mask);
1358
1359	/* Calculate temporary vectorial force */
1360	tx = _mm_mul_ps(fscal,dx30);
1361	ty = _mm_mul_ps(fscal,dy30);
1362	tz = _mm_mul_ps(fscal,dz30);
1363
1364	/* Update vectorial force */
1365	fix3 = _mm_add_ps(fix3,tx);
1366	fiy3 = _mm_add_ps(fiy3,ty);
1367	fiz3 = _mm_add_ps(fiz3,tz);
1368
1369	fjx0 = _mm_add_ps(fjx0,tx);
1370	fjy0 = _mm_add_ps(fjy0,ty);
1371	fjz0 = _mm_add_ps(fjz0,tz);
1372
1373	}
1374
1375	fjptrA = f+j_coord_offsetA;
1376	fjptrB = f+j_coord_offsetB;
1377	fjptrC = f+j_coord_offsetC;
1378	fjptrD = f+j_coord_offsetD;
1379
1380	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1381
1382	/* Inner loop uses 242 flops */
1383	}
1384
1385	if(jidx<j_index_end)
1386	{
1387
1388	/* Get j neighbor index, and coordinate index */
1389	jnrlistA = jjnr[jidx];
1390	jnrlistB = jjnr[jidx+1];
1391	jnrlistC = jjnr[jidx+2];
1392	jnrlistD = jjnr[jidx+3];
1393	/* Sign of each element will be negative for non-real atoms.
1394	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1395	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1396	*/
1397	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1398	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1399	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1400	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1401	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1402	j_coord_offsetA = DIM3*jnrA;
1403	j_coord_offsetB = DIM3*jnrB;
1404	j_coord_offsetC = DIM3*jnrC;
1405	j_coord_offsetD = DIM3*jnrD;
1406
1407	/* load j atom coordinates */
1408	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1409	x+j_coord_offsetC,x+j_coord_offsetD,
1410	&jx0,&jy0,&jz0);
1411
1412	/* Calculate displacement vector */
1413	dx00 = _mm_sub_ps(ix0,jx0);
1414	dy00 = _mm_sub_ps(iy0,jy0);
1415	dz00 = _mm_sub_ps(iz0,jz0);
1416	dx10 = _mm_sub_ps(ix1,jx0);
1417	dy10 = _mm_sub_ps(iy1,jy0);
1418	dz10 = _mm_sub_ps(iz1,jz0);
1419	dx20 = _mm_sub_ps(ix2,jx0);
1420	dy20 = _mm_sub_ps(iy2,jy0);
1421	dz20 = _mm_sub_ps(iz2,jz0);
1422	dx30 = _mm_sub_ps(ix3,jx0);
1423	dy30 = _mm_sub_ps(iy3,jy0);
1424	dz30 = _mm_sub_ps(iz3,jz0);
1425
1426	/* Calculate squared distance and things based on it */
1427	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1428	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1429	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1430	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1431
1432	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1433	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1434	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1435	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
1436
1437	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1438	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1439	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1440	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1441
1442	/* Load parameters for j particles */
1443	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1444	charge+jnrC+0,charge+jnrD+0);
1445	vdwjidx0A = 2*vdwtype[jnrA+0];
1446	vdwjidx0B = 2*vdwtype[jnrB+0];
1447	vdwjidx0C = 2*vdwtype[jnrC+0];
1448	vdwjidx0D = 2*vdwtype[jnrD+0];
1449
1450	fjx0 = _mm_setzero_ps();
1451	fjy0 = _mm_setzero_ps();
1452	fjz0 = _mm_setzero_ps();
1453
1454	/**************************
1455	* CALCULATE INTERACTIONS *
1456	**************************/
1457
1458	if (gmx_mm_any_lt(rsq00,rcutoff2))
1459	{
1460
1461	r00 = _mm_mul_ps(rsq00,rinv00);
1462	r00 = _mm_andnot_ps(dummy_mask,r00);
1463
1464	/* Compute parameters for interactions between i and j atoms */
1465	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1466	vdwparam+vdwioffset0+vdwjidx0B,
1467	vdwparam+vdwioffset0+vdwjidx0C,
1468	vdwparam+vdwioffset0+vdwjidx0D,
1469	&c6_00,&c12_00);
1470
1471	/* LENNARD-JONES DISPERSION/REPULSION */
1472
1473	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1474	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1475	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1476	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1477	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1478
1479	d = _mm_sub_ps(r00,rswitch);
1480	d = _mm_max_ps(d,_mm_setzero_ps());
1481	d2 = _mm_mul_ps(d,d);
1482	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1483
1484	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1485
1486	/* Evaluate switch function */
1487	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1488	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1489	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1490
1491	fscal = fvdw;
1492
1493	fscal = _mm_and_ps(fscal,cutoff_mask);
1494
1495	fscal = _mm_andnot_ps(dummy_mask,fscal);
1496
1497	/* Calculate temporary vectorial force */
1498	tx = _mm_mul_ps(fscal,dx00);
1499	ty = _mm_mul_ps(fscal,dy00);
1500	tz = _mm_mul_ps(fscal,dz00);
1501
1502	/* Update vectorial force */
1503	fix0 = _mm_add_ps(fix0,tx);
1504	fiy0 = _mm_add_ps(fiy0,ty);
1505	fiz0 = _mm_add_ps(fiz0,tz);
1506
1507	fjx0 = _mm_add_ps(fjx0,tx);
1508	fjy0 = _mm_add_ps(fjy0,ty);
1509	fjz0 = _mm_add_ps(fjz0,tz);
1510
1511	}
1512
1513	/**************************
1514	* CALCULATE INTERACTIONS *
1515	**************************/
1516
1517	if (gmx_mm_any_lt(rsq10,rcutoff2))
1518	{
1519
1520	r10 = _mm_mul_ps(rsq10,rinv10);
1521	r10 = _mm_andnot_ps(dummy_mask,r10);
1522
1523	/* Compute parameters for interactions between i and j atoms */
1524	qq10 = _mm_mul_ps(iq1,jq0);
1525
1526	/* EWALD ELECTROSTATICS */
1527
1528	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1529	ewrt = _mm_mul_ps(r10,ewtabscale);
1530	ewitab = _mm_cvttps_epi32(ewrt);
1531	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1532	ewitab = _mm_slli_epi32(ewitab,2);
1533	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1534	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1535	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1536	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1537	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1538	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1539	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1540	velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1541	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1542
1543	d = _mm_sub_ps(r10,rswitch);
1544	d = _mm_max_ps(d,_mm_setzero_ps());
1545	d2 = _mm_mul_ps(d,d);
1546	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1547
1548	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1549
1550	/* Evaluate switch function */
1551	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1552	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1553	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1554
1555	fscal = felec;
1556
1557	fscal = _mm_and_ps(fscal,cutoff_mask);
1558
1559	fscal = _mm_andnot_ps(dummy_mask,fscal);
1560
1561	/* Calculate temporary vectorial force */
1562	tx = _mm_mul_ps(fscal,dx10);
1563	ty = _mm_mul_ps(fscal,dy10);
1564	tz = _mm_mul_ps(fscal,dz10);
1565
1566	/* Update vectorial force */
1567	fix1 = _mm_add_ps(fix1,tx);
1568	fiy1 = _mm_add_ps(fiy1,ty);
1569	fiz1 = _mm_add_ps(fiz1,tz);
1570
1571	fjx0 = _mm_add_ps(fjx0,tx);
1572	fjy0 = _mm_add_ps(fjy0,ty);
1573	fjz0 = _mm_add_ps(fjz0,tz);
1574
1575	}
1576
1577	/**************************
1578	* CALCULATE INTERACTIONS *
1579	**************************/
1580
1581	if (gmx_mm_any_lt(rsq20,rcutoff2))
1582	{
1583
1584	r20 = _mm_mul_ps(rsq20,rinv20);
1585	r20 = _mm_andnot_ps(dummy_mask,r20);
1586
1587	/* Compute parameters for interactions between i and j atoms */
1588	qq20 = _mm_mul_ps(iq2,jq0);
1589
1590	/* EWALD ELECTROSTATICS */
1591
1592	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1593	ewrt = _mm_mul_ps(r20,ewtabscale);
1594	ewitab = _mm_cvttps_epi32(ewrt);
1595	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1596	ewitab = _mm_slli_epi32(ewitab,2);
1597	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1598	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1599	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1600	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1601	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1602	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1603	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1604	velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1605	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1606
1607	d = _mm_sub_ps(r20,rswitch);
1608	d = _mm_max_ps(d,_mm_setzero_ps());
1609	d2 = _mm_mul_ps(d,d);
1610	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1611
1612	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1613
1614	/* Evaluate switch function */
1615	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1616	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1617	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1618
1619	fscal = felec;
1620
1621	fscal = _mm_and_ps(fscal,cutoff_mask);
1622
1623	fscal = _mm_andnot_ps(dummy_mask,fscal);
1624
1625	/* Calculate temporary vectorial force */
1626	tx = _mm_mul_ps(fscal,dx20);
1627	ty = _mm_mul_ps(fscal,dy20);
1628	tz = _mm_mul_ps(fscal,dz20);
1629
1630	/* Update vectorial force */
1631	fix2 = _mm_add_ps(fix2,tx);
1632	fiy2 = _mm_add_ps(fiy2,ty);
1633	fiz2 = _mm_add_ps(fiz2,tz);
1634
1635	fjx0 = _mm_add_ps(fjx0,tx);
1636	fjy0 = _mm_add_ps(fjy0,ty);
1637	fjz0 = _mm_add_ps(fjz0,tz);
1638
1639	}
1640
1641	/**************************
1642	* CALCULATE INTERACTIONS *
1643	**************************/
1644
1645	if (gmx_mm_any_lt(rsq30,rcutoff2))
1646	{
1647
1648	r30 = _mm_mul_ps(rsq30,rinv30);
1649	r30 = _mm_andnot_ps(dummy_mask,r30);
1650
1651	/* Compute parameters for interactions between i and j atoms */
1652	qq30 = _mm_mul_ps(iq3,jq0);
1653
1654	/* EWALD ELECTROSTATICS */
1655
1656	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1657	ewrt = _mm_mul_ps(r30,ewtabscale);
1658	ewitab = _mm_cvttps_epi32(ewrt);
1659	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1660	ewitab = _mm_slli_epi32(ewitab,2);
1661	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1662	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1663	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1664	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1665	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1666	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1667	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1668	velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1669	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1670
1671	d = _mm_sub_ps(r30,rswitch);
1672	d = _mm_max_ps(d,_mm_setzero_ps());
1673	d2 = _mm_mul_ps(d,d);
1674	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1675
1676	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1677
1678	/* Evaluate switch function */
1679	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1680	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1681	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1682
1683	fscal = felec;
1684
1685	fscal = _mm_and_ps(fscal,cutoff_mask);
1686
1687	fscal = _mm_andnot_ps(dummy_mask,fscal);
1688
1689	/* Calculate temporary vectorial force */
1690	tx = _mm_mul_ps(fscal,dx30);
1691	ty = _mm_mul_ps(fscal,dy30);
1692	tz = _mm_mul_ps(fscal,dz30);
1693
1694	/* Update vectorial force */
1695	fix3 = _mm_add_ps(fix3,tx);
1696	fiy3 = _mm_add_ps(fiy3,ty);
1697	fiz3 = _mm_add_ps(fiz3,tz);
1698
1699	fjx0 = _mm_add_ps(fjx0,tx);
1700	fjy0 = _mm_add_ps(fjy0,ty);
1701	fjz0 = _mm_add_ps(fjz0,tz);
1702
1703	}
1704
1705	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1706	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1707	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1708	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1709
1710	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1711
1712	/* Inner loop uses 246 flops */
1713	}
1714
1715	/* End of innermost loop */
1716
1717	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1718	f+i_coord_offset,fshift+i_shift_offset);
1719
1720	/* Increment number of inner iterations */
1721	inneriter += j_index_end - j_index_start;
1722
1723	/* Outer loop uses 24 flops */
1724	}
1725
1726	/* Increment number of outer iterations */
1727	outeriter += nri;
1728
1729	/* Update outer/inner flops */
1730
1731	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter24 + inneriter246)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4_F] += outeriter24 + inneriter 246;
1732	}