/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse4_1_single.c

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse4_1_single.c
Location:	line 798, column 5
Description:	Value stored to 'gid' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H /* HAVE_CONFIG_H expands to: 1 */
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
54	* Electrostatics interaction: Ewald
55	* VdW interaction: LennardJones
56	* Geometry: Water3-Particle
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused * gmx_restrict kernel_data, /* gmx_unused expands to: __attribute__ ((unused)) */
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80	real rcutoff_scalar;
81	real *shiftvec,*fshift,*x,*f;
82	real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83	real scratch[4*DIM]; /* DIM expands to: 3 */
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwioffset1;
88	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89	int vdwioffset2;
90	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
97	real *charge;
98	int nvdwtype;
99	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
100	int *vdwtype;
101	real *vdwparam;
102	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
103	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
104	__m128i ewitab;
105	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
106	real *ewtab;
107	__m128 dummy_mask,cutoff_mask;
108	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
109	__m128 one = _mm_set1_ps(1.0);
110	__m128 two = _mm_set1_ps(2.0);
111	x = xx[0];
112	f = ff[0];
113
114	nri = nlist->nri;
115	iinr = nlist->iinr;
116	jindex = nlist->jindex;
117	jjnr = nlist->jjnr;
118	shiftidx = nlist->shift;
119	gid = nlist->gid;
120	shiftvec = fr->shift_vec[0];
121	fshift = fr->fshift[0];
122	facel = _mm_set1_ps(fr->epsfac);
123	charge = mdatoms->chargeA;
124	nvdwtype = fr->ntype;
125	vdwparam = fr->nbfp;
126	vdwtype = mdatoms->typeA;
127
128	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
129	ewtab = fr->ic->tabq_coul_FDV0;
130	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
131	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
132
133	/* Setup water-specific parameters */
134	inr = nlist->iinr[0];
135	iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
136	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138	vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
139
140	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
141	rcutoff_scalar = fr->rcoulomb;
142	rcutoff = _mm_set1_ps(rcutoff_scalar);
143	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
144
145	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
146	rvdw = _mm_set1_ps(fr->rvdw);
147
148	/* Avoid stupid compiler warnings */
149	jnrA = jnrB = jnrC = jnrD = 0;
150	j_coord_offsetA = 0;
151	j_coord_offsetB = 0;
152	j_coord_offsetC = 0;
153	j_coord_offsetD = 0;
154
155	outeriter = 0;
156	inneriter = 0;
157
158	for(iidx=0;iidx<4*DIM;iidx++) /* DIM expands to: 3 */
159	{
160	scratch[iidx] = 0.0;
161	}
162
163	/* Start outer loop over neighborlists */
164	for(iidx=0; iidx<nri; iidx++)
165	{
166	/* Load shift vector for this list */
167	i_shift_offset = DIM*shiftidx[iidx]; /* DIM expands to: 3 */
168
169	/* Load limits for loop over neighbors */
170	j_index_start = jindex[iidx];
171	j_index_end = jindex[iidx+1];
172
173	/* Get outer coordinate index */
174	inr = iinr[iidx];
175	i_coord_offset = DIM*inr; /* DIM expands to: 3 */
176
177	/* Load i particle coords and add shift vector */
178	gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
179	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
180
181	fix0 = _mm_setzero_ps();
182	fiy0 = _mm_setzero_ps();
183	fiz0 = _mm_setzero_ps();
184	fix1 = _mm_setzero_ps();
185	fiy1 = _mm_setzero_ps();
186	fiz1 = _mm_setzero_ps();
187	fix2 = _mm_setzero_ps();
188	fiy2 = _mm_setzero_ps();
189	fiz2 = _mm_setzero_ps();
190
191	/* Reset potential sums */
192	velecsum = _mm_setzero_ps();
193	vvdwsum = _mm_setzero_ps();
194
195	/* Start inner kernel loop */
196	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
197	{
198
199	/* Get j neighbor index, and coordinate index */
200	jnrA = jjnr[jidx];
201	jnrB = jjnr[jidx+1];
202	jnrC = jjnr[jidx+2];
203	jnrD = jjnr[jidx+3];
204	j_coord_offsetA = DIM*jnrA; /* DIM expands to: 3 */
205	j_coord_offsetB = DIM*jnrB; /* DIM expands to: 3 */
206	j_coord_offsetC = DIM*jnrC; /* DIM expands to: 3 */
207	j_coord_offsetD = DIM*jnrD; /* DIM expands to: 3 */
208
209	/* load j atom coordinates */
210	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
211	x+j_coord_offsetC,x+j_coord_offsetD,
212	&jx0,&jy0,&jz0);
213
214	/* Calculate displacement vector */
215	dx00 = _mm_sub_ps(ix0,jx0);
216	dy00 = _mm_sub_ps(iy0,jy0);
217	dz00 = _mm_sub_ps(iz0,jz0);
218	dx10 = _mm_sub_ps(ix1,jx0);
219	dy10 = _mm_sub_ps(iy1,jy0);
220	dz10 = _mm_sub_ps(iz1,jz0);
221	dx20 = _mm_sub_ps(ix2,jx0);
222	dy20 = _mm_sub_ps(iy2,jy0);
223	dz20 = _mm_sub_ps(iz2,jz0);
224
225	/* Calculate squared distance and things based on it */
226	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
227	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
228	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
229
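230	rinv00 = gmx_mm_invsqrt_ps(rsq00); /* gmx_mm_invsqrt_ps expands to: gmx_simd_invsqrt_f */
231	rinv10 = gmx_mm_invsqrt_ps(rsq10); /* gmx_mm_invsqrt_ps expands to: gmx_simd_invsqrt_f */
232	rinv20 = gmx_mm_invsqrt_ps(rsq20); /* gmx_mm_invsqrt_ps expands to: gmx_simd_invsqrt_f */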
233
234	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
235	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
236	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
237
238	/* Load parameters for j particles */
239	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
240	charge+jnrC+0,charge+jnrD+0);
241	vdwjidx0A = 2*vdwtype[jnrA+0];
242	vdwjidx0B = 2*vdwtype[jnrB+0];
243	vdwjidx0C = 2*vdwtype[jnrC+0];
244	vdwjidx0D = 2*vdwtype[jnrD+0];
245
246	fjx0 = _mm_setzero_ps();
247	fjy0 = _mm_setzero_ps();
248	fjz0 = _mm_setzero_ps();
249
250	/**************************
251	* CALCULATE INTERACTIONS *
252	**************************/
253
254	if (gmx_mm_any_lt(rsq00,rcutoff2))
255	{
256
257	r00 = _mm_mul_ps(rsq00,rinv00);
258
259	/* Compute parameters for interactions between i and j atoms */
260	qq00 = _mm_mul_ps(iq0,jq0);
261	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
262	vdwparam+vdwioffset0+vdwjidx0B,
263	vdwparam+vdwioffset0+vdwjidx0C,
264	vdwparam+vdwioffset0+vdwjidx0D,
265	&c6_00,&c12_00);
266
267	/* EWALD ELECTROSTATICS */
268
269	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
270	ewrt = _mm_mul_ps(r00,ewtabscale);
271	ewitab = _mm_cvttps_epi32(ewrt);
272	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)); /* _mm_round_ps(ewrt, _MM_FROUND_FLOOR) expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */
273	ewitab = _mm_slli_epi32(ewitab,2);
274	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) ); /* gmx_mm_extract_epi32(ewitab,0) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */
275	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) ); /* gmx_mm_extract_epi32(ewitab,1) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */
276	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) ); /* gmx_mm_extract_epi32(ewitab,2) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */
277	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) ); /* gmx_mm_extract_epi32(ewitab,3) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */
278	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn); /* _MM_TRANSPOSE4_PS(...) expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */
279	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
280	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
281	velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
282	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
283
284	/* LENNARD-JONES DISPERSION/REPULSION */
285
286	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
287	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
288	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
289	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
290	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
291	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
292
293	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
294
295	/* Update potential sum for this i atom from the interaction with this j atom. */
296	velec = _mm_and_ps(velec,cutoff_mask);
297	velecsum = _mm_add_ps(velecsum,velec);
298	vvdw = _mm_and_ps(vvdw,cutoff_mask);
299	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
300
301	fscal = _mm_add_ps(felec,fvdw);
302
303	fscal = _mm_and_ps(fscal,cutoff_mask);
304
305	/* Calculate temporary vectorial force */
306	tx = _mm_mul_ps(fscal,dx00);
307	ty = _mm_mul_ps(fscal,dy00);
308	tz = _mm_mul_ps(fscal,dz00);
309
310	/* Update vectorial force */
311	fix0 = _mm_add_ps(fix0,tx);
312	fiy0 = _mm_add_ps(fiy0,ty);
313	fiz0 = _mm_add_ps(fiz0,tz);
314
315	fjx0 = _mm_add_ps(fjx0,tx);
316	fjy0 = _mm_add_ps(fjy0,ty);
317	fjz0 = _mm_add_ps(fjz0,tz);
318
319	}
320
321	/**************************
322	* CALCULATE INTERACTIONS *
323	**************************/
324
325	if (gmx_mm_any_lt(rsq10,rcutoff2))
326	{
327
328	r10 = _mm_mul_ps(rsq10,rinv10);
329
330	/* Compute parameters for interactions between i and j atoms */
331	qq10 = _mm_mul_ps(iq1,jq0);
332
333	/* EWALD ELECTROSTATICS */
334
335	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
336	ewrt = _mm_mul_ps(r10,ewtabscale);
337	ewitab = _mm_cvttps_epi32(ewrt);
338	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)); /* _mm_round_ps(ewrt, _MM_FROUND_FLOOR) expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */
339	ewitab = _mm_slli_epi32(ewitab,2);
340	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) ); /* gmx_mm_extract_epi32(ewitab,0) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */
341	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) ); /* gmx_mm_extract_epi32(ewitab,1) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */
342	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) ); /* gmx_mm_extract_epi32(ewitab,2) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */
343	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) ); /* gmx_mm_extract_epi32(ewitab,3) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */
344	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn); /* _MM_TRANSPOSE4_PS(...) expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */
345	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
346	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
347	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
348	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
349
350	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
351
352	/* Update potential sum for this i atom from the interaction with this j atom. */
353	velec = _mm_and_ps(velec,cutoff_mask);
354	velecsum = _mm_add_ps(velecsum,velec);
355
356	fscal = felec;
357
358	fscal = _mm_and_ps(fscal,cutoff_mask);
359
360	/* Calculate temporary vectorial force */
361	tx = _mm_mul_ps(fscal,dx10);
362	ty = _mm_mul_ps(fscal,dy10);
363	tz = _mm_mul_ps(fscal,dz10);
364
365	/* Update vectorial force */
366	fix1 = _mm_add_ps(fix1,tx);
367	fiy1 = _mm_add_ps(fiy1,ty);
368	fiz1 = _mm_add_ps(fiz1,tz);
369
370	fjx0 = _mm_add_ps(fjx0,tx);
371	fjy0 = _mm_add_ps(fjy0,ty);
372	fjz0 = _mm_add_ps(fjz0,tz);
373
374	}
375
376	/**************************
377	* CALCULATE INTERACTIONS *
378	**************************/
379
380	if (gmx_mm_any_lt(rsq20,rcutoff2))
381	{
382
383	r20 = _mm_mul_ps(rsq20,rinv20);
384
385	/* Compute parameters for interactions between i and j atoms */
386	qq20 = _mm_mul_ps(iq2,jq0);
387
388	/* EWALD ELECTROSTATICS */
389
390	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
391	ewrt = _mm_mul_ps(r20,ewtabscale);
392	ewitab = _mm_cvttps_epi32(ewrt);
393	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)); /* _mm_round_ps(ewrt, _MM_FROUND_FLOOR) expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */
394	ewitab = _mm_slli_epi32(ewitab,2);
395	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) ); /* gmx_mm_extract_epi32(ewitab,0) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */
396	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) ); /* gmx_mm_extract_epi32(ewitab,1) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */
397	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) ); /* gmx_mm_extract_epi32(ewitab,2) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */
398	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) ); /* gmx_mm_extract_epi32(ewitab,3) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */
399	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn); /* _MM_TRANSPOSE4_PS(...) expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */
400	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
401	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
402	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
403	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
404
405	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
406
407	/* Update potential sum for this i atom from the interaction with this j atom. */
408	velec = _mm_and_ps(velec,cutoff_mask);
409	velecsum = _mm_add_ps(velecsum,velec);
410
411	fscal = felec;
412
413	fscal = _mm_and_ps(fscal,cutoff_mask);
414
415	/* Calculate temporary vectorial force */
416	tx = _mm_mul_ps(fscal,dx20);
417	ty = _mm_mul_ps(fscal,dy20);
418	tz = _mm_mul_ps(fscal,dz20);
419
420	/* Update vectorial force */
421	fix2 = _mm_add_ps(fix2,tx);
422	fiy2 = _mm_add_ps(fiy2,ty);
423	fiz2 = _mm_add_ps(fiz2,tz);
424
425	fjx0 = _mm_add_ps(fjx0,tx);
426	fjy0 = _mm_add_ps(fjy0,ty);
427	fjz0 = _mm_add_ps(fjz0,tz);
428
429	}
430
431	fjptrA = f+j_coord_offsetA;
432	fjptrB = f+j_coord_offsetB;
433	fjptrC = f+j_coord_offsetC;
434	fjptrD = f+j_coord_offsetD;
435
436	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
437
438	/* Inner loop uses 156 flops */
439	}
440
441	if(jidx<j_index_end)
442	{
443
444	/* Get j neighbor index, and coordinate index */
445	jnrlistA = jjnr[jidx];
446	jnrlistB = jjnr[jidx+1];
447	jnrlistC = jjnr[jidx+2];
448	jnrlistD = jjnr[jidx+3];
449	/* Sign of each element will be negative for non-real atoms.
450	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
451	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
452	*/
453	dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())); /* gmx_mm_castsi128_ps expands to: _mm_castsi128_ps */
454	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
455	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
456	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
457	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
458	j_coord_offsetA = DIM*jnrA; /* DIM expands to: 3 */
459	j_coord_offsetB = DIM*jnrB; /* DIM expands to: 3 */
460	j_coord_offsetC = DIM*jnrC; /* DIM expands to: 3 */
461	j_coord_offsetD = DIM*jnrD; /* DIM expands to: 3 */
462
463	/* load j atom coordinates */
464	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
465	x+j_coord_offsetC,x+j_coord_offsetD,
466	&jx0,&jy0,&jz0);
467
468	/* Calculate displacement vector */
469	dx00 = _mm_sub_ps(ix0,jx0);
470	dy00 = _mm_sub_ps(iy0,jy0);
471	dz00 = _mm_sub_ps(iz0,jz0);
472	dx10 = _mm_sub_ps(ix1,jx0);
473	dy10 = _mm_sub_ps(iy1,jy0);
474	dz10 = _mm_sub_ps(iz1,jz0);
475	dx20 = _mm_sub_ps(ix2,jx0);
476	dy20 = _mm_sub_ps(iy2,jy0);
477	dz20 = _mm_sub_ps(iz2,jz0);
478
479	/* Calculate squared distance and things based on it */
480	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
481	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
482	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
483
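484	rinv00 = gmx_mm_invsqrt_ps(rsq00); /* gmx_mm_invsqrt_ps expands to: gmx_simd_invsqrt_f */
485	rinv10 = gmx_mm_invsqrt_ps(rsq10); /* gmx_mm_invsqrt_ps expands to: gmx_simd_invsqrt_f */
486	rinv20 = gmx_mm_invsqrt_ps(rsq20); /* gmx_mm_invsqrt_ps expands to: gmx_simd_invsqrt_f */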
487
488	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
489	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
490	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
491
492	/* Load parameters for j particles */
493	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
494	charge+jnrC+0,charge+jnrD+0);
495	vdwjidx0A = 2*vdwtype[jnrA+0];
496	vdwjidx0B = 2*vdwtype[jnrB+0];
497	vdwjidx0C = 2*vdwtype[jnrC+0];
498	vdwjidx0D = 2*vdwtype[jnrD+0];
499
500	fjx0 = _mm_setzero_ps();
501	fjy0 = _mm_setzero_ps();
502	fjz0 = _mm_setzero_ps();
503
504	/**************************
505	* CALCULATE INTERACTIONS *
506	**************************/
507
508	if (gmx_mm_any_lt(rsq00,rcutoff2))
509	{
510
511	r00 = _mm_mul_ps(rsq00,rinv00);
512	r00 = _mm_andnot_ps(dummy_mask,r00);
513
514	/* Compute parameters for interactions between i and j atoms */
515	qq00 = _mm_mul_ps(iq0,jq0);
516	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
517	vdwparam+vdwioffset0+vdwjidx0B,
518	vdwparam+vdwioffset0+vdwjidx0C,
519	vdwparam+vdwioffset0+vdwjidx0D,
520	&c6_00,&c12_00);
521
522	/* EWALD ELECTROSTATICS */
523
524	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
525	ewrt = _mm_mul_ps(r00,ewtabscale);
526	ewitab = _mm_cvttps_epi32(ewrt);
527	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)); /* _mm_round_ps(ewrt, _MM_FROUND_FLOOR) expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */
528	ewitab = _mm_slli_epi32(ewitab,2);
529	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) ); /* gmx_mm_extract_epi32(ewitab,0) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */
530	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) ); /* gmx_mm_extract_epi32(ewitab,1) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */
531	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) ); /* gmx_mm_extract_epi32(ewitab,2) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */
532	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) ); /* gmx_mm_extract_epi32(ewitab,3) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */
533	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn); /* _MM_TRANSPOSE4_PS(...) expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */
534	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
535	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
536	velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
537	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
538
539	/* LENNARD-JONES DISPERSION/REPULSION */
540
541	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
542	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
543	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
544	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
545	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
546	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
547
548	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
549
550	/* Update potential sum for this i atom from the interaction with this j atom. */
551	velec = _mm_and_ps(velec,cutoff_mask);
552	velec = _mm_andnot_ps(dummy_mask,velec);
553	velecsum = _mm_add_ps(velecsum,velec);
554	vvdw = _mm_and_ps(vvdw,cutoff_mask);
555	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
556	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
557
558	fscal = _mm_add_ps(felec,fvdw);
559
560	fscal = _mm_and_ps(fscal,cutoff_mask);
561
562	fscal = _mm_andnot_ps(dummy_mask,fscal);
563
564	/* Calculate temporary vectorial force */
565	tx = _mm_mul_ps(fscal,dx00);
566	ty = _mm_mul_ps(fscal,dy00);
567	tz = _mm_mul_ps(fscal,dz00);
568
569	/* Update vectorial force */
570	fix0 = _mm_add_ps(fix0,tx);
571	fiy0 = _mm_add_ps(fiy0,ty);
572	fiz0 = _mm_add_ps(fiz0,tz);
573
574	fjx0 = _mm_add_ps(fjx0,tx);
575	fjy0 = _mm_add_ps(fjy0,ty);
576	fjz0 = _mm_add_ps(fjz0,tz);
577
578	}
579
580	/**************************
581	* CALCULATE INTERACTIONS *
582	**************************/
583
584	if (gmx_mm_any_lt(rsq10,rcutoff2))
585	{
586
587	r10 = _mm_mul_ps(rsq10,rinv10);
588	r10 = _mm_andnot_ps(dummy_mask,r10);
589
590	/* Compute parameters for interactions between i and j atoms */
591	qq10 = _mm_mul_ps(iq1,jq0);
592
593	/* EWALD ELECTROSTATICS */
594
595	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
596	ewrt = _mm_mul_ps(r10,ewtabscale);
597	ewitab = _mm_cvttps_epi32(ewrt);
598	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)); /* _mm_round_ps(ewrt, _MM_FROUND_FLOOR) expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */
599	ewitab = _mm_slli_epi32(ewitab,2);
600	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) ); /* gmx_mm_extract_epi32(ewitab,0) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */
601	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) ); /* gmx_mm_extract_epi32(ewitab,1) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */
602	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) ); /* gmx_mm_extract_epi32(ewitab,2) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */
603	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) ); /* gmx_mm_extract_epi32(ewitab,3) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */
604	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn); /* _MM_TRANSPOSE4_PS(...) expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */
605	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
606	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
607	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
608	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
609
610	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
611
612	/* Update potential sum for this i atom from the interaction with this j atom. */
613	velec = _mm_and_ps(velec,cutoff_mask);
614	velec = _mm_andnot_ps(dummy_mask,velec);
615	velecsum = _mm_add_ps(velecsum,velec);
616
617	fscal = felec;
618
619	fscal = _mm_and_ps(fscal,cutoff_mask);
620
621	fscal = _mm_andnot_ps(dummy_mask,fscal);
622
623	/* Calculate temporary vectorial force */
624	tx = _mm_mul_ps(fscal,dx10);
625	ty = _mm_mul_ps(fscal,dy10);
626	tz = _mm_mul_ps(fscal,dz10);
627
628	/* Update vectorial force */
629	fix1 = _mm_add_ps(fix1,tx);
630	fiy1 = _mm_add_ps(fiy1,ty);
631	fiz1 = _mm_add_ps(fiz1,tz);
632
633	fjx0 = _mm_add_ps(fjx0,tx);
634	fjy0 = _mm_add_ps(fjy0,ty);
635	fjz0 = _mm_add_ps(fjz0,tz);
636
637	}
638
639	/**************************
640	* CALCULATE INTERACTIONS *
641	**************************/
642
643	if (gmx_mm_any_lt(rsq20,rcutoff2))
644	{
645
646	r20 = _mm_mul_ps(rsq20,rinv20);
647	r20 = _mm_andnot_ps(dummy_mask,r20);
648
649	/* Compute parameters for interactions between i and j atoms */
650	qq20 = _mm_mul_ps(iq2,jq0);
651
652	/* EWALD ELECTROSTATICS */
653
654	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
655	ewrt = _mm_mul_ps(r20,ewtabscale);
656	ewitab = _mm_cvttps_epi32(ewrt);
657	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)); /* _mm_round_ps(ewrt, _MM_FROUND_FLOOR) expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */
658	ewitab = _mm_slli_epi32(ewitab,2);
659	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) ); /* gmx_mm_extract_epi32(ewitab,0) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */
660	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) ); /* gmx_mm_extract_epi32(ewitab,1) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */
661	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) ); /* gmx_mm_extract_epi32(ewitab,2) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */
662	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) ); /* gmx_mm_extract_epi32(ewitab,3) expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */
663	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn); /* _MM_TRANSPOSE4_PS(...) expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */
664	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
665	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
666	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
667	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
668
669	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
670
671	/* Update potential sum for this i atom from the interaction with this j atom. */
672	velec = _mm_and_ps(velec,cutoff_mask);
673	velec = _mm_andnot_ps(dummy_mask,velec);
674	velecsum = _mm_add_ps(velecsum,velec);
675
676	fscal = felec;
677
678	fscal = _mm_and_ps(fscal,cutoff_mask);
679
680	fscal = _mm_andnot_ps(dummy_mask,fscal);
681
682	/* Calculate temporary vectorial force */
683	tx = _mm_mul_ps(fscal,dx20);
684	ty = _mm_mul_ps(fscal,dy20);
685	tz = _mm_mul_ps(fscal,dz20);
686
687	/* Update vectorial force */
688	fix2 = _mm_add_ps(fix2,tx);
689	fiy2 = _mm_add_ps(fiy2,ty);
690	fiz2 = _mm_add_ps(fiz2,tz);
691
692	fjx0 = _mm_add_ps(fjx0,tx);
693	fjy0 = _mm_add_ps(fjy0,ty);
694	fjz0 = _mm_add_ps(fjz0,tz);
695
696	}
697
698	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
699	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
700	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
701	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
702
703	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
704
705	/* Inner loop uses 159 flops */
706	}
707
708	/* End of innermost loop */
709
710	gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
711	f+i_coord_offset,fshift+i_shift_offset);
712
713	ggid = gid[iidx];
714	/* Update potential energies */
715	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
716	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
717
718	/* Increment number of inner iterations */
719	inneriter += j_index_end - j_index_start;
720
721	/* Outer loop uses 20 flops */
722	}
723
724	/* Increment number of outer iterations */
725	outeriter += nri;
726
727	/* Update outer/inner flops */
728
729	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*159); /* inc_nrnb(...) expands to: (nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3_VF] += outeriter*20 + inneriter*159 */
730	}
731	/*
732	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
733	* Electrostatics interaction: Ewald
734	* VdW interaction: LennardJones
735	* Geometry: Water3-Particle
736	* Calculate force/pot: Force
737	*/
738	void
739	nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
740	(t_nblist * gmx_restrict nlist,
741	rvec * gmx_restrict xx,
742	rvec * gmx_restrict ff,
743	t_forcerec * gmx_restrict fr,
744	t_mdatoms * gmx_restrict mdatoms,
745	nb_kernel_data_t gmx_unused * gmx_restrict kernel_data, /* gmx_unused expands to: __attribute__ ((unused)) */
746	t_nrnb * gmx_restrict nrnb)
747	{
748	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
749	* just 0 for non-waters.
750	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
751	* jnr indices corresponding to data put in the four positions in the SIMD register.
752	*/
753	int i_shift_offset,i_coord_offset,outeriter,inneriter;
754	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
755	int jnrA,jnrB,jnrC,jnrD;
756	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
757	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
758	int iinr,jindex,jjnr,shiftidx,*gid;
759	real rcutoff_scalar;
760	real shiftvec,fshift,x,f;
761	real fjptrA,fjptrB,fjptrC,fjptrD;
762	real scratch[4*DIM3];
763	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
764	int vdwioffset0;
765	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
766	int vdwioffset1;
767	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
768	int vdwioffset2;
769	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
770	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
771	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
772	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
773	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
774	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
775	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
776	real *charge;
777	int nvdwtype;
778	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
779	int *vdwtype;
780	real *vdwparam;
781	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
782	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
783	__m128i ewitab;
784	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
785	real *ewtab;
786	__m128 dummy_mask,cutoff_mask;
787	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
788	__m128 one = _mm_set1_ps(1.0);
789	__m128 two = _mm_set1_ps(2.0);
790	x = xx[0];
791	f = ff[0];
792
793	nri = nlist->nri;
794	iinr = nlist->iinr;
795	jindex = nlist->jindex;
796	jjnr = nlist->jjnr;
797	shiftidx = nlist->shift;
798	gid = nlist->gid;
	Value stored to 'gid' is never read
799	shiftvec = fr->shift_vec[0];
800	fshift = fr->fshift[0];
801	facel = _mm_set1_ps(fr->epsfac);
802	charge = mdatoms->chargeA;
803	nvdwtype = fr->ntype;
804	vdwparam = fr->nbfp;
805	vdwtype = mdatoms->typeA;
806
807	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
808	ewtab = fr->ic->tabq_coul_F;
809	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
810	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
811
812	/* Setup water-specific parameters */
813	inr = nlist->iinr[0];
814	iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
815	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
816	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
817	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
818
819	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
820	rcutoff_scalar = fr->rcoulomb;
821	rcutoff = _mm_set1_ps(rcutoff_scalar);
822	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
823
824	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
825	rvdw = _mm_set1_ps(fr->rvdw);
826
827	/* Avoid stupid compiler warnings */
828	jnrA = jnrB = jnrC = jnrD = 0;
829	j_coord_offsetA = 0;
830	j_coord_offsetB = 0;
831	j_coord_offsetC = 0;
832	j_coord_offsetD = 0;
833
834	outeriter = 0;
835	inneriter = 0;
836
837	for(iidx=0;iidx<4*DIM3;iidx++)
838	{
839	scratch[iidx] = 0.0;
840	}
841
842	/* Start outer loop over neighborlists */
843	for(iidx=0; iidx<nri; iidx++)
844	{
845	/* Load shift vector for this list */
846	i_shift_offset = DIM3*shiftidx[iidx];
847
848	/* Load limits for loop over neighbors */
849	j_index_start = jindex[iidx];
850	j_index_end = jindex[iidx+1];
851
852	/* Get outer coordinate index */
853	inr = iinr[iidx];
854	i_coord_offset = DIM3*inr;
855
856	/* Load i particle coords and add shift vector */
857	gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
858	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
859
860	fix0 = _mm_setzero_ps();
861	fiy0 = _mm_setzero_ps();
862	fiz0 = _mm_setzero_ps();
863	fix1 = _mm_setzero_ps();
864	fiy1 = _mm_setzero_ps();
865	fiz1 = _mm_setzero_ps();
866	fix2 = _mm_setzero_ps();
867	fiy2 = _mm_setzero_ps();
868	fiz2 = _mm_setzero_ps();
869
870	/* Start inner kernel loop */
871	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
872	{
873
874	/* Get j neighbor index, and coordinate index */
875	jnrA = jjnr[jidx];
876	jnrB = jjnr[jidx+1];
877	jnrC = jjnr[jidx+2];
878	jnrD = jjnr[jidx+3];
879	j_coord_offsetA = DIM3*jnrA;
880	j_coord_offsetB = DIM3*jnrB;
881	j_coord_offsetC = DIM3*jnrC;
882	j_coord_offsetD = DIM3*jnrD;
883
884	/* load j atom coordinates */
885	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
886	x+j_coord_offsetC,x+j_coord_offsetD,
887	&jx0,&jy0,&jz0);
888
889	/* Calculate displacement vector */
890	dx00 = _mm_sub_ps(ix0,jx0);
891	dy00 = _mm_sub_ps(iy0,jy0);
892	dz00 = _mm_sub_ps(iz0,jz0);
893	dx10 = _mm_sub_ps(ix1,jx0);
894	dy10 = _mm_sub_ps(iy1,jy0);
895	dz10 = _mm_sub_ps(iz1,jz0);
896	dx20 = _mm_sub_ps(ix2,jx0);
897	dy20 = _mm_sub_ps(iy2,jy0);
898	dz20 = _mm_sub_ps(iz2,jz0);
899
900	/* Calculate squared distance and things based on it */
901	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
902	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
903	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
904
905	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
906	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
907	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
908
909	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
910	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
911	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
912
913	/* Load parameters for j particles */
914	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
915	charge+jnrC+0,charge+jnrD+0);
916	vdwjidx0A = 2*vdwtype[jnrA+0];
917	vdwjidx0B = 2*vdwtype[jnrB+0];
918	vdwjidx0C = 2*vdwtype[jnrC+0];
919	vdwjidx0D = 2*vdwtype[jnrD+0];
920
921	fjx0 = _mm_setzero_ps();
922	fjy0 = _mm_setzero_ps();
923	fjz0 = _mm_setzero_ps();
924
925	/**************************
926	* CALCULATE INTERACTIONS *
927	**************************/
928
929	if (gmx_mm_any_lt(rsq00,rcutoff2))
930	{
931
932	r00 = _mm_mul_ps(rsq00,rinv00);
933
934	/* Compute parameters for interactions between i and j atoms */
935	qq00 = _mm_mul_ps(iq0,jq0);
936	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
937	vdwparam+vdwioffset0+vdwjidx0B,
938	vdwparam+vdwioffset0+vdwjidx0C,
939	vdwparam+vdwioffset0+vdwjidx0D,
940	&c6_00,&c12_00);
941
942	/* EWALD ELECTROSTATICS */
943
944	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
945	ewrt = _mm_mul_ps(r00,ewtabscale);
946	ewitab = _mm_cvttps_epi32(ewrt);
947	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
948	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
949	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
950	&ewtabF,&ewtabFn);
951	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
952	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
953
954	/* LENNARD-JONES DISPERSION/REPULSION */
955
956	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
957	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
958
959	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
960
961	fscal = _mm_add_ps(felec,fvdw);
962
963	fscal = _mm_and_ps(fscal,cutoff_mask);
964
965	/* Calculate temporary vectorial force */
966	tx = _mm_mul_ps(fscal,dx00);
967	ty = _mm_mul_ps(fscal,dy00);
968	tz = _mm_mul_ps(fscal,dz00);
969
970	/* Update vectorial force */
971	fix0 = _mm_add_ps(fix0,tx);
972	fiy0 = _mm_add_ps(fiy0,ty);
973	fiz0 = _mm_add_ps(fiz0,tz);
974
975	fjx0 = _mm_add_ps(fjx0,tx);
976	fjy0 = _mm_add_ps(fjy0,ty);
977	fjz0 = _mm_add_ps(fjz0,tz);
978
979	}
980
981	/**************************
982	* CALCULATE INTERACTIONS *
983	**************************/
984
985	if (gmx_mm_any_lt(rsq10,rcutoff2))
986	{
987
988	r10 = _mm_mul_ps(rsq10,rinv10);
989
990	/* Compute parameters for interactions between i and j atoms */
991	qq10 = _mm_mul_ps(iq1,jq0);
992
993	/* EWALD ELECTROSTATICS */
994
995	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
996	ewrt = _mm_mul_ps(r10,ewtabscale);
997	ewitab = _mm_cvttps_epi32(ewrt);
998	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
999	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1000	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1001	&ewtabF,&ewtabFn);
1002	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1003	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1004
1005	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1006
1007	fscal = felec;
1008
1009	fscal = _mm_and_ps(fscal,cutoff_mask);
1010
1011	/* Calculate temporary vectorial force */
1012	tx = _mm_mul_ps(fscal,dx10);
1013	ty = _mm_mul_ps(fscal,dy10);
1014	tz = _mm_mul_ps(fscal,dz10);
1015
1016	/* Update vectorial force */
1017	fix1 = _mm_add_ps(fix1,tx);
1018	fiy1 = _mm_add_ps(fiy1,ty);
1019	fiz1 = _mm_add_ps(fiz1,tz);
1020
1021	fjx0 = _mm_add_ps(fjx0,tx);
1022	fjy0 = _mm_add_ps(fjy0,ty);
1023	fjz0 = _mm_add_ps(fjz0,tz);
1024
1025	}
1026
1027	/**************************
1028	* CALCULATE INTERACTIONS *
1029	**************************/
1030
1031	if (gmx_mm_any_lt(rsq20,rcutoff2))
1032	{
1033
1034	r20 = _mm_mul_ps(rsq20,rinv20);
1035
1036	/* Compute parameters for interactions between i and j atoms */
1037	qq20 = _mm_mul_ps(iq2,jq0);
1038
1039	/* EWALD ELECTROSTATICS */
1040
1041	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1042	ewrt = _mm_mul_ps(r20,ewtabscale);
1043	ewitab = _mm_cvttps_epi32(ewrt);
1044	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1045	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1046	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1047	&ewtabF,&ewtabFn);
1048	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1049	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1050
1051	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1052
1053	fscal = felec;
1054
1055	fscal = _mm_and_ps(fscal,cutoff_mask);
1056
1057	/* Calculate temporary vectorial force */
1058	tx = _mm_mul_ps(fscal,dx20);
1059	ty = _mm_mul_ps(fscal,dy20);
1060	tz = _mm_mul_ps(fscal,dz20);
1061
1062	/* Update vectorial force */
1063	fix2 = _mm_add_ps(fix2,tx);
1064	fiy2 = _mm_add_ps(fiy2,ty);
1065	fiz2 = _mm_add_ps(fiz2,tz);
1066
1067	fjx0 = _mm_add_ps(fjx0,tx);
1068	fjy0 = _mm_add_ps(fjy0,ty);
1069	fjz0 = _mm_add_ps(fjz0,tz);
1070
1071	}
1072
1073	fjptrA = f+j_coord_offsetA;
1074	fjptrB = f+j_coord_offsetB;
1075	fjptrC = f+j_coord_offsetC;
1076	fjptrD = f+j_coord_offsetD;
1077
1078	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1079
1080	/* Inner loop uses 124 flops */
1081	}
1082
1083	if(jidx<j_index_end)
1084	{
1085
1086	/* Get j neighbor index, and coordinate index */
1087	jnrlistA = jjnr[jidx];
1088	jnrlistB = jjnr[jidx+1];
1089	jnrlistC = jjnr[jidx+2];
1090	jnrlistD = jjnr[jidx+3];
1091	/* Sign of each element will be negative for non-real atoms.
1092	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1093	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1094	*/
1095	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1096	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1097	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1098	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1099	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1100	j_coord_offsetA = DIM3*jnrA;
1101	j_coord_offsetB = DIM3*jnrB;
1102	j_coord_offsetC = DIM3*jnrC;
1103	j_coord_offsetD = DIM3*jnrD;
1104
1105	/* load j atom coordinates */
1106	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1107	x+j_coord_offsetC,x+j_coord_offsetD,
1108	&jx0,&jy0,&jz0);
1109
1110	/* Calculate displacement vector */
1111	dx00 = _mm_sub_ps(ix0,jx0);
1112	dy00 = _mm_sub_ps(iy0,jy0);
1113	dz00 = _mm_sub_ps(iz0,jz0);
1114	dx10 = _mm_sub_ps(ix1,jx0);
1115	dy10 = _mm_sub_ps(iy1,jy0);
1116	dz10 = _mm_sub_ps(iz1,jz0);
1117	dx20 = _mm_sub_ps(ix2,jx0);
1118	dy20 = _mm_sub_ps(iy2,jy0);
1119	dz20 = _mm_sub_ps(iz2,jz0);
1120
1121	/* Calculate squared distance and things based on it */
1122	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1123	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1124	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1125
1126	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1127	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1128	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1129
1130	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1131	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1132	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1133
1134	/* Load parameters for j particles */
1135	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1136	charge+jnrC+0,charge+jnrD+0);
1137	vdwjidx0A = 2*vdwtype[jnrA+0];
1138	vdwjidx0B = 2*vdwtype[jnrB+0];
1139	vdwjidx0C = 2*vdwtype[jnrC+0];
1140	vdwjidx0D = 2*vdwtype[jnrD+0];
1141
1142	fjx0 = _mm_setzero_ps();
1143	fjy0 = _mm_setzero_ps();
1144	fjz0 = _mm_setzero_ps();
1145
1146	/**************************
1147	* CALCULATE INTERACTIONS *
1148	**************************/
1149
1150	if (gmx_mm_any_lt(rsq00,rcutoff2))
1151	{
1152
1153	r00 = _mm_mul_ps(rsq00,rinv00);
1154	r00 = _mm_andnot_ps(dummy_mask,r00);
1155
1156	/* Compute parameters for interactions between i and j atoms */
1157	qq00 = _mm_mul_ps(iq0,jq0);
1158	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1159	vdwparam+vdwioffset0+vdwjidx0B,
1160	vdwparam+vdwioffset0+vdwjidx0C,
1161	vdwparam+vdwioffset0+vdwjidx0D,
1162	&c6_00,&c12_00);
1163
1164	/* EWALD ELECTROSTATICS */
1165
1166	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1167	ewrt = _mm_mul_ps(r00,ewtabscale);
1168	ewitab = _mm_cvttps_epi32(ewrt);
1169	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1170	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1171	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1172	&ewtabF,&ewtabFn);
1173	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1174	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1175
1176	/* LENNARD-JONES DISPERSION/REPULSION */
1177
1178	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1179	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1180
1181	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1182
1183	fscal = _mm_add_ps(felec,fvdw);
1184
1185	fscal = _mm_and_ps(fscal,cutoff_mask);
1186
1187	fscal = _mm_andnot_ps(dummy_mask,fscal);
1188
1189	/* Calculate temporary vectorial force */
1190	tx = _mm_mul_ps(fscal,dx00);
1191	ty = _mm_mul_ps(fscal,dy00);
1192	tz = _mm_mul_ps(fscal,dz00);
1193
1194	/* Update vectorial force */
1195	fix0 = _mm_add_ps(fix0,tx);
1196	fiy0 = _mm_add_ps(fiy0,ty);
1197	fiz0 = _mm_add_ps(fiz0,tz);
1198
1199	fjx0 = _mm_add_ps(fjx0,tx);
1200	fjy0 = _mm_add_ps(fjy0,ty);
1201	fjz0 = _mm_add_ps(fjz0,tz);
1202
1203	}
1204
1205	/**************************
1206	* CALCULATE INTERACTIONS *
1207	**************************/
1208
1209	if (gmx_mm_any_lt(rsq10,rcutoff2))
1210	{
1211
1212	r10 = _mm_mul_ps(rsq10,rinv10);
1213	r10 = _mm_andnot_ps(dummy_mask,r10);
1214
1215	/* Compute parameters for interactions between i and j atoms */
1216	qq10 = _mm_mul_ps(iq1,jq0);
1217
1218	/* EWALD ELECTROSTATICS */
1219
1220	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1221	ewrt = _mm_mul_ps(r10,ewtabscale);
1222	ewitab = _mm_cvttps_epi32(ewrt);
1223	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1224	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1225	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1226	&ewtabF,&ewtabFn);
1227	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1228	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1229
1230	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1231
1232	fscal = felec;
1233
1234	fscal = _mm_and_ps(fscal,cutoff_mask);
1235
1236	fscal = _mm_andnot_ps(dummy_mask,fscal);
1237
1238	/* Calculate temporary vectorial force */
1239	tx = _mm_mul_ps(fscal,dx10);
1240	ty = _mm_mul_ps(fscal,dy10);
1241	tz = _mm_mul_ps(fscal,dz10);
1242
1243	/* Update vectorial force */
1244	fix1 = _mm_add_ps(fix1,tx);
1245	fiy1 = _mm_add_ps(fiy1,ty);
1246	fiz1 = _mm_add_ps(fiz1,tz);
1247
1248	fjx0 = _mm_add_ps(fjx0,tx);
1249	fjy0 = _mm_add_ps(fjy0,ty);
1250	fjz0 = _mm_add_ps(fjz0,tz);
1251
1252	}
1253
1254	/**************************
1255	* CALCULATE INTERACTIONS *
1256	**************************/
1257
1258	if (gmx_mm_any_lt(rsq20,rcutoff2))
1259	{
1260
1261	r20 = _mm_mul_ps(rsq20,rinv20);
1262	r20 = _mm_andnot_ps(dummy_mask,r20);
1263
1264	/* Compute parameters for interactions between i and j atoms */
1265	qq20 = _mm_mul_ps(iq2,jq0);
1266
1267	/* EWALD ELECTROSTATICS */
1268
1269	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1270	ewrt = _mm_mul_ps(r20,ewtabscale);
1271	ewitab = _mm_cvttps_epi32(ewrt);
1272	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1273	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1274	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1275	&ewtabF,&ewtabFn);
1276	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1277	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1278
1279	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1280
1281	fscal = felec;
1282
1283	fscal = _mm_and_ps(fscal,cutoff_mask);
1284
1285	fscal = _mm_andnot_ps(dummy_mask,fscal);
1286
1287	/* Calculate temporary vectorial force */
1288	tx = _mm_mul_ps(fscal,dx20);
1289	ty = _mm_mul_ps(fscal,dy20);
1290	tz = _mm_mul_ps(fscal,dz20);
1291
1292	/* Update vectorial force */
1293	fix2 = _mm_add_ps(fix2,tx);
1294	fiy2 = _mm_add_ps(fiy2,ty);
1295	fiz2 = _mm_add_ps(fiz2,tz);
1296
1297	fjx0 = _mm_add_ps(fjx0,tx);
1298	fjy0 = _mm_add_ps(fjy0,ty);
1299	fjz0 = _mm_add_ps(fjz0,tz);
1300
1301	}
1302
1303	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1304	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1305	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1306	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1307
1308	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1309
1310	/* Inner loop uses 127 flops */
1311	}
1312
1313	/* End of innermost loop */
1314
1315	gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1316	f+i_coord_offset,fshift+i_shift_offset);
1317
1318	/* Increment number of inner iterations */
1319	inneriter += j_index_end - j_index_start;
1320
1321	/* Outer loop uses 18 flops */
1322	}
1323
1324	/* Increment number of outer iterations */
1325	outeriter += nri;
1326
1327	/* Update outer/inner flops */
1328
1329	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter18 + inneriter127)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3_F] += outeriter18 + inneriter 127;
1330	}