/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse4_1_single.c

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse4_1_single.c
Location:	line 912, column 5
Description:	Value stored to 'rvdw' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H /* expands to: 1 */
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
54	* Electrostatics interaction: Ewald
55	* VdW interaction: LennardJones
56	* Geometry: Water4-Particle
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused /* expands to: __attribute__ ((unused)) */ * gmx_restrict kernel_data,
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80	real rcutoff_scalar;
81	real *shiftvec,*fshift,*x,*f;
82	real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83	real scratch[4*DIM /* expands to: 3 */];
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwioffset1;
88	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89	int vdwioffset2;
90	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91	int vdwioffset3;
92	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
100	real *charge;
101	int nvdwtype;
102	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103	int *vdwtype;
104	real *vdwparam;
105	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
106	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
107	__m128i ewitab;
108	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
109	real *ewtab;
110	__m128 dummy_mask,cutoff_mask;
111	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
112	__m128 one = _mm_set1_ps(1.0);
113	__m128 two = _mm_set1_ps(2.0);
114	x = xx[0];
115	f = ff[0];
116
117	nri = nlist->nri;
118	iinr = nlist->iinr;
119	jindex = nlist->jindex;
120	jjnr = nlist->jjnr;
121	shiftidx = nlist->shift;
122	gid = nlist->gid;
123	shiftvec = fr->shift_vec[0];
124	fshift = fr->fshift[0];
125	facel = _mm_set1_ps(fr->epsfac);
126	charge = mdatoms->chargeA;
127	nvdwtype = fr->ntype;
128	vdwparam = fr->nbfp;
129	vdwtype = mdatoms->typeA;
130
131	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
132	ewtab = fr->ic->tabq_coul_FDV0;
133	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
134	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
135
136	/* Setup water-specific parameters */
137	inr = nlist->iinr[0];
138	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
139	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
140	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
141	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
142
143	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
144	rcutoff_scalar = fr->rcoulomb;
145	rcutoff = _mm_set1_ps(rcutoff_scalar);
146	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
147
148	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
149	rvdw = _mm_set1_ps(fr->rvdw);
150
151	/* Avoid stupid compiler warnings */
152	jnrA = jnrB = jnrC = jnrD = 0;
153	j_coord_offsetA = 0;
154	j_coord_offsetB = 0;
155	j_coord_offsetC = 0;
156	j_coord_offsetD = 0;
157
158	outeriter = 0;
159	inneriter = 0;
160
161	for(iidx=0;iidx<4*DIM /* expands to: 3 */;iidx++)
162	{
163	scratch[iidx] = 0.0;
164	}
165
166	/* Start outer loop over neighborlists */
167	for(iidx=0; iidx<nri; iidx++)
168	{
169	/* Load shift vector for this list */
170	i_shift_offset = DIM /* expands to: 3 */ *shiftidx[iidx];
171
172	/* Load limits for loop over neighbors */
173	j_index_start = jindex[iidx];
174	j_index_end = jindex[iidx+1];
175
176	/* Get outer coordinate index */
177	inr = iinr[iidx];
178	i_coord_offset = DIM /* expands to: 3 */ *inr;
179
180	/* Load i particle coords and add shift vector */
181	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
182	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
183
184	fix0 = _mm_setzero_ps();
185	fiy0 = _mm_setzero_ps();
186	fiz0 = _mm_setzero_ps();
187	fix1 = _mm_setzero_ps();
188	fiy1 = _mm_setzero_ps();
189	fiz1 = _mm_setzero_ps();
190	fix2 = _mm_setzero_ps();
191	fiy2 = _mm_setzero_ps();
192	fiz2 = _mm_setzero_ps();
193	fix3 = _mm_setzero_ps();
194	fiy3 = _mm_setzero_ps();
195	fiz3 = _mm_setzero_ps();
196
197	/* Reset potential sums */
198	velecsum = _mm_setzero_ps();
199	vvdwsum = _mm_setzero_ps();
200
201	/* Start inner kernel loop */
202	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
203	{
204
205	/* Get j neighbor index, and coordinate index */
206	jnrA = jjnr[jidx];
207	jnrB = jjnr[jidx+1];
208	jnrC = jjnr[jidx+2];
209	jnrD = jjnr[jidx+3];
210	j_coord_offsetA = DIM /* expands to: 3 */ *jnrA;
211	j_coord_offsetB = DIM /* expands to: 3 */ *jnrB;
212	j_coord_offsetC = DIM /* expands to: 3 */ *jnrC;
213	j_coord_offsetD = DIM /* expands to: 3 */ *jnrD;
214
215	/* load j atom coordinates */
216	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
217	x+j_coord_offsetC,x+j_coord_offsetD,
218	&jx0,&jy0,&jz0);
219
220	/* Calculate displacement vector */
221	dx00 = _mm_sub_ps(ix0,jx0);
222	dy00 = _mm_sub_ps(iy0,jy0);
223	dz00 = _mm_sub_ps(iz0,jz0);
224	dx10 = _mm_sub_ps(ix1,jx0);
225	dy10 = _mm_sub_ps(iy1,jy0);
226	dz10 = _mm_sub_ps(iz1,jz0);
227	dx20 = _mm_sub_ps(ix2,jx0);
228	dy20 = _mm_sub_ps(iy2,jy0);
229	dz20 = _mm_sub_ps(iz2,jz0);
230	dx30 = _mm_sub_ps(ix3,jx0);
231	dy30 = _mm_sub_ps(iy3,jy0);
232	dz30 = _mm_sub_ps(iz3,jz0);
233
234	/* Calculate squared distance and things based on it */
235	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
236	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
237	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
238	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
239
240	rinv10 = gmx_mm_invsqrt_ps /* expands to: gmx_simd_invsqrt_f */ (rsq10);
241	rinv20 = gmx_mm_invsqrt_ps /* expands to: gmx_simd_invsqrt_f */ (rsq20);
242	rinv30 = gmx_mm_invsqrt_ps /* expands to: gmx_simd_invsqrt_f */ (rsq30);
243
244	rinvsq00 = gmx_mm_inv_ps /* expands to: gmx_simd_inv_f */ (rsq00);
245	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
246	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
247	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
248
249	/* Load parameters for j particles */
250	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
251	charge+jnrC+0,charge+jnrD+0);
252	vdwjidx0A = 2*vdwtype[jnrA+0];
253	vdwjidx0B = 2*vdwtype[jnrB+0];
254	vdwjidx0C = 2*vdwtype[jnrC+0];
255	vdwjidx0D = 2*vdwtype[jnrD+0];
256
257	fjx0 = _mm_setzero_ps();
258	fjy0 = _mm_setzero_ps();
259	fjz0 = _mm_setzero_ps();
260
261	/**************************
262	* CALCULATE INTERACTIONS *
263	**************************/
264
265	if (gmx_mm_any_lt(rsq00,rcutoff2))
266	{
267
268	/* Compute parameters for interactions between i and j atoms */
269	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
270	vdwparam+vdwioffset0+vdwjidx0B,
271	vdwparam+vdwioffset0+vdwjidx0C,
272	vdwparam+vdwioffset0+vdwjidx0D,
273	&c6_00,&c12_00);
274
275	/* LENNARD-JONES DISPERSION/REPULSION */
276
277	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
278	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
279	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
280	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
281	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
282	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
283
284	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
285
286	/* Update potential sum for this i atom from the interaction with this j atom. */
287	vvdw = _mm_and_ps(vvdw,cutoff_mask);
288	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
289
290	fscal = fvdw;
291
292	fscal = _mm_and_ps(fscal,cutoff_mask);
293
294	/* Calculate temporary vectorial force */
295	tx = _mm_mul_ps(fscal,dx00);
296	ty = _mm_mul_ps(fscal,dy00);
297	tz = _mm_mul_ps(fscal,dz00);
298
299	/* Update vectorial force */
300	fix0 = _mm_add_ps(fix0,tx);
301	fiy0 = _mm_add_ps(fiy0,ty);
302	fiz0 = _mm_add_ps(fiz0,tz);
303
304	fjx0 = _mm_add_ps(fjx0,tx);
305	fjy0 = _mm_add_ps(fjy0,ty);
306	fjz0 = _mm_add_ps(fjz0,tz);
307
308	}
309
310	/**************************
311	* CALCULATE INTERACTIONS *
312	**************************/
313
314	if (gmx_mm_any_lt(rsq10,rcutoff2))
315	{
316
317	r10 = _mm_mul_ps(rsq10,rinv10);
318
319	/* Compute parameters for interactions between i and j atoms */
320	qq10 = _mm_mul_ps(iq1,jq0);
321
322	/* EWALD ELECTROSTATICS */
323
324	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
325	ewrt = _mm_mul_ps(r10,ewtabscale);
326	ewitab = _mm_cvttps_epi32(ewrt);
327	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR) /* expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */);
328	ewitab = _mm_slli_epi32(ewitab,2);
329	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */ );
330	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */ );
331	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */ );
332	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */ );
333	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn) /* expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */;
334	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
335	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
336	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
337	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
338
339	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
340
341	/* Update potential sum for this i atom from the interaction with this j atom. */
342	velec = _mm_and_ps(velec,cutoff_mask);
343	velecsum = _mm_add_ps(velecsum,velec);
344
345	fscal = felec;
346
347	fscal = _mm_and_ps(fscal,cutoff_mask);
348
349	/* Calculate temporary vectorial force */
350	tx = _mm_mul_ps(fscal,dx10);
351	ty = _mm_mul_ps(fscal,dy10);
352	tz = _mm_mul_ps(fscal,dz10);
353
354	/* Update vectorial force */
355	fix1 = _mm_add_ps(fix1,tx);
356	fiy1 = _mm_add_ps(fiy1,ty);
357	fiz1 = _mm_add_ps(fiz1,tz);
358
359	fjx0 = _mm_add_ps(fjx0,tx);
360	fjy0 = _mm_add_ps(fjy0,ty);
361	fjz0 = _mm_add_ps(fjz0,tz);
362
363	}
364
365	/**************************
366	* CALCULATE INTERACTIONS *
367	**************************/
368
369	if (gmx_mm_any_lt(rsq20,rcutoff2))
370	{
371
372	r20 = _mm_mul_ps(rsq20,rinv20);
373
374	/* Compute parameters for interactions between i and j atoms */
375	qq20 = _mm_mul_ps(iq2,jq0);
376
377	/* EWALD ELECTROSTATICS */
378
379	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
380	ewrt = _mm_mul_ps(r20,ewtabscale);
381	ewitab = _mm_cvttps_epi32(ewrt);
382	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR) /* expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */);
383	ewitab = _mm_slli_epi32(ewitab,2);
384	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */ );
385	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */ );
386	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */ );
387	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */ );
388	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn) /* expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */;
389	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
390	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
391	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
392	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
393
394	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
395
396	/* Update potential sum for this i atom from the interaction with this j atom. */
397	velec = _mm_and_ps(velec,cutoff_mask);
398	velecsum = _mm_add_ps(velecsum,velec);
399
400	fscal = felec;
401
402	fscal = _mm_and_ps(fscal,cutoff_mask);
403
404	/* Calculate temporary vectorial force */
405	tx = _mm_mul_ps(fscal,dx20);
406	ty = _mm_mul_ps(fscal,dy20);
407	tz = _mm_mul_ps(fscal,dz20);
408
409	/* Update vectorial force */
410	fix2 = _mm_add_ps(fix2,tx);
411	fiy2 = _mm_add_ps(fiy2,ty);
412	fiz2 = _mm_add_ps(fiz2,tz);
413
414	fjx0 = _mm_add_ps(fjx0,tx);
415	fjy0 = _mm_add_ps(fjy0,ty);
416	fjz0 = _mm_add_ps(fjz0,tz);
417
418	}
419
420	/**************************
421	* CALCULATE INTERACTIONS *
422	**************************/
423
424	if (gmx_mm_any_lt(rsq30,rcutoff2))
425	{
426
427	r30 = _mm_mul_ps(rsq30,rinv30);
428
429	/* Compute parameters for interactions between i and j atoms */
430	qq30 = _mm_mul_ps(iq3,jq0);
431
432	/* EWALD ELECTROSTATICS */
433
434	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
435	ewrt = _mm_mul_ps(r30,ewtabscale);
436	ewitab = _mm_cvttps_epi32(ewrt);
437	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR) /* expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */);
438	ewitab = _mm_slli_epi32(ewitab,2);
439	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */ );
440	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */ );
441	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */ );
442	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */ );
443	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn) /* expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */;
444	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
445	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
446	velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
447	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
448
449	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
450
451	/* Update potential sum for this i atom from the interaction with this j atom. */
452	velec = _mm_and_ps(velec,cutoff_mask);
453	velecsum = _mm_add_ps(velecsum,velec);
454
455	fscal = felec;
456
457	fscal = _mm_and_ps(fscal,cutoff_mask);
458
459	/* Calculate temporary vectorial force */
460	tx = _mm_mul_ps(fscal,dx30);
461	ty = _mm_mul_ps(fscal,dy30);
462	tz = _mm_mul_ps(fscal,dz30);
463
464	/* Update vectorial force */
465	fix3 = _mm_add_ps(fix3,tx);
466	fiy3 = _mm_add_ps(fiy3,ty);
467	fiz3 = _mm_add_ps(fiz3,tz);
468
469	fjx0 = _mm_add_ps(fjx0,tx);
470	fjy0 = _mm_add_ps(fjy0,ty);
471	fjz0 = _mm_add_ps(fjz0,tz);
472
473	}
474
475	fjptrA = f+j_coord_offsetA;
476	fjptrB = f+j_coord_offsetB;
477	fjptrC = f+j_coord_offsetC;
478	fjptrD = f+j_coord_offsetD;
479
480	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
481
482	/* Inner loop uses 179 flops */
483	}
484
485	if(jidx<j_index_end)
486	{
487
488	/* Get j neighbor index, and coordinate index */
489	jnrlistA = jjnr[jidx];
490	jnrlistB = jjnr[jidx+1];
491	jnrlistC = jjnr[jidx+2];
492	jnrlistD = jjnr[jidx+3];
493	/* Sign of each element will be negative for non-real atoms.
494	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
495	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
496	*/
497	dummy_mask = gmx_mm_castsi128_ps /* expands to: _mm_castsi128_ps */ (_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
498	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
499	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
500	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
501	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
502	j_coord_offsetA = DIM /* expands to: 3 */ *jnrA;
503	j_coord_offsetB = DIM /* expands to: 3 */ *jnrB;
504	j_coord_offsetC = DIM /* expands to: 3 */ *jnrC;
505	j_coord_offsetD = DIM /* expands to: 3 */ *jnrD;
506
507	/* load j atom coordinates */
508	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
509	x+j_coord_offsetC,x+j_coord_offsetD,
510	&jx0,&jy0,&jz0);
511
512	/* Calculate displacement vector */
513	dx00 = _mm_sub_ps(ix0,jx0);
514	dy00 = _mm_sub_ps(iy0,jy0);
515	dz00 = _mm_sub_ps(iz0,jz0);
516	dx10 = _mm_sub_ps(ix1,jx0);
517	dy10 = _mm_sub_ps(iy1,jy0);
518	dz10 = _mm_sub_ps(iz1,jz0);
519	dx20 = _mm_sub_ps(ix2,jx0);
520	dy20 = _mm_sub_ps(iy2,jy0);
521	dz20 = _mm_sub_ps(iz2,jz0);
522	dx30 = _mm_sub_ps(ix3,jx0);
523	dy30 = _mm_sub_ps(iy3,jy0);
524	dz30 = _mm_sub_ps(iz3,jz0);
525
526	/* Calculate squared distance and things based on it */
527	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
528	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
529	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
530	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
531
532	rinv10 = gmx_mm_invsqrt_ps /* expands to: gmx_simd_invsqrt_f */ (rsq10);
533	rinv20 = gmx_mm_invsqrt_ps /* expands to: gmx_simd_invsqrt_f */ (rsq20);
534	rinv30 = gmx_mm_invsqrt_ps /* expands to: gmx_simd_invsqrt_f */ (rsq30);
535
536	rinvsq00 = gmx_mm_inv_ps /* expands to: gmx_simd_inv_f */ (rsq00);
537	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
538	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
539	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
540
541	/* Load parameters for j particles */
542	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
543	charge+jnrC+0,charge+jnrD+0);
544	vdwjidx0A = 2*vdwtype[jnrA+0];
545	vdwjidx0B = 2*vdwtype[jnrB+0];
546	vdwjidx0C = 2*vdwtype[jnrC+0];
547	vdwjidx0D = 2*vdwtype[jnrD+0];
548
549	fjx0 = _mm_setzero_ps();
550	fjy0 = _mm_setzero_ps();
551	fjz0 = _mm_setzero_ps();
552
553	/**************************
554	* CALCULATE INTERACTIONS *
555	**************************/
556
557	if (gmx_mm_any_lt(rsq00,rcutoff2))
558	{
559
560	/* Compute parameters for interactions between i and j atoms */
561	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
562	vdwparam+vdwioffset0+vdwjidx0B,
563	vdwparam+vdwioffset0+vdwjidx0C,
564	vdwparam+vdwioffset0+vdwjidx0D,
565	&c6_00,&c12_00);
566
567	/* LENNARD-JONES DISPERSION/REPULSION */
568
569	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
570	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
571	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
572	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
573	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
574	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
575
576	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
577
578	/* Update potential sum for this i atom from the interaction with this j atom. */
579	vvdw = _mm_and_ps(vvdw,cutoff_mask);
580	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
581	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
582
583	fscal = fvdw;
584
585	fscal = _mm_and_ps(fscal,cutoff_mask);
586
587	fscal = _mm_andnot_ps(dummy_mask,fscal);
588
589	/* Calculate temporary vectorial force */
590	tx = _mm_mul_ps(fscal,dx00);
591	ty = _mm_mul_ps(fscal,dy00);
592	tz = _mm_mul_ps(fscal,dz00);
593
594	/* Update vectorial force */
595	fix0 = _mm_add_ps(fix0,tx);
596	fiy0 = _mm_add_ps(fiy0,ty);
597	fiz0 = _mm_add_ps(fiz0,tz);
598
599	fjx0 = _mm_add_ps(fjx0,tx);
600	fjy0 = _mm_add_ps(fjy0,ty);
601	fjz0 = _mm_add_ps(fjz0,tz);
602
603	}
604
605	/**************************
606	* CALCULATE INTERACTIONS *
607	**************************/
608
609	if (gmx_mm_any_lt(rsq10,rcutoff2))
610	{
611
612	r10 = _mm_mul_ps(rsq10,rinv10);
613	r10 = _mm_andnot_ps(dummy_mask,r10);
614
615	/* Compute parameters for interactions between i and j atoms */
616	qq10 = _mm_mul_ps(iq1,jq0);
617
618	/* EWALD ELECTROSTATICS */
619
620	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
621	ewrt = _mm_mul_ps(r10,ewtabscale);
622	ewitab = _mm_cvttps_epi32(ewrt);
623	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR) /* expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */);
624	ewitab = _mm_slli_epi32(ewitab,2);
625	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */ );
626	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */ );
627	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */ );
628	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */ );
629	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn) /* expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */;
630	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
631	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
632	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
633	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
634
635	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
636
637	/* Update potential sum for this i atom from the interaction with this j atom. */
638	velec = _mm_and_ps(velec,cutoff_mask);
639	velec = _mm_andnot_ps(dummy_mask,velec);
640	velecsum = _mm_add_ps(velecsum,velec);
641
642	fscal = felec;
643
644	fscal = _mm_and_ps(fscal,cutoff_mask);
645
646	fscal = _mm_andnot_ps(dummy_mask,fscal);
647
648	/* Calculate temporary vectorial force */
649	tx = _mm_mul_ps(fscal,dx10);
650	ty = _mm_mul_ps(fscal,dy10);
651	tz = _mm_mul_ps(fscal,dz10);
652
653	/* Update vectorial force */
654	fix1 = _mm_add_ps(fix1,tx);
655	fiy1 = _mm_add_ps(fiy1,ty);
656	fiz1 = _mm_add_ps(fiz1,tz);
657
658	fjx0 = _mm_add_ps(fjx0,tx);
659	fjy0 = _mm_add_ps(fjy0,ty);
660	fjz0 = _mm_add_ps(fjz0,tz);
661
662	}
663
664	/**************************
665	* CALCULATE INTERACTIONS *
666	**************************/
667
668	if (gmx_mm_any_lt(rsq20,rcutoff2))
669	{
670
671	r20 = _mm_mul_ps(rsq20,rinv20);
672	r20 = _mm_andnot_ps(dummy_mask,r20);
673
674	/* Compute parameters for interactions between i and j atoms */
675	qq20 = _mm_mul_ps(iq2,jq0);
676
677	/* EWALD ELECTROSTATICS */
678
679	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
680	ewrt = _mm_mul_ps(r20,ewtabscale);
681	ewitab = _mm_cvttps_epi32(ewrt);
682	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR) /* expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */);
683	ewitab = _mm_slli_epi32(ewitab,2);
684	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */ );
685	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */ );
686	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */ );
687	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */ );
688	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn) /* expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */;
689	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
690	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
691	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
692	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
693
694	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
695
696	/* Update potential sum for this i atom from the interaction with this j atom. */
697	velec = _mm_and_ps(velec,cutoff_mask);
698	velec = _mm_andnot_ps(dummy_mask,velec);
699	velecsum = _mm_add_ps(velecsum,velec);
700
701	fscal = felec;
702
703	fscal = _mm_and_ps(fscal,cutoff_mask);
704
705	fscal = _mm_andnot_ps(dummy_mask,fscal);
706
707	/* Calculate temporary vectorial force */
708	tx = _mm_mul_ps(fscal,dx20);
709	ty = _mm_mul_ps(fscal,dy20);
710	tz = _mm_mul_ps(fscal,dz20);
711
712	/* Update vectorial force */
713	fix2 = _mm_add_ps(fix2,tx);
714	fiy2 = _mm_add_ps(fiy2,ty);
715	fiz2 = _mm_add_ps(fiz2,tz);
716
717	fjx0 = _mm_add_ps(fjx0,tx);
718	fjy0 = _mm_add_ps(fjy0,ty);
719	fjz0 = _mm_add_ps(fjz0,tz);
720
721	}
722
723	/**************************
724	* CALCULATE INTERACTIONS *
725	**************************/
726
727	if (gmx_mm_any_lt(rsq30,rcutoff2))
728	{
729
730	r30 = _mm_mul_ps(rsq30,rinv30);
731	r30 = _mm_andnot_ps(dummy_mask,r30);
732
733	/* Compute parameters for interactions between i and j atoms */
734	qq30 = _mm_mul_ps(iq3,jq0);
735
736	/* EWALD ELECTROSTATICS */
737
738	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
739	ewrt = _mm_mul_ps(r30,ewtabscale);
740	ewitab = _mm_cvttps_epi32(ewrt);
741	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR) /* expands to: __extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }) */);
742	ewitab = _mm_slli_epi32(ewitab,2);
743	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) */ );
744	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) */ );
745	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) */ );
746	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) /* expands to: (__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) */ );
747	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn) /* expands to: do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0) */;
748	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
749	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
750	velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
751	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
752
753	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
754
755	/* Update potential sum for this i atom from the interaction with this j atom. */
756	velec = _mm_and_ps(velec,cutoff_mask);
757	velec = _mm_andnot_ps(dummy_mask,velec);
758	velecsum = _mm_add_ps(velecsum,velec);
759
760	fscal = felec;
761
762	fscal = _mm_and_ps(fscal,cutoff_mask);
763
764	fscal = _mm_andnot_ps(dummy_mask,fscal);
765
766	/* Calculate temporary vectorial force */
767	tx = _mm_mul_ps(fscal,dx30);
768	ty = _mm_mul_ps(fscal,dy30);
769	tz = _mm_mul_ps(fscal,dz30);
770
771	/* Update vectorial force */
772	fix3 = _mm_add_ps(fix3,tx);
773	fiy3 = _mm_add_ps(fiy3,ty);
774	fiz3 = _mm_add_ps(fiz3,tz);
775
776	fjx0 = _mm_add_ps(fjx0,tx);
777	fjy0 = _mm_add_ps(fjy0,ty);
778	fjz0 = _mm_add_ps(fjz0,tz);
779
780	}
781
782	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
783	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
784	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
785	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
786
787	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
788
789	/* Inner loop uses 182 flops */
790	}
791
792	/* End of innermost loop */
793
794	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
795	f+i_coord_offset,fshift+i_shift_offset);
796
797	ggid = gid[iidx];
798	/* Update potential energies */
799	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
800	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
801
802	/* Increment number of inner iterations */
803	inneriter += j_index_end - j_index_start;
804
805	/* Outer loop uses 26 flops */
806	}
807
808	/* Increment number of outer iterations */
809	outeriter += nri;
810
811	/* Update outer/inner flops */
812
813	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter26 + inneriter182)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4_VF] += outeriter26 + inneriter 182;
814	}
815	/*
816	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
817	* Electrostatics interaction: Ewald
818	* VdW interaction: LennardJones
819	* Geometry: Water4-Particle
820	* Calculate force/pot: Force
821	*/
822	void
823	nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
824	(t_nblist * gmx_restrict nlist,
825	rvec * gmx_restrict xx,
826	rvec * gmx_restrict ff,
827	t_forcerec * gmx_restrict fr,
828	t_mdatoms * gmx_restrict mdatoms,
829	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
830	t_nrnb * gmx_restrict nrnb)
831	{
832	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
833	* just 0 for non-waters.
834	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
835	* jnr indices corresponding to data put in the four positions in the SIMD register.
836	*/
837	int i_shift_offset,i_coord_offset,outeriter,inneriter;
838	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
839	int jnrA,jnrB,jnrC,jnrD;
840	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
841	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
842	int iinr,jindex,jjnr,shiftidx,*gid;
843	real rcutoff_scalar;
844	real shiftvec,fshift,x,f;
845	real fjptrA,fjptrB,fjptrC,fjptrD;
846	real scratch[4*DIM3];
847	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
848	int vdwioffset0;
849	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
850	int vdwioffset1;
851	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
852	int vdwioffset2;
853	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
854	int vdwioffset3;
855	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
856	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
857	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
858	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
859	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
860	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
861	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
862	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
863	real *charge;
864	int nvdwtype;
865	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
866	int *vdwtype;
867	real *vdwparam;
868	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
869	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
870	__m128i ewitab;
871	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
872	real *ewtab;
873	__m128 dummy_mask,cutoff_mask;
874	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
875	__m128 one = _mm_set1_ps(1.0);
876	__m128 two = _mm_set1_ps(2.0);
877	x = xx[0];
878	f = ff[0];
879
880	nri = nlist->nri;
881	iinr = nlist->iinr;
882	jindex = nlist->jindex;
883	jjnr = nlist->jjnr;
884	shiftidx = nlist->shift;
885	gid = nlist->gid;
886	shiftvec = fr->shift_vec[0];
887	fshift = fr->fshift[0];
888	facel = _mm_set1_ps(fr->epsfac);
889	charge = mdatoms->chargeA;
890	nvdwtype = fr->ntype;
891	vdwparam = fr->nbfp;
892	vdwtype = mdatoms->typeA;
893
894	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
895	ewtab = fr->ic->tabq_coul_F;
896	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
897	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
898
899	/* Setup water-specific parameters */
900	inr = nlist->iinr[0];
901	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
902	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
903	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
904	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
905
906	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
907	rcutoff_scalar = fr->rcoulomb;
908	rcutoff = _mm_set1_ps(rcutoff_scalar);
909	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
910
911	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
912	rvdw = _mm_set1_ps(fr->rvdw);
	Value stored to 'rvdw' is never read
913
914	/* Avoid stupid compiler warnings */
915	jnrA = jnrB = jnrC = jnrD = 0;
916	j_coord_offsetA = 0;
917	j_coord_offsetB = 0;
918	j_coord_offsetC = 0;
919	j_coord_offsetD = 0;
920
921	outeriter = 0;
922	inneriter = 0;
923
924	for(iidx=0;iidx<4*DIM3;iidx++)
925	{
926	scratch[iidx] = 0.0;
927	}
928
929	/* Start outer loop over neighborlists */
930	for(iidx=0; iidx<nri; iidx++)
931	{
932	/* Load shift vector for this list */
933	i_shift_offset = DIM3*shiftidx[iidx];
934
935	/* Load limits for loop over neighbors */
936	j_index_start = jindex[iidx];
937	j_index_end = jindex[iidx+1];
938
939	/* Get outer coordinate index */
940	inr = iinr[iidx];
941	i_coord_offset = DIM3*inr;
942
943	/* Load i particle coords and add shift vector */
944	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
945	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
946
947	fix0 = _mm_setzero_ps();
948	fiy0 = _mm_setzero_ps();
949	fiz0 = _mm_setzero_ps();
950	fix1 = _mm_setzero_ps();
951	fiy1 = _mm_setzero_ps();
952	fiz1 = _mm_setzero_ps();
953	fix2 = _mm_setzero_ps();
954	fiy2 = _mm_setzero_ps();
955	fiz2 = _mm_setzero_ps();
956	fix3 = _mm_setzero_ps();
957	fiy3 = _mm_setzero_ps();
958	fiz3 = _mm_setzero_ps();
959
960	/* Start inner kernel loop */
961	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
962	{
963
964	/* Get j neighbor index, and coordinate index */
965	jnrA = jjnr[jidx];
966	jnrB = jjnr[jidx+1];
967	jnrC = jjnr[jidx+2];
968	jnrD = jjnr[jidx+3];
969	j_coord_offsetA = DIM3*jnrA;
970	j_coord_offsetB = DIM3*jnrB;
971	j_coord_offsetC = DIM3*jnrC;
972	j_coord_offsetD = DIM3*jnrD;
973
974	/* load j atom coordinates */
975	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
976	x+j_coord_offsetC,x+j_coord_offsetD,
977	&jx0,&jy0,&jz0);
978
979	/* Calculate displacement vector */
980	dx00 = _mm_sub_ps(ix0,jx0);
981	dy00 = _mm_sub_ps(iy0,jy0);
982	dz00 = _mm_sub_ps(iz0,jz0);
983	dx10 = _mm_sub_ps(ix1,jx0);
984	dy10 = _mm_sub_ps(iy1,jy0);
985	dz10 = _mm_sub_ps(iz1,jz0);
986	dx20 = _mm_sub_ps(ix2,jx0);
987	dy20 = _mm_sub_ps(iy2,jy0);
988	dz20 = _mm_sub_ps(iz2,jz0);
989	dx30 = _mm_sub_ps(ix3,jx0);
990	dy30 = _mm_sub_ps(iy3,jy0);
991	dz30 = _mm_sub_ps(iz3,jz0);
992
993	/* Calculate squared distance and things based on it */
994	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
995	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
996	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
997	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
998
999	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1000	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1001	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
1002
1003	rinvsq00 = gmx_mm_inv_psgmx_simd_inv_f(rsq00);
1004	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1005	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1006	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1007
1008	/* Load parameters for j particles */
1009	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1010	charge+jnrC+0,charge+jnrD+0);
1011	vdwjidx0A = 2*vdwtype[jnrA+0];
1012	vdwjidx0B = 2*vdwtype[jnrB+0];
1013	vdwjidx0C = 2*vdwtype[jnrC+0];
1014	vdwjidx0D = 2*vdwtype[jnrD+0];
1015
1016	fjx0 = _mm_setzero_ps();
1017	fjy0 = _mm_setzero_ps();
1018	fjz0 = _mm_setzero_ps();
1019
1020	/**************************
1021	* CALCULATE INTERACTIONS *
1022	**************************/
1023
1024	if (gmx_mm_any_lt(rsq00,rcutoff2))
1025	{
1026
1027	/* Compute parameters for interactions between i and j atoms */
1028	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1029	vdwparam+vdwioffset0+vdwjidx0B,
1030	vdwparam+vdwioffset0+vdwjidx0C,
1031	vdwparam+vdwioffset0+vdwjidx0D,
1032	&c6_00,&c12_00);
1033
1034	/* LENNARD-JONES DISPERSION/REPULSION */
1035
1036	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1037	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1038
1039	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1040
1041	fscal = fvdw;
1042
1043	fscal = _mm_and_ps(fscal,cutoff_mask);
1044
1045	/* Calculate temporary vectorial force */
1046	tx = _mm_mul_ps(fscal,dx00);
1047	ty = _mm_mul_ps(fscal,dy00);
1048	tz = _mm_mul_ps(fscal,dz00);
1049
1050	/* Update vectorial force */
1051	fix0 = _mm_add_ps(fix0,tx);
1052	fiy0 = _mm_add_ps(fiy0,ty);
1053	fiz0 = _mm_add_ps(fiz0,tz);
1054
1055	fjx0 = _mm_add_ps(fjx0,tx);
1056	fjy0 = _mm_add_ps(fjy0,ty);
1057	fjz0 = _mm_add_ps(fjz0,tz);
1058
1059	}
1060
1061	/**************************
1062	* CALCULATE INTERACTIONS *
1063	**************************/
1064
1065	if (gmx_mm_any_lt(rsq10,rcutoff2))
1066	{
1067
1068	r10 = _mm_mul_ps(rsq10,rinv10);
1069
1070	/* Compute parameters for interactions between i and j atoms */
1071	qq10 = _mm_mul_ps(iq1,jq0);
1072
1073	/* EWALD ELECTROSTATICS */
1074
1075	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1076	ewrt = _mm_mul_ps(r10,ewtabscale);
1077	ewitab = _mm_cvttps_epi32(ewrt);
1078	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1079	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1080	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1081	&ewtabF,&ewtabFn);
1082	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1083	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1084
1085	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1086
1087	fscal = felec;
1088
1089	fscal = _mm_and_ps(fscal,cutoff_mask);
1090
1091	/* Calculate temporary vectorial force */
1092	tx = _mm_mul_ps(fscal,dx10);
1093	ty = _mm_mul_ps(fscal,dy10);
1094	tz = _mm_mul_ps(fscal,dz10);
1095
1096	/* Update vectorial force */
1097	fix1 = _mm_add_ps(fix1,tx);
1098	fiy1 = _mm_add_ps(fiy1,ty);
1099	fiz1 = _mm_add_ps(fiz1,tz);
1100
1101	fjx0 = _mm_add_ps(fjx0,tx);
1102	fjy0 = _mm_add_ps(fjy0,ty);
1103	fjz0 = _mm_add_ps(fjz0,tz);
1104
1105	}
1106
1107	/**************************
1108	* CALCULATE INTERACTIONS *
1109	**************************/
1110
1111	if (gmx_mm_any_lt(rsq20,rcutoff2))
1112	{
1113
1114	r20 = _mm_mul_ps(rsq20,rinv20);
1115
1116	/* Compute parameters for interactions between i and j atoms */
1117	qq20 = _mm_mul_ps(iq2,jq0);
1118
1119	/* EWALD ELECTROSTATICS */
1120
1121	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1122	ewrt = _mm_mul_ps(r20,ewtabscale);
1123	ewitab = _mm_cvttps_epi32(ewrt);
1124	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1125	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1126	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1127	&ewtabF,&ewtabFn);
1128	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1129	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1130
1131	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1132
1133	fscal = felec;
1134
1135	fscal = _mm_and_ps(fscal,cutoff_mask);
1136
1137	/* Calculate temporary vectorial force */
1138	tx = _mm_mul_ps(fscal,dx20);
1139	ty = _mm_mul_ps(fscal,dy20);
1140	tz = _mm_mul_ps(fscal,dz20);
1141
1142	/* Update vectorial force */
1143	fix2 = _mm_add_ps(fix2,tx);
1144	fiy2 = _mm_add_ps(fiy2,ty);
1145	fiz2 = _mm_add_ps(fiz2,tz);
1146
1147	fjx0 = _mm_add_ps(fjx0,tx);
1148	fjy0 = _mm_add_ps(fjy0,ty);
1149	fjz0 = _mm_add_ps(fjz0,tz);
1150
1151	}
1152
1153	/**************************
1154	* CALCULATE INTERACTIONS *
1155	**************************/
1156
1157	if (gmx_mm_any_lt(rsq30,rcutoff2))
1158	{
1159
1160	r30 = _mm_mul_ps(rsq30,rinv30);
1161
1162	/* Compute parameters for interactions between i and j atoms */
1163	qq30 = _mm_mul_ps(iq3,jq0);
1164
1165	/* EWALD ELECTROSTATICS */
1166
1167	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1168	ewrt = _mm_mul_ps(r30,ewtabscale);
1169	ewitab = _mm_cvttps_epi32(ewrt);
1170	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1171	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1172	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1173	&ewtabF,&ewtabFn);
1174	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1175	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1176
1177	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1178
1179	fscal = felec;
1180
1181	fscal = _mm_and_ps(fscal,cutoff_mask);
1182
1183	/* Calculate temporary vectorial force */
1184	tx = _mm_mul_ps(fscal,dx30);
1185	ty = _mm_mul_ps(fscal,dy30);
1186	tz = _mm_mul_ps(fscal,dz30);
1187
1188	/* Update vectorial force */
1189	fix3 = _mm_add_ps(fix3,tx);
1190	fiy3 = _mm_add_ps(fiy3,ty);
1191	fiz3 = _mm_add_ps(fiz3,tz);
1192
1193	fjx0 = _mm_add_ps(fjx0,tx);
1194	fjy0 = _mm_add_ps(fjy0,ty);
1195	fjz0 = _mm_add_ps(fjz0,tz);
1196
1197	}
1198
1199	fjptrA = f+j_coord_offsetA;
1200	fjptrB = f+j_coord_offsetB;
1201	fjptrC = f+j_coord_offsetC;
1202	fjptrD = f+j_coord_offsetD;
1203
1204	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1205
1206	/* Inner loop uses 147 flops */
1207	}
1208
1209	if(jidx<j_index_end)
1210	{
1211
1212	/* Get j neighbor index, and coordinate index */
1213	jnrlistA = jjnr[jidx];
1214	jnrlistB = jjnr[jidx+1];
1215	jnrlistC = jjnr[jidx+2];
1216	jnrlistD = jjnr[jidx+3];
1217	/* Sign of each element will be negative for non-real atoms.
1218	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1219	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1220	*/
1221	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1222	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1223	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1224	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1225	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1226	j_coord_offsetA = DIM3*jnrA;
1227	j_coord_offsetB = DIM3*jnrB;
1228	j_coord_offsetC = DIM3*jnrC;
1229	j_coord_offsetD = DIM3*jnrD;
1230
1231	/* load j atom coordinates */
1232	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1233	x+j_coord_offsetC,x+j_coord_offsetD,
1234	&jx0,&jy0,&jz0);
1235
1236	/* Calculate displacement vector */
1237	dx00 = _mm_sub_ps(ix0,jx0);
1238	dy00 = _mm_sub_ps(iy0,jy0);
1239	dz00 = _mm_sub_ps(iz0,jz0);
1240	dx10 = _mm_sub_ps(ix1,jx0);
1241	dy10 = _mm_sub_ps(iy1,jy0);
1242	dz10 = _mm_sub_ps(iz1,jz0);
1243	dx20 = _mm_sub_ps(ix2,jx0);
1244	dy20 = _mm_sub_ps(iy2,jy0);
1245	dz20 = _mm_sub_ps(iz2,jz0);
1246	dx30 = _mm_sub_ps(ix3,jx0);
1247	dy30 = _mm_sub_ps(iy3,jy0);
1248	dz30 = _mm_sub_ps(iz3,jz0);
1249
1250	/* Calculate squared distance and things based on it */
1251	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1252	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1253	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1254	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1255
1256	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1257	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1258	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
1259
1260	rinvsq00 = gmx_mm_inv_psgmx_simd_inv_f(rsq00);
1261	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1262	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1263	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1264
1265	/* Load parameters for j particles */
1266	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1267	charge+jnrC+0,charge+jnrD+0);
1268	vdwjidx0A = 2*vdwtype[jnrA+0];
1269	vdwjidx0B = 2*vdwtype[jnrB+0];
1270	vdwjidx0C = 2*vdwtype[jnrC+0];
1271	vdwjidx0D = 2*vdwtype[jnrD+0];
1272
1273	fjx0 = _mm_setzero_ps();
1274	fjy0 = _mm_setzero_ps();
1275	fjz0 = _mm_setzero_ps();
1276
1277	/**************************
1278	* CALCULATE INTERACTIONS *
1279	**************************/
1280
1281	if (gmx_mm_any_lt(rsq00,rcutoff2))
1282	{
1283
1284	/* Compute parameters for interactions between i and j atoms */
1285	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1286	vdwparam+vdwioffset0+vdwjidx0B,
1287	vdwparam+vdwioffset0+vdwjidx0C,
1288	vdwparam+vdwioffset0+vdwjidx0D,
1289	&c6_00,&c12_00);
1290
1291	/* LENNARD-JONES DISPERSION/REPULSION */
1292
1293	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1294	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1295
1296	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1297
1298	fscal = fvdw;
1299
1300	fscal = _mm_and_ps(fscal,cutoff_mask);
1301
1302	fscal = _mm_andnot_ps(dummy_mask,fscal);
1303
1304	/* Calculate temporary vectorial force */
1305	tx = _mm_mul_ps(fscal,dx00);
1306	ty = _mm_mul_ps(fscal,dy00);
1307	tz = _mm_mul_ps(fscal,dz00);
1308
1309	/* Update vectorial force */
1310	fix0 = _mm_add_ps(fix0,tx);
1311	fiy0 = _mm_add_ps(fiy0,ty);
1312	fiz0 = _mm_add_ps(fiz0,tz);
1313
1314	fjx0 = _mm_add_ps(fjx0,tx);
1315	fjy0 = _mm_add_ps(fjy0,ty);
1316	fjz0 = _mm_add_ps(fjz0,tz);
1317
1318	}
1319
1320	/**************************
1321	* CALCULATE INTERACTIONS *
1322	**************************/
1323
1324	if (gmx_mm_any_lt(rsq10,rcutoff2))
1325	{
1326
1327	r10 = _mm_mul_ps(rsq10,rinv10);
1328	r10 = _mm_andnot_ps(dummy_mask,r10);
1329
1330	/* Compute parameters for interactions between i and j atoms */
1331	qq10 = _mm_mul_ps(iq1,jq0);
1332
1333	/* EWALD ELECTROSTATICS */
1334
1335	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1336	ewrt = _mm_mul_ps(r10,ewtabscale);
1337	ewitab = _mm_cvttps_epi32(ewrt);
1338	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1339	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1340	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1341	&ewtabF,&ewtabFn);
1342	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1343	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1344
1345	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1346
1347	fscal = felec;
1348
1349	fscal = _mm_and_ps(fscal,cutoff_mask);
1350
1351	fscal = _mm_andnot_ps(dummy_mask,fscal);
1352
1353	/* Calculate temporary vectorial force */
1354	tx = _mm_mul_ps(fscal,dx10);
1355	ty = _mm_mul_ps(fscal,dy10);
1356	tz = _mm_mul_ps(fscal,dz10);
1357
1358	/* Update vectorial force */
1359	fix1 = _mm_add_ps(fix1,tx);
1360	fiy1 = _mm_add_ps(fiy1,ty);
1361	fiz1 = _mm_add_ps(fiz1,tz);
1362
1363	fjx0 = _mm_add_ps(fjx0,tx);
1364	fjy0 = _mm_add_ps(fjy0,ty);
1365	fjz0 = _mm_add_ps(fjz0,tz);
1366
1367	}
1368
1369	/**************************
1370	* CALCULATE INTERACTIONS *
1371	**************************/
1372
1373	if (gmx_mm_any_lt(rsq20,rcutoff2))
1374	{
1375
1376	r20 = _mm_mul_ps(rsq20,rinv20);
1377	r20 = _mm_andnot_ps(dummy_mask,r20);
1378
1379	/* Compute parameters for interactions between i and j atoms */
1380	qq20 = _mm_mul_ps(iq2,jq0);
1381
1382	/* EWALD ELECTROSTATICS */
1383
1384	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1385	ewrt = _mm_mul_ps(r20,ewtabscale);
1386	ewitab = _mm_cvttps_epi32(ewrt);
1387	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1388	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1389	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1390	&ewtabF,&ewtabFn);
1391	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1392	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1393
1394	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1395
1396	fscal = felec;
1397
1398	fscal = _mm_and_ps(fscal,cutoff_mask);
1399
1400	fscal = _mm_andnot_ps(dummy_mask,fscal);
1401
1402	/* Calculate temporary vectorial force */
1403	tx = _mm_mul_ps(fscal,dx20);
1404	ty = _mm_mul_ps(fscal,dy20);
1405	tz = _mm_mul_ps(fscal,dz20);
1406
1407	/* Update vectorial force */
1408	fix2 = _mm_add_ps(fix2,tx);
1409	fiy2 = _mm_add_ps(fiy2,ty);
1410	fiz2 = _mm_add_ps(fiz2,tz);
1411
1412	fjx0 = _mm_add_ps(fjx0,tx);
1413	fjy0 = _mm_add_ps(fjy0,ty);
1414	fjz0 = _mm_add_ps(fjz0,tz);
1415
1416	}
1417
1418	/**************************
1419	* CALCULATE INTERACTIONS *
1420	**************************/
1421
1422	if (gmx_mm_any_lt(rsq30,rcutoff2))
1423	{
1424
1425	r30 = _mm_mul_ps(rsq30,rinv30);
1426	r30 = _mm_andnot_ps(dummy_mask,r30);
1427
1428	/* Compute parameters for interactions between i and j atoms */
1429	qq30 = _mm_mul_ps(iq3,jq0);
1430
1431	/* EWALD ELECTROSTATICS */
1432
1433	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1434	ewrt = _mm_mul_ps(r30,ewtabscale);
1435	ewitab = _mm_cvttps_epi32(ewrt);
1436	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1437	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1438	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1439	&ewtabF,&ewtabFn);
1440	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1441	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1442
1443	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1444
1445	fscal = felec;
1446
1447	fscal = _mm_and_ps(fscal,cutoff_mask);
1448
1449	fscal = _mm_andnot_ps(dummy_mask,fscal);
1450
1451	/* Calculate temporary vectorial force */
1452	tx = _mm_mul_ps(fscal,dx30);
1453	ty = _mm_mul_ps(fscal,dy30);
1454	tz = _mm_mul_ps(fscal,dz30);
1455
1456	/* Update vectorial force */
1457	fix3 = _mm_add_ps(fix3,tx);
1458	fiy3 = _mm_add_ps(fiy3,ty);
1459	fiz3 = _mm_add_ps(fiz3,tz);
1460
1461	fjx0 = _mm_add_ps(fjx0,tx);
1462	fjy0 = _mm_add_ps(fjy0,ty);
1463	fjz0 = _mm_add_ps(fjz0,tz);
1464
1465	}
1466
1467	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1468	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1469	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1470	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1471
1472	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1473
1474	/* Inner loop uses 150 flops */
1475	}
1476
1477	/* End of innermost loop */
1478
1479	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1480	f+i_coord_offset,fshift+i_shift_offset);
1481
1482	/* Increment number of inner iterations */
1483	inneriter += j_index_end - j_index_start;
1484
1485	/* Outer loop uses 24 flops */
1486	}
1487
1488	/* Increment number of outer iterations */
1489	outeriter += nri;
1490
1491	/* Update outer/inner flops */
1492
1493	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter24 + inneriter150)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4_F] += outeriter24 + inneriter 150;
1494	}