/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse4_1

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse4_1_single.c
Location:	line 281, column 13
Description:	Value stored to 'r00' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H1
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_single
54	* Electrostatics interaction: Ewald
55	* VdW interaction: LJEwald
56	* Geometry: Water4-Particle
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int iinr,jindex,jjnr,shiftidx,*gid;
80	real rcutoff_scalar;
81	real shiftvec,fshift,x,f;
82	real fjptrA,fjptrB,fjptrC,fjptrD;
83	real scratch[4*DIM3];
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwioffset1;
88	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89	int vdwioffset2;
90	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91	int vdwioffset3;
92	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
100	real *charge;
101	int nvdwtype;
102	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103	int *vdwtype;
104	real *vdwparam;
105	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
106	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
107	__m128 c6grid_00;
108	__m128 c6grid_10;
109	__m128 c6grid_20;
110	__m128 c6grid_30;
111	__m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
112	real *vdwgridparam;
113	__m128 one_half = _mm_set1_ps(0.5);
114	__m128 minus_one = _mm_set1_ps(-1.0);
115	__m128i ewitab;
116	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
117	real *ewtab;
118	__m128 dummy_mask,cutoff_mask;
119	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
120	__m128 one = _mm_set1_ps(1.0);
121	__m128 two = _mm_set1_ps(2.0);
122	x = xx[0];
123	f = ff[0];
124
125	nri = nlist->nri;
126	iinr = nlist->iinr;
127	jindex = nlist->jindex;
128	jjnr = nlist->jjnr;
129	shiftidx = nlist->shift;
130	gid = nlist->gid;
131	shiftvec = fr->shift_vec[0];
132	fshift = fr->fshift[0];
133	facel = _mm_set1_ps(fr->epsfac);
134	charge = mdatoms->chargeA;
135	nvdwtype = fr->ntype;
136	vdwparam = fr->nbfp;
137	vdwtype = mdatoms->typeA;
138	vdwgridparam = fr->ljpme_c6grid;
139	sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
140	ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
141	ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
142
143	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
144	ewtab = fr->ic->tabq_coul_FDV0;
145	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
146	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
147
148	/* Setup water-specific parameters */
149	inr = nlist->iinr[0];
150	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
151	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
152	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
153	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
154
155	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
156	rcutoff_scalar = fr->rcoulomb;
157	rcutoff = _mm_set1_ps(rcutoff_scalar);
158	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
159
160	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
161	rvdw = _mm_set1_ps(fr->rvdw);
162
163	/* Avoid stupid compiler warnings */
164	jnrA = jnrB = jnrC = jnrD = 0;
165	j_coord_offsetA = 0;
166	j_coord_offsetB = 0;
167	j_coord_offsetC = 0;
168	j_coord_offsetD = 0;
169
170	outeriter = 0;
171	inneriter = 0;
172
173	for(iidx=0;iidx<4*DIM3;iidx++)
174	{
175	scratch[iidx] = 0.0;
176	}
177
178	/* Start outer loop over neighborlists */
179	for(iidx=0; iidx<nri; iidx++)
180	{
181	/* Load shift vector for this list */
182	i_shift_offset = DIM3*shiftidx[iidx];
183
184	/* Load limits for loop over neighbors */
185	j_index_start = jindex[iidx];
186	j_index_end = jindex[iidx+1];
187
188	/* Get outer coordinate index */
189	inr = iinr[iidx];
190	i_coord_offset = DIM3*inr;
191
192	/* Load i particle coords and add shift vector */
193	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
195
196	fix0 = _mm_setzero_ps();
197	fiy0 = _mm_setzero_ps();
198	fiz0 = _mm_setzero_ps();
199	fix1 = _mm_setzero_ps();
200	fiy1 = _mm_setzero_ps();
201	fiz1 = _mm_setzero_ps();
202	fix2 = _mm_setzero_ps();
203	fiy2 = _mm_setzero_ps();
204	fiz2 = _mm_setzero_ps();
205	fix3 = _mm_setzero_ps();
206	fiy3 = _mm_setzero_ps();
207	fiz3 = _mm_setzero_ps();
208
209	/* Reset potential sums */
210	velecsum = _mm_setzero_ps();
211	vvdwsum = _mm_setzero_ps();
212
213	/* Start inner kernel loop */
214	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215	{
216
217	/* Get j neighbor index, and coordinate index */
218	jnrA = jjnr[jidx];
219	jnrB = jjnr[jidx+1];
220	jnrC = jjnr[jidx+2];
221	jnrD = jjnr[jidx+3];
222	j_coord_offsetA = DIM3*jnrA;
223	j_coord_offsetB = DIM3*jnrB;
224	j_coord_offsetC = DIM3*jnrC;
225	j_coord_offsetD = DIM3*jnrD;
226
227	/* load j atom coordinates */
228	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
229	x+j_coord_offsetC,x+j_coord_offsetD,
230	&jx0,&jy0,&jz0);
231
232	/* Calculate displacement vector */
233	dx00 = _mm_sub_ps(ix0,jx0);
234	dy00 = _mm_sub_ps(iy0,jy0);
235	dz00 = _mm_sub_ps(iz0,jz0);
236	dx10 = _mm_sub_ps(ix1,jx0);
237	dy10 = _mm_sub_ps(iy1,jy0);
238	dz10 = _mm_sub_ps(iz1,jz0);
239	dx20 = _mm_sub_ps(ix2,jx0);
240	dy20 = _mm_sub_ps(iy2,jy0);
241	dz20 = _mm_sub_ps(iz2,jz0);
242	dx30 = _mm_sub_ps(ix3,jx0);
243	dy30 = _mm_sub_ps(iy3,jy0);
244	dz30 = _mm_sub_ps(iz3,jz0);
245
246	/* Calculate squared distance and things based on it */
247	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
248	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
249	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
250	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
251
252	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
253	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
254	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
255	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
256
257	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
258	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
259	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
260	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
261
262	/* Load parameters for j particles */
263	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
264	charge+jnrC+0,charge+jnrD+0);
265	vdwjidx0A = 2*vdwtype[jnrA+0];
266	vdwjidx0B = 2*vdwtype[jnrB+0];
267	vdwjidx0C = 2*vdwtype[jnrC+0];
268	vdwjidx0D = 2*vdwtype[jnrD+0];
269
270	fjx0 = _mm_setzero_ps();
271	fjy0 = _mm_setzero_ps();
272	fjz0 = _mm_setzero_ps();
273
274	/**************************
275	* CALCULATE INTERACTIONS *
276	**************************/
277
278	if (gmx_mm_any_lt(rsq00,rcutoff2))
279	{
280
281	r00 = _mm_mul_ps(rsq00,rinv00);
	Value stored to 'r00' is never read
282
283	/* Compute parameters for interactions between i and j atoms */
284	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
285	vdwparam+vdwioffset0+vdwjidx0B,
286	vdwparam+vdwioffset0+vdwjidx0C,
287	vdwparam+vdwioffset0+vdwjidx0D,
288	&c6_00,&c12_00);
289
290	c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
291	vdwgridparam+vdwioffset0+vdwjidx0B,
292	vdwgridparam+vdwioffset0+vdwjidx0C,
293	vdwgridparam+vdwioffset0+vdwjidx0D);
294
295	/* Analytical LJ-PME */
296	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
297	ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
298	ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
299	exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
300	/* poly = exp(-(betar)^2) (1 + (betar)^2 + (betar)^4 /2) */
301	poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
302	/* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
303	vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
304	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
305	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
306	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
307	/* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
308	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
309
310	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
311
312	/* Update potential sum for this i atom from the interaction with this j atom. */
313	vvdw = _mm_and_ps(vvdw,cutoff_mask);
314	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
315
316	fscal = fvdw;
317
318	fscal = _mm_and_ps(fscal,cutoff_mask);
319
320	/* Calculate temporary vectorial force */
321	tx = _mm_mul_ps(fscal,dx00);
322	ty = _mm_mul_ps(fscal,dy00);
323	tz = _mm_mul_ps(fscal,dz00);
324
325	/* Update vectorial force */
326	fix0 = _mm_add_ps(fix0,tx);
327	fiy0 = _mm_add_ps(fiy0,ty);
328	fiz0 = _mm_add_ps(fiz0,tz);
329
330	fjx0 = _mm_add_ps(fjx0,tx);
331	fjy0 = _mm_add_ps(fjy0,ty);
332	fjz0 = _mm_add_ps(fjz0,tz);
333
334	}
335
336	/**************************
337	* CALCULATE INTERACTIONS *
338	**************************/
339
340	if (gmx_mm_any_lt(rsq10,rcutoff2))
341	{
342
343	r10 = _mm_mul_ps(rsq10,rinv10);
344
345	/* Compute parameters for interactions between i and j atoms */
346	qq10 = _mm_mul_ps(iq1,jq0);
347
348	/* EWALD ELECTROSTATICS */
349
350	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
351	ewrt = _mm_mul_ps(r10,ewtabscale);
352	ewitab = _mm_cvttps_epi32(ewrt);
353	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
354	ewitab = _mm_slli_epi32(ewitab,2);
355	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
356	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
357	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
358	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
359	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
360	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
361	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
362	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
363	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
364
365	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
366
367	/* Update potential sum for this i atom from the interaction with this j atom. */
368	velec = _mm_and_ps(velec,cutoff_mask);
369	velecsum = _mm_add_ps(velecsum,velec);
370
371	fscal = felec;
372
373	fscal = _mm_and_ps(fscal,cutoff_mask);
374
375	/* Calculate temporary vectorial force */
376	tx = _mm_mul_ps(fscal,dx10);
377	ty = _mm_mul_ps(fscal,dy10);
378	tz = _mm_mul_ps(fscal,dz10);
379
380	/* Update vectorial force */
381	fix1 = _mm_add_ps(fix1,tx);
382	fiy1 = _mm_add_ps(fiy1,ty);
383	fiz1 = _mm_add_ps(fiz1,tz);
384
385	fjx0 = _mm_add_ps(fjx0,tx);
386	fjy0 = _mm_add_ps(fjy0,ty);
387	fjz0 = _mm_add_ps(fjz0,tz);
388
389	}
390
391	/**************************
392	* CALCULATE INTERACTIONS *
393	**************************/
394
395	if (gmx_mm_any_lt(rsq20,rcutoff2))
396	{
397
398	r20 = _mm_mul_ps(rsq20,rinv20);
399
400	/* Compute parameters for interactions between i and j atoms */
401	qq20 = _mm_mul_ps(iq2,jq0);
402
403	/* EWALD ELECTROSTATICS */
404
405	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
406	ewrt = _mm_mul_ps(r20,ewtabscale);
407	ewitab = _mm_cvttps_epi32(ewrt);
408	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
409	ewitab = _mm_slli_epi32(ewitab,2);
410	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
411	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
412	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
413	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
414	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
415	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
416	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
417	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
418	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
419
420	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
421
422	/* Update potential sum for this i atom from the interaction with this j atom. */
423	velec = _mm_and_ps(velec,cutoff_mask);
424	velecsum = _mm_add_ps(velecsum,velec);
425
426	fscal = felec;
427
428	fscal = _mm_and_ps(fscal,cutoff_mask);
429
430	/* Calculate temporary vectorial force */
431	tx = _mm_mul_ps(fscal,dx20);
432	ty = _mm_mul_ps(fscal,dy20);
433	tz = _mm_mul_ps(fscal,dz20);
434
435	/* Update vectorial force */
436	fix2 = _mm_add_ps(fix2,tx);
437	fiy2 = _mm_add_ps(fiy2,ty);
438	fiz2 = _mm_add_ps(fiz2,tz);
439
440	fjx0 = _mm_add_ps(fjx0,tx);
441	fjy0 = _mm_add_ps(fjy0,ty);
442	fjz0 = _mm_add_ps(fjz0,tz);
443
444	}
445
446	/**************************
447	* CALCULATE INTERACTIONS *
448	**************************/
449
450	if (gmx_mm_any_lt(rsq30,rcutoff2))
451	{
452
453	r30 = _mm_mul_ps(rsq30,rinv30);
454
455	/* Compute parameters for interactions between i and j atoms */
456	qq30 = _mm_mul_ps(iq3,jq0);
457
458	/* EWALD ELECTROSTATICS */
459
460	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
461	ewrt = _mm_mul_ps(r30,ewtabscale);
462	ewitab = _mm_cvttps_epi32(ewrt);
463	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
464	ewitab = _mm_slli_epi32(ewitab,2);
465	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
466	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
467	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
468	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
469	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
470	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
471	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
472	velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
473	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
474
475	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
476
477	/* Update potential sum for this i atom from the interaction with this j atom. */
478	velec = _mm_and_ps(velec,cutoff_mask);
479	velecsum = _mm_add_ps(velecsum,velec);
480
481	fscal = felec;
482
483	fscal = _mm_and_ps(fscal,cutoff_mask);
484
485	/* Calculate temporary vectorial force */
486	tx = _mm_mul_ps(fscal,dx30);
487	ty = _mm_mul_ps(fscal,dy30);
488	tz = _mm_mul_ps(fscal,dz30);
489
490	/* Update vectorial force */
491	fix3 = _mm_add_ps(fix3,tx);
492	fiy3 = _mm_add_ps(fiy3,ty);
493	fiz3 = _mm_add_ps(fiz3,tz);
494
495	fjx0 = _mm_add_ps(fjx0,tx);
496	fjy0 = _mm_add_ps(fjy0,ty);
497	fjz0 = _mm_add_ps(fjz0,tz);
498
499	}
500
501	fjptrA = f+j_coord_offsetA;
502	fjptrB = f+j_coord_offsetB;
503	fjptrC = f+j_coord_offsetC;
504	fjptrD = f+j_coord_offsetD;
505
506	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
507
508	/* Inner loop uses 200 flops */
509	}
510
511	if(jidx<j_index_end)
512	{
513
514	/* Get j neighbor index, and coordinate index */
515	jnrlistA = jjnr[jidx];
516	jnrlistB = jjnr[jidx+1];
517	jnrlistC = jjnr[jidx+2];
518	jnrlistD = jjnr[jidx+3];
519	/* Sign of each element will be negative for non-real atoms.
520	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
521	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
522	*/
523	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
524	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
525	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
526	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
527	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
528	j_coord_offsetA = DIM3*jnrA;
529	j_coord_offsetB = DIM3*jnrB;
530	j_coord_offsetC = DIM3*jnrC;
531	j_coord_offsetD = DIM3*jnrD;
532
533	/* load j atom coordinates */
534	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
535	x+j_coord_offsetC,x+j_coord_offsetD,
536	&jx0,&jy0,&jz0);
537
538	/* Calculate displacement vector */
539	dx00 = _mm_sub_ps(ix0,jx0);
540	dy00 = _mm_sub_ps(iy0,jy0);
541	dz00 = _mm_sub_ps(iz0,jz0);
542	dx10 = _mm_sub_ps(ix1,jx0);
543	dy10 = _mm_sub_ps(iy1,jy0);
544	dz10 = _mm_sub_ps(iz1,jz0);
545	dx20 = _mm_sub_ps(ix2,jx0);
546	dy20 = _mm_sub_ps(iy2,jy0);
547	dz20 = _mm_sub_ps(iz2,jz0);
548	dx30 = _mm_sub_ps(ix3,jx0);
549	dy30 = _mm_sub_ps(iy3,jy0);
550	dz30 = _mm_sub_ps(iz3,jz0);
551
552	/* Calculate squared distance and things based on it */
553	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
554	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
555	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
556	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
557
558	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
559	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
560	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
561	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
562
563	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
564	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
565	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
566	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
567
568	/* Load parameters for j particles */
569	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
570	charge+jnrC+0,charge+jnrD+0);
571	vdwjidx0A = 2*vdwtype[jnrA+0];
572	vdwjidx0B = 2*vdwtype[jnrB+0];
573	vdwjidx0C = 2*vdwtype[jnrC+0];
574	vdwjidx0D = 2*vdwtype[jnrD+0];
575
576	fjx0 = _mm_setzero_ps();
577	fjy0 = _mm_setzero_ps();
578	fjz0 = _mm_setzero_ps();
579
580	/**************************
581	* CALCULATE INTERACTIONS *
582	**************************/
583
584	if (gmx_mm_any_lt(rsq00,rcutoff2))
585	{
586
587	r00 = _mm_mul_ps(rsq00,rinv00);
588	r00 = _mm_andnot_ps(dummy_mask,r00);
589
590	/* Compute parameters for interactions between i and j atoms */
591	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
592	vdwparam+vdwioffset0+vdwjidx0B,
593	vdwparam+vdwioffset0+vdwjidx0C,
594	vdwparam+vdwioffset0+vdwjidx0D,
595	&c6_00,&c12_00);
596
597	c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
598	vdwgridparam+vdwioffset0+vdwjidx0B,
599	vdwgridparam+vdwioffset0+vdwjidx0C,
600	vdwgridparam+vdwioffset0+vdwjidx0D);
601
602	/* Analytical LJ-PME */
603	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
604	ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
605	ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
606	exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
607	/* poly = exp(-(betar)^2) (1 + (betar)^2 + (betar)^4 /2) */
608	poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
609	/* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
610	vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
611	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
612	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
613	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
614	/* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
615	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
616
617	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
618
619	/* Update potential sum for this i atom from the interaction with this j atom. */
620	vvdw = _mm_and_ps(vvdw,cutoff_mask);
621	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
622	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
623
624	fscal = fvdw;
625
626	fscal = _mm_and_ps(fscal,cutoff_mask);
627
628	fscal = _mm_andnot_ps(dummy_mask,fscal);
629
630	/* Calculate temporary vectorial force */
631	tx = _mm_mul_ps(fscal,dx00);
632	ty = _mm_mul_ps(fscal,dy00);
633	tz = _mm_mul_ps(fscal,dz00);
634
635	/* Update vectorial force */
636	fix0 = _mm_add_ps(fix0,tx);
637	fiy0 = _mm_add_ps(fiy0,ty);
638	fiz0 = _mm_add_ps(fiz0,tz);
639
640	fjx0 = _mm_add_ps(fjx0,tx);
641	fjy0 = _mm_add_ps(fjy0,ty);
642	fjz0 = _mm_add_ps(fjz0,tz);
643
644	}
645
646	/**************************
647	* CALCULATE INTERACTIONS *
648	**************************/
649
650	if (gmx_mm_any_lt(rsq10,rcutoff2))
651	{
652
653	r10 = _mm_mul_ps(rsq10,rinv10);
654	r10 = _mm_andnot_ps(dummy_mask,r10);
655
656	/* Compute parameters for interactions between i and j atoms */
657	qq10 = _mm_mul_ps(iq1,jq0);
658
659	/* EWALD ELECTROSTATICS */
660
661	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
662	ewrt = _mm_mul_ps(r10,ewtabscale);
663	ewitab = _mm_cvttps_epi32(ewrt);
664	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
665	ewitab = _mm_slli_epi32(ewitab,2);
666	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
667	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
668	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
669	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
670	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
671	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
672	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
673	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
674	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
675
676	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
677
678	/* Update potential sum for this i atom from the interaction with this j atom. */
679	velec = _mm_and_ps(velec,cutoff_mask);
680	velec = _mm_andnot_ps(dummy_mask,velec);
681	velecsum = _mm_add_ps(velecsum,velec);
682
683	fscal = felec;
684
685	fscal = _mm_and_ps(fscal,cutoff_mask);
686
687	fscal = _mm_andnot_ps(dummy_mask,fscal);
688
689	/* Calculate temporary vectorial force */
690	tx = _mm_mul_ps(fscal,dx10);
691	ty = _mm_mul_ps(fscal,dy10);
692	tz = _mm_mul_ps(fscal,dz10);
693
694	/* Update vectorial force */
695	fix1 = _mm_add_ps(fix1,tx);
696	fiy1 = _mm_add_ps(fiy1,ty);
697	fiz1 = _mm_add_ps(fiz1,tz);
698
699	fjx0 = _mm_add_ps(fjx0,tx);
700	fjy0 = _mm_add_ps(fjy0,ty);
701	fjz0 = _mm_add_ps(fjz0,tz);
702
703	}
704
705	/**************************
706	* CALCULATE INTERACTIONS *
707	**************************/
708
709	if (gmx_mm_any_lt(rsq20,rcutoff2))
710	{
711
712	r20 = _mm_mul_ps(rsq20,rinv20);
713	r20 = _mm_andnot_ps(dummy_mask,r20);
714
715	/* Compute parameters for interactions between i and j atoms */
716	qq20 = _mm_mul_ps(iq2,jq0);
717
718	/* EWALD ELECTROSTATICS */
719
720	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
721	ewrt = _mm_mul_ps(r20,ewtabscale);
722	ewitab = _mm_cvttps_epi32(ewrt);
723	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
724	ewitab = _mm_slli_epi32(ewitab,2);
725	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
726	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
727	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
728	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
729	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
730	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
731	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
732	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
733	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
734
735	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
736
737	/* Update potential sum for this i atom from the interaction with this j atom. */
738	velec = _mm_and_ps(velec,cutoff_mask);
739	velec = _mm_andnot_ps(dummy_mask,velec);
740	velecsum = _mm_add_ps(velecsum,velec);
741
742	fscal = felec;
743
744	fscal = _mm_and_ps(fscal,cutoff_mask);
745
746	fscal = _mm_andnot_ps(dummy_mask,fscal);
747
748	/* Calculate temporary vectorial force */
749	tx = _mm_mul_ps(fscal,dx20);
750	ty = _mm_mul_ps(fscal,dy20);
751	tz = _mm_mul_ps(fscal,dz20);
752
753	/* Update vectorial force */
754	fix2 = _mm_add_ps(fix2,tx);
755	fiy2 = _mm_add_ps(fiy2,ty);
756	fiz2 = _mm_add_ps(fiz2,tz);
757
758	fjx0 = _mm_add_ps(fjx0,tx);
759	fjy0 = _mm_add_ps(fjy0,ty);
760	fjz0 = _mm_add_ps(fjz0,tz);
761
762	}
763
764	/**************************
765	* CALCULATE INTERACTIONS *
766	**************************/
767
768	if (gmx_mm_any_lt(rsq30,rcutoff2))
769	{
770
771	r30 = _mm_mul_ps(rsq30,rinv30);
772	r30 = _mm_andnot_ps(dummy_mask,r30);
773
774	/* Compute parameters for interactions between i and j atoms */
775	qq30 = _mm_mul_ps(iq3,jq0);
776
777	/* EWALD ELECTROSTATICS */
778
779	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
780	ewrt = _mm_mul_ps(r30,ewtabscale);
781	ewitab = _mm_cvttps_epi32(ewrt);
782	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
783	ewitab = _mm_slli_epi32(ewitab,2);
784	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
785	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
786	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
787	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
788	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
789	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
790	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
791	velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
792	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
793
794	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
795
796	/* Update potential sum for this i atom from the interaction with this j atom. */
797	velec = _mm_and_ps(velec,cutoff_mask);
798	velec = _mm_andnot_ps(dummy_mask,velec);
799	velecsum = _mm_add_ps(velecsum,velec);
800
801	fscal = felec;
802
803	fscal = _mm_and_ps(fscal,cutoff_mask);
804
805	fscal = _mm_andnot_ps(dummy_mask,fscal);
806
807	/* Calculate temporary vectorial force */
808	tx = _mm_mul_ps(fscal,dx30);
809	ty = _mm_mul_ps(fscal,dy30);
810	tz = _mm_mul_ps(fscal,dz30);
811
812	/* Update vectorial force */
813	fix3 = _mm_add_ps(fix3,tx);
814	fiy3 = _mm_add_ps(fiy3,ty);
815	fiz3 = _mm_add_ps(fiz3,tz);
816
817	fjx0 = _mm_add_ps(fjx0,tx);
818	fjy0 = _mm_add_ps(fjy0,ty);
819	fjz0 = _mm_add_ps(fjz0,tz);
820
821	}
822
823	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
824	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
825	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
826	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
827
828	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
829
830	/* Inner loop uses 204 flops */
831	}
832
833	/* End of innermost loop */
834
835	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
836	f+i_coord_offset,fshift+i_shift_offset);
837
838	ggid = gid[iidx];
839	/* Update potential energies */
840	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
841	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
842
843	/* Increment number of inner iterations */
844	inneriter += j_index_end - j_index_start;
845
846	/* Outer loop uses 26 flops */
847	}
848
849	/* Increment number of outer iterations */
850	outeriter += nri;
851
852	/* Update outer/inner flops */
853
854	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter26 + inneriter204)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4_VF] += outeriter26 + inneriter 204;
855	}
856	/*
857	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_single
858	* Electrostatics interaction: Ewald
859	* VdW interaction: LJEwald
860	* Geometry: Water4-Particle
861	* Calculate force/pot: Force
862	*/
863	void
864	nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_single
865	(t_nblist * gmx_restrict nlist,
866	rvec * gmx_restrict xx,
867	rvec * gmx_restrict ff,
868	t_forcerec * gmx_restrict fr,
869	t_mdatoms * gmx_restrict mdatoms,
870	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
871	t_nrnb * gmx_restrict nrnb)
872	{
873	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
874	* just 0 for non-waters.
875	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
876	* jnr indices corresponding to data put in the four positions in the SIMD register.
877	*/
878	int i_shift_offset,i_coord_offset,outeriter,inneriter;
879	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
880	int jnrA,jnrB,jnrC,jnrD;
881	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
882	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
883	int iinr,jindex,jjnr,shiftidx,*gid;
884	real rcutoff_scalar;
885	real shiftvec,fshift,x,f;
886	real fjptrA,fjptrB,fjptrC,fjptrD;
887	real scratch[4*DIM3];
888	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
889	int vdwioffset0;
890	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
891	int vdwioffset1;
892	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
893	int vdwioffset2;
894	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
895	int vdwioffset3;
896	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
897	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
898	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
899	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
900	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
901	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
902	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
903	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
904	real *charge;
905	int nvdwtype;
906	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
907	int *vdwtype;
908	real *vdwparam;
909	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
910	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
911	__m128 c6grid_00;
912	__m128 c6grid_10;
913	__m128 c6grid_20;
914	__m128 c6grid_30;
915	__m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
916	real *vdwgridparam;
917	__m128 one_half = _mm_set1_ps(0.5);
918	__m128 minus_one = _mm_set1_ps(-1.0);
919	__m128i ewitab;
920	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
921	real *ewtab;
922	__m128 dummy_mask,cutoff_mask;
923	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
924	__m128 one = _mm_set1_ps(1.0);
925	__m128 two = _mm_set1_ps(2.0);
926	x = xx[0];
927	f = ff[0];
928
929	nri = nlist->nri;
930	iinr = nlist->iinr;
931	jindex = nlist->jindex;
932	jjnr = nlist->jjnr;
933	shiftidx = nlist->shift;
934	gid = nlist->gid;
935	shiftvec = fr->shift_vec[0];
936	fshift = fr->fshift[0];
937	facel = _mm_set1_ps(fr->epsfac);
938	charge = mdatoms->chargeA;
939	nvdwtype = fr->ntype;
940	vdwparam = fr->nbfp;
941	vdwtype = mdatoms->typeA;
942	vdwgridparam = fr->ljpme_c6grid;
943	sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
944	ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
945	ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
946
947	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
948	ewtab = fr->ic->tabq_coul_F;
949	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
950	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
951
952	/* Setup water-specific parameters */
953	inr = nlist->iinr[0];
954	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
955	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
956	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
957	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
958
959	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
960	rcutoff_scalar = fr->rcoulomb;
961	rcutoff = _mm_set1_ps(rcutoff_scalar);
962	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
963
964	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
965	rvdw = _mm_set1_ps(fr->rvdw);
966
967	/* Avoid stupid compiler warnings */
968	jnrA = jnrB = jnrC = jnrD = 0;
969	j_coord_offsetA = 0;
970	j_coord_offsetB = 0;
971	j_coord_offsetC = 0;
972	j_coord_offsetD = 0;
973
974	outeriter = 0;
975	inneriter = 0;
976
977	for(iidx=0;iidx<4*DIM3;iidx++)
978	{
979	scratch[iidx] = 0.0;
980	}
981
982	/* Start outer loop over neighborlists */
983	for(iidx=0; iidx<nri; iidx++)
984	{
985	/* Load shift vector for this list */
986	i_shift_offset = DIM3*shiftidx[iidx];
987
988	/* Load limits for loop over neighbors */
989	j_index_start = jindex[iidx];
990	j_index_end = jindex[iidx+1];
991
992	/* Get outer coordinate index */
993	inr = iinr[iidx];
994	i_coord_offset = DIM3*inr;
995
996	/* Load i particle coords and add shift vector */
997	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
998	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
999
1000	fix0 = _mm_setzero_ps();
1001	fiy0 = _mm_setzero_ps();
1002	fiz0 = _mm_setzero_ps();
1003	fix1 = _mm_setzero_ps();
1004	fiy1 = _mm_setzero_ps();
1005	fiz1 = _mm_setzero_ps();
1006	fix2 = _mm_setzero_ps();
1007	fiy2 = _mm_setzero_ps();
1008	fiz2 = _mm_setzero_ps();
1009	fix3 = _mm_setzero_ps();
1010	fiy3 = _mm_setzero_ps();
1011	fiz3 = _mm_setzero_ps();
1012
1013	/* Start inner kernel loop */
1014	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1015	{
1016
1017	/* Get j neighbor index, and coordinate index */
1018	jnrA = jjnr[jidx];
1019	jnrB = jjnr[jidx+1];
1020	jnrC = jjnr[jidx+2];
1021	jnrD = jjnr[jidx+3];
1022	j_coord_offsetA = DIM3*jnrA;
1023	j_coord_offsetB = DIM3*jnrB;
1024	j_coord_offsetC = DIM3*jnrC;
1025	j_coord_offsetD = DIM3*jnrD;
1026
1027	/* load j atom coordinates */
1028	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1029	x+j_coord_offsetC,x+j_coord_offsetD,
1030	&jx0,&jy0,&jz0);
1031
1032	/* Calculate displacement vector */
1033	dx00 = _mm_sub_ps(ix0,jx0);
1034	dy00 = _mm_sub_ps(iy0,jy0);
1035	dz00 = _mm_sub_ps(iz0,jz0);
1036	dx10 = _mm_sub_ps(ix1,jx0);
1037	dy10 = _mm_sub_ps(iy1,jy0);
1038	dz10 = _mm_sub_ps(iz1,jz0);
1039	dx20 = _mm_sub_ps(ix2,jx0);
1040	dy20 = _mm_sub_ps(iy2,jy0);
1041	dz20 = _mm_sub_ps(iz2,jz0);
1042	dx30 = _mm_sub_ps(ix3,jx0);
1043	dy30 = _mm_sub_ps(iy3,jy0);
1044	dz30 = _mm_sub_ps(iz3,jz0);
1045
1046	/* Calculate squared distance and things based on it */
1047	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1048	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1049	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1050	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1051
1052	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1053	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1054	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1055	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
1056
1057	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1058	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1059	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1060	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1061
1062	/* Load parameters for j particles */
1063	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1064	charge+jnrC+0,charge+jnrD+0);
1065	vdwjidx0A = 2*vdwtype[jnrA+0];
1066	vdwjidx0B = 2*vdwtype[jnrB+0];
1067	vdwjidx0C = 2*vdwtype[jnrC+0];
1068	vdwjidx0D = 2*vdwtype[jnrD+0];
1069
1070	fjx0 = _mm_setzero_ps();
1071	fjy0 = _mm_setzero_ps();
1072	fjz0 = _mm_setzero_ps();
1073
1074	/**************************
1075	* CALCULATE INTERACTIONS *
1076	**************************/
1077
1078	if (gmx_mm_any_lt(rsq00,rcutoff2))
1079	{
1080
1081	r00 = _mm_mul_ps(rsq00,rinv00);
1082
1083	/* Compute parameters for interactions between i and j atoms */
1084	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1085	vdwparam+vdwioffset0+vdwjidx0B,
1086	vdwparam+vdwioffset0+vdwjidx0C,
1087	vdwparam+vdwioffset0+vdwjidx0D,
1088	&c6_00,&c12_00);
1089
1090	c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
1091	vdwgridparam+vdwioffset0+vdwjidx0B,
1092	vdwgridparam+vdwioffset0+vdwjidx0C,
1093	vdwgridparam+vdwioffset0+vdwjidx0D);
1094
1095	/* Analytical LJ-PME */
1096	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1097	ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1098	ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1099	exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
1100	/* poly = exp(-(betar)^2) (1 + (betar)^2 + (betar)^4 /2) */
1101	poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1102	/* f6A = 6 * C6grid * (1 - poly) */
1103	f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1104	/* f6B = C6grid * exponent * beta^6 */
1105	f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1106	/* fvdw = 12C12/r13 - ((6C6 - f6A)/r6 + f6B)/r */
1107	fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1108
1109	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1110
1111	fscal = fvdw;
1112
1113	fscal = _mm_and_ps(fscal,cutoff_mask);
1114
1115	/* Calculate temporary vectorial force */
1116	tx = _mm_mul_ps(fscal,dx00);
1117	ty = _mm_mul_ps(fscal,dy00);
1118	tz = _mm_mul_ps(fscal,dz00);
1119
1120	/* Update vectorial force */
1121	fix0 = _mm_add_ps(fix0,tx);
1122	fiy0 = _mm_add_ps(fiy0,ty);
1123	fiz0 = _mm_add_ps(fiz0,tz);
1124
1125	fjx0 = _mm_add_ps(fjx0,tx);
1126	fjy0 = _mm_add_ps(fjy0,ty);
1127	fjz0 = _mm_add_ps(fjz0,tz);
1128
1129	}
1130
1131	/**************************
1132	* CALCULATE INTERACTIONS *
1133	**************************/
1134
1135	if (gmx_mm_any_lt(rsq10,rcutoff2))
1136	{
1137
1138	r10 = _mm_mul_ps(rsq10,rinv10);
1139
1140	/* Compute parameters for interactions between i and j atoms */
1141	qq10 = _mm_mul_ps(iq1,jq0);
1142
1143	/* EWALD ELECTROSTATICS */
1144
1145	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1146	ewrt = _mm_mul_ps(r10,ewtabscale);
1147	ewitab = _mm_cvttps_epi32(ewrt);
1148	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1149	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1150	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1151	&ewtabF,&ewtabFn);
1152	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1153	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1154
1155	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1156
1157	fscal = felec;
1158
1159	fscal = _mm_and_ps(fscal,cutoff_mask);
1160
1161	/* Calculate temporary vectorial force */
1162	tx = _mm_mul_ps(fscal,dx10);
1163	ty = _mm_mul_ps(fscal,dy10);
1164	tz = _mm_mul_ps(fscal,dz10);
1165
1166	/* Update vectorial force */
1167	fix1 = _mm_add_ps(fix1,tx);
1168	fiy1 = _mm_add_ps(fiy1,ty);
1169	fiz1 = _mm_add_ps(fiz1,tz);
1170
1171	fjx0 = _mm_add_ps(fjx0,tx);
1172	fjy0 = _mm_add_ps(fjy0,ty);
1173	fjz0 = _mm_add_ps(fjz0,tz);
1174
1175	}
1176
1177	/**************************
1178	* CALCULATE INTERACTIONS *
1179	**************************/
1180
1181	if (gmx_mm_any_lt(rsq20,rcutoff2))
1182	{
1183
1184	r20 = _mm_mul_ps(rsq20,rinv20);
1185
1186	/* Compute parameters for interactions between i and j atoms */
1187	qq20 = _mm_mul_ps(iq2,jq0);
1188
1189	/* EWALD ELECTROSTATICS */
1190
1191	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1192	ewrt = _mm_mul_ps(r20,ewtabscale);
1193	ewitab = _mm_cvttps_epi32(ewrt);
1194	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1195	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1196	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1197	&ewtabF,&ewtabFn);
1198	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1199	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1200
1201	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1202
1203	fscal = felec;
1204
1205	fscal = _mm_and_ps(fscal,cutoff_mask);
1206
1207	/* Calculate temporary vectorial force */
1208	tx = _mm_mul_ps(fscal,dx20);
1209	ty = _mm_mul_ps(fscal,dy20);
1210	tz = _mm_mul_ps(fscal,dz20);
1211
1212	/* Update vectorial force */
1213	fix2 = _mm_add_ps(fix2,tx);
1214	fiy2 = _mm_add_ps(fiy2,ty);
1215	fiz2 = _mm_add_ps(fiz2,tz);
1216
1217	fjx0 = _mm_add_ps(fjx0,tx);
1218	fjy0 = _mm_add_ps(fjy0,ty);
1219	fjz0 = _mm_add_ps(fjz0,tz);
1220
1221	}
1222
1223	/**************************
1224	* CALCULATE INTERACTIONS *
1225	**************************/
1226
1227	if (gmx_mm_any_lt(rsq30,rcutoff2))
1228	{
1229
1230	r30 = _mm_mul_ps(rsq30,rinv30);
1231
1232	/* Compute parameters for interactions between i and j atoms */
1233	qq30 = _mm_mul_ps(iq3,jq0);
1234
1235	/* EWALD ELECTROSTATICS */
1236
1237	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1238	ewrt = _mm_mul_ps(r30,ewtabscale);
1239	ewitab = _mm_cvttps_epi32(ewrt);
1240	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1241	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1242	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1243	&ewtabF,&ewtabFn);
1244	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1245	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1246
1247	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1248
1249	fscal = felec;
1250
1251	fscal = _mm_and_ps(fscal,cutoff_mask);
1252
1253	/* Calculate temporary vectorial force */
1254	tx = _mm_mul_ps(fscal,dx30);
1255	ty = _mm_mul_ps(fscal,dy30);
1256	tz = _mm_mul_ps(fscal,dz30);
1257
1258	/* Update vectorial force */
1259	fix3 = _mm_add_ps(fix3,tx);
1260	fiy3 = _mm_add_ps(fiy3,ty);
1261	fiz3 = _mm_add_ps(fiz3,tz);
1262
1263	fjx0 = _mm_add_ps(fjx0,tx);
1264	fjy0 = _mm_add_ps(fjy0,ty);
1265	fjz0 = _mm_add_ps(fjz0,tz);
1266
1267	}
1268
1269	fjptrA = f+j_coord_offsetA;
1270	fjptrB = f+j_coord_offsetB;
1271	fjptrC = f+j_coord_offsetC;
1272	fjptrD = f+j_coord_offsetD;
1273
1274	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1275
1276	/* Inner loop uses 166 flops */
1277	}
1278
1279	if(jidx<j_index_end)
1280	{
1281
1282	/* Get j neighbor index, and coordinate index */
1283	jnrlistA = jjnr[jidx];
1284	jnrlistB = jjnr[jidx+1];
1285	jnrlistC = jjnr[jidx+2];
1286	jnrlistD = jjnr[jidx+3];
1287	/* Sign of each element will be negative for non-real atoms.
1288	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1289	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1290	*/
1291	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1292	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1293	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1294	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1295	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1296	j_coord_offsetA = DIM3*jnrA;
1297	j_coord_offsetB = DIM3*jnrB;
1298	j_coord_offsetC = DIM3*jnrC;
1299	j_coord_offsetD = DIM3*jnrD;
1300
1301	/* load j atom coordinates */
1302	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1303	x+j_coord_offsetC,x+j_coord_offsetD,
1304	&jx0,&jy0,&jz0);
1305
1306	/* Calculate displacement vector */
1307	dx00 = _mm_sub_ps(ix0,jx0);
1308	dy00 = _mm_sub_ps(iy0,jy0);
1309	dz00 = _mm_sub_ps(iz0,jz0);
1310	dx10 = _mm_sub_ps(ix1,jx0);
1311	dy10 = _mm_sub_ps(iy1,jy0);
1312	dz10 = _mm_sub_ps(iz1,jz0);
1313	dx20 = _mm_sub_ps(ix2,jx0);
1314	dy20 = _mm_sub_ps(iy2,jy0);
1315	dz20 = _mm_sub_ps(iz2,jz0);
1316	dx30 = _mm_sub_ps(ix3,jx0);
1317	dy30 = _mm_sub_ps(iy3,jy0);
1318	dz30 = _mm_sub_ps(iz3,jz0);
1319
1320	/* Calculate squared distance and things based on it */
1321	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1322	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1323	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1324	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1325
1326	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1327	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1328	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1329	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
1330
1331	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1332	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1333	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1334	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1335
1336	/* Load parameters for j particles */
1337	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1338	charge+jnrC+0,charge+jnrD+0);
1339	vdwjidx0A = 2*vdwtype[jnrA+0];
1340	vdwjidx0B = 2*vdwtype[jnrB+0];
1341	vdwjidx0C = 2*vdwtype[jnrC+0];
1342	vdwjidx0D = 2*vdwtype[jnrD+0];
1343
1344	fjx0 = _mm_setzero_ps();
1345	fjy0 = _mm_setzero_ps();
1346	fjz0 = _mm_setzero_ps();
1347
1348	/**************************
1349	* CALCULATE INTERACTIONS *
1350	**************************/
1351
1352	if (gmx_mm_any_lt(rsq00,rcutoff2))
1353	{
1354
1355	r00 = _mm_mul_ps(rsq00,rinv00);
1356	r00 = _mm_andnot_ps(dummy_mask,r00);
1357
1358	/* Compute parameters for interactions between i and j atoms */
1359	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1360	vdwparam+vdwioffset0+vdwjidx0B,
1361	vdwparam+vdwioffset0+vdwjidx0C,
1362	vdwparam+vdwioffset0+vdwjidx0D,
1363	&c6_00,&c12_00);
1364
1365	c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
1366	vdwgridparam+vdwioffset0+vdwjidx0B,
1367	vdwgridparam+vdwioffset0+vdwjidx0C,
1368	vdwgridparam+vdwioffset0+vdwjidx0D);
1369
1370	/* Analytical LJ-PME */
1371	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1372	ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1373	ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1374	exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
1375	/* poly = exp(-(betar)^2) (1 + (betar)^2 + (betar)^4 /2) */
1376	poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1377	/* f6A = 6 * C6grid * (1 - poly) */
1378	f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1379	/* f6B = C6grid * exponent * beta^6 */
1380	f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1381	/* fvdw = 12C12/r13 - ((6C6 - f6A)/r6 + f6B)/r */
1382	fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1383
1384	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1385
1386	fscal = fvdw;
1387
1388	fscal = _mm_and_ps(fscal,cutoff_mask);
1389
1390	fscal = _mm_andnot_ps(dummy_mask,fscal);
1391
1392	/* Calculate temporary vectorial force */
1393	tx = _mm_mul_ps(fscal,dx00);
1394	ty = _mm_mul_ps(fscal,dy00);
1395	tz = _mm_mul_ps(fscal,dz00);
1396
1397	/* Update vectorial force */
1398	fix0 = _mm_add_ps(fix0,tx);
1399	fiy0 = _mm_add_ps(fiy0,ty);
1400	fiz0 = _mm_add_ps(fiz0,tz);
1401
1402	fjx0 = _mm_add_ps(fjx0,tx);
1403	fjy0 = _mm_add_ps(fjy0,ty);
1404	fjz0 = _mm_add_ps(fjz0,tz);
1405
1406	}
1407
1408	/**************************
1409	* CALCULATE INTERACTIONS *
1410	**************************/
1411
1412	if (gmx_mm_any_lt(rsq10,rcutoff2))
1413	{
1414
1415	r10 = _mm_mul_ps(rsq10,rinv10);
1416	r10 = _mm_andnot_ps(dummy_mask,r10);
1417
1418	/* Compute parameters for interactions between i and j atoms */
1419	qq10 = _mm_mul_ps(iq1,jq0);
1420
1421	/* EWALD ELECTROSTATICS */
1422
1423	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1424	ewrt = _mm_mul_ps(r10,ewtabscale);
1425	ewitab = _mm_cvttps_epi32(ewrt);
1426	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1427	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1428	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1429	&ewtabF,&ewtabFn);
1430	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1431	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1432
1433	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1434
1435	fscal = felec;
1436
1437	fscal = _mm_and_ps(fscal,cutoff_mask);
1438
1439	fscal = _mm_andnot_ps(dummy_mask,fscal);
1440
1441	/* Calculate temporary vectorial force */
1442	tx = _mm_mul_ps(fscal,dx10);
1443	ty = _mm_mul_ps(fscal,dy10);
1444	tz = _mm_mul_ps(fscal,dz10);
1445
1446	/* Update vectorial force */
1447	fix1 = _mm_add_ps(fix1,tx);
1448	fiy1 = _mm_add_ps(fiy1,ty);
1449	fiz1 = _mm_add_ps(fiz1,tz);
1450
1451	fjx0 = _mm_add_ps(fjx0,tx);
1452	fjy0 = _mm_add_ps(fjy0,ty);
1453	fjz0 = _mm_add_ps(fjz0,tz);
1454
1455	}
1456
1457	/**************************
1458	* CALCULATE INTERACTIONS *
1459	**************************/
1460
1461	if (gmx_mm_any_lt(rsq20,rcutoff2))
1462	{
1463
1464	r20 = _mm_mul_ps(rsq20,rinv20);
1465	r20 = _mm_andnot_ps(dummy_mask,r20);
1466
1467	/* Compute parameters for interactions between i and j atoms */
1468	qq20 = _mm_mul_ps(iq2,jq0);
1469
1470	/* EWALD ELECTROSTATICS */
1471
1472	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1473	ewrt = _mm_mul_ps(r20,ewtabscale);
1474	ewitab = _mm_cvttps_epi32(ewrt);
1475	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1476	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1477	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1478	&ewtabF,&ewtabFn);
1479	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1480	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1481
1482	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1483
1484	fscal = felec;
1485
1486	fscal = _mm_and_ps(fscal,cutoff_mask);
1487
1488	fscal = _mm_andnot_ps(dummy_mask,fscal);
1489
1490	/* Calculate temporary vectorial force */
1491	tx = _mm_mul_ps(fscal,dx20);
1492	ty = _mm_mul_ps(fscal,dy20);
1493	tz = _mm_mul_ps(fscal,dz20);
1494
1495	/* Update vectorial force */
1496	fix2 = _mm_add_ps(fix2,tx);
1497	fiy2 = _mm_add_ps(fiy2,ty);
1498	fiz2 = _mm_add_ps(fiz2,tz);
1499
1500	fjx0 = _mm_add_ps(fjx0,tx);
1501	fjy0 = _mm_add_ps(fjy0,ty);
1502	fjz0 = _mm_add_ps(fjz0,tz);
1503
1504	}
1505
1506	/**************************
1507	* CALCULATE INTERACTIONS *
1508	**************************/
1509
1510	if (gmx_mm_any_lt(rsq30,rcutoff2))
1511	{
1512
1513	r30 = _mm_mul_ps(rsq30,rinv30);
1514	r30 = _mm_andnot_ps(dummy_mask,r30);
1515
1516	/* Compute parameters for interactions between i and j atoms */
1517	qq30 = _mm_mul_ps(iq3,jq0);
1518
1519	/* EWALD ELECTROSTATICS */
1520
1521	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1522	ewrt = _mm_mul_ps(r30,ewtabscale);
1523	ewitab = _mm_cvttps_epi32(ewrt);
1524	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1525	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1526	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1527	&ewtabF,&ewtabFn);
1528	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1529	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1530
1531	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1532
1533	fscal = felec;
1534
1535	fscal = _mm_and_ps(fscal,cutoff_mask);
1536
1537	fscal = _mm_andnot_ps(dummy_mask,fscal);
1538
1539	/* Calculate temporary vectorial force */
1540	tx = _mm_mul_ps(fscal,dx30);
1541	ty = _mm_mul_ps(fscal,dy30);
1542	tz = _mm_mul_ps(fscal,dz30);
1543
1544	/* Update vectorial force */
1545	fix3 = _mm_add_ps(fix3,tx);
1546	fiy3 = _mm_add_ps(fiy3,ty);
1547	fiz3 = _mm_add_ps(fiz3,tz);
1548
1549	fjx0 = _mm_add_ps(fjx0,tx);
1550	fjy0 = _mm_add_ps(fjy0,ty);
1551	fjz0 = _mm_add_ps(fjz0,tz);
1552
1553	}
1554
1555	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1556	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1557	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1558	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1559
1560	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1561
1562	/* Inner loop uses 170 flops */
1563	}
1564
1565	/* End of innermost loop */
1566
1567	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1568	f+i_coord_offset,fshift+i_shift_offset);
1569
1570	/* Increment number of inner iterations */
1571	inneriter += j_index_end - j_index_start;
1572
1573	/* Outer loop uses 24 flops */
1574	}
1575
1576	/* Increment number of outer iterations */
1577	outeriter += nri;
1578
1579	/* Update outer/inner flops */
1580
1581	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter24 + inneriter170)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4_F] += outeriter24 + inneriter 170;
1582	}