/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse4_1_single.c

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse4_1_single.c
Location:	line 139, column 5
Description:	Value stored to 'j_coord_offsetC' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
54	* Electrostatics interaction: Ewald
55	* VdW interaction: None
56	* Geometry: Water4-Particle
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse4_1_single
61	    (t_nblist * gmx_restrict nlist,
62	     rvec * gmx_restrict xx,
63	     rvec * gmx_restrict ff,
64	     t_forcerec * gmx_restrict fr,
65	     t_mdatoms * gmx_restrict mdatoms,
66	     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67	     t_nrnb * gmx_restrict nrnb)
68	{
	    /* Reviewer notes (annotations; these unnumbered lines are not part of the
	     * numbered original source):
	     *
	     * Purpose: for every i entry in nlist, accumulates Ewald electrostatic energies
	     * and forces between three charged i sites (charge[inr+1..3], coordinates read
	     * starting at x+i_coord_offset+DIM) and the j particles listed in jjnr, four
	     * j particles per SIMD pass. sh_ewald is subtracted from 1/r in the energy, and
	     * both energy and force are masked to zero in lanes where rsq >= rcutoff2.
	     *
	     * nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
	     * xx          - coordinates, accessed through x = xx[0].
	     * ff          - forces, accessed through f = ff[0] and handed to the update helpers.
	     * fr          - supplies shift_vec, fshift, epsfac, rcoulomb and the ic-> table data.
	     * mdatoms     - supplies chargeA.
	     * kernel_data - energygrp_elec+gid[iidx] is handed to gmx_mm_update_1pot_ps together
	     *               with velecsum; the parameter is tagged gmx_unused even though it
	     *               is dereferenced below.
	     * nrnb        - flop counter updated by the final inc_nrnb() call.
	     *
	     * NOTE(review): the i charges are taken from nlist->iinr[0] only, and that
	     * element is read before the nri loop bound is consulted.
	     */
69	    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	     * just 0 for non-waters.
71	     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	     * jnr indices corresponding to data put in the four positions in the SIMD register.
73	     */
74	    int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	    int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	    int jnrA,jnrB,jnrC,jnrD;
77	    int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	    int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	    int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80	    real rcutoff_scalar;
81	    real *shiftvec,*fshift,*x,*f;
82	    real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83	    real scratch[4*DIM];
84	    __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	    int vdwioffset1;
86	    __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87	    int vdwioffset2;
88	    __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89	    int vdwioffset3;
90	    __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91	    int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92	    __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93	    __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
94	    __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
95	    __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
96	    __m128 velec,felec,velecsum,facel,crf,krf,krf2;
97	    real *charge;
98	    __m128i ewitab;
99	    __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100	    real *ewtab;
101	    __m128 dummy_mask,cutoff_mask;
102	    __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
103	    __m128 one = _mm_set1_ps(1.0);
104	    __m128 two = _mm_set1_ps(2.0);
105	    x = xx[0];
106	    f = ff[0];
107
108	    nri = nlist->nri;
109	    iinr = nlist->iinr;
110	    jindex = nlist->jindex;
111	    jjnr = nlist->jjnr;
112	    shiftidx = nlist->shift;
113	    gid = nlist->gid;
114	    shiftvec = fr->shift_vec[0];
115	    fshift = fr->fshift[0];
116	    facel = _mm_set1_ps(fr->epsfac);
117	    charge = mdatoms->chargeA;
118
119	    sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
120	    ewtab = fr->ic->tabq_coul_FDV0;
121	    ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
122	    ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
123
124	    /* Setup water-specific parameters */
125	    inr = nlist->iinr[0];
126	    iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
127	    iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
128	    iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
129
130	    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
131	    rcutoff_scalar = fr->rcoulomb;
132	    rcutoff = _mm_set1_ps(rcutoff_scalar);
133	    rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
134
135	    /* Avoid stupid compiler warnings */
136	    jnrA = jnrB = jnrC = jnrD = 0;
137	    j_coord_offsetA = 0;
138	    j_coord_offsetB = 0;
139	    j_coord_offsetC = 0;
	Value stored to 'j_coord_offsetC' is never read
140	    j_coord_offsetD = 0;
	    /* Reviewer note: every j_coord_offsetA..D is reassigned (lines 190-193 or
	     * 424-427) before it is read, so the zero stores on lines 137-140 are dead;
	     * per the comment on line 135 they exist only to silence compiler warnings.
	     */
141
142	    outeriter = 0;
143	    inneriter = 0;
144
145	    for(iidx=0;iidx<4*DIM;iidx++)
146	    {
147	        scratch[iidx] = 0.0;
148	    }
149
150	    /* Start outer loop over neighborlists */
151	    for(iidx=0; iidx<nri; iidx++)
152	    {
153	        /* Load shift vector for this list */
154	        i_shift_offset = DIM*shiftidx[iidx];
155
156	        /* Load limits for loop over neighbors */
157	        j_index_start = jindex[iidx];
158	        j_index_end = jindex[iidx+1];
159
160	        /* Get outer coordinate index */
161	        inr = iinr[iidx];
162	        i_coord_offset = DIM*inr;
163
164	        /* Load i particle coords and add shift vector */
165	        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
166	                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
167
168	        fix1 = _mm_setzero_ps();
169	        fiy1 = _mm_setzero_ps();
170	        fiz1 = _mm_setzero_ps();
171	        fix2 = _mm_setzero_ps();
172	        fiy2 = _mm_setzero_ps();
173	        fiz2 = _mm_setzero_ps();
174	        fix3 = _mm_setzero_ps();
175	        fiy3 = _mm_setzero_ps();
176	        fiz3 = _mm_setzero_ps();
177
178	        /* Reset potential sums */
179	        velecsum = _mm_setzero_ps();
180
181	        /* Start inner kernel loop */
	        /* Reviewer note: this loop only takes groups of four whose last entry is
	         * non-negative; a trailing group with negative (dummy) entries is handled
	         * by the masked block that starts on line 407.
	         */
182	        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
183	        {
184
185	            /* Get j neighbor index, and coordinate index */
186	            jnrA = jjnr[jidx];
187	            jnrB = jjnr[jidx+1];
188	            jnrC = jjnr[jidx+2];
189	            jnrD = jjnr[jidx+3];
190	            j_coord_offsetA = DIM*jnrA;
191	            j_coord_offsetB = DIM*jnrB;
192	            j_coord_offsetC = DIM*jnrC;
193	            j_coord_offsetD = DIM*jnrD;
194
195	            /* load j atom coordinates */
196	            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
197	                                              x+j_coord_offsetC,x+j_coord_offsetD,
198	                                              &jx0,&jy0,&jz0);
199
200	            /* Calculate displacement vector */
201	            dx10 = _mm_sub_ps(ix1,jx0);
202	            dy10 = _mm_sub_ps(iy1,jy0);
203	            dz10 = _mm_sub_ps(iz1,jz0);
204	            dx20 = _mm_sub_ps(ix2,jx0);
205	            dy20 = _mm_sub_ps(iy2,jy0);
206	            dz20 = _mm_sub_ps(iz2,jz0);
207	            dx30 = _mm_sub_ps(ix3,jx0);
208	            dy30 = _mm_sub_ps(iy3,jy0);
209	            dz30 = _mm_sub_ps(iz3,jz0);
210
211	            /* Calculate squared distance and things based on it */
212	            rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
213	            rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
214	            rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
215
216	            rinv10 = gmx_mm_invsqrt_ps(rsq10);
217	            rinv20 = gmx_mm_invsqrt_ps(rsq20);
218	            rinv30 = gmx_mm_invsqrt_ps(rsq30);
219
220	            rinvsq10 = _mm_mul_ps(rinv10,rinv10);
221	            rinvsq20 = _mm_mul_ps(rinv20,rinv20);
222	            rinvsq30 = _mm_mul_ps(rinv30,rinv30);
223
224	            /* Load parameters for j particles */
225	            jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
226	                                               charge+jnrC+0,charge+jnrD+0);
227
228	            fjx0 = _mm_setzero_ps();
229	            fjy0 = _mm_setzero_ps();
230	            fjz0 = _mm_setzero_ps();
231
232	            /**************************
233	             * CALCULATE INTERACTIONS *
234	             **************************/
235
236	            if (gmx_mm_any_lt(rsq10,rcutoff2))
237	            {
238
239	                r10 = _mm_mul_ps(rsq10,rinv10);
240
241	                /* Compute parameters for interactions between i and j atoms */
242	                qq10 = _mm_mul_ps(iq1,jq0);
243
244	                /* EWALD ELECTROSTATICS */
245
246	                /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
247	                ewrt = _mm_mul_ps(r10,ewtabscale);
248	                ewitab = _mm_cvttps_epi32(ewrt);
249	                eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
250	                ewitab = _mm_slli_epi32(ewitab,2);
251	                ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
252	                ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
253	                ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
254	                ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
255	                _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
256	                felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
257	                velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
258	                velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
259	                felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
260
261	                cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
262
263	                /* Update potential sum for this i atom from the interaction with this j atom. */
264	                velec = _mm_and_ps(velec,cutoff_mask);
265	                velecsum = _mm_add_ps(velecsum,velec);
266
267	                fscal = felec;
268
269	                fscal = _mm_and_ps(fscal,cutoff_mask);
270
271	                /* Calculate temporary vectorial force */
272	                tx = _mm_mul_ps(fscal,dx10);
273	                ty = _mm_mul_ps(fscal,dy10);
274	                tz = _mm_mul_ps(fscal,dz10);
275
276	                /* Update vectorial force */
277	                fix1 = _mm_add_ps(fix1,tx);
278	                fiy1 = _mm_add_ps(fiy1,ty);
279	                fiz1 = _mm_add_ps(fiz1,tz);
280
281	                fjx0 = _mm_add_ps(fjx0,tx);
282	                fjy0 = _mm_add_ps(fjy0,ty);
283	                fjz0 = _mm_add_ps(fjz0,tz);
284
285	            }
286
287	            /**************************
288	             * CALCULATE INTERACTIONS *
289	             **************************/
290
291	            if (gmx_mm_any_lt(rsq20,rcutoff2))
292	            {
293
294	                r20 = _mm_mul_ps(rsq20,rinv20);
295
296	                /* Compute parameters for interactions between i and j atoms */
297	                qq20 = _mm_mul_ps(iq2,jq0);
298
299	                /* EWALD ELECTROSTATICS */
300
301	                /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
302	                ewrt = _mm_mul_ps(r20,ewtabscale);
303	                ewitab = _mm_cvttps_epi32(ewrt);
304	                eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
305	                ewitab = _mm_slli_epi32(ewitab,2);
306	                ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
307	                ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
308	                ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
309	                ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
310	                _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
311	                felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
312	                velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
313	                velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
314	                felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
315
316	                cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
317
318	                /* Update potential sum for this i atom from the interaction with this j atom. */
319	                velec = _mm_and_ps(velec,cutoff_mask);
320	                velecsum = _mm_add_ps(velecsum,velec);
321
322	                fscal = felec;
323
324	                fscal = _mm_and_ps(fscal,cutoff_mask);
325
326	                /* Calculate temporary vectorial force */
327	                tx = _mm_mul_ps(fscal,dx20);
328	                ty = _mm_mul_ps(fscal,dy20);
329	                tz = _mm_mul_ps(fscal,dz20);
330
331	                /* Update vectorial force */
332	                fix2 = _mm_add_ps(fix2,tx);
333	                fiy2 = _mm_add_ps(fiy2,ty);
334	                fiz2 = _mm_add_ps(fiz2,tz);
335
336	                fjx0 = _mm_add_ps(fjx0,tx);
337	                fjy0 = _mm_add_ps(fjy0,ty);
338	                fjz0 = _mm_add_ps(fjz0,tz);
339
340	            }
341
342	            /**************************
343	             * CALCULATE INTERACTIONS *
344	             **************************/
345
346	            if (gmx_mm_any_lt(rsq30,rcutoff2))
347	            {
348
349	                r30 = _mm_mul_ps(rsq30,rinv30);
350
351	                /* Compute parameters for interactions between i and j atoms */
352	                qq30 = _mm_mul_ps(iq3,jq0);
353
354	                /* EWALD ELECTROSTATICS */
355
356	                /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
357	                ewrt = _mm_mul_ps(r30,ewtabscale);
358	                ewitab = _mm_cvttps_epi32(ewrt);
359	                eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
360	                ewitab = _mm_slli_epi32(ewitab,2);
361	                ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
362	                ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
363	                ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
364	                ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
365	                _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
366	                felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
367	                velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
368	                velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
369	                felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
370
371	                cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
372
373	                /* Update potential sum for this i atom from the interaction with this j atom. */
374	                velec = _mm_and_ps(velec,cutoff_mask);
375	                velecsum = _mm_add_ps(velecsum,velec);
376
377	                fscal = felec;
378
379	                fscal = _mm_and_ps(fscal,cutoff_mask);
380
381	                /* Calculate temporary vectorial force */
382	                tx = _mm_mul_ps(fscal,dx30);
383	                ty = _mm_mul_ps(fscal,dy30);
384	                tz = _mm_mul_ps(fscal,dz30);
385
386	                /* Update vectorial force */
387	                fix3 = _mm_add_ps(fix3,tx);
388	                fiy3 = _mm_add_ps(fiy3,ty);
389	                fiz3 = _mm_add_ps(fiz3,tz);
390
391	                fjx0 = _mm_add_ps(fjx0,tx);
392	                fjy0 = _mm_add_ps(fjy0,ty);
393	                fjz0 = _mm_add_ps(fjz0,tz);
394
395	            }
396
397	            fjptrA = f+j_coord_offsetA;
398	            fjptrB = f+j_coord_offsetB;
399	            fjptrC = f+j_coord_offsetC;
400	            fjptrD = f+j_coord_offsetD;
401
402	            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
403
404	            /* Inner loop uses 138 flops */
405	        }
406
407	        if(jidx<j_index_end)
408	        {
409
410	            /* Get j neighbor index, and coordinate index */
411	            jnrlistA = jjnr[jidx];
412	            jnrlistB = jjnr[jidx+1];
413	            jnrlistC = jjnr[jidx+2];
414	            jnrlistD = jjnr[jidx+3];
415	            /* Sign of each element will be negative for non-real atoms.
416	             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
417	             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
418	             */
419	            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
420	            jnrA = (jnrlistA>=0) ? jnrlistA : 0;
421	            jnrB = (jnrlistB>=0) ? jnrlistB : 0;
422	            jnrC = (jnrlistC>=0) ? jnrlistC : 0;
423	            jnrD = (jnrlistD>=0) ? jnrlistD : 0;
424	            j_coord_offsetA = DIM*jnrA;
425	            j_coord_offsetB = DIM*jnrB;
426	            j_coord_offsetC = DIM*jnrC;
427	            j_coord_offsetD = DIM*jnrD;
428
429	            /* load j atom coordinates */
430	            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
431	                                              x+j_coord_offsetC,x+j_coord_offsetD,
432	                                              &jx0,&jy0,&jz0);
433
434	            /* Calculate displacement vector */
435	            dx10 = _mm_sub_ps(ix1,jx0);
436	            dy10 = _mm_sub_ps(iy1,jy0);
437	            dz10 = _mm_sub_ps(iz1,jz0);
438	            dx20 = _mm_sub_ps(ix2,jx0);
439	            dy20 = _mm_sub_ps(iy2,jy0);
440	            dz20 = _mm_sub_ps(iz2,jz0);
441	            dx30 = _mm_sub_ps(ix3,jx0);
442	            dy30 = _mm_sub_ps(iy3,jy0);
443	            dz30 = _mm_sub_ps(iz3,jz0);
444
445	            /* Calculate squared distance and things based on it */
446	            rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
447	            rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
448	            rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
449
450	            rinv10 = gmx_mm_invsqrt_ps(rsq10);
451	            rinv20 = gmx_mm_invsqrt_ps(rsq20);
452	            rinv30 = gmx_mm_invsqrt_ps(rsq30);
453
454	            rinvsq10 = _mm_mul_ps(rinv10,rinv10);
455	            rinvsq20 = _mm_mul_ps(rinv20,rinv20);
456	            rinvsq30 = _mm_mul_ps(rinv30,rinv30);
457
458	            /* Load parameters for j particles */
459	            jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
460	                                               charge+jnrC+0,charge+jnrD+0);
461
462	            fjx0 = _mm_setzero_ps();
463	            fjy0 = _mm_setzero_ps();
464	            fjz0 = _mm_setzero_ps();
465
466	            /**************************
467	             * CALCULATE INTERACTIONS *
468	             **************************/
469
470	            if (gmx_mm_any_lt(rsq10,rcutoff2))
471	            {
472
473	                r10 = _mm_mul_ps(rsq10,rinv10);
474	                r10 = _mm_andnot_ps(dummy_mask,r10);
475
476	                /* Compute parameters for interactions between i and j atoms */
477	                qq10 = _mm_mul_ps(iq1,jq0);
478
479	                /* EWALD ELECTROSTATICS */
480
481	                /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
482	                ewrt = _mm_mul_ps(r10,ewtabscale);
483	                ewitab = _mm_cvttps_epi32(ewrt);
484	                eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
485	                ewitab = _mm_slli_epi32(ewitab,2);
486	                ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
487	                ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
488	                ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
489	                ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
490	                _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
491	                felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
492	                velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
493	                velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
494	                felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
495
496	                cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
497
498	                /* Update potential sum for this i atom from the interaction with this j atom. */
499	                velec = _mm_and_ps(velec,cutoff_mask);
500	                velec = _mm_andnot_ps(dummy_mask,velec);
501	                velecsum = _mm_add_ps(velecsum,velec);
502
503	                fscal = felec;
504
505	                fscal = _mm_and_ps(fscal,cutoff_mask);
506
507	                fscal = _mm_andnot_ps(dummy_mask,fscal);
508
509	                /* Calculate temporary vectorial force */
510	                tx = _mm_mul_ps(fscal,dx10);
511	                ty = _mm_mul_ps(fscal,dy10);
512	                tz = _mm_mul_ps(fscal,dz10);
513
514	                /* Update vectorial force */
515	                fix1 = _mm_add_ps(fix1,tx);
516	                fiy1 = _mm_add_ps(fiy1,ty);
517	                fiz1 = _mm_add_ps(fiz1,tz);
518
519	                fjx0 = _mm_add_ps(fjx0,tx);
520	                fjy0 = _mm_add_ps(fjy0,ty);
521	                fjz0 = _mm_add_ps(fjz0,tz);
522
523	            }
524
525	            /**************************
526	             * CALCULATE INTERACTIONS *
527	             **************************/
528
529	            if (gmx_mm_any_lt(rsq20,rcutoff2))
530	            {
531
532	                r20 = _mm_mul_ps(rsq20,rinv20);
533	                r20 = _mm_andnot_ps(dummy_mask,r20);
534
535	                /* Compute parameters for interactions between i and j atoms */
536	                qq20 = _mm_mul_ps(iq2,jq0);
537
538	                /* EWALD ELECTROSTATICS */
539
540	                /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
541	                ewrt = _mm_mul_ps(r20,ewtabscale);
542	                ewitab = _mm_cvttps_epi32(ewrt);
543	                eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
544	                ewitab = _mm_slli_epi32(ewitab,2);
545	                ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
546	                ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
547	                ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
548	                ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
549	                _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
550	                felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
551	                velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
552	                velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
553	                felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
554
555	                cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
556
557	                /* Update potential sum for this i atom from the interaction with this j atom. */
558	                velec = _mm_and_ps(velec,cutoff_mask);
559	                velec = _mm_andnot_ps(dummy_mask,velec);
560	                velecsum = _mm_add_ps(velecsum,velec);
561
562	                fscal = felec;
563
564	                fscal = _mm_and_ps(fscal,cutoff_mask);
565
566	                fscal = _mm_andnot_ps(dummy_mask,fscal);
567
568	                /* Calculate temporary vectorial force */
569	                tx = _mm_mul_ps(fscal,dx20);
570	                ty = _mm_mul_ps(fscal,dy20);
571	                tz = _mm_mul_ps(fscal,dz20);
572
573	                /* Update vectorial force */
574	                fix2 = _mm_add_ps(fix2,tx);
575	                fiy2 = _mm_add_ps(fiy2,ty);
576	                fiz2 = _mm_add_ps(fiz2,tz);
577
578	                fjx0 = _mm_add_ps(fjx0,tx);
579	                fjy0 = _mm_add_ps(fjy0,ty);
580	                fjz0 = _mm_add_ps(fjz0,tz);
581
582	            }
583
584	            /**************************
585	             * CALCULATE INTERACTIONS *
586	             **************************/
587
588	            if (gmx_mm_any_lt(rsq30,rcutoff2))
589	            {
590
591	                r30 = _mm_mul_ps(rsq30,rinv30);
592	                r30 = _mm_andnot_ps(dummy_mask,r30);
593
594	                /* Compute parameters for interactions between i and j atoms */
595	                qq30 = _mm_mul_ps(iq3,jq0);
596
597	                /* EWALD ELECTROSTATICS */
598
599	                /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
600	                ewrt = _mm_mul_ps(r30,ewtabscale);
601	                ewitab = _mm_cvttps_epi32(ewrt);
602	                eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
603	                ewitab = _mm_slli_epi32(ewitab,2);
604	                ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
605	                ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
606	                ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
607	                ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
608	                _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
609	                felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
610	                velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
611	                velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
612	                felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
613
614	                cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
615
616	                /* Update potential sum for this i atom from the interaction with this j atom. */
617	                velec = _mm_and_ps(velec,cutoff_mask);
618	                velec = _mm_andnot_ps(dummy_mask,velec);
619	                velecsum = _mm_add_ps(velecsum,velec);
620
621	                fscal = felec;
622
623	                fscal = _mm_and_ps(fscal,cutoff_mask);
624
625	                fscal = _mm_andnot_ps(dummy_mask,fscal);
626
627	                /* Calculate temporary vectorial force */
628	                tx = _mm_mul_ps(fscal,dx30);
629	                ty = _mm_mul_ps(fscal,dy30);
630	                tz = _mm_mul_ps(fscal,dz30);
631
632	                /* Update vectorial force */
633	                fix3 = _mm_add_ps(fix3,tx);
634	                fiy3 = _mm_add_ps(fiy3,ty);
635	                fiz3 = _mm_add_ps(fiz3,tz);
636
637	                fjx0 = _mm_add_ps(fjx0,tx);
638	                fjy0 = _mm_add_ps(fjy0,ty);
639	                fjz0 = _mm_add_ps(fjz0,tz);
640
641	            }
642
	            /* Reviewer note: lanes whose list entry is negative are pointed at the
	             * local scratch buffer instead of f, so the store helper below always
	             * receives four valid destinations.
	             */
643	            fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
644	            fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
645	            fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
646	            fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
647
648	            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
649
650	            /* Inner loop uses 141 flops */
651	        }
652
653	        /* End of innermost loop */
654
655	        gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
656	                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
657
658	        ggid = gid[iidx];
659	        /* Update potential energies */
660	        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
661
662	        /* Increment number of inner iterations */
663	        inneriter += j_index_end - j_index_start;
664
665	        /* Outer loop uses 19 flops */
666	    }
667
668	    /* Increment number of outer iterations */
669	    outeriter += nri;
670
671	    /* Update outer/inner flops */
672
673	    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*141);
674	}
675	/*
676	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
677	* Electrostatics interaction: Ewald
678	* VdW interaction: None
679	* Geometry: Water4-Particle
680	* Calculate force/pot: Force
681	*/
682	void
683	nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
684	(t_nblist * gmx_restrict nlist,
685	rvec * gmx_restrict xx,
686	rvec * gmx_restrict ff,
687	t_forcerec * gmx_restrict fr,
688	t_mdatoms * gmx_restrict mdatoms,
689	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
690	t_nrnb * gmx_restrict nrnb)
691	{
692	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
693	* just 0 for non-waters.
694	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
695	* jnr indices corresponding to data put in the four positions in the SIMD register.
696	*/
697	int i_shift_offset,i_coord_offset,outeriter,inneriter;
698	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
699	int jnrA,jnrB,jnrC,jnrD;
700	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
701	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
702	int iinr,jindex,jjnr,shiftidx,*gid;
703	real rcutoff_scalar;
704	real shiftvec,fshift,x,f;
705	real fjptrA,fjptrB,fjptrC,fjptrD;
706	real scratch[4*DIM3];
707	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
708	int vdwioffset1;
709	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
710	int vdwioffset2;
711	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
712	int vdwioffset3;
713	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
714	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
715	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
716	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
717	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
718	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
719	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
720	real *charge;
721	__m128i ewitab;
722	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
723	real *ewtab;
724	__m128 dummy_mask,cutoff_mask;
725	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
726	__m128 one = _mm_set1_ps(1.0);
727	__m128 two = _mm_set1_ps(2.0);
728	x = xx[0];
729	f = ff[0];
730
731	nri = nlist->nri;
732	iinr = nlist->iinr;
733	jindex = nlist->jindex;
734	jjnr = nlist->jjnr;
735	shiftidx = nlist->shift;
736	gid = nlist->gid;
737	shiftvec = fr->shift_vec[0];
738	fshift = fr->fshift[0];
739	facel = _mm_set1_ps(fr->epsfac);
740	charge = mdatoms->chargeA;
741
742	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
743	ewtab = fr->ic->tabq_coul_F;
744	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
745	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
746
747	/* Setup water-specific parameters */
748	inr = nlist->iinr[0];
749	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
750	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
751	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
752
753	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
754	rcutoff_scalar = fr->rcoulomb;
755	rcutoff = _mm_set1_ps(rcutoff_scalar);
756	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
757
758	/* Avoid stupid compiler warnings */
759	jnrA = jnrB = jnrC = jnrD = 0;
760	j_coord_offsetA = 0;
761	j_coord_offsetB = 0;
762	j_coord_offsetC = 0;
763	j_coord_offsetD = 0;
764
765	outeriter = 0;
766	inneriter = 0;
767
768	for(iidx=0;iidx<4*DIM3;iidx++)
769	{
770	scratch[iidx] = 0.0;
771	}
772
773	/* Start outer loop over neighborlists */
774	for(iidx=0; iidx<nri; iidx++)
775	{
776	/* Load shift vector for this list */
777	i_shift_offset = DIM3*shiftidx[iidx];
778
779	/* Load limits for loop over neighbors */
780	j_index_start = jindex[iidx];
781	j_index_end = jindex[iidx+1];
782
783	/* Get outer coordinate index */
784	inr = iinr[iidx];
785	i_coord_offset = DIM3*inr;
786
787	/* Load i particle coords and add shift vector */
788	gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM3,
789	&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
790
791	fix1 = _mm_setzero_ps();
792	fiy1 = _mm_setzero_ps();
793	fiz1 = _mm_setzero_ps();
794	fix2 = _mm_setzero_ps();
795	fiy2 = _mm_setzero_ps();
796	fiz2 = _mm_setzero_ps();
797	fix3 = _mm_setzero_ps();
798	fiy3 = _mm_setzero_ps();
799	fiz3 = _mm_setzero_ps();
800
801	/* Start inner kernel loop */
802	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
803	{
804
805	/* Get j neighbor index, and coordinate index */
806	jnrA = jjnr[jidx];
807	jnrB = jjnr[jidx+1];
808	jnrC = jjnr[jidx+2];
809	jnrD = jjnr[jidx+3];
810	j_coord_offsetA = DIM3*jnrA;
811	j_coord_offsetB = DIM3*jnrB;
812	j_coord_offsetC = DIM3*jnrC;
813	j_coord_offsetD = DIM3*jnrD;
814
815	/* load j atom coordinates */
816	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
817	x+j_coord_offsetC,x+j_coord_offsetD,
818	&jx0,&jy0,&jz0);
819
820	/* Calculate displacement vector */
821	dx10 = _mm_sub_ps(ix1,jx0);
822	dy10 = _mm_sub_ps(iy1,jy0);
823	dz10 = _mm_sub_ps(iz1,jz0);
824	dx20 = _mm_sub_ps(ix2,jx0);
825	dy20 = _mm_sub_ps(iy2,jy0);
826	dz20 = _mm_sub_ps(iz2,jz0);
827	dx30 = _mm_sub_ps(ix3,jx0);
828	dy30 = _mm_sub_ps(iy3,jy0);
829	dz30 = _mm_sub_ps(iz3,jz0);
830
831	/* Calculate squared distance and things based on it */
832	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
833	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
834	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
835
836	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
837	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
838	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
839
840	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
841	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
842	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
843
844	/* Load parameters for j particles */
845	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
846	charge+jnrC+0,charge+jnrD+0);
847
848	fjx0 = _mm_setzero_ps();
849	fjy0 = _mm_setzero_ps();
850	fjz0 = _mm_setzero_ps();
851
852	/**************************
853	* CALCULATE INTERACTIONS *
854	**************************/
855
856	if (gmx_mm_any_lt(rsq10,rcutoff2))
857	{
858
859	r10 = _mm_mul_ps(rsq10,rinv10);
860
861	/* Compute parameters for interactions between i and j atoms */
862	qq10 = _mm_mul_ps(iq1,jq0);
863
864	/* EWALD ELECTROSTATICS */
865
866	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
867	ewrt = _mm_mul_ps(r10,ewtabscale);
868	ewitab = _mm_cvttps_epi32(ewrt);
869	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
870	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
871	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
872	&ewtabF,&ewtabFn);
873	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
874	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
875
876	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
877
878	fscal = felec;
879
880	fscal = _mm_and_ps(fscal,cutoff_mask);
881
882	/* Calculate temporary vectorial force */
883	tx = _mm_mul_ps(fscal,dx10);
884	ty = _mm_mul_ps(fscal,dy10);
885	tz = _mm_mul_ps(fscal,dz10);
886
887	/* Update vectorial force */
888	fix1 = _mm_add_ps(fix1,tx);
889	fiy1 = _mm_add_ps(fiy1,ty);
890	fiz1 = _mm_add_ps(fiz1,tz);
891
892	fjx0 = _mm_add_ps(fjx0,tx);
893	fjy0 = _mm_add_ps(fjy0,ty);
894	fjz0 = _mm_add_ps(fjz0,tz);
895
896	}
897
898	/**************************
899	* CALCULATE INTERACTIONS *
900	**************************/
901
902	if (gmx_mm_any_lt(rsq20,rcutoff2))
903	{
904
905	r20 = _mm_mul_ps(rsq20,rinv20);
906
907	/* Compute parameters for interactions between i and j atoms */
908	qq20 = _mm_mul_ps(iq2,jq0);
909
910	/* EWALD ELECTROSTATICS */
911
912	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
913	ewrt = _mm_mul_ps(r20,ewtabscale);
914	ewitab = _mm_cvttps_epi32(ewrt);
915	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
916	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
917	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
918	&ewtabF,&ewtabFn);
919	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
920	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
921
922	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
923
924	fscal = felec;
925
926	fscal = _mm_and_ps(fscal,cutoff_mask);
927
928	/* Calculate temporary vectorial force */
929	tx = _mm_mul_ps(fscal,dx20);
930	ty = _mm_mul_ps(fscal,dy20);
931	tz = _mm_mul_ps(fscal,dz20);
932
933	/* Update vectorial force */
934	fix2 = _mm_add_ps(fix2,tx);
935	fiy2 = _mm_add_ps(fiy2,ty);
936	fiz2 = _mm_add_ps(fiz2,tz);
937
938	fjx0 = _mm_add_ps(fjx0,tx);
939	fjy0 = _mm_add_ps(fjy0,ty);
940	fjz0 = _mm_add_ps(fjz0,tz);
941
942	}
943
944	/**************************
945	* CALCULATE INTERACTIONS *
946	**************************/
947
948	if (gmx_mm_any_lt(rsq30,rcutoff2))
949	{
950
951	r30 = _mm_mul_ps(rsq30,rinv30);
952
953	/* Compute parameters for interactions between i and j atoms */
954	qq30 = _mm_mul_ps(iq3,jq0);
955
956	/* EWALD ELECTROSTATICS */
957
958	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
959	ewrt = _mm_mul_ps(r30,ewtabscale);
960	ewitab = _mm_cvttps_epi32(ewrt);
961	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
962	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
963	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
964	&ewtabF,&ewtabFn);
965	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
966	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
967
968	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
969
970	fscal = felec;
971
972	fscal = _mm_and_ps(fscal,cutoff_mask);
973
974	/* Calculate temporary vectorial force */
975	tx = _mm_mul_ps(fscal,dx30);
976	ty = _mm_mul_ps(fscal,dy30);
977	tz = _mm_mul_ps(fscal,dz30);
978
979	/* Update vectorial force */
980	fix3 = _mm_add_ps(fix3,tx);
981	fiy3 = _mm_add_ps(fiy3,ty);
982	fiz3 = _mm_add_ps(fiz3,tz);
983
984	fjx0 = _mm_add_ps(fjx0,tx);
985	fjy0 = _mm_add_ps(fjy0,ty);
986	fjz0 = _mm_add_ps(fjz0,tz);
987
988	}
989
990	fjptrA = f+j_coord_offsetA;
991	fjptrB = f+j_coord_offsetB;
992	fjptrC = f+j_coord_offsetC;
993	fjptrD = f+j_coord_offsetD;
994
995	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
996
997	/* Inner loop uses 117 flops */
998	}
999
1000	if(jidx<j_index_end)
1001	{
1002
1003	/* Get j neighbor index, and coordinate index */
1004	jnrlistA = jjnr[jidx];
1005	jnrlistB = jjnr[jidx+1];
1006	jnrlistC = jjnr[jidx+2];
1007	jnrlistD = jjnr[jidx+3];
1008	/* Sign of each element will be negative for non-real atoms.
1009	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1010	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1011	*/
1012	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1013	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1014	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1015	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1016	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1017	j_coord_offsetA = DIM3*jnrA;
1018	j_coord_offsetB = DIM3*jnrB;
1019	j_coord_offsetC = DIM3*jnrC;
1020	j_coord_offsetD = DIM3*jnrD;
1021
1022	/* load j atom coordinates */
1023	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1024	x+j_coord_offsetC,x+j_coord_offsetD,
1025	&jx0,&jy0,&jz0);
1026
1027	/* Calculate displacement vector */
1028	dx10 = _mm_sub_ps(ix1,jx0);
1029	dy10 = _mm_sub_ps(iy1,jy0);
1030	dz10 = _mm_sub_ps(iz1,jz0);
1031	dx20 = _mm_sub_ps(ix2,jx0);
1032	dy20 = _mm_sub_ps(iy2,jy0);
1033	dz20 = _mm_sub_ps(iz2,jz0);
1034	dx30 = _mm_sub_ps(ix3,jx0);
1035	dy30 = _mm_sub_ps(iy3,jy0);
1036	dz30 = _mm_sub_ps(iz3,jz0);
1037
1038	/* Calculate squared distance and things based on it */
1039	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1040	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1041	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1042
1043	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1044	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1045	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
1046
1047	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1048	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1049	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1050
1051	/* Load parameters for j particles */
1052	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1053	charge+jnrC+0,charge+jnrD+0);
1054
1055	fjx0 = _mm_setzero_ps();
1056	fjy0 = _mm_setzero_ps();
1057	fjz0 = _mm_setzero_ps();
1058
1059	/**************************
1060	* CALCULATE INTERACTIONS *
1061	**************************/
1062
1063	if (gmx_mm_any_lt(rsq10,rcutoff2))
1064	{
1065
1066	r10 = _mm_mul_ps(rsq10,rinv10);
1067	r10 = _mm_andnot_ps(dummy_mask,r10);
1068
1069	/* Compute parameters for interactions between i and j atoms */
1070	qq10 = _mm_mul_ps(iq1,jq0);
1071
1072	/* EWALD ELECTROSTATICS */
1073
1074	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1075	ewrt = _mm_mul_ps(r10,ewtabscale);
1076	ewitab = _mm_cvttps_epi32(ewrt);
1077	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1078	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1079	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1080	&ewtabF,&ewtabFn);
1081	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1082	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1083
1084	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1085
1086	fscal = felec;
1087
1088	fscal = _mm_and_ps(fscal,cutoff_mask);
1089
1090	fscal = _mm_andnot_ps(dummy_mask,fscal);
1091
1092	/* Calculate temporary vectorial force */
1093	tx = _mm_mul_ps(fscal,dx10);
1094	ty = _mm_mul_ps(fscal,dy10);
1095	tz = _mm_mul_ps(fscal,dz10);
1096
1097	/* Update vectorial force */
1098	fix1 = _mm_add_ps(fix1,tx);
1099	fiy1 = _mm_add_ps(fiy1,ty);
1100	fiz1 = _mm_add_ps(fiz1,tz);
1101
1102	fjx0 = _mm_add_ps(fjx0,tx);
1103	fjy0 = _mm_add_ps(fjy0,ty);
1104	fjz0 = _mm_add_ps(fjz0,tz);
1105
1106	}
1107
1108	/**************************
1109	* CALCULATE INTERACTIONS *
1110	**************************/
1111
1112	if (gmx_mm_any_lt(rsq20,rcutoff2))
1113	{
1114
1115	r20 = _mm_mul_ps(rsq20,rinv20);
1116	r20 = _mm_andnot_ps(dummy_mask,r20);
1117
1118	/* Compute parameters for interactions between i and j atoms */
1119	qq20 = _mm_mul_ps(iq2,jq0);
1120
1121	/* EWALD ELECTROSTATICS */
1122
1123	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1124	ewrt = _mm_mul_ps(r20,ewtabscale);
1125	ewitab = _mm_cvttps_epi32(ewrt);
1126	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1127	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1128	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1129	&ewtabF,&ewtabFn);
1130	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1131	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1132
1133	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1134
1135	fscal = felec;
1136
1137	fscal = _mm_and_ps(fscal,cutoff_mask);
1138
1139	fscal = _mm_andnot_ps(dummy_mask,fscal);
1140
1141	/* Calculate temporary vectorial force */
1142	tx = _mm_mul_ps(fscal,dx20);
1143	ty = _mm_mul_ps(fscal,dy20);
1144	tz = _mm_mul_ps(fscal,dz20);
1145
1146	/* Update vectorial force */
1147	fix2 = _mm_add_ps(fix2,tx);
1148	fiy2 = _mm_add_ps(fiy2,ty);
1149	fiz2 = _mm_add_ps(fiz2,tz);
1150
1151	fjx0 = _mm_add_ps(fjx0,tx);
1152	fjy0 = _mm_add_ps(fjy0,ty);
1153	fjz0 = _mm_add_ps(fjz0,tz);
1154
1155	}
1156
1157	/**************************
1158	* CALCULATE INTERACTIONS *
1159	**************************/
1160
1161	if (gmx_mm_any_lt(rsq30,rcutoff2))
1162	{
1163
1164	r30 = _mm_mul_ps(rsq30,rinv30);
1165	r30 = _mm_andnot_ps(dummy_mask,r30);
1166
1167	/* Compute parameters for interactions between i and j atoms */
1168	qq30 = _mm_mul_ps(iq3,jq0);
1169
1170	/* EWALD ELECTROSTATICS */
1171
1172	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1173	ewrt = _mm_mul_ps(r30,ewtabscale);
1174	ewitab = _mm_cvttps_epi32(ewrt);
1175	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1176	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1177	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1178	&ewtabF,&ewtabFn);
1179	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1180	felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1181
1182	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1183
1184	fscal = felec;
1185
1186	fscal = _mm_and_ps(fscal,cutoff_mask);
1187
1188	fscal = _mm_andnot_ps(dummy_mask,fscal);
1189
1190	/* Calculate temporary vectorial force */
1191	tx = _mm_mul_ps(fscal,dx30);
1192	ty = _mm_mul_ps(fscal,dy30);
1193	tz = _mm_mul_ps(fscal,dz30);
1194
1195	/* Update vectorial force */
1196	fix3 = _mm_add_ps(fix3,tx);
1197	fiy3 = _mm_add_ps(fiy3,ty);
1198	fiz3 = _mm_add_ps(fiz3,tz);
1199
1200	fjx0 = _mm_add_ps(fjx0,tx);
1201	fjy0 = _mm_add_ps(fjy0,ty);
1202	fjz0 = _mm_add_ps(fjz0,tz);
1203
1204	}
1205
1206	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1207	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1208	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1209	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1210
1211	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1212
1213	/* Inner loop uses 120 flops */
1214	}
1215
1216	/* End of innermost loop */
1217
1218	gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1219	f+i_coord_offset+DIM3,fshift+i_shift_offset);
1220
1221	/* Increment number of inner iterations */
1222	inneriter += j_index_end - j_index_start;
1223
1224	/* Outer loop uses 18 flops */
1225	}
1226
1227	/* Increment number of outer iterations */
1228	outeriter += nri;
1229
1230	/* Update outer/inner flops */
1231
1232	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter18 + inneriter120)(nrnb)->n[eNR_NBKERNEL_ELEC_W4_F] += outeriter18 + inneriter 120;
1233	}