/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse4_1_single.c

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse4_1_single.c
Location:	line 110, column 22
Description:	Value stored to 'signbit' during its initialization is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
54	* Electrostatics interaction: ReactionField
55	* VdW interaction: LennardJones
56	* Geometry: Water4-Particle
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80	real rcutoff_scalar;
81	real *shiftvec,*fshift,*x,*f;
82	real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83	real scratch[4*DIM];
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwioffset1;
88	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89	int vdwioffset2;
90	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91	int vdwioffset3;
92	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
100	real *charge;
101	int nvdwtype;
102	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103	int *vdwtype;
104	real *vdwparam;
105	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
106	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
107	__m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
108	real rswitch_scalar,d_scalar;
109	__m128 dummy_mask,cutoff_mask;
110	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
	Value stored to 'signbit' during its initialization is never read
111	__m128 one = _mm_set1_ps(1.0);
112	__m128 two = _mm_set1_ps(2.0);
113	x = xx[0];
114	f = ff[0];
115
116	nri = nlist->nri;
117	iinr = nlist->iinr;
118	jindex = nlist->jindex;
119	jjnr = nlist->jjnr;
120	shiftidx = nlist->shift;
121	gid = nlist->gid;
122	shiftvec = fr->shift_vec[0];
123	fshift = fr->fshift[0];
124	facel = _mm_set1_ps(fr->epsfac);
125	charge = mdatoms->chargeA;
126	krf = _mm_set1_ps(fr->ic->k_rf);
127	krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
128	crf = _mm_set1_ps(fr->ic->c_rf);
129	nvdwtype = fr->ntype;
130	vdwparam = fr->nbfp;
131	vdwtype = mdatoms->typeA;
132
133	/* Setup water-specific parameters */
134	inr = nlist->iinr[0];
135	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
136	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
137	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
138	vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
139
140	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
141	rcutoff_scalar = fr->rcoulomb;
142	rcutoff = _mm_set1_ps(rcutoff_scalar);
143	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
144
145	rswitch_scalar = fr->rvdw_switch;
146	rswitch = _mm_set1_ps(rswitch_scalar);
147	/* Setup switch parameters */
148	d_scalar = rcutoff_scalar-rswitch_scalar;
149	d = _mm_set1_ps(d_scalar);
150	swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
151	swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
152	swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
153	swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
154	swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
155	swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
156
157	/* Avoid stupid compiler warnings */
158	jnrA = jnrB = jnrC = jnrD = 0;
159	j_coord_offsetA = 0;
160	j_coord_offsetB = 0;
161	j_coord_offsetC = 0;
162	j_coord_offsetD = 0;
163
164	outeriter = 0;
165	inneriter = 0;
166
167	for(iidx=0;iidx<4*DIM;iidx++)
168	{
169	scratch[iidx] = 0.0;
170	}
171
172	/* Start outer loop over neighborlists */
173	for(iidx=0; iidx<nri; iidx++)
174	{
175	/* Load shift vector for this list */
176	i_shift_offset = DIM*shiftidx[iidx];
177
178	/* Load limits for loop over neighbors */
179	j_index_start = jindex[iidx];
180	j_index_end = jindex[iidx+1];
181
182	/* Get outer coordinate index */
183	inr = iinr[iidx];
184	i_coord_offset = DIM*inr;
185
186	/* Load i particle coords and add shift vector */
187	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
188	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
189
190	fix0 = _mm_setzero_ps();
191	fiy0 = _mm_setzero_ps();
192	fiz0 = _mm_setzero_ps();
193	fix1 = _mm_setzero_ps();
194	fiy1 = _mm_setzero_ps();
195	fiz1 = _mm_setzero_ps();
196	fix2 = _mm_setzero_ps();
197	fiy2 = _mm_setzero_ps();
198	fiz2 = _mm_setzero_ps();
199	fix3 = _mm_setzero_ps();
200	fiy3 = _mm_setzero_ps();
201	fiz3 = _mm_setzero_ps();
202
203	/* Reset potential sums */
204	velecsum = _mm_setzero_ps();
205	vvdwsum = _mm_setzero_ps();
206
207	/* Start inner kernel loop */
208	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
209	{
210
211	/* Get j neighbor index, and coordinate index */
212	jnrA = jjnr[jidx];
213	jnrB = jjnr[jidx+1];
214	jnrC = jjnr[jidx+2];
215	jnrD = jjnr[jidx+3];
216	j_coord_offsetA = DIM*jnrA;
217	j_coord_offsetB = DIM*jnrB;
218	j_coord_offsetC = DIM*jnrC;
219	j_coord_offsetD = DIM*jnrD;
220
221	/* load j atom coordinates */
222	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
223	x+j_coord_offsetC,x+j_coord_offsetD,
224	&jx0,&jy0,&jz0);
225
226	/* Calculate displacement vector */
227	dx00 = _mm_sub_ps(ix0,jx0);
228	dy00 = _mm_sub_ps(iy0,jy0);
229	dz00 = _mm_sub_ps(iz0,jz0);
230	dx10 = _mm_sub_ps(ix1,jx0);
231	dy10 = _mm_sub_ps(iy1,jy0);
232	dz10 = _mm_sub_ps(iz1,jz0);
233	dx20 = _mm_sub_ps(ix2,jx0);
234	dy20 = _mm_sub_ps(iy2,jy0);
235	dz20 = _mm_sub_ps(iz2,jz0);
236	dx30 = _mm_sub_ps(ix3,jx0);
237	dy30 = _mm_sub_ps(iy3,jy0);
238	dz30 = _mm_sub_ps(iz3,jz0);
239
240	/* Calculate squared distance and things based on it */
241	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
242	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
243	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
244	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
245
246	rinv00 = gmx_mm_invsqrt_ps(rsq00);
247	rinv10 = gmx_mm_invsqrt_ps(rsq10);
248	rinv20 = gmx_mm_invsqrt_ps(rsq20);
249	rinv30 = gmx_mm_invsqrt_ps(rsq30);
250
251	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
252	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
253	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
254	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
255
256	/* Load parameters for j particles */
257	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
258	charge+jnrC+0,charge+jnrD+0);
259	vdwjidx0A = 2*vdwtype[jnrA+0];
260	vdwjidx0B = 2*vdwtype[jnrB+0];
261	vdwjidx0C = 2*vdwtype[jnrC+0];
262	vdwjidx0D = 2*vdwtype[jnrD+0];
263
264	fjx0 = _mm_setzero_ps();
265	fjy0 = _mm_setzero_ps();
266	fjz0 = _mm_setzero_ps();
267
268	/**************************
269	* CALCULATE INTERACTIONS *
270	**************************/
271
272	if (gmx_mm_any_lt(rsq00,rcutoff2))
273	{
274
275	r00 = _mm_mul_ps(rsq00,rinv00);
276
277	/* Compute parameters for interactions between i and j atoms */
278	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
279	vdwparam+vdwioffset0+vdwjidx0B,
280	vdwparam+vdwioffset0+vdwjidx0C,
281	vdwparam+vdwioffset0+vdwjidx0D,
282	&c6_00,&c12_00);
283
284	/* LENNARD-JONES DISPERSION/REPULSION */
285
286	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
287	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
288	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
289	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
290	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
291
292	d = _mm_sub_ps(r00,rswitch);
293	d = _mm_max_ps(d,_mm_setzero_ps());
294	d2 = _mm_mul_ps(d,d);
295	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
296
297	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
298
299	/* Evaluate switch function */
300	/* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
301	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
302	vvdw = _mm_mul_ps(vvdw,sw);
303	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
304
305	/* Update potential sum for this i atom from the interaction with this j atom. */
306	vvdw = _mm_and_ps(vvdw,cutoff_mask);
307	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
308
309	fscal = fvdw;
310
311	fscal = _mm_and_ps(fscal,cutoff_mask);
312
313	/* Calculate temporary vectorial force */
314	tx = _mm_mul_ps(fscal,dx00);
315	ty = _mm_mul_ps(fscal,dy00);
316	tz = _mm_mul_ps(fscal,dz00);
317
318	/* Update vectorial force */
319	fix0 = _mm_add_ps(fix0,tx);
320	fiy0 = _mm_add_ps(fiy0,ty);
321	fiz0 = _mm_add_ps(fiz0,tz);
322
323	fjx0 = _mm_add_ps(fjx0,tx);
324	fjy0 = _mm_add_ps(fjy0,ty);
325	fjz0 = _mm_add_ps(fjz0,tz);
326
327	}
328
329	/**************************
330	* CALCULATE INTERACTIONS *
331	**************************/
332
333	if (gmx_mm_any_lt(rsq10,rcutoff2))
334	{
335
336	/* Compute parameters for interactions between i and j atoms */
337	qq10 = _mm_mul_ps(iq1,jq0);
338
339	/* REACTION-FIELD ELECTROSTATICS */
340	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
341	felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
342
343	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
344
345	/* Update potential sum for this i atom from the interaction with this j atom. */
346	velec = _mm_and_ps(velec,cutoff_mask);
347	velecsum = _mm_add_ps(velecsum,velec);
348
349	fscal = felec;
350
351	fscal = _mm_and_ps(fscal,cutoff_mask);
352
353	/* Calculate temporary vectorial force */
354	tx = _mm_mul_ps(fscal,dx10);
355	ty = _mm_mul_ps(fscal,dy10);
356	tz = _mm_mul_ps(fscal,dz10);
357
358	/* Update vectorial force */
359	fix1 = _mm_add_ps(fix1,tx);
360	fiy1 = _mm_add_ps(fiy1,ty);
361	fiz1 = _mm_add_ps(fiz1,tz);
362
363	fjx0 = _mm_add_ps(fjx0,tx);
364	fjy0 = _mm_add_ps(fjy0,ty);
365	fjz0 = _mm_add_ps(fjz0,tz);
366
367	}
368
369	/**************************
370	* CALCULATE INTERACTIONS *
371	**************************/
372
373	if (gmx_mm_any_lt(rsq20,rcutoff2))
374	{
375
376	/* Compute parameters for interactions between i and j atoms */
377	qq20 = _mm_mul_ps(iq2,jq0);
378
379	/* REACTION-FIELD ELECTROSTATICS */
380	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
381	felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
382
383	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
384
385	/* Update potential sum for this i atom from the interaction with this j atom. */
386	velec = _mm_and_ps(velec,cutoff_mask);
387	velecsum = _mm_add_ps(velecsum,velec);
388
389	fscal = felec;
390
391	fscal = _mm_and_ps(fscal,cutoff_mask);
392
393	/* Calculate temporary vectorial force */
394	tx = _mm_mul_ps(fscal,dx20);
395	ty = _mm_mul_ps(fscal,dy20);
396	tz = _mm_mul_ps(fscal,dz20);
397
398	/* Update vectorial force */
399	fix2 = _mm_add_ps(fix2,tx);
400	fiy2 = _mm_add_ps(fiy2,ty);
401	fiz2 = _mm_add_ps(fiz2,tz);
402
403	fjx0 = _mm_add_ps(fjx0,tx);
404	fjy0 = _mm_add_ps(fjy0,ty);
405	fjz0 = _mm_add_ps(fjz0,tz);
406
407	}
408
409	/**************************
410	* CALCULATE INTERACTIONS *
411	**************************/
412
413	if (gmx_mm_any_lt(rsq30,rcutoff2))
414	{
415
416	/* Compute parameters for interactions between i and j atoms */
417	qq30 = _mm_mul_ps(iq3,jq0);
418
419	/* REACTION-FIELD ELECTROSTATICS */
420	velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
421	felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
422
423	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
424
425	/* Update potential sum for this i atom from the interaction with this j atom. */
426	velec = _mm_and_ps(velec,cutoff_mask);
427	velecsum = _mm_add_ps(velecsum,velec);
428
429	fscal = felec;
430
431	fscal = _mm_and_ps(fscal,cutoff_mask);
432
433	/* Calculate temporary vectorial force */
434	tx = _mm_mul_ps(fscal,dx30);
435	ty = _mm_mul_ps(fscal,dy30);
436	tz = _mm_mul_ps(fscal,dz30);
437
438	/* Update vectorial force */
439	fix3 = _mm_add_ps(fix3,tx);
440	fiy3 = _mm_add_ps(fiy3,ty);
441	fiz3 = _mm_add_ps(fiz3,tz);
442
443	fjx0 = _mm_add_ps(fjx0,tx);
444	fjy0 = _mm_add_ps(fjy0,ty);
445	fjz0 = _mm_add_ps(fjz0,tz);
446
447	}
448
449	fjptrA = f+j_coord_offsetA;
450	fjptrB = f+j_coord_offsetB;
451	fjptrC = f+j_coord_offsetC;
452	fjptrD = f+j_coord_offsetD;
453
454	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
455
456	/* Inner loop uses 167 flops */
457	}
458
459	if(jidx<j_index_end)
460	{
461
462	/* Get j neighbor index, and coordinate index */
463	jnrlistA = jjnr[jidx];
464	jnrlistB = jjnr[jidx+1];
465	jnrlistC = jjnr[jidx+2];
466	jnrlistD = jjnr[jidx+3];
467	/* Sign of each element will be negative for non-real atoms.
468	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
469	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
470	*/
471	dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
472	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
473	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
474	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
475	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
476	j_coord_offsetA = DIM*jnrA;
477	j_coord_offsetB = DIM*jnrB;
478	j_coord_offsetC = DIM*jnrC;
479	j_coord_offsetD = DIM*jnrD;
480
481	/* load j atom coordinates */
482	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
483	x+j_coord_offsetC,x+j_coord_offsetD,
484	&jx0,&jy0,&jz0);
485
486	/* Calculate displacement vector */
487	dx00 = _mm_sub_ps(ix0,jx0);
488	dy00 = _mm_sub_ps(iy0,jy0);
489	dz00 = _mm_sub_ps(iz0,jz0);
490	dx10 = _mm_sub_ps(ix1,jx0);
491	dy10 = _mm_sub_ps(iy1,jy0);
492	dz10 = _mm_sub_ps(iz1,jz0);
493	dx20 = _mm_sub_ps(ix2,jx0);
494	dy20 = _mm_sub_ps(iy2,jy0);
495	dz20 = _mm_sub_ps(iz2,jz0);
496	dx30 = _mm_sub_ps(ix3,jx0);
497	dy30 = _mm_sub_ps(iy3,jy0);
498	dz30 = _mm_sub_ps(iz3,jz0);
499
500	/* Calculate squared distance and things based on it */
501	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
502	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
503	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
504	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
505
506	rinv00 = gmx_mm_invsqrt_ps(rsq00);
507	rinv10 = gmx_mm_invsqrt_ps(rsq10);
508	rinv20 = gmx_mm_invsqrt_ps(rsq20);
509	rinv30 = gmx_mm_invsqrt_ps(rsq30);
510
511	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
512	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
513	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
514	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
515
516	/* Load parameters for j particles */
517	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
518	charge+jnrC+0,charge+jnrD+0);
519	vdwjidx0A = 2*vdwtype[jnrA+0];
520	vdwjidx0B = 2*vdwtype[jnrB+0];
521	vdwjidx0C = 2*vdwtype[jnrC+0];
522	vdwjidx0D = 2*vdwtype[jnrD+0];
523
524	fjx0 = _mm_setzero_ps();
525	fjy0 = _mm_setzero_ps();
526	fjz0 = _mm_setzero_ps();
527
528	/**************************
529	* CALCULATE INTERACTIONS *
530	**************************/
531
532	if (gmx_mm_any_lt(rsq00,rcutoff2))
533	{
534
535	r00 = _mm_mul_ps(rsq00,rinv00);
536	r00 = _mm_andnot_ps(dummy_mask,r00);
537
538	/* Compute parameters for interactions between i and j atoms */
539	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
540	vdwparam+vdwioffset0+vdwjidx0B,
541	vdwparam+vdwioffset0+vdwjidx0C,
542	vdwparam+vdwioffset0+vdwjidx0D,
543	&c6_00,&c12_00);
544
545	/* LENNARD-JONES DISPERSION/REPULSION */
546
547	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
548	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
549	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
550	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
551	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
552
553	d = _mm_sub_ps(r00,rswitch);
554	d = _mm_max_ps(d,_mm_setzero_ps());
555	d2 = _mm_mul_ps(d,d);
556	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
557
558	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
559
560	/* Evaluate switch function */
561	/* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
562	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
563	vvdw = _mm_mul_ps(vvdw,sw);
564	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
565
566	/* Update potential sum for this i atom from the interaction with this j atom. */
567	vvdw = _mm_and_ps(vvdw,cutoff_mask);
568	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
569	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
570
571	fscal = fvdw;
572
573	fscal = _mm_and_ps(fscal,cutoff_mask);
574
575	fscal = _mm_andnot_ps(dummy_mask,fscal);
576
577	/* Calculate temporary vectorial force */
578	tx = _mm_mul_ps(fscal,dx00);
579	ty = _mm_mul_ps(fscal,dy00);
580	tz = _mm_mul_ps(fscal,dz00);
581
582	/* Update vectorial force */
583	fix0 = _mm_add_ps(fix0,tx);
584	fiy0 = _mm_add_ps(fiy0,ty);
585	fiz0 = _mm_add_ps(fiz0,tz);
586
587	fjx0 = _mm_add_ps(fjx0,tx);
588	fjy0 = _mm_add_ps(fjy0,ty);
589	fjz0 = _mm_add_ps(fjz0,tz);
590
591	}
592
593	/**************************
594	* CALCULATE INTERACTIONS *
595	**************************/
596
597	if (gmx_mm_any_lt(rsq10,rcutoff2))
598	{
599
600	/* Compute parameters for interactions between i and j atoms */
601	qq10 = _mm_mul_ps(iq1,jq0);
602
603	/* REACTION-FIELD ELECTROSTATICS */
604	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
605	felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
606
607	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
608
609	/* Update potential sum for this i atom from the interaction with this j atom. */
610	velec = _mm_and_ps(velec,cutoff_mask);
611	velec = _mm_andnot_ps(dummy_mask,velec);
612	velecsum = _mm_add_ps(velecsum,velec);
613
614	fscal = felec;
615
616	fscal = _mm_and_ps(fscal,cutoff_mask);
617
618	fscal = _mm_andnot_ps(dummy_mask,fscal);
619
620	/* Calculate temporary vectorial force */
621	tx = _mm_mul_ps(fscal,dx10);
622	ty = _mm_mul_ps(fscal,dy10);
623	tz = _mm_mul_ps(fscal,dz10);
624
625	/* Update vectorial force */
626	fix1 = _mm_add_ps(fix1,tx);
627	fiy1 = _mm_add_ps(fiy1,ty);
628	fiz1 = _mm_add_ps(fiz1,tz);
629
630	fjx0 = _mm_add_ps(fjx0,tx);
631	fjy0 = _mm_add_ps(fjy0,ty);
632	fjz0 = _mm_add_ps(fjz0,tz);
633
634	}
635
636	/**************************
637	* CALCULATE INTERACTIONS *
638	**************************/
639
640	if (gmx_mm_any_lt(rsq20,rcutoff2))
641	{
642
643	/* Compute parameters for interactions between i and j atoms */
644	qq20 = _mm_mul_ps(iq2,jq0);
645
646	/* REACTION-FIELD ELECTROSTATICS */
647	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
648	felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
649
650	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
651
652	/* Update potential sum for this i atom from the interaction with this j atom. */
653	velec = _mm_and_ps(velec,cutoff_mask);
654	velec = _mm_andnot_ps(dummy_mask,velec);
655	velecsum = _mm_add_ps(velecsum,velec);
656
657	fscal = felec;
658
659	fscal = _mm_and_ps(fscal,cutoff_mask);
660
661	fscal = _mm_andnot_ps(dummy_mask,fscal);
662
663	/* Calculate temporary vectorial force */
664	tx = _mm_mul_ps(fscal,dx20);
665	ty = _mm_mul_ps(fscal,dy20);
666	tz = _mm_mul_ps(fscal,dz20);
667
668	/* Update vectorial force */
669	fix2 = _mm_add_ps(fix2,tx);
670	fiy2 = _mm_add_ps(fiy2,ty);
671	fiz2 = _mm_add_ps(fiz2,tz);
672
673	fjx0 = _mm_add_ps(fjx0,tx);
674	fjy0 = _mm_add_ps(fjy0,ty);
675	fjz0 = _mm_add_ps(fjz0,tz);
676
677	}
678
679	/**************************
680	* CALCULATE INTERACTIONS *
681	**************************/
682
683	if (gmx_mm_any_lt(rsq30,rcutoff2))
684	{
685
686	/* Compute parameters for interactions between i and j atoms */
687	qq30 = _mm_mul_ps(iq3,jq0);
688
689	/* REACTION-FIELD ELECTROSTATICS */
690	velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
691	felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
692
693	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
694
695	/* Update potential sum for this i atom from the interaction with this j atom. */
696	velec = _mm_and_ps(velec,cutoff_mask);
697	velec = _mm_andnot_ps(dummy_mask,velec);
698	velecsum = _mm_add_ps(velecsum,velec);
699
700	fscal = felec;
701
702	fscal = _mm_and_ps(fscal,cutoff_mask);
703
704	fscal = _mm_andnot_ps(dummy_mask,fscal);
705
706	/* Calculate temporary vectorial force */
707	tx = _mm_mul_ps(fscal,dx30);
708	ty = _mm_mul_ps(fscal,dy30);
709	tz = _mm_mul_ps(fscal,dz30);
710
711	/* Update vectorial force */
712	fix3 = _mm_add_ps(fix3,tx);
713	fiy3 = _mm_add_ps(fiy3,ty);
714	fiz3 = _mm_add_ps(fiz3,tz);
715
716	fjx0 = _mm_add_ps(fjx0,tx);
717	fjy0 = _mm_add_ps(fjy0,ty);
718	fjz0 = _mm_add_ps(fjz0,tz);
719
720	}
721
722	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
723	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
724	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
725	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
726
727	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
728
729	/* Inner loop uses 168 flops */
730	}
731
732	/* End of innermost loop */
733
734	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
735	f+i_coord_offset,fshift+i_shift_offset);
736
737	ggid = gid[iidx];
738	/* Update potential energies */
739	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
740	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
741
742	/* Increment number of inner iterations */
743	inneriter += j_index_end - j_index_start;
744
745	/* Outer loop uses 26 flops */
746	}
747
748	/* Increment number of outer iterations */
749	outeriter += nri;
750
751	/* Update outer/inner flops */
752
753	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*168);
754	}
755	/*
756	* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
757	* Electrostatics interaction: ReactionField
758	* VdW interaction: LennardJones
759	* Geometry: Water4-Particle
760	* Calculate force/pot: Force
761	*/
762	void
763	nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
764	(t_nblist * gmx_restrict nlist,
765	rvec * gmx_restrict xx,
766	rvec * gmx_restrict ff,
767	t_forcerec * gmx_restrict fr,
768	t_mdatoms * gmx_restrict mdatoms,
769	nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
770	t_nrnb * gmx_restrict nrnb)
771	{
772	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
773	* just 0 for non-waters.
774	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
775	* jnr indices corresponding to data put in the four positions in the SIMD register.
776	*/
777	int i_shift_offset,i_coord_offset,outeriter,inneriter;
778	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
779	int jnrA,jnrB,jnrC,jnrD;
780	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
781	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
782	int *iinr,*jindex,*jjnr,*shiftidx,*gid;
783	real rcutoff_scalar;
784	real *shiftvec,*fshift,*x,*f;
785	real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
786	real scratch[4*DIM];
787	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
788	int vdwioffset0;
789	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
790	int vdwioffset1;
791	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
792	int vdwioffset2;
793	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
794	int vdwioffset3;
795	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
796	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
797	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
798	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
799	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
800	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
801	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
802	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
803	real *charge;
804	int nvdwtype;
805	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
806	int *vdwtype;
807	real *vdwparam;
808	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
809	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
810	__m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
811	real rswitch_scalar,d_scalar;
812	__m128 dummy_mask,cutoff_mask;
813	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
814	__m128 one = _mm_set1_ps(1.0);
815	__m128 two = _mm_set1_ps(2.0);
816	x = xx[0];
817	f = ff[0];
818
819	nri = nlist->nri;
820	iinr = nlist->iinr;
821	jindex = nlist->jindex;
822	jjnr = nlist->jjnr;
823	shiftidx = nlist->shift;
824	gid = nlist->gid;
825	shiftvec = fr->shift_vec[0];
826	fshift = fr->fshift[0];
827	facel = _mm_set1_ps(fr->epsfac);
828	charge = mdatoms->chargeA;
829	krf = _mm_set1_ps(fr->ic->k_rf);
830	krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
831	crf = _mm_set1_ps(fr->ic->c_rf);
832	nvdwtype = fr->ntype;
833	vdwparam = fr->nbfp;
834	vdwtype = mdatoms->typeA;
835
836	/* Setup water-specific parameters */
837	inr = nlist->iinr[0];
838	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
839	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
840	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
841	vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
842
843	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
844	rcutoff_scalar = fr->rcoulomb;
845	rcutoff = _mm_set1_ps(rcutoff_scalar);
846	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
847
848	rswitch_scalar = fr->rvdw_switch;
849	rswitch = _mm_set1_ps(rswitch_scalar);
850	/* Setup switch parameters */
851	d_scalar = rcutoff_scalar-rswitch_scalar;
852	d = _mm_set1_ps(d_scalar);
853	swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
854	swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
855	swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
856	swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
857	swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
858	swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
859
860	/* Avoid stupid compiler warnings */
861	jnrA = jnrB = jnrC = jnrD = 0;
862	j_coord_offsetA = 0;
863	j_coord_offsetB = 0;
864	j_coord_offsetC = 0;
865	j_coord_offsetD = 0;
866
867	outeriter = 0;
868	inneriter = 0;
869
870	for(iidx=0;iidx<4*DIM;iidx++)
871	{
872	scratch[iidx] = 0.0;
873	}
874
875	/* Start outer loop over neighborlists */
876	for(iidx=0; iidx<nri; iidx++)
877	{
878	/* Load shift vector for this list */
879	i_shift_offset = DIM*shiftidx[iidx];
880
881	/* Load limits for loop over neighbors */
882	j_index_start = jindex[iidx];
883	j_index_end = jindex[iidx+1];
884
885	/* Get outer coordinate index */
886	inr = iinr[iidx];
887	i_coord_offset = DIM*inr;
888
889	/* Load i particle coords and add shift vector */
890	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
891	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
892
893	fix0 = _mm_setzero_ps();
894	fiy0 = _mm_setzero_ps();
895	fiz0 = _mm_setzero_ps();
896	fix1 = _mm_setzero_ps();
897	fiy1 = _mm_setzero_ps();
898	fiz1 = _mm_setzero_ps();
899	fix2 = _mm_setzero_ps();
900	fiy2 = _mm_setzero_ps();
901	fiz2 = _mm_setzero_ps();
902	fix3 = _mm_setzero_ps();
903	fiy3 = _mm_setzero_ps();
904	fiz3 = _mm_setzero_ps();
905
906	/* Start inner kernel loop */
907	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
908	{
909
910	/* Get j neighbor index, and coordinate index */
911	jnrA = jjnr[jidx];
912	jnrB = jjnr[jidx+1];
913	jnrC = jjnr[jidx+2];
914	jnrD = jjnr[jidx+3];
915	j_coord_offsetA = DIM*jnrA;
916	j_coord_offsetB = DIM*jnrB;
917	j_coord_offsetC = DIM*jnrC;
918	j_coord_offsetD = DIM*jnrD;
919
920	/* load j atom coordinates */
921	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
922	x+j_coord_offsetC,x+j_coord_offsetD,
923	&jx0,&jy0,&jz0);
924
925	/* Calculate displacement vector */
926	dx00 = _mm_sub_ps(ix0,jx0);
927	dy00 = _mm_sub_ps(iy0,jy0);
928	dz00 = _mm_sub_ps(iz0,jz0);
929	dx10 = _mm_sub_ps(ix1,jx0);
930	dy10 = _mm_sub_ps(iy1,jy0);
931	dz10 = _mm_sub_ps(iz1,jz0);
932	dx20 = _mm_sub_ps(ix2,jx0);
933	dy20 = _mm_sub_ps(iy2,jy0);
934	dz20 = _mm_sub_ps(iz2,jz0);
935	dx30 = _mm_sub_ps(ix3,jx0);
936	dy30 = _mm_sub_ps(iy3,jy0);
937	dz30 = _mm_sub_ps(iz3,jz0);
938
939	/* Calculate squared distance and things based on it */
940	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
941	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
942	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
943	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
944
945	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
946	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
947	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
948	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
949
950	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
951	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
952	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
953	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
954
955	/* Load parameters for j particles */
956	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
957	charge+jnrC+0,charge+jnrD+0);
958	vdwjidx0A = 2*vdwtype[jnrA+0];
959	vdwjidx0B = 2*vdwtype[jnrB+0];
960	vdwjidx0C = 2*vdwtype[jnrC+0];
961	vdwjidx0D = 2*vdwtype[jnrD+0];
962
963	fjx0 = _mm_setzero_ps();
964	fjy0 = _mm_setzero_ps();
965	fjz0 = _mm_setzero_ps();
966
967	/**************************
968	* CALCULATE INTERACTIONS *
969	**************************/
970
971	if (gmx_mm_any_lt(rsq00,rcutoff2))
972	{
973
974	r00 = _mm_mul_ps(rsq00,rinv00);
975
976	/* Compute parameters for interactions between i and j atoms */
977	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
978	vdwparam+vdwioffset0+vdwjidx0B,
979	vdwparam+vdwioffset0+vdwjidx0C,
980	vdwparam+vdwioffset0+vdwjidx0D,
981	&c6_00,&c12_00);
982
983	/* LENNARD-JONES DISPERSION/REPULSION */
984
985	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
986	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
987	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
988	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
989	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
990
991	d = _mm_sub_ps(r00,rswitch);
992	d = _mm_max_ps(d,_mm_setzero_ps());
993	d2 = _mm_mul_ps(d,d);
994	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
995
996	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
997
998	/* Evaluate switch function */
999	/* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1000	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1001	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1002
1003	fscal = fvdw;
1004
1005	fscal = _mm_and_ps(fscal,cutoff_mask);
1006
1007	/* Calculate temporary vectorial force */
1008	tx = _mm_mul_ps(fscal,dx00);
1009	ty = _mm_mul_ps(fscal,dy00);
1010	tz = _mm_mul_ps(fscal,dz00);
1011
1012	/* Update vectorial force */
1013	fix0 = _mm_add_ps(fix0,tx);
1014	fiy0 = _mm_add_ps(fiy0,ty);
1015	fiz0 = _mm_add_ps(fiz0,tz);
1016
1017	fjx0 = _mm_add_ps(fjx0,tx);
1018	fjy0 = _mm_add_ps(fjy0,ty);
1019	fjz0 = _mm_add_ps(fjz0,tz);
1020
1021	}
1022
1023	/**************************
1024	* CALCULATE INTERACTIONS *
1025	**************************/
1026
1027	if (gmx_mm_any_lt(rsq10,rcutoff2))
1028	{
1029
1030	/* Compute parameters for interactions between i and j atoms */
1031	qq10 = _mm_mul_ps(iq1,jq0);
1032
1033	/* REACTION-FIELD ELECTROSTATICS */
1034	felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1035
1036	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1037
1038	fscal = felec;
1039
1040	fscal = _mm_and_ps(fscal,cutoff_mask);
1041
1042	/* Calculate temporary vectorial force */
1043	tx = _mm_mul_ps(fscal,dx10);
1044	ty = _mm_mul_ps(fscal,dy10);
1045	tz = _mm_mul_ps(fscal,dz10);
1046
1047	/* Update vectorial force */
1048	fix1 = _mm_add_ps(fix1,tx);
1049	fiy1 = _mm_add_ps(fiy1,ty);
1050	fiz1 = _mm_add_ps(fiz1,tz);
1051
1052	fjx0 = _mm_add_ps(fjx0,tx);
1053	fjy0 = _mm_add_ps(fjy0,ty);
1054	fjz0 = _mm_add_ps(fjz0,tz);
1055
1056	}
1057
1058	/**************************
1059	* CALCULATE INTERACTIONS *
1060	**************************/
1061
1062	if (gmx_mm_any_lt(rsq20,rcutoff2))
1063	{
1064
1065	/* Compute parameters for interactions between i and j atoms */
1066	qq20 = _mm_mul_ps(iq2,jq0);
1067
1068	/* REACTION-FIELD ELECTROSTATICS */
1069	felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1070
1071	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1072
1073	fscal = felec;
1074
1075	fscal = _mm_and_ps(fscal,cutoff_mask);
1076
1077	/* Calculate temporary vectorial force */
1078	tx = _mm_mul_ps(fscal,dx20);
1079	ty = _mm_mul_ps(fscal,dy20);
1080	tz = _mm_mul_ps(fscal,dz20);
1081
1082	/* Update vectorial force */
1083	fix2 = _mm_add_ps(fix2,tx);
1084	fiy2 = _mm_add_ps(fiy2,ty);
1085	fiz2 = _mm_add_ps(fiz2,tz);
1086
1087	fjx0 = _mm_add_ps(fjx0,tx);
1088	fjy0 = _mm_add_ps(fjy0,ty);
1089	fjz0 = _mm_add_ps(fjz0,tz);
1090
1091	}
1092
1093	/**************************
1094	* CALCULATE INTERACTIONS *
1095	**************************/
1096
1097	if (gmx_mm_any_lt(rsq30,rcutoff2))
1098	{
1099
1100	/* Compute parameters for interactions between i and j atoms */
1101	qq30 = _mm_mul_ps(iq3,jq0);
1102
1103	/* REACTION-FIELD ELECTROSTATICS */
1104	felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
1105
1106	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1107
1108	fscal = felec;
1109
1110	fscal = _mm_and_ps(fscal,cutoff_mask);
1111
1112	/* Calculate temporary vectorial force */
1113	tx = _mm_mul_ps(fscal,dx30);
1114	ty = _mm_mul_ps(fscal,dy30);
1115	tz = _mm_mul_ps(fscal,dz30);
1116
1117	/* Update vectorial force */
1118	fix3 = _mm_add_ps(fix3,tx);
1119	fiy3 = _mm_add_ps(fiy3,ty);
1120	fiz3 = _mm_add_ps(fiz3,tz);
1121
1122	fjx0 = _mm_add_ps(fjx0,tx);
1123	fjy0 = _mm_add_ps(fjy0,ty);
1124	fjz0 = _mm_add_ps(fjz0,tz);
1125
1126	}
1127
1128	fjptrA = f+j_coord_offsetA;
1129	fjptrB = f+j_coord_offsetB;
1130	fjptrC = f+j_coord_offsetC;
1131	fjptrD = f+j_coord_offsetD;
1132
1133	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1134
1135	/* Inner loop uses 146 flops */
1136	}
1137
1138	if(jidx<j_index_end)
1139	{
1140
1141	/* Get j neighbor index, and coordinate index */
1142	jnrlistA = jjnr[jidx];
1143	jnrlistB = jjnr[jidx+1];
1144	jnrlistC = jjnr[jidx+2];
1145	jnrlistD = jjnr[jidx+3];
1146	/* Sign of each element will be negative for non-real atoms.
1147	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1148	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1149	*/
1150	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1151	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1152	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1153	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1154	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1155	j_coord_offsetA = DIM3*jnrA;
1156	j_coord_offsetB = DIM3*jnrB;
1157	j_coord_offsetC = DIM3*jnrC;
1158	j_coord_offsetD = DIM3*jnrD;
1159
1160	/* load j atom coordinates */
1161	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1162	x+j_coord_offsetC,x+j_coord_offsetD,
1163	&jx0,&jy0,&jz0);
1164
1165	/* Calculate displacement vector */
1166	dx00 = _mm_sub_ps(ix0,jx0);
1167	dy00 = _mm_sub_ps(iy0,jy0);
1168	dz00 = _mm_sub_ps(iz0,jz0);
1169	dx10 = _mm_sub_ps(ix1,jx0);
1170	dy10 = _mm_sub_ps(iy1,jy0);
1171	dz10 = _mm_sub_ps(iz1,jz0);
1172	dx20 = _mm_sub_ps(ix2,jx0);
1173	dy20 = _mm_sub_ps(iy2,jy0);
1174	dz20 = _mm_sub_ps(iz2,jz0);
1175	dx30 = _mm_sub_ps(ix3,jx0);
1176	dy30 = _mm_sub_ps(iy3,jy0);
1177	dz30 = _mm_sub_ps(iz3,jz0);
1178
1179	/* Calculate squared distance and things based on it */
1180	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1181	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1182	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1183	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1184
1185	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1186	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1187	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1188	rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
1189
1190	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1191	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1192	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1193	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1194
1195	/* Load parameters for j particles */
1196	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1197	charge+jnrC+0,charge+jnrD+0);
1198	vdwjidx0A = 2*vdwtype[jnrA+0];
1199	vdwjidx0B = 2*vdwtype[jnrB+0];
1200	vdwjidx0C = 2*vdwtype[jnrC+0];
1201	vdwjidx0D = 2*vdwtype[jnrD+0];
1202
1203	fjx0 = _mm_setzero_ps();
1204	fjy0 = _mm_setzero_ps();
1205	fjz0 = _mm_setzero_ps();
1206
1207	/**************************
1208	* CALCULATE INTERACTIONS *
1209	**************************/
1210
1211	if (gmx_mm_any_lt(rsq00,rcutoff2))
1212	{
1213
1214	r00 = _mm_mul_ps(rsq00,rinv00);
1215	r00 = _mm_andnot_ps(dummy_mask,r00);
1216
1217	/* Compute parameters for interactions between i and j atoms */
1218	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1219	vdwparam+vdwioffset0+vdwjidx0B,
1220	vdwparam+vdwioffset0+vdwjidx0C,
1221	vdwparam+vdwioffset0+vdwjidx0D,
1222	&c6_00,&c12_00);
1223
1224	/* LENNARD-JONES DISPERSION/REPULSION */
1225
1226	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1227	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1228	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1229	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1230	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1231
1232	d = _mm_sub_ps(r00,rswitch);
1233	d = _mm_max_ps(d,_mm_setzero_ps());
1234	d2 = _mm_mul_ps(d,d);
1235	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1236
1237	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1238
1239	/* Evaluate switch function */
1240	/* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1241	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1242	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1243
1244	fscal = fvdw;
1245
1246	fscal = _mm_and_ps(fscal,cutoff_mask);
1247
1248	fscal = _mm_andnot_ps(dummy_mask,fscal);
1249
1250	/* Calculate temporary vectorial force */
1251	tx = _mm_mul_ps(fscal,dx00);
1252	ty = _mm_mul_ps(fscal,dy00);
1253	tz = _mm_mul_ps(fscal,dz00);
1254
1255	/* Update vectorial force */
1256	fix0 = _mm_add_ps(fix0,tx);
1257	fiy0 = _mm_add_ps(fiy0,ty);
1258	fiz0 = _mm_add_ps(fiz0,tz);
1259
1260	fjx0 = _mm_add_ps(fjx0,tx);
1261	fjy0 = _mm_add_ps(fjy0,ty);
1262	fjz0 = _mm_add_ps(fjz0,tz);
1263
1264	}
1265
1266	/**************************
1267	* CALCULATE INTERACTIONS *
1268	**************************/
1269
1270	if (gmx_mm_any_lt(rsq10,rcutoff2))
1271	{
1272
1273	/* Compute parameters for interactions between i and j atoms */
1274	qq10 = _mm_mul_ps(iq1,jq0);
1275
1276	/* REACTION-FIELD ELECTROSTATICS */
1277	felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1278
1279	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1280
1281	fscal = felec;
1282
1283	fscal = _mm_and_ps(fscal,cutoff_mask);
1284
1285	fscal = _mm_andnot_ps(dummy_mask,fscal);
1286
1287	/* Calculate temporary vectorial force */
1288	tx = _mm_mul_ps(fscal,dx10);
1289	ty = _mm_mul_ps(fscal,dy10);
1290	tz = _mm_mul_ps(fscal,dz10);
1291
1292	/* Update vectorial force */
1293	fix1 = _mm_add_ps(fix1,tx);
1294	fiy1 = _mm_add_ps(fiy1,ty);
1295	fiz1 = _mm_add_ps(fiz1,tz);
1296
1297	fjx0 = _mm_add_ps(fjx0,tx);
1298	fjy0 = _mm_add_ps(fjy0,ty);
1299	fjz0 = _mm_add_ps(fjz0,tz);
1300
1301	}
1302
1303	/**************************
1304	* CALCULATE INTERACTIONS *
1305	**************************/
1306
1307	if (gmx_mm_any_lt(rsq20,rcutoff2))
1308	{
1309
1310	/* Compute parameters for interactions between i and j atoms */
1311	qq20 = _mm_mul_ps(iq2,jq0);
1312
1313	/* REACTION-FIELD ELECTROSTATICS */
1314	felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1315
1316	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1317
1318	fscal = felec;
1319
1320	fscal = _mm_and_ps(fscal,cutoff_mask);
1321
1322	fscal = _mm_andnot_ps(dummy_mask,fscal);
1323
1324	/* Calculate temporary vectorial force */
1325	tx = _mm_mul_ps(fscal,dx20);
1326	ty = _mm_mul_ps(fscal,dy20);
1327	tz = _mm_mul_ps(fscal,dz20);
1328
1329	/* Update vectorial force */
1330	fix2 = _mm_add_ps(fix2,tx);
1331	fiy2 = _mm_add_ps(fiy2,ty);
1332	fiz2 = _mm_add_ps(fiz2,tz);
1333
1334	fjx0 = _mm_add_ps(fjx0,tx);
1335	fjy0 = _mm_add_ps(fjy0,ty);
1336	fjz0 = _mm_add_ps(fjz0,tz);
1337
1338	}
1339
1340	/**************************
1341	* CALCULATE INTERACTIONS *
1342	**************************/
1343
1344	if (gmx_mm_any_lt(rsq30,rcutoff2))
1345	{
1346
1347	/* Compute parameters for interactions between i and j atoms */
1348	qq30 = _mm_mul_ps(iq3,jq0);
1349
1350	/* REACTION-FIELD ELECTROSTATICS */
1351	felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
1352
1353	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1354
1355	fscal = felec;
1356
1357	fscal = _mm_and_ps(fscal,cutoff_mask);
1358
1359	fscal = _mm_andnot_ps(dummy_mask,fscal);
1360
1361	/* Calculate temporary vectorial force */
1362	tx = _mm_mul_ps(fscal,dx30);
1363	ty = _mm_mul_ps(fscal,dy30);
1364	tz = _mm_mul_ps(fscal,dz30);
1365
1366	/* Update vectorial force */
1367	fix3 = _mm_add_ps(fix3,tx);
1368	fiy3 = _mm_add_ps(fiy3,ty);
1369	fiz3 = _mm_add_ps(fiz3,tz);
1370
1371	fjx0 = _mm_add_ps(fjx0,tx);
1372	fjy0 = _mm_add_ps(fjy0,ty);
1373	fjz0 = _mm_add_ps(fjz0,tz);
1374
1375	}
1376
1377	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1378	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1379	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1380	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1381
1382	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1383
1384	/* Inner loop uses 147 flops */
1385	}
1386
1387	/* End of innermost loop */
1388
1389	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1390	f+i_coord_offset,fshift+i_shift_offset);
1391
1392	/* Increment number of inner iterations */
1393	inneriter += j_index_end - j_index_start;
1394
1395	/* Outer loop uses 24 flops */
1396	}
1397
1398	/* Increment number of outer iterations */
1399	outeriter += nri;
1400
1401	/* Update outer/inner flops */
1402
1403	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter24 + inneriter147)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4_F] += outeriter24 + inneriter 147;
1404	}