/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse4_1

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse4_1_single.c
Location:	line 814, column 5
Description:	Value stored to 'j_coord_offsetB' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
54	* Electrostatics interaction: ReactionField
55	* VdW interaction: LennardJones
56	* Geometry: Water4-Particle
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80	real rcutoff_scalar;
81	real *shiftvec,*fshift,*x,*f;
82	real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83	real scratch[4*DIM];
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwioffset1;
88	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89	int vdwioffset2;
90	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91	int vdwioffset3;
92	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
100	real *charge;
101	int nvdwtype;
102	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103	int *vdwtype;
104	real *vdwparam;
105	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
106	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
107	__m128 dummy_mask,cutoff_mask;
108	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
109	__m128 one = _mm_set1_ps(1.0);
110	__m128 two = _mm_set1_ps(2.0);
111	x = xx[0];
112	f = ff[0];
113
114	nri = nlist->nri;
115	iinr = nlist->iinr;
116	jindex = nlist->jindex;
117	jjnr = nlist->jjnr;
118	shiftidx = nlist->shift;
119	gid = nlist->gid;
120	shiftvec = fr->shift_vec[0];
121	fshift = fr->fshift[0];
122	facel = _mm_set1_ps(fr->epsfac);
123	charge = mdatoms->chargeA;
124	krf = _mm_set1_ps(fr->ic->k_rf);
125	krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
126	crf = _mm_set1_ps(fr->ic->c_rf);
127	nvdwtype = fr->ntype;
128	vdwparam = fr->nbfp;
129	vdwtype = mdatoms->typeA;
130
131	/* Setup water-specific parameters */
132	inr = nlist->iinr[0];
133	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
136	vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
137
138	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
139	rcutoff_scalar = fr->rcoulomb;
140	rcutoff = _mm_set1_ps(rcutoff_scalar);
141	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
142
143	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
144	rvdw = _mm_set1_ps(fr->rvdw);
145
146	/* Avoid stupid compiler warnings */
147	jnrA = jnrB = jnrC = jnrD = 0;
148	j_coord_offsetA = 0;
149	j_coord_offsetB = 0;
150	j_coord_offsetC = 0;
151	j_coord_offsetD = 0;
152
153	outeriter = 0;
154	inneriter = 0;
155
156	for(iidx=0;iidx<4*DIM;iidx++)
157	{
158	scratch[iidx] = 0.0;
159	}
160
161	/* Start outer loop over neighborlists */
162	for(iidx=0; iidx<nri; iidx++)
163	{
164	/* Load shift vector for this list */
165	i_shift_offset = DIM*shiftidx[iidx];
166
167	/* Load limits for loop over neighbors */
168	j_index_start = jindex[iidx];
169	j_index_end = jindex[iidx+1];
170
171	/* Get outer coordinate index */
172	inr = iinr[iidx];
173	i_coord_offset = DIM*inr;
174
175	/* Load i particle coords and add shift vector */
176	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
177	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
178
179	fix0 = _mm_setzero_ps();
180	fiy0 = _mm_setzero_ps();
181	fiz0 = _mm_setzero_ps();
182	fix1 = _mm_setzero_ps();
183	fiy1 = _mm_setzero_ps();
184	fiz1 = _mm_setzero_ps();
185	fix2 = _mm_setzero_ps();
186	fiy2 = _mm_setzero_ps();
187	fiz2 = _mm_setzero_ps();
188	fix3 = _mm_setzero_ps();
189	fiy3 = _mm_setzero_ps();
190	fiz3 = _mm_setzero_ps();
191
192	/* Reset potential sums */
193	velecsum = _mm_setzero_ps();
194	vvdwsum = _mm_setzero_ps();
195
196	/* Start inner kernel loop */
197	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
198	{
199
200	/* Get j neighbor index, and coordinate index */
201	jnrA = jjnr[jidx];
202	jnrB = jjnr[jidx+1];
203	jnrC = jjnr[jidx+2];
204	jnrD = jjnr[jidx+3];
205	j_coord_offsetA = DIM*jnrA;
206	j_coord_offsetB = DIM*jnrB;
207	j_coord_offsetC = DIM*jnrC;
208	j_coord_offsetD = DIM*jnrD;
209
210	/* load j atom coordinates */
211	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
212	x+j_coord_offsetC,x+j_coord_offsetD,
213	&jx0,&jy0,&jz0);
214
215	/* Calculate displacement vector */
216	dx00 = _mm_sub_ps(ix0,jx0);
217	dy00 = _mm_sub_ps(iy0,jy0);
218	dz00 = _mm_sub_ps(iz0,jz0);
219	dx10 = _mm_sub_ps(ix1,jx0);
220	dy10 = _mm_sub_ps(iy1,jy0);
221	dz10 = _mm_sub_ps(iz1,jz0);
222	dx20 = _mm_sub_ps(ix2,jx0);
223	dy20 = _mm_sub_ps(iy2,jy0);
224	dz20 = _mm_sub_ps(iz2,jz0);
225	dx30 = _mm_sub_ps(ix3,jx0);
226	dy30 = _mm_sub_ps(iy3,jy0);
227	dz30 = _mm_sub_ps(iz3,jz0);
228
229	/* Calculate squared distance and things based on it */
230	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
231	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
232	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
233	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
234
235	rinv10 = gmx_mm_invsqrt_ps(rsq10);
236	rinv20 = gmx_mm_invsqrt_ps(rsq20);
237	rinv30 = gmx_mm_invsqrt_ps(rsq30);
238
239	rinvsq00 = gmx_mm_inv_ps(rsq00);
240	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
241	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
242	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
243
244	/* Load parameters for j particles */
245	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
246	charge+jnrC+0,charge+jnrD+0);
247	vdwjidx0A = 2*vdwtype[jnrA+0];
248	vdwjidx0B = 2*vdwtype[jnrB+0];
249	vdwjidx0C = 2*vdwtype[jnrC+0];
250	vdwjidx0D = 2*vdwtype[jnrD+0];
251
252	fjx0 = _mm_setzero_ps();
253	fjy0 = _mm_setzero_ps();
254	fjz0 = _mm_setzero_ps();
255
256	/**************************
257	* CALCULATE INTERACTIONS *
258	**************************/
259
260	if (gmx_mm_any_lt(rsq00,rcutoff2))
261	{
262
263	/* Compute parameters for interactions between i and j atoms */
264	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
265	vdwparam+vdwioffset0+vdwjidx0B,
266	vdwparam+vdwioffset0+vdwjidx0C,
267	vdwparam+vdwioffset0+vdwjidx0D,
268	&c6_00,&c12_00);
269
270	/* LENNARD-JONES DISPERSION/REPULSION */
271
272	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
273	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
274	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
275	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
276	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
277	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
278
279	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
280
281	/* Update potential sum for this i atom from the interaction with this j atom. */
282	vvdw = _mm_and_ps(vvdw,cutoff_mask);
283	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
284
285	fscal = fvdw;
286
287	fscal = _mm_and_ps(fscal,cutoff_mask);
288
289	/* Calculate temporary vectorial force */
290	tx = _mm_mul_ps(fscal,dx00);
291	ty = _mm_mul_ps(fscal,dy00);
292	tz = _mm_mul_ps(fscal,dz00);
293
294	/* Update vectorial force */
295	fix0 = _mm_add_ps(fix0,tx);
296	fiy0 = _mm_add_ps(fiy0,ty);
297	fiz0 = _mm_add_ps(fiz0,tz);
298
299	fjx0 = _mm_add_ps(fjx0,tx);
300	fjy0 = _mm_add_ps(fjy0,ty);
301	fjz0 = _mm_add_ps(fjz0,tz);
302
303	}
304
305	/**************************
306	* CALCULATE INTERACTIONS *
307	**************************/
308
309	if (gmx_mm_any_lt(rsq10,rcutoff2))
310	{
311
312	/* Compute parameters for interactions between i and j atoms */
313	qq10 = _mm_mul_ps(iq1,jq0);
314
315	/* REACTION-FIELD ELECTROSTATICS */
316	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
317	felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
318
319	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
320
321	/* Update potential sum for this i atom from the interaction with this j atom. */
322	velec = _mm_and_ps(velec,cutoff_mask);
323	velecsum = _mm_add_ps(velecsum,velec);
324
325	fscal = felec;
326
327	fscal = _mm_and_ps(fscal,cutoff_mask);
328
329	/* Calculate temporary vectorial force */
330	tx = _mm_mul_ps(fscal,dx10);
331	ty = _mm_mul_ps(fscal,dy10);
332	tz = _mm_mul_ps(fscal,dz10);
333
334	/* Update vectorial force */
335	fix1 = _mm_add_ps(fix1,tx);
336	fiy1 = _mm_add_ps(fiy1,ty);
337	fiz1 = _mm_add_ps(fiz1,tz);
338
339	fjx0 = _mm_add_ps(fjx0,tx);
340	fjy0 = _mm_add_ps(fjy0,ty);
341	fjz0 = _mm_add_ps(fjz0,tz);
342
343	}
344
345	/**************************
346	* CALCULATE INTERACTIONS *
347	**************************/
348
349	if (gmx_mm_any_lt(rsq20,rcutoff2))
350	{
351
352	/* Compute parameters for interactions between i and j atoms */
353	qq20 = _mm_mul_ps(iq2,jq0);
354
355	/* REACTION-FIELD ELECTROSTATICS */
356	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
357	felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
358
359	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
360
361	/* Update potential sum for this i atom from the interaction with this j atom. */
362	velec = _mm_and_ps(velec,cutoff_mask);
363	velecsum = _mm_add_ps(velecsum,velec);
364
365	fscal = felec;
366
367	fscal = _mm_and_ps(fscal,cutoff_mask);
368
369	/* Calculate temporary vectorial force */
370	tx = _mm_mul_ps(fscal,dx20);
371	ty = _mm_mul_ps(fscal,dy20);
372	tz = _mm_mul_ps(fscal,dz20);
373
374	/* Update vectorial force */
375	fix2 = _mm_add_ps(fix2,tx);
376	fiy2 = _mm_add_ps(fiy2,ty);
377	fiz2 = _mm_add_ps(fiz2,tz);
378
379	fjx0 = _mm_add_ps(fjx0,tx);
380	fjy0 = _mm_add_ps(fjy0,ty);
381	fjz0 = _mm_add_ps(fjz0,tz);
382
383	}
384
385	/**************************
386	* CALCULATE INTERACTIONS *
387	**************************/
388
389	if (gmx_mm_any_lt(rsq30,rcutoff2))
390	{
391
392	/* Compute parameters for interactions between i and j atoms */
393	qq30 = _mm_mul_ps(iq3,jq0);
394
395	/* REACTION-FIELD ELECTROSTATICS */
396	velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
397	felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
398
399	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
400
401	/* Update potential sum for this i atom from the interaction with this j atom. */
402	velec = _mm_and_ps(velec,cutoff_mask);
403	velecsum = _mm_add_ps(velecsum,velec);
404
405	fscal = felec;
406
407	fscal = _mm_and_ps(fscal,cutoff_mask);
408
409	/* Calculate temporary vectorial force */
410	tx = _mm_mul_ps(fscal,dx30);
411	ty = _mm_mul_ps(fscal,dy30);
412	tz = _mm_mul_ps(fscal,dz30);
413
414	/* Update vectorial force */
415	fix3 = _mm_add_ps(fix3,tx);
416	fiy3 = _mm_add_ps(fiy3,ty);
417	fiz3 = _mm_add_ps(fiz3,tz);
418
419	fjx0 = _mm_add_ps(fjx0,tx);
420	fjy0 = _mm_add_ps(fjy0,ty);
421	fjz0 = _mm_add_ps(fjz0,tz);
422
423	}
424
425	fjptrA = f+j_coord_offsetA;
426	fjptrB = f+j_coord_offsetB;
427	fjptrC = f+j_coord_offsetC;
428	fjptrD = f+j_coord_offsetD;
429
430	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
431
432	/* Inner loop uses 149 flops */
433	}
434
435	if(jidx<j_index_end)
436	{
437
438	/* Get j neighbor index, and coordinate index */
439	jnrlistA = jjnr[jidx];
440	jnrlistB = jjnr[jidx+1];
441	jnrlistC = jjnr[jidx+2];
442	jnrlistD = jjnr[jidx+3];
443	/* Sign of each element will be negative for non-real atoms.
444	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
445	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
446	*/
447	dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
448	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
449	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
450	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
451	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
452	j_coord_offsetA = DIM*jnrA;
453	j_coord_offsetB = DIM*jnrB;
454	j_coord_offsetC = DIM*jnrC;
455	j_coord_offsetD = DIM*jnrD;
456
457	/* load j atom coordinates */
458	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
459	x+j_coord_offsetC,x+j_coord_offsetD,
460	&jx0,&jy0,&jz0);
461
462	/* Calculate displacement vector */
463	dx00 = _mm_sub_ps(ix0,jx0);
464	dy00 = _mm_sub_ps(iy0,jy0);
465	dz00 = _mm_sub_ps(iz0,jz0);
466	dx10 = _mm_sub_ps(ix1,jx0);
467	dy10 = _mm_sub_ps(iy1,jy0);
468	dz10 = _mm_sub_ps(iz1,jz0);
469	dx20 = _mm_sub_ps(ix2,jx0);
470	dy20 = _mm_sub_ps(iy2,jy0);
471	dz20 = _mm_sub_ps(iz2,jz0);
472	dx30 = _mm_sub_ps(ix3,jx0);
473	dy30 = _mm_sub_ps(iy3,jy0);
474	dz30 = _mm_sub_ps(iz3,jz0);
475
476	/* Calculate squared distance and things based on it */
477	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
478	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
479	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
480	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
481
482	rinv10 = gmx_mm_invsqrt_ps(rsq10);
483	rinv20 = gmx_mm_invsqrt_ps(rsq20);
484	rinv30 = gmx_mm_invsqrt_ps(rsq30);
485
486	rinvsq00 = gmx_mm_inv_ps(rsq00);
487	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
488	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
489	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
490
491	/* Load parameters for j particles */
492	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
493	charge+jnrC+0,charge+jnrD+0);
494	vdwjidx0A = 2*vdwtype[jnrA+0];
495	vdwjidx0B = 2*vdwtype[jnrB+0];
496	vdwjidx0C = 2*vdwtype[jnrC+0];
497	vdwjidx0D = 2*vdwtype[jnrD+0];
498
499	fjx0 = _mm_setzero_ps();
500	fjy0 = _mm_setzero_ps();
501	fjz0 = _mm_setzero_ps();
502
503	/**************************
504	* CALCULATE INTERACTIONS *
505	**************************/
506
507	if (gmx_mm_any_lt(rsq00,rcutoff2))
508	{
509
510	/* Compute parameters for interactions between i and j atoms */
511	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
512	vdwparam+vdwioffset0+vdwjidx0B,
513	vdwparam+vdwioffset0+vdwjidx0C,
514	vdwparam+vdwioffset0+vdwjidx0D,
515	&c6_00,&c12_00);
516
517	/* LENNARD-JONES DISPERSION/REPULSION */
518
519	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
520	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
521	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
522	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
523	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
524	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
525
526	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
527
528	/* Update potential sum for this i atom from the interaction with this j atom. */
529	vvdw = _mm_and_ps(vvdw,cutoff_mask);
530	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
531	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
532
533	fscal = fvdw;
534
535	fscal = _mm_and_ps(fscal,cutoff_mask);
536
537	fscal = _mm_andnot_ps(dummy_mask,fscal);
538
539	/* Calculate temporary vectorial force */
540	tx = _mm_mul_ps(fscal,dx00);
541	ty = _mm_mul_ps(fscal,dy00);
542	tz = _mm_mul_ps(fscal,dz00);
543
544	/* Update vectorial force */
545	fix0 = _mm_add_ps(fix0,tx);
546	fiy0 = _mm_add_ps(fiy0,ty);
547	fiz0 = _mm_add_ps(fiz0,tz);
548
549	fjx0 = _mm_add_ps(fjx0,tx);
550	fjy0 = _mm_add_ps(fjy0,ty);
551	fjz0 = _mm_add_ps(fjz0,tz);
552
553	}
554
555	/**************************
556	* CALCULATE INTERACTIONS *
557	**************************/
558
559	if (gmx_mm_any_lt(rsq10,rcutoff2))
560	{
561
562	/* Compute parameters for interactions between i and j atoms */
563	qq10 = _mm_mul_ps(iq1,jq0);
564
565	/* REACTION-FIELD ELECTROSTATICS */
566	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
567	felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
568
569	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
570
571	/* Update potential sum for this i atom from the interaction with this j atom. */
572	velec = _mm_and_ps(velec,cutoff_mask);
573	velec = _mm_andnot_ps(dummy_mask,velec);
574	velecsum = _mm_add_ps(velecsum,velec);
575
576	fscal = felec;
577
578	fscal = _mm_and_ps(fscal,cutoff_mask);
579
580	fscal = _mm_andnot_ps(dummy_mask,fscal);
581
582	/* Calculate temporary vectorial force */
583	tx = _mm_mul_ps(fscal,dx10);
584	ty = _mm_mul_ps(fscal,dy10);
585	tz = _mm_mul_ps(fscal,dz10);
586
587	/* Update vectorial force */
588	fix1 = _mm_add_ps(fix1,tx);
589	fiy1 = _mm_add_ps(fiy1,ty);
590	fiz1 = _mm_add_ps(fiz1,tz);
591
592	fjx0 = _mm_add_ps(fjx0,tx);
593	fjy0 = _mm_add_ps(fjy0,ty);
594	fjz0 = _mm_add_ps(fjz0,tz);
595
596	}
597
598	/**************************
599	* CALCULATE INTERACTIONS *
600	**************************/
601
602	if (gmx_mm_any_lt(rsq20,rcutoff2))
603	{
604
605	/* Compute parameters for interactions between i and j atoms */
606	qq20 = _mm_mul_ps(iq2,jq0);
607
608	/* REACTION-FIELD ELECTROSTATICS */
609	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
610	felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
611
612	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
613
614	/* Update potential sum for this i atom from the interaction with this j atom. */
615	velec = _mm_and_ps(velec,cutoff_mask);
616	velec = _mm_andnot_ps(dummy_mask,velec);
617	velecsum = _mm_add_ps(velecsum,velec);
618
619	fscal = felec;
620
621	fscal = _mm_and_ps(fscal,cutoff_mask);
622
623	fscal = _mm_andnot_ps(dummy_mask,fscal);
624
625	/* Calculate temporary vectorial force */
626	tx = _mm_mul_ps(fscal,dx20);
627	ty = _mm_mul_ps(fscal,dy20);
628	tz = _mm_mul_ps(fscal,dz20);
629
630	/* Update vectorial force */
631	fix2 = _mm_add_ps(fix2,tx);
632	fiy2 = _mm_add_ps(fiy2,ty);
633	fiz2 = _mm_add_ps(fiz2,tz);
634
635	fjx0 = _mm_add_ps(fjx0,tx);
636	fjy0 = _mm_add_ps(fjy0,ty);
637	fjz0 = _mm_add_ps(fjz0,tz);
638
639	}
640
641	/**************************
642	* CALCULATE INTERACTIONS *
643	**************************/
644
645	if (gmx_mm_any_lt(rsq30,rcutoff2))
646	{
647
648	/* Compute parameters for interactions between i and j atoms */
649	qq30 = _mm_mul_ps(iq3,jq0);
650
651	/* REACTION-FIELD ELECTROSTATICS */
652	velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_add_ps(rinv30,_mm_mul_ps(krf,rsq30)),crf));
653	felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
654
655	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
656
657	/* Update potential sum for this i atom from the interaction with this j atom. */
658	velec = _mm_and_ps(velec,cutoff_mask);
659	velec = _mm_andnot_ps(dummy_mask,velec);
660	velecsum = _mm_add_ps(velecsum,velec);
661
662	fscal = felec;
663
664	fscal = _mm_and_ps(fscal,cutoff_mask);
665
666	fscal = _mm_andnot_ps(dummy_mask,fscal);
667
668	/* Calculate temporary vectorial force */
669	tx = _mm_mul_ps(fscal,dx30);
670	ty = _mm_mul_ps(fscal,dy30);
671	tz = _mm_mul_ps(fscal,dz30);
672
673	/* Update vectorial force */
674	fix3 = _mm_add_ps(fix3,tx);
675	fiy3 = _mm_add_ps(fiy3,ty);
676	fiz3 = _mm_add_ps(fiz3,tz);
677
678	fjx0 = _mm_add_ps(fjx0,tx);
679	fjy0 = _mm_add_ps(fjy0,ty);
680	fjz0 = _mm_add_ps(fjz0,tz);
681
682	}
683
684	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
685	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
686	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
687	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
688
689	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
690
691	/* Inner loop uses 149 flops */
692	}
693
694	/* End of innermost loop */
695
696	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
697	f+i_coord_offset,fshift+i_shift_offset);
698
699	ggid = gid[iidx];
700	/* Update potential energies */
701	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
702	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
703
704	/* Increment number of inner iterations */
705	inneriter += j_index_end - j_index_start;
706
707	/* Outer loop uses 26 flops */
708	}
709
710	/* Increment number of outer iterations */
711	outeriter += nri;
712
713	/* Update outer/inner flops */
714
715	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*149);
716	}
717	/*
718	* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
719	* Electrostatics interaction: ReactionField
720	* VdW interaction: LennardJones
721	* Geometry: Water4-Particle
722	* Calculate force/pot: Force
723	*/
724	void
725	nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
726	(t_nblist * gmx_restrict nlist,
727	rvec * gmx_restrict xx,
728	rvec * gmx_restrict ff,
729	t_forcerec * gmx_restrict fr,
730	t_mdatoms * gmx_restrict mdatoms,
731	nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
732	t_nrnb * gmx_restrict nrnb)
733	{
734	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
735	* just 0 for non-waters.
736	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
737	* jnr indices corresponding to data put in the four positions in the SIMD register.
738	*/
739	int i_shift_offset,i_coord_offset,outeriter,inneriter;
740	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
741	int jnrA,jnrB,jnrC,jnrD;
742	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
743	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
744	int *iinr,*jindex,*jjnr,*shiftidx,*gid;
745	real rcutoff_scalar;
746	real *shiftvec,*fshift,*x,*f;
747	real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
748	real scratch[4*DIM];
749	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
750	int vdwioffset0;
751	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
752	int vdwioffset1;
753	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
754	int vdwioffset2;
755	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
756	int vdwioffset3;
757	__m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
758	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
759	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
760	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
761	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
762	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
763	__m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
764	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
765	real *charge;
766	int nvdwtype;
767	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
768	int *vdwtype;
769	real *vdwparam;
770	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
771	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
772	__m128 dummy_mask,cutoff_mask;
773	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
774	__m128 one = _mm_set1_ps(1.0);
775	__m128 two = _mm_set1_ps(2.0);
776	x = xx[0];
777	f = ff[0];
778
779	nri = nlist->nri;
780	iinr = nlist->iinr;
781	jindex = nlist->jindex;
782	jjnr = nlist->jjnr;
783	shiftidx = nlist->shift;
784	gid = nlist->gid;
785	shiftvec = fr->shift_vec[0];
786	fshift = fr->fshift[0];
787	facel = _mm_set1_ps(fr->epsfac);
788	charge = mdatoms->chargeA;
789	krf = _mm_set1_ps(fr->ic->k_rf);
790	krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
791	crf = _mm_set1_ps(fr->ic->c_rf);
792	nvdwtype = fr->ntype;
793	vdwparam = fr->nbfp;
794	vdwtype = mdatoms->typeA;
795
796	/* Setup water-specific parameters */
797	inr = nlist->iinr[0];
798	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
799	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
800	iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
801	vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
802
803	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
804	rcutoff_scalar = fr->rcoulomb;
805	rcutoff = _mm_set1_ps(rcutoff_scalar);
806	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
807
808	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
809	rvdw = _mm_set1_ps(fr->rvdw);
810
811	/* Avoid stupid compiler warnings */
812	jnrA = jnrB = jnrC = jnrD = 0;
813	j_coord_offsetA = 0;
814	j_coord_offsetB = 0;
	Value stored to 'j_coord_offsetB' is never read
815	j_coord_offsetC = 0;
816	j_coord_offsetD = 0;
817
818	outeriter = 0;
819	inneriter = 0;
820
821	for(iidx=0;iidx<4*DIM;iidx++)
822	{
823	scratch[iidx] = 0.0;
824	}
825
826	/* Start outer loop over neighborlists */
827	for(iidx=0; iidx<nri; iidx++)
828	{
829	/* Load shift vector for this list */
830	i_shift_offset = DIM*shiftidx[iidx];
831
832	/* Load limits for loop over neighbors */
833	j_index_start = jindex[iidx];
834	j_index_end = jindex[iidx+1];
835
836	/* Get outer coordinate index */
837	inr = iinr[iidx];
838	i_coord_offset = DIM*inr;
839
840	/* Load i particle coords and add shift vector */
841	gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
842	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
843
844	fix0 = _mm_setzero_ps();
845	fiy0 = _mm_setzero_ps();
846	fiz0 = _mm_setzero_ps();
847	fix1 = _mm_setzero_ps();
848	fiy1 = _mm_setzero_ps();
849	fiz1 = _mm_setzero_ps();
850	fix2 = _mm_setzero_ps();
851	fiy2 = _mm_setzero_ps();
852	fiz2 = _mm_setzero_ps();
853	fix3 = _mm_setzero_ps();
854	fiy3 = _mm_setzero_ps();
855	fiz3 = _mm_setzero_ps();
856
857	/* Start inner kernel loop */
858	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
859	{
860
861	/* Get j neighbor index, and coordinate index */
862	jnrA = jjnr[jidx];
863	jnrB = jjnr[jidx+1];
864	jnrC = jjnr[jidx+2];
865	jnrD = jjnr[jidx+3];
866	j_coord_offsetA = DIM*jnrA;
867	j_coord_offsetB = DIM*jnrB;
868	j_coord_offsetC = DIM*jnrC;
869	j_coord_offsetD = DIM*jnrD;
870
871	/* load j atom coordinates */
872	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
873	x+j_coord_offsetC,x+j_coord_offsetD,
874	&jx0,&jy0,&jz0);
875
876	/* Calculate displacement vector */
877	dx00 = _mm_sub_ps(ix0,jx0);
878	dy00 = _mm_sub_ps(iy0,jy0);
879	dz00 = _mm_sub_ps(iz0,jz0);
880	dx10 = _mm_sub_ps(ix1,jx0);
881	dy10 = _mm_sub_ps(iy1,jy0);
882	dz10 = _mm_sub_ps(iz1,jz0);
883	dx20 = _mm_sub_ps(ix2,jx0);
884	dy20 = _mm_sub_ps(iy2,jy0);
885	dz20 = _mm_sub_ps(iz2,jz0);
886	dx30 = _mm_sub_ps(ix3,jx0);
887	dy30 = _mm_sub_ps(iy3,jy0);
888	dz30 = _mm_sub_ps(iz3,jz0);
889
890	/* Calculate squared distance and things based on it */
891	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
892	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
893	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
894	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
895
896	rinv10 = gmx_mm_invsqrt_ps(rsq10);
897	rinv20 = gmx_mm_invsqrt_ps(rsq20);
898	rinv30 = gmx_mm_invsqrt_ps(rsq30);
899
900	rinvsq00 = gmx_mm_inv_ps(rsq00);
901	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
902	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
903	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
904
905	/* Load parameters for j particles */
906	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
907	charge+jnrC+0,charge+jnrD+0);
908	vdwjidx0A = 2*vdwtype[jnrA+0];
909	vdwjidx0B = 2*vdwtype[jnrB+0];
910	vdwjidx0C = 2*vdwtype[jnrC+0];
911	vdwjidx0D = 2*vdwtype[jnrD+0];
912
913	fjx0 = _mm_setzero_ps();
914	fjy0 = _mm_setzero_ps();
915	fjz0 = _mm_setzero_ps();
916
917	/**************************
918	* CALCULATE INTERACTIONS *
919	**************************/
920
921	if (gmx_mm_any_lt(rsq00,rcutoff2))
922	{
923
924	/* Compute parameters for interactions between i and j atoms */
925	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
926	vdwparam+vdwioffset0+vdwjidx0B,
927	vdwparam+vdwioffset0+vdwjidx0C,
928	vdwparam+vdwioffset0+vdwjidx0D,
929	&c6_00,&c12_00);
930
931	/* LENNARD-JONES DISPERSION/REPULSION */
932
933	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
934	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
935
936	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
937
938	fscal = fvdw;
939
940	fscal = _mm_and_ps(fscal,cutoff_mask);
941
942	/* Calculate temporary vectorial force */
943	tx = _mm_mul_ps(fscal,dx00);
944	ty = _mm_mul_ps(fscal,dy00);
945	tz = _mm_mul_ps(fscal,dz00);
946
947	/* Update vectorial force */
948	fix0 = _mm_add_ps(fix0,tx);
949	fiy0 = _mm_add_ps(fiy0,ty);
950	fiz0 = _mm_add_ps(fiz0,tz);
951
952	fjx0 = _mm_add_ps(fjx0,tx);
953	fjy0 = _mm_add_ps(fjy0,ty);
954	fjz0 = _mm_add_ps(fjz0,tz);
955
956	}
957
958	/**************************
959	* CALCULATE INTERACTIONS *
960	**************************/
961
962	if (gmx_mm_any_lt(rsq10,rcutoff2))
963	{
964
965	/* Compute parameters for interactions between i and j atoms */
966	qq10 = _mm_mul_ps(iq1,jq0);
967
968	/* REACTION-FIELD ELECTROSTATICS */
969	felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
970
971	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
972
973	fscal = felec;
974
975	fscal = _mm_and_ps(fscal,cutoff_mask);
976
977	/* Calculate temporary vectorial force */
978	tx = _mm_mul_ps(fscal,dx10);
979	ty = _mm_mul_ps(fscal,dy10);
980	tz = _mm_mul_ps(fscal,dz10);
981
982	/* Update vectorial force */
983	fix1 = _mm_add_ps(fix1,tx);
984	fiy1 = _mm_add_ps(fiy1,ty);
985	fiz1 = _mm_add_ps(fiz1,tz);
986
987	fjx0 = _mm_add_ps(fjx0,tx);
988	fjy0 = _mm_add_ps(fjy0,ty);
989	fjz0 = _mm_add_ps(fjz0,tz);
990
991	}
992
993	/**************************
994	* CALCULATE INTERACTIONS *
995	**************************/
996
997	if (gmx_mm_any_lt(rsq20,rcutoff2))
998	{
999
1000	/* Compute parameters for interactions between i and j atoms */
1001	qq20 = _mm_mul_ps(iq2,jq0);
1002
1003	/* REACTION-FIELD ELECTROSTATICS */
1004	felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1005
1006	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1007
1008	fscal = felec;
1009
1010	fscal = _mm_and_ps(fscal,cutoff_mask);
1011
1012	/* Calculate temporary vectorial force */
1013	tx = _mm_mul_ps(fscal,dx20);
1014	ty = _mm_mul_ps(fscal,dy20);
1015	tz = _mm_mul_ps(fscal,dz20);
1016
1017	/* Update vectorial force */
1018	fix2 = _mm_add_ps(fix2,tx);
1019	fiy2 = _mm_add_ps(fiy2,ty);
1020	fiz2 = _mm_add_ps(fiz2,tz);
1021
1022	fjx0 = _mm_add_ps(fjx0,tx);
1023	fjy0 = _mm_add_ps(fjy0,ty);
1024	fjz0 = _mm_add_ps(fjz0,tz);
1025
1026	}
1027
1028	/**************************
1029	* CALCULATE INTERACTIONS *
1030	**************************/
1031
1032	if (gmx_mm_any_lt(rsq30,rcutoff2))
1033	{
1034
1035	/* Compute parameters for interactions between i and j atoms */
1036	qq30 = _mm_mul_ps(iq3,jq0);
1037
1038	/* REACTION-FIELD ELECTROSTATICS */
1039	felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
1040
1041	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1042
1043	fscal = felec;
1044
1045	fscal = _mm_and_ps(fscal,cutoff_mask);
1046
1047	/* Calculate temporary vectorial force */
1048	tx = _mm_mul_ps(fscal,dx30);
1049	ty = _mm_mul_ps(fscal,dy30);
1050	tz = _mm_mul_ps(fscal,dz30);
1051
1052	/* Update vectorial force */
1053	fix3 = _mm_add_ps(fix3,tx);
1054	fiy3 = _mm_add_ps(fiy3,ty);
1055	fiz3 = _mm_add_ps(fiz3,tz);
1056
1057	fjx0 = _mm_add_ps(fjx0,tx);
1058	fjy0 = _mm_add_ps(fjy0,ty);
1059	fjz0 = _mm_add_ps(fjz0,tz);
1060
1061	}
1062
1063	fjptrA = f+j_coord_offsetA;
1064	fjptrB = f+j_coord_offsetB;
1065	fjptrC = f+j_coord_offsetC;
1066	fjptrD = f+j_coord_offsetD;
1067
1068	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1069
1070	/* Inner loop uses 120 flops */
1071	}
1072
1073	if(jidx<j_index_end)
1074	{
1075
1076	/* Get j neighbor index, and coordinate index */
1077	jnrlistA = jjnr[jidx];
1078	jnrlistB = jjnr[jidx+1];
1079	jnrlistC = jjnr[jidx+2];
1080	jnrlistD = jjnr[jidx+3];
1081	/* Sign of each element will be negative for non-real atoms.
1082	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1083	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1084	*/
1085	dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1086	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1087	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1088	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1089	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1090	j_coord_offsetA = DIM*jnrA;
1091	j_coord_offsetB = DIM*jnrB;
1092	j_coord_offsetC = DIM*jnrC;
1093	j_coord_offsetD = DIM*jnrD;
1094
1095	/* load j atom coordinates */
1096	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1097	x+j_coord_offsetC,x+j_coord_offsetD,
1098	&jx0,&jy0,&jz0);
1099
1100	/* Calculate displacement vector */
1101	dx00 = _mm_sub_ps(ix0,jx0);
1102	dy00 = _mm_sub_ps(iy0,jy0);
1103	dz00 = _mm_sub_ps(iz0,jz0);
1104	dx10 = _mm_sub_ps(ix1,jx0);
1105	dy10 = _mm_sub_ps(iy1,jy0);
1106	dz10 = _mm_sub_ps(iz1,jz0);
1107	dx20 = _mm_sub_ps(ix2,jx0);
1108	dy20 = _mm_sub_ps(iy2,jy0);
1109	dz20 = _mm_sub_ps(iz2,jz0);
1110	dx30 = _mm_sub_ps(ix3,jx0);
1111	dy30 = _mm_sub_ps(iy3,jy0);
1112	dz30 = _mm_sub_ps(iz3,jz0);
1113
1114	/* Calculate squared distance and things based on it */
1115	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1116	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1117	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1118	rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1119
1120	rinv10 = gmx_mm_invsqrt_ps(rsq10);
1121	rinv20 = gmx_mm_invsqrt_ps(rsq20);
1122	rinv30 = gmx_mm_invsqrt_ps(rsq30);
1123
1124	rinvsq00 = gmx_mm_inv_ps(rsq00);
1125	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1126	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1127	rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1128
1129	/* Load parameters for j particles */
1130	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1131	charge+jnrC+0,charge+jnrD+0);
1132	vdwjidx0A = 2*vdwtype[jnrA+0];
1133	vdwjidx0B = 2*vdwtype[jnrB+0];
1134	vdwjidx0C = 2*vdwtype[jnrC+0];
1135	vdwjidx0D = 2*vdwtype[jnrD+0];
1136
1137	fjx0 = _mm_setzero_ps();
1138	fjy0 = _mm_setzero_ps();
1139	fjz0 = _mm_setzero_ps();
1140
1141	/**************************
1142	* CALCULATE INTERACTIONS *
1143	**************************/
1144
1145	if (gmx_mm_any_lt(rsq00,rcutoff2))
1146	{
1147
1148	/* Compute parameters for interactions between i and j atoms */
1149	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1150	vdwparam+vdwioffset0+vdwjidx0B,
1151	vdwparam+vdwioffset0+vdwjidx0C,
1152	vdwparam+vdwioffset0+vdwjidx0D,
1153	&c6_00,&c12_00);
1154
1155	/* LENNARD-JONES DISPERSION/REPULSION */
1156
1157	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1158	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1159
1160	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1161
1162	fscal = fvdw;
1163
1164	fscal = _mm_and_ps(fscal,cutoff_mask);
1165
1166	fscal = _mm_andnot_ps(dummy_mask,fscal);
1167
1168	/* Calculate temporary vectorial force */
1169	tx = _mm_mul_ps(fscal,dx00);
1170	ty = _mm_mul_ps(fscal,dy00);
1171	tz = _mm_mul_ps(fscal,dz00);
1172
1173	/* Update vectorial force */
1174	fix0 = _mm_add_ps(fix0,tx);
1175	fiy0 = _mm_add_ps(fiy0,ty);
1176	fiz0 = _mm_add_ps(fiz0,tz);
1177
1178	fjx0 = _mm_add_ps(fjx0,tx);
1179	fjy0 = _mm_add_ps(fjy0,ty);
1180	fjz0 = _mm_add_ps(fjz0,tz);
1181
1182	}
1183
1184	/**************************
1185	* CALCULATE INTERACTIONS *
1186	**************************/
1187
1188	if (gmx_mm_any_lt(rsq10,rcutoff2))
1189	{
1190
1191	/* Compute parameters for interactions between i and j atoms */
1192	qq10 = _mm_mul_ps(iq1,jq0);
1193
1194	/* REACTION-FIELD ELECTROSTATICS */
1195	felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1196
1197	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1198
1199	fscal = felec;
1200
1201	fscal = _mm_and_ps(fscal,cutoff_mask);
1202
1203	fscal = _mm_andnot_ps(dummy_mask,fscal);
1204
1205	/* Calculate temporary vectorial force */
1206	tx = _mm_mul_ps(fscal,dx10);
1207	ty = _mm_mul_ps(fscal,dy10);
1208	tz = _mm_mul_ps(fscal,dz10);
1209
1210	/* Update vectorial force */
1211	fix1 = _mm_add_ps(fix1,tx);
1212	fiy1 = _mm_add_ps(fiy1,ty);
1213	fiz1 = _mm_add_ps(fiz1,tz);
1214
1215	fjx0 = _mm_add_ps(fjx0,tx);
1216	fjy0 = _mm_add_ps(fjy0,ty);
1217	fjz0 = _mm_add_ps(fjz0,tz);
1218
1219	}
1220
1221	/**************************
1222	* CALCULATE INTERACTIONS *
1223	**************************/
1224
1225	if (gmx_mm_any_lt(rsq20,rcutoff2))
1226	{
1227
1228	/* Compute parameters for interactions between i and j atoms */
1229	qq20 = _mm_mul_ps(iq2,jq0);
1230
1231	/* REACTION-FIELD ELECTROSTATICS */
1232	felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1233
1234	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1235
1236	fscal = felec;
1237
1238	fscal = _mm_and_ps(fscal,cutoff_mask);
1239
1240	fscal = _mm_andnot_ps(dummy_mask,fscal);
1241
1242	/* Calculate temporary vectorial force */
1243	tx = _mm_mul_ps(fscal,dx20);
1244	ty = _mm_mul_ps(fscal,dy20);
1245	tz = _mm_mul_ps(fscal,dz20);
1246
1247	/* Update vectorial force */
1248	fix2 = _mm_add_ps(fix2,tx);
1249	fiy2 = _mm_add_ps(fiy2,ty);
1250	fiz2 = _mm_add_ps(fiz2,tz);
1251
1252	fjx0 = _mm_add_ps(fjx0,tx);
1253	fjy0 = _mm_add_ps(fjy0,ty);
1254	fjz0 = _mm_add_ps(fjz0,tz);
1255
1256	}
1257
1258	/**************************
1259	* CALCULATE INTERACTIONS *
1260	**************************/
1261
1262	if (gmx_mm_any_lt(rsq30,rcutoff2))
1263	{
1264
1265	/* Compute parameters for interactions between i and j atoms */
1266	qq30 = _mm_mul_ps(iq3,jq0);
1267
1268	/* REACTION-FIELD ELECTROSTATICS */
1269	felec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_mul_ps(rinv30,rinvsq30),krf2));
1270
1271	cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1272
1273	fscal = felec;
1274
1275	fscal = _mm_and_ps(fscal,cutoff_mask);
1276
1277	fscal = _mm_andnot_ps(dummy_mask,fscal);
1278
1279	/* Calculate temporary vectorial force */
1280	tx = _mm_mul_ps(fscal,dx30);
1281	ty = _mm_mul_ps(fscal,dy30);
1282	tz = _mm_mul_ps(fscal,dz30);
1283
1284	/* Update vectorial force */
1285	fix3 = _mm_add_ps(fix3,tx);
1286	fiy3 = _mm_add_ps(fiy3,ty);
1287	fiz3 = _mm_add_ps(fiz3,tz);
1288
1289	fjx0 = _mm_add_ps(fjx0,tx);
1290	fjy0 = _mm_add_ps(fjy0,ty);
1291	fjz0 = _mm_add_ps(fjz0,tz);
1292
1293	}
1294
1295	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1296	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1297	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1298	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1299
1300	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1301
1302	/* Inner loop uses 120 flops */
1303	}
1304
1305	/* End of innermost loop */
1306
1307	gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1308	f+i_coord_offset,fshift+i_shift_offset);
1309
1310	/* Increment number of inner iterations */
1311	inneriter += j_index_end - j_index_start;
1312
1313	/* Outer loop uses 24 flops */
1314	}
1315
1316	/* Increment number of outer iterations */
1317	outeriter += nri;
1318
1319	/* Update outer/inner flops */
1320
1321	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*120);
1322	}