/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sse4_1_single.c

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sse4_1_single.c
Location:	line 130, column 5
Description:	Value stored to 'j_coord_offsetB' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_single
54	* Electrostatics interaction: Ewald
55	* VdW interaction: LennardJones
56	* Geometry: Particle-Particle
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80	real rcutoff_scalar;
81	real *shiftvec,*fshift,*x,*f;
82	real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83	real scratch[4*DIM];
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
88	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
90	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
91	real *charge;
92	int nvdwtype;
93	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
94	int *vdwtype;
95	real *vdwparam;
96	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
97	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
98	__m128i ewitab;
99	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100	real *ewtab;
101	__m128 dummy_mask,cutoff_mask;
102	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
103	__m128 one = _mm_set1_ps(1.0);
104	__m128 two = _mm_set1_ps(2.0);
105	x = xx[0];
106	f = ff[0];
107
108	nri = nlist->nri;
109	iinr = nlist->iinr;
110	jindex = nlist->jindex;
111	jjnr = nlist->jjnr;
112	shiftidx = nlist->shift;
113	gid = nlist->gid;
114	shiftvec = fr->shift_vec[0];
115	fshift = fr->fshift[0];
116	facel = _mm_set1_ps(fr->epsfac);
117	charge = mdatoms->chargeA;
118	nvdwtype = fr->ntype;
119	vdwparam = fr->nbfp;
120	vdwtype = mdatoms->typeA;
121
122	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
123	ewtab = fr->ic->tabq_coul_FDV0;
124	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
125	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
126
127	/* Avoid stupid compiler warnings */
128	jnrA = jnrB = jnrC = jnrD = 0;
129	j_coord_offsetA = 0;
130	j_coord_offsetB = 0;
	Value stored to 'j_coord_offsetB' is never read
131	j_coord_offsetC = 0;
132	j_coord_offsetD = 0;
133
134	outeriter = 0;
135	inneriter = 0;
136
137	for(iidx=0;iidx<4*DIM;iidx++)
138	{
139	scratch[iidx] = 0.0;
140	}
141
142	/* Start outer loop over neighborlists */
143	for(iidx=0; iidx<nri; iidx++)
144	{
145	/* Load shift vector for this list */
146	i_shift_offset = DIM*shiftidx[iidx];
147
148	/* Load limits for loop over neighbors */
149	j_index_start = jindex[iidx];
150	j_index_end = jindex[iidx+1];
151
152	/* Get outer coordinate index */
153	inr = iinr[iidx];
154	i_coord_offset = DIM*inr;
155
156	/* Load i particle coords and add shift vector */
157	gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
158
159	fix0 = _mm_setzero_ps();
160	fiy0 = _mm_setzero_ps();
161	fiz0 = _mm_setzero_ps();
162
163	/* Load parameters for i particles */
164	iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
165	vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
166
167	/* Reset potential sums */
168	velecsum = _mm_setzero_ps();
169	vvdwsum = _mm_setzero_ps();
170
171	/* Start inner kernel loop */
172	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
173	{
174
175	/* Get j neighbor index, and coordinate index */
176	jnrA = jjnr[jidx];
177	jnrB = jjnr[jidx+1];
178	jnrC = jjnr[jidx+2];
179	jnrD = jjnr[jidx+3];
180	j_coord_offsetA = DIM*jnrA;
181	j_coord_offsetB = DIM*jnrB;
182	j_coord_offsetC = DIM*jnrC;
183	j_coord_offsetD = DIM*jnrD;
184
185	/* load j atom coordinates */
186	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
187	x+j_coord_offsetC,x+j_coord_offsetD,
188	&jx0,&jy0,&jz0);
189
190	/* Calculate displacement vector */
191	dx00 = _mm_sub_ps(ix0,jx0);
192	dy00 = _mm_sub_ps(iy0,jy0);
193	dz00 = _mm_sub_ps(iz0,jz0);
194
195	/* Calculate squared distance and things based on it */
196	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
197
198	rinv00 = gmx_mm_invsqrt_ps(rsq00);
199
200	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
201
202	/* Load parameters for j particles */
203	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
204	charge+jnrC+0,charge+jnrD+0);
205	vdwjidx0A = 2*vdwtype[jnrA+0];
206	vdwjidx0B = 2*vdwtype[jnrB+0];
207	vdwjidx0C = 2*vdwtype[jnrC+0];
208	vdwjidx0D = 2*vdwtype[jnrD+0];
209
210	/**************************
211	* CALCULATE INTERACTIONS *
212	**************************/
213
214	r00 = _mm_mul_ps(rsq00,rinv00);
215
216	/* Compute parameters for interactions between i and j atoms */
217	qq00 = _mm_mul_ps(iq0,jq0);
218	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
219	vdwparam+vdwioffset0+vdwjidx0B,
220	vdwparam+vdwioffset0+vdwjidx0C,
221	vdwparam+vdwioffset0+vdwjidx0D,
222	&c6_00,&c12_00);
223
224	/* EWALD ELECTROSTATICS */
225
226	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
227	ewrt = _mm_mul_ps(r00,ewtabscale);
228	ewitab = _mm_cvttps_epi32(ewrt);
229	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
230	ewitab = _mm_slli_epi32(ewitab,2);
231	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
232	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
233	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
234	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
235	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
236	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
237	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
238	velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
239	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
240
241	/* LENNARD-JONES DISPERSION/REPULSION */
242
243	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
244	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
245	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
246	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
247	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
248
249	/* Update potential sum for this i atom from the interaction with this j atom. */
250	velecsum = _mm_add_ps(velecsum,velec);
251	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
252
253	fscal = _mm_add_ps(felec,fvdw);
254
255	/* Calculate temporary vectorial force */
256	tx = _mm_mul_ps(fscal,dx00);
257	ty = _mm_mul_ps(fscal,dy00);
258	tz = _mm_mul_ps(fscal,dz00);
259
260	/* Update vectorial force */
261	fix0 = _mm_add_ps(fix0,tx);
262	fiy0 = _mm_add_ps(fiy0,ty);
263	fiz0 = _mm_add_ps(fiz0,tz);
264
265	fjptrA = f+j_coord_offsetA;
266	fjptrB = f+j_coord_offsetB;
267	fjptrC = f+j_coord_offsetC;
268	fjptrD = f+j_coord_offsetD;
269	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
270
271	/* Inner loop uses 53 flops */
272	}
273
274	if(jidx<j_index_end)
275	{
276
277	/* Get j neighbor index, and coordinate index */
278	jnrlistA = jjnr[jidx];
279	jnrlistB = jjnr[jidx+1];
280	jnrlistC = jjnr[jidx+2];
281	jnrlistD = jjnr[jidx+3];
282	/* Sign of each element will be negative for non-real atoms.
283	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
284	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
285	*/
286	dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
287	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
288	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
289	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
290	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
291	j_coord_offsetA = DIM*jnrA;
292	j_coord_offsetB = DIM*jnrB;
293	j_coord_offsetC = DIM*jnrC;
294	j_coord_offsetD = DIM*jnrD;
295
296	/* load j atom coordinates */
297	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
298	x+j_coord_offsetC,x+j_coord_offsetD,
299	&jx0,&jy0,&jz0);
300
301	/* Calculate displacement vector */
302	dx00 = _mm_sub_ps(ix0,jx0);
303	dy00 = _mm_sub_ps(iy0,jy0);
304	dz00 = _mm_sub_ps(iz0,jz0);
305
306	/* Calculate squared distance and things based on it */
307	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
308
309	rinv00 = gmx_mm_invsqrt_ps(rsq00);
310
311	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
312
313	/* Load parameters for j particles */
314	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
315	charge+jnrC+0,charge+jnrD+0);
316	vdwjidx0A = 2*vdwtype[jnrA+0];
317	vdwjidx0B = 2*vdwtype[jnrB+0];
318	vdwjidx0C = 2*vdwtype[jnrC+0];
319	vdwjidx0D = 2*vdwtype[jnrD+0];
320
321	/**************************
322	* CALCULATE INTERACTIONS *
323	**************************/
324
325	r00 = _mm_mul_ps(rsq00,rinv00);
326	r00 = _mm_andnot_ps(dummy_mask,r00);
327
328	/* Compute parameters for interactions between i and j atoms */
329	qq00 = _mm_mul_ps(iq0,jq0);
330	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
331	vdwparam+vdwioffset0+vdwjidx0B,
332	vdwparam+vdwioffset0+vdwjidx0C,
333	vdwparam+vdwioffset0+vdwjidx0D,
334	&c6_00,&c12_00);
335
336	/* EWALD ELECTROSTATICS */
337
338	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
339	ewrt = _mm_mul_ps(r00,ewtabscale);
340	ewitab = _mm_cvttps_epi32(ewrt);
341	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
342	ewitab = _mm_slli_epi32(ewitab,2);
343	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
344	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
345	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
346	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
347	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
348	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
349	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
350	velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
351	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
352
353	/* LENNARD-JONES DISPERSION/REPULSION */
354
355	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
356	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
357	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
358	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
359	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
360
361	/* Update potential sum for this i atom from the interaction with this j atom. */
362	velec = _mm_andnot_ps(dummy_mask,velec);
363	velecsum = _mm_add_ps(velecsum,velec);
364	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
365	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
366
367	fscal = _mm_add_ps(felec,fvdw);
368
369	fscal = _mm_andnot_ps(dummy_mask,fscal);
370
371	/* Calculate temporary vectorial force */
372	tx = _mm_mul_ps(fscal,dx00);
373	ty = _mm_mul_ps(fscal,dy00);
374	tz = _mm_mul_ps(fscal,dz00);
375
376	/* Update vectorial force */
377	fix0 = _mm_add_ps(fix0,tx);
378	fiy0 = _mm_add_ps(fiy0,ty);
379	fiz0 = _mm_add_ps(fiz0,tz);
380
381	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
382	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
383	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
384	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
385	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
386
387	/* Inner loop uses 54 flops */
388	}
389
390	/* End of innermost loop */
391
392	gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
393	f+i_coord_offset,fshift+i_shift_offset);
394
395	ggid = gid[iidx];
396	/* Update potential energies */
397	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
398	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
399
400	/* Increment number of inner iterations */
401	inneriter += j_index_end - j_index_start;
402
403	/* Outer loop uses 9 flops */
404	}
405
406	/* Increment number of outer iterations */
407	outeriter += nri;
408
409	/* Update outer/inner flops */
410
411	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*54);
412	}
413	/*
414	* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_single
415	* Electrostatics interaction: Ewald
416	* VdW interaction: LennardJones
417	* Geometry: Particle-Particle
418	* Calculate force/pot: Force
419	*/
420	void
421	nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_single
422	(t_nblist * gmx_restrict nlist,
423	rvec * gmx_restrict xx,
424	rvec * gmx_restrict ff,
425	t_forcerec * gmx_restrict fr,
426	t_mdatoms * gmx_restrict mdatoms,
427	nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
428	t_nrnb * gmx_restrict nrnb)
429	{
430	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
431	* just 0 for non-waters.
432	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
433	* jnr indices corresponding to data put in the four positions in the SIMD register.
434	*/
435	int i_shift_offset,i_coord_offset,outeriter,inneriter;
436	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
437	int jnrA,jnrB,jnrC,jnrD;
438	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
439	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
440	int *iinr,*jindex,*jjnr,*shiftidx,*gid;
441	real rcutoff_scalar;
442	real *shiftvec,*fshift,*x,*f;
443	real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
444	real scratch[4*DIM];
445	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
446	int vdwioffset0;
447	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
448	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
449	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
450	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
451	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
452	real *charge;
453	int nvdwtype;
454	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
455	int *vdwtype;
456	real *vdwparam;
457	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
458	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
459	__m128i ewitab;
460	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
461	real *ewtab;
462	__m128 dummy_mask,cutoff_mask;
463	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
464	__m128 one = _mm_set1_ps(1.0);
465	__m128 two = _mm_set1_ps(2.0);
466	x = xx[0];
467	f = ff[0];
468
469	nri = nlist->nri;
470	iinr = nlist->iinr;
471	jindex = nlist->jindex;
472	jjnr = nlist->jjnr;
473	shiftidx = nlist->shift;
474	gid = nlist->gid;
475	shiftvec = fr->shift_vec[0];
476	fshift = fr->fshift[0];
477	facel = _mm_set1_ps(fr->epsfac);
478	charge = mdatoms->chargeA;
479	nvdwtype = fr->ntype;
480	vdwparam = fr->nbfp;
481	vdwtype = mdatoms->typeA;
482
483	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
484	ewtab = fr->ic->tabq_coul_F;
485	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
486	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
487
488	/* Avoid stupid compiler warnings */
489	jnrA = jnrB = jnrC = jnrD = 0;
490	j_coord_offsetA = 0;
491	j_coord_offsetB = 0;
492	j_coord_offsetC = 0;
493	j_coord_offsetD = 0;
494
495	outeriter = 0;
496	inneriter = 0;
497
498	for(iidx=0;iidx<4*DIM;iidx++)
499	{
500	scratch[iidx] = 0.0;
501	}
502
503	/* Start outer loop over neighborlists */
504	for(iidx=0; iidx<nri; iidx++)
505	{
506	/* Load shift vector for this list */
507	i_shift_offset = DIM*shiftidx[iidx];
508
509	/* Load limits for loop over neighbors */
510	j_index_start = jindex[iidx];
511	j_index_end = jindex[iidx+1];
512
513	/* Get outer coordinate index */
514	inr = iinr[iidx];
515	i_coord_offset = DIM*inr;
516
517	/* Load i particle coords and add shift vector */
518	gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
519
520	fix0 = _mm_setzero_ps();
521	fiy0 = _mm_setzero_ps();
522	fiz0 = _mm_setzero_ps();
523
524	/* Load parameters for i particles */
525	iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
526	vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
527
528	/* Start inner kernel loop */
529	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
530	{
531
532	/* Get j neighbor index, and coordinate index */
533	jnrA = jjnr[jidx];
534	jnrB = jjnr[jidx+1];
535	jnrC = jjnr[jidx+2];
536	jnrD = jjnr[jidx+3];
537	j_coord_offsetA = DIM*jnrA;
538	j_coord_offsetB = DIM*jnrB;
539	j_coord_offsetC = DIM*jnrC;
540	j_coord_offsetD = DIM*jnrD;
541
542	/* load j atom coordinates */
543	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
544	x+j_coord_offsetC,x+j_coord_offsetD,
545	&jx0,&jy0,&jz0);
546
547	/* Calculate displacement vector */
548	dx00 = _mm_sub_ps(ix0,jx0);
549	dy00 = _mm_sub_ps(iy0,jy0);
550	dz00 = _mm_sub_ps(iz0,jz0);
551
552	/* Calculate squared distance and things based on it */
553	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
554
555	rinv00 = gmx_mm_invsqrt_ps(rsq00);
556
557	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
558
559	/* Load parameters for j particles */
560	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
561	charge+jnrC+0,charge+jnrD+0);
562	vdwjidx0A = 2*vdwtype[jnrA+0];
563	vdwjidx0B = 2*vdwtype[jnrB+0];
564	vdwjidx0C = 2*vdwtype[jnrC+0];
565	vdwjidx0D = 2*vdwtype[jnrD+0];
566
567	/**************************
568	* CALCULATE INTERACTIONS *
569	**************************/
570
571	r00 = _mm_mul_ps(rsq00,rinv00);
572
573	/* Compute parameters for interactions between i and j atoms */
574	qq00 = _mm_mul_ps(iq0,jq0);
575	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
576	vdwparam+vdwioffset0+vdwjidx0B,
577	vdwparam+vdwioffset0+vdwjidx0C,
578	vdwparam+vdwioffset0+vdwjidx0D,
579	&c6_00,&c12_00);
580
581	/* EWALD ELECTROSTATICS */
582
583	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
584	ewrt = _mm_mul_ps(r00,ewtabscale);
585	ewitab = _mm_cvttps_epi32(ewrt);
586	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
587	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
588	ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
589	&ewtabF,&ewtabFn);
590	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
591	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
592
593	/* LENNARD-JONES DISPERSION/REPULSION */
594
595	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
596	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
597
598	fscal = _mm_add_ps(felec,fvdw);
599
600	/* Calculate temporary vectorial force */
601	tx = _mm_mul_ps(fscal,dx00);
602	ty = _mm_mul_ps(fscal,dy00);
603	tz = _mm_mul_ps(fscal,dz00);
604
605	/* Update vectorial force */
606	fix0 = _mm_add_ps(fix0,tx);
607	fiy0 = _mm_add_ps(fiy0,ty);
608	fiz0 = _mm_add_ps(fiz0,tz);
609
610	fjptrA = f+j_coord_offsetA;
611	fjptrB = f+j_coord_offsetB;
612	fjptrC = f+j_coord_offsetC;
613	fjptrD = f+j_coord_offsetD;
614	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
615
616	/* Inner loop uses 43 flops */
617	}
618
619	if(jidx<j_index_end)
620	{
621
622	/* Get j neighbor index, and coordinate index */
623	jnrlistA = jjnr[jidx];
624	jnrlistB = jjnr[jidx+1];
625	jnrlistC = jjnr[jidx+2];
626	jnrlistD = jjnr[jidx+3];
627	/* Sign of each element will be negative for non-real atoms.
628	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
629	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
630	*/
631	dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
632	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
633	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
634	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
635	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
636	j_coord_offsetA = DIM*jnrA;
637	j_coord_offsetB = DIM*jnrB;
638	j_coord_offsetC = DIM*jnrC;
639	j_coord_offsetD = DIM*jnrD;
640
641	/* load j atom coordinates */
642	gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
643	x+j_coord_offsetC,x+j_coord_offsetD,
644	&jx0,&jy0,&jz0);
645
646	/* Calculate displacement vector */
647	dx00 = _mm_sub_ps(ix0,jx0);
648	dy00 = _mm_sub_ps(iy0,jy0);
649	dz00 = _mm_sub_ps(iz0,jz0);
650
651	/* Calculate squared distance and things based on it */
652	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
653
654	rinv00 = gmx_mm_invsqrt_ps(rsq00);
655
656	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
657
658	/* Load parameters for j particles */
659	jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
660	charge+jnrC+0,charge+jnrD+0);
661	vdwjidx0A = 2*vdwtype[jnrA+0];
662	vdwjidx0B = 2*vdwtype[jnrB+0];
663	vdwjidx0C = 2*vdwtype[jnrC+0];
664	vdwjidx0D = 2*vdwtype[jnrD+0];
665
666	/**************************
667	* CALCULATE INTERACTIONS *
668	**************************/
669
670	r00 = _mm_mul_ps(rsq00,rinv00);
671	r00 = _mm_andnot_ps(dummy_mask,r00);
672
673	/* Compute parameters for interactions between i and j atoms */
674	qq00 = _mm_mul_ps(iq0,jq0);
675	gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
676	vdwparam+vdwioffset0+vdwjidx0B,
677	vdwparam+vdwioffset0+vdwjidx0C,
678	vdwparam+vdwioffset0+vdwjidx0D,
679	&c6_00,&c12_00);
680
681	/* EWALD ELECTROSTATICS */
682
683	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
684	ewrt = _mm_mul_ps(r00,ewtabscale);
685	ewitab = _mm_cvttps_epi32(ewrt);
686	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
687	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
688	ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
689	&ewtabF,&ewtabFn);
690	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
691	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
692
693	/* LENNARD-JONES DISPERSION/REPULSION */
694
695	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
696	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
697
698	fscal = _mm_add_ps(felec,fvdw);
699
700	fscal = _mm_andnot_ps(dummy_mask,fscal);
701
702	/* Calculate temporary vectorial force */
703	tx = _mm_mul_ps(fscal,dx00);
704	ty = _mm_mul_ps(fscal,dy00);
705	tz = _mm_mul_ps(fscal,dz00);
706
707	/* Update vectorial force */
708	fix0 = _mm_add_ps(fix0,tx);
709	fiy0 = _mm_add_ps(fiy0,ty);
710	fiz0 = _mm_add_ps(fiz0,tz);
711
712	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
713	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
714	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
715	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
716	gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
717
718	/* Inner loop uses 44 flops */
719	}
720
721	/* End of innermost loop */
722
723	gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
724	f+i_coord_offset,fshift+i_shift_offset);
725
726	/* Increment number of inner iterations */
727	inneriter += j_index_end - j_index_start;
728
729	/* Outer loop uses 7 flops */
730	}
731
732	/* Increment number of outer iterations */
733	outeriter += nri;
734
735	/* Update outer/inner flops */
736
737	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*44);
738	}