/home/alexxy/Develop/gromacs/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn

Bug Summary

File:	gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h
Location:	line 602, column 9
Description:	Value stored to 'vctot_S' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35
36
37	{
38	const nbnxn_ci_t *nbln;
39	const nbnxn_cj_t *l_cj;
40	const int * type;
41	const real * q;
42	const real *shiftvec;
43	const real *x;
44	const real nbfp0, nbfp1, nbfp2 = NULL((void)0), nbfp3 = NULL((void)0);
45	real facel;
46	real *nbfp_ptr;
47	int n, ci, ci_sh;
48	int ish, ish3;
49	gmx_bool do_LJ, half_LJ, do_coul, do_self;
50	int sci, scix, sciy, sciz, sci2;
51	int cjind0, cjind1, cjind;
52	int ip, jp;
53
54	#ifdef ENERGY_GROUPS
55	int Vstride_i;
56	int egps_ishift, egps_imask;
57	int egps_jshift, egps_jmask, egps_jstride;
58	int egps_i;
59	real *vvdwtp[UNROLLI4];
60	real *vctp[UNROLLI4];
61	#endif
62
63	gmx_simd_real_t__m128 shX_S;
64	gmx_simd_real_t__m128 shY_S;
65	gmx_simd_real_t__m128 shZ_S;
66	gmx_simd_real_t__m128 ix_S0, iy_S0, iz_S0;
67	gmx_simd_real_t__m128 ix_S1, iy_S1, iz_S1;
68	gmx_simd_real_t__m128 ix_S2, iy_S2, iz_S2;
69	gmx_simd_real_t__m128 ix_S3, iy_S3, iz_S3;
70	gmx_simd_real_t__m128 fix_S0, fiy_S0, fiz_S0;
71	gmx_simd_real_t__m128 fix_S1, fiy_S1, fiz_S1;
72	gmx_simd_real_t__m128 fix_S2, fiy_S2, fiz_S2;
73	gmx_simd_real_t__m128 fix_S3, fiy_S3, fiz_S3;
74	#if UNROLLJ(4/1) >= 4
75	/* We use an i-force SIMD register width of 4 */
76	gmx_simd4_real_t__m128 fix_S, fiy_S, fiz_S;
77	#else
78	/* We use an i-force SIMD register width of 2 */
79	gmx_simd_real_t__m128 fix0_S, fiy0_S, fiz0_S;
80	gmx_simd_real_t__m128 fix2_S, fiy2_S, fiz2_S;
81	#endif
82
83	gmx_simd_real_t__m128 diagonal_jmi_S;
84	#if UNROLLI4 == UNROLLJ(4/1)
85	gmx_simd_bool_t__m128 diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
86	#else
87	gmx_simd_bool_t__m128 diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
88	gmx_simd_bool_t__m128 diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
89	#endif
90
91	unsigned *exclusion_filter;
92	gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3;
93
94	gmx_simd_real_t__m128 zero_S = gmx_simd_set1_r_mm_set1_ps(0.0);
95
96	gmx_simd_real_t__m128 one_S = gmx_simd_set1_r_mm_set1_ps(1.0);
97	gmx_simd_real_t__m128 iq_S0 = gmx_simd_setzero_r_mm_setzero_ps();
98	gmx_simd_real_t__m128 iq_S1 = gmx_simd_setzero_r_mm_setzero_ps();
99	gmx_simd_real_t__m128 iq_S2 = gmx_simd_setzero_r_mm_setzero_ps();
100	gmx_simd_real_t__m128 iq_S3 = gmx_simd_setzero_r_mm_setzero_ps();
101
102	#ifdef CALC_COUL_RF
103	gmx_simd_real_t__m128 mrc_3_S;
104	#ifdef CALC_ENERGIES
105	gmx_simd_real_t__m128 hrc_3_S, moh_rc_S;
106	#endif
107	#endif
108
109	#ifdef CALC_COUL_TAB
110	/* Coulomb table variables */
111	gmx_simd_real_t__m128 invtsp_S;
112	const real * tab_coul_F;
113	#ifndef TAB_FDV0
114	const real * tab_coul_V;
115	#endif
116	/* Thread-local working buffers for force and potential lookups */
117	int ti0_array[2GMX_SIMD_REAL_WIDTH4], ti0 = NULL((void*)0);
118	int ti1_array[2GMX_SIMD_REAL_WIDTH4], ti1 = NULL((void*)0);
119	int ti2_array[2GMX_SIMD_REAL_WIDTH4], ti2 = NULL((void*)0);
120	int ti3_array[2GMX_SIMD_REAL_WIDTH4], ti3 = NULL((void*)0);
121	#ifdef CALC_ENERGIES
122	gmx_simd_real_t__m128 mhalfsp_S;
123	#endif
124	#endif
125
126	#ifdef CALC_COUL_EWALD
127	gmx_simd_real_t__m128 beta2_S, beta_S;
128	#endif
129
130	#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD \|\| defined CALC_COUL_TAB)
131	gmx_simd_real_t__m128 sh_ewald_S;
132	#endif
133
134	#if defined LJ_CUT && defined CALC_ENERGIES
135	gmx_simd_real_t__m128 p6_cpot_S, p12_cpot_S;
136	#endif
137	#ifdef LJ_POT_SWITCH
138	gmx_simd_real_t__m128 rswitch_S;
139	gmx_simd_real_t__m128 swV3_S, swV4_S, swV5_S;
140	gmx_simd_real_t__m128 swF2_S, swF3_S, swF4_S;
141	#endif
142	#ifdef LJ_FORCE_SWITCH
143	gmx_simd_real_t__m128 rswitch_S;
144	gmx_simd_real_t__m128 p6_fc2_S, p6_fc3_S;
145	gmx_simd_real_t__m128 p12_fc2_S, p12_fc3_S;
146	#ifdef CALC_ENERGIES
147	gmx_simd_real_t__m128 p6_vc3_S, p6_vc4_S;
148	gmx_simd_real_t__m128 p12_vc3_S, p12_vc4_S;
149	gmx_simd_real_t__m128 p6_6cpot_S, p12_12cpot_S;
150	#endif
151	#endif
152	#ifdef LJ_EWALD_GEOM
153	real lj_ewaldcoeff2, lj_ewaldcoeff6_6;
154	gmx_simd_real_t__m128 mone_S, half_S, lje_c2_S, lje_c6_6_S, lje_vc_S;
155	#endif
156
157	#ifdef LJ_COMB_LB
158	const real *ljc;
159
160	gmx_simd_real_t__m128 hsig_i_S0, seps_i_S0;
161	gmx_simd_real_t__m128 hsig_i_S1, seps_i_S1;
162	gmx_simd_real_t__m128 hsig_i_S2, seps_i_S2;
163	gmx_simd_real_t__m128 hsig_i_S3, seps_i_S3;
164	#else
165	#ifdef FIX_LJ_C
166	real pvdw_array[2UNROLLI4UNROLLJ(4/1)+3];
167	real pvdw_c6, pvdw_c12;
168	gmx_simd_real_t__m128 c6_S0, c12_S0;
169	gmx_simd_real_t__m128 c6_S1, c12_S1;
170	gmx_simd_real_t__m128 c6_S2, c12_S2;
171	gmx_simd_real_t__m128 c6_S3, c12_S3;
172	#endif
173
174	#if defined LJ_COMB_GEOM \|\| defined LJ_EWALD_GEOM
175	const real *ljc;
176
177	gmx_simd_real_t__m128 c6s_S0, c12s_S0;
178	gmx_simd_real_t__m128 c6s_S1, c12s_S1;
179	gmx_simd_real_t__m128 c6s_S2 = gmx_simd_setzero_r_mm_setzero_ps();
180	gmx_simd_real_t__m128 c12s_S2 = gmx_simd_setzero_r_mm_setzero_ps();
181	gmx_simd_real_t__m128 c6s_S3 = gmx_simd_setzero_r_mm_setzero_ps();
182	gmx_simd_real_t__m128 c12s_S3 = gmx_simd_setzero_r_mm_setzero_ps();
183	#endif
184	#endif /* LJ_COMB_LB */
185
186	gmx_simd_real_t__m128 vctot_S, Vvdwtot_S;
187	gmx_simd_real_t__m128 sixth_S, twelveth_S;
188
189	gmx_simd_real_t__m128 avoid_sing_S;
190	gmx_simd_real_t__m128 rc2_S;
191	#ifdef VDW_CUTOFF_CHECK
192	gmx_simd_real_t__m128 rcvdw2_S;
193	#endif
194
195	int ninner;
196
197	#ifdef COUNT_PAIRS
198	int npair = 0;
199	#endif
200
201	#if defined LJ_COMB_GEOM \|\| defined LJ_COMB_LB \|\| defined LJ_EWALD_GEOM
202	ljc = nbat->lj_comb;
203	#endif
204	#if !(defined LJ_COMB_GEOM \|\| defined LJ_COMB_LB)
205	/* No combination rule used */
206	nbfp_ptr = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
207	#endif
208
209	/* Load j-i for the first i */
210	diagonal_jmi_S = gmx_simd_load_r_mm_load_ps(nbat->simd_4xn_diagonal_j_minus_i);
211	/* Generate all the diagonal masks as comparison results */
212	#if UNROLLI4 == UNROLLJ(4/1)
213	diagonal_mask_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
214	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
215	diagonal_mask_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
216	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
217	diagonal_mask_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
218	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
219	diagonal_mask_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
220	#else
221	#if UNROLLI4 == 2UNROLLJ(4/1) \|\| 2UNROLLI4 == UNROLLJ(4/1)
222	diagonal_mask0_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
223	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
224	diagonal_mask0_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
225	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
226	diagonal_mask0_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
227	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
228	diagonal_mask0_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
229	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
230
231	#if UNROLLI4 == 2*UNROLLJ(4/1)
232	/* Load j-i for the second half of the j-cluster */
233	diagonal_jmi_S = gmx_simd_load_r_mm_load_ps(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ(4/1));
234	#endif
235
236	diagonal_mask1_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
237	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
238	diagonal_mask1_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
239	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
240	diagonal_mask1_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
241	diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S);
242	diagonal_mask1_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S);
243	#endif
244	#endif
245
246	/* Load masks for topology exclusion masking. filter_stride is
247	static const, so the conditional will be optimized away. */
248	if (1 == filter_stride)
249	{
250	exclusion_filter = nbat->simd_exclusion_filter1;
251	}
252	else /* (2 == filter_stride) */
253	{
254	exclusion_filter = nbat->simd_exclusion_filter2;
255	}
256
257	/* Here we cast the exclusion filters from unsigned * to int * or real *.
258	* Since we only check bits, the actual value they represent does not
259	* matter, as long as both filter and mask data are treated the same way.
260	*/
261	filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0UNROLLJ(4/1)filter_stride);
262	filter_S1 = gmx_load_exclusion_filter(exclusion_filter + 1UNROLLJ(4/1)filter_stride);
263	filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 2UNROLLJ(4/1)filter_stride);
264	filter_S3 = gmx_load_exclusion_filter(exclusion_filter + 3UNROLLJ(4/1)filter_stride);
265
266	#ifdef CALC_COUL_RF
267	/* Reaction-field constants */
268	mrc_3_S = gmx_simd_set1_r_mm_set1_ps(-2*ic->k_rf);
269	#ifdef CALC_ENERGIES
270	hrc_3_S = gmx_simd_set1_r_mm_set1_ps(ic->k_rf);
271	moh_rc_S = gmx_simd_set1_r_mm_set1_ps(-ic->c_rf);
272	#endif
273	#endif
274
275	#ifdef CALC_COUL_TAB
276	/* Generate aligned table index pointers */
277	ti0 = prepare_table_load_buffer(ti0_array);
278	ti1 = prepare_table_load_buffer(ti1_array);
279	ti2 = prepare_table_load_buffer(ti2_array);
280	ti3 = prepare_table_load_buffer(ti3_array);
281
282	invtsp_S = gmx_simd_set1_r_mm_set1_ps(ic->tabq_scale);
283	#ifdef CALC_ENERGIES
284	mhalfsp_S = gmx_simd_set1_r_mm_set1_ps(-0.5/ic->tabq_scale);
285	#endif
286
287	#ifdef TAB_FDV0
288	tab_coul_F = ic->tabq_coul_FDV0;
289	#else
290	tab_coul_F = ic->tabq_coul_F;
291	tab_coul_V = ic->tabq_coul_V;
292	#endif
293	#endif /* CALC_COUL_TAB */
294
295	#ifdef CALC_COUL_EWALD
296	beta2_S = gmx_simd_set1_r_mm_set1_ps(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
297	beta_S = gmx_simd_set1_r_mm_set1_ps(ic->ewaldcoeff_q);
298	#endif
299
300	#if (defined CALC_COUL_TAB \|\| defined CALC_COUL_EWALD) && defined CALC_ENERGIES
301	sh_ewald_S = gmx_simd_set1_r_mm_set1_ps(ic->sh_ewald);
302	#endif
303
304	/* LJ function constants */
305	#if defined CALC_ENERGIES \|\| defined LJ_POT_SWITCH
306	sixth_S = gmx_simd_set1_r_mm_set1_ps(1.0/6.0);
307	twelveth_S = gmx_simd_set1_r_mm_set1_ps(1.0/12.0);
308	#endif
309
310	#if defined LJ_CUT && defined CALC_ENERGIES
311	/* We shift the potential by cpot, which can be zero */
312	p6_cpot_S = gmx_simd_set1_r_mm_set1_ps(ic->dispersion_shift.cpot);
313	p12_cpot_S = gmx_simd_set1_r_mm_set1_ps(ic->repulsion_shift.cpot);
314	#endif
315	#ifdef LJ_POT_SWITCH
316	rswitch_S = gmx_simd_set1_r_mm_set1_ps(ic->rvdw_switch);
317	swV3_S = gmx_simd_set1_r_mm_set1_ps(ic->vdw_switch.c3);
318	swV4_S = gmx_simd_set1_r_mm_set1_ps(ic->vdw_switch.c4);
319	swV5_S = gmx_simd_set1_r_mm_set1_ps(ic->vdw_switch.c5);
320	swF2_S = gmx_simd_set1_r_mm_set1_ps(3*ic->vdw_switch.c3);
321	swF3_S = gmx_simd_set1_r_mm_set1_ps(4*ic->vdw_switch.c4);
322	swF4_S = gmx_simd_set1_r_mm_set1_ps(5*ic->vdw_switch.c5);
323	#endif
324	#ifdef LJ_FORCE_SWITCH
325	rswitch_S = gmx_simd_set1_r_mm_set1_ps(ic->rvdw_switch);
326	p6_fc2_S = gmx_simd_set1_r_mm_set1_ps(ic->dispersion_shift.c2);
327	p6_fc3_S = gmx_simd_set1_r_mm_set1_ps(ic->dispersion_shift.c3);
328	p12_fc2_S = gmx_simd_set1_r_mm_set1_ps(ic->repulsion_shift.c2);
329	p12_fc3_S = gmx_simd_set1_r_mm_set1_ps(ic->repulsion_shift.c3);
330	#ifdef CALC_ENERGIES
331	{
332	gmx_simd_real_t__m128 mthird_S = gmx_simd_set1_r_mm_set1_ps(-1.0/3.0);
333	gmx_simd_real_t__m128 mfourth_S = gmx_simd_set1_r_mm_set1_ps(-1.0/4.0);
334
335	p6_vc3_S = gmx_simd_mul_r_mm_mul_ps(mthird_S, p6_fc2_S);
336	p6_vc4_S = gmx_simd_mul_r_mm_mul_ps(mfourth_S, p6_fc3_S);
337	p6_6cpot_S = gmx_simd_set1_r_mm_set1_ps(ic->dispersion_shift.cpot/6);
338	p12_vc3_S = gmx_simd_mul_r_mm_mul_ps(mthird_S, p12_fc2_S);
339	p12_vc4_S = gmx_simd_mul_r_mm_mul_ps(mfourth_S, p12_fc3_S);
340	p12_12cpot_S = gmx_simd_set1_r_mm_set1_ps(ic->repulsion_shift.cpot/12);
341	}
342	#endif
343	#endif
344	#ifdef LJ_EWALD_GEOM
345	mone_S = gmx_simd_set1_r_mm_set1_ps(-1.0);
346	half_S = gmx_simd_set1_r_mm_set1_ps(0.5);
347	lj_ewaldcoeff2 = ic->ewaldcoeff_lj*ic->ewaldcoeff_lj;
348	lj_ewaldcoeff6_6 = lj_ewaldcoeff2lj_ewaldcoeff2lj_ewaldcoeff2/6;
349	lje_c2_S = gmx_simd_set1_r_mm_set1_ps(lj_ewaldcoeff2);
350	lje_c6_6_S = gmx_simd_set1_r_mm_set1_ps(lj_ewaldcoeff6_6);
351	/* Determine the grid potential at the cut-off */
352	lje_vc_S = gmx_simd_set1_r_mm_set1_ps(ic->sh_lj_ewald);
353	#endif
354
355	/* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
356	rc2_S = gmx_simd_set1_r_mm_set1_ps(ic->rcoulomb*ic->rcoulomb);
357	#ifdef VDW_CUTOFF_CHECK
358	rcvdw2_S = gmx_simd_set1_r_mm_set1_ps(ic->rvdw*ic->rvdw);
359	#endif
360
361	avoid_sing_S = gmx_simd_set1_r_mm_set1_ps(NBNXN_AVOID_SING_R2_INC1.0e-12f);
362
363	q = nbat->q;
364	type = nbat->type;
365	facel = ic->epsfac;
366	shiftvec = shift_vec[0];
367	x = nbat->x;
368
369	#ifdef FIX_LJ_C
370	pvdw_c6 = gmx_simd_align_real(pvdw_array);
371	pvdw_c12 = pvdw_c6 + UNROLLI4*UNROLLJ(4/1);
372
373	for (jp = 0; jp < UNROLLJ(4/1); jp++)
374	{
375	pvdw_c6 [0UNROLLJ(4/1)+jp] = nbat->nbfp[02];
376	pvdw_c6 [1UNROLLJ(4/1)+jp] = nbat->nbfp[02];
377	pvdw_c6 [2UNROLLJ(4/1)+jp] = nbat->nbfp[02];
378	pvdw_c6 [3UNROLLJ(4/1)+jp] = nbat->nbfp[02];
379
380	pvdw_c12[0UNROLLJ(4/1)+jp] = nbat->nbfp[02+1];
381	pvdw_c12[1UNROLLJ(4/1)+jp] = nbat->nbfp[02+1];
382	pvdw_c12[2UNROLLJ(4/1)+jp] = nbat->nbfp[02+1];
383	pvdw_c12[3UNROLLJ(4/1)+jp] = nbat->nbfp[02+1];
384	}
385	c6_S0 = gmx_simd_load_r_mm_load_ps(pvdw_c6 +0*UNROLLJ(4/1));
386	c6_S1 = gmx_simd_load_r_mm_load_ps(pvdw_c6 +1*UNROLLJ(4/1));
387	c6_S2 = gmx_simd_load_r_mm_load_ps(pvdw_c6 +2*UNROLLJ(4/1));
388	c6_S3 = gmx_simd_load_r_mm_load_ps(pvdw_c6 +3*UNROLLJ(4/1));
389
390	c12_S0 = gmx_simd_load_r_mm_load_ps(pvdw_c12+0*UNROLLJ(4/1));
391	c12_S1 = gmx_simd_load_r_mm_load_ps(pvdw_c12+1*UNROLLJ(4/1));
392	c12_S2 = gmx_simd_load_r_mm_load_ps(pvdw_c12+2*UNROLLJ(4/1));
393	c12_S3 = gmx_simd_load_r_mm_load_ps(pvdw_c12+3*UNROLLJ(4/1));
394	#endif /* FIX_LJ_C */
395
396	#ifdef ENERGY_GROUPS
397	egps_ishift = nbat->neg_2log;
398	egps_imask = (1<<egps_ishift) - 1;
399	egps_jshift = 2*nbat->neg_2log;
400	egps_jmask = (1<<egps_jshift) - 1;
401	egps_jstride = (UNROLLJ(4/1)>>1)*UNROLLJ(4/1);
402	/* Major division is over i-particle energy groups, determine the stride */
403	Vstride_i = nbat->nenergrp(1<<nbat->neg_2log)egps_jstride;
404	#endif
405
406	l_cj = nbl->cj;
407
408	ninner = 0;
409	for (n = 0; n < nbl->nci; n++)
410	{
411	nbln = &nbl->ci[n];
412
413	ish = (nbln->shift & NBNXN_CI_SHIFT127);
414	ish3 = ish*3;
415	cjind0 = nbln->cj_ind_start;
416	cjind1 = nbln->cj_ind_end;
417	ci = nbln->ci;
418	ci_sh = (ish == CENTRAL(((21 +1)(21 +1)(2*2 +1))/2) ? ci : -1);
419
420	shX_S = gmx_simd_load1_r_mm_load1_ps(shiftvec+ish3);
421	shY_S = gmx_simd_load1_r_mm_load1_ps(shiftvec+ish3+1);
422	shZ_S = gmx_simd_load1_r_mm_load1_ps(shiftvec+ish3+2);
423
424	#if UNROLLJ(4/1) <= 4
425	sci = ci*STRIDE(4/1);
426	scix = sci*DIM3;
427	sci2 = sci*2;
428	#else
429	sci = (ci>>1)*STRIDE(4/1);
430	scix = sciDIM3 + (ci & 1)(STRIDE(4/1)>>1);
431	sci2 = sci2 + (ci & 1)(STRIDE(4/1)>>1);
432	sci += (ci & 1)*(STRIDE(4/1)>>1);
433	#endif
434
435	/* We have 5 LJ/C combinations, but use only three inner loops,
436	* as the other combinations are unlikely and/or not much faster:
437	* inner half-LJ + C for half-LJ + C / no-LJ + C
438	* inner LJ + C for full-LJ + C
439	* inner LJ for full-LJ + no-C / half-LJ + no-C
440	*/
441	do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0)(1<<(7+3*(0))));
442	do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0)(1<<(9+3*(0))));
443	half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)(1<<(8+3*(0)))) \|\| !do_LJ) && do_coul;
444	#ifdef LJ_EWALD_GEOM
445	do_self = TRUE1;
446	#else
447	do_self = do_coul;
448	#endif
449
450	#ifdef ENERGY_GROUPS
451	egps_i = nbat->energrp[ci];
452	{
453	int ia, egp_ia;
454
455	for (ia = 0; ia < UNROLLI4; ia++)
456	{
457	egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
458	vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
459	vctp[ia] = Vc + egp_ia*Vstride_i;
460	}
461	}
462	#endif
463
464	#ifdef CALC_ENERGIES
465	#if UNROLLJ(4/1) == 4
466	if (do_self && l_cj[nbln->cj_ind_start].cj == ci_sh)
467	#endif
468	#if UNROLLJ(4/1) == 2
469	if (do_self && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
470	#endif
471	#if UNROLLJ(4/1) == 8
472	if (do_self && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
473	#endif
474	{
475	if (do_coul)
476	{
477	real Vc_sub_self;
478	int ia;
479
480	#ifdef CALC_COUL_RF
481	Vc_sub_self = 0.5*ic->c_rf;
482	#endif
483	#ifdef CALC_COUL_TAB
484	#ifdef TAB_FDV0
485	Vc_sub_self = 0.5*tab_coul_F[2];
486	#else
487	Vc_sub_self = 0.5*tab_coul_V[0];
488	#endif
489	#endif
490	#ifdef CALC_COUL_EWALD
491	/* beta/sqrt(pi) */
492	Vc_sub_self = 0.5ic->ewaldcoeff_qM_2_SQRTPI1.12837916709551257390;
493	#endif
494
495	for (ia = 0; ia < UNROLLI4; ia++)
496	{
497	real qi;
498
499	qi = q[sci+ia];
500	#ifdef ENERGY_GROUPS
501	vctp[ia][((egps_i>>(iaegps_ishift)) & egps_imask)egps_jstride]
502	#else
503	Vc[0]
504	#endif
505	-= facelqiqi*Vc_sub_self;
506	}
507	}
508
509	#ifdef LJ_EWALD_GEOM
510	{
511	int ia;
512
513	for (ia = 0; ia < UNROLLI4; ia++)
514	{
515	real c6_i;
516
517	c6_i = nbat->nbfp[nbat->type[sci+ia](nbat->ntype + 1)2]/6;
518	#ifdef ENERGY_GROUPS
519	vvdwtp[ia][((egps_i>>(iaegps_ishift)) & egps_imask)egps_jstride]
520	#else
521	Vvdw[0]
522	#endif
523	+= 0.5c6_ilj_ewaldcoeff6_6;
524	}
525	}
526	#endif /* LJ_EWALD_GEOM */
527	}
528	#endif
529
530	/* Load i atom data */
531	sciy = scix + STRIDE(4/1);
532	sciz = sciy + STRIDE(4/1);
533	ix_S0 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+scix), shX_S);
534	ix_S1 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+scix+1), shX_S);
535	ix_S2 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+scix+2), shX_S);
536	ix_S3 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+scix+3), shX_S);
537	iy_S0 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciy), shY_S);
538	iy_S1 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciy+1), shY_S);
539	iy_S2 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciy+2), shY_S);
540	iy_S3 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciy+3), shY_S);
541	iz_S0 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciz), shZ_S);
542	iz_S1 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciz+1), shZ_S);
543	iz_S2 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciz+2), shZ_S);
544	iz_S3 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciz+3), shZ_S);
545
546	if (do_coul)
547	{
548	iq_S0 = gmx_simd_set1_r_mm_set1_ps(facel*q[sci]);
549	iq_S1 = gmx_simd_set1_r_mm_set1_ps(facel*q[sci+1]);
550	iq_S2 = gmx_simd_set1_r_mm_set1_ps(facel*q[sci+2]);
551	iq_S3 = gmx_simd_set1_r_mm_set1_ps(facel*q[sci+3]);
552	}
553
554	#ifdef LJ_COMB_LB
555	hsig_i_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+0);
556	hsig_i_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+1);
557	hsig_i_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+2);
558	hsig_i_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+3);
559	seps_i_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+0);
560	seps_i_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+1);
561	seps_i_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+2);
562	seps_i_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+3);
563	#else
564	#ifdef LJ_COMB_GEOM
565	c6s_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+0);
566	c6s_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+1);
567	if (!half_LJ)
568	{
569	c6s_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+2);
570	c6s_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+3);
571	}
572	c12s_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+0);
573	c12s_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+1);
574	if (!half_LJ)
575	{
576	c12s_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+2);
577	c12s_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+3);
578	}
579	#else
580	nbfp0 = nbfp_ptr + type[sci ]nbat->ntypenbfp_stride;
581	nbfp1 = nbfp_ptr + type[sci+1]nbat->ntypenbfp_stride;
582	if (!half_LJ)
583	{
584	nbfp2 = nbfp_ptr + type[sci+2]nbat->ntypenbfp_stride;
585	nbfp3 = nbfp_ptr + type[sci+3]nbat->ntypenbfp_stride;
586	}
587	#endif
588	#endif
589	#ifdef LJ_EWALD_GEOM
590	/* We need the geometrically combined C6 for the PME grid correction */
591	c6s_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+0);
592	c6s_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+1);
593	if (!half_LJ)
594	{
595	c6s_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+2);
596	c6s_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+3);
597	}
598	#endif
599
600	/* Zero the potential energy for this list */
601	Vvdwtot_S = gmx_simd_setzero_r_mm_setzero_ps();
602	vctot_S = gmx_simd_setzero_r_mm_setzero_ps();
	Value stored to 'vctot_S' is never read
603
604	/* Clear i atom forces */
605	fix_S0 = gmx_simd_setzero_r_mm_setzero_ps();
606	fix_S1 = gmx_simd_setzero_r_mm_setzero_ps();
607	fix_S2 = gmx_simd_setzero_r_mm_setzero_ps();
608	fix_S3 = gmx_simd_setzero_r_mm_setzero_ps();
609	fiy_S0 = gmx_simd_setzero_r_mm_setzero_ps();
610	fiy_S1 = gmx_simd_setzero_r_mm_setzero_ps();
611	fiy_S2 = gmx_simd_setzero_r_mm_setzero_ps();
612	fiy_S3 = gmx_simd_setzero_r_mm_setzero_ps();
613	fiz_S0 = gmx_simd_setzero_r_mm_setzero_ps();
614	fiz_S1 = gmx_simd_setzero_r_mm_setzero_ps();
615	fiz_S2 = gmx_simd_setzero_r_mm_setzero_ps();
616	fiz_S3 = gmx_simd_setzero_r_mm_setzero_ps();
617
618	cjind = cjind0;
619
620	/* Currently all kernels use (at least half) LJ */
621	#define CALC_LJ
622	if (half_LJ)
623	{
624	/* Coulomb: all i-atoms, LJ: first half i-atoms */
625	#define CALC_COULOMB
626	#define HALF_LJ
627	#define CHECK_EXCLS
628	while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
629	{
630	#include "nbnxn_kernel_simd_4xn_inner.h"
631	cjind++;
632	}
633	#undef CHECK_EXCLS
634	for (; (cjind < cjind1); cjind++)
635	{
636	#include "nbnxn_kernel_simd_4xn_inner.h"
637	}
638	#undef HALF_LJ
639	#undef CALC_COULOMB
640	}
641	else if (do_coul)
642	{
643	/* Coulomb: all i-atoms, LJ: all i-atoms */
644	#define CALC_COULOMB
645	#define CHECK_EXCLS
646	while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
647	{
648	#include "nbnxn_kernel_simd_4xn_inner.h"
649	cjind++;
650	}
651	#undef CHECK_EXCLS
652	for (; (cjind < cjind1); cjind++)
653	{
654	#include "nbnxn_kernel_simd_4xn_inner.h"
655	}
656	#undef CALC_COULOMB
657	}
658	else
659	{
660	/* Coulomb: none, LJ: all i-atoms */
661	#define CHECK_EXCLS
662	while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
663	{
664	#include "nbnxn_kernel_simd_4xn_inner.h"
665	cjind++;
666	}
667	#undef CHECK_EXCLS
668	for (; (cjind < cjind1); cjind++)
669	{
670	#include "nbnxn_kernel_simd_4xn_inner.h"
671	}
672	}
673	#undef CALC_LJ
674	ninner += cjind1 - cjind0;
675
676	/* Add accumulated i-forces to the force array */
677	#if UNROLLJ(4/1) >= 4
678	fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
679	gmx_simd4_store_r_mm_store_ps(f+scix, gmx_simd4_add_r_mm_add_ps(fix_S, gmx_simd4_load_r_mm_load_ps(f+scix)));
680
681	fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
682	gmx_simd4_store_r_mm_store_ps(f+sciy, gmx_simd4_add_r_mm_add_ps(fiy_S, gmx_simd4_load_r_mm_load_ps(f+sciy)));
683
684	fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
685	gmx_simd4_store_r_mm_store_ps(f+sciz, gmx_simd4_add_r_mm_add_ps(fiz_S, gmx_simd4_load_r_mm_load_ps(f+sciz)));
686
687	#ifdef CALC_SHIFTFORCES
688	fshift[ish3+0] += gmx_simd4_reduce_r(fix_S)gmx_simd_reduce_f_sse4_1(fix_S);
689	fshift[ish3+1] += gmx_simd4_reduce_r(fiy_S)gmx_simd_reduce_f_sse4_1(fiy_S);
690	fshift[ish3+2] += gmx_simd4_reduce_r(fiz_S)gmx_simd_reduce_f_sse4_1(fiz_S);
691	#endif
692	#else
693	fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
694	gmx_simd_store_r_mm_store_ps(f+scix, gmx_simd_add_r_mm_add_ps(fix0_S, gmx_simd_load_r_mm_load_ps(f+scix)));
695	fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
696	gmx_simd_store_r_mm_store_ps(f+scix+2, gmx_simd_add_r_mm_add_ps(fix2_S, gmx_simd_load_r_mm_load_ps(f+scix+2)));
697
698	fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
699	gmx_simd_store_r_mm_store_ps(f+sciy, gmx_simd_add_r_mm_add_ps(fiy0_S, gmx_simd_load_r_mm_load_ps(f+sciy)));
700	fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
701	gmx_simd_store_r_mm_store_ps(f+sciy+2, gmx_simd_add_r_mm_add_ps(fiy2_S, gmx_simd_load_r_mm_load_ps(f+sciy+2)));
702
703	fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
704	gmx_simd_store_r_mm_store_ps(f+sciz, gmx_simd_add_r_mm_add_ps(fiz0_S, gmx_simd_load_r_mm_load_ps(f+sciz)));
705	fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
706	gmx_simd_store_r_mm_store_ps(f+sciz+2, gmx_simd_add_r_mm_add_ps(fiz2_S, gmx_simd_load_r_mm_load_ps(f+sciz+2)));
707
708	#ifdef CALC_SHIFTFORCES
709	fshift[ish3+0] += gmx_simd_reduce_r(gmx_simd_add_r(fix0_S, fix2_S))gmx_simd_reduce_f_sse4_1(_mm_add_ps(fix0_S, fix2_S));
710	fshift[ish3+1] += gmx_simd_reduce_r(gmx_simd_add_r(fiy0_S, fiy2_S))gmx_simd_reduce_f_sse4_1(_mm_add_ps(fiy0_S, fiy2_S));
711	fshift[ish3+2] += gmx_simd_reduce_r(gmx_simd_add_r(fiz0_S, fiz2_S))gmx_simd_reduce_f_sse4_1(_mm_add_ps(fiz0_S, fiz2_S));
712	#endif
713	#endif
714
715	#ifdef CALC_ENERGIES
716	if (do_coul)
717	{
718	*Vc += gmx_simd_reduce_r(vctot_S)gmx_simd_reduce_f_sse4_1(vctot_S);
719	}
720
721	*Vvdw += gmx_simd_reduce_r(Vvdwtot_S)gmx_simd_reduce_f_sse4_1(Vvdwtot_S);
722	#endif
723
724	/* Outer loop uses 6 flops/iteration */
725	}
726
727	#ifdef COUNT_PAIRS
728	printf("atom pairs %d\n", npair);
729	#endif
730	}