/home/alexxy/Develop/gromacs/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn

Bug Summary

File:	gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
Location:	line 748, column 5
Description:	Value stored to 'rsw2_r_S1' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35
36	/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
37	* This flavor of the kernel calculates interactions of 4 i-atoms
38	* with N j-atoms stored in N wide SIMD registers.
39	*/
40
41
42	/* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
43	* forces on excluded atom pairs here in the non-bonded loops.
44	* But when energies and/or the virial are required we calculate them
45	* separately, as it is then easier to separate the energy and virial
46	* contributions.
47	*/
48	#if defined CHECK_EXCLS && (defined CALC_COULOMB \|\| defined LJ_EWALD_GEOM)
49	#define EXCL_FORCES
50	#endif
51
52	/* Without exclusions and energies we only need to mask the cut-off,
53	* this can be faster when we have defined gmx_simd_blendv_r, i.e. an instruction
54	* that selects from two SIMD registers based on the contents of a third.
55	*/
56	#if !(defined CHECK_EXCLS \|\| defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV
57	/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
58	* With gcc this is slower, except for RF on Sandy Bridge.
59	* Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
60	*/
61	#if (defined CALC_COUL_RF \|\| defined CALC_COUL_TAB) && (!defined __GNUC__4 \|\| (defined CALC_COUL_RF && defined GMX_SIMD_X86_AVX_256_OR_HIGHER))
62	#define NBNXN_CUTOFF_USE_BLENDV
63	#endif
64	/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
65	* This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
66	* Tested with icc 13.
67	*/
68	#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_SIMD_X86_AVX_256_OR_HIGHER
69	#define NBNXN_CUTOFF_USE_BLENDV
70	#endif
71	#endif
72
73	{
74	int cj, aj, ajx, ajy, ajz;
75
76	#ifdef ENERGY_GROUPS
77	/* Energy group indices for two atoms packed into one int */
78	int egp_jj[UNROLLJ(4/1)/2];
79	#endif
80
81	#ifdef CHECK_EXCLS
82	/* Interaction (non-exclusion) mask of all 1's or 0's */
83	gmx_simd_bool_t__m128 interact_S0;
84	gmx_simd_bool_t__m128 interact_S1;
85	gmx_simd_bool_t__m128 interact_S2;
86	gmx_simd_bool_t__m128 interact_S3;
87	#endif
88
89	gmx_simd_real_t__m128 jx_S, jy_S, jz_S;
90	gmx_simd_real_t__m128 dx_S0, dy_S0, dz_S0;
91	gmx_simd_real_t__m128 dx_S1, dy_S1, dz_S1;
92	gmx_simd_real_t__m128 dx_S2, dy_S2, dz_S2;
93	gmx_simd_real_t__m128 dx_S3, dy_S3, dz_S3;
94	gmx_simd_real_t__m128 tx_S0, ty_S0, tz_S0;
95	gmx_simd_real_t__m128 tx_S1, ty_S1, tz_S1;
96	gmx_simd_real_t__m128 tx_S2, ty_S2, tz_S2;
97	gmx_simd_real_t__m128 tx_S3, ty_S3, tz_S3;
98	gmx_simd_real_t__m128 rsq_S0, rinv_S0, rinvsq_S0;
99	gmx_simd_real_t__m128 rsq_S1, rinv_S1, rinvsq_S1;
100	gmx_simd_real_t__m128 rsq_S2, rinv_S2, rinvsq_S2;
101	gmx_simd_real_t__m128 rsq_S3, rinv_S3, rinvsq_S3;
102	#ifndef NBNXN_CUTOFF_USE_BLENDV
103	/* wco: within cut-off, mask of all 1's or 0's */
104	gmx_simd_bool_t__m128 wco_S0;
105	gmx_simd_bool_t__m128 wco_S1;
106	gmx_simd_bool_t__m128 wco_S2;
107	gmx_simd_bool_t__m128 wco_S3;
108	#endif
109	#ifdef VDW_CUTOFF_CHECK
110	gmx_simd_bool_t__m128 wco_vdw_S0;
111	gmx_simd_bool_t__m128 wco_vdw_S1;
112	#ifndef HALF_LJ
113	gmx_simd_bool_t__m128 wco_vdw_S2;
114	gmx_simd_bool_t__m128 wco_vdw_S3;
115	#endif
116	#endif
117
118	#if (defined CALC_COULOMB && defined CALC_COUL_TAB) \|\| defined LJ_FORCE_SWITCH \|\| defined LJ_POT_SWITCH
119	gmx_simd_real_t__m128 r_S0;
120	gmx_simd_real_t__m128 r_S1;
121	gmx_simd_real_t__m128 r_S2;
122	gmx_simd_real_t__m128 r_S3;
123	#endif
124
125	#if defined LJ_FORCE_SWITCH \|\| defined LJ_POT_SWITCH
126	gmx_simd_real_t__m128 rsw_S0, rsw2_S0, rsw2_r_S0;
127	gmx_simd_real_t__m128 rsw_S1, rsw2_S1, rsw2_r_S1;
128	#ifndef HALF_LJ
129	gmx_simd_real_t__m128 rsw_S2, rsw2_S2, rsw2_r_S2;
130	gmx_simd_real_t__m128 rsw_S3, rsw2_S3, rsw2_r_S3;
131	#endif
132	#endif
133
134	#ifdef CALC_COULOMB
135	#ifdef CHECK_EXCLS
136	/* 1/r masked with the interaction mask */
137	gmx_simd_real_t__m128 rinv_ex_S0;
138	gmx_simd_real_t__m128 rinv_ex_S1;
139	gmx_simd_real_t__m128 rinv_ex_S2;
140	gmx_simd_real_t__m128 rinv_ex_S3;
141	#endif
142	gmx_simd_real_t__m128 jq_S;
143	gmx_simd_real_t__m128 qq_S0;
144	gmx_simd_real_t__m128 qq_S1;
145	gmx_simd_real_t__m128 qq_S2;
146	gmx_simd_real_t__m128 qq_S3;
147	#ifdef CALC_COUL_TAB
148	/* The force (PME mesh force) we need to subtract from 1/r^2 */
149	gmx_simd_real_t__m128 fsub_S0;
150	gmx_simd_real_t__m128 fsub_S1;
151	gmx_simd_real_t__m128 fsub_S2;
152	gmx_simd_real_t__m128 fsub_S3;
153	#endif
154	#ifdef CALC_COUL_EWALD
155	gmx_simd_real_t__m128 brsq_S0, brsq_S1, brsq_S2, brsq_S3;
156	gmx_simd_real_t__m128 ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
157	#endif
158
159	/* frcoul = (1/r - fsub)*r */
160	gmx_simd_real_t__m128 frcoul_S0;
161	gmx_simd_real_t__m128 frcoul_S1;
162	gmx_simd_real_t__m128 frcoul_S2;
163	gmx_simd_real_t__m128 frcoul_S3;
164	#ifdef CALC_COUL_TAB
165	/* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
166	gmx_simd_real_t__m128 rs_S0, rf_S0, frac_S0;
167	gmx_simd_real_t__m128 rs_S1, rf_S1, frac_S1;
168	gmx_simd_real_t__m128 rs_S2, rf_S2, frac_S2;
169	gmx_simd_real_t__m128 rs_S3, rf_S3, frac_S3;
170	/* Table index: rs truncated to an int */
171	gmx_simd_int32_t__m128i ti_S0, ti_S1, ti_S2, ti_S3;
172	/* Linear force table values */
173	gmx_simd_real_t__m128 ctab0_S0, ctab1_S0;
174	gmx_simd_real_t__m128 ctab0_S1, ctab1_S1;
175	gmx_simd_real_t__m128 ctab0_S2, ctab1_S2;
176	gmx_simd_real_t__m128 ctab0_S3, ctab1_S3;
177	#ifdef CALC_ENERGIES
178	/* Quadratic energy table value */
179	gmx_simd_real_t__m128 ctabv_S0;
180	gmx_simd_real_t__m128 ctabv_S1;
181	gmx_simd_real_t__m128 ctabv_S2;
182	gmx_simd_real_t__m128 ctabv_S3;
183	#endif
184	#endif
185	#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD \|\| defined CALC_COUL_TAB)
186	/* The potential (PME mesh) we need to subtract from 1/r */
187	gmx_simd_real_t__m128 vc_sub_S0;
188	gmx_simd_real_t__m128 vc_sub_S1;
189	gmx_simd_real_t__m128 vc_sub_S2;
190	gmx_simd_real_t__m128 vc_sub_S3;
191	#endif
192	#ifdef CALC_ENERGIES
193	/* Electrostatic potential */
194	gmx_simd_real_t__m128 vcoul_S0;
195	gmx_simd_real_t__m128 vcoul_S1;
196	gmx_simd_real_t__m128 vcoul_S2;
197	gmx_simd_real_t__m128 vcoul_S3;
198	#endif
199	#endif
200	/* The force times 1/r */
201	gmx_simd_real_t__m128 fscal_S0;
202	gmx_simd_real_t__m128 fscal_S1;
203	gmx_simd_real_t__m128 fscal_S2;
204	gmx_simd_real_t__m128 fscal_S3;
205
206	#ifdef CALC_LJ
207	#ifdef LJ_COMB_LB
208	/* LJ sigma_j/2 and sqrt(epsilon_j) */
209	gmx_simd_real_t__m128 hsig_j_S, seps_j_S;
210	/* LJ sigma_ij and epsilon_ij */
211	gmx_simd_real_t__m128 sig_S0, eps_S0;
212	gmx_simd_real_t__m128 sig_S1, eps_S1;
213	#ifndef HALF_LJ
214	gmx_simd_real_t__m128 sig_S2, eps_S2;
215	gmx_simd_real_t__m128 sig_S3, eps_S3;
216	#endif
217	#ifdef CALC_ENERGIES
218	gmx_simd_real_t__m128 sig2_S0, sig6_S0;
219	gmx_simd_real_t__m128 sig2_S1, sig6_S1;
220	#ifndef HALF_LJ
221	gmx_simd_real_t__m128 sig2_S2, sig6_S2;
222	gmx_simd_real_t__m128 sig2_S3, sig6_S3;
223	#endif
224	#endif /* CALC_ENERGIES */
225	#endif /* LJ_COMB_LB */
226
227	#ifdef LJ_COMB_GEOM
228	gmx_simd_real_t__m128 c6s_j_S, c12s_j_S;
229	#endif
230
231	#if defined LJ_COMB_GEOM \|\| defined LJ_COMB_LB \|\| defined LJ_EWALD_GEOM
232	/* Index for loading LJ parameters, complicated when interleaving */
233	int aj2;
234	#endif
235
236	#ifndef FIX_LJ_C
237	/* LJ C6 and C12 parameters, used with geometric comb. rule */
238	gmx_simd_real_t__m128 c6_S0, c12_S0;
239	gmx_simd_real_t__m128 c6_S1, c12_S1;
240	#ifndef HALF_LJ
241	gmx_simd_real_t__m128 c6_S2, c12_S2;
242	gmx_simd_real_t__m128 c6_S3, c12_S3;
243	#endif
244	#endif
245
246	/* Intermediate variables for LJ calculation */
247	#ifndef LJ_COMB_LB
248	gmx_simd_real_t__m128 rinvsix_S0;
249	gmx_simd_real_t__m128 rinvsix_S1;
250	#ifndef HALF_LJ
251	gmx_simd_real_t__m128 rinvsix_S2;
252	gmx_simd_real_t__m128 rinvsix_S3;
253	#endif
254	#endif
255	#ifdef LJ_COMB_LB
256	gmx_simd_real_t__m128 sir_S0, sir2_S0, sir6_S0;
257	gmx_simd_real_t__m128 sir_S1, sir2_S1, sir6_S1;
258	#ifndef HALF_LJ
259	gmx_simd_real_t__m128 sir_S2, sir2_S2, sir6_S2;
260	gmx_simd_real_t__m128 sir_S3, sir2_S3, sir6_S3;
261	#endif
262	#endif
263
264	gmx_simd_real_t__m128 FrLJ6_S0, FrLJ12_S0, frLJ_S0;
265	gmx_simd_real_t__m128 FrLJ6_S1, FrLJ12_S1, frLJ_S1;
266	#ifndef HALF_LJ
267	gmx_simd_real_t__m128 FrLJ6_S2, FrLJ12_S2, frLJ_S2;
268	gmx_simd_real_t__m128 FrLJ6_S3, FrLJ12_S3, frLJ_S3;
269	#endif
270	#if defined CALC_ENERGIES \|\| defined LJ_POT_SWITCH
271	gmx_simd_real_t__m128 VLJ6_S0, VLJ12_S0, VLJ_S0;
272	gmx_simd_real_t__m128 VLJ6_S1, VLJ12_S1, VLJ_S1;
273	#ifndef HALF_LJ
274	gmx_simd_real_t__m128 VLJ6_S2, VLJ12_S2, VLJ_S2;
275	gmx_simd_real_t__m128 VLJ6_S3, VLJ12_S3, VLJ_S3;
276	#endif
277	#endif
278	#endif /* CALC_LJ */
279
280	/* j-cluster index */
281	cj = l_cj[cjind].cj;
282
283	/* Atom indices (of the first atom in the cluster) */
284	aj = cj*UNROLLJ(4/1);
285	#if defined CALC_LJ && (defined LJ_COMB_GEOM \|\| defined LJ_COMB_LB \|\| defined LJ_EWALD_GEOM)
286	#if UNROLLJ(4/1) == STRIDE(4/1)
287	aj2 = aj*2;
288	#else
289	aj2 = (cj>>1)2STRIDE(4/1) + (cj & 1)*UNROLLJ(4/1);
290	#endif
291	#endif
292	#if UNROLLJ(4/1) == STRIDE(4/1)
293	ajx = aj*DIM3;
294	#else
295	ajx = (cj>>1)DIM3STRIDE(4/1) + (cj & 1)*UNROLLJ(4/1);
296	#endif
297	ajy = ajx + STRIDE(4/1);
298	ajz = ajy + STRIDE(4/1);
299
300	#ifdef CHECK_EXCLS
301	gmx_load_simd_4xn_interactions(l_cj[cjind].excl,
302	filter_S0, filter_S1,
303	filter_S2, filter_S3,
304	#ifdef GMX_SIMD_IBM_QPX
305	l_cj[cjind].interaction_mask_indices,
306	nbat->simd_interaction_array,
307	#else
308	/* The struct fields do not exist
309	except on BlueGene/Q */
310	NULL((void*)0),
311	NULL((void*)0),
312	#endif
313	&interact_S0, &interact_S1,
314	&interact_S2, &interact_S3);
315	#endif /* CHECK_EXCLS */
316
317	/* load j atom coordinates */
318	jx_S = gmx_simd_load_r_mm_load_ps(x+ajx);
319	jy_S = gmx_simd_load_r_mm_load_ps(x+ajy);
320	jz_S = gmx_simd_load_r_mm_load_ps(x+ajz);
321
322	/* Calculate distance */
323	dx_S0 = gmx_simd_sub_r_mm_sub_ps(ix_S0, jx_S);
324	dy_S0 = gmx_simd_sub_r_mm_sub_ps(iy_S0, jy_S);
325	dz_S0 = gmx_simd_sub_r_mm_sub_ps(iz_S0, jz_S);
326	dx_S1 = gmx_simd_sub_r_mm_sub_ps(ix_S1, jx_S);
327	dy_S1 = gmx_simd_sub_r_mm_sub_ps(iy_S1, jy_S);
328	dz_S1 = gmx_simd_sub_r_mm_sub_ps(iz_S1, jz_S);
329	dx_S2 = gmx_simd_sub_r_mm_sub_ps(ix_S2, jx_S);
330	dy_S2 = gmx_simd_sub_r_mm_sub_ps(iy_S2, jy_S);
331	dz_S2 = gmx_simd_sub_r_mm_sub_ps(iz_S2, jz_S);
332	dx_S3 = gmx_simd_sub_r_mm_sub_ps(ix_S3, jx_S);
333	dy_S3 = gmx_simd_sub_r_mm_sub_ps(iy_S3, jy_S);
334	dz_S3 = gmx_simd_sub_r_mm_sub_ps(iz_S3, jz_S);
335
336	/* rsq = dx*dx+dy*dy+dz*dz */
337	rsq_S0 = gmx_simd_calc_rsq_rgmx_simd_norm2_f(dx_S0, dy_S0, dz_S0);
338	rsq_S1 = gmx_simd_calc_rsq_rgmx_simd_norm2_f(dx_S1, dy_S1, dz_S1);
339	rsq_S2 = gmx_simd_calc_rsq_rgmx_simd_norm2_f(dx_S2, dy_S2, dz_S2);
340	rsq_S3 = gmx_simd_calc_rsq_rgmx_simd_norm2_f(dx_S3, dy_S3, dz_S3);
341
342	#ifndef NBNXN_CUTOFF_USE_BLENDV
343	wco_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S0, rc2_S);
344	wco_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S1, rc2_S);
345	wco_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S2, rc2_S);
346	wco_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S3, rc2_S);
347	#endif
348
349	#ifdef CHECK_EXCLS
350	#ifdef EXCL_FORCES
351	/* Only remove the (sub-)diagonal to avoid double counting */
352	#if UNROLLJ(4/1) == UNROLLI4
353	if (cj == ci_sh)
354	{
355	wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask_S0);
356	wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask_S1);
357	wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask_S2);
358	wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask_S3);
359	}
360	#else
361	#if UNROLLJ(4/1) < UNROLLI4
362	if (cj == ci_sh*2)
363	{
364	wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask0_S0);
365	wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask0_S1);
366	wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask0_S2);
367	wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask0_S3);
368	}
369	if (cj == ci_sh*2 + 1)
370	{
371	wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask1_S0);
372	wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask1_S1);
373	wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask1_S2);
374	wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask1_S3);
375	}
376	#else
377	if (cj*2 == ci_sh)
378	{
379	wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask0_S0);
380	wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask0_S1);
381	wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask0_S2);
382	wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask0_S3);
383	}
384	else if (cj*2 + 1 == ci_sh)
385	{
386	wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask1_S0);
387	wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask1_S1);
388	wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask1_S2);
389	wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask1_S3);
390	}
391	#endif
392	#endif
393	#else /* EXCL_FORCES */
394	/* No exclusion forces: remove all excluded atom pairs from the list */
395	wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, interact_S0);
396	wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, interact_S1);
397	wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, interact_S2);
398	wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, interact_S3);
399	#endif
400	#endif
401
402	#ifdef COUNT_PAIRS
403	{
404	int i, j;
405	real tmpa[2GMX_SIMD_REAL_WIDTH4], tmp;
406	tmp = gmx_simd_align_rgmx_simd_align_f(tmpa);
407	for (i = 0; i < UNROLLI4; i++)
408	{
409	gmx_simd_store_r_mm_store_ps(tmp, gmx_simd_sub_r_mm_sub_ps(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
410	for (j = 0; j < UNROLLJ(4/1); j++)
411	{
412	if (tmp[j] >= 0)
413	{
414	npair++;
415	}
416	}
417	}
418	}
419	#endif
420
421	#ifdef CHECK_EXCLS
422	/* For excluded pairs add a small number to avoid r^-6 = NaN */
423	rsq_S0 = gmx_simd_add_r_mm_add_ps(rsq_S0, gmx_simd_blendv_r_mm_blendv_ps(avoid_sing_S, gmx_simd_setzero_r_mm_setzero_ps(), interact_S0));
424	rsq_S1 = gmx_simd_add_r_mm_add_ps(rsq_S1, gmx_simd_blendv_r_mm_blendv_ps(avoid_sing_S, gmx_simd_setzero_r_mm_setzero_ps(), interact_S1));
425	rsq_S2 = gmx_simd_add_r_mm_add_ps(rsq_S2, gmx_simd_blendv_r_mm_blendv_ps(avoid_sing_S, gmx_simd_setzero_r_mm_setzero_ps(), interact_S2));
426	rsq_S3 = gmx_simd_add_r_mm_add_ps(rsq_S3, gmx_simd_blendv_r_mm_blendv_ps(avoid_sing_S, gmx_simd_setzero_r_mm_setzero_ps(), interact_S3));
427	#endif
428
429	/* Calculate 1/r */
430	#ifndef GMX_DOUBLE
431	rinv_S0 = gmx_simd_invsqrt_rgmx_simd_invsqrt_f(rsq_S0);
432	rinv_S1 = gmx_simd_invsqrt_rgmx_simd_invsqrt_f(rsq_S1);
433	rinv_S2 = gmx_simd_invsqrt_rgmx_simd_invsqrt_f(rsq_S2);
434	rinv_S3 = gmx_simd_invsqrt_rgmx_simd_invsqrt_f(rsq_S3);
435	#else
436	gmx_simd_invsqrt_pair_rgmx_simd_invsqrt_pair_f(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
437	gmx_simd_invsqrt_pair_rgmx_simd_invsqrt_pair_f(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
438	#endif
439
440	#ifdef CALC_COULOMB
441	/* Load parameters for j atom */
442	jq_S = gmx_simd_load_r_mm_load_ps(q+aj);
443	qq_S0 = gmx_simd_mul_r_mm_mul_ps(iq_S0, jq_S);
444	qq_S1 = gmx_simd_mul_r_mm_mul_ps(iq_S1, jq_S);
445	qq_S2 = gmx_simd_mul_r_mm_mul_ps(iq_S2, jq_S);
446	qq_S3 = gmx_simd_mul_r_mm_mul_ps(iq_S3, jq_S);
447	#endif
448
449	#ifdef CALC_LJ
450
451	#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
452	load_lj_pair_params(nbfp0, type, aj, &c6_S0, &c12_S0);
453	load_lj_pair_params(nbfp1, type, aj, &c6_S1, &c12_S1);
454	#ifndef HALF_LJ
455	load_lj_pair_params(nbfp2, type, aj, &c6_S2, &c12_S2);
456	load_lj_pair_params(nbfp3, type, aj, &c6_S3, &c12_S3);
457	#endif
458	#endif /* not defined any LJ rule */
459
460	#ifdef LJ_COMB_GEOM
461	c6s_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+0);
462	c12s_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+STRIDE(4/1));
463	c6_S0 = gmx_simd_mul_r_mm_mul_ps(c6s_S0, c6s_j_S );
464	c6_S1 = gmx_simd_mul_r_mm_mul_ps(c6s_S1, c6s_j_S );
465	#ifndef HALF_LJ
466	c6_S2 = gmx_simd_mul_r_mm_mul_ps(c6s_S2, c6s_j_S );
467	c6_S3 = gmx_simd_mul_r_mm_mul_ps(c6s_S3, c6s_j_S );
468	#endif
469	c12_S0 = gmx_simd_mul_r_mm_mul_ps(c12s_S0, c12s_j_S);
470	c12_S1 = gmx_simd_mul_r_mm_mul_ps(c12s_S1, c12s_j_S);
471	#ifndef HALF_LJ
472	c12_S2 = gmx_simd_mul_r_mm_mul_ps(c12s_S2, c12s_j_S);
473	c12_S3 = gmx_simd_mul_r_mm_mul_ps(c12s_S3, c12s_j_S);
474	#endif
475	#endif /* LJ_COMB_GEOM */
476
477	#ifdef LJ_COMB_LB
478	hsig_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+0);
479	seps_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+STRIDE(4/1));
480
481	sig_S0 = gmx_simd_add_r_mm_add_ps(hsig_i_S0, hsig_j_S);
482	sig_S1 = gmx_simd_add_r_mm_add_ps(hsig_i_S1, hsig_j_S);
483	eps_S0 = gmx_simd_mul_r_mm_mul_ps(seps_i_S0, seps_j_S);
484	eps_S1 = gmx_simd_mul_r_mm_mul_ps(seps_i_S1, seps_j_S);
485	#ifndef HALF_LJ
486	sig_S2 = gmx_simd_add_r_mm_add_ps(hsig_i_S2, hsig_j_S);
487	sig_S3 = gmx_simd_add_r_mm_add_ps(hsig_i_S3, hsig_j_S);
488	eps_S2 = gmx_simd_mul_r_mm_mul_ps(seps_i_S2, seps_j_S);
489	eps_S3 = gmx_simd_mul_r_mm_mul_ps(seps_i_S3, seps_j_S);
490	#endif
491	#endif /* LJ_COMB_LB */
492
493	#endif /* CALC_LJ */
494
495	#ifndef NBNXN_CUTOFF_USE_BLENDV
496	rinv_S0 = gmx_simd_blendzero_r_mm_and_ps(rinv_S0, wco_S0);
497	rinv_S1 = gmx_simd_blendzero_r_mm_and_ps(rinv_S1, wco_S1);
498	rinv_S2 = gmx_simd_blendzero_r_mm_and_ps(rinv_S2, wco_S2);
499	rinv_S3 = gmx_simd_blendzero_r_mm_and_ps(rinv_S3, wco_S3);
500	#else
501	/* We only need to mask for the cut-off: blendv is faster */
502	rinv_S0 = gmx_simd_blendv_r_mm_blendv_ps(rinv_S0, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S0));
503	rinv_S1 = gmx_simd_blendv_r_mm_blendv_ps(rinv_S1, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S1));
504	rinv_S2 = gmx_simd_blendv_r_mm_blendv_ps(rinv_S2, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S2));
505	rinv_S3 = gmx_simd_blendv_r_mm_blendv_ps(rinv_S3, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S3));
506	#endif
507
508	rinvsq_S0 = gmx_simd_mul_r_mm_mul_ps(rinv_S0, rinv_S0);
509	rinvsq_S1 = gmx_simd_mul_r_mm_mul_ps(rinv_S1, rinv_S1);
510	rinvsq_S2 = gmx_simd_mul_r_mm_mul_ps(rinv_S2, rinv_S2);
511	rinvsq_S3 = gmx_simd_mul_r_mm_mul_ps(rinv_S3, rinv_S3);
512
513	#ifdef CALC_COULOMB
514	/* Note that here we calculate force*r, not the usual force/r.
515	* This allows avoiding masking the reaction-field contribution,
516	* as frcoul is later multiplied by rinvsq which has been
517	* masked with the cut-off check.
518	*/
519
520	#ifdef EXCL_FORCES
521	/* Only add 1/r for non-excluded atom pairs */
522	rinv_ex_S0 = gmx_simd_blendzero_r_mm_and_ps(rinv_S0, interact_S0);
523	rinv_ex_S1 = gmx_simd_blendzero_r_mm_and_ps(rinv_S1, interact_S1);
524	rinv_ex_S2 = gmx_simd_blendzero_r_mm_and_ps(rinv_S2, interact_S2);
525	rinv_ex_S3 = gmx_simd_blendzero_r_mm_and_ps(rinv_S3, interact_S3);
526	#else
527	/* No exclusion forces, we always need 1/r */
528	#define rinv_ex_S0 rinv_S0
529	#define rinv_ex_S1 rinv_S1
530	#define rinv_ex_S2 rinv_S2
531	#define rinv_ex_S3 rinv_S3
532	#endif
533
534	#ifdef CALC_COUL_RF
535	/* Electrostatic interactions */
536	frcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_fmadd_r(rsq_S0, mrc_3_S, rinv_ex_S0)_mm_add_ps(_mm_mul_ps(rsq_S0, mrc_3_S), rinv_ex_S0));
537	frcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_fmadd_r(rsq_S1, mrc_3_S, rinv_ex_S1)_mm_add_ps(_mm_mul_ps(rsq_S1, mrc_3_S), rinv_ex_S1));
538	frcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_fmadd_r(rsq_S2, mrc_3_S, rinv_ex_S2)_mm_add_ps(_mm_mul_ps(rsq_S2, mrc_3_S), rinv_ex_S2));
539	frcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_fmadd_r(rsq_S3, mrc_3_S, rinv_ex_S3)_mm_add_ps(_mm_mul_ps(rsq_S3, mrc_3_S), rinv_ex_S3));
540
541	#ifdef CALC_ENERGIES
542	vcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_add_r_mm_add_ps(rinv_ex_S0, gmx_simd_add_r_mm_add_ps(gmx_simd_mul_r_mm_mul_ps(rsq_S0, hrc_3_S), moh_rc_S)));
543	vcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_add_r_mm_add_ps(rinv_ex_S1, gmx_simd_add_r_mm_add_ps(gmx_simd_mul_r_mm_mul_ps(rsq_S1, hrc_3_S), moh_rc_S)));
544	vcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_add_r_mm_add_ps(rinv_ex_S2, gmx_simd_add_r_mm_add_ps(gmx_simd_mul_r_mm_mul_ps(rsq_S2, hrc_3_S), moh_rc_S)));
545	vcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_add_r_mm_add_ps(rinv_ex_S3, gmx_simd_add_r_mm_add_ps(gmx_simd_mul_r_mm_mul_ps(rsq_S3, hrc_3_S), moh_rc_S)));
546	#endif
547	#endif
548
549	#ifdef CALC_COUL_EWALD
550	/* We need to mask (or limit) rsq for the cut-off,
551	* as large distances can cause an overflow in gmx_pmecorrF/V.
552	*/
553	#ifndef NBNXN_CUTOFF_USE_BLENDV
554	brsq_S0 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendzero_r_mm_and_ps(rsq_S0, wco_S0));
555	brsq_S1 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendzero_r_mm_and_ps(rsq_S1, wco_S1));
556	brsq_S2 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendzero_r_mm_and_ps(rsq_S2, wco_S2));
557	brsq_S3 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendzero_r_mm_and_ps(rsq_S3, wco_S3));
558	#else
559	/* Strangely, putting mul on a separate line is slower (icc 13) */
560	brsq_S0 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendv_r_mm_blendv_ps(rsq_S0, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S0)));
561	brsq_S1 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendv_r_mm_blendv_ps(rsq_S1, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S1)));
562	brsq_S2 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendv_r_mm_blendv_ps(rsq_S2, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S2)));
563	brsq_S3 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendv_r_mm_blendv_ps(rsq_S3, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S3)));
564	#endif
565	ewcorr_S0 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrF_rgmx_simd_pmecorrF_f(brsq_S0), beta_S);
566	ewcorr_S1 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrF_rgmx_simd_pmecorrF_f(brsq_S1), beta_S);
567	ewcorr_S2 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrF_rgmx_simd_pmecorrF_f(brsq_S2), beta_S);
568	ewcorr_S3 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrF_rgmx_simd_pmecorrF_f(brsq_S3), beta_S);
569	frcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_fmadd_r(ewcorr_S0, brsq_S0, rinv_ex_S0)_mm_add_ps(_mm_mul_ps(ewcorr_S0, brsq_S0), rinv_ex_S0));
570	frcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_fmadd_r(ewcorr_S1, brsq_S1, rinv_ex_S1)_mm_add_ps(_mm_mul_ps(ewcorr_S1, brsq_S1), rinv_ex_S1));
571	frcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_fmadd_r(ewcorr_S2, brsq_S2, rinv_ex_S2)_mm_add_ps(_mm_mul_ps(ewcorr_S2, brsq_S2), rinv_ex_S2));
572	frcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_fmadd_r(ewcorr_S3, brsq_S3, rinv_ex_S3)_mm_add_ps(_mm_mul_ps(ewcorr_S3, brsq_S3), rinv_ex_S3));
573
574	#ifdef CALC_ENERGIES
575	vc_sub_S0 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrV_rgmx_simd_pmecorrV_f(brsq_S0), beta_S);
576	vc_sub_S1 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrV_rgmx_simd_pmecorrV_f(brsq_S1), beta_S);
577	vc_sub_S2 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrV_rgmx_simd_pmecorrV_f(brsq_S2), beta_S);
578	vc_sub_S3 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrV_rgmx_simd_pmecorrV_f(brsq_S3), beta_S);
579	#endif
580
581	#endif /* CALC_COUL_EWALD */
582
583	#ifdef CALC_COUL_TAB
584	/* Electrostatic interactions */
585	r_S0 = gmx_simd_mul_r_mm_mul_ps(rsq_S0, rinv_S0);
586	r_S1 = gmx_simd_mul_r_mm_mul_ps(rsq_S1, rinv_S1);
587	r_S2 = gmx_simd_mul_r_mm_mul_ps(rsq_S2, rinv_S2);
588	r_S3 = gmx_simd_mul_r_mm_mul_ps(rsq_S3, rinv_S3);
589	/* Convert r to scaled table units */
590	rs_S0 = gmx_simd_mul_r_mm_mul_ps(r_S0, invtsp_S);
591	rs_S1 = gmx_simd_mul_r_mm_mul_ps(r_S1, invtsp_S);
592	rs_S2 = gmx_simd_mul_r_mm_mul_ps(r_S2, invtsp_S);
593	rs_S3 = gmx_simd_mul_r_mm_mul_ps(r_S3, invtsp_S);
594	/* Truncate scaled r to an int */
595	ti_S0 = gmx_simd_cvtt_r2i_mm_cvttps_epi32(rs_S0);
596	ti_S1 = gmx_simd_cvtt_r2i_mm_cvttps_epi32(rs_S1);
597	ti_S2 = gmx_simd_cvtt_r2i_mm_cvttps_epi32(rs_S2);
598	ti_S3 = gmx_simd_cvtt_r2i_mm_cvttps_epi32(rs_S3);
599	#ifdef GMX_SIMD_HAVE_TRUNC
600	/* SSE4.1 trunc is faster than gmx_cvtepi32_ps int->float cast */
601	rf_S0 = gmx_simd_trunc_r(rs_S0)__extension__ ({ __m128 __X = (rs_S0); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x03))); });
602	rf_S1 = gmx_simd_trunc_r(rs_S1)__extension__ ({ __m128 __X = (rs_S1); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x03))); });
603	rf_S2 = gmx_simd_trunc_r(rs_S2)__extension__ ({ __m128 __X = (rs_S2); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x03))); });
604	rf_S3 = gmx_simd_trunc_r(rs_S3)__extension__ ({ __m128 __X = (rs_S3); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x03))); });
605	#else
606	rf_S0 = gmx_simd_cvt_i2r_mm_cvtepi32_ps(ti_S0);
607	rf_S1 = gmx_simd_cvt_i2r_mm_cvtepi32_ps(ti_S1);
608	rf_S2 = gmx_simd_cvt_i2r_mm_cvtepi32_ps(ti_S2);
609	rf_S3 = gmx_simd_cvt_i2r_mm_cvtepi32_ps(ti_S3);
610	#endif
611	frac_S0 = gmx_simd_sub_r_mm_sub_ps(rs_S0, rf_S0);
612	frac_S1 = gmx_simd_sub_r_mm_sub_ps(rs_S1, rf_S1);
613	frac_S2 = gmx_simd_sub_r_mm_sub_ps(rs_S2, rf_S2);
614	frac_S3 = gmx_simd_sub_r_mm_sub_ps(rs_S3, rf_S3);
615
616	/* Load and interpolate table forces and possibly energies.
617	* Force and energy can be combined in one table, stride 4: FDV0
618	* or in two separate tables with stride 1: F and V
619	* Currently single precision uses FDV0, double F and V.
620	*/
621	#ifndef CALC_ENERGIES
622	load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0);
623	load_table_f(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1);
624	load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2);
625	load_table_f(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3);
626	#else
627	#ifdef TAB_FDV0
628	load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
629	load_table_f_v(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
630	load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
631	load_table_f_v(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
632	#else
633	load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
634	load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
635	load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
636	load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
637	#endif
638	#endif
639	fsub_S0 = gmx_simd_add_r_mm_add_ps(ctab0_S0, gmx_simd_mul_r_mm_mul_ps(frac_S0, ctab1_S0));
640	fsub_S1 = gmx_simd_add_r_mm_add_ps(ctab0_S1, gmx_simd_mul_r_mm_mul_ps(frac_S1, ctab1_S1));
641	fsub_S2 = gmx_simd_add_r_mm_add_ps(ctab0_S2, gmx_simd_mul_r_mm_mul_ps(frac_S2, ctab1_S2));
642	fsub_S3 = gmx_simd_add_r_mm_add_ps(ctab0_S3, gmx_simd_mul_r_mm_mul_ps(frac_S3, ctab1_S3));
643	frcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S0, gmx_simd_mul_r_mm_mul_ps(fsub_S0, r_S0)));
644	frcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S1, gmx_simd_mul_r_mm_mul_ps(fsub_S1, r_S1)));
645	frcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S2, gmx_simd_mul_r_mm_mul_ps(fsub_S2, r_S2)));
646	frcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S3, gmx_simd_mul_r_mm_mul_ps(fsub_S3, r_S3)));
647
648	#ifdef CALC_ENERGIES
649	vc_sub_S0 = gmx_simd_add_r_mm_add_ps(ctabv_S0, gmx_simd_mul_r_mm_mul_ps(gmx_simd_mul_r_mm_mul_ps(mhalfsp_S, frac_S0), gmx_simd_add_r_mm_add_ps(ctab0_S0, fsub_S0)));
650	vc_sub_S1 = gmx_simd_add_r_mm_add_ps(ctabv_S1, gmx_simd_mul_r_mm_mul_ps(gmx_simd_mul_r_mm_mul_ps(mhalfsp_S, frac_S1), gmx_simd_add_r_mm_add_ps(ctab0_S1, fsub_S1)));
651	vc_sub_S2 = gmx_simd_add_r_mm_add_ps(ctabv_S2, gmx_simd_mul_r_mm_mul_ps(gmx_simd_mul_r_mm_mul_ps(mhalfsp_S, frac_S2), gmx_simd_add_r_mm_add_ps(ctab0_S2, fsub_S2)));
652	vc_sub_S3 = gmx_simd_add_r_mm_add_ps(ctabv_S3, gmx_simd_mul_r_mm_mul_ps(gmx_simd_mul_r_mm_mul_ps(mhalfsp_S, frac_S3), gmx_simd_add_r_mm_add_ps(ctab0_S3, fsub_S3)));
653	#endif
654	#endif /* CALC_COUL_TAB */
655
656	#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD \|\| defined CALC_COUL_TAB)
657	#ifndef NO_SHIFT_EWALD
658	/* Add Ewald potential shift to vc_sub for convenience */
659	#ifdef CHECK_EXCLS
660	vc_sub_S0 = gmx_simd_add_r_mm_add_ps(vc_sub_S0, gmx_simd_blendzero_r_mm_and_ps(sh_ewald_S, interact_S0));
661	vc_sub_S1 = gmx_simd_add_r_mm_add_ps(vc_sub_S1, gmx_simd_blendzero_r_mm_and_ps(sh_ewald_S, interact_S1));
662	vc_sub_S2 = gmx_simd_add_r_mm_add_ps(vc_sub_S2, gmx_simd_blendzero_r_mm_and_ps(sh_ewald_S, interact_S2));
663	vc_sub_S3 = gmx_simd_add_r_mm_add_ps(vc_sub_S3, gmx_simd_blendzero_r_mm_and_ps(sh_ewald_S, interact_S3));
664	#else
665	vc_sub_S0 = gmx_simd_add_r_mm_add_ps(vc_sub_S0, sh_ewald_S);
666	vc_sub_S1 = gmx_simd_add_r_mm_add_ps(vc_sub_S1, sh_ewald_S);
667	vc_sub_S2 = gmx_simd_add_r_mm_add_ps(vc_sub_S2, sh_ewald_S);
668	vc_sub_S3 = gmx_simd_add_r_mm_add_ps(vc_sub_S3, sh_ewald_S);
669	#endif
670	#endif
671
672	vcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S0, vc_sub_S0));
673	vcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S1, vc_sub_S1));
674	vcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S2, vc_sub_S2));
675	vcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S3, vc_sub_S3));
676
677	#endif
678
679	#ifdef CALC_ENERGIES
680	/* Mask energy for cut-off and diagonal */
681	vcoul_S0 = gmx_simd_blendzero_r_mm_and_ps(vcoul_S0, wco_S0);
682	vcoul_S1 = gmx_simd_blendzero_r_mm_and_ps(vcoul_S1, wco_S1);
683	vcoul_S2 = gmx_simd_blendzero_r_mm_and_ps(vcoul_S2, wco_S2);
684	vcoul_S3 = gmx_simd_blendzero_r_mm_and_ps(vcoul_S3, wco_S3);
685	#endif
686
687	#endif /* CALC_COULOMB */
688
689	#ifdef CALC_LJ
690	/* Lennard-Jones interaction */
691
692	#ifdef VDW_CUTOFF_CHECK
693	wco_vdw_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S0, rcvdw2_S);
694	wco_vdw_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S1, rcvdw2_S);
695	#ifndef HALF_LJ
696	wco_vdw_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S2, rcvdw2_S);
697	wco_vdw_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S3, rcvdw2_S);
698	#endif
699	#else
700	/* Same cut-off for Coulomb and VdW, reuse the registers */
701	#define wco_vdw_S0 wco_S0
702	#define wco_vdw_S1 wco_S1
703	#define wco_vdw_S2 wco_S2
704	#define wco_vdw_S3 wco_S3
705	#endif
706
707	#ifndef LJ_COMB_LB
708	rinvsix_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, rinvsq_S0));
709	rinvsix_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, rinvsq_S1));
710	#ifdef EXCL_FORCES
711	rinvsix_S0 = gmx_simd_blendzero_r_mm_and_ps(rinvsix_S0, interact_S0);
712	rinvsix_S1 = gmx_simd_blendzero_r_mm_and_ps(rinvsix_S1, interact_S1);
713	#endif
714	#ifndef HALF_LJ
715	rinvsix_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, rinvsq_S2));
716	rinvsix_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, rinvsq_S3));
717	#ifdef EXCL_FORCES
718	rinvsix_S2 = gmx_simd_blendzero_r_mm_and_ps(rinvsix_S2, interact_S2);
719	rinvsix_S3 = gmx_simd_blendzero_r_mm_and_ps(rinvsix_S3, interact_S3);
720	#endif
721	#endif
722
723	#if defined LJ_CUT \|\| defined LJ_POT_SWITCH
724	/* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
725	FrLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(c6_S0, rinvsix_S0);
726	FrLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(c6_S1, rinvsix_S1);
727	#ifndef HALF_LJ
728	FrLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(c6_S2, rinvsix_S2);
729	FrLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(c6_S3, rinvsix_S3);
730	#endif
731	FrLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(c12_S0, gmx_simd_mul_r_mm_mul_ps(rinvsix_S0, rinvsix_S0));
732	FrLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(c12_S1, gmx_simd_mul_r_mm_mul_ps(rinvsix_S1, rinvsix_S1));
733	#ifndef HALF_LJ
734	FrLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(c12_S2, gmx_simd_mul_r_mm_mul_ps(rinvsix_S2, rinvsix_S2));
735	FrLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(c12_S3, gmx_simd_mul_r_mm_mul_ps(rinvsix_S3, rinvsix_S3));
736	#endif
737	#endif
738
739	#if defined LJ_FORCE_SWITCH \|\| defined LJ_POT_SWITCH
740	/* We switch the LJ force */
741	r_S0 = gmx_simd_mul_r_mm_mul_ps(rsq_S0, rinv_S0);
742	rsw_S0 = gmx_simd_max_r_mm_max_ps(gmx_simd_sub_r_mm_sub_ps(r_S0, rswitch_S), zero_S);
743	rsw2_S0 = gmx_simd_mul_r_mm_mul_ps(rsw_S0, rsw_S0);
744	rsw2_r_S0 = gmx_simd_mul_r_mm_mul_ps(rsw2_S0, r_S0);
745	r_S1 = gmx_simd_mul_r_mm_mul_ps(rsq_S1, rinv_S1);
746	rsw_S1 = gmx_simd_max_r_mm_max_ps(gmx_simd_sub_r_mm_sub_ps(r_S1, rswitch_S), zero_S);
747	rsw2_S1 = gmx_simd_mul_r_mm_mul_ps(rsw_S1, rsw_S1);
748	rsw2_r_S1 = gmx_simd_mul_r_mm_mul_ps(rsw2_S1, r_S1);
	Value stored to 'rsw2_r_S1' is never read
749	#ifndef HALF_LJ
750	r_S2 = gmx_simd_mul_r_mm_mul_ps(rsq_S2, rinv_S2);
751	rsw_S2 = gmx_simd_max_r_mm_max_ps(gmx_simd_sub_r_mm_sub_ps(r_S2, rswitch_S), zero_S);
752	rsw2_S2 = gmx_simd_mul_r_mm_mul_ps(rsw_S2, rsw_S2);
753	rsw2_r_S2 = gmx_simd_mul_r_mm_mul_ps(rsw2_S2, r_S2);
754	r_S3 = gmx_simd_mul_r_mm_mul_ps(rsq_S3, rinv_S3);
755	rsw_S3 = gmx_simd_max_r_mm_max_ps(gmx_simd_sub_r_mm_sub_ps(r_S3, rswitch_S), zero_S);
756	rsw2_S3 = gmx_simd_mul_r_mm_mul_ps(rsw_S3, rsw_S3);
757	rsw2_r_S3 = gmx_simd_mul_r_mm_mul_ps(rsw2_S3, r_S3);
758	#endif
759	#endif
760
761	#ifdef LJ_FORCE_SWITCH
762
763	#define gmx_add_fr_switch(fr, rsw, rsw2_r, c2, c3) gmx_simd_fmadd_r(gmx_simd_fmadd_r(c3, rsw, c2), rsw2_r, fr)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(c3, rsw), c2), rsw2_r ), fr)
764
765	FrLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(c6_S0, gmx_add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S));
766	FrLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(c6_S1, gmx_add_fr_switch(rinvsix_S1, rsw_S1, rsw2_r_S1, p6_fc2_S, p6_fc3_S));
767	#ifndef HALF_LJ
768	FrLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(c6_S2, gmx_add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S));
769	FrLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(c6_S3, gmx_add_fr_switch(rinvsix_S3, rsw_S3, rsw2_r_S3, p6_fc2_S, p6_fc3_S));
770	#endif
771	FrLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(c12_S0, gmx_add_fr_switch(gmx_simd_mul_r_mm_mul_ps(rinvsix_S0, rinvsix_S0), rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S));
772	FrLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(c12_S1, gmx_add_fr_switch(gmx_simd_mul_r_mm_mul_ps(rinvsix_S1, rinvsix_S1), rsw_S1, rsw2_r_S1, p12_fc2_S, p12_fc3_S));
773	#ifndef HALF_LJ
774	FrLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(c12_S2, gmx_add_fr_switch(gmx_simd_mul_r_mm_mul_ps(rinvsix_S2, rinvsix_S2), rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S));
775	FrLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(c12_S3, gmx_add_fr_switch(gmx_simd_mul_r_mm_mul_ps(rinvsix_S3, rinvsix_S3), rsw_S3, rsw2_r_S3, p12_fc2_S, p12_fc3_S));
776	#endif
777	#undef gmx_add_fr_switch
778	#endif /* LJ_FORCE_SWITCH */
779
780	#endif /* not LJ_COMB_LB */
781
782	#ifdef LJ_COMB_LB
783	sir_S0 = gmx_simd_mul_r_mm_mul_ps(sig_S0, rinv_S0);
784	sir_S1 = gmx_simd_mul_r_mm_mul_ps(sig_S1, rinv_S1);
785	#ifndef HALF_LJ
786	sir_S2 = gmx_simd_mul_r_mm_mul_ps(sig_S2, rinv_S2);
787	sir_S3 = gmx_simd_mul_r_mm_mul_ps(sig_S3, rinv_S3);
788	#endif
789	sir2_S0 = gmx_simd_mul_r_mm_mul_ps(sir_S0, sir_S0);
790	sir2_S1 = gmx_simd_mul_r_mm_mul_ps(sir_S1, sir_S1);
791	#ifndef HALF_LJ
792	sir2_S2 = gmx_simd_mul_r_mm_mul_ps(sir_S2, sir_S2);
793	sir2_S3 = gmx_simd_mul_r_mm_mul_ps(sir_S3, sir_S3);
794	#endif
795	sir6_S0 = gmx_simd_mul_r_mm_mul_ps(sir2_S0, gmx_simd_mul_r_mm_mul_ps(sir2_S0, sir2_S0));
796	sir6_S1 = gmx_simd_mul_r_mm_mul_ps(sir2_S1, gmx_simd_mul_r_mm_mul_ps(sir2_S1, sir2_S1));
797	#ifdef EXCL_FORCES
798	sir6_S0 = gmx_simd_blendzero_r_mm_and_ps(sir6_S0, interact_S0);
799	sir6_S1 = gmx_simd_blendzero_r_mm_and_ps(sir6_S1, interact_S1);
800	#endif
801	#ifndef HALF_LJ
802	sir6_S2 = gmx_simd_mul_r_mm_mul_ps(sir2_S2, gmx_simd_mul_r_mm_mul_ps(sir2_S2, sir2_S2));
803	sir6_S3 = gmx_simd_mul_r_mm_mul_ps(sir2_S3, gmx_simd_mul_r_mm_mul_ps(sir2_S3, sir2_S3));
804	#ifdef EXCL_FORCES
805	sir6_S2 = gmx_simd_blendzero_r_mm_and_ps(sir6_S2, interact_S2);
806	sir6_S3 = gmx_simd_blendzero_r_mm_and_ps(sir6_S3, interact_S3);
807	#endif
808	#endif
809	#ifdef VDW_CUTOFF_CHECK
810	sir6_S0 = gmx_simd_blendzero_r_mm_and_ps(sir6_S0, wco_vdw_S0);
811	sir6_S1 = gmx_simd_blendzero_r_mm_and_ps(sir6_S1, wco_vdw_S1);
812	#ifndef HALF_LJ
813	sir6_S2 = gmx_simd_blendzero_r_mm_and_ps(sir6_S2, wco_vdw_S2);
814	sir6_S3 = gmx_simd_blendzero_r_mm_and_ps(sir6_S3, wco_vdw_S3);
815	#endif
816	#endif
817	FrLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(eps_S0, sir6_S0);
818	FrLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(eps_S1, sir6_S1);
819	#ifndef HALF_LJ
820	FrLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(eps_S2, sir6_S2);
821	FrLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(eps_S3, sir6_S3);
822	#endif
823	FrLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(FrLJ6_S0, sir6_S0);
824	FrLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(FrLJ6_S1, sir6_S1);
825	#ifndef HALF_LJ
826	FrLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(FrLJ6_S2, sir6_S2);
827	FrLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(FrLJ6_S3, sir6_S3);
828	#endif
829	#if defined CALC_ENERGIES
830	/* We need C6 and C12 to calculate the LJ potential shift */
831	sig2_S0 = gmx_simd_mul_r_mm_mul_ps(sig_S0, sig_S0);
832	sig2_S1 = gmx_simd_mul_r_mm_mul_ps(sig_S1, sig_S1);
833	#ifndef HALF_LJ
834	sig2_S2 = gmx_simd_mul_r_mm_mul_ps(sig_S2, sig_S2);
835	sig2_S3 = gmx_simd_mul_r_mm_mul_ps(sig_S3, sig_S3);
836	#endif
837	sig6_S0 = gmx_simd_mul_r_mm_mul_ps(sig2_S0, gmx_simd_mul_r_mm_mul_ps(sig2_S0, sig2_S0));
838	sig6_S1 = gmx_simd_mul_r_mm_mul_ps(sig2_S1, gmx_simd_mul_r_mm_mul_ps(sig2_S1, sig2_S1));
839	#ifndef HALF_LJ
840	sig6_S2 = gmx_simd_mul_r_mm_mul_ps(sig2_S2, gmx_simd_mul_r_mm_mul_ps(sig2_S2, sig2_S2));
841	sig6_S3 = gmx_simd_mul_r_mm_mul_ps(sig2_S3, gmx_simd_mul_r_mm_mul_ps(sig2_S3, sig2_S3));
842	#endif
843	c6_S0 = gmx_simd_mul_r_mm_mul_ps(eps_S0, sig6_S0);
844	c6_S1 = gmx_simd_mul_r_mm_mul_ps(eps_S1, sig6_S1);
845	#ifndef HALF_LJ
846	c6_S2 = gmx_simd_mul_r_mm_mul_ps(eps_S2, sig6_S2);
847	c6_S3 = gmx_simd_mul_r_mm_mul_ps(eps_S3, sig6_S3);
848	#endif
849	c12_S0 = gmx_simd_mul_r_mm_mul_ps(c6_S0, sig6_S0);
850	c12_S1 = gmx_simd_mul_r_mm_mul_ps(c6_S1, sig6_S1);
851	#ifndef HALF_LJ
852	c12_S2 = gmx_simd_mul_r_mm_mul_ps(c6_S2, sig6_S2);
853	c12_S3 = gmx_simd_mul_r_mm_mul_ps(c6_S3, sig6_S3);
854	#endif
855	#endif
856	#endif /* LJ_COMB_LB */
857
858	/* Determine the total scalar LJ force*r */
859	frLJ_S0 = gmx_simd_sub_r_mm_sub_ps(FrLJ12_S0, FrLJ6_S0);
860	frLJ_S1 = gmx_simd_sub_r_mm_sub_ps(FrLJ12_S1, FrLJ6_S1);
861	#ifndef HALF_LJ
862	frLJ_S2 = gmx_simd_sub_r_mm_sub_ps(FrLJ12_S2, FrLJ6_S2);
863	frLJ_S3 = gmx_simd_sub_r_mm_sub_ps(FrLJ12_S3, FrLJ6_S3);
864	#endif
865
866	#if (defined LJ_CUT \|\| defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
867
868	#ifdef LJ_CUT
869	/* Calculate the LJ energies, with constant potential shift */
870	VLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(sixth_S, gmx_simd_fmadd_r(c6_S0, p6_cpot_S, FrLJ6_S0)_mm_add_ps(_mm_mul_ps(c6_S0, p6_cpot_S), FrLJ6_S0));
871	VLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(sixth_S, gmx_simd_fmadd_r(c6_S1, p6_cpot_S, FrLJ6_S1)_mm_add_ps(_mm_mul_ps(c6_S1, p6_cpot_S), FrLJ6_S1));
872	#ifndef HALF_LJ
873	VLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(sixth_S, gmx_simd_fmadd_r(c6_S2, p6_cpot_S, FrLJ6_S2)_mm_add_ps(_mm_mul_ps(c6_S2, p6_cpot_S), FrLJ6_S2));
874	VLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(sixth_S, gmx_simd_fmadd_r(c6_S3, p6_cpot_S, FrLJ6_S3)_mm_add_ps(_mm_mul_ps(c6_S3, p6_cpot_S), FrLJ6_S3));
875	#endif
876	VLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(twelveth_S, gmx_simd_fmadd_r(c12_S0, p12_cpot_S, FrLJ12_S0)_mm_add_ps(_mm_mul_ps(c12_S0, p12_cpot_S), FrLJ12_S0));
877	VLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(twelveth_S, gmx_simd_fmadd_r(c12_S1, p12_cpot_S, FrLJ12_S1)_mm_add_ps(_mm_mul_ps(c12_S1, p12_cpot_S), FrLJ12_S1));
878	#ifndef HALF_LJ
879	VLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(twelveth_S, gmx_simd_fmadd_r(c12_S2, p12_cpot_S, FrLJ12_S2)_mm_add_ps(_mm_mul_ps(c12_S2, p12_cpot_S), FrLJ12_S2));
880	VLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(twelveth_S, gmx_simd_fmadd_r(c12_S3, p12_cpot_S, FrLJ12_S3)_mm_add_ps(_mm_mul_ps(c12_S3, p12_cpot_S), FrLJ12_S3));
881	#endif
882	#endif /* LJ_CUT */
883
884	#ifdef LJ_FORCE_SWITCH
885	#define v_fswitch_r(rsw, rsw2, c0, c3, c4) gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), gmx_simd_mul_r(rsw2, rsw), c0)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(c4, rsw), c3), _mm_mul_ps (rsw2, rsw)), c0)
886
887	VLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(c6_S0, gmx_simd_fmadd_r(sixth_S, rinvsix_S0, v_fswitch_r(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S))_mm_add_ps(_mm_mul_ps(sixth_S, rinvsix_S0), v_fswitch_r(rsw_S0 , rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
888	VLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(c6_S1, gmx_simd_fmadd_r(sixth_S, rinvsix_S1, v_fswitch_r(rsw_S1, rsw2_S1, p6_6cpot_S, p6_vc3_S, p6_vc4_S))_mm_add_ps(_mm_mul_ps(sixth_S, rinvsix_S1), v_fswitch_r(rsw_S1 , rsw2_S1, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
889	#ifndef HALF_LJ
890	VLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(c6_S2, gmx_simd_fmadd_r(sixth_S, rinvsix_S2, v_fswitch_r(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S))_mm_add_ps(_mm_mul_ps(sixth_S, rinvsix_S2), v_fswitch_r(rsw_S2 , rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
891	VLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(c6_S3, gmx_simd_fmadd_r(sixth_S, rinvsix_S3, v_fswitch_r(rsw_S3, rsw2_S3, p6_6cpot_S, p6_vc3_S, p6_vc4_S))_mm_add_ps(_mm_mul_ps(sixth_S, rinvsix_S3), v_fswitch_r(rsw_S3 , rsw2_S3, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
892	#endif
893	VLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(c12_S0, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0), v_fswitch_r(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S))_mm_add_ps(_mm_mul_ps(twelveth_S, _mm_mul_ps(rinvsix_S0, rinvsix_S0 )), v_fswitch_r(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S )));
894	VLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(c12_S1, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S1, rinvsix_S1), v_fswitch_r(rsw_S1, rsw2_S1, p12_12cpot_S, p12_vc3_S, p12_vc4_S))_mm_add_ps(_mm_mul_ps(twelveth_S, _mm_mul_ps(rinvsix_S1, rinvsix_S1 )), v_fswitch_r(rsw_S1, rsw2_S1, p12_12cpot_S, p12_vc3_S, p12_vc4_S )));
895	#ifndef HALF_LJ
896	VLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(c12_S2, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2), v_fswitch_r(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S))_mm_add_ps(_mm_mul_ps(twelveth_S, _mm_mul_ps(rinvsix_S2, rinvsix_S2 )), v_fswitch_r(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S )));
897	VLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(c12_S3, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S3, rinvsix_S3), v_fswitch_r(rsw_S3, rsw2_S3, p12_12cpot_S, p12_vc3_S, p12_vc4_S))_mm_add_ps(_mm_mul_ps(twelveth_S, _mm_mul_ps(rinvsix_S3, rinvsix_S3 )), v_fswitch_r(rsw_S3, rsw2_S3, p12_12cpot_S, p12_vc3_S, p12_vc4_S )));
898	#endif
899	#undef v_fswitch_r
900	#endif /* LJ_FORCE_SWITCH */
901
902	/* Add up the repulsion and dispersion */
903	VLJ_S0 = gmx_simd_sub_r_mm_sub_ps(VLJ12_S0, VLJ6_S0);
904	VLJ_S1 = gmx_simd_sub_r_mm_sub_ps(VLJ12_S1, VLJ6_S1);
905	#ifndef HALF_LJ
906	VLJ_S2 = gmx_simd_sub_r_mm_sub_ps(VLJ12_S2, VLJ6_S2);
907	VLJ_S3 = gmx_simd_sub_r_mm_sub_ps(VLJ12_S3, VLJ6_S3);
908	#endif
909
910	#endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
911
912	#ifdef LJ_POT_SWITCH
913	/* We always need the potential, since it is needed for the force */
914	VLJ_S0 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S0, gmx_simd_mul_r(twelveth_S, FrLJ12_S0))_mm_sub_ps(_mm_mul_ps(twelveth_S, FrLJ12_S0), _mm_mul_ps(sixth_S , FrLJ6_S0));
915	VLJ_S1 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S1, gmx_simd_mul_r(twelveth_S, FrLJ12_S1))_mm_sub_ps(_mm_mul_ps(twelveth_S, FrLJ12_S1), _mm_mul_ps(sixth_S , FrLJ6_S1));
916	#ifndef HALF_LJ
917	VLJ_S2 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S2, gmx_simd_mul_r(twelveth_S, FrLJ12_S2))_mm_sub_ps(_mm_mul_ps(twelveth_S, FrLJ12_S2), _mm_mul_ps(sixth_S , FrLJ6_S2));
918	VLJ_S3 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S3, gmx_simd_mul_r(twelveth_S, FrLJ12_S3))_mm_sub_ps(_mm_mul_ps(twelveth_S, FrLJ12_S3), _mm_mul_ps(sixth_S , FrLJ6_S3));
919	#endif
920
921	{
922	gmx_simd_real_t__m128 sw_S0, dsw_S0;
923	gmx_simd_real_t__m128 sw_S1, dsw_S1;
924	#ifndef HALF_LJ
925	gmx_simd_real_t__m128 sw_S2, dsw_S2;
926	gmx_simd_real_t__m128 sw_S3, dsw_S3;
927	#endif
928
929	#define switch_r(rsw, rsw2, c3, c4, c5) gmx_simd_fmadd_r(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c5, rsw, c4), rsw, c3), gmx_simd_mul_r(rsw2, rsw), one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps (c5, rsw), c4), rsw), c3), _mm_mul_ps(rsw2, rsw)), one_S)
930	#define dswitch_r(rsw, rsw2, c2, c3, c4) gmx_simd_mul_r_mm_mul_ps(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), rsw, c2)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(c4, rsw), c3), rsw ), c2), rsw2)
931
932	sw_S0 = switch_r(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
933	dsw_S0 = dswitch_r(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
934	sw_S1 = switch_r(rsw_S1, rsw2_S1, swV3_S, swV4_S, swV5_S);
935	dsw_S1 = dswitch_r(rsw_S1, rsw2_S1, swF2_S, swF3_S, swF4_S);
936	#ifndef HALF_LJ
937	sw_S2 = switch_r(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
938	dsw_S2 = dswitch_r(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
939	sw_S3 = switch_r(rsw_S3, rsw2_S3, swV3_S, swV4_S, swV5_S);
940	dsw_S3 = dswitch_r(rsw_S3, rsw2_S3, swF2_S, swF3_S, swF4_S);
941	#endif
942	frLJ_S0 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S0, VLJ_S0), r_S0, gmx_simd_mul_r(sw_S0, frLJ_S0))_mm_sub_ps(_mm_mul_ps(sw_S0, frLJ_S0), _mm_mul_ps(_mm_mul_ps( dsw_S0, VLJ_S0), r_S0));
943	frLJ_S1 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S1, VLJ_S1), r_S1, gmx_simd_mul_r(sw_S1, frLJ_S1))_mm_sub_ps(_mm_mul_ps(sw_S1, frLJ_S1), _mm_mul_ps(_mm_mul_ps( dsw_S1, VLJ_S1), r_S1));
944	#ifndef HALF_LJ
945	frLJ_S2 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S2, VLJ_S2), r_S2, gmx_simd_mul_r(sw_S2, frLJ_S2))_mm_sub_ps(_mm_mul_ps(sw_S2, frLJ_S2), _mm_mul_ps(_mm_mul_ps( dsw_S2, VLJ_S2), r_S2));
946	frLJ_S3 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S3, VLJ_S3), r_S3, gmx_simd_mul_r(sw_S3, frLJ_S3))_mm_sub_ps(_mm_mul_ps(sw_S3, frLJ_S3), _mm_mul_ps(_mm_mul_ps( dsw_S3, VLJ_S3), r_S3));
947	#endif
948	#ifdef CALC_ENERGIES
949	VLJ_S0 = gmx_simd_mul_r_mm_mul_ps(sw_S0, VLJ_S0);
950	VLJ_S1 = gmx_simd_mul_r_mm_mul_ps(sw_S1, VLJ_S1);
951	#ifndef HALF_LJ
952	VLJ_S2 = gmx_simd_mul_r_mm_mul_ps(sw_S2, VLJ_S2);
953	VLJ_S3 = gmx_simd_mul_r_mm_mul_ps(sw_S3, VLJ_S3);
954	#endif
955	#endif
956
957	#undef switch_r
958	#undef dswitch_r
959	}
960	#endif /* LJ_POT_SWITCH */
961
962	#if defined CALC_ENERGIES && defined CHECK_EXCLS
963	/* The potential shift should be removed for excluded pairs */
964	VLJ_S0 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S0, interact_S0);
965	VLJ_S1 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S1, interact_S1);
966	#ifndef HALF_LJ
967	VLJ_S2 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S2, interact_S2);
968	VLJ_S3 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S3, interact_S3);
969	#endif
970	#endif
971
972	#ifdef LJ_EWALD_GEOM
973	{
974	gmx_simd_real_t__m128 c6s_j_S;
975	gmx_simd_real_t__m128 c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0;
976	gmx_simd_real_t__m128 c6grid_S1, rinvsix_nm_S1, cr2_S1, expmcr2_S1, poly_S1;
977	#ifndef HALF_LJ
978	gmx_simd_real_t__m128 c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2;
979	gmx_simd_real_t__m128 c6grid_S3, rinvsix_nm_S3, cr2_S3, expmcr2_S3, poly_S3;
980	#endif
981	#ifdef CALC_ENERGIES
982	gmx_simd_real_t__m128 sh_mask_S0;
983	gmx_simd_real_t__m128 sh_mask_S1;
984	#ifndef HALF_LJ
985	gmx_simd_real_t__m128 sh_mask_S2;
986	gmx_simd_real_t__m128 sh_mask_S3;
987	#endif
988	#endif
989
990	/* Determine C6 for the grid using the geometric combination rule */
991	c6s_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+0);
992	c6grid_S0 = gmx_simd_mul_r_mm_mul_ps(c6s_S0, c6s_j_S);
993	c6grid_S1 = gmx_simd_mul_r_mm_mul_ps(c6s_S1, c6s_j_S);
994	#ifndef HALF_LJ
995	c6grid_S2 = gmx_simd_mul_r_mm_mul_ps(c6s_S2, c6s_j_S);
996	c6grid_S3 = gmx_simd_mul_r_mm_mul_ps(c6s_S3, c6s_j_S);
997	#endif
998
999	#ifdef CHECK_EXCLS
1000	/* Recalculate rinvsix without exclusion mask (compiler might optimize) */
1001	rinvsix_nm_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, rinvsq_S0));
1002	rinvsix_nm_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, rinvsq_S1));
1003	#ifndef HALF_LJ
1004	rinvsix_nm_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, rinvsq_S2));
1005	rinvsix_nm_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, rinvsq_S3));
1006	#endif
1007	#else
1008	/* We didn't use a mask, so we can copy */
1009	rinvsix_nm_S0 = rinvsix_S0;
1010	rinvsix_nm_S1 = rinvsix_S1;
1011	#ifndef HALF_LJ
1012	rinvsix_nm_S2 = rinvsix_S2;
1013	rinvsix_nm_S3 = rinvsix_S3;
1014	#endif
1015	#endif
1016
1017	cr2_S0 = gmx_simd_mul_r_mm_mul_ps(lje_c2_S, rsq_S0);
1018	cr2_S1 = gmx_simd_mul_r_mm_mul_ps(lje_c2_S, rsq_S1);
1019	#ifndef HALF_LJ
1020	cr2_S2 = gmx_simd_mul_r_mm_mul_ps(lje_c2_S, rsq_S2);
1021	cr2_S3 = gmx_simd_mul_r_mm_mul_ps(lje_c2_S, rsq_S3);
1022	#endif
1023	expmcr2_S0 = gmx_simd_exp_rgmx_simd_exp_f(gmx_simd_mul_r_mm_mul_ps(mone_S, cr2_S0));
1024	expmcr2_S1 = gmx_simd_exp_rgmx_simd_exp_f(gmx_simd_mul_r_mm_mul_ps(mone_S, cr2_S1));
1025	#ifndef HALF_LJ
1026	expmcr2_S2 = gmx_simd_exp_rgmx_simd_exp_f(gmx_simd_mul_r_mm_mul_ps(mone_S, cr2_S2));
1027	expmcr2_S3 = gmx_simd_exp_rgmx_simd_exp_f(gmx_simd_mul_r_mm_mul_ps(mone_S, cr2_S3));
1028	#endif
1029
1030	/* 1 + cr2 + 1/2*cr2^2 */
1031	poly_S0 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S0, one_S), cr2_S0, one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(half_S, cr2_S0), one_S ), cr2_S0), one_S);
1032	poly_S1 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S1, one_S), cr2_S1, one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(half_S, cr2_S1), one_S ), cr2_S1), one_S);
1033	#ifndef HALF_LJ
1034	poly_S2 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S2, one_S), cr2_S2, one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(half_S, cr2_S2), one_S ), cr2_S2), one_S);
1035	poly_S3 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S3, one_S), cr2_S3, one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(half_S, cr2_S3), one_S ), cr2_S3), one_S);
1036	#endif
1037
1038	/* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
1039	* r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
1040	*/
1041	frLJ_S0 = gmx_simd_fmadd_r(c6grid_S0, gmx_simd_fnmadd_r(expmcr2_S0, gmx_simd_fmadd_r(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0)_mm_add_ps(_mm_mul_ps(c6grid_S0, _mm_sub_ps(rinvsix_nm_S0, _mm_mul_ps (expmcr2_S0, _mm_add_ps(_mm_mul_ps(rinvsix_nm_S0, poly_S0), lje_c6_6_S )))), frLJ_S0);
1042	frLJ_S1 = gmx_simd_fmadd_r(c6grid_S1, gmx_simd_fnmadd_r(expmcr2_S1, gmx_simd_fmadd_r(rinvsix_nm_S1, poly_S1, lje_c6_6_S), rinvsix_nm_S1), frLJ_S1)_mm_add_ps(_mm_mul_ps(c6grid_S1, _mm_sub_ps(rinvsix_nm_S1, _mm_mul_ps (expmcr2_S1, _mm_add_ps(_mm_mul_ps(rinvsix_nm_S1, poly_S1), lje_c6_6_S )))), frLJ_S1);
1043	#ifndef HALF_LJ
1044	frLJ_S2 = gmx_simd_fmadd_r(c6grid_S2, gmx_simd_fnmadd_r(expmcr2_S2, gmx_simd_fmadd_r(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2)_mm_add_ps(_mm_mul_ps(c6grid_S2, _mm_sub_ps(rinvsix_nm_S2, _mm_mul_ps (expmcr2_S2, _mm_add_ps(_mm_mul_ps(rinvsix_nm_S2, poly_S2), lje_c6_6_S )))), frLJ_S2);
1045	frLJ_S3 = gmx_simd_fmadd_r(c6grid_S3, gmx_simd_fnmadd_r(expmcr2_S3, gmx_simd_fmadd_r(rinvsix_nm_S3, poly_S3, lje_c6_6_S), rinvsix_nm_S3), frLJ_S3)_mm_add_ps(_mm_mul_ps(c6grid_S3, _mm_sub_ps(rinvsix_nm_S3, _mm_mul_ps (expmcr2_S3, _mm_add_ps(_mm_mul_ps(rinvsix_nm_S3, poly_S3), lje_c6_6_S )))), frLJ_S3);
1046	#endif
1047
1048	#ifdef CALC_ENERGIES
1049	#ifdef CHECK_EXCLS
1050	sh_mask_S0 = gmx_simd_blendzero_r_mm_and_ps(lje_vc_S, interact_S0);
1051	sh_mask_S1 = gmx_simd_blendzero_r_mm_and_ps(lje_vc_S, interact_S1);
1052	#ifndef HALF_LJ
1053	sh_mask_S2 = gmx_simd_blendzero_r_mm_and_ps(lje_vc_S, interact_S2);
1054	sh_mask_S3 = gmx_simd_blendzero_r_mm_and_ps(lje_vc_S, interact_S3);
1055	#endif
1056	#else
1057	sh_mask_S0 = lje_vc_S;
1058	sh_mask_S1 = lje_vc_S;
1059	#ifndef HALF_LJ
1060	sh_mask_S2 = lje_vc_S;
1061	sh_mask_S3 = lje_vc_S;
1062	#endif
1063	#endif
1064
1065	VLJ_S0 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S0), gmx_simd_fmadd_r(rinvsix_nm_S0, gmx_simd_fnmadd_r(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0)_mm_add_ps(_mm_mul_ps(_mm_mul_ps(sixth_S, c6grid_S0), _mm_add_ps (_mm_mul_ps(rinvsix_nm_S0, _mm_sub_ps(one_S, _mm_mul_ps(expmcr2_S0 , poly_S0))), sh_mask_S0)), VLJ_S0);
1066	VLJ_S1 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S1), gmx_simd_fmadd_r(rinvsix_nm_S1, gmx_simd_fnmadd_r(expmcr2_S1, poly_S1, one_S), sh_mask_S1), VLJ_S1)_mm_add_ps(_mm_mul_ps(_mm_mul_ps(sixth_S, c6grid_S1), _mm_add_ps (_mm_mul_ps(rinvsix_nm_S1, _mm_sub_ps(one_S, _mm_mul_ps(expmcr2_S1 , poly_S1))), sh_mask_S1)), VLJ_S1);
1067	#ifndef HALF_LJ
1068	VLJ_S2 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S2), gmx_simd_fmadd_r(rinvsix_nm_S2, gmx_simd_fnmadd_r(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2)_mm_add_ps(_mm_mul_ps(_mm_mul_ps(sixth_S, c6grid_S2), _mm_add_ps (_mm_mul_ps(rinvsix_nm_S2, _mm_sub_ps(one_S, _mm_mul_ps(expmcr2_S2 , poly_S2))), sh_mask_S2)), VLJ_S2);
1069	VLJ_S3 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S3), gmx_simd_fmadd_r(rinvsix_nm_S3, gmx_simd_fnmadd_r(expmcr2_S3, poly_S3, one_S), sh_mask_S3), VLJ_S3)_mm_add_ps(_mm_mul_ps(_mm_mul_ps(sixth_S, c6grid_S3), _mm_add_ps (_mm_mul_ps(rinvsix_nm_S3, _mm_sub_ps(one_S, _mm_mul_ps(expmcr2_S3 , poly_S3))), sh_mask_S3)), VLJ_S3);
1070	#endif
1071	#endif /* CALC_ENERGIES */
1072	}
1073	#endif /* LJ_EWALD_GEOM */
1074
1075	#if defined VDW_CUTOFF_CHECK
1076	/* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
1077	* cut-off, but if the VdW cut-off is shorter, we need to mask with that.
1078	*/
1079	frLJ_S0 = gmx_simd_blendzero_r_mm_and_ps(frLJ_S0, wco_vdw_S0);
1080	frLJ_S1 = gmx_simd_blendzero_r_mm_and_ps(frLJ_S1, wco_vdw_S1);
1081	#ifndef HALF_LJ
1082	frLJ_S2 = gmx_simd_blendzero_r_mm_and_ps(frLJ_S2, wco_vdw_S2);
1083	frLJ_S3 = gmx_simd_blendzero_r_mm_and_ps(frLJ_S3, wco_vdw_S3);
1084	#endif
1085	#endif
1086
1087	#ifdef CALC_ENERGIES
1088	/* The potential shift should be removed for pairs beyond cut-off */
1089	VLJ_S0 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S0, wco_vdw_S0);
1090	VLJ_S1 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S1, wco_vdw_S1);
1091	#ifndef HALF_LJ
1092	VLJ_S2 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S2, wco_vdw_S2);
1093	VLJ_S3 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S3, wco_vdw_S3);
1094	#endif
1095	#endif
1096
1097	#endif /* CALC_LJ */
1098
1099	#ifdef CALC_ENERGIES
1100	#ifdef ENERGY_GROUPS
1101	/* Extract the group pair index per j pair.
1102	* Energy groups are stored per i-cluster, so things get
1103	* complicated when the i- and j-cluster size don't match.
1104	*/
1105	{
1106	int egps_j;
1107	#if UNROLLJ(4/1) == 2
1108	egps_j = nbat->energrp[cj>>1];
1109	egp_jj[0] = ((egps_j >> ((cj & 1)egps_jshift)) & egps_jmask)egps_jstride;
1110	#else
1111	/* We assume UNROLLI <= UNROLLJ */
1112	int jdi;
1113	for (jdi = 0; jdi < UNROLLJ(4/1)/UNROLLI4; jdi++)
1114	{
1115	int jj;
1116	egps_j = nbat->energrp[cj*(UNROLLJ(4/1)/UNROLLI4)+jdi];
1117	for (jj = 0; jj < (UNROLLI4/2); jj++)
1118	{
1119	egp_jj[jdi(UNROLLI4/2)+jj] = ((egps_j >> (jjegps_jshift)) & egps_jmask)*egps_jstride;
1120	}
1121	}
1122	#endif
1123	}
1124	#endif
1125
1126	#ifdef CALC_COULOMB
1127	#ifndef ENERGY_GROUPS
1128	vctot_S = gmx_simd_add_r_mm_add_ps(vctot_S, gmx_simd_sum4_rgmx_simd_sum4_f(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3));
1129	#else
1130	add_ener_grp(vcoul_S0, vctp[0], egp_jj);
1131	add_ener_grp(vcoul_S1, vctp[1], egp_jj);
1132	add_ener_grp(vcoul_S2, vctp[2], egp_jj);
1133	add_ener_grp(vcoul_S3, vctp[3], egp_jj);
1134	#endif
1135	#endif
1136
1137	#ifdef CALC_LJ
1138
1139	#ifndef ENERGY_GROUPS
1140	#ifndef HALF_LJ
1141	Vvdwtot_S = gmx_simd_add_r_mm_add_ps(Vvdwtot_S,
1142	gmx_simd_sum4_rgmx_simd_sum4_f(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3)
1143	);
1144	#else
1145	Vvdwtot_S = gmx_simd_add_r_mm_add_ps(Vvdwtot_S,
1146	gmx_simd_add_r_mm_add_ps(VLJ_S0, VLJ_S1)
1147	);
1148	#endif
1149	#else
1150	add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
1151	add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
1152	#ifndef HALF_LJ
1153	add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj);
1154	add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj);
1155	#endif
1156	#endif
1157	#endif /* CALC_LJ */
1158	#endif /* CALC_ENERGIES */
1159
1160	#ifdef CALC_LJ
1161	#ifdef CALC_COULOMB
1162	fscal_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, gmx_simd_add_r_mm_add_ps(frcoul_S0, frLJ_S0));
1163	#else
1164	fscal_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, frLJ_S0);
1165	#endif
1166	#ifdef CALC_COULOMB
1167	fscal_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, gmx_simd_add_r_mm_add_ps(frcoul_S1, frLJ_S1));
1168	#else
1169	fscal_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, frLJ_S1);
1170	#endif
1171	#else
1172	fscal_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, frcoul_S0);
1173	fscal_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, frcoul_S1);
1174	#endif /* CALC_LJ */
1175	#if defined CALC_LJ && !defined HALF_LJ
1176	#ifdef CALC_COULOMB
1177	fscal_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, gmx_simd_add_r_mm_add_ps(frcoul_S2, frLJ_S2));
1178	fscal_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, gmx_simd_add_r_mm_add_ps(frcoul_S3, frLJ_S3));
1179	#else
1180	fscal_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, frLJ_S2);
1181	fscal_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, frLJ_S3);
1182	#endif
1183	#else
1184	/* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
1185	fscal_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, frcoul_S2);
1186	fscal_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, frcoul_S3);
1187	#endif
1188
1189	/* Calculate temporary vectorial force */
1190	tx_S0 = gmx_simd_mul_r_mm_mul_ps(fscal_S0, dx_S0);
1191	tx_S1 = gmx_simd_mul_r_mm_mul_ps(fscal_S1, dx_S1);
1192	tx_S2 = gmx_simd_mul_r_mm_mul_ps(fscal_S2, dx_S2);
1193	tx_S3 = gmx_simd_mul_r_mm_mul_ps(fscal_S3, dx_S3);
1194	ty_S0 = gmx_simd_mul_r_mm_mul_ps(fscal_S0, dy_S0);
1195	ty_S1 = gmx_simd_mul_r_mm_mul_ps(fscal_S1, dy_S1);
1196	ty_S2 = gmx_simd_mul_r_mm_mul_ps(fscal_S2, dy_S2);
1197	ty_S3 = gmx_simd_mul_r_mm_mul_ps(fscal_S3, dy_S3);
1198	tz_S0 = gmx_simd_mul_r_mm_mul_ps(fscal_S0, dz_S0);
1199	tz_S1 = gmx_simd_mul_r_mm_mul_ps(fscal_S1, dz_S1);
1200	tz_S2 = gmx_simd_mul_r_mm_mul_ps(fscal_S2, dz_S2);
1201	tz_S3 = gmx_simd_mul_r_mm_mul_ps(fscal_S3, dz_S3);
1202
1203	/* Increment i atom force */
1204	fix_S0 = gmx_simd_add_r_mm_add_ps(fix_S0, tx_S0);
1205	fix_S1 = gmx_simd_add_r_mm_add_ps(fix_S1, tx_S1);
1206	fix_S2 = gmx_simd_add_r_mm_add_ps(fix_S2, tx_S2);
1207	fix_S3 = gmx_simd_add_r_mm_add_ps(fix_S3, tx_S3);
1208	fiy_S0 = gmx_simd_add_r_mm_add_ps(fiy_S0, ty_S0);
1209	fiy_S1 = gmx_simd_add_r_mm_add_ps(fiy_S1, ty_S1);
1210	fiy_S2 = gmx_simd_add_r_mm_add_ps(fiy_S2, ty_S2);
1211	fiy_S3 = gmx_simd_add_r_mm_add_ps(fiy_S3, ty_S3);
1212	fiz_S0 = gmx_simd_add_r_mm_add_ps(fiz_S0, tz_S0);
1213	fiz_S1 = gmx_simd_add_r_mm_add_ps(fiz_S1, tz_S1);
1214	fiz_S2 = gmx_simd_add_r_mm_add_ps(fiz_S2, tz_S2);
1215	fiz_S3 = gmx_simd_add_r_mm_add_ps(fiz_S3, tz_S3);
1216
1217	/* Decrement j atom force */
1218	gmx_simd_store_r_mm_store_ps(f+ajx,
1219	gmx_simd_sub_r_mm_sub_ps( gmx_simd_load_r_mm_load_ps(f+ajx), gmx_simd_sum4_rgmx_simd_sum4_f(tx_S0, tx_S1, tx_S2, tx_S3) ));
1220	gmx_simd_store_r_mm_store_ps(f+ajy,
1221	gmx_simd_sub_r_mm_sub_ps( gmx_simd_load_r_mm_load_ps(f+ajy), gmx_simd_sum4_rgmx_simd_sum4_f(ty_S0, ty_S1, ty_S2, ty_S3) ));
1222	gmx_simd_store_r_mm_store_ps(f+ajz,
1223	gmx_simd_sub_r_mm_sub_ps( gmx_simd_load_r_mm_load_ps(f+ajz), gmx_simd_sum4_rgmx_simd_sum4_f(tz_S0, tz_S1, tz_S2, tz_S3) ));
1224	}
1225
1226	#undef rinv_ex_S0
1227	#undef rinv_ex_S1
1228	#undef rinv_ex_S2
1229	#undef rinv_ex_S3
1230
1231	#undef wco_vdw_S0
1232	#undef wco_vdw_S1
1233	#undef wco_vdw_S2
1234	#undef wco_vdw_S3
1235
1236	#undef NBNXN_CUTOFF_USE_BLENDV
1237
1238	#undef EXCL_FORCES