Bug Summary

File:gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sse4_1_single.c
Location:line 865, column 5
Description:Value stored to 'gid' is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water4-Particle
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwioffset1;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 int vdwioffset2;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwioffset3;
92 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
100 real *charge;
101 int nvdwtype;
102 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103 int *vdwtype;
104 real *vdwparam;
105 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
106 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
107 __m128i vfitab;
108 __m128i ifour = _mm_set1_epi32(4);
109 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
110 real *vftab;
111 __m128i ewitab;
112 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
113 real *ewtab;
114 __m128 dummy_mask,cutoff_mask;
115 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
116 __m128 one = _mm_set1_ps(1.0);
117 __m128 two = _mm_set1_ps(2.0);
118 x = xx[0];
119 f = ff[0];
120
121 nri = nlist->nri;
122 iinr = nlist->iinr;
123 jindex = nlist->jindex;
124 jjnr = nlist->jjnr;
125 shiftidx = nlist->shift;
126 gid = nlist->gid;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm_set1_ps(fr->epsfac);
130 charge = mdatoms->chargeA;
131 nvdwtype = fr->ntype;
132 vdwparam = fr->nbfp;
133 vdwtype = mdatoms->typeA;
134
135 vftab = kernel_data->table_vdw->data;
136 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
137
138 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
139 ewtab = fr->ic->tabq_coul_FDV0;
140 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
141 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
142
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
148 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
149
150 /* Avoid stupid compiler warnings */
151 jnrA = jnrB = jnrC = jnrD = 0;
152 j_coord_offsetA = 0;
153 j_coord_offsetB = 0;
154 j_coord_offsetC = 0;
155 j_coord_offsetD = 0;
156
157 outeriter = 0;
158 inneriter = 0;
159
160 for(iidx=0;iidx<4*DIM3;iidx++)
161 {
162 scratch[iidx] = 0.0;
163 }
164
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
167 {
168 /* Load shift vector for this list */
169 i_shift_offset = DIM3*shiftidx[iidx];
170
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
174
175 /* Get outer coordinate index */
176 inr = iinr[iidx];
177 i_coord_offset = DIM3*inr;
178
179 /* Load i particle coords and add shift vector */
180 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
181 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
182
183 fix0 = _mm_setzero_ps();
184 fiy0 = _mm_setzero_ps();
185 fiz0 = _mm_setzero_ps();
186 fix1 = _mm_setzero_ps();
187 fiy1 = _mm_setzero_ps();
188 fiz1 = _mm_setzero_ps();
189 fix2 = _mm_setzero_ps();
190 fiy2 = _mm_setzero_ps();
191 fiz2 = _mm_setzero_ps();
192 fix3 = _mm_setzero_ps();
193 fiy3 = _mm_setzero_ps();
194 fiz3 = _mm_setzero_ps();
195
196 /* Reset potential sums */
197 velecsum = _mm_setzero_ps();
198 vvdwsum = _mm_setzero_ps();
199
200 /* Start inner kernel loop */
201 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
202 {
203
204 /* Get j neighbor index, and coordinate index */
205 jnrA = jjnr[jidx];
206 jnrB = jjnr[jidx+1];
207 jnrC = jjnr[jidx+2];
208 jnrD = jjnr[jidx+3];
209 j_coord_offsetA = DIM3*jnrA;
210 j_coord_offsetB = DIM3*jnrB;
211 j_coord_offsetC = DIM3*jnrC;
212 j_coord_offsetD = DIM3*jnrD;
213
214 /* load j atom coordinates */
215 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
216 x+j_coord_offsetC,x+j_coord_offsetD,
217 &jx0,&jy0,&jz0);
218
219 /* Calculate displacement vector */
220 dx00 = _mm_sub_ps(ix0,jx0);
221 dy00 = _mm_sub_ps(iy0,jy0);
222 dz00 = _mm_sub_ps(iz0,jz0);
223 dx10 = _mm_sub_ps(ix1,jx0);
224 dy10 = _mm_sub_ps(iy1,jy0);
225 dz10 = _mm_sub_ps(iz1,jz0);
226 dx20 = _mm_sub_ps(ix2,jx0);
227 dy20 = _mm_sub_ps(iy2,jy0);
228 dz20 = _mm_sub_ps(iz2,jz0);
229 dx30 = _mm_sub_ps(ix3,jx0);
230 dy30 = _mm_sub_ps(iy3,jy0);
231 dz30 = _mm_sub_ps(iz3,jz0);
232
233 /* Calculate squared distance and things based on it */
234 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
235 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
236 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
237 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
238
239 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
240 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
241 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
242 rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
243
244 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
245 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
246 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
247
248 /* Load parameters for j particles */
249 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
250 charge+jnrC+0,charge+jnrD+0);
251 vdwjidx0A = 2*vdwtype[jnrA+0];
252 vdwjidx0B = 2*vdwtype[jnrB+0];
253 vdwjidx0C = 2*vdwtype[jnrC+0];
254 vdwjidx0D = 2*vdwtype[jnrD+0];
255
256 fjx0 = _mm_setzero_ps();
257 fjy0 = _mm_setzero_ps();
258 fjz0 = _mm_setzero_ps();
259
260 /**************************
261 * CALCULATE INTERACTIONS *
262 **************************/
263
264 r00 = _mm_mul_ps(rsq00,rinv00);
265
266 /* Compute parameters for interactions between i and j atoms */
267 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
268 vdwparam+vdwioffset0+vdwjidx0B,
269 vdwparam+vdwioffset0+vdwjidx0C,
270 vdwparam+vdwioffset0+vdwjidx0D,
271 &c6_00,&c12_00);
272
273 /* Calculate table index by multiplying r with table scale and truncate to integer */
274 rt = _mm_mul_ps(r00,vftabscale);
275 vfitab = _mm_cvttps_epi32(rt);
276 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
277 vfitab = _mm_slli_epi32(vfitab,3);
278
279 /* CUBIC SPLINE TABLE DISPERSION */
280 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
281 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
282 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
283 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
284 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
285 Heps = _mm_mul_ps(vfeps,H);
286 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
287 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
288 vvdw6 = _mm_mul_ps(c6_00,VV);
289 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
290 fvdw6 = _mm_mul_ps(c6_00,FF);
291
292 /* CUBIC SPLINE TABLE REPULSION */
293 vfitab = _mm_add_epi32(vfitab,ifour);
294 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
295 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
296 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
297 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
298 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
299 Heps = _mm_mul_ps(vfeps,H);
300 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
301 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
302 vvdw12 = _mm_mul_ps(c12_00,VV);
303 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
304 fvdw12 = _mm_mul_ps(c12_00,FF);
305 vvdw = _mm_add_ps(vvdw12,vvdw6);
306 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
307
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
310
311 fscal = fvdw;
312
313 /* Calculate temporary vectorial force */
314 tx = _mm_mul_ps(fscal,dx00);
315 ty = _mm_mul_ps(fscal,dy00);
316 tz = _mm_mul_ps(fscal,dz00);
317
318 /* Update vectorial force */
319 fix0 = _mm_add_ps(fix0,tx);
320 fiy0 = _mm_add_ps(fiy0,ty);
321 fiz0 = _mm_add_ps(fiz0,tz);
322
323 fjx0 = _mm_add_ps(fjx0,tx);
324 fjy0 = _mm_add_ps(fjy0,ty);
325 fjz0 = _mm_add_ps(fjz0,tz);
326
327 /**************************
328 * CALCULATE INTERACTIONS *
329 **************************/
330
331 r10 = _mm_mul_ps(rsq10,rinv10);
332
333 /* Compute parameters for interactions between i and j atoms */
334 qq10 = _mm_mul_ps(iq1,jq0);
335
336 /* EWALD ELECTROSTATICS */
337
338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
339 ewrt = _mm_mul_ps(r10,ewtabscale);
340 ewitab = _mm_cvttps_epi32(ewrt);
341 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
342 ewitab = _mm_slli_epi32(ewitab,2);
343 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
344 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
345 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
346 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
347 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
348 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
349 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
350 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
351 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
352
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velecsum = _mm_add_ps(velecsum,velec);
355
356 fscal = felec;
357
358 /* Calculate temporary vectorial force */
359 tx = _mm_mul_ps(fscal,dx10);
360 ty = _mm_mul_ps(fscal,dy10);
361 tz = _mm_mul_ps(fscal,dz10);
362
363 /* Update vectorial force */
364 fix1 = _mm_add_ps(fix1,tx);
365 fiy1 = _mm_add_ps(fiy1,ty);
366 fiz1 = _mm_add_ps(fiz1,tz);
367
368 fjx0 = _mm_add_ps(fjx0,tx);
369 fjy0 = _mm_add_ps(fjy0,ty);
370 fjz0 = _mm_add_ps(fjz0,tz);
371
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
375
376 r20 = _mm_mul_ps(rsq20,rinv20);
377
378 /* Compute parameters for interactions between i and j atoms */
379 qq20 = _mm_mul_ps(iq2,jq0);
380
381 /* EWALD ELECTROSTATICS */
382
383 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
384 ewrt = _mm_mul_ps(r20,ewtabscale);
385 ewitab = _mm_cvttps_epi32(ewrt);
386 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
387 ewitab = _mm_slli_epi32(ewitab,2);
388 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
389 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
390 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
391 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
392 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
393 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
394 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
395 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
396 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
397
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 velecsum = _mm_add_ps(velecsum,velec);
400
401 fscal = felec;
402
403 /* Calculate temporary vectorial force */
404 tx = _mm_mul_ps(fscal,dx20);
405 ty = _mm_mul_ps(fscal,dy20);
406 tz = _mm_mul_ps(fscal,dz20);
407
408 /* Update vectorial force */
409 fix2 = _mm_add_ps(fix2,tx);
410 fiy2 = _mm_add_ps(fiy2,ty);
411 fiz2 = _mm_add_ps(fiz2,tz);
412
413 fjx0 = _mm_add_ps(fjx0,tx);
414 fjy0 = _mm_add_ps(fjy0,ty);
415 fjz0 = _mm_add_ps(fjz0,tz);
416
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
420
421 r30 = _mm_mul_ps(rsq30,rinv30);
422
423 /* Compute parameters for interactions between i and j atoms */
424 qq30 = _mm_mul_ps(iq3,jq0);
425
426 /* EWALD ELECTROSTATICS */
427
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt = _mm_mul_ps(r30,ewtabscale);
430 ewitab = _mm_cvttps_epi32(ewrt);
431 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
432 ewitab = _mm_slli_epi32(ewitab,2);
433 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
434 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
435 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
436 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
437 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
438 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
439 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
440 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
441 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
442
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velecsum = _mm_add_ps(velecsum,velec);
445
446 fscal = felec;
447
448 /* Calculate temporary vectorial force */
449 tx = _mm_mul_ps(fscal,dx30);
450 ty = _mm_mul_ps(fscal,dy30);
451 tz = _mm_mul_ps(fscal,dz30);
452
453 /* Update vectorial force */
454 fix3 = _mm_add_ps(fix3,tx);
455 fiy3 = _mm_add_ps(fiy3,ty);
456 fiz3 = _mm_add_ps(fiz3,tz);
457
458 fjx0 = _mm_add_ps(fjx0,tx);
459 fjy0 = _mm_add_ps(fjy0,ty);
460 fjz0 = _mm_add_ps(fjz0,tz);
461
462 fjptrA = f+j_coord_offsetA;
463 fjptrB = f+j_coord_offsetB;
464 fjptrC = f+j_coord_offsetC;
465 fjptrD = f+j_coord_offsetD;
466
467 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
468
469 /* Inner loop uses 179 flops */
470 }
471
472 if(jidx<j_index_end)
473 {
474
475 /* Get j neighbor index, and coordinate index */
476 jnrlistA = jjnr[jidx];
477 jnrlistB = jjnr[jidx+1];
478 jnrlistC = jjnr[jidx+2];
479 jnrlistD = jjnr[jidx+3];
480 /* Sign of each element will be negative for non-real atoms.
481 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
482 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
483 */
484 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
485 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
486 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
487 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
488 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
489 j_coord_offsetA = DIM3*jnrA;
490 j_coord_offsetB = DIM3*jnrB;
491 j_coord_offsetC = DIM3*jnrC;
492 j_coord_offsetD = DIM3*jnrD;
493
494 /* load j atom coordinates */
495 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
496 x+j_coord_offsetC,x+j_coord_offsetD,
497 &jx0,&jy0,&jz0);
498
499 /* Calculate displacement vector */
500 dx00 = _mm_sub_ps(ix0,jx0);
501 dy00 = _mm_sub_ps(iy0,jy0);
502 dz00 = _mm_sub_ps(iz0,jz0);
503 dx10 = _mm_sub_ps(ix1,jx0);
504 dy10 = _mm_sub_ps(iy1,jy0);
505 dz10 = _mm_sub_ps(iz1,jz0);
506 dx20 = _mm_sub_ps(ix2,jx0);
507 dy20 = _mm_sub_ps(iy2,jy0);
508 dz20 = _mm_sub_ps(iz2,jz0);
509 dx30 = _mm_sub_ps(ix3,jx0);
510 dy30 = _mm_sub_ps(iy3,jy0);
511 dz30 = _mm_sub_ps(iz3,jz0);
512
513 /* Calculate squared distance and things based on it */
514 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
515 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
516 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
517 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
518
519 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
520 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
521 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
522 rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
523
524 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
525 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
526 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
527
528 /* Load parameters for j particles */
529 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
530 charge+jnrC+0,charge+jnrD+0);
531 vdwjidx0A = 2*vdwtype[jnrA+0];
532 vdwjidx0B = 2*vdwtype[jnrB+0];
533 vdwjidx0C = 2*vdwtype[jnrC+0];
534 vdwjidx0D = 2*vdwtype[jnrD+0];
535
536 fjx0 = _mm_setzero_ps();
537 fjy0 = _mm_setzero_ps();
538 fjz0 = _mm_setzero_ps();
539
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
543
544 r00 = _mm_mul_ps(rsq00,rinv00);
545 r00 = _mm_andnot_ps(dummy_mask,r00);
546
547 /* Compute parameters for interactions between i and j atoms */
548 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
549 vdwparam+vdwioffset0+vdwjidx0B,
550 vdwparam+vdwioffset0+vdwjidx0C,
551 vdwparam+vdwioffset0+vdwjidx0D,
552 &c6_00,&c12_00);
553
554 /* Calculate table index by multiplying r with table scale and truncate to integer */
555 rt = _mm_mul_ps(r00,vftabscale);
556 vfitab = _mm_cvttps_epi32(rt);
557 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
558 vfitab = _mm_slli_epi32(vfitab,3);
559
560 /* CUBIC SPLINE TABLE DISPERSION */
561 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
562 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
563 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
564 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
565 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
566 Heps = _mm_mul_ps(vfeps,H);
567 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
568 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
569 vvdw6 = _mm_mul_ps(c6_00,VV);
570 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
571 fvdw6 = _mm_mul_ps(c6_00,FF);
572
573 /* CUBIC SPLINE TABLE REPULSION */
574 vfitab = _mm_add_epi32(vfitab,ifour);
575 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
576 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
577 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
578 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
579 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
580 Heps = _mm_mul_ps(vfeps,H);
581 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
582 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
583 vvdw12 = _mm_mul_ps(c12_00,VV);
584 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
585 fvdw12 = _mm_mul_ps(c12_00,FF);
586 vvdw = _mm_add_ps(vvdw12,vvdw6);
587 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
588
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
591 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
592
593 fscal = fvdw;
594
595 fscal = _mm_andnot_ps(dummy_mask,fscal);
596
597 /* Calculate temporary vectorial force */
598 tx = _mm_mul_ps(fscal,dx00);
599 ty = _mm_mul_ps(fscal,dy00);
600 tz = _mm_mul_ps(fscal,dz00);
601
602 /* Update vectorial force */
603 fix0 = _mm_add_ps(fix0,tx);
604 fiy0 = _mm_add_ps(fiy0,ty);
605 fiz0 = _mm_add_ps(fiz0,tz);
606
607 fjx0 = _mm_add_ps(fjx0,tx);
608 fjy0 = _mm_add_ps(fjy0,ty);
609 fjz0 = _mm_add_ps(fjz0,tz);
610
611 /**************************
612 * CALCULATE INTERACTIONS *
613 **************************/
614
615 r10 = _mm_mul_ps(rsq10,rinv10);
616 r10 = _mm_andnot_ps(dummy_mask,r10);
617
618 /* Compute parameters for interactions between i and j atoms */
619 qq10 = _mm_mul_ps(iq1,jq0);
620
621 /* EWALD ELECTROSTATICS */
622
623 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
624 ewrt = _mm_mul_ps(r10,ewtabscale);
625 ewitab = _mm_cvttps_epi32(ewrt);
626 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
627 ewitab = _mm_slli_epi32(ewitab,2);
628 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
629 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
630 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
631 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
632 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
633 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
634 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
635 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
636 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
637
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velec = _mm_andnot_ps(dummy_mask,velec);
640 velecsum = _mm_add_ps(velecsum,velec);
641
642 fscal = felec;
643
644 fscal = _mm_andnot_ps(dummy_mask,fscal);
645
646 /* Calculate temporary vectorial force */
647 tx = _mm_mul_ps(fscal,dx10);
648 ty = _mm_mul_ps(fscal,dy10);
649 tz = _mm_mul_ps(fscal,dz10);
650
651 /* Update vectorial force */
652 fix1 = _mm_add_ps(fix1,tx);
653 fiy1 = _mm_add_ps(fiy1,ty);
654 fiz1 = _mm_add_ps(fiz1,tz);
655
656 fjx0 = _mm_add_ps(fjx0,tx);
657 fjy0 = _mm_add_ps(fjy0,ty);
658 fjz0 = _mm_add_ps(fjz0,tz);
659
660 /**************************
661 * CALCULATE INTERACTIONS *
662 **************************/
663
664 r20 = _mm_mul_ps(rsq20,rinv20);
665 r20 = _mm_andnot_ps(dummy_mask,r20);
666
667 /* Compute parameters for interactions between i and j atoms */
668 qq20 = _mm_mul_ps(iq2,jq0);
669
670 /* EWALD ELECTROSTATICS */
671
672 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
673 ewrt = _mm_mul_ps(r20,ewtabscale);
674 ewitab = _mm_cvttps_epi32(ewrt);
675 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
676 ewitab = _mm_slli_epi32(ewitab,2);
677 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
678 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
679 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
680 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
681 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
682 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
683 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
684 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
685 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
686
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velec = _mm_andnot_ps(dummy_mask,velec);
689 velecsum = _mm_add_ps(velecsum,velec);
690
691 fscal = felec;
692
693 fscal = _mm_andnot_ps(dummy_mask,fscal);
694
695 /* Calculate temporary vectorial force */
696 tx = _mm_mul_ps(fscal,dx20);
697 ty = _mm_mul_ps(fscal,dy20);
698 tz = _mm_mul_ps(fscal,dz20);
699
700 /* Update vectorial force */
701 fix2 = _mm_add_ps(fix2,tx);
702 fiy2 = _mm_add_ps(fiy2,ty);
703 fiz2 = _mm_add_ps(fiz2,tz);
704
705 fjx0 = _mm_add_ps(fjx0,tx);
706 fjy0 = _mm_add_ps(fjy0,ty);
707 fjz0 = _mm_add_ps(fjz0,tz);
708
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
712
713 r30 = _mm_mul_ps(rsq30,rinv30);
714 r30 = _mm_andnot_ps(dummy_mask,r30);
715
716 /* Compute parameters for interactions between i and j atoms */
717 qq30 = _mm_mul_ps(iq3,jq0);
718
719 /* EWALD ELECTROSTATICS */
720
721 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
722 ewrt = _mm_mul_ps(r30,ewtabscale);
723 ewitab = _mm_cvttps_epi32(ewrt);
724 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
725 ewitab = _mm_slli_epi32(ewitab,2);
726 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
727 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
728 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
729 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
730 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
731 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
732 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
733 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
734 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
735
736 /* Update potential sum for this i atom from the interaction with this j atom. */
737 velec = _mm_andnot_ps(dummy_mask,velec);
738 velecsum = _mm_add_ps(velecsum,velec);
739
740 fscal = felec;
741
742 fscal = _mm_andnot_ps(dummy_mask,fscal);
743
744 /* Calculate temporary vectorial force */
745 tx = _mm_mul_ps(fscal,dx30);
746 ty = _mm_mul_ps(fscal,dy30);
747 tz = _mm_mul_ps(fscal,dz30);
748
749 /* Update vectorial force */
750 fix3 = _mm_add_ps(fix3,tx);
751 fiy3 = _mm_add_ps(fiy3,ty);
752 fiz3 = _mm_add_ps(fiz3,tz);
753
754 fjx0 = _mm_add_ps(fjx0,tx);
755 fjy0 = _mm_add_ps(fjy0,ty);
756 fjz0 = _mm_add_ps(fjz0,tz);
757
758 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
759 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
760 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
761 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
762
763 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
764
765 /* Inner loop uses 183 flops */
766 }
767
768 /* End of innermost loop */
769
770 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
771 f+i_coord_offset,fshift+i_shift_offset);
772
773 ggid = gid[iidx];
774 /* Update potential energies */
775 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
776 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
777
778 /* Increment number of inner iterations */
779 inneriter += j_index_end - j_index_start;
780
781 /* Outer loop uses 26 flops */
782 }
783
784 /* Increment number of outer iterations */
785 outeriter += nri;
786
787 /* Update outer/inner flops */
788
789 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*183)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4_VF] += outeriter*26 + inneriter
*183
;
790}
791/*
792 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
793 * Electrostatics interaction: Ewald
794 * VdW interaction: CubicSplineTable
795 * Geometry: Water4-Particle
796 * Calculate force/pot: Force
797 */
798void
799nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
800 (t_nblist * gmx_restrict nlist,
801 rvec * gmx_restrict xx,
802 rvec * gmx_restrict ff,
803 t_forcerec * gmx_restrict fr,
804 t_mdatoms * gmx_restrict mdatoms,
805 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
806 t_nrnb * gmx_restrict nrnb)
807{
808 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
809 * just 0 for non-waters.
810 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
811 * jnr indices corresponding to data put in the four positions in the SIMD register.
812 */
813 int i_shift_offset,i_coord_offset,outeriter,inneriter;
814 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
815 int jnrA,jnrB,jnrC,jnrD;
816 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
817 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
818 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
819 real rcutoff_scalar;
820 real *shiftvec,*fshift,*x,*f;
821 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
822 real scratch[4*DIM3];
823 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
824 int vdwioffset0;
825 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
826 int vdwioffset1;
827 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
828 int vdwioffset2;
829 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
830 int vdwioffset3;
831 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
832 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
833 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
834 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
835 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
836 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
837 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
838 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
839 real *charge;
840 int nvdwtype;
841 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
842 int *vdwtype;
843 real *vdwparam;
844 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
845 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
846 __m128i vfitab;
847 __m128i ifour = _mm_set1_epi32(4);
848 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
849 real *vftab;
850 __m128i ewitab;
851 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
852 real *ewtab;
853 __m128 dummy_mask,cutoff_mask;
854 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
855 __m128 one = _mm_set1_ps(1.0);
856 __m128 two = _mm_set1_ps(2.0);
857 x = xx[0];
858 f = ff[0];
859
860 nri = nlist->nri;
861 iinr = nlist->iinr;
862 jindex = nlist->jindex;
863 jjnr = nlist->jjnr;
864 shiftidx = nlist->shift;
865 gid = nlist->gid;
Value stored to 'gid' is never read
866 shiftvec = fr->shift_vec[0];
867 fshift = fr->fshift[0];
868 facel = _mm_set1_ps(fr->epsfac);
869 charge = mdatoms->chargeA;
870 nvdwtype = fr->ntype;
871 vdwparam = fr->nbfp;
872 vdwtype = mdatoms->typeA;
873
874 vftab = kernel_data->table_vdw->data;
875 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
876
877 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
878 ewtab = fr->ic->tabq_coul_F;
879 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
880 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
881
882 /* Setup water-specific parameters */
883 inr = nlist->iinr[0];
884 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
885 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
886 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
887 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
888
889 /* Avoid stupid compiler warnings */
890 jnrA = jnrB = jnrC = jnrD = 0;
891 j_coord_offsetA = 0;
892 j_coord_offsetB = 0;
893 j_coord_offsetC = 0;
894 j_coord_offsetD = 0;
895
896 outeriter = 0;
897 inneriter = 0;
898
899 for(iidx=0;iidx<4*DIM3;iidx++)
900 {
901 scratch[iidx] = 0.0;
902 }
903
904 /* Start outer loop over neighborlists */
905 for(iidx=0; iidx<nri; iidx++)
906 {
907 /* Load shift vector for this list */
908 i_shift_offset = DIM3*shiftidx[iidx];
909
910 /* Load limits for loop over neighbors */
911 j_index_start = jindex[iidx];
912 j_index_end = jindex[iidx+1];
913
914 /* Get outer coordinate index */
915 inr = iinr[iidx];
916 i_coord_offset = DIM3*inr;
917
918 /* Load i particle coords and add shift vector */
919 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
920 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
921
922 fix0 = _mm_setzero_ps();
923 fiy0 = _mm_setzero_ps();
924 fiz0 = _mm_setzero_ps();
925 fix1 = _mm_setzero_ps();
926 fiy1 = _mm_setzero_ps();
927 fiz1 = _mm_setzero_ps();
928 fix2 = _mm_setzero_ps();
929 fiy2 = _mm_setzero_ps();
930 fiz2 = _mm_setzero_ps();
931 fix3 = _mm_setzero_ps();
932 fiy3 = _mm_setzero_ps();
933 fiz3 = _mm_setzero_ps();
934
935 /* Start inner kernel loop */
936 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
937 {
938
939 /* Get j neighbor index, and coordinate index */
940 jnrA = jjnr[jidx];
941 jnrB = jjnr[jidx+1];
942 jnrC = jjnr[jidx+2];
943 jnrD = jjnr[jidx+3];
944 j_coord_offsetA = DIM3*jnrA;
945 j_coord_offsetB = DIM3*jnrB;
946 j_coord_offsetC = DIM3*jnrC;
947 j_coord_offsetD = DIM3*jnrD;
948
949 /* load j atom coordinates */
950 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
951 x+j_coord_offsetC,x+j_coord_offsetD,
952 &jx0,&jy0,&jz0);
953
954 /* Calculate displacement vector */
955 dx00 = _mm_sub_ps(ix0,jx0);
956 dy00 = _mm_sub_ps(iy0,jy0);
957 dz00 = _mm_sub_ps(iz0,jz0);
958 dx10 = _mm_sub_ps(ix1,jx0);
959 dy10 = _mm_sub_ps(iy1,jy0);
960 dz10 = _mm_sub_ps(iz1,jz0);
961 dx20 = _mm_sub_ps(ix2,jx0);
962 dy20 = _mm_sub_ps(iy2,jy0);
963 dz20 = _mm_sub_ps(iz2,jz0);
964 dx30 = _mm_sub_ps(ix3,jx0);
965 dy30 = _mm_sub_ps(iy3,jy0);
966 dz30 = _mm_sub_ps(iz3,jz0);
967
968 /* Calculate squared distance and things based on it */
969 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
970 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
971 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
972 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
973
974 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
975 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
976 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
977 rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
978
979 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
980 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
981 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
982
983 /* Load parameters for j particles */
984 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
985 charge+jnrC+0,charge+jnrD+0);
986 vdwjidx0A = 2*vdwtype[jnrA+0];
987 vdwjidx0B = 2*vdwtype[jnrB+0];
988 vdwjidx0C = 2*vdwtype[jnrC+0];
989 vdwjidx0D = 2*vdwtype[jnrD+0];
990
991 fjx0 = _mm_setzero_ps();
992 fjy0 = _mm_setzero_ps();
993 fjz0 = _mm_setzero_ps();
994
995 /**************************
996 * CALCULATE INTERACTIONS *
997 **************************/
998
999 r00 = _mm_mul_ps(rsq00,rinv00);
1000
1001 /* Compute parameters for interactions between i and j atoms */
1002 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1003 vdwparam+vdwioffset0+vdwjidx0B,
1004 vdwparam+vdwioffset0+vdwjidx0C,
1005 vdwparam+vdwioffset0+vdwjidx0D,
1006 &c6_00,&c12_00);
1007
1008 /* Calculate table index by multiplying r with table scale and truncate to integer */
1009 rt = _mm_mul_ps(r00,vftabscale);
1010 vfitab = _mm_cvttps_epi32(rt);
1011 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1012 vfitab = _mm_slli_epi32(vfitab,3);
1013
1014 /* CUBIC SPLINE TABLE DISPERSION */
1015 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1016 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1017 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1018 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1019 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1020 Heps = _mm_mul_ps(vfeps,H);
1021 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1022 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1023 fvdw6 = _mm_mul_ps(c6_00,FF);
1024
1025 /* CUBIC SPLINE TABLE REPULSION */
1026 vfitab = _mm_add_epi32(vfitab,ifour);
1027 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1028 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1029 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1030 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1031 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1032 Heps = _mm_mul_ps(vfeps,H);
1033 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1034 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1035 fvdw12 = _mm_mul_ps(c12_00,FF);
1036 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1037
1038 fscal = fvdw;
1039
1040 /* Calculate temporary vectorial force */
1041 tx = _mm_mul_ps(fscal,dx00);
1042 ty = _mm_mul_ps(fscal,dy00);
1043 tz = _mm_mul_ps(fscal,dz00);
1044
1045 /* Update vectorial force */
1046 fix0 = _mm_add_ps(fix0,tx);
1047 fiy0 = _mm_add_ps(fiy0,ty);
1048 fiz0 = _mm_add_ps(fiz0,tz);
1049
1050 fjx0 = _mm_add_ps(fjx0,tx);
1051 fjy0 = _mm_add_ps(fjy0,ty);
1052 fjz0 = _mm_add_ps(fjz0,tz);
1053
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1057
1058 r10 = _mm_mul_ps(rsq10,rinv10);
1059
1060 /* Compute parameters for interactions between i and j atoms */
1061 qq10 = _mm_mul_ps(iq1,jq0);
1062
1063 /* EWALD ELECTROSTATICS */
1064
1065 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1066 ewrt = _mm_mul_ps(r10,ewtabscale);
1067 ewitab = _mm_cvttps_epi32(ewrt);
1068 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1069 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1070 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1071 &ewtabF,&ewtabFn);
1072 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1073 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1074
1075 fscal = felec;
1076
1077 /* Calculate temporary vectorial force */
1078 tx = _mm_mul_ps(fscal,dx10);
1079 ty = _mm_mul_ps(fscal,dy10);
1080 tz = _mm_mul_ps(fscal,dz10);
1081
1082 /* Update vectorial force */
1083 fix1 = _mm_add_ps(fix1,tx);
1084 fiy1 = _mm_add_ps(fiy1,ty);
1085 fiz1 = _mm_add_ps(fiz1,tz);
1086
1087 fjx0 = _mm_add_ps(fjx0,tx);
1088 fjy0 = _mm_add_ps(fjy0,ty);
1089 fjz0 = _mm_add_ps(fjz0,tz);
1090
1091 /**************************
1092 * CALCULATE INTERACTIONS *
1093 **************************/
1094
1095 r20 = _mm_mul_ps(rsq20,rinv20);
1096
1097 /* Compute parameters for interactions between i and j atoms */
1098 qq20 = _mm_mul_ps(iq2,jq0);
1099
1100 /* EWALD ELECTROSTATICS */
1101
1102 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1103 ewrt = _mm_mul_ps(r20,ewtabscale);
1104 ewitab = _mm_cvttps_epi32(ewrt);
1105 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1106 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1107 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1108 &ewtabF,&ewtabFn);
1109 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1110 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1111
1112 fscal = felec;
1113
1114 /* Calculate temporary vectorial force */
1115 tx = _mm_mul_ps(fscal,dx20);
1116 ty = _mm_mul_ps(fscal,dy20);
1117 tz = _mm_mul_ps(fscal,dz20);
1118
1119 /* Update vectorial force */
1120 fix2 = _mm_add_ps(fix2,tx);
1121 fiy2 = _mm_add_ps(fiy2,ty);
1122 fiz2 = _mm_add_ps(fiz2,tz);
1123
1124 fjx0 = _mm_add_ps(fjx0,tx);
1125 fjy0 = _mm_add_ps(fjy0,ty);
1126 fjz0 = _mm_add_ps(fjz0,tz);
1127
1128 /**************************
1129 * CALCULATE INTERACTIONS *
1130 **************************/
1131
1132 r30 = _mm_mul_ps(rsq30,rinv30);
1133
1134 /* Compute parameters for interactions between i and j atoms */
1135 qq30 = _mm_mul_ps(iq3,jq0);
1136
1137 /* EWALD ELECTROSTATICS */
1138
1139 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1140 ewrt = _mm_mul_ps(r30,ewtabscale);
1141 ewitab = _mm_cvttps_epi32(ewrt);
1142 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1143 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1144 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1145 &ewtabF,&ewtabFn);
1146 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1147 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1148
1149 fscal = felec;
1150
1151 /* Calculate temporary vectorial force */
1152 tx = _mm_mul_ps(fscal,dx30);
1153 ty = _mm_mul_ps(fscal,dy30);
1154 tz = _mm_mul_ps(fscal,dz30);
1155
1156 /* Update vectorial force */
1157 fix3 = _mm_add_ps(fix3,tx);
1158 fiy3 = _mm_add_ps(fiy3,ty);
1159 fiz3 = _mm_add_ps(fiz3,tz);
1160
1161 fjx0 = _mm_add_ps(fjx0,tx);
1162 fjy0 = _mm_add_ps(fjy0,ty);
1163 fjz0 = _mm_add_ps(fjz0,tz);
1164
1165 fjptrA = f+j_coord_offsetA;
1166 fjptrB = f+j_coord_offsetB;
1167 fjptrC = f+j_coord_offsetC;
1168 fjptrD = f+j_coord_offsetD;
1169
1170 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1171
1172 /* Inner loop uses 156 flops */
1173 }
1174
1175 if(jidx<j_index_end)
1176 {
1177
1178 /* Get j neighbor index, and coordinate index */
1179 jnrlistA = jjnr[jidx];
1180 jnrlistB = jjnr[jidx+1];
1181 jnrlistC = jjnr[jidx+2];
1182 jnrlistD = jjnr[jidx+3];
1183 /* Sign of each element will be negative for non-real atoms.
1184 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1185 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1186 */
1187 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1188 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1189 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1190 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1191 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1192 j_coord_offsetA = DIM3*jnrA;
1193 j_coord_offsetB = DIM3*jnrB;
1194 j_coord_offsetC = DIM3*jnrC;
1195 j_coord_offsetD = DIM3*jnrD;
1196
1197 /* load j atom coordinates */
1198 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1199 x+j_coord_offsetC,x+j_coord_offsetD,
1200 &jx0,&jy0,&jz0);
1201
1202 /* Calculate displacement vector */
1203 dx00 = _mm_sub_ps(ix0,jx0);
1204 dy00 = _mm_sub_ps(iy0,jy0);
1205 dz00 = _mm_sub_ps(iz0,jz0);
1206 dx10 = _mm_sub_ps(ix1,jx0);
1207 dy10 = _mm_sub_ps(iy1,jy0);
1208 dz10 = _mm_sub_ps(iz1,jz0);
1209 dx20 = _mm_sub_ps(ix2,jx0);
1210 dy20 = _mm_sub_ps(iy2,jy0);
1211 dz20 = _mm_sub_ps(iz2,jz0);
1212 dx30 = _mm_sub_ps(ix3,jx0);
1213 dy30 = _mm_sub_ps(iy3,jy0);
1214 dz30 = _mm_sub_ps(iz3,jz0);
1215
1216 /* Calculate squared distance and things based on it */
1217 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1218 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1219 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1220 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1221
1222 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1223 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1224 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1225 rinv30 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq30);
1226
1227 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1228 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1229 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1230
1231 /* Load parameters for j particles */
1232 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1233 charge+jnrC+0,charge+jnrD+0);
1234 vdwjidx0A = 2*vdwtype[jnrA+0];
1235 vdwjidx0B = 2*vdwtype[jnrB+0];
1236 vdwjidx0C = 2*vdwtype[jnrC+0];
1237 vdwjidx0D = 2*vdwtype[jnrD+0];
1238
1239 fjx0 = _mm_setzero_ps();
1240 fjy0 = _mm_setzero_ps();
1241 fjz0 = _mm_setzero_ps();
1242
1243 /**************************
1244 * CALCULATE INTERACTIONS *
1245 **************************/
1246
1247 r00 = _mm_mul_ps(rsq00,rinv00);
1248 r00 = _mm_andnot_ps(dummy_mask,r00);
1249
1250 /* Compute parameters for interactions between i and j atoms */
1251 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1252 vdwparam+vdwioffset0+vdwjidx0B,
1253 vdwparam+vdwioffset0+vdwjidx0C,
1254 vdwparam+vdwioffset0+vdwjidx0D,
1255 &c6_00,&c12_00);
1256
1257 /* Calculate table index by multiplying r with table scale and truncate to integer */
1258 rt = _mm_mul_ps(r00,vftabscale);
1259 vfitab = _mm_cvttps_epi32(rt);
1260 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1261 vfitab = _mm_slli_epi32(vfitab,3);
1262
1263 /* CUBIC SPLINE TABLE DISPERSION */
1264 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1265 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1266 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1267 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1268 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1269 Heps = _mm_mul_ps(vfeps,H);
1270 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1271 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1272 fvdw6 = _mm_mul_ps(c6_00,FF);
1273
1274 /* CUBIC SPLINE TABLE REPULSION */
1275 vfitab = _mm_add_epi32(vfitab,ifour);
1276 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1277 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1278 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1279 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1280 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1281 Heps = _mm_mul_ps(vfeps,H);
1282 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1283 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1284 fvdw12 = _mm_mul_ps(c12_00,FF);
1285 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1286
1287 fscal = fvdw;
1288
1289 fscal = _mm_andnot_ps(dummy_mask,fscal);
1290
1291 /* Calculate temporary vectorial force */
1292 tx = _mm_mul_ps(fscal,dx00);
1293 ty = _mm_mul_ps(fscal,dy00);
1294 tz = _mm_mul_ps(fscal,dz00);
1295
1296 /* Update vectorial force */
1297 fix0 = _mm_add_ps(fix0,tx);
1298 fiy0 = _mm_add_ps(fiy0,ty);
1299 fiz0 = _mm_add_ps(fiz0,tz);
1300
1301 fjx0 = _mm_add_ps(fjx0,tx);
1302 fjy0 = _mm_add_ps(fjy0,ty);
1303 fjz0 = _mm_add_ps(fjz0,tz);
1304
1305 /**************************
1306 * CALCULATE INTERACTIONS *
1307 **************************/
1308
1309 r10 = _mm_mul_ps(rsq10,rinv10);
1310 r10 = _mm_andnot_ps(dummy_mask,r10);
1311
1312 /* Compute parameters for interactions between i and j atoms */
1313 qq10 = _mm_mul_ps(iq1,jq0);
1314
1315 /* EWALD ELECTROSTATICS */
1316
1317 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1318 ewrt = _mm_mul_ps(r10,ewtabscale);
1319 ewitab = _mm_cvttps_epi32(ewrt);
1320 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1321 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1322 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1323 &ewtabF,&ewtabFn);
1324 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1325 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1326
1327 fscal = felec;
1328
1329 fscal = _mm_andnot_ps(dummy_mask,fscal);
1330
1331 /* Calculate temporary vectorial force */
1332 tx = _mm_mul_ps(fscal,dx10);
1333 ty = _mm_mul_ps(fscal,dy10);
1334 tz = _mm_mul_ps(fscal,dz10);
1335
1336 /* Update vectorial force */
1337 fix1 = _mm_add_ps(fix1,tx);
1338 fiy1 = _mm_add_ps(fiy1,ty);
1339 fiz1 = _mm_add_ps(fiz1,tz);
1340
1341 fjx0 = _mm_add_ps(fjx0,tx);
1342 fjy0 = _mm_add_ps(fjy0,ty);
1343 fjz0 = _mm_add_ps(fjz0,tz);
1344
1345 /**************************
1346 * CALCULATE INTERACTIONS *
1347 **************************/
1348
1349 r20 = _mm_mul_ps(rsq20,rinv20);
1350 r20 = _mm_andnot_ps(dummy_mask,r20);
1351
1352 /* Compute parameters for interactions between i and j atoms */
1353 qq20 = _mm_mul_ps(iq2,jq0);
1354
1355 /* EWALD ELECTROSTATICS */
1356
1357 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1358 ewrt = _mm_mul_ps(r20,ewtabscale);
1359 ewitab = _mm_cvttps_epi32(ewrt);
1360 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1361 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1362 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1363 &ewtabF,&ewtabFn);
1364 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1365 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1366
1367 fscal = felec;
1368
1369 fscal = _mm_andnot_ps(dummy_mask,fscal);
1370
1371 /* Calculate temporary vectorial force */
1372 tx = _mm_mul_ps(fscal,dx20);
1373 ty = _mm_mul_ps(fscal,dy20);
1374 tz = _mm_mul_ps(fscal,dz20);
1375
1376 /* Update vectorial force */
1377 fix2 = _mm_add_ps(fix2,tx);
1378 fiy2 = _mm_add_ps(fiy2,ty);
1379 fiz2 = _mm_add_ps(fiz2,tz);
1380
1381 fjx0 = _mm_add_ps(fjx0,tx);
1382 fjy0 = _mm_add_ps(fjy0,ty);
1383 fjz0 = _mm_add_ps(fjz0,tz);
1384
1385 /**************************
1386 * CALCULATE INTERACTIONS *
1387 **************************/
1388
1389 r30 = _mm_mul_ps(rsq30,rinv30);
1390 r30 = _mm_andnot_ps(dummy_mask,r30);
1391
1392 /* Compute parameters for interactions between i and j atoms */
1393 qq30 = _mm_mul_ps(iq3,jq0);
1394
1395 /* EWALD ELECTROSTATICS */
1396
1397 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1398 ewrt = _mm_mul_ps(r30,ewtabscale);
1399 ewitab = _mm_cvttps_epi32(ewrt);
1400 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1401 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1402 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1403 &ewtabF,&ewtabFn);
1404 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1405 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1406
1407 fscal = felec;
1408
1409 fscal = _mm_andnot_ps(dummy_mask,fscal);
1410
1411 /* Calculate temporary vectorial force */
1412 tx = _mm_mul_ps(fscal,dx30);
1413 ty = _mm_mul_ps(fscal,dy30);
1414 tz = _mm_mul_ps(fscal,dz30);
1415
1416 /* Update vectorial force */
1417 fix3 = _mm_add_ps(fix3,tx);
1418 fiy3 = _mm_add_ps(fiy3,ty);
1419 fiz3 = _mm_add_ps(fiz3,tz);
1420
1421 fjx0 = _mm_add_ps(fjx0,tx);
1422 fjy0 = _mm_add_ps(fjy0,ty);
1423 fjz0 = _mm_add_ps(fjz0,tz);
1424
1425 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1426 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1427 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1428 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1429
1430 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1431
1432 /* Inner loop uses 160 flops */
1433 }
1434
1435 /* End of innermost loop */
1436
1437 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1438 f+i_coord_offset,fshift+i_shift_offset);
1439
1440 /* Increment number of inner iterations */
1441 inneriter += j_index_end - j_index_start;
1442
1443 /* Outer loop uses 24 flops */
1444 }
1445
1446 /* Increment number of outer iterations */
1447 outeriter += nri;
1448
1449 /* Update outer/inner flops */
1450
1451 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*160)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4_F] += outeriter*24 + inneriter
*160
;
1452}