Bug Summary

File:gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse4_1_single.c
Location:line 1418, column 5
Description:Value stored to 'jnrA' is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LJEwald
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwioffset1;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 int vdwioffset2;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 real *charge;
108 int nvdwtype;
109 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 int *vdwtype;
111 real *vdwparam;
112 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
113 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
114 __m128 c6grid_00;
115 __m128 c6grid_01;
116 __m128 c6grid_02;
117 __m128 c6grid_10;
118 __m128 c6grid_11;
119 __m128 c6grid_12;
120 __m128 c6grid_20;
121 __m128 c6grid_21;
122 __m128 c6grid_22;
123 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
124 real *vdwgridparam;
125 __m128 one_half = _mm_set1_ps(0.5);
126 __m128 minus_one = _mm_set1_ps(-1.0);
127 __m128i ewitab;
128 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
129 real *ewtab;
130 __m128 dummy_mask,cutoff_mask;
131 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
132 __m128 one = _mm_set1_ps(1.0);
133 __m128 two = _mm_set1_ps(2.0);
134 x = xx[0];
135 f = ff[0];
136
137 nri = nlist->nri;
138 iinr = nlist->iinr;
139 jindex = nlist->jindex;
140 jjnr = nlist->jjnr;
141 shiftidx = nlist->shift;
142 gid = nlist->gid;
143 shiftvec = fr->shift_vec[0];
144 fshift = fr->fshift[0];
145 facel = _mm_set1_ps(fr->epsfac);
146 charge = mdatoms->chargeA;
147 nvdwtype = fr->ntype;
148 vdwparam = fr->nbfp;
149 vdwtype = mdatoms->typeA;
150 vdwgridparam = fr->ljpme_c6grid;
151 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
152 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
153 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
154
155 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
156 ewtab = fr->ic->tabq_coul_FDV0;
157 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
158 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
159
160 /* Setup water-specific parameters */
161 inr = nlist->iinr[0];
162 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
163 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
164 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
165 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
166
167 jq0 = _mm_set1_ps(charge[inr+0]);
168 jq1 = _mm_set1_ps(charge[inr+1]);
169 jq2 = _mm_set1_ps(charge[inr+2]);
170 vdwjidx0A = 2*vdwtype[inr+0];
171 qq00 = _mm_mul_ps(iq0,jq0);
172 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
173 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
174 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
175 qq01 = _mm_mul_ps(iq0,jq1);
176 qq02 = _mm_mul_ps(iq0,jq2);
177 qq10 = _mm_mul_ps(iq1,jq0);
178 qq11 = _mm_mul_ps(iq1,jq1);
179 qq12 = _mm_mul_ps(iq1,jq2);
180 qq20 = _mm_mul_ps(iq2,jq0);
181 qq21 = _mm_mul_ps(iq2,jq1);
182 qq22 = _mm_mul_ps(iq2,jq2);
183
184 /* Avoid stupid compiler warnings */
185 jnrA = jnrB = jnrC = jnrD = 0;
186 j_coord_offsetA = 0;
187 j_coord_offsetB = 0;
188 j_coord_offsetC = 0;
189 j_coord_offsetD = 0;
190
191 outeriter = 0;
192 inneriter = 0;
193
194 for(iidx=0;iidx<4*DIM3;iidx++)
195 {
196 scratch[iidx] = 0.0;
197 }
198
199 /* Start outer loop over neighborlists */
200 for(iidx=0; iidx<nri; iidx++)
201 {
202 /* Load shift vector for this list */
203 i_shift_offset = DIM3*shiftidx[iidx];
204
205 /* Load limits for loop over neighbors */
206 j_index_start = jindex[iidx];
207 j_index_end = jindex[iidx+1];
208
209 /* Get outer coordinate index */
210 inr = iinr[iidx];
211 i_coord_offset = DIM3*inr;
212
213 /* Load i particle coords and add shift vector */
214 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
215 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
216
217 fix0 = _mm_setzero_ps();
218 fiy0 = _mm_setzero_ps();
219 fiz0 = _mm_setzero_ps();
220 fix1 = _mm_setzero_ps();
221 fiy1 = _mm_setzero_ps();
222 fiz1 = _mm_setzero_ps();
223 fix2 = _mm_setzero_ps();
224 fiy2 = _mm_setzero_ps();
225 fiz2 = _mm_setzero_ps();
226
227 /* Reset potential sums */
228 velecsum = _mm_setzero_ps();
229 vvdwsum = _mm_setzero_ps();
230
231 /* Start inner kernel loop */
232 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
233 {
234
235 /* Get j neighbor index, and coordinate index */
236 jnrA = jjnr[jidx];
237 jnrB = jjnr[jidx+1];
238 jnrC = jjnr[jidx+2];
239 jnrD = jjnr[jidx+3];
240 j_coord_offsetA = DIM3*jnrA;
241 j_coord_offsetB = DIM3*jnrB;
242 j_coord_offsetC = DIM3*jnrC;
243 j_coord_offsetD = DIM3*jnrD;
244
245 /* load j atom coordinates */
246 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
247 x+j_coord_offsetC,x+j_coord_offsetD,
248 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
249
250 /* Calculate displacement vector */
251 dx00 = _mm_sub_ps(ix0,jx0);
252 dy00 = _mm_sub_ps(iy0,jy0);
253 dz00 = _mm_sub_ps(iz0,jz0);
254 dx01 = _mm_sub_ps(ix0,jx1);
255 dy01 = _mm_sub_ps(iy0,jy1);
256 dz01 = _mm_sub_ps(iz0,jz1);
257 dx02 = _mm_sub_ps(ix0,jx2);
258 dy02 = _mm_sub_ps(iy0,jy2);
259 dz02 = _mm_sub_ps(iz0,jz2);
260 dx10 = _mm_sub_ps(ix1,jx0);
261 dy10 = _mm_sub_ps(iy1,jy0);
262 dz10 = _mm_sub_ps(iz1,jz0);
263 dx11 = _mm_sub_ps(ix1,jx1);
264 dy11 = _mm_sub_ps(iy1,jy1);
265 dz11 = _mm_sub_ps(iz1,jz1);
266 dx12 = _mm_sub_ps(ix1,jx2);
267 dy12 = _mm_sub_ps(iy1,jy2);
268 dz12 = _mm_sub_ps(iz1,jz2);
269 dx20 = _mm_sub_ps(ix2,jx0);
270 dy20 = _mm_sub_ps(iy2,jy0);
271 dz20 = _mm_sub_ps(iz2,jz0);
272 dx21 = _mm_sub_ps(ix2,jx1);
273 dy21 = _mm_sub_ps(iy2,jy1);
274 dz21 = _mm_sub_ps(iz2,jz1);
275 dx22 = _mm_sub_ps(ix2,jx2);
276 dy22 = _mm_sub_ps(iy2,jy2);
277 dz22 = _mm_sub_ps(iz2,jz2);
278
279 /* Calculate squared distance and things based on it */
280 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
281 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
282 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
283 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
284 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
285 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
286 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
287 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
288 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
289
290 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
291 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
292 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
293 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
294 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
295 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
296 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
297 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
298 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
299
300 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
301 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
302 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
303 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
304 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
305 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
306 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
307 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
308 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
309
310 fjx0 = _mm_setzero_ps();
311 fjy0 = _mm_setzero_ps();
312 fjz0 = _mm_setzero_ps();
313 fjx1 = _mm_setzero_ps();
314 fjy1 = _mm_setzero_ps();
315 fjz1 = _mm_setzero_ps();
316 fjx2 = _mm_setzero_ps();
317 fjy2 = _mm_setzero_ps();
318 fjz2 = _mm_setzero_ps();
319
320 /**************************
321 * CALCULATE INTERACTIONS *
322 **************************/
323
324 r00 = _mm_mul_ps(rsq00,rinv00);
325
326 /* EWALD ELECTROSTATICS */
327
328 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
329 ewrt = _mm_mul_ps(r00,ewtabscale);
330 ewitab = _mm_cvttps_epi32(ewrt);
331 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
332 ewitab = _mm_slli_epi32(ewitab,2);
333 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
334 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
335 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
336 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
337 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
338 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
339 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
340 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
341 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
342
343 /* Analytical LJ-PME */
344 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
345 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
346 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
347 exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
348 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
349 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
350 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
351 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
352 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
353 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
354 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
355 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
356
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velecsum = _mm_add_ps(velecsum,velec);
359 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
360
361 fscal = _mm_add_ps(felec,fvdw);
362
363 /* Calculate temporary vectorial force */
364 tx = _mm_mul_ps(fscal,dx00);
365 ty = _mm_mul_ps(fscal,dy00);
366 tz = _mm_mul_ps(fscal,dz00);
367
368 /* Update vectorial force */
369 fix0 = _mm_add_ps(fix0,tx);
370 fiy0 = _mm_add_ps(fiy0,ty);
371 fiz0 = _mm_add_ps(fiz0,tz);
372
373 fjx0 = _mm_add_ps(fjx0,tx);
374 fjy0 = _mm_add_ps(fjy0,ty);
375 fjz0 = _mm_add_ps(fjz0,tz);
376
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
380
381 r01 = _mm_mul_ps(rsq01,rinv01);
382
383 /* EWALD ELECTROSTATICS */
384
385 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
386 ewrt = _mm_mul_ps(r01,ewtabscale);
387 ewitab = _mm_cvttps_epi32(ewrt);
388 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
389 ewitab = _mm_slli_epi32(ewitab,2);
390 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
391 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
392 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
393 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
394 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
395 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
396 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
397 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
398 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
399
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velecsum = _mm_add_ps(velecsum,velec);
402
403 fscal = felec;
404
405 /* Calculate temporary vectorial force */
406 tx = _mm_mul_ps(fscal,dx01);
407 ty = _mm_mul_ps(fscal,dy01);
408 tz = _mm_mul_ps(fscal,dz01);
409
410 /* Update vectorial force */
411 fix0 = _mm_add_ps(fix0,tx);
412 fiy0 = _mm_add_ps(fiy0,ty);
413 fiz0 = _mm_add_ps(fiz0,tz);
414
415 fjx1 = _mm_add_ps(fjx1,tx);
416 fjy1 = _mm_add_ps(fjy1,ty);
417 fjz1 = _mm_add_ps(fjz1,tz);
418
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
422
423 r02 = _mm_mul_ps(rsq02,rinv02);
424
425 /* EWALD ELECTROSTATICS */
426
427 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
428 ewrt = _mm_mul_ps(r02,ewtabscale);
429 ewitab = _mm_cvttps_epi32(ewrt);
430 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
431 ewitab = _mm_slli_epi32(ewitab,2);
432 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
433 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
434 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
435 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
436 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
437 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
438 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
439 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
440 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
441
442 /* Update potential sum for this i atom from the interaction with this j atom. */
443 velecsum = _mm_add_ps(velecsum,velec);
444
445 fscal = felec;
446
447 /* Calculate temporary vectorial force */
448 tx = _mm_mul_ps(fscal,dx02);
449 ty = _mm_mul_ps(fscal,dy02);
450 tz = _mm_mul_ps(fscal,dz02);
451
452 /* Update vectorial force */
453 fix0 = _mm_add_ps(fix0,tx);
454 fiy0 = _mm_add_ps(fiy0,ty);
455 fiz0 = _mm_add_ps(fiz0,tz);
456
457 fjx2 = _mm_add_ps(fjx2,tx);
458 fjy2 = _mm_add_ps(fjy2,ty);
459 fjz2 = _mm_add_ps(fjz2,tz);
460
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
464
465 r10 = _mm_mul_ps(rsq10,rinv10);
466
467 /* EWALD ELECTROSTATICS */
468
469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
470 ewrt = _mm_mul_ps(r10,ewtabscale);
471 ewitab = _mm_cvttps_epi32(ewrt);
472 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
473 ewitab = _mm_slli_epi32(ewitab,2);
474 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
475 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
476 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
477 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
478 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
479 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
480 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
481 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
482 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
483
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum = _mm_add_ps(velecsum,velec);
486
487 fscal = felec;
488
489 /* Calculate temporary vectorial force */
490 tx = _mm_mul_ps(fscal,dx10);
491 ty = _mm_mul_ps(fscal,dy10);
492 tz = _mm_mul_ps(fscal,dz10);
493
494 /* Update vectorial force */
495 fix1 = _mm_add_ps(fix1,tx);
496 fiy1 = _mm_add_ps(fiy1,ty);
497 fiz1 = _mm_add_ps(fiz1,tz);
498
499 fjx0 = _mm_add_ps(fjx0,tx);
500 fjy0 = _mm_add_ps(fjy0,ty);
501 fjz0 = _mm_add_ps(fjz0,tz);
502
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
506
507 r11 = _mm_mul_ps(rsq11,rinv11);
508
509 /* EWALD ELECTROSTATICS */
510
511 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
512 ewrt = _mm_mul_ps(r11,ewtabscale);
513 ewitab = _mm_cvttps_epi32(ewrt);
514 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
515 ewitab = _mm_slli_epi32(ewitab,2);
516 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
517 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
518 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
519 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
520 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
521 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
522 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
523 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
524 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
525
526 /* Update potential sum for this i atom from the interaction with this j atom. */
527 velecsum = _mm_add_ps(velecsum,velec);
528
529 fscal = felec;
530
531 /* Calculate temporary vectorial force */
532 tx = _mm_mul_ps(fscal,dx11);
533 ty = _mm_mul_ps(fscal,dy11);
534 tz = _mm_mul_ps(fscal,dz11);
535
536 /* Update vectorial force */
537 fix1 = _mm_add_ps(fix1,tx);
538 fiy1 = _mm_add_ps(fiy1,ty);
539 fiz1 = _mm_add_ps(fiz1,tz);
540
541 fjx1 = _mm_add_ps(fjx1,tx);
542 fjy1 = _mm_add_ps(fjy1,ty);
543 fjz1 = _mm_add_ps(fjz1,tz);
544
545 /**************************
546 * CALCULATE INTERACTIONS *
547 **************************/
548
549 r12 = _mm_mul_ps(rsq12,rinv12);
550
551 /* EWALD ELECTROSTATICS */
552
553 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
554 ewrt = _mm_mul_ps(r12,ewtabscale);
555 ewitab = _mm_cvttps_epi32(ewrt);
556 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
557 ewitab = _mm_slli_epi32(ewitab,2);
558 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
559 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
560 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
561 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
562 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
563 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
564 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
565 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
566 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
567
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velecsum = _mm_add_ps(velecsum,velec);
570
571 fscal = felec;
572
573 /* Calculate temporary vectorial force */
574 tx = _mm_mul_ps(fscal,dx12);
575 ty = _mm_mul_ps(fscal,dy12);
576 tz = _mm_mul_ps(fscal,dz12);
577
578 /* Update vectorial force */
579 fix1 = _mm_add_ps(fix1,tx);
580 fiy1 = _mm_add_ps(fiy1,ty);
581 fiz1 = _mm_add_ps(fiz1,tz);
582
583 fjx2 = _mm_add_ps(fjx2,tx);
584 fjy2 = _mm_add_ps(fjy2,ty);
585 fjz2 = _mm_add_ps(fjz2,tz);
586
587 /**************************
588 * CALCULATE INTERACTIONS *
589 **************************/
590
591 r20 = _mm_mul_ps(rsq20,rinv20);
592
593 /* EWALD ELECTROSTATICS */
594
595 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
596 ewrt = _mm_mul_ps(r20,ewtabscale);
597 ewitab = _mm_cvttps_epi32(ewrt);
598 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
599 ewitab = _mm_slli_epi32(ewitab,2);
600 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
601 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
602 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
603 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
604 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
605 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
606 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
607 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
608 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
609
610 /* Update potential sum for this i atom from the interaction with this j atom. */
611 velecsum = _mm_add_ps(velecsum,velec);
612
613 fscal = felec;
614
615 /* Calculate temporary vectorial force */
616 tx = _mm_mul_ps(fscal,dx20);
617 ty = _mm_mul_ps(fscal,dy20);
618 tz = _mm_mul_ps(fscal,dz20);
619
620 /* Update vectorial force */
621 fix2 = _mm_add_ps(fix2,tx);
622 fiy2 = _mm_add_ps(fiy2,ty);
623 fiz2 = _mm_add_ps(fiz2,tz);
624
625 fjx0 = _mm_add_ps(fjx0,tx);
626 fjy0 = _mm_add_ps(fjy0,ty);
627 fjz0 = _mm_add_ps(fjz0,tz);
628
629 /**************************
630 * CALCULATE INTERACTIONS *
631 **************************/
632
633 r21 = _mm_mul_ps(rsq21,rinv21);
634
635 /* EWALD ELECTROSTATICS */
636
637 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
638 ewrt = _mm_mul_ps(r21,ewtabscale);
639 ewitab = _mm_cvttps_epi32(ewrt);
640 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
641 ewitab = _mm_slli_epi32(ewitab,2);
642 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
643 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
644 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
645 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
646 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
647 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
648 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
649 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
650 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
651
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velecsum = _mm_add_ps(velecsum,velec);
654
655 fscal = felec;
656
657 /* Calculate temporary vectorial force */
658 tx = _mm_mul_ps(fscal,dx21);
659 ty = _mm_mul_ps(fscal,dy21);
660 tz = _mm_mul_ps(fscal,dz21);
661
662 /* Update vectorial force */
663 fix2 = _mm_add_ps(fix2,tx);
664 fiy2 = _mm_add_ps(fiy2,ty);
665 fiz2 = _mm_add_ps(fiz2,tz);
666
667 fjx1 = _mm_add_ps(fjx1,tx);
668 fjy1 = _mm_add_ps(fjy1,ty);
669 fjz1 = _mm_add_ps(fjz1,tz);
670
671 /**************************
672 * CALCULATE INTERACTIONS *
673 **************************/
674
675 r22 = _mm_mul_ps(rsq22,rinv22);
676
677 /* EWALD ELECTROSTATICS */
678
679 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
680 ewrt = _mm_mul_ps(r22,ewtabscale);
681 ewitab = _mm_cvttps_epi32(ewrt);
682 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
683 ewitab = _mm_slli_epi32(ewitab,2);
684 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
685 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
686 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
687 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
688 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
689 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
690 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
691 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
692 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
693
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velecsum = _mm_add_ps(velecsum,velec);
696
697 fscal = felec;
698
699 /* Calculate temporary vectorial force */
700 tx = _mm_mul_ps(fscal,dx22);
701 ty = _mm_mul_ps(fscal,dy22);
702 tz = _mm_mul_ps(fscal,dz22);
703
704 /* Update vectorial force */
705 fix2 = _mm_add_ps(fix2,tx);
706 fiy2 = _mm_add_ps(fiy2,ty);
707 fiz2 = _mm_add_ps(fiz2,tz);
708
709 fjx2 = _mm_add_ps(fjx2,tx);
710 fjy2 = _mm_add_ps(fjy2,ty);
711 fjz2 = _mm_add_ps(fjz2,tz);
712
713 fjptrA = f+j_coord_offsetA;
714 fjptrB = f+j_coord_offsetB;
715 fjptrC = f+j_coord_offsetC;
716 fjptrD = f+j_coord_offsetD;
717
718 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
719 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
720
721 /* Inner loop uses 397 flops */
722 }
723
724 if(jidx<j_index_end)
725 {
726
727 /* Get j neighbor index, and coordinate index */
728 jnrlistA = jjnr[jidx];
729 jnrlistB = jjnr[jidx+1];
730 jnrlistC = jjnr[jidx+2];
731 jnrlistD = jjnr[jidx+3];
732 /* Sign of each element will be negative for non-real atoms.
733 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
734 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
735 */
736 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
737 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
738 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
739 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
740 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
741 j_coord_offsetA = DIM3*jnrA;
742 j_coord_offsetB = DIM3*jnrB;
743 j_coord_offsetC = DIM3*jnrC;
744 j_coord_offsetD = DIM3*jnrD;
745
746 /* load j atom coordinates */
747 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
748 x+j_coord_offsetC,x+j_coord_offsetD,
749 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
750
751 /* Calculate displacement vector */
752 dx00 = _mm_sub_ps(ix0,jx0);
753 dy00 = _mm_sub_ps(iy0,jy0);
754 dz00 = _mm_sub_ps(iz0,jz0);
755 dx01 = _mm_sub_ps(ix0,jx1);
756 dy01 = _mm_sub_ps(iy0,jy1);
757 dz01 = _mm_sub_ps(iz0,jz1);
758 dx02 = _mm_sub_ps(ix0,jx2);
759 dy02 = _mm_sub_ps(iy0,jy2);
760 dz02 = _mm_sub_ps(iz0,jz2);
761 dx10 = _mm_sub_ps(ix1,jx0);
762 dy10 = _mm_sub_ps(iy1,jy0);
763 dz10 = _mm_sub_ps(iz1,jz0);
764 dx11 = _mm_sub_ps(ix1,jx1);
765 dy11 = _mm_sub_ps(iy1,jy1);
766 dz11 = _mm_sub_ps(iz1,jz1);
767 dx12 = _mm_sub_ps(ix1,jx2);
768 dy12 = _mm_sub_ps(iy1,jy2);
769 dz12 = _mm_sub_ps(iz1,jz2);
770 dx20 = _mm_sub_ps(ix2,jx0);
771 dy20 = _mm_sub_ps(iy2,jy0);
772 dz20 = _mm_sub_ps(iz2,jz0);
773 dx21 = _mm_sub_ps(ix2,jx1);
774 dy21 = _mm_sub_ps(iy2,jy1);
775 dz21 = _mm_sub_ps(iz2,jz1);
776 dx22 = _mm_sub_ps(ix2,jx2);
777 dy22 = _mm_sub_ps(iy2,jy2);
778 dz22 = _mm_sub_ps(iz2,jz2);
779
780 /* Calculate squared distance and things based on it */
781 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
782 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
783 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
784 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
785 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
786 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
787 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
788 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
789 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
790
791 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
792 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
793 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
794 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
795 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
796 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
797 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
798 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
799 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
800
801 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
802 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
803 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
804 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
805 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
806 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
807 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
808 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
809 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
810
811 fjx0 = _mm_setzero_ps();
812 fjy0 = _mm_setzero_ps();
813 fjz0 = _mm_setzero_ps();
814 fjx1 = _mm_setzero_ps();
815 fjy1 = _mm_setzero_ps();
816 fjz1 = _mm_setzero_ps();
817 fjx2 = _mm_setzero_ps();
818 fjy2 = _mm_setzero_ps();
819 fjz2 = _mm_setzero_ps();
820
821 /**************************
822 * CALCULATE INTERACTIONS *
823 **************************/
824
825 r00 = _mm_mul_ps(rsq00,rinv00);
826 r00 = _mm_andnot_ps(dummy_mask,r00);
827
828 /* EWALD ELECTROSTATICS */
829
830 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
831 ewrt = _mm_mul_ps(r00,ewtabscale);
832 ewitab = _mm_cvttps_epi32(ewrt);
833 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
834 ewitab = _mm_slli_epi32(ewitab,2);
835 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
836 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
837 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
838 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
839 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
840 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
841 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
842 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
843 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
844
845 /* Analytical LJ-PME */
846 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
847 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
848 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
849 exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
850 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
851 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
852 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
853 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
854 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
855 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
856 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
857 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
858
859 /* Update potential sum for this i atom from the interaction with this j atom. */
860 velec = _mm_andnot_ps(dummy_mask,velec);
861 velecsum = _mm_add_ps(velecsum,velec);
862 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
863 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
864
865 fscal = _mm_add_ps(felec,fvdw);
866
867 fscal = _mm_andnot_ps(dummy_mask,fscal);
868
869 /* Calculate temporary vectorial force */
870 tx = _mm_mul_ps(fscal,dx00);
871 ty = _mm_mul_ps(fscal,dy00);
872 tz = _mm_mul_ps(fscal,dz00);
873
874 /* Update vectorial force */
875 fix0 = _mm_add_ps(fix0,tx);
876 fiy0 = _mm_add_ps(fiy0,ty);
877 fiz0 = _mm_add_ps(fiz0,tz);
878
879 fjx0 = _mm_add_ps(fjx0,tx);
880 fjy0 = _mm_add_ps(fjy0,ty);
881 fjz0 = _mm_add_ps(fjz0,tz);
882
883 /**************************
884 * CALCULATE INTERACTIONS *
885 **************************/
886
887 r01 = _mm_mul_ps(rsq01,rinv01);
888 r01 = _mm_andnot_ps(dummy_mask,r01);
889
890 /* EWALD ELECTROSTATICS */
891
892 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
893 ewrt = _mm_mul_ps(r01,ewtabscale);
894 ewitab = _mm_cvttps_epi32(ewrt);
895 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
896 ewitab = _mm_slli_epi32(ewitab,2);
897 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
898 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
899 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
900 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
901 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
902 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
903 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
904 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
905 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
906
907 /* Update potential sum for this i atom from the interaction with this j atom. */
908 velec = _mm_andnot_ps(dummy_mask,velec);
909 velecsum = _mm_add_ps(velecsum,velec);
910
911 fscal = felec;
912
913 fscal = _mm_andnot_ps(dummy_mask,fscal);
914
915 /* Calculate temporary vectorial force */
916 tx = _mm_mul_ps(fscal,dx01);
917 ty = _mm_mul_ps(fscal,dy01);
918 tz = _mm_mul_ps(fscal,dz01);
919
920 /* Update vectorial force */
921 fix0 = _mm_add_ps(fix0,tx);
922 fiy0 = _mm_add_ps(fiy0,ty);
923 fiz0 = _mm_add_ps(fiz0,tz);
924
925 fjx1 = _mm_add_ps(fjx1,tx);
926 fjy1 = _mm_add_ps(fjy1,ty);
927 fjz1 = _mm_add_ps(fjz1,tz);
928
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
932
933 r02 = _mm_mul_ps(rsq02,rinv02);
934 r02 = _mm_andnot_ps(dummy_mask,r02);
935
936 /* EWALD ELECTROSTATICS */
937
938 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
939 ewrt = _mm_mul_ps(r02,ewtabscale);
940 ewitab = _mm_cvttps_epi32(ewrt);
941 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
942 ewitab = _mm_slli_epi32(ewitab,2);
943 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
944 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
945 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
946 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
947 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
948 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
949 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
950 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
951 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
952
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec = _mm_andnot_ps(dummy_mask,velec);
955 velecsum = _mm_add_ps(velecsum,velec);
956
957 fscal = felec;
958
959 fscal = _mm_andnot_ps(dummy_mask,fscal);
960
961 /* Calculate temporary vectorial force */
962 tx = _mm_mul_ps(fscal,dx02);
963 ty = _mm_mul_ps(fscal,dy02);
964 tz = _mm_mul_ps(fscal,dz02);
965
966 /* Update vectorial force */
967 fix0 = _mm_add_ps(fix0,tx);
968 fiy0 = _mm_add_ps(fiy0,ty);
969 fiz0 = _mm_add_ps(fiz0,tz);
970
971 fjx2 = _mm_add_ps(fjx2,tx);
972 fjy2 = _mm_add_ps(fjy2,ty);
973 fjz2 = _mm_add_ps(fjz2,tz);
974
975 /**************************
976 * CALCULATE INTERACTIONS *
977 **************************/
978
979 r10 = _mm_mul_ps(rsq10,rinv10);
980 r10 = _mm_andnot_ps(dummy_mask,r10);
981
982 /* EWALD ELECTROSTATICS */
983
984 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
985 ewrt = _mm_mul_ps(r10,ewtabscale);
986 ewitab = _mm_cvttps_epi32(ewrt);
987 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
988 ewitab = _mm_slli_epi32(ewitab,2);
989 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
990 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
991 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
992 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
993 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
994 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
995 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
996 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
997 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
998
999 /* Update potential sum for this i atom from the interaction with this j atom. */
1000 velec = _mm_andnot_ps(dummy_mask,velec);
1001 velecsum = _mm_add_ps(velecsum,velec);
1002
1003 fscal = felec;
1004
1005 fscal = _mm_andnot_ps(dummy_mask,fscal);
1006
1007 /* Calculate temporary vectorial force */
1008 tx = _mm_mul_ps(fscal,dx10);
1009 ty = _mm_mul_ps(fscal,dy10);
1010 tz = _mm_mul_ps(fscal,dz10);
1011
1012 /* Update vectorial force */
1013 fix1 = _mm_add_ps(fix1,tx);
1014 fiy1 = _mm_add_ps(fiy1,ty);
1015 fiz1 = _mm_add_ps(fiz1,tz);
1016
1017 fjx0 = _mm_add_ps(fjx0,tx);
1018 fjy0 = _mm_add_ps(fjy0,ty);
1019 fjz0 = _mm_add_ps(fjz0,tz);
1020
1021 /**************************
1022 * CALCULATE INTERACTIONS *
1023 **************************/
1024
1025 r11 = _mm_mul_ps(rsq11,rinv11);
1026 r11 = _mm_andnot_ps(dummy_mask,r11);
1027
1028 /* EWALD ELECTROSTATICS */
1029
1030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031 ewrt = _mm_mul_ps(r11,ewtabscale);
1032 ewitab = _mm_cvttps_epi32(ewrt);
1033 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1034 ewitab = _mm_slli_epi32(ewitab,2);
1035 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1036 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1037 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1038 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1039 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1040 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1041 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1042 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1043 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1044
1045 /* Update potential sum for this i atom from the interaction with this j atom. */
1046 velec = _mm_andnot_ps(dummy_mask,velec);
1047 velecsum = _mm_add_ps(velecsum,velec);
1048
1049 fscal = felec;
1050
1051 fscal = _mm_andnot_ps(dummy_mask,fscal);
1052
1053 /* Calculate temporary vectorial force */
1054 tx = _mm_mul_ps(fscal,dx11);
1055 ty = _mm_mul_ps(fscal,dy11);
1056 tz = _mm_mul_ps(fscal,dz11);
1057
1058 /* Update vectorial force */
1059 fix1 = _mm_add_ps(fix1,tx);
1060 fiy1 = _mm_add_ps(fiy1,ty);
1061 fiz1 = _mm_add_ps(fiz1,tz);
1062
1063 fjx1 = _mm_add_ps(fjx1,tx);
1064 fjy1 = _mm_add_ps(fjy1,ty);
1065 fjz1 = _mm_add_ps(fjz1,tz);
1066
1067 /**************************
1068 * CALCULATE INTERACTIONS *
1069 **************************/
1070
1071 r12 = _mm_mul_ps(rsq12,rinv12);
1072 r12 = _mm_andnot_ps(dummy_mask,r12);
1073
1074 /* EWALD ELECTROSTATICS */
1075
1076 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1077 ewrt = _mm_mul_ps(r12,ewtabscale);
1078 ewitab = _mm_cvttps_epi32(ewrt);
1079 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1080 ewitab = _mm_slli_epi32(ewitab,2);
1081 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1082 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1083 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1084 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1085 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1086 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1087 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1088 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1089 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1090
1091 /* Update potential sum for this i atom from the interaction with this j atom. */
1092 velec = _mm_andnot_ps(dummy_mask,velec);
1093 velecsum = _mm_add_ps(velecsum,velec);
1094
1095 fscal = felec;
1096
1097 fscal = _mm_andnot_ps(dummy_mask,fscal);
1098
1099 /* Calculate temporary vectorial force */
1100 tx = _mm_mul_ps(fscal,dx12);
1101 ty = _mm_mul_ps(fscal,dy12);
1102 tz = _mm_mul_ps(fscal,dz12);
1103
1104 /* Update vectorial force */
1105 fix1 = _mm_add_ps(fix1,tx);
1106 fiy1 = _mm_add_ps(fiy1,ty);
1107 fiz1 = _mm_add_ps(fiz1,tz);
1108
1109 fjx2 = _mm_add_ps(fjx2,tx);
1110 fjy2 = _mm_add_ps(fjy2,ty);
1111 fjz2 = _mm_add_ps(fjz2,tz);
1112
1113 /**************************
1114 * CALCULATE INTERACTIONS *
1115 **************************/
1116
1117 r20 = _mm_mul_ps(rsq20,rinv20);
1118 r20 = _mm_andnot_ps(dummy_mask,r20);
1119
1120 /* EWALD ELECTROSTATICS */
1121
1122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1123 ewrt = _mm_mul_ps(r20,ewtabscale);
1124 ewitab = _mm_cvttps_epi32(ewrt);
1125 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1126 ewitab = _mm_slli_epi32(ewitab,2);
1127 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1128 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1129 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1130 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1131 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1132 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1133 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1134 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1135 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1136
1137 /* Update potential sum for this i atom from the interaction with this j atom. */
1138 velec = _mm_andnot_ps(dummy_mask,velec);
1139 velecsum = _mm_add_ps(velecsum,velec);
1140
1141 fscal = felec;
1142
1143 fscal = _mm_andnot_ps(dummy_mask,fscal);
1144
1145 /* Calculate temporary vectorial force */
1146 tx = _mm_mul_ps(fscal,dx20);
1147 ty = _mm_mul_ps(fscal,dy20);
1148 tz = _mm_mul_ps(fscal,dz20);
1149
1150 /* Update vectorial force */
1151 fix2 = _mm_add_ps(fix2,tx);
1152 fiy2 = _mm_add_ps(fiy2,ty);
1153 fiz2 = _mm_add_ps(fiz2,tz);
1154
1155 fjx0 = _mm_add_ps(fjx0,tx);
1156 fjy0 = _mm_add_ps(fjy0,ty);
1157 fjz0 = _mm_add_ps(fjz0,tz);
1158
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1162
1163 r21 = _mm_mul_ps(rsq21,rinv21);
1164 r21 = _mm_andnot_ps(dummy_mask,r21);
1165
1166 /* EWALD ELECTROSTATICS */
1167
1168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169 ewrt = _mm_mul_ps(r21,ewtabscale);
1170 ewitab = _mm_cvttps_epi32(ewrt);
1171 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1172 ewitab = _mm_slli_epi32(ewitab,2);
1173 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1174 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1175 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1176 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1177 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1178 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1179 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1180 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1181 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1182
1183 /* Update potential sum for this i atom from the interaction with this j atom. */
1184 velec = _mm_andnot_ps(dummy_mask,velec);
1185 velecsum = _mm_add_ps(velecsum,velec);
1186
1187 fscal = felec;
1188
1189 fscal = _mm_andnot_ps(dummy_mask,fscal);
1190
1191 /* Calculate temporary vectorial force */
1192 tx = _mm_mul_ps(fscal,dx21);
1193 ty = _mm_mul_ps(fscal,dy21);
1194 tz = _mm_mul_ps(fscal,dz21);
1195
1196 /* Update vectorial force */
1197 fix2 = _mm_add_ps(fix2,tx);
1198 fiy2 = _mm_add_ps(fiy2,ty);
1199 fiz2 = _mm_add_ps(fiz2,tz);
1200
1201 fjx1 = _mm_add_ps(fjx1,tx);
1202 fjy1 = _mm_add_ps(fjy1,ty);
1203 fjz1 = _mm_add_ps(fjz1,tz);
1204
1205 /**************************
1206 * CALCULATE INTERACTIONS *
1207 **************************/
1208
1209 r22 = _mm_mul_ps(rsq22,rinv22);
1210 r22 = _mm_andnot_ps(dummy_mask,r22);
1211
1212 /* EWALD ELECTROSTATICS */
1213
1214 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1215 ewrt = _mm_mul_ps(r22,ewtabscale);
1216 ewitab = _mm_cvttps_epi32(ewrt);
1217 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1218 ewitab = _mm_slli_epi32(ewitab,2);
1219 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1220 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1221 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1222 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1223 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1224 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1225 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1226 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1227 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1228
1229 /* Update potential sum for this i atom from the interaction with this j atom. */
1230 velec = _mm_andnot_ps(dummy_mask,velec);
1231 velecsum = _mm_add_ps(velecsum,velec);
1232
1233 fscal = felec;
1234
1235 fscal = _mm_andnot_ps(dummy_mask,fscal);
1236
1237 /* Calculate temporary vectorial force */
1238 tx = _mm_mul_ps(fscal,dx22);
1239 ty = _mm_mul_ps(fscal,dy22);
1240 tz = _mm_mul_ps(fscal,dz22);
1241
1242 /* Update vectorial force */
1243 fix2 = _mm_add_ps(fix2,tx);
1244 fiy2 = _mm_add_ps(fiy2,ty);
1245 fiz2 = _mm_add_ps(fiz2,tz);
1246
1247 fjx2 = _mm_add_ps(fjx2,tx);
1248 fjy2 = _mm_add_ps(fjy2,ty);
1249 fjz2 = _mm_add_ps(fjz2,tz);
1250
1251 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1252 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1253 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1254 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1255
1256 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1257 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1258
1259 /* Inner loop uses 406 flops */
1260 }
1261
1262 /* End of innermost loop */
1263
1264 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1265 f+i_coord_offset,fshift+i_shift_offset);
1266
1267 ggid = gid[iidx];
1268 /* Update potential energies */
1269 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1270 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1271
1272 /* Increment number of inner iterations */
1273 inneriter += j_index_end - j_index_start;
1274
1275 /* Outer loop uses 20 flops */
1276 }
1277
1278 /* Increment number of outer iterations */
1279 outeriter += nri;
1280
1281 /* Update outer/inner flops */
1282
1283 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*406)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_VF] += outeriter*20 +
inneriter*406
;
1284}
1285/*
1286 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_single
1287 * Electrostatics interaction: Ewald
1288 * VdW interaction: LJEwald
1289 * Geometry: Water3-Water3
1290 * Calculate force/pot: Force
1291 */
1292void
1293nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_single
1294 (t_nblist * gmx_restrict nlist,
1295 rvec * gmx_restrict xx,
1296 rvec * gmx_restrict ff,
1297 t_forcerec * gmx_restrict fr,
1298 t_mdatoms * gmx_restrict mdatoms,
1299 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1300 t_nrnb * gmx_restrict nrnb)
1301{
1302 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1303 * just 0 for non-waters.
1304 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1305 * jnr indices corresponding to data put in the four positions in the SIMD register.
1306 */
1307 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1308 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1309 int jnrA,jnrB,jnrC,jnrD;
1310 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1311 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1312 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1313 real rcutoff_scalar;
1314 real *shiftvec,*fshift,*x,*f;
1315 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1316 real scratch[4*DIM3];
1317 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1318 int vdwioffset0;
1319 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1320 int vdwioffset1;
1321 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1322 int vdwioffset2;
1323 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1324 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1325 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1326 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1327 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1328 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1329 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1330 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1331 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1332 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1333 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1334 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1335 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1336 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1337 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1338 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1339 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1340 real *charge;
1341 int nvdwtype;
1342 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1343 int *vdwtype;
1344 real *vdwparam;
1345 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1346 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1347 __m128 c6grid_00;
1348 __m128 c6grid_01;
1349 __m128 c6grid_02;
1350 __m128 c6grid_10;
1351 __m128 c6grid_11;
1352 __m128 c6grid_12;
1353 __m128 c6grid_20;
1354 __m128 c6grid_21;
1355 __m128 c6grid_22;
1356 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1357 real *vdwgridparam;
1358 __m128 one_half = _mm_set1_ps(0.5);
1359 __m128 minus_one = _mm_set1_ps(-1.0);
1360 __m128i ewitab;
1361 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1362 real *ewtab;
1363 __m128 dummy_mask,cutoff_mask;
1364 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1365 __m128 one = _mm_set1_ps(1.0);
1366 __m128 two = _mm_set1_ps(2.0);
1367 x = xx[0];
1368 f = ff[0];
1369
1370 nri = nlist->nri;
1371 iinr = nlist->iinr;
1372 jindex = nlist->jindex;
1373 jjnr = nlist->jjnr;
1374 shiftidx = nlist->shift;
1375 gid = nlist->gid;
1376 shiftvec = fr->shift_vec[0];
1377 fshift = fr->fshift[0];
1378 facel = _mm_set1_ps(fr->epsfac);
1379 charge = mdatoms->chargeA;
1380 nvdwtype = fr->ntype;
1381 vdwparam = fr->nbfp;
1382 vdwtype = mdatoms->typeA;
1383 vdwgridparam = fr->ljpme_c6grid;
1384 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1385 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
1386 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1387
1388 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1389 ewtab = fr->ic->tabq_coul_F;
1390 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1391 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1392
1393 /* Setup water-specific parameters */
1394 inr = nlist->iinr[0];
1395 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1396 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1397 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1398 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1399
1400 jq0 = _mm_set1_ps(charge[inr+0]);
1401 jq1 = _mm_set1_ps(charge[inr+1]);
1402 jq2 = _mm_set1_ps(charge[inr+2]);
1403 vdwjidx0A = 2*vdwtype[inr+0];
1404 qq00 = _mm_mul_ps(iq0,jq0);
1405 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1406 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1407 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1408 qq01 = _mm_mul_ps(iq0,jq1);
1409 qq02 = _mm_mul_ps(iq0,jq2);
1410 qq10 = _mm_mul_ps(iq1,jq0);
1411 qq11 = _mm_mul_ps(iq1,jq1);
1412 qq12 = _mm_mul_ps(iq1,jq2);
1413 qq20 = _mm_mul_ps(iq2,jq0);
1414 qq21 = _mm_mul_ps(iq2,jq1);
1415 qq22 = _mm_mul_ps(iq2,jq2);
1416
1417 /* Avoid stupid compiler warnings */
1418 jnrA = jnrB = jnrC = jnrD = 0;
Value stored to 'jnrA' is never read
1419 j_coord_offsetA = 0;
1420 j_coord_offsetB = 0;
1421 j_coord_offsetC = 0;
1422 j_coord_offsetD = 0;
1423
1424 outeriter = 0;
1425 inneriter = 0;
1426
1427 for(iidx=0;iidx<4*DIM3;iidx++)
1428 {
1429 scratch[iidx] = 0.0;
1430 }
1431
1432 /* Start outer loop over neighborlists */
1433 for(iidx=0; iidx<nri; iidx++)
1434 {
1435 /* Load shift vector for this list */
1436 i_shift_offset = DIM3*shiftidx[iidx];
1437
1438 /* Load limits for loop over neighbors */
1439 j_index_start = jindex[iidx];
1440 j_index_end = jindex[iidx+1];
1441
1442 /* Get outer coordinate index */
1443 inr = iinr[iidx];
1444 i_coord_offset = DIM3*inr;
1445
1446 /* Load i particle coords and add shift vector */
1447 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1448 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1449
1450 fix0 = _mm_setzero_ps();
1451 fiy0 = _mm_setzero_ps();
1452 fiz0 = _mm_setzero_ps();
1453 fix1 = _mm_setzero_ps();
1454 fiy1 = _mm_setzero_ps();
1455 fiz1 = _mm_setzero_ps();
1456 fix2 = _mm_setzero_ps();
1457 fiy2 = _mm_setzero_ps();
1458 fiz2 = _mm_setzero_ps();
1459
1460 /* Start inner kernel loop */
1461 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1462 {
1463
1464 /* Get j neighbor index, and coordinate index */
1465 jnrA = jjnr[jidx];
1466 jnrB = jjnr[jidx+1];
1467 jnrC = jjnr[jidx+2];
1468 jnrD = jjnr[jidx+3];
1469 j_coord_offsetA = DIM3*jnrA;
1470 j_coord_offsetB = DIM3*jnrB;
1471 j_coord_offsetC = DIM3*jnrC;
1472 j_coord_offsetD = DIM3*jnrD;
1473
1474 /* load j atom coordinates */
1475 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1476 x+j_coord_offsetC,x+j_coord_offsetD,
1477 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1478
1479 /* Calculate displacement vector */
1480 dx00 = _mm_sub_ps(ix0,jx0);
1481 dy00 = _mm_sub_ps(iy0,jy0);
1482 dz00 = _mm_sub_ps(iz0,jz0);
1483 dx01 = _mm_sub_ps(ix0,jx1);
1484 dy01 = _mm_sub_ps(iy0,jy1);
1485 dz01 = _mm_sub_ps(iz0,jz1);
1486 dx02 = _mm_sub_ps(ix0,jx2);
1487 dy02 = _mm_sub_ps(iy0,jy2);
1488 dz02 = _mm_sub_ps(iz0,jz2);
1489 dx10 = _mm_sub_ps(ix1,jx0);
1490 dy10 = _mm_sub_ps(iy1,jy0);
1491 dz10 = _mm_sub_ps(iz1,jz0);
1492 dx11 = _mm_sub_ps(ix1,jx1);
1493 dy11 = _mm_sub_ps(iy1,jy1);
1494 dz11 = _mm_sub_ps(iz1,jz1);
1495 dx12 = _mm_sub_ps(ix1,jx2);
1496 dy12 = _mm_sub_ps(iy1,jy2);
1497 dz12 = _mm_sub_ps(iz1,jz2);
1498 dx20 = _mm_sub_ps(ix2,jx0);
1499 dy20 = _mm_sub_ps(iy2,jy0);
1500 dz20 = _mm_sub_ps(iz2,jz0);
1501 dx21 = _mm_sub_ps(ix2,jx1);
1502 dy21 = _mm_sub_ps(iy2,jy1);
1503 dz21 = _mm_sub_ps(iz2,jz1);
1504 dx22 = _mm_sub_ps(ix2,jx2);
1505 dy22 = _mm_sub_ps(iy2,jy2);
1506 dz22 = _mm_sub_ps(iz2,jz2);
1507
1508 /* Calculate squared distance and things based on it */
1509 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1510 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1511 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1512 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1513 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1514 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1515 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1516 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1517 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1518
1519 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1520 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
1521 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
1522 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1523 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1524 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1525 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1526 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1527 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1528
1529 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1530 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1531 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1532 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1533 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1534 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1535 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1536 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1537 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1538
1539 fjx0 = _mm_setzero_ps();
1540 fjy0 = _mm_setzero_ps();
1541 fjz0 = _mm_setzero_ps();
1542 fjx1 = _mm_setzero_ps();
1543 fjy1 = _mm_setzero_ps();
1544 fjz1 = _mm_setzero_ps();
1545 fjx2 = _mm_setzero_ps();
1546 fjy2 = _mm_setzero_ps();
1547 fjz2 = _mm_setzero_ps();
1548
1549 /**************************
1550 * CALCULATE INTERACTIONS *
1551 **************************/
1552
1553 r00 = _mm_mul_ps(rsq00,rinv00);
1554
1555 /* EWALD ELECTROSTATICS */
1556
1557 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1558 ewrt = _mm_mul_ps(r00,ewtabscale);
1559 ewitab = _mm_cvttps_epi32(ewrt);
1560 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1561 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1562 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1563 &ewtabF,&ewtabFn);
1564 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1565 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1566
1567 /* Analytical LJ-PME */
1568 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1569 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1570 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1571 exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
1572 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1573 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1574 /* f6A = 6 * C6grid * (1 - poly) */
1575 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1576 /* f6B = C6grid * exponent * beta^6 */
1577 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1578 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1579 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1580
1581 fscal = _mm_add_ps(felec,fvdw);
1582
1583 /* Calculate temporary vectorial force */
1584 tx = _mm_mul_ps(fscal,dx00);
1585 ty = _mm_mul_ps(fscal,dy00);
1586 tz = _mm_mul_ps(fscal,dz00);
1587
1588 /* Update vectorial force */
1589 fix0 = _mm_add_ps(fix0,tx);
1590 fiy0 = _mm_add_ps(fiy0,ty);
1591 fiz0 = _mm_add_ps(fiz0,tz);
1592
1593 fjx0 = _mm_add_ps(fjx0,tx);
1594 fjy0 = _mm_add_ps(fjy0,ty);
1595 fjz0 = _mm_add_ps(fjz0,tz);
1596
1597 /**************************
1598 * CALCULATE INTERACTIONS *
1599 **************************/
1600
1601 r01 = _mm_mul_ps(rsq01,rinv01);
1602
1603 /* EWALD ELECTROSTATICS */
1604
1605 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1606 ewrt = _mm_mul_ps(r01,ewtabscale);
1607 ewitab = _mm_cvttps_epi32(ewrt);
1608 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1609 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1610 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1611 &ewtabF,&ewtabFn);
1612 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1613 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1614
1615 fscal = felec;
1616
1617 /* Calculate temporary vectorial force */
1618 tx = _mm_mul_ps(fscal,dx01);
1619 ty = _mm_mul_ps(fscal,dy01);
1620 tz = _mm_mul_ps(fscal,dz01);
1621
1622 /* Update vectorial force */
1623 fix0 = _mm_add_ps(fix0,tx);
1624 fiy0 = _mm_add_ps(fiy0,ty);
1625 fiz0 = _mm_add_ps(fiz0,tz);
1626
1627 fjx1 = _mm_add_ps(fjx1,tx);
1628 fjy1 = _mm_add_ps(fjy1,ty);
1629 fjz1 = _mm_add_ps(fjz1,tz);
1630
1631 /**************************
1632 * CALCULATE INTERACTIONS *
1633 **************************/
1634
1635 r02 = _mm_mul_ps(rsq02,rinv02);
1636
1637 /* EWALD ELECTROSTATICS */
1638
1639 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1640 ewrt = _mm_mul_ps(r02,ewtabscale);
1641 ewitab = _mm_cvttps_epi32(ewrt);
1642 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1643 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1644 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1645 &ewtabF,&ewtabFn);
1646 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1647 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1648
1649 fscal = felec;
1650
1651 /* Calculate temporary vectorial force */
1652 tx = _mm_mul_ps(fscal,dx02);
1653 ty = _mm_mul_ps(fscal,dy02);
1654 tz = _mm_mul_ps(fscal,dz02);
1655
1656 /* Update vectorial force */
1657 fix0 = _mm_add_ps(fix0,tx);
1658 fiy0 = _mm_add_ps(fiy0,ty);
1659 fiz0 = _mm_add_ps(fiz0,tz);
1660
1661 fjx2 = _mm_add_ps(fjx2,tx);
1662 fjy2 = _mm_add_ps(fjy2,ty);
1663 fjz2 = _mm_add_ps(fjz2,tz);
1664
1665 /**************************
1666 * CALCULATE INTERACTIONS *
1667 **************************/
1668
1669 r10 = _mm_mul_ps(rsq10,rinv10);
1670
1671 /* EWALD ELECTROSTATICS */
1672
1673 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1674 ewrt = _mm_mul_ps(r10,ewtabscale);
1675 ewitab = _mm_cvttps_epi32(ewrt);
1676 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1677 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1678 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1679 &ewtabF,&ewtabFn);
1680 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1681 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1682
1683 fscal = felec;
1684
1685 /* Calculate temporary vectorial force */
1686 tx = _mm_mul_ps(fscal,dx10);
1687 ty = _mm_mul_ps(fscal,dy10);
1688 tz = _mm_mul_ps(fscal,dz10);
1689
1690 /* Update vectorial force */
1691 fix1 = _mm_add_ps(fix1,tx);
1692 fiy1 = _mm_add_ps(fiy1,ty);
1693 fiz1 = _mm_add_ps(fiz1,tz);
1694
1695 fjx0 = _mm_add_ps(fjx0,tx);
1696 fjy0 = _mm_add_ps(fjy0,ty);
1697 fjz0 = _mm_add_ps(fjz0,tz);
1698
1699 /**************************
1700 * CALCULATE INTERACTIONS *
1701 **************************/
1702
1703 r11 = _mm_mul_ps(rsq11,rinv11);
1704
1705 /* EWALD ELECTROSTATICS */
1706
1707 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1708 ewrt = _mm_mul_ps(r11,ewtabscale);
1709 ewitab = _mm_cvttps_epi32(ewrt);
1710 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1711 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1712 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1713 &ewtabF,&ewtabFn);
1714 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1715 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1716
1717 fscal = felec;
1718
1719 /* Calculate temporary vectorial force */
1720 tx = _mm_mul_ps(fscal,dx11);
1721 ty = _mm_mul_ps(fscal,dy11);
1722 tz = _mm_mul_ps(fscal,dz11);
1723
1724 /* Update vectorial force */
1725 fix1 = _mm_add_ps(fix1,tx);
1726 fiy1 = _mm_add_ps(fiy1,ty);
1727 fiz1 = _mm_add_ps(fiz1,tz);
1728
1729 fjx1 = _mm_add_ps(fjx1,tx);
1730 fjy1 = _mm_add_ps(fjy1,ty);
1731 fjz1 = _mm_add_ps(fjz1,tz);
1732
1733 /**************************
1734 * CALCULATE INTERACTIONS *
1735 **************************/
1736
1737 r12 = _mm_mul_ps(rsq12,rinv12);
1738
1739 /* EWALD ELECTROSTATICS */
1740
1741 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1742 ewrt = _mm_mul_ps(r12,ewtabscale);
1743 ewitab = _mm_cvttps_epi32(ewrt);
1744 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1745 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1746 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1747 &ewtabF,&ewtabFn);
1748 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1749 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1750
1751 fscal = felec;
1752
1753 /* Calculate temporary vectorial force */
1754 tx = _mm_mul_ps(fscal,dx12);
1755 ty = _mm_mul_ps(fscal,dy12);
1756 tz = _mm_mul_ps(fscal,dz12);
1757
1758 /* Update vectorial force */
1759 fix1 = _mm_add_ps(fix1,tx);
1760 fiy1 = _mm_add_ps(fiy1,ty);
1761 fiz1 = _mm_add_ps(fiz1,tz);
1762
1763 fjx2 = _mm_add_ps(fjx2,tx);
1764 fjy2 = _mm_add_ps(fjy2,ty);
1765 fjz2 = _mm_add_ps(fjz2,tz);
1766
1767 /**************************
1768 * CALCULATE INTERACTIONS *
1769 **************************/
1770
1771 r20 = _mm_mul_ps(rsq20,rinv20);
1772
1773 /* EWALD ELECTROSTATICS */
1774
1775 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1776 ewrt = _mm_mul_ps(r20,ewtabscale);
1777 ewitab = _mm_cvttps_epi32(ewrt);
1778 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1779 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1780 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1781 &ewtabF,&ewtabFn);
1782 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1783 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1784
1785 fscal = felec;
1786
1787 /* Calculate temporary vectorial force */
1788 tx = _mm_mul_ps(fscal,dx20);
1789 ty = _mm_mul_ps(fscal,dy20);
1790 tz = _mm_mul_ps(fscal,dz20);
1791
1792 /* Update vectorial force */
1793 fix2 = _mm_add_ps(fix2,tx);
1794 fiy2 = _mm_add_ps(fiy2,ty);
1795 fiz2 = _mm_add_ps(fiz2,tz);
1796
1797 fjx0 = _mm_add_ps(fjx0,tx);
1798 fjy0 = _mm_add_ps(fjy0,ty);
1799 fjz0 = _mm_add_ps(fjz0,tz);
1800
1801 /**************************
1802 * CALCULATE INTERACTIONS *
1803 **************************/
1804
1805 r21 = _mm_mul_ps(rsq21,rinv21);
1806
1807 /* EWALD ELECTROSTATICS */
1808
1809 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1810 ewrt = _mm_mul_ps(r21,ewtabscale);
1811 ewitab = _mm_cvttps_epi32(ewrt);
1812 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1813 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1814 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1815 &ewtabF,&ewtabFn);
1816 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1817 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1818
1819 fscal = felec;
1820
1821 /* Calculate temporary vectorial force */
1822 tx = _mm_mul_ps(fscal,dx21);
1823 ty = _mm_mul_ps(fscal,dy21);
1824 tz = _mm_mul_ps(fscal,dz21);
1825
1826 /* Update vectorial force */
1827 fix2 = _mm_add_ps(fix2,tx);
1828 fiy2 = _mm_add_ps(fiy2,ty);
1829 fiz2 = _mm_add_ps(fiz2,tz);
1830
1831 fjx1 = _mm_add_ps(fjx1,tx);
1832 fjy1 = _mm_add_ps(fjy1,ty);
1833 fjz1 = _mm_add_ps(fjz1,tz);
1834
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1838
1839 r22 = _mm_mul_ps(rsq22,rinv22);
1840
1841 /* EWALD ELECTROSTATICS */
1842
1843 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1844 ewrt = _mm_mul_ps(r22,ewtabscale);
1845 ewitab = _mm_cvttps_epi32(ewrt);
1846 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1847 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1848 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1849 &ewtabF,&ewtabFn);
1850 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1851 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1852
1853 fscal = felec;
1854
1855 /* Calculate temporary vectorial force */
1856 tx = _mm_mul_ps(fscal,dx22);
1857 ty = _mm_mul_ps(fscal,dy22);
1858 tz = _mm_mul_ps(fscal,dz22);
1859
1860 /* Update vectorial force */
1861 fix2 = _mm_add_ps(fix2,tx);
1862 fiy2 = _mm_add_ps(fiy2,ty);
1863 fiz2 = _mm_add_ps(fiz2,tz);
1864
1865 fjx2 = _mm_add_ps(fjx2,tx);
1866 fjy2 = _mm_add_ps(fjy2,ty);
1867 fjz2 = _mm_add_ps(fjz2,tz);
1868
1869 fjptrA = f+j_coord_offsetA;
1870 fjptrB = f+j_coord_offsetB;
1871 fjptrC = f+j_coord_offsetC;
1872 fjptrD = f+j_coord_offsetD;
1873
1874 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1875 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1876
1877 /* Inner loop uses 347 flops */
1878 }
1879
1880 if(jidx<j_index_end)
1881 {
1882
1883 /* Get j neighbor index, and coordinate index */
1884 jnrlistA = jjnr[jidx];
1885 jnrlistB = jjnr[jidx+1];
1886 jnrlistC = jjnr[jidx+2];
1887 jnrlistD = jjnr[jidx+3];
1888 /* Sign of each element will be negative for non-real atoms.
1889 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1890 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1891 */
1892 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1893 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1894 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1895 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1896 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1897 j_coord_offsetA = DIM3*jnrA;
1898 j_coord_offsetB = DIM3*jnrB;
1899 j_coord_offsetC = DIM3*jnrC;
1900 j_coord_offsetD = DIM3*jnrD;
1901
1902 /* load j atom coordinates */
1903 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1904 x+j_coord_offsetC,x+j_coord_offsetD,
1905 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1906
1907 /* Calculate displacement vector */
1908 dx00 = _mm_sub_ps(ix0,jx0);
1909 dy00 = _mm_sub_ps(iy0,jy0);
1910 dz00 = _mm_sub_ps(iz0,jz0);
1911 dx01 = _mm_sub_ps(ix0,jx1);
1912 dy01 = _mm_sub_ps(iy0,jy1);
1913 dz01 = _mm_sub_ps(iz0,jz1);
1914 dx02 = _mm_sub_ps(ix0,jx2);
1915 dy02 = _mm_sub_ps(iy0,jy2);
1916 dz02 = _mm_sub_ps(iz0,jz2);
1917 dx10 = _mm_sub_ps(ix1,jx0);
1918 dy10 = _mm_sub_ps(iy1,jy0);
1919 dz10 = _mm_sub_ps(iz1,jz0);
1920 dx11 = _mm_sub_ps(ix1,jx1);
1921 dy11 = _mm_sub_ps(iy1,jy1);
1922 dz11 = _mm_sub_ps(iz1,jz1);
1923 dx12 = _mm_sub_ps(ix1,jx2);
1924 dy12 = _mm_sub_ps(iy1,jy2);
1925 dz12 = _mm_sub_ps(iz1,jz2);
1926 dx20 = _mm_sub_ps(ix2,jx0);
1927 dy20 = _mm_sub_ps(iy2,jy0);
1928 dz20 = _mm_sub_ps(iz2,jz0);
1929 dx21 = _mm_sub_ps(ix2,jx1);
1930 dy21 = _mm_sub_ps(iy2,jy1);
1931 dz21 = _mm_sub_ps(iz2,jz1);
1932 dx22 = _mm_sub_ps(ix2,jx2);
1933 dy22 = _mm_sub_ps(iy2,jy2);
1934 dz22 = _mm_sub_ps(iz2,jz2);
1935
1936 /* Calculate squared distance and things based on it */
1937 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1938 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1939 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1940 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1941 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1942 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1943 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1944 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1945 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1946
1947 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1948 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
1949 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
1950 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1951 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1952 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1953 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1954 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1955 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1956
1957 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1958 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1959 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1960 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1961 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1962 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1963 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1964 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1965 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1966
1967 fjx0 = _mm_setzero_ps();
1968 fjy0 = _mm_setzero_ps();
1969 fjz0 = _mm_setzero_ps();
1970 fjx1 = _mm_setzero_ps();
1971 fjy1 = _mm_setzero_ps();
1972 fjz1 = _mm_setzero_ps();
1973 fjx2 = _mm_setzero_ps();
1974 fjy2 = _mm_setzero_ps();
1975 fjz2 = _mm_setzero_ps();
1976
1977 /**************************
1978 * CALCULATE INTERACTIONS *
1979 **************************/
1980
1981 r00 = _mm_mul_ps(rsq00,rinv00);
1982 r00 = _mm_andnot_ps(dummy_mask,r00);
1983
1984 /* EWALD ELECTROSTATICS */
1985
1986 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1987 ewrt = _mm_mul_ps(r00,ewtabscale);
1988 ewitab = _mm_cvttps_epi32(ewrt);
1989 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1990 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1991 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1992 &ewtabF,&ewtabFn);
1993 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1994 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1995
1996 /* Analytical LJ-PME */
1997 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1998 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1999 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2000 exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
2001 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2002 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2003 /* f6A = 6 * C6grid * (1 - poly) */
2004 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2005 /* f6B = C6grid * exponent * beta^6 */
2006 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2007 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2008 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2009
2010 fscal = _mm_add_ps(felec,fvdw);
2011
2012 fscal = _mm_andnot_ps(dummy_mask,fscal);
2013
2014 /* Calculate temporary vectorial force */
2015 tx = _mm_mul_ps(fscal,dx00);
2016 ty = _mm_mul_ps(fscal,dy00);
2017 tz = _mm_mul_ps(fscal,dz00);
2018
2019 /* Update vectorial force */
2020 fix0 = _mm_add_ps(fix0,tx);
2021 fiy0 = _mm_add_ps(fiy0,ty);
2022 fiz0 = _mm_add_ps(fiz0,tz);
2023
2024 fjx0 = _mm_add_ps(fjx0,tx);
2025 fjy0 = _mm_add_ps(fjy0,ty);
2026 fjz0 = _mm_add_ps(fjz0,tz);
2027
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2031
2032 r01 = _mm_mul_ps(rsq01,rinv01);
2033 r01 = _mm_andnot_ps(dummy_mask,r01);
2034
2035 /* EWALD ELECTROSTATICS */
2036
2037 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2038 ewrt = _mm_mul_ps(r01,ewtabscale);
2039 ewitab = _mm_cvttps_epi32(ewrt);
2040 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2041 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2042 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2043 &ewtabF,&ewtabFn);
2044 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2045 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2046
2047 fscal = felec;
2048
2049 fscal = _mm_andnot_ps(dummy_mask,fscal);
2050
2051 /* Calculate temporary vectorial force */
2052 tx = _mm_mul_ps(fscal,dx01);
2053 ty = _mm_mul_ps(fscal,dy01);
2054 tz = _mm_mul_ps(fscal,dz01);
2055
2056 /* Update vectorial force */
2057 fix0 = _mm_add_ps(fix0,tx);
2058 fiy0 = _mm_add_ps(fiy0,ty);
2059 fiz0 = _mm_add_ps(fiz0,tz);
2060
2061 fjx1 = _mm_add_ps(fjx1,tx);
2062 fjy1 = _mm_add_ps(fjy1,ty);
2063 fjz1 = _mm_add_ps(fjz1,tz);
2064
2065 /**************************
2066 * CALCULATE INTERACTIONS *
2067 **************************/
2068
2069 r02 = _mm_mul_ps(rsq02,rinv02);
2070 r02 = _mm_andnot_ps(dummy_mask,r02);
2071
2072 /* EWALD ELECTROSTATICS */
2073
2074 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2075 ewrt = _mm_mul_ps(r02,ewtabscale);
2076 ewitab = _mm_cvttps_epi32(ewrt);
2077 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2078 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2079 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2080 &ewtabF,&ewtabFn);
2081 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2082 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2083
2084 fscal = felec;
2085
2086 fscal = _mm_andnot_ps(dummy_mask,fscal);
2087
2088 /* Calculate temporary vectorial force */
2089 tx = _mm_mul_ps(fscal,dx02);
2090 ty = _mm_mul_ps(fscal,dy02);
2091 tz = _mm_mul_ps(fscal,dz02);
2092
2093 /* Update vectorial force */
2094 fix0 = _mm_add_ps(fix0,tx);
2095 fiy0 = _mm_add_ps(fiy0,ty);
2096 fiz0 = _mm_add_ps(fiz0,tz);
2097
2098 fjx2 = _mm_add_ps(fjx2,tx);
2099 fjy2 = _mm_add_ps(fjy2,ty);
2100 fjz2 = _mm_add_ps(fjz2,tz);
2101
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2105
2106 r10 = _mm_mul_ps(rsq10,rinv10);
2107 r10 = _mm_andnot_ps(dummy_mask,r10);
2108
2109 /* EWALD ELECTROSTATICS */
2110
2111 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112 ewrt = _mm_mul_ps(r10,ewtabscale);
2113 ewitab = _mm_cvttps_epi32(ewrt);
2114 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2115 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2116 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2117 &ewtabF,&ewtabFn);
2118 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2119 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2120
2121 fscal = felec;
2122
2123 fscal = _mm_andnot_ps(dummy_mask,fscal);
2124
2125 /* Calculate temporary vectorial force */
2126 tx = _mm_mul_ps(fscal,dx10);
2127 ty = _mm_mul_ps(fscal,dy10);
2128 tz = _mm_mul_ps(fscal,dz10);
2129
2130 /* Update vectorial force */
2131 fix1 = _mm_add_ps(fix1,tx);
2132 fiy1 = _mm_add_ps(fiy1,ty);
2133 fiz1 = _mm_add_ps(fiz1,tz);
2134
2135 fjx0 = _mm_add_ps(fjx0,tx);
2136 fjy0 = _mm_add_ps(fjy0,ty);
2137 fjz0 = _mm_add_ps(fjz0,tz);
2138
2139 /**************************
2140 * CALCULATE INTERACTIONS *
2141 **************************/
2142
2143 r11 = _mm_mul_ps(rsq11,rinv11);
2144 r11 = _mm_andnot_ps(dummy_mask,r11);
2145
2146 /* EWALD ELECTROSTATICS */
2147
2148 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2149 ewrt = _mm_mul_ps(r11,ewtabscale);
2150 ewitab = _mm_cvttps_epi32(ewrt);
2151 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2152 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2153 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2154 &ewtabF,&ewtabFn);
2155 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2156 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2157
2158 fscal = felec;
2159
2160 fscal = _mm_andnot_ps(dummy_mask,fscal);
2161
2162 /* Calculate temporary vectorial force */
2163 tx = _mm_mul_ps(fscal,dx11);
2164 ty = _mm_mul_ps(fscal,dy11);
2165 tz = _mm_mul_ps(fscal,dz11);
2166
2167 /* Update vectorial force */
2168 fix1 = _mm_add_ps(fix1,tx);
2169 fiy1 = _mm_add_ps(fiy1,ty);
2170 fiz1 = _mm_add_ps(fiz1,tz);
2171
2172 fjx1 = _mm_add_ps(fjx1,tx);
2173 fjy1 = _mm_add_ps(fjy1,ty);
2174 fjz1 = _mm_add_ps(fjz1,tz);
2175
2176 /**************************
2177 * CALCULATE INTERACTIONS *
2178 **************************/
2179
2180 r12 = _mm_mul_ps(rsq12,rinv12);
2181 r12 = _mm_andnot_ps(dummy_mask,r12);
2182
2183 /* EWALD ELECTROSTATICS */
2184
2185 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2186 ewrt = _mm_mul_ps(r12,ewtabscale);
2187 ewitab = _mm_cvttps_epi32(ewrt);
2188 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2189 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2190 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2191 &ewtabF,&ewtabFn);
2192 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2193 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2194
2195 fscal = felec;
2196
2197 fscal = _mm_andnot_ps(dummy_mask,fscal);
2198
2199 /* Calculate temporary vectorial force */
2200 tx = _mm_mul_ps(fscal,dx12);
2201 ty = _mm_mul_ps(fscal,dy12);
2202 tz = _mm_mul_ps(fscal,dz12);
2203
2204 /* Update vectorial force */
2205 fix1 = _mm_add_ps(fix1,tx);
2206 fiy1 = _mm_add_ps(fiy1,ty);
2207 fiz1 = _mm_add_ps(fiz1,tz);
2208
2209 fjx2 = _mm_add_ps(fjx2,tx);
2210 fjy2 = _mm_add_ps(fjy2,ty);
2211 fjz2 = _mm_add_ps(fjz2,tz);
2212
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2216
2217 r20 = _mm_mul_ps(rsq20,rinv20);
2218 r20 = _mm_andnot_ps(dummy_mask,r20);
2219
2220 /* EWALD ELECTROSTATICS */
2221
2222 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2223 ewrt = _mm_mul_ps(r20,ewtabscale);
2224 ewitab = _mm_cvttps_epi32(ewrt);
2225 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2226 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2227 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2228 &ewtabF,&ewtabFn);
2229 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2230 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2231
2232 fscal = felec;
2233
2234 fscal = _mm_andnot_ps(dummy_mask,fscal);
2235
2236 /* Calculate temporary vectorial force */
2237 tx = _mm_mul_ps(fscal,dx20);
2238 ty = _mm_mul_ps(fscal,dy20);
2239 tz = _mm_mul_ps(fscal,dz20);
2240
2241 /* Update vectorial force */
2242 fix2 = _mm_add_ps(fix2,tx);
2243 fiy2 = _mm_add_ps(fiy2,ty);
2244 fiz2 = _mm_add_ps(fiz2,tz);
2245
2246 fjx0 = _mm_add_ps(fjx0,tx);
2247 fjy0 = _mm_add_ps(fjy0,ty);
2248 fjz0 = _mm_add_ps(fjz0,tz);
2249
2250 /**************************
2251 * CALCULATE INTERACTIONS *
2252 **************************/
2253
2254 r21 = _mm_mul_ps(rsq21,rinv21);
2255 r21 = _mm_andnot_ps(dummy_mask,r21);
2256
2257 /* EWALD ELECTROSTATICS */
2258
2259 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2260 ewrt = _mm_mul_ps(r21,ewtabscale);
2261 ewitab = _mm_cvttps_epi32(ewrt);
2262 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2263 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2264 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2265 &ewtabF,&ewtabFn);
2266 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2267 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2268
2269 fscal = felec;
2270
2271 fscal = _mm_andnot_ps(dummy_mask,fscal);
2272
2273 /* Calculate temporary vectorial force */
2274 tx = _mm_mul_ps(fscal,dx21);
2275 ty = _mm_mul_ps(fscal,dy21);
2276 tz = _mm_mul_ps(fscal,dz21);
2277
2278 /* Update vectorial force */
2279 fix2 = _mm_add_ps(fix2,tx);
2280 fiy2 = _mm_add_ps(fiy2,ty);
2281 fiz2 = _mm_add_ps(fiz2,tz);
2282
2283 fjx1 = _mm_add_ps(fjx1,tx);
2284 fjy1 = _mm_add_ps(fjy1,ty);
2285 fjz1 = _mm_add_ps(fjz1,tz);
2286
2287 /**************************
2288 * CALCULATE INTERACTIONS *
2289 **************************/
2290
2291 r22 = _mm_mul_ps(rsq22,rinv22);
2292 r22 = _mm_andnot_ps(dummy_mask,r22);
2293
2294 /* EWALD ELECTROSTATICS */
2295
2296 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2297 ewrt = _mm_mul_ps(r22,ewtabscale);
2298 ewitab = _mm_cvttps_epi32(ewrt);
2299 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2300 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2301 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2302 &ewtabF,&ewtabFn);
2303 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2304 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2305
2306 fscal = felec;
2307
2308 fscal = _mm_andnot_ps(dummy_mask,fscal);
2309
2310 /* Calculate temporary vectorial force */
2311 tx = _mm_mul_ps(fscal,dx22);
2312 ty = _mm_mul_ps(fscal,dy22);
2313 tz = _mm_mul_ps(fscal,dz22);
2314
2315 /* Update vectorial force */
2316 fix2 = _mm_add_ps(fix2,tx);
2317 fiy2 = _mm_add_ps(fiy2,ty);
2318 fiz2 = _mm_add_ps(fiz2,tz);
2319
2320 fjx2 = _mm_add_ps(fjx2,tx);
2321 fjy2 = _mm_add_ps(fjy2,ty);
2322 fjz2 = _mm_add_ps(fjz2,tz);
2323
2324 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2325 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2326 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2327 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2328
2329 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2330 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2331
2332 /* Inner loop uses 356 flops */
2333 }
2334
2335 /* End of innermost loop */
2336
2337 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2338 f+i_coord_offset,fshift+i_shift_offset);
2339
2340 /* Increment number of inner iterations */
2341 inneriter += j_index_end - j_index_start;
2342
2343 /* Outer loop uses 18 flops */
2344 }
2345
2346 /* Increment number of outer iterations */
2347 outeriter += nri;
2348
2349 /* Update outer/inner flops */
2350
2351 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*356)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_F] += outeriter*18 + inneriter
*356
;
2352}