Bug Summary

File:gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sse4_1_single.c
Location:line 1420, column 22
Description:Value stored to 'two' during its initialization is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_single
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwioffset1;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 int vdwioffset2;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwioffset3;
92 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 real *charge;
113 int nvdwtype;
114 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 int *vdwtype;
116 real *vdwparam;
117 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
118 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
119 __m128i vfitab;
120 __m128i ifour = _mm_set1_epi32(4);
121 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
122 real *vftab;
123 __m128 dummy_mask,cutoff_mask;
124 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
125 __m128 one = _mm_set1_ps(1.0);
126 __m128 two = _mm_set1_ps(2.0);
127 x = xx[0];
128 f = ff[0];
129
130 nri = nlist->nri;
131 iinr = nlist->iinr;
132 jindex = nlist->jindex;
133 jjnr = nlist->jjnr;
134 shiftidx = nlist->shift;
135 gid = nlist->gid;
136 shiftvec = fr->shift_vec[0];
137 fshift = fr->fshift[0];
138 facel = _mm_set1_ps(fr->epsfac);
139 charge = mdatoms->chargeA;
140 nvdwtype = fr->ntype;
141 vdwparam = fr->nbfp;
142 vdwtype = mdatoms->typeA;
143
144 vftab = kernel_data->table_elec->data;
145 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
146
147 /* Setup water-specific parameters */
148 inr = nlist->iinr[0];
149 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
150 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
151 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
152 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
153
154 jq1 = _mm_set1_ps(charge[inr+1]);
155 jq2 = _mm_set1_ps(charge[inr+2]);
156 jq3 = _mm_set1_ps(charge[inr+3]);
157 vdwjidx0A = 2*vdwtype[inr+0];
158 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
159 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
160 qq11 = _mm_mul_ps(iq1,jq1);
161 qq12 = _mm_mul_ps(iq1,jq2);
162 qq13 = _mm_mul_ps(iq1,jq3);
163 qq21 = _mm_mul_ps(iq2,jq1);
164 qq22 = _mm_mul_ps(iq2,jq2);
165 qq23 = _mm_mul_ps(iq2,jq3);
166 qq31 = _mm_mul_ps(iq3,jq1);
167 qq32 = _mm_mul_ps(iq3,jq2);
168 qq33 = _mm_mul_ps(iq3,jq3);
169
170 /* Avoid stupid compiler warnings */
171 jnrA = jnrB = jnrC = jnrD = 0;
172 j_coord_offsetA = 0;
173 j_coord_offsetB = 0;
174 j_coord_offsetC = 0;
175 j_coord_offsetD = 0;
176
177 outeriter = 0;
178 inneriter = 0;
179
180 for(iidx=0;iidx<4*DIM3;iidx++)
181 {
182 scratch[iidx] = 0.0;
183 }
184
185 /* Start outer loop over neighborlists */
186 for(iidx=0; iidx<nri; iidx++)
187 {
188 /* Load shift vector for this list */
189 i_shift_offset = DIM3*shiftidx[iidx];
190
191 /* Load limits for loop over neighbors */
192 j_index_start = jindex[iidx];
193 j_index_end = jindex[iidx+1];
194
195 /* Get outer coordinate index */
196 inr = iinr[iidx];
197 i_coord_offset = DIM3*inr;
198
199 /* Load i particle coords and add shift vector */
200 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
201 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
202
203 fix0 = _mm_setzero_ps();
204 fiy0 = _mm_setzero_ps();
205 fiz0 = _mm_setzero_ps();
206 fix1 = _mm_setzero_ps();
207 fiy1 = _mm_setzero_ps();
208 fiz1 = _mm_setzero_ps();
209 fix2 = _mm_setzero_ps();
210 fiy2 = _mm_setzero_ps();
211 fiz2 = _mm_setzero_ps();
212 fix3 = _mm_setzero_ps();
213 fiy3 = _mm_setzero_ps();
214 fiz3 = _mm_setzero_ps();
215
216 /* Reset potential sums */
217 velecsum = _mm_setzero_ps();
218 vvdwsum = _mm_setzero_ps();
219
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
222 {
223
224 /* Get j neighbor index, and coordinate index */
225 jnrA = jjnr[jidx];
226 jnrB = jjnr[jidx+1];
227 jnrC = jjnr[jidx+2];
228 jnrD = jjnr[jidx+3];
229 j_coord_offsetA = DIM3*jnrA;
230 j_coord_offsetB = DIM3*jnrB;
231 j_coord_offsetC = DIM3*jnrC;
232 j_coord_offsetD = DIM3*jnrD;
233
234 /* load j atom coordinates */
235 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236 x+j_coord_offsetC,x+j_coord_offsetD,
237 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
238 &jy2,&jz2,&jx3,&jy3,&jz3);
239
240 /* Calculate displacement vector */
241 dx00 = _mm_sub_ps(ix0,jx0);
242 dy00 = _mm_sub_ps(iy0,jy0);
243 dz00 = _mm_sub_ps(iz0,jz0);
244 dx11 = _mm_sub_ps(ix1,jx1);
245 dy11 = _mm_sub_ps(iy1,jy1);
246 dz11 = _mm_sub_ps(iz1,jz1);
247 dx12 = _mm_sub_ps(ix1,jx2);
248 dy12 = _mm_sub_ps(iy1,jy2);
249 dz12 = _mm_sub_ps(iz1,jz2);
250 dx13 = _mm_sub_ps(ix1,jx3);
251 dy13 = _mm_sub_ps(iy1,jy3);
252 dz13 = _mm_sub_ps(iz1,jz3);
253 dx21 = _mm_sub_ps(ix2,jx1);
254 dy21 = _mm_sub_ps(iy2,jy1);
255 dz21 = _mm_sub_ps(iz2,jz1);
256 dx22 = _mm_sub_ps(ix2,jx2);
257 dy22 = _mm_sub_ps(iy2,jy2);
258 dz22 = _mm_sub_ps(iz2,jz2);
259 dx23 = _mm_sub_ps(ix2,jx3);
260 dy23 = _mm_sub_ps(iy2,jy3);
261 dz23 = _mm_sub_ps(iz2,jz3);
262 dx31 = _mm_sub_ps(ix3,jx1);
263 dy31 = _mm_sub_ps(iy3,jy1);
264 dz31 = _mm_sub_ps(iz3,jz1);
265 dx32 = _mm_sub_ps(ix3,jx2);
266 dy32 = _mm_sub_ps(iy3,jy2);
267 dz32 = _mm_sub_ps(iz3,jz2);
268 dx33 = _mm_sub_ps(ix3,jx3);
269 dy33 = _mm_sub_ps(iy3,jy3);
270 dz33 = _mm_sub_ps(iz3,jz3);
271
272 /* Calculate squared distance and things based on it */
273 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
274 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
275 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
276 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
277 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
278 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
279 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
280 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
281 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
282 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
283
284 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
285 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
286 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
287 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
288 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
289 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
290 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
291 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
292 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
293
294 rinvsq00 = gmx_mm_inv_psgmx_simd_inv_f(rsq00);
295
296 fjx0 = _mm_setzero_ps();
297 fjy0 = _mm_setzero_ps();
298 fjz0 = _mm_setzero_ps();
299 fjx1 = _mm_setzero_ps();
300 fjy1 = _mm_setzero_ps();
301 fjz1 = _mm_setzero_ps();
302 fjx2 = _mm_setzero_ps();
303 fjy2 = _mm_setzero_ps();
304 fjz2 = _mm_setzero_ps();
305 fjx3 = _mm_setzero_ps();
306 fjy3 = _mm_setzero_ps();
307 fjz3 = _mm_setzero_ps();
308
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
312
313 /* LENNARD-JONES DISPERSION/REPULSION */
314
315 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
316 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
317 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
318 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
319 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
320
321 /* Update potential sum for this i atom from the interaction with this j atom. */
322 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
323
324 fscal = fvdw;
325
326 /* Calculate temporary vectorial force */
327 tx = _mm_mul_ps(fscal,dx00);
328 ty = _mm_mul_ps(fscal,dy00);
329 tz = _mm_mul_ps(fscal,dz00);
330
331 /* Update vectorial force */
332 fix0 = _mm_add_ps(fix0,tx);
333 fiy0 = _mm_add_ps(fiy0,ty);
334 fiz0 = _mm_add_ps(fiz0,tz);
335
336 fjx0 = _mm_add_ps(fjx0,tx);
337 fjy0 = _mm_add_ps(fjy0,ty);
338 fjz0 = _mm_add_ps(fjz0,tz);
339
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
343
344 r11 = _mm_mul_ps(rsq11,rinv11);
345
346 /* Calculate table index by multiplying r with table scale and truncate to integer */
347 rt = _mm_mul_ps(r11,vftabscale);
348 vfitab = _mm_cvttps_epi32(rt);
349 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
350 vfitab = _mm_slli_epi32(vfitab,2);
351
352 /* CUBIC SPLINE TABLE ELECTROSTATICS */
353 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
354 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
355 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
356 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
357 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
358 Heps = _mm_mul_ps(vfeps,H);
359 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
360 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
361 velec = _mm_mul_ps(qq11,VV);
362 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
363 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
364
365 /* Update potential sum for this i atom from the interaction with this j atom. */
366 velecsum = _mm_add_ps(velecsum,velec);
367
368 fscal = felec;
369
370 /* Calculate temporary vectorial force */
371 tx = _mm_mul_ps(fscal,dx11);
372 ty = _mm_mul_ps(fscal,dy11);
373 tz = _mm_mul_ps(fscal,dz11);
374
375 /* Update vectorial force */
376 fix1 = _mm_add_ps(fix1,tx);
377 fiy1 = _mm_add_ps(fiy1,ty);
378 fiz1 = _mm_add_ps(fiz1,tz);
379
380 fjx1 = _mm_add_ps(fjx1,tx);
381 fjy1 = _mm_add_ps(fjy1,ty);
382 fjz1 = _mm_add_ps(fjz1,tz);
383
384 /**************************
385 * CALCULATE INTERACTIONS *
386 **************************/
387
388 r12 = _mm_mul_ps(rsq12,rinv12);
389
390 /* Calculate table index by multiplying r with table scale and truncate to integer */
391 rt = _mm_mul_ps(r12,vftabscale);
392 vfitab = _mm_cvttps_epi32(rt);
393 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
394 vfitab = _mm_slli_epi32(vfitab,2);
395
396 /* CUBIC SPLINE TABLE ELECTROSTATICS */
397 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
398 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
399 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
400 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
401 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
402 Heps = _mm_mul_ps(vfeps,H);
403 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
404 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
405 velec = _mm_mul_ps(qq12,VV);
406 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
407 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
408
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velecsum = _mm_add_ps(velecsum,velec);
411
412 fscal = felec;
413
414 /* Calculate temporary vectorial force */
415 tx = _mm_mul_ps(fscal,dx12);
416 ty = _mm_mul_ps(fscal,dy12);
417 tz = _mm_mul_ps(fscal,dz12);
418
419 /* Update vectorial force */
420 fix1 = _mm_add_ps(fix1,tx);
421 fiy1 = _mm_add_ps(fiy1,ty);
422 fiz1 = _mm_add_ps(fiz1,tz);
423
424 fjx2 = _mm_add_ps(fjx2,tx);
425 fjy2 = _mm_add_ps(fjy2,ty);
426 fjz2 = _mm_add_ps(fjz2,tz);
427
428 /**************************
429 * CALCULATE INTERACTIONS *
430 **************************/
431
432 r13 = _mm_mul_ps(rsq13,rinv13);
433
434 /* Calculate table index by multiplying r with table scale and truncate to integer */
435 rt = _mm_mul_ps(r13,vftabscale);
436 vfitab = _mm_cvttps_epi32(rt);
437 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
438 vfitab = _mm_slli_epi32(vfitab,2);
439
440 /* CUBIC SPLINE TABLE ELECTROSTATICS */
441 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
442 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
443 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
444 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
445 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
446 Heps = _mm_mul_ps(vfeps,H);
447 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
448 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
449 velec = _mm_mul_ps(qq13,VV);
450 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
451 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
452
453 /* Update potential sum for this i atom from the interaction with this j atom. */
454 velecsum = _mm_add_ps(velecsum,velec);
455
456 fscal = felec;
457
458 /* Calculate temporary vectorial force */
459 tx = _mm_mul_ps(fscal,dx13);
460 ty = _mm_mul_ps(fscal,dy13);
461 tz = _mm_mul_ps(fscal,dz13);
462
463 /* Update vectorial force */
464 fix1 = _mm_add_ps(fix1,tx);
465 fiy1 = _mm_add_ps(fiy1,ty);
466 fiz1 = _mm_add_ps(fiz1,tz);
467
468 fjx3 = _mm_add_ps(fjx3,tx);
469 fjy3 = _mm_add_ps(fjy3,ty);
470 fjz3 = _mm_add_ps(fjz3,tz);
471
472 /**************************
473 * CALCULATE INTERACTIONS *
474 **************************/
475
476 r21 = _mm_mul_ps(rsq21,rinv21);
477
478 /* Calculate table index by multiplying r with table scale and truncate to integer */
479 rt = _mm_mul_ps(r21,vftabscale);
480 vfitab = _mm_cvttps_epi32(rt);
481 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
482 vfitab = _mm_slli_epi32(vfitab,2);
483
484 /* CUBIC SPLINE TABLE ELECTROSTATICS */
485 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
486 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
487 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
488 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
489 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
490 Heps = _mm_mul_ps(vfeps,H);
491 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
492 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
493 velec = _mm_mul_ps(qq21,VV);
494 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
495 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
496
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velecsum = _mm_add_ps(velecsum,velec);
499
500 fscal = felec;
501
502 /* Calculate temporary vectorial force */
503 tx = _mm_mul_ps(fscal,dx21);
504 ty = _mm_mul_ps(fscal,dy21);
505 tz = _mm_mul_ps(fscal,dz21);
506
507 /* Update vectorial force */
508 fix2 = _mm_add_ps(fix2,tx);
509 fiy2 = _mm_add_ps(fiy2,ty);
510 fiz2 = _mm_add_ps(fiz2,tz);
511
512 fjx1 = _mm_add_ps(fjx1,tx);
513 fjy1 = _mm_add_ps(fjy1,ty);
514 fjz1 = _mm_add_ps(fjz1,tz);
515
516 /**************************
517 * CALCULATE INTERACTIONS *
518 **************************/
519
520 r22 = _mm_mul_ps(rsq22,rinv22);
521
522 /* Calculate table index by multiplying r with table scale and truncate to integer */
523 rt = _mm_mul_ps(r22,vftabscale);
524 vfitab = _mm_cvttps_epi32(rt);
525 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
526 vfitab = _mm_slli_epi32(vfitab,2);
527
528 /* CUBIC SPLINE TABLE ELECTROSTATICS */
529 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
530 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
531 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
532 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
533 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
534 Heps = _mm_mul_ps(vfeps,H);
535 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
536 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
537 velec = _mm_mul_ps(qq22,VV);
538 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
539 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
540
541 /* Update potential sum for this i atom from the interaction with this j atom. */
542 velecsum = _mm_add_ps(velecsum,velec);
543
544 fscal = felec;
545
546 /* Calculate temporary vectorial force */
547 tx = _mm_mul_ps(fscal,dx22);
548 ty = _mm_mul_ps(fscal,dy22);
549 tz = _mm_mul_ps(fscal,dz22);
550
551 /* Update vectorial force */
552 fix2 = _mm_add_ps(fix2,tx);
553 fiy2 = _mm_add_ps(fiy2,ty);
554 fiz2 = _mm_add_ps(fiz2,tz);
555
556 fjx2 = _mm_add_ps(fjx2,tx);
557 fjy2 = _mm_add_ps(fjy2,ty);
558 fjz2 = _mm_add_ps(fjz2,tz);
559
560 /**************************
561 * CALCULATE INTERACTIONS *
562 **************************/
563
564 r23 = _mm_mul_ps(rsq23,rinv23);
565
566 /* Calculate table index by multiplying r with table scale and truncate to integer */
567 rt = _mm_mul_ps(r23,vftabscale);
568 vfitab = _mm_cvttps_epi32(rt);
569 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
570 vfitab = _mm_slli_epi32(vfitab,2);
571
572 /* CUBIC SPLINE TABLE ELECTROSTATICS */
573 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
574 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
575 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
576 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
577 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
578 Heps = _mm_mul_ps(vfeps,H);
579 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
580 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
581 velec = _mm_mul_ps(qq23,VV);
582 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
583 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
584
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velecsum = _mm_add_ps(velecsum,velec);
587
588 fscal = felec;
589
590 /* Calculate temporary vectorial force */
591 tx = _mm_mul_ps(fscal,dx23);
592 ty = _mm_mul_ps(fscal,dy23);
593 tz = _mm_mul_ps(fscal,dz23);
594
595 /* Update vectorial force */
596 fix2 = _mm_add_ps(fix2,tx);
597 fiy2 = _mm_add_ps(fiy2,ty);
598 fiz2 = _mm_add_ps(fiz2,tz);
599
600 fjx3 = _mm_add_ps(fjx3,tx);
601 fjy3 = _mm_add_ps(fjy3,ty);
602 fjz3 = _mm_add_ps(fjz3,tz);
603
604 /**************************
605 * CALCULATE INTERACTIONS *
606 **************************/
607
608 r31 = _mm_mul_ps(rsq31,rinv31);
609
610 /* Calculate table index by multiplying r with table scale and truncate to integer */
611 rt = _mm_mul_ps(r31,vftabscale);
612 vfitab = _mm_cvttps_epi32(rt);
613 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
614 vfitab = _mm_slli_epi32(vfitab,2);
615
616 /* CUBIC SPLINE TABLE ELECTROSTATICS */
617 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
618 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
619 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
620 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
621 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
622 Heps = _mm_mul_ps(vfeps,H);
623 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
624 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
625 velec = _mm_mul_ps(qq31,VV);
626 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
627 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
628
629 /* Update potential sum for this i atom from the interaction with this j atom. */
630 velecsum = _mm_add_ps(velecsum,velec);
631
632 fscal = felec;
633
634 /* Calculate temporary vectorial force */
635 tx = _mm_mul_ps(fscal,dx31);
636 ty = _mm_mul_ps(fscal,dy31);
637 tz = _mm_mul_ps(fscal,dz31);
638
639 /* Update vectorial force */
640 fix3 = _mm_add_ps(fix3,tx);
641 fiy3 = _mm_add_ps(fiy3,ty);
642 fiz3 = _mm_add_ps(fiz3,tz);
643
644 fjx1 = _mm_add_ps(fjx1,tx);
645 fjy1 = _mm_add_ps(fjy1,ty);
646 fjz1 = _mm_add_ps(fjz1,tz);
647
648 /**************************
649 * CALCULATE INTERACTIONS *
650 **************************/
651
652 r32 = _mm_mul_ps(rsq32,rinv32);
653
654 /* Calculate table index by multiplying r with table scale and truncate to integer */
655 rt = _mm_mul_ps(r32,vftabscale);
656 vfitab = _mm_cvttps_epi32(rt);
657 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
658 vfitab = _mm_slli_epi32(vfitab,2);
659
660 /* CUBIC SPLINE TABLE ELECTROSTATICS */
661 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
662 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
663 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
664 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
665 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
666 Heps = _mm_mul_ps(vfeps,H);
667 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
668 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
669 velec = _mm_mul_ps(qq32,VV);
670 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
671 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
672
673 /* Update potential sum for this i atom from the interaction with this j atom. */
674 velecsum = _mm_add_ps(velecsum,velec);
675
676 fscal = felec;
677
678 /* Calculate temporary vectorial force */
679 tx = _mm_mul_ps(fscal,dx32);
680 ty = _mm_mul_ps(fscal,dy32);
681 tz = _mm_mul_ps(fscal,dz32);
682
683 /* Update vectorial force */
684 fix3 = _mm_add_ps(fix3,tx);
685 fiy3 = _mm_add_ps(fiy3,ty);
686 fiz3 = _mm_add_ps(fiz3,tz);
687
688 fjx2 = _mm_add_ps(fjx2,tx);
689 fjy2 = _mm_add_ps(fjy2,ty);
690 fjz2 = _mm_add_ps(fjz2,tz);
691
692 /**************************
693 * CALCULATE INTERACTIONS *
694 **************************/
695
696 r33 = _mm_mul_ps(rsq33,rinv33);
697
698 /* Calculate table index by multiplying r with table scale and truncate to integer */
699 rt = _mm_mul_ps(r33,vftabscale);
700 vfitab = _mm_cvttps_epi32(rt);
701 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
702 vfitab = _mm_slli_epi32(vfitab,2);
703
704 /* CUBIC SPLINE TABLE ELECTROSTATICS */
705 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
706 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
707 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
708 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
709 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
710 Heps = _mm_mul_ps(vfeps,H);
711 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
712 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
713 velec = _mm_mul_ps(qq33,VV);
714 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
715 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
716
717 /* Update potential sum for this i atom from the interaction with this j atom. */
718 velecsum = _mm_add_ps(velecsum,velec);
719
720 fscal = felec;
721
722 /* Calculate temporary vectorial force */
723 tx = _mm_mul_ps(fscal,dx33);
724 ty = _mm_mul_ps(fscal,dy33);
725 tz = _mm_mul_ps(fscal,dz33);
726
727 /* Update vectorial force */
728 fix3 = _mm_add_ps(fix3,tx);
729 fiy3 = _mm_add_ps(fiy3,ty);
730 fiz3 = _mm_add_ps(fiz3,tz);
731
732 fjx3 = _mm_add_ps(fjx3,tx);
733 fjy3 = _mm_add_ps(fjy3,ty);
734 fjz3 = _mm_add_ps(fjz3,tz);
735
736 fjptrA = f+j_coord_offsetA;
737 fjptrB = f+j_coord_offsetB;
738 fjptrC = f+j_coord_offsetC;
739 fjptrD = f+j_coord_offsetD;
740
741 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
742 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
743 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
744
745 /* Inner loop uses 422 flops */
746 }
747
748 if(jidx<j_index_end)
749 {
750
751 /* Get j neighbor index, and coordinate index */
752 jnrlistA = jjnr[jidx];
753 jnrlistB = jjnr[jidx+1];
754 jnrlistC = jjnr[jidx+2];
755 jnrlistD = jjnr[jidx+3];
756 /* Sign of each element will be negative for non-real atoms.
757 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
758 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
759 */
760 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
761 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
762 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
763 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
764 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
765 j_coord_offsetA = DIM3*jnrA;
766 j_coord_offsetB = DIM3*jnrB;
767 j_coord_offsetC = DIM3*jnrC;
768 j_coord_offsetD = DIM3*jnrD;
769
770 /* load j atom coordinates */
771 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
772 x+j_coord_offsetC,x+j_coord_offsetD,
773 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
774 &jy2,&jz2,&jx3,&jy3,&jz3);
775
776 /* Calculate displacement vector */
777 dx00 = _mm_sub_ps(ix0,jx0);
778 dy00 = _mm_sub_ps(iy0,jy0);
779 dz00 = _mm_sub_ps(iz0,jz0);
780 dx11 = _mm_sub_ps(ix1,jx1);
781 dy11 = _mm_sub_ps(iy1,jy1);
782 dz11 = _mm_sub_ps(iz1,jz1);
783 dx12 = _mm_sub_ps(ix1,jx2);
784 dy12 = _mm_sub_ps(iy1,jy2);
785 dz12 = _mm_sub_ps(iz1,jz2);
786 dx13 = _mm_sub_ps(ix1,jx3);
787 dy13 = _mm_sub_ps(iy1,jy3);
788 dz13 = _mm_sub_ps(iz1,jz3);
789 dx21 = _mm_sub_ps(ix2,jx1);
790 dy21 = _mm_sub_ps(iy2,jy1);
791 dz21 = _mm_sub_ps(iz2,jz1);
792 dx22 = _mm_sub_ps(ix2,jx2);
793 dy22 = _mm_sub_ps(iy2,jy2);
794 dz22 = _mm_sub_ps(iz2,jz2);
795 dx23 = _mm_sub_ps(ix2,jx3);
796 dy23 = _mm_sub_ps(iy2,jy3);
797 dz23 = _mm_sub_ps(iz2,jz3);
798 dx31 = _mm_sub_ps(ix3,jx1);
799 dy31 = _mm_sub_ps(iy3,jy1);
800 dz31 = _mm_sub_ps(iz3,jz1);
801 dx32 = _mm_sub_ps(ix3,jx2);
802 dy32 = _mm_sub_ps(iy3,jy2);
803 dz32 = _mm_sub_ps(iz3,jz2);
804 dx33 = _mm_sub_ps(ix3,jx3);
805 dy33 = _mm_sub_ps(iy3,jy3);
806 dz33 = _mm_sub_ps(iz3,jz3);
807
808 /* Calculate squared distance and things based on it */
809 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
810 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
811 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
812 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
813 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
814 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
815 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
816 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
817 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
818 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
819
820 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
821 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
822 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
823 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
824 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
825 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
826 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
827 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
828 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
829
830 rinvsq00 = gmx_mm_inv_psgmx_simd_inv_f(rsq00);
831
832 fjx0 = _mm_setzero_ps();
833 fjy0 = _mm_setzero_ps();
834 fjz0 = _mm_setzero_ps();
835 fjx1 = _mm_setzero_ps();
836 fjy1 = _mm_setzero_ps();
837 fjz1 = _mm_setzero_ps();
838 fjx2 = _mm_setzero_ps();
839 fjy2 = _mm_setzero_ps();
840 fjz2 = _mm_setzero_ps();
841 fjx3 = _mm_setzero_ps();
842 fjy3 = _mm_setzero_ps();
843 fjz3 = _mm_setzero_ps();
844
845 /**************************
846 * CALCULATE INTERACTIONS *
847 **************************/
848
849 /* LENNARD-JONES DISPERSION/REPULSION */
850
851 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
852 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
853 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
854 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
855 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
856
857 /* Update potential sum for this i atom from the interaction with this j atom. */
858 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
859 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
860
861 fscal = fvdw;
862
863 fscal = _mm_andnot_ps(dummy_mask,fscal);
864
865 /* Calculate temporary vectorial force */
866 tx = _mm_mul_ps(fscal,dx00);
867 ty = _mm_mul_ps(fscal,dy00);
868 tz = _mm_mul_ps(fscal,dz00);
869
870 /* Update vectorial force */
871 fix0 = _mm_add_ps(fix0,tx);
872 fiy0 = _mm_add_ps(fiy0,ty);
873 fiz0 = _mm_add_ps(fiz0,tz);
874
875 fjx0 = _mm_add_ps(fjx0,tx);
876 fjy0 = _mm_add_ps(fjy0,ty);
877 fjz0 = _mm_add_ps(fjz0,tz);
878
879 /**************************
880 * CALCULATE INTERACTIONS *
881 **************************/
882
883 r11 = _mm_mul_ps(rsq11,rinv11);
884 r11 = _mm_andnot_ps(dummy_mask,r11);
885
886 /* Calculate table index by multiplying r with table scale and truncate to integer */
887 rt = _mm_mul_ps(r11,vftabscale);
888 vfitab = _mm_cvttps_epi32(rt);
889 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
890 vfitab = _mm_slli_epi32(vfitab,2);
891
892 /* CUBIC SPLINE TABLE ELECTROSTATICS */
893 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
894 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
895 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
896 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
897 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
898 Heps = _mm_mul_ps(vfeps,H);
899 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
900 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
901 velec = _mm_mul_ps(qq11,VV);
902 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
903 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
904
905 /* Update potential sum for this i atom from the interaction with this j atom. */
906 velec = _mm_andnot_ps(dummy_mask,velec);
907 velecsum = _mm_add_ps(velecsum,velec);
908
909 fscal = felec;
910
911 fscal = _mm_andnot_ps(dummy_mask,fscal);
912
913 /* Calculate temporary vectorial force */
914 tx = _mm_mul_ps(fscal,dx11);
915 ty = _mm_mul_ps(fscal,dy11);
916 tz = _mm_mul_ps(fscal,dz11);
917
918 /* Update vectorial force */
919 fix1 = _mm_add_ps(fix1,tx);
920 fiy1 = _mm_add_ps(fiy1,ty);
921 fiz1 = _mm_add_ps(fiz1,tz);
922
923 fjx1 = _mm_add_ps(fjx1,tx);
924 fjy1 = _mm_add_ps(fjy1,ty);
925 fjz1 = _mm_add_ps(fjz1,tz);
926
927 /**************************
928 * CALCULATE INTERACTIONS *
929 **************************/
930
931 r12 = _mm_mul_ps(rsq12,rinv12);
932 r12 = _mm_andnot_ps(dummy_mask,r12);
933
934 /* Calculate table index by multiplying r with table scale and truncate to integer */
935 rt = _mm_mul_ps(r12,vftabscale);
936 vfitab = _mm_cvttps_epi32(rt);
937 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
938 vfitab = _mm_slli_epi32(vfitab,2);
939
940 /* CUBIC SPLINE TABLE ELECTROSTATICS */
941 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
942 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
943 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
944 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
945 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
946 Heps = _mm_mul_ps(vfeps,H);
947 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
948 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
949 velec = _mm_mul_ps(qq12,VV);
950 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
951 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
952
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec = _mm_andnot_ps(dummy_mask,velec);
955 velecsum = _mm_add_ps(velecsum,velec);
956
957 fscal = felec;
958
959 fscal = _mm_andnot_ps(dummy_mask,fscal);
960
961 /* Calculate temporary vectorial force */
962 tx = _mm_mul_ps(fscal,dx12);
963 ty = _mm_mul_ps(fscal,dy12);
964 tz = _mm_mul_ps(fscal,dz12);
965
966 /* Update vectorial force */
967 fix1 = _mm_add_ps(fix1,tx);
968 fiy1 = _mm_add_ps(fiy1,ty);
969 fiz1 = _mm_add_ps(fiz1,tz);
970
971 fjx2 = _mm_add_ps(fjx2,tx);
972 fjy2 = _mm_add_ps(fjy2,ty);
973 fjz2 = _mm_add_ps(fjz2,tz);
974
975 /**************************
976 * CALCULATE INTERACTIONS *
977 **************************/
978
979 r13 = _mm_mul_ps(rsq13,rinv13);
980 r13 = _mm_andnot_ps(dummy_mask,r13);
981
982 /* Calculate table index by multiplying r with table scale and truncate to integer */
983 rt = _mm_mul_ps(r13,vftabscale);
984 vfitab = _mm_cvttps_epi32(rt);
985 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
986 vfitab = _mm_slli_epi32(vfitab,2);
987
988 /* CUBIC SPLINE TABLE ELECTROSTATICS */
989 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
990 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
991 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
992 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
993 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
994 Heps = _mm_mul_ps(vfeps,H);
995 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
996 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
997 velec = _mm_mul_ps(qq13,VV);
998 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
999 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1000
1001 /* Update potential sum for this i atom from the interaction with this j atom. */
1002 velec = _mm_andnot_ps(dummy_mask,velec);
1003 velecsum = _mm_add_ps(velecsum,velec);
1004
1005 fscal = felec;
1006
1007 fscal = _mm_andnot_ps(dummy_mask,fscal);
1008
1009 /* Calculate temporary vectorial force */
1010 tx = _mm_mul_ps(fscal,dx13);
1011 ty = _mm_mul_ps(fscal,dy13);
1012 tz = _mm_mul_ps(fscal,dz13);
1013
1014 /* Update vectorial force */
1015 fix1 = _mm_add_ps(fix1,tx);
1016 fiy1 = _mm_add_ps(fiy1,ty);
1017 fiz1 = _mm_add_ps(fiz1,tz);
1018
1019 fjx3 = _mm_add_ps(fjx3,tx);
1020 fjy3 = _mm_add_ps(fjy3,ty);
1021 fjz3 = _mm_add_ps(fjz3,tz);
1022
1023 /**************************
1024 * CALCULATE INTERACTIONS *
1025 **************************/
1026
1027 r21 = _mm_mul_ps(rsq21,rinv21);
1028 r21 = _mm_andnot_ps(dummy_mask,r21);
1029
1030 /* Calculate table index by multiplying r with table scale and truncate to integer */
1031 rt = _mm_mul_ps(r21,vftabscale);
1032 vfitab = _mm_cvttps_epi32(rt);
1033 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1034 vfitab = _mm_slli_epi32(vfitab,2);
1035
1036 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1037 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1038 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1039 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1040 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1041 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1042 Heps = _mm_mul_ps(vfeps,H);
1043 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1044 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1045 velec = _mm_mul_ps(qq21,VV);
1046 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1047 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1048
1049 /* Update potential sum for this i atom from the interaction with this j atom. */
1050 velec = _mm_andnot_ps(dummy_mask,velec);
1051 velecsum = _mm_add_ps(velecsum,velec);
1052
1053 fscal = felec;
1054
1055 fscal = _mm_andnot_ps(dummy_mask,fscal);
1056
1057 /* Calculate temporary vectorial force */
1058 tx = _mm_mul_ps(fscal,dx21);
1059 ty = _mm_mul_ps(fscal,dy21);
1060 tz = _mm_mul_ps(fscal,dz21);
1061
1062 /* Update vectorial force */
1063 fix2 = _mm_add_ps(fix2,tx);
1064 fiy2 = _mm_add_ps(fiy2,ty);
1065 fiz2 = _mm_add_ps(fiz2,tz);
1066
1067 fjx1 = _mm_add_ps(fjx1,tx);
1068 fjy1 = _mm_add_ps(fjy1,ty);
1069 fjz1 = _mm_add_ps(fjz1,tz);
1070
1071 /**************************
1072 * CALCULATE INTERACTIONS *
1073 **************************/
1074
1075 r22 = _mm_mul_ps(rsq22,rinv22);
1076 r22 = _mm_andnot_ps(dummy_mask,r22);
1077
1078 /* Calculate table index by multiplying r with table scale and truncate to integer */
1079 rt = _mm_mul_ps(r22,vftabscale);
1080 vfitab = _mm_cvttps_epi32(rt);
1081 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1082 vfitab = _mm_slli_epi32(vfitab,2);
1083
1084 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1085 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1086 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1087 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1088 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1089 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1090 Heps = _mm_mul_ps(vfeps,H);
1091 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1092 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1093 velec = _mm_mul_ps(qq22,VV);
1094 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1095 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1096
1097 /* Update potential sum for this i atom from the interaction with this j atom. */
1098 velec = _mm_andnot_ps(dummy_mask,velec);
1099 velecsum = _mm_add_ps(velecsum,velec);
1100
1101 fscal = felec;
1102
1103 fscal = _mm_andnot_ps(dummy_mask,fscal);
1104
1105 /* Calculate temporary vectorial force */
1106 tx = _mm_mul_ps(fscal,dx22);
1107 ty = _mm_mul_ps(fscal,dy22);
1108 tz = _mm_mul_ps(fscal,dz22);
1109
1110 /* Update vectorial force */
1111 fix2 = _mm_add_ps(fix2,tx);
1112 fiy2 = _mm_add_ps(fiy2,ty);
1113 fiz2 = _mm_add_ps(fiz2,tz);
1114
1115 fjx2 = _mm_add_ps(fjx2,tx);
1116 fjy2 = _mm_add_ps(fjy2,ty);
1117 fjz2 = _mm_add_ps(fjz2,tz);
1118
1119 /**************************
1120 * CALCULATE INTERACTIONS *
1121 **************************/
1122
1123 r23 = _mm_mul_ps(rsq23,rinv23);
1124 r23 = _mm_andnot_ps(dummy_mask,r23);
1125
1126 /* Calculate table index by multiplying r with table scale and truncate to integer */
1127 rt = _mm_mul_ps(r23,vftabscale);
1128 vfitab = _mm_cvttps_epi32(rt);
1129 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1130 vfitab = _mm_slli_epi32(vfitab,2);
1131
1132 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1133 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1134 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1135 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1136 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1137 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1138 Heps = _mm_mul_ps(vfeps,H);
1139 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1140 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1141 velec = _mm_mul_ps(qq23,VV);
1142 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1143 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1144
1145 /* Update potential sum for this i atom from the interaction with this j atom. */
1146 velec = _mm_andnot_ps(dummy_mask,velec);
1147 velecsum = _mm_add_ps(velecsum,velec);
1148
1149 fscal = felec;
1150
1151 fscal = _mm_andnot_ps(dummy_mask,fscal);
1152
1153 /* Calculate temporary vectorial force */
1154 tx = _mm_mul_ps(fscal,dx23);
1155 ty = _mm_mul_ps(fscal,dy23);
1156 tz = _mm_mul_ps(fscal,dz23);
1157
1158 /* Update vectorial force */
1159 fix2 = _mm_add_ps(fix2,tx);
1160 fiy2 = _mm_add_ps(fiy2,ty);
1161 fiz2 = _mm_add_ps(fiz2,tz);
1162
1163 fjx3 = _mm_add_ps(fjx3,tx);
1164 fjy3 = _mm_add_ps(fjy3,ty);
1165 fjz3 = _mm_add_ps(fjz3,tz);
1166
1167 /**************************
1168 * CALCULATE INTERACTIONS *
1169 **************************/
1170
1171 r31 = _mm_mul_ps(rsq31,rinv31);
1172 r31 = _mm_andnot_ps(dummy_mask,r31);
1173
1174 /* Calculate table index by multiplying r with table scale and truncate to integer */
1175 rt = _mm_mul_ps(r31,vftabscale);
1176 vfitab = _mm_cvttps_epi32(rt);
1177 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1178 vfitab = _mm_slli_epi32(vfitab,2);
1179
1180 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1181 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1182 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1183 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1184 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1185 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1186 Heps = _mm_mul_ps(vfeps,H);
1187 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1188 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1189 velec = _mm_mul_ps(qq31,VV);
1190 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1191 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1192
1193 /* Update potential sum for this i atom from the interaction with this j atom. */
1194 velec = _mm_andnot_ps(dummy_mask,velec);
1195 velecsum = _mm_add_ps(velecsum,velec);
1196
1197 fscal = felec;
1198
1199 fscal = _mm_andnot_ps(dummy_mask,fscal);
1200
1201 /* Calculate temporary vectorial force */
1202 tx = _mm_mul_ps(fscal,dx31);
1203 ty = _mm_mul_ps(fscal,dy31);
1204 tz = _mm_mul_ps(fscal,dz31);
1205
1206 /* Update vectorial force */
1207 fix3 = _mm_add_ps(fix3,tx);
1208 fiy3 = _mm_add_ps(fiy3,ty);
1209 fiz3 = _mm_add_ps(fiz3,tz);
1210
1211 fjx1 = _mm_add_ps(fjx1,tx);
1212 fjy1 = _mm_add_ps(fjy1,ty);
1213 fjz1 = _mm_add_ps(fjz1,tz);
1214
1215 /**************************
1216 * CALCULATE INTERACTIONS *
1217 **************************/
1218
1219 r32 = _mm_mul_ps(rsq32,rinv32);
1220 r32 = _mm_andnot_ps(dummy_mask,r32);
1221
1222 /* Calculate table index by multiplying r with table scale and truncate to integer */
1223 rt = _mm_mul_ps(r32,vftabscale);
1224 vfitab = _mm_cvttps_epi32(rt);
1225 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1226 vfitab = _mm_slli_epi32(vfitab,2);
1227
1228 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1229 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1230 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1231 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1232 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1233 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1234 Heps = _mm_mul_ps(vfeps,H);
1235 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1236 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1237 velec = _mm_mul_ps(qq32,VV);
1238 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1239 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1240
1241 /* Update potential sum for this i atom from the interaction with this j atom. */
1242 velec = _mm_andnot_ps(dummy_mask,velec);
1243 velecsum = _mm_add_ps(velecsum,velec);
1244
1245 fscal = felec;
1246
1247 fscal = _mm_andnot_ps(dummy_mask,fscal);
1248
1249 /* Calculate temporary vectorial force */
1250 tx = _mm_mul_ps(fscal,dx32);
1251 ty = _mm_mul_ps(fscal,dy32);
1252 tz = _mm_mul_ps(fscal,dz32);
1253
1254 /* Update vectorial force */
1255 fix3 = _mm_add_ps(fix3,tx);
1256 fiy3 = _mm_add_ps(fiy3,ty);
1257 fiz3 = _mm_add_ps(fiz3,tz);
1258
1259 fjx2 = _mm_add_ps(fjx2,tx);
1260 fjy2 = _mm_add_ps(fjy2,ty);
1261 fjz2 = _mm_add_ps(fjz2,tz);
1262
1263 /**************************
1264 * CALCULATE INTERACTIONS *
1265 **************************/
1266
1267 r33 = _mm_mul_ps(rsq33,rinv33);
1268 r33 = _mm_andnot_ps(dummy_mask,r33);
1269
1270 /* Calculate table index by multiplying r with table scale and truncate to integer */
1271 rt = _mm_mul_ps(r33,vftabscale);
1272 vfitab = _mm_cvttps_epi32(rt);
1273 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1274 vfitab = _mm_slli_epi32(vfitab,2);
1275
1276 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1277 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1278 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1279 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1280 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1281 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1282 Heps = _mm_mul_ps(vfeps,H);
1283 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1284 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1285 velec = _mm_mul_ps(qq33,VV);
1286 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1287 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1288
1289 /* Update potential sum for this i atom from the interaction with this j atom. */
1290 velec = _mm_andnot_ps(dummy_mask,velec);
1291 velecsum = _mm_add_ps(velecsum,velec);
1292
1293 fscal = felec;
1294
1295 fscal = _mm_andnot_ps(dummy_mask,fscal);
1296
1297 /* Calculate temporary vectorial force */
1298 tx = _mm_mul_ps(fscal,dx33);
1299 ty = _mm_mul_ps(fscal,dy33);
1300 tz = _mm_mul_ps(fscal,dz33);
1301
1302 /* Update vectorial force */
1303 fix3 = _mm_add_ps(fix3,tx);
1304 fiy3 = _mm_add_ps(fiy3,ty);
1305 fiz3 = _mm_add_ps(fiz3,tz);
1306
1307 fjx3 = _mm_add_ps(fjx3,tx);
1308 fjy3 = _mm_add_ps(fjy3,ty);
1309 fjz3 = _mm_add_ps(fjz3,tz);
1310
1311 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1312 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1313 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1314 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1315
1316 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1317 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1318 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1319
1320 /* Inner loop uses 431 flops */
1321 }
1322
1323 /* End of innermost loop */
1324
1325 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1326 f+i_coord_offset,fshift+i_shift_offset);
1327
1328 ggid = gid[iidx];
1329 /* Update potential energies */
1330 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1331 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1332
1333 /* Increment number of inner iterations */
1334 inneriter += j_index_end - j_index_start;
1335
1336 /* Outer loop uses 26 flops */
1337 }
1338
1339 /* Increment number of outer iterations */
1340 outeriter += nri;
1341
1342 /* Update outer/inner flops */
1343
1344 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*431)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4W4_VF] += outeriter*26 +
inneriter*431
;
1345}
1346/*
1347 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single
1348 * Electrostatics interaction: CubicSplineTable
1349 * VdW interaction: LennardJones
1350 * Geometry: Water4-Water4
1351 * Calculate force/pot: Force
1352 */
1353void
1354nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single
1355 (t_nblist * gmx_restrict nlist,
1356 rvec * gmx_restrict xx,
1357 rvec * gmx_restrict ff,
1358 t_forcerec * gmx_restrict fr,
1359 t_mdatoms * gmx_restrict mdatoms,
1360 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1361 t_nrnb * gmx_restrict nrnb)
1362{
1363 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1364 * just 0 for non-waters.
1365 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1366 * jnr indices corresponding to data put in the four positions in the SIMD register.
1367 */
1368 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1369 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1370 int jnrA,jnrB,jnrC,jnrD;
1371 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1372 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1373 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1374 real rcutoff_scalar;
1375 real *shiftvec,*fshift,*x,*f;
1376 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1377 real scratch[4*DIM3];
1378 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1379 int vdwioffset0;
1380 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1381 int vdwioffset1;
1382 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1383 int vdwioffset2;
1384 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1385 int vdwioffset3;
1386 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1387 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1388 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1389 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1390 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1391 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1392 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1393 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1394 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1395 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1396 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1397 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1398 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1399 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1400 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1401 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1402 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1403 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1404 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1405 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1406 real *charge;
1407 int nvdwtype;
1408 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1409 int *vdwtype;
1410 real *vdwparam;
1411 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1412 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1413 __m128i vfitab;
1414 __m128i ifour = _mm_set1_epi32(4);
1415 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1416 real *vftab;
1417 __m128 dummy_mask,cutoff_mask;
1418 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1419 __m128 one = _mm_set1_ps(1.0);
1420 __m128 two = _mm_set1_ps(2.0);
Value stored to 'two' during its initialization is never read
1421 x = xx[0];
1422 f = ff[0];
1423
1424 nri = nlist->nri;
1425 iinr = nlist->iinr;
1426 jindex = nlist->jindex;
1427 jjnr = nlist->jjnr;
1428 shiftidx = nlist->shift;
1429 gid = nlist->gid;
1430 shiftvec = fr->shift_vec[0];
1431 fshift = fr->fshift[0];
1432 facel = _mm_set1_ps(fr->epsfac);
1433 charge = mdatoms->chargeA;
1434 nvdwtype = fr->ntype;
1435 vdwparam = fr->nbfp;
1436 vdwtype = mdatoms->typeA;
1437
1438 vftab = kernel_data->table_elec->data;
1439 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1440
1441 /* Setup water-specific parameters */
1442 inr = nlist->iinr[0];
1443 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1444 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1445 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1446 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1447
1448 jq1 = _mm_set1_ps(charge[inr+1]);
1449 jq2 = _mm_set1_ps(charge[inr+2]);
1450 jq3 = _mm_set1_ps(charge[inr+3]);
1451 vdwjidx0A = 2*vdwtype[inr+0];
1452 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1453 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1454 qq11 = _mm_mul_ps(iq1,jq1);
1455 qq12 = _mm_mul_ps(iq1,jq2);
1456 qq13 = _mm_mul_ps(iq1,jq3);
1457 qq21 = _mm_mul_ps(iq2,jq1);
1458 qq22 = _mm_mul_ps(iq2,jq2);
1459 qq23 = _mm_mul_ps(iq2,jq3);
1460 qq31 = _mm_mul_ps(iq3,jq1);
1461 qq32 = _mm_mul_ps(iq3,jq2);
1462 qq33 = _mm_mul_ps(iq3,jq3);
1463
1464 /* Avoid stupid compiler warnings */
1465 jnrA = jnrB = jnrC = jnrD = 0;
1466 j_coord_offsetA = 0;
1467 j_coord_offsetB = 0;
1468 j_coord_offsetC = 0;
1469 j_coord_offsetD = 0;
1470
1471 outeriter = 0;
1472 inneriter = 0;
1473
1474 for(iidx=0;iidx<4*DIM3;iidx++)
1475 {
1476 scratch[iidx] = 0.0;
1477 }
1478
1479 /* Start outer loop over neighborlists */
1480 for(iidx=0; iidx<nri; iidx++)
1481 {
1482 /* Load shift vector for this list */
1483 i_shift_offset = DIM3*shiftidx[iidx];
1484
1485 /* Load limits for loop over neighbors */
1486 j_index_start = jindex[iidx];
1487 j_index_end = jindex[iidx+1];
1488
1489 /* Get outer coordinate index */
1490 inr = iinr[iidx];
1491 i_coord_offset = DIM3*inr;
1492
1493 /* Load i particle coords and add shift vector */
1494 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1495 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1496
1497 fix0 = _mm_setzero_ps();
1498 fiy0 = _mm_setzero_ps();
1499 fiz0 = _mm_setzero_ps();
1500 fix1 = _mm_setzero_ps();
1501 fiy1 = _mm_setzero_ps();
1502 fiz1 = _mm_setzero_ps();
1503 fix2 = _mm_setzero_ps();
1504 fiy2 = _mm_setzero_ps();
1505 fiz2 = _mm_setzero_ps();
1506 fix3 = _mm_setzero_ps();
1507 fiy3 = _mm_setzero_ps();
1508 fiz3 = _mm_setzero_ps();
1509
1510 /* Start inner kernel loop */
1511 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1512 {
1513
1514 /* Get j neighbor index, and coordinate index */
1515 jnrA = jjnr[jidx];
1516 jnrB = jjnr[jidx+1];
1517 jnrC = jjnr[jidx+2];
1518 jnrD = jjnr[jidx+3];
1519 j_coord_offsetA = DIM3*jnrA;
1520 j_coord_offsetB = DIM3*jnrB;
1521 j_coord_offsetC = DIM3*jnrC;
1522 j_coord_offsetD = DIM3*jnrD;
1523
1524 /* load j atom coordinates */
1525 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1526 x+j_coord_offsetC,x+j_coord_offsetD,
1527 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1528 &jy2,&jz2,&jx3,&jy3,&jz3);
1529
1530 /* Calculate displacement vector */
1531 dx00 = _mm_sub_ps(ix0,jx0);
1532 dy00 = _mm_sub_ps(iy0,jy0);
1533 dz00 = _mm_sub_ps(iz0,jz0);
1534 dx11 = _mm_sub_ps(ix1,jx1);
1535 dy11 = _mm_sub_ps(iy1,jy1);
1536 dz11 = _mm_sub_ps(iz1,jz1);
1537 dx12 = _mm_sub_ps(ix1,jx2);
1538 dy12 = _mm_sub_ps(iy1,jy2);
1539 dz12 = _mm_sub_ps(iz1,jz2);
1540 dx13 = _mm_sub_ps(ix1,jx3);
1541 dy13 = _mm_sub_ps(iy1,jy3);
1542 dz13 = _mm_sub_ps(iz1,jz3);
1543 dx21 = _mm_sub_ps(ix2,jx1);
1544 dy21 = _mm_sub_ps(iy2,jy1);
1545 dz21 = _mm_sub_ps(iz2,jz1);
1546 dx22 = _mm_sub_ps(ix2,jx2);
1547 dy22 = _mm_sub_ps(iy2,jy2);
1548 dz22 = _mm_sub_ps(iz2,jz2);
1549 dx23 = _mm_sub_ps(ix2,jx3);
1550 dy23 = _mm_sub_ps(iy2,jy3);
1551 dz23 = _mm_sub_ps(iz2,jz3);
1552 dx31 = _mm_sub_ps(ix3,jx1);
1553 dy31 = _mm_sub_ps(iy3,jy1);
1554 dz31 = _mm_sub_ps(iz3,jz1);
1555 dx32 = _mm_sub_ps(ix3,jx2);
1556 dy32 = _mm_sub_ps(iy3,jy2);
1557 dz32 = _mm_sub_ps(iz3,jz2);
1558 dx33 = _mm_sub_ps(ix3,jx3);
1559 dy33 = _mm_sub_ps(iy3,jy3);
1560 dz33 = _mm_sub_ps(iz3,jz3);
1561
1562 /* Calculate squared distance and things based on it */
1563 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1564 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1565 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1566 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1567 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1568 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1569 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1570 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1571 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1572 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1573
1574 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1575 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1576 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
1577 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1578 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1579 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
1580 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
1581 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
1582 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
1583
1584 rinvsq00 = gmx_mm_inv_psgmx_simd_inv_f(rsq00);
1585
1586 fjx0 = _mm_setzero_ps();
1587 fjy0 = _mm_setzero_ps();
1588 fjz0 = _mm_setzero_ps();
1589 fjx1 = _mm_setzero_ps();
1590 fjy1 = _mm_setzero_ps();
1591 fjz1 = _mm_setzero_ps();
1592 fjx2 = _mm_setzero_ps();
1593 fjy2 = _mm_setzero_ps();
1594 fjz2 = _mm_setzero_ps();
1595 fjx3 = _mm_setzero_ps();
1596 fjy3 = _mm_setzero_ps();
1597 fjz3 = _mm_setzero_ps();
1598
1599 /**************************
1600 * CALCULATE INTERACTIONS *
1601 **************************/
1602
1603 /* LENNARD-JONES DISPERSION/REPULSION */
1604
1605 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1606 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1607
1608 fscal = fvdw;
1609
1610 /* Calculate temporary vectorial force */
1611 tx = _mm_mul_ps(fscal,dx00);
1612 ty = _mm_mul_ps(fscal,dy00);
1613 tz = _mm_mul_ps(fscal,dz00);
1614
1615 /* Update vectorial force */
1616 fix0 = _mm_add_ps(fix0,tx);
1617 fiy0 = _mm_add_ps(fiy0,ty);
1618 fiz0 = _mm_add_ps(fiz0,tz);
1619
1620 fjx0 = _mm_add_ps(fjx0,tx);
1621 fjy0 = _mm_add_ps(fjy0,ty);
1622 fjz0 = _mm_add_ps(fjz0,tz);
1623
1624 /**************************
1625 * CALCULATE INTERACTIONS *
1626 **************************/
1627
1628 r11 = _mm_mul_ps(rsq11,rinv11);
1629
1630 /* Calculate table index by multiplying r with table scale and truncate to integer */
1631 rt = _mm_mul_ps(r11,vftabscale);
1632 vfitab = _mm_cvttps_epi32(rt);
1633 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1634 vfitab = _mm_slli_epi32(vfitab,2);
1635
1636 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1637 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1638 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1639 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1640 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1641 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1642 Heps = _mm_mul_ps(vfeps,H);
1643 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1644 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1645 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1646
1647 fscal = felec;
1648
1649 /* Calculate temporary vectorial force */
1650 tx = _mm_mul_ps(fscal,dx11);
1651 ty = _mm_mul_ps(fscal,dy11);
1652 tz = _mm_mul_ps(fscal,dz11);
1653
1654 /* Update vectorial force */
1655 fix1 = _mm_add_ps(fix1,tx);
1656 fiy1 = _mm_add_ps(fiy1,ty);
1657 fiz1 = _mm_add_ps(fiz1,tz);
1658
1659 fjx1 = _mm_add_ps(fjx1,tx);
1660 fjy1 = _mm_add_ps(fjy1,ty);
1661 fjz1 = _mm_add_ps(fjz1,tz);
1662
1663 /**************************
1664 * CALCULATE INTERACTIONS *
1665 **************************/
1666
1667 r12 = _mm_mul_ps(rsq12,rinv12);
1668
1669 /* Calculate table index by multiplying r with table scale and truncate to integer */
1670 rt = _mm_mul_ps(r12,vftabscale);
1671 vfitab = _mm_cvttps_epi32(rt);
1672 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1673 vfitab = _mm_slli_epi32(vfitab,2);
1674
1675 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1676 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1677 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1678 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1679 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1680 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1681 Heps = _mm_mul_ps(vfeps,H);
1682 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1683 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1684 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1685
1686 fscal = felec;
1687
1688 /* Calculate temporary vectorial force */
1689 tx = _mm_mul_ps(fscal,dx12);
1690 ty = _mm_mul_ps(fscal,dy12);
1691 tz = _mm_mul_ps(fscal,dz12);
1692
1693 /* Update vectorial force */
1694 fix1 = _mm_add_ps(fix1,tx);
1695 fiy1 = _mm_add_ps(fiy1,ty);
1696 fiz1 = _mm_add_ps(fiz1,tz);
1697
1698 fjx2 = _mm_add_ps(fjx2,tx);
1699 fjy2 = _mm_add_ps(fjy2,ty);
1700 fjz2 = _mm_add_ps(fjz2,tz);
1701
1702 /**************************
1703 * CALCULATE INTERACTIONS *
1704 **************************/
1705
1706 r13 = _mm_mul_ps(rsq13,rinv13);
1707
1708 /* Calculate table index by multiplying r with table scale and truncate to integer */
1709 rt = _mm_mul_ps(r13,vftabscale);
1710 vfitab = _mm_cvttps_epi32(rt);
1711 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1712 vfitab = _mm_slli_epi32(vfitab,2);
1713
1714 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1715 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1716 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1717 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1718 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1719 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1720 Heps = _mm_mul_ps(vfeps,H);
1721 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1722 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1723 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1724
1725 fscal = felec;
1726
1727 /* Calculate temporary vectorial force */
1728 tx = _mm_mul_ps(fscal,dx13);
1729 ty = _mm_mul_ps(fscal,dy13);
1730 tz = _mm_mul_ps(fscal,dz13);
1731
1732 /* Update vectorial force */
1733 fix1 = _mm_add_ps(fix1,tx);
1734 fiy1 = _mm_add_ps(fiy1,ty);
1735 fiz1 = _mm_add_ps(fiz1,tz);
1736
1737 fjx3 = _mm_add_ps(fjx3,tx);
1738 fjy3 = _mm_add_ps(fjy3,ty);
1739 fjz3 = _mm_add_ps(fjz3,tz);
1740
1741 /**************************
1742 * CALCULATE INTERACTIONS *
1743 **************************/
1744
1745 r21 = _mm_mul_ps(rsq21,rinv21);
1746
1747 /* Calculate table index by multiplying r with table scale and truncate to integer */
1748 rt = _mm_mul_ps(r21,vftabscale);
1749 vfitab = _mm_cvttps_epi32(rt);
1750 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1751 vfitab = _mm_slli_epi32(vfitab,2);
1752
1753 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1754 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1755 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1756 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1757 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1758 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1759 Heps = _mm_mul_ps(vfeps,H);
1760 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1761 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1762 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1763
1764 fscal = felec;
1765
1766 /* Calculate temporary vectorial force */
1767 tx = _mm_mul_ps(fscal,dx21);
1768 ty = _mm_mul_ps(fscal,dy21);
1769 tz = _mm_mul_ps(fscal,dz21);
1770
1771 /* Update vectorial force */
1772 fix2 = _mm_add_ps(fix2,tx);
1773 fiy2 = _mm_add_ps(fiy2,ty);
1774 fiz2 = _mm_add_ps(fiz2,tz);
1775
1776 fjx1 = _mm_add_ps(fjx1,tx);
1777 fjy1 = _mm_add_ps(fjy1,ty);
1778 fjz1 = _mm_add_ps(fjz1,tz);
1779
1780 /**************************
1781 * CALCULATE INTERACTIONS *
1782 **************************/
1783
1784 r22 = _mm_mul_ps(rsq22,rinv22);
1785
1786 /* Calculate table index by multiplying r with table scale and truncate to integer */
1787 rt = _mm_mul_ps(r22,vftabscale);
1788 vfitab = _mm_cvttps_epi32(rt);
1789 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1790 vfitab = _mm_slli_epi32(vfitab,2);
1791
1792 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1793 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1794 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1795 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1796 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1797 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1798 Heps = _mm_mul_ps(vfeps,H);
1799 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1800 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1801 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1802
1803 fscal = felec;
1804
1805 /* Calculate temporary vectorial force */
1806 tx = _mm_mul_ps(fscal,dx22);
1807 ty = _mm_mul_ps(fscal,dy22);
1808 tz = _mm_mul_ps(fscal,dz22);
1809
1810 /* Update vectorial force */
1811 fix2 = _mm_add_ps(fix2,tx);
1812 fiy2 = _mm_add_ps(fiy2,ty);
1813 fiz2 = _mm_add_ps(fiz2,tz);
1814
1815 fjx2 = _mm_add_ps(fjx2,tx);
1816 fjy2 = _mm_add_ps(fjy2,ty);
1817 fjz2 = _mm_add_ps(fjz2,tz);
1818
1819 /**************************
1820 * CALCULATE INTERACTIONS *
1821 **************************/
1822
1823 r23 = _mm_mul_ps(rsq23,rinv23);
1824
1825 /* Calculate table index by multiplying r with table scale and truncate to integer */
1826 rt = _mm_mul_ps(r23,vftabscale);
1827 vfitab = _mm_cvttps_epi32(rt);
1828 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1829 vfitab = _mm_slli_epi32(vfitab,2);
1830
1831 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1832 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1833 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1834 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1835 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1836 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1837 Heps = _mm_mul_ps(vfeps,H);
1838 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1839 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1840 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1841
1842 fscal = felec;
1843
1844 /* Calculate temporary vectorial force */
1845 tx = _mm_mul_ps(fscal,dx23);
1846 ty = _mm_mul_ps(fscal,dy23);
1847 tz = _mm_mul_ps(fscal,dz23);
1848
1849 /* Update vectorial force */
1850 fix2 = _mm_add_ps(fix2,tx);
1851 fiy2 = _mm_add_ps(fiy2,ty);
1852 fiz2 = _mm_add_ps(fiz2,tz);
1853
1854 fjx3 = _mm_add_ps(fjx3,tx);
1855 fjy3 = _mm_add_ps(fjy3,ty);
1856 fjz3 = _mm_add_ps(fjz3,tz);
1857
1858 /**************************
1859 * CALCULATE INTERACTIONS *
1860 **************************/
1861
1862 r31 = _mm_mul_ps(rsq31,rinv31);
1863
1864 /* Calculate table index by multiplying r with table scale and truncate to integer */
1865 rt = _mm_mul_ps(r31,vftabscale);
1866 vfitab = _mm_cvttps_epi32(rt);
1867 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1868 vfitab = _mm_slli_epi32(vfitab,2);
1869
1870 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1871 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1872 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1873 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1874 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1875 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1876 Heps = _mm_mul_ps(vfeps,H);
1877 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1878 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1879 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1880
1881 fscal = felec;
1882
1883 /* Calculate temporary vectorial force */
1884 tx = _mm_mul_ps(fscal,dx31);
1885 ty = _mm_mul_ps(fscal,dy31);
1886 tz = _mm_mul_ps(fscal,dz31);
1887
1888 /* Update vectorial force */
1889 fix3 = _mm_add_ps(fix3,tx);
1890 fiy3 = _mm_add_ps(fiy3,ty);
1891 fiz3 = _mm_add_ps(fiz3,tz);
1892
1893 fjx1 = _mm_add_ps(fjx1,tx);
1894 fjy1 = _mm_add_ps(fjy1,ty);
1895 fjz1 = _mm_add_ps(fjz1,tz);
1896
1897 /**************************
1898 * CALCULATE INTERACTIONS *
1899 **************************/
1900
1901 r32 = _mm_mul_ps(rsq32,rinv32);
1902
1903 /* Calculate table index by multiplying r with table scale and truncate to integer */
1904 rt = _mm_mul_ps(r32,vftabscale);
1905 vfitab = _mm_cvttps_epi32(rt);
1906 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1907 vfitab = _mm_slli_epi32(vfitab,2);
1908
1909 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1910 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1911 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1912 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1913 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1914 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1915 Heps = _mm_mul_ps(vfeps,H);
1916 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1917 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1918 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1919
1920 fscal = felec;
1921
1922 /* Calculate temporary vectorial force */
1923 tx = _mm_mul_ps(fscal,dx32);
1924 ty = _mm_mul_ps(fscal,dy32);
1925 tz = _mm_mul_ps(fscal,dz32);
1926
1927 /* Update vectorial force */
1928 fix3 = _mm_add_ps(fix3,tx);
1929 fiy3 = _mm_add_ps(fiy3,ty);
1930 fiz3 = _mm_add_ps(fiz3,tz);
1931
1932 fjx2 = _mm_add_ps(fjx2,tx);
1933 fjy2 = _mm_add_ps(fjy2,ty);
1934 fjz2 = _mm_add_ps(fjz2,tz);
1935
1936 /**************************
1937 * CALCULATE INTERACTIONS *
1938 **************************/
1939
1940 r33 = _mm_mul_ps(rsq33,rinv33);
1941
1942 /* Calculate table index by multiplying r with table scale and truncate to integer */
1943 rt = _mm_mul_ps(r33,vftabscale);
1944 vfitab = _mm_cvttps_epi32(rt);
1945 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1946 vfitab = _mm_slli_epi32(vfitab,2);
1947
1948 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1949 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
1950 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
1951 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
1952 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
1953 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
1954 Heps = _mm_mul_ps(vfeps,H);
1955 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1956 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1957 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1958
1959 fscal = felec;
1960
1961 /* Calculate temporary vectorial force */
1962 tx = _mm_mul_ps(fscal,dx33);
1963 ty = _mm_mul_ps(fscal,dy33);
1964 tz = _mm_mul_ps(fscal,dz33);
1965
1966 /* Update vectorial force */
1967 fix3 = _mm_add_ps(fix3,tx);
1968 fiy3 = _mm_add_ps(fiy3,ty);
1969 fiz3 = _mm_add_ps(fiz3,tz);
1970
1971 fjx3 = _mm_add_ps(fjx3,tx);
1972 fjy3 = _mm_add_ps(fjy3,ty);
1973 fjz3 = _mm_add_ps(fjz3,tz);
1974
1975 fjptrA = f+j_coord_offsetA;
1976 fjptrB = f+j_coord_offsetB;
1977 fjptrC = f+j_coord_offsetC;
1978 fjptrD = f+j_coord_offsetD;
1979
1980 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1981 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1982 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1983
1984 /* Inner loop uses 381 flops */
1985 }
1986
1987 if(jidx<j_index_end)
1988 {
1989
1990 /* Get j neighbor index, and coordinate index */
1991 jnrlistA = jjnr[jidx];
1992 jnrlistB = jjnr[jidx+1];
1993 jnrlistC = jjnr[jidx+2];
1994 jnrlistD = jjnr[jidx+3];
1995 /* Sign of each element will be negative for non-real atoms.
1996 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1997 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1998 */
1999 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2000 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2001 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2002 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2003 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2004 j_coord_offsetA = DIM3*jnrA;
2005 j_coord_offsetB = DIM3*jnrB;
2006 j_coord_offsetC = DIM3*jnrC;
2007 j_coord_offsetD = DIM3*jnrD;
2008
2009 /* load j atom coordinates */
2010 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2011 x+j_coord_offsetC,x+j_coord_offsetD,
2012 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2013 &jy2,&jz2,&jx3,&jy3,&jz3);
2014
2015 /* Calculate displacement vector */
2016 dx00 = _mm_sub_ps(ix0,jx0);
2017 dy00 = _mm_sub_ps(iy0,jy0);
2018 dz00 = _mm_sub_ps(iz0,jz0);
2019 dx11 = _mm_sub_ps(ix1,jx1);
2020 dy11 = _mm_sub_ps(iy1,jy1);
2021 dz11 = _mm_sub_ps(iz1,jz1);
2022 dx12 = _mm_sub_ps(ix1,jx2);
2023 dy12 = _mm_sub_ps(iy1,jy2);
2024 dz12 = _mm_sub_ps(iz1,jz2);
2025 dx13 = _mm_sub_ps(ix1,jx3);
2026 dy13 = _mm_sub_ps(iy1,jy3);
2027 dz13 = _mm_sub_ps(iz1,jz3);
2028 dx21 = _mm_sub_ps(ix2,jx1);
2029 dy21 = _mm_sub_ps(iy2,jy1);
2030 dz21 = _mm_sub_ps(iz2,jz1);
2031 dx22 = _mm_sub_ps(ix2,jx2);
2032 dy22 = _mm_sub_ps(iy2,jy2);
2033 dz22 = _mm_sub_ps(iz2,jz2);
2034 dx23 = _mm_sub_ps(ix2,jx3);
2035 dy23 = _mm_sub_ps(iy2,jy3);
2036 dz23 = _mm_sub_ps(iz2,jz3);
2037 dx31 = _mm_sub_ps(ix3,jx1);
2038 dy31 = _mm_sub_ps(iy3,jy1);
2039 dz31 = _mm_sub_ps(iz3,jz1);
2040 dx32 = _mm_sub_ps(ix3,jx2);
2041 dy32 = _mm_sub_ps(iy3,jy2);
2042 dz32 = _mm_sub_ps(iz3,jz2);
2043 dx33 = _mm_sub_ps(ix3,jx3);
2044 dy33 = _mm_sub_ps(iy3,jy3);
2045 dz33 = _mm_sub_ps(iz3,jz3);
2046
2047 /* Calculate squared distance and things based on it */
2048 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2049 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2050 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2051 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2052 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2053 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2054 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2055 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2056 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2057 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2058
2059 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
2060 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
2061 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
2062 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
2063 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
2064 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
2065 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
2066 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
2067 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
2068
2069 rinvsq00 = gmx_mm_inv_psgmx_simd_inv_f(rsq00);
2070
2071 fjx0 = _mm_setzero_ps();
2072 fjy0 = _mm_setzero_ps();
2073 fjz0 = _mm_setzero_ps();
2074 fjx1 = _mm_setzero_ps();
2075 fjy1 = _mm_setzero_ps();
2076 fjz1 = _mm_setzero_ps();
2077 fjx2 = _mm_setzero_ps();
2078 fjy2 = _mm_setzero_ps();
2079 fjz2 = _mm_setzero_ps();
2080 fjx3 = _mm_setzero_ps();
2081 fjy3 = _mm_setzero_ps();
2082 fjz3 = _mm_setzero_ps();
2083
2084 /**************************
2085 * CALCULATE INTERACTIONS *
2086 **************************/
2087
2088 /* LENNARD-JONES DISPERSION/REPULSION */
2089
2090 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2091 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2092
2093 fscal = fvdw;
2094
2095 fscal = _mm_andnot_ps(dummy_mask,fscal);
2096
2097 /* Calculate temporary vectorial force */
2098 tx = _mm_mul_ps(fscal,dx00);
2099 ty = _mm_mul_ps(fscal,dy00);
2100 tz = _mm_mul_ps(fscal,dz00);
2101
2102 /* Update vectorial force */
2103 fix0 = _mm_add_ps(fix0,tx);
2104 fiy0 = _mm_add_ps(fiy0,ty);
2105 fiz0 = _mm_add_ps(fiz0,tz);
2106
2107 fjx0 = _mm_add_ps(fjx0,tx);
2108 fjy0 = _mm_add_ps(fjy0,ty);
2109 fjz0 = _mm_add_ps(fjz0,tz);
2110
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2114
2115 r11 = _mm_mul_ps(rsq11,rinv11);
2116 r11 = _mm_andnot_ps(dummy_mask,r11);
2117
2118 /* Calculate table index by multiplying r with table scale and truncate to integer */
2119 rt = _mm_mul_ps(r11,vftabscale);
2120 vfitab = _mm_cvttps_epi32(rt);
2121 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2122 vfitab = _mm_slli_epi32(vfitab,2);
2123
2124 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2125 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
2126 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
2127 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
2128 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
2129 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
2130 Heps = _mm_mul_ps(vfeps,H);
2131 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2132 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2133 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2134
2135 fscal = felec;
2136
2137 fscal = _mm_andnot_ps(dummy_mask,fscal);
2138
2139 /* Calculate temporary vectorial force */
2140 tx = _mm_mul_ps(fscal,dx11);
2141 ty = _mm_mul_ps(fscal,dy11);
2142 tz = _mm_mul_ps(fscal,dz11);
2143
2144 /* Update vectorial force */
2145 fix1 = _mm_add_ps(fix1,tx);
2146 fiy1 = _mm_add_ps(fiy1,ty);
2147 fiz1 = _mm_add_ps(fiz1,tz);
2148
2149 fjx1 = _mm_add_ps(fjx1,tx);
2150 fjy1 = _mm_add_ps(fjy1,ty);
2151 fjz1 = _mm_add_ps(fjz1,tz);
2152
2153 /**************************
2154 * CALCULATE INTERACTIONS *
2155 **************************/
2156
2157 r12 = _mm_mul_ps(rsq12,rinv12);
2158 r12 = _mm_andnot_ps(dummy_mask,r12);
2159
2160 /* Calculate table index by multiplying r with table scale and truncate to integer */
2161 rt = _mm_mul_ps(r12,vftabscale);
2162 vfitab = _mm_cvttps_epi32(rt);
2163 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2164 vfitab = _mm_slli_epi32(vfitab,2);
2165
2166 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2167 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
2168 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
2169 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
2170 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
2171 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
2172 Heps = _mm_mul_ps(vfeps,H);
2173 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2174 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2175 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2176
2177 fscal = felec;
2178
2179 fscal = _mm_andnot_ps(dummy_mask,fscal);
2180
2181 /* Calculate temporary vectorial force */
2182 tx = _mm_mul_ps(fscal,dx12);
2183 ty = _mm_mul_ps(fscal,dy12);
2184 tz = _mm_mul_ps(fscal,dz12);
2185
2186 /* Update vectorial force */
2187 fix1 = _mm_add_ps(fix1,tx);
2188 fiy1 = _mm_add_ps(fiy1,ty);
2189 fiz1 = _mm_add_ps(fiz1,tz);
2190
2191 fjx2 = _mm_add_ps(fjx2,tx);
2192 fjy2 = _mm_add_ps(fjy2,ty);
2193 fjz2 = _mm_add_ps(fjz2,tz);
2194
2195 /**************************
2196 * CALCULATE INTERACTIONS *
2197 **************************/
2198
2199 r13 = _mm_mul_ps(rsq13,rinv13);
2200 r13 = _mm_andnot_ps(dummy_mask,r13);
2201
2202 /* Calculate table index by multiplying r with table scale and truncate to integer */
2203 rt = _mm_mul_ps(r13,vftabscale);
2204 vfitab = _mm_cvttps_epi32(rt);
2205 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2206 vfitab = _mm_slli_epi32(vfitab,2);
2207
2208 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2209 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
2210 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
2211 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
2212 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
2213 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
2214 Heps = _mm_mul_ps(vfeps,H);
2215 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2216 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2217 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2218
2219 fscal = felec;
2220
2221 fscal = _mm_andnot_ps(dummy_mask,fscal);
2222
2223 /* Calculate temporary vectorial force */
2224 tx = _mm_mul_ps(fscal,dx13);
2225 ty = _mm_mul_ps(fscal,dy13);
2226 tz = _mm_mul_ps(fscal,dz13);
2227
2228 /* Update vectorial force */
2229 fix1 = _mm_add_ps(fix1,tx);
2230 fiy1 = _mm_add_ps(fiy1,ty);
2231 fiz1 = _mm_add_ps(fiz1,tz);
2232
2233 fjx3 = _mm_add_ps(fjx3,tx);
2234 fjy3 = _mm_add_ps(fjy3,ty);
2235 fjz3 = _mm_add_ps(fjz3,tz);
2236
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2240
2241 r21 = _mm_mul_ps(rsq21,rinv21);
2242 r21 = _mm_andnot_ps(dummy_mask,r21);
2243
2244 /* Calculate table index by multiplying r with table scale and truncate to integer */
2245 rt = _mm_mul_ps(r21,vftabscale);
2246 vfitab = _mm_cvttps_epi32(rt);
2247 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2248 vfitab = _mm_slli_epi32(vfitab,2);
2249
2250 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2251 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
2252 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
2253 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
2254 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
2255 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
2256 Heps = _mm_mul_ps(vfeps,H);
2257 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2258 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2259 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2260
2261 fscal = felec;
2262
2263 fscal = _mm_andnot_ps(dummy_mask,fscal);
2264
2265 /* Calculate temporary vectorial force */
2266 tx = _mm_mul_ps(fscal,dx21);
2267 ty = _mm_mul_ps(fscal,dy21);
2268 tz = _mm_mul_ps(fscal,dz21);
2269
2270 /* Update vectorial force */
2271 fix2 = _mm_add_ps(fix2,tx);
2272 fiy2 = _mm_add_ps(fiy2,ty);
2273 fiz2 = _mm_add_ps(fiz2,tz);
2274
2275 fjx1 = _mm_add_ps(fjx1,tx);
2276 fjy1 = _mm_add_ps(fjy1,ty);
2277 fjz1 = _mm_add_ps(fjz1,tz);
2278
2279 /**************************
2280 * CALCULATE INTERACTIONS *
2281 **************************/
2282
2283 r22 = _mm_mul_ps(rsq22,rinv22);
2284 r22 = _mm_andnot_ps(dummy_mask,r22);
2285
2286 /* Calculate table index by multiplying r with table scale and truncate to integer */
2287 rt = _mm_mul_ps(r22,vftabscale);
2288 vfitab = _mm_cvttps_epi32(rt);
2289 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2290 vfitab = _mm_slli_epi32(vfitab,2);
2291
2292 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2293 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
2294 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
2295 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
2296 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
2297 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
2298 Heps = _mm_mul_ps(vfeps,H);
2299 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2300 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2301 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2302
2303 fscal = felec;
2304
2305 fscal = _mm_andnot_ps(dummy_mask,fscal);
2306
2307 /* Calculate temporary vectorial force */
2308 tx = _mm_mul_ps(fscal,dx22);
2309 ty = _mm_mul_ps(fscal,dy22);
2310 tz = _mm_mul_ps(fscal,dz22);
2311
2312 /* Update vectorial force */
2313 fix2 = _mm_add_ps(fix2,tx);
2314 fiy2 = _mm_add_ps(fiy2,ty);
2315 fiz2 = _mm_add_ps(fiz2,tz);
2316
2317 fjx2 = _mm_add_ps(fjx2,tx);
2318 fjy2 = _mm_add_ps(fjy2,ty);
2319 fjz2 = _mm_add_ps(fjz2,tz);
2320
2321 /**************************
2322 * CALCULATE INTERACTIONS *
2323 **************************/
2324
2325 r23 = _mm_mul_ps(rsq23,rinv23);
2326 r23 = _mm_andnot_ps(dummy_mask,r23);
2327
2328 /* Calculate table index by multiplying r with table scale and truncate to integer */
2329 rt = _mm_mul_ps(r23,vftabscale);
2330 vfitab = _mm_cvttps_epi32(rt);
2331 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2332 vfitab = _mm_slli_epi32(vfitab,2);
2333
2334 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2335 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
2336 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
2337 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
2338 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
2339 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
2340 Heps = _mm_mul_ps(vfeps,H);
2341 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2342 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2343 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2344
2345 fscal = felec;
2346
2347 fscal = _mm_andnot_ps(dummy_mask,fscal);
2348
2349 /* Calculate temporary vectorial force */
2350 tx = _mm_mul_ps(fscal,dx23);
2351 ty = _mm_mul_ps(fscal,dy23);
2352 tz = _mm_mul_ps(fscal,dz23);
2353
2354 /* Update vectorial force */
2355 fix2 = _mm_add_ps(fix2,tx);
2356 fiy2 = _mm_add_ps(fiy2,ty);
2357 fiz2 = _mm_add_ps(fiz2,tz);
2358
2359 fjx3 = _mm_add_ps(fjx3,tx);
2360 fjy3 = _mm_add_ps(fjy3,ty);
2361 fjz3 = _mm_add_ps(fjz3,tz);
2362
2363 /**************************
2364 * CALCULATE INTERACTIONS *
2365 **************************/
2366
2367 r31 = _mm_mul_ps(rsq31,rinv31);
2368 r31 = _mm_andnot_ps(dummy_mask,r31);
2369
2370 /* Calculate table index by multiplying r with table scale and truncate to integer */
2371 rt = _mm_mul_ps(r31,vftabscale);
2372 vfitab = _mm_cvttps_epi32(rt);
2373 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2374 vfitab = _mm_slli_epi32(vfitab,2);
2375
2376 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2377 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
2378 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
2379 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
2380 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
2381 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
2382 Heps = _mm_mul_ps(vfeps,H);
2383 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2384 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2385 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2386
2387 fscal = felec;
2388
2389 fscal = _mm_andnot_ps(dummy_mask,fscal);
2390
2391 /* Calculate temporary vectorial force */
2392 tx = _mm_mul_ps(fscal,dx31);
2393 ty = _mm_mul_ps(fscal,dy31);
2394 tz = _mm_mul_ps(fscal,dz31);
2395
2396 /* Update vectorial force */
2397 fix3 = _mm_add_ps(fix3,tx);
2398 fiy3 = _mm_add_ps(fiy3,ty);
2399 fiz3 = _mm_add_ps(fiz3,tz);
2400
2401 fjx1 = _mm_add_ps(fjx1,tx);
2402 fjy1 = _mm_add_ps(fjy1,ty);
2403 fjz1 = _mm_add_ps(fjz1,tz);
2404
2405 /**************************
2406 * CALCULATE INTERACTIONS *
2407 **************************/
2408
2409 r32 = _mm_mul_ps(rsq32,rinv32);
2410 r32 = _mm_andnot_ps(dummy_mask,r32);
2411
2412 /* Calculate table index by multiplying r with table scale and truncate to integer */
2413 rt = _mm_mul_ps(r32,vftabscale);
2414 vfitab = _mm_cvttps_epi32(rt);
2415 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2416 vfitab = _mm_slli_epi32(vfitab,2);
2417
2418 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2419 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
2420 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
2421 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
2422 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
2423 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
2424 Heps = _mm_mul_ps(vfeps,H);
2425 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2426 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2427 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2428
2429 fscal = felec;
2430
2431 fscal = _mm_andnot_ps(dummy_mask,fscal);
2432
2433 /* Calculate temporary vectorial force */
2434 tx = _mm_mul_ps(fscal,dx32);
2435 ty = _mm_mul_ps(fscal,dy32);
2436 tz = _mm_mul_ps(fscal,dz32);
2437
2438 /* Update vectorial force */
2439 fix3 = _mm_add_ps(fix3,tx);
2440 fiy3 = _mm_add_ps(fiy3,ty);
2441 fiz3 = _mm_add_ps(fiz3,tz);
2442
2443 fjx2 = _mm_add_ps(fjx2,tx);
2444 fjy2 = _mm_add_ps(fjy2,ty);
2445 fjz2 = _mm_add_ps(fjz2,tz);
2446
2447 /**************************
2448 * CALCULATE INTERACTIONS *
2449 **************************/
2450
2451 r33 = _mm_mul_ps(rsq33,rinv33);
2452 r33 = _mm_andnot_ps(dummy_mask,r33);
2453
2454 /* Calculate table index by multiplying r with table scale and truncate to integer */
2455 rt = _mm_mul_ps(r33,vftabscale);
2456 vfitab = _mm_cvttps_epi32(rt);
2457 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2458 vfitab = _mm_slli_epi32(vfitab,2);
2459
2460 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2461 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) &
3];}))
);
2462 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) &
3];}))
);
2463 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) &
3];}))
);
2464 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) &
3];}))
);
2465 _MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y
), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps
((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps
(tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps
(tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0)
;
2466 Heps = _mm_mul_ps(vfeps,H);
2467 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2468 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2469 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2470
2471 fscal = felec;
2472
2473 fscal = _mm_andnot_ps(dummy_mask,fscal);
2474
2475 /* Calculate temporary vectorial force */
2476 tx = _mm_mul_ps(fscal,dx33);
2477 ty = _mm_mul_ps(fscal,dy33);
2478 tz = _mm_mul_ps(fscal,dz33);
2479
2480 /* Update vectorial force */
2481 fix3 = _mm_add_ps(fix3,tx);
2482 fiy3 = _mm_add_ps(fiy3,ty);
2483 fiz3 = _mm_add_ps(fiz3,tz);
2484
2485 fjx3 = _mm_add_ps(fjx3,tx);
2486 fjy3 = _mm_add_ps(fjy3,ty);
2487 fjz3 = _mm_add_ps(fjz3,tz);
2488
2489 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2490 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2491 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2492 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2493
2494 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2495 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2496 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2497
2498 /* Inner loop uses 390 flops */
2499 }
2500
2501 /* End of innermost loop */
2502
2503 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2504 f+i_coord_offset,fshift+i_shift_offset);
2505
2506 /* Increment number of inner iterations */
2507 inneriter += j_index_end - j_index_start;
2508
2509 /* Outer loop uses 24 flops */
2510 }
2511
2512 /* Increment number of outer iterations */
2513 outeriter += nri;
2514
2515 /* Update outer/inner flops */
2516
2517 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*390)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4W4_F] += outeriter*24 + inneriter
*390
;
2518}