Bug Summary

File:gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse4_1_single.c
Location:line 193, column 5
Description:Value stored to 'j_coord_offsetB' is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LJEwald
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwioffset1;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 int vdwioffset2;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwioffset3;
92 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 real *charge;
113 int nvdwtype;
114 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 int *vdwtype;
116 real *vdwparam;
117 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
118 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
119 __m128 c6grid_00;
120 __m128 c6grid_11;
121 __m128 c6grid_12;
122 __m128 c6grid_13;
123 __m128 c6grid_21;
124 __m128 c6grid_22;
125 __m128 c6grid_23;
126 __m128 c6grid_31;
127 __m128 c6grid_32;
128 __m128 c6grid_33;
129 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
130 real *vdwgridparam;
131 __m128 one_half = _mm_set1_ps(0.5);
132 __m128 minus_one = _mm_set1_ps(-1.0);
133 __m128i ewitab;
134 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
135 real *ewtab;
136 __m128 dummy_mask,cutoff_mask;
137 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
138 __m128 one = _mm_set1_ps(1.0);
139 __m128 two = _mm_set1_ps(2.0);
140 x = xx[0];
141 f = ff[0];
142
143 nri = nlist->nri;
144 iinr = nlist->iinr;
145 jindex = nlist->jindex;
146 jjnr = nlist->jjnr;
147 shiftidx = nlist->shift;
148 gid = nlist->gid;
149 shiftvec = fr->shift_vec[0];
150 fshift = fr->fshift[0];
151 facel = _mm_set1_ps(fr->epsfac);
152 charge = mdatoms->chargeA;
153 nvdwtype = fr->ntype;
154 vdwparam = fr->nbfp;
155 vdwtype = mdatoms->typeA;
156 vdwgridparam = fr->ljpme_c6grid;
157 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
158 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
159 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
160
161 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
162 ewtab = fr->ic->tabq_coul_FDV0;
163 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
164 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
165
166 /* Setup water-specific parameters */
167 inr = nlist->iinr[0];
168 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
169 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
170 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
171 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
172
173 jq1 = _mm_set1_ps(charge[inr+1]);
174 jq2 = _mm_set1_ps(charge[inr+2]);
175 jq3 = _mm_set1_ps(charge[inr+3]);
176 vdwjidx0A = 2*vdwtype[inr+0];
177 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
178 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
179 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
180 qq11 = _mm_mul_ps(iq1,jq1);
181 qq12 = _mm_mul_ps(iq1,jq2);
182 qq13 = _mm_mul_ps(iq1,jq3);
183 qq21 = _mm_mul_ps(iq2,jq1);
184 qq22 = _mm_mul_ps(iq2,jq2);
185 qq23 = _mm_mul_ps(iq2,jq3);
186 qq31 = _mm_mul_ps(iq3,jq1);
187 qq32 = _mm_mul_ps(iq3,jq2);
188 qq33 = _mm_mul_ps(iq3,jq3);
189
190 /* Avoid stupid compiler warnings */
191 jnrA = jnrB = jnrC = jnrD = 0;
192 j_coord_offsetA = 0;
193 j_coord_offsetB = 0;
Value stored to 'j_coord_offsetB' is never read
194 j_coord_offsetC = 0;
195 j_coord_offsetD = 0;
196
197 outeriter = 0;
198 inneriter = 0;
199
200 for(iidx=0;iidx<4*DIM3;iidx++)
201 {
202 scratch[iidx] = 0.0;
203 }
204
205 /* Start outer loop over neighborlists */
206 for(iidx=0; iidx<nri; iidx++)
207 {
208 /* Load shift vector for this list */
209 i_shift_offset = DIM3*shiftidx[iidx];
210
211 /* Load limits for loop over neighbors */
212 j_index_start = jindex[iidx];
213 j_index_end = jindex[iidx+1];
214
215 /* Get outer coordinate index */
216 inr = iinr[iidx];
217 i_coord_offset = DIM3*inr;
218
219 /* Load i particle coords and add shift vector */
220 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
221 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
222
223 fix0 = _mm_setzero_ps();
224 fiy0 = _mm_setzero_ps();
225 fiz0 = _mm_setzero_ps();
226 fix1 = _mm_setzero_ps();
227 fiy1 = _mm_setzero_ps();
228 fiz1 = _mm_setzero_ps();
229 fix2 = _mm_setzero_ps();
230 fiy2 = _mm_setzero_ps();
231 fiz2 = _mm_setzero_ps();
232 fix3 = _mm_setzero_ps();
233 fiy3 = _mm_setzero_ps();
234 fiz3 = _mm_setzero_ps();
235
236 /* Reset potential sums */
237 velecsum = _mm_setzero_ps();
238 vvdwsum = _mm_setzero_ps();
239
240 /* Start inner kernel loop */
241 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
242 {
243
244 /* Get j neighbor index, and coordinate index */
245 jnrA = jjnr[jidx];
246 jnrB = jjnr[jidx+1];
247 jnrC = jjnr[jidx+2];
248 jnrD = jjnr[jidx+3];
249 j_coord_offsetA = DIM3*jnrA;
250 j_coord_offsetB = DIM3*jnrB;
251 j_coord_offsetC = DIM3*jnrC;
252 j_coord_offsetD = DIM3*jnrD;
253
254 /* load j atom coordinates */
255 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
256 x+j_coord_offsetC,x+j_coord_offsetD,
257 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
258 &jy2,&jz2,&jx3,&jy3,&jz3);
259
260 /* Calculate displacement vector */
261 dx00 = _mm_sub_ps(ix0,jx0);
262 dy00 = _mm_sub_ps(iy0,jy0);
263 dz00 = _mm_sub_ps(iz0,jz0);
264 dx11 = _mm_sub_ps(ix1,jx1);
265 dy11 = _mm_sub_ps(iy1,jy1);
266 dz11 = _mm_sub_ps(iz1,jz1);
267 dx12 = _mm_sub_ps(ix1,jx2);
268 dy12 = _mm_sub_ps(iy1,jy2);
269 dz12 = _mm_sub_ps(iz1,jz2);
270 dx13 = _mm_sub_ps(ix1,jx3);
271 dy13 = _mm_sub_ps(iy1,jy3);
272 dz13 = _mm_sub_ps(iz1,jz3);
273 dx21 = _mm_sub_ps(ix2,jx1);
274 dy21 = _mm_sub_ps(iy2,jy1);
275 dz21 = _mm_sub_ps(iz2,jz1);
276 dx22 = _mm_sub_ps(ix2,jx2);
277 dy22 = _mm_sub_ps(iy2,jy2);
278 dz22 = _mm_sub_ps(iz2,jz2);
279 dx23 = _mm_sub_ps(ix2,jx3);
280 dy23 = _mm_sub_ps(iy2,jy3);
281 dz23 = _mm_sub_ps(iz2,jz3);
282 dx31 = _mm_sub_ps(ix3,jx1);
283 dy31 = _mm_sub_ps(iy3,jy1);
284 dz31 = _mm_sub_ps(iz3,jz1);
285 dx32 = _mm_sub_ps(ix3,jx2);
286 dy32 = _mm_sub_ps(iy3,jy2);
287 dz32 = _mm_sub_ps(iz3,jz2);
288 dx33 = _mm_sub_ps(ix3,jx3);
289 dy33 = _mm_sub_ps(iy3,jy3);
290 dz33 = _mm_sub_ps(iz3,jz3);
291
292 /* Calculate squared distance and things based on it */
293 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
294 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
295 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
296 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
297 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
298 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
299 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
300 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
301 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
302 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
303
304 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
305 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
306 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
307 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
308 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
309 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
310 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
311 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
312 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
313 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
314
315 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
316 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
317 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
318 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
319 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
320 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
321 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
322 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
323 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
324 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
325
326 fjx0 = _mm_setzero_ps();
327 fjy0 = _mm_setzero_ps();
328 fjz0 = _mm_setzero_ps();
329 fjx1 = _mm_setzero_ps();
330 fjy1 = _mm_setzero_ps();
331 fjz1 = _mm_setzero_ps();
332 fjx2 = _mm_setzero_ps();
333 fjy2 = _mm_setzero_ps();
334 fjz2 = _mm_setzero_ps();
335 fjx3 = _mm_setzero_ps();
336 fjy3 = _mm_setzero_ps();
337 fjz3 = _mm_setzero_ps();
338
339 /**************************
340 * CALCULATE INTERACTIONS *
341 **************************/
342
343 r00 = _mm_mul_ps(rsq00,rinv00);
344
345 /* Analytical LJ-PME */
346 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
347 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
348 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
349 exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
350 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
351 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
352 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
353 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
354 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
355 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
356 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
357 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
358
359 /* Update potential sum for this i atom from the interaction with this j atom. */
360 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
361
362 fscal = fvdw;
363
364 /* Calculate temporary vectorial force */
365 tx = _mm_mul_ps(fscal,dx00);
366 ty = _mm_mul_ps(fscal,dy00);
367 tz = _mm_mul_ps(fscal,dz00);
368
369 /* Update vectorial force */
370 fix0 = _mm_add_ps(fix0,tx);
371 fiy0 = _mm_add_ps(fiy0,ty);
372 fiz0 = _mm_add_ps(fiz0,tz);
373
374 fjx0 = _mm_add_ps(fjx0,tx);
375 fjy0 = _mm_add_ps(fjy0,ty);
376 fjz0 = _mm_add_ps(fjz0,tz);
377
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
381
382 r11 = _mm_mul_ps(rsq11,rinv11);
383
384 /* EWALD ELECTROSTATICS */
385
386 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
387 ewrt = _mm_mul_ps(r11,ewtabscale);
388 ewitab = _mm_cvttps_epi32(ewrt);
389 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
390 ewitab = _mm_slli_epi32(ewitab,2);
391 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
392 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
393 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
394 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
395 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
396 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
397 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
398 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
399 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
400
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velecsum = _mm_add_ps(velecsum,velec);
403
404 fscal = felec;
405
406 /* Calculate temporary vectorial force */
407 tx = _mm_mul_ps(fscal,dx11);
408 ty = _mm_mul_ps(fscal,dy11);
409 tz = _mm_mul_ps(fscal,dz11);
410
411 /* Update vectorial force */
412 fix1 = _mm_add_ps(fix1,tx);
413 fiy1 = _mm_add_ps(fiy1,ty);
414 fiz1 = _mm_add_ps(fiz1,tz);
415
416 fjx1 = _mm_add_ps(fjx1,tx);
417 fjy1 = _mm_add_ps(fjy1,ty);
418 fjz1 = _mm_add_ps(fjz1,tz);
419
420 /**************************
421 * CALCULATE INTERACTIONS *
422 **************************/
423
424 r12 = _mm_mul_ps(rsq12,rinv12);
425
426 /* EWALD ELECTROSTATICS */
427
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt = _mm_mul_ps(r12,ewtabscale);
430 ewitab = _mm_cvttps_epi32(ewrt);
431 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
432 ewitab = _mm_slli_epi32(ewitab,2);
433 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
434 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
435 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
436 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
437 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
438 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
439 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
440 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
441 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
442
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velecsum = _mm_add_ps(velecsum,velec);
445
446 fscal = felec;
447
448 /* Calculate temporary vectorial force */
449 tx = _mm_mul_ps(fscal,dx12);
450 ty = _mm_mul_ps(fscal,dy12);
451 tz = _mm_mul_ps(fscal,dz12);
452
453 /* Update vectorial force */
454 fix1 = _mm_add_ps(fix1,tx);
455 fiy1 = _mm_add_ps(fiy1,ty);
456 fiz1 = _mm_add_ps(fiz1,tz);
457
458 fjx2 = _mm_add_ps(fjx2,tx);
459 fjy2 = _mm_add_ps(fjy2,ty);
460 fjz2 = _mm_add_ps(fjz2,tz);
461
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
465
466 r13 = _mm_mul_ps(rsq13,rinv13);
467
468 /* EWALD ELECTROSTATICS */
469
470 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
471 ewrt = _mm_mul_ps(r13,ewtabscale);
472 ewitab = _mm_cvttps_epi32(ewrt);
473 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
474 ewitab = _mm_slli_epi32(ewitab,2);
475 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
476 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
477 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
478 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
479 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
480 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
481 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
482 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
483 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
484
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velecsum = _mm_add_ps(velecsum,velec);
487
488 fscal = felec;
489
490 /* Calculate temporary vectorial force */
491 tx = _mm_mul_ps(fscal,dx13);
492 ty = _mm_mul_ps(fscal,dy13);
493 tz = _mm_mul_ps(fscal,dz13);
494
495 /* Update vectorial force */
496 fix1 = _mm_add_ps(fix1,tx);
497 fiy1 = _mm_add_ps(fiy1,ty);
498 fiz1 = _mm_add_ps(fiz1,tz);
499
500 fjx3 = _mm_add_ps(fjx3,tx);
501 fjy3 = _mm_add_ps(fjy3,ty);
502 fjz3 = _mm_add_ps(fjz3,tz);
503
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
507
508 r21 = _mm_mul_ps(rsq21,rinv21);
509
510 /* EWALD ELECTROSTATICS */
511
512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
513 ewrt = _mm_mul_ps(r21,ewtabscale);
514 ewitab = _mm_cvttps_epi32(ewrt);
515 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
516 ewitab = _mm_slli_epi32(ewitab,2);
517 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
518 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
519 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
520 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
521 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
522 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
523 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
524 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
525 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
526
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velecsum = _mm_add_ps(velecsum,velec);
529
530 fscal = felec;
531
532 /* Calculate temporary vectorial force */
533 tx = _mm_mul_ps(fscal,dx21);
534 ty = _mm_mul_ps(fscal,dy21);
535 tz = _mm_mul_ps(fscal,dz21);
536
537 /* Update vectorial force */
538 fix2 = _mm_add_ps(fix2,tx);
539 fiy2 = _mm_add_ps(fiy2,ty);
540 fiz2 = _mm_add_ps(fiz2,tz);
541
542 fjx1 = _mm_add_ps(fjx1,tx);
543 fjy1 = _mm_add_ps(fjy1,ty);
544 fjz1 = _mm_add_ps(fjz1,tz);
545
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
549
550 r22 = _mm_mul_ps(rsq22,rinv22);
551
552 /* EWALD ELECTROSTATICS */
553
554 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
555 ewrt = _mm_mul_ps(r22,ewtabscale);
556 ewitab = _mm_cvttps_epi32(ewrt);
557 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
558 ewitab = _mm_slli_epi32(ewitab,2);
559 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
560 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
561 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
562 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
563 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
564 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
565 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
566 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
567 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
568
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velecsum = _mm_add_ps(velecsum,velec);
571
572 fscal = felec;
573
574 /* Calculate temporary vectorial force */
575 tx = _mm_mul_ps(fscal,dx22);
576 ty = _mm_mul_ps(fscal,dy22);
577 tz = _mm_mul_ps(fscal,dz22);
578
579 /* Update vectorial force */
580 fix2 = _mm_add_ps(fix2,tx);
581 fiy2 = _mm_add_ps(fiy2,ty);
582 fiz2 = _mm_add_ps(fiz2,tz);
583
584 fjx2 = _mm_add_ps(fjx2,tx);
585 fjy2 = _mm_add_ps(fjy2,ty);
586 fjz2 = _mm_add_ps(fjz2,tz);
587
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
591
592 r23 = _mm_mul_ps(rsq23,rinv23);
593
594 /* EWALD ELECTROSTATICS */
595
596 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
597 ewrt = _mm_mul_ps(r23,ewtabscale);
598 ewitab = _mm_cvttps_epi32(ewrt);
599 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
600 ewitab = _mm_slli_epi32(ewitab,2);
601 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
602 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
603 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
604 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
605 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
606 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
607 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
608 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
609 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
610
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velecsum = _mm_add_ps(velecsum,velec);
613
614 fscal = felec;
615
616 /* Calculate temporary vectorial force */
617 tx = _mm_mul_ps(fscal,dx23);
618 ty = _mm_mul_ps(fscal,dy23);
619 tz = _mm_mul_ps(fscal,dz23);
620
621 /* Update vectorial force */
622 fix2 = _mm_add_ps(fix2,tx);
623 fiy2 = _mm_add_ps(fiy2,ty);
624 fiz2 = _mm_add_ps(fiz2,tz);
625
626 fjx3 = _mm_add_ps(fjx3,tx);
627 fjy3 = _mm_add_ps(fjy3,ty);
628 fjz3 = _mm_add_ps(fjz3,tz);
629
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
633
634 r31 = _mm_mul_ps(rsq31,rinv31);
635
636 /* EWALD ELECTROSTATICS */
637
638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
639 ewrt = _mm_mul_ps(r31,ewtabscale);
640 ewitab = _mm_cvttps_epi32(ewrt);
641 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
642 ewitab = _mm_slli_epi32(ewitab,2);
643 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
644 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
645 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
646 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
647 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
648 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
649 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
650 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
651 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
652
653 /* Update potential sum for this i atom from the interaction with this j atom. */
654 velecsum = _mm_add_ps(velecsum,velec);
655
656 fscal = felec;
657
658 /* Calculate temporary vectorial force */
659 tx = _mm_mul_ps(fscal,dx31);
660 ty = _mm_mul_ps(fscal,dy31);
661 tz = _mm_mul_ps(fscal,dz31);
662
663 /* Update vectorial force */
664 fix3 = _mm_add_ps(fix3,tx);
665 fiy3 = _mm_add_ps(fiy3,ty);
666 fiz3 = _mm_add_ps(fiz3,tz);
667
668 fjx1 = _mm_add_ps(fjx1,tx);
669 fjy1 = _mm_add_ps(fjy1,ty);
670 fjz1 = _mm_add_ps(fjz1,tz);
671
672 /**************************
673 * CALCULATE INTERACTIONS *
674 **************************/
675
676 r32 = _mm_mul_ps(rsq32,rinv32);
677
678 /* EWALD ELECTROSTATICS */
679
680 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
681 ewrt = _mm_mul_ps(r32,ewtabscale);
682 ewitab = _mm_cvttps_epi32(ewrt);
683 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
684 ewitab = _mm_slli_epi32(ewitab,2);
685 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
686 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
687 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
688 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
689 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
690 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
691 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
692 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
693 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
694
695 /* Update potential sum for this i atom from the interaction with this j atom. */
696 velecsum = _mm_add_ps(velecsum,velec);
697
698 fscal = felec;
699
700 /* Calculate temporary vectorial force */
701 tx = _mm_mul_ps(fscal,dx32);
702 ty = _mm_mul_ps(fscal,dy32);
703 tz = _mm_mul_ps(fscal,dz32);
704
705 /* Update vectorial force */
706 fix3 = _mm_add_ps(fix3,tx);
707 fiy3 = _mm_add_ps(fiy3,ty);
708 fiz3 = _mm_add_ps(fiz3,tz);
709
710 fjx2 = _mm_add_ps(fjx2,tx);
711 fjy2 = _mm_add_ps(fjy2,ty);
712 fjz2 = _mm_add_ps(fjz2,tz);
713
714 /**************************
715 * CALCULATE INTERACTIONS *
716 **************************/
717
718 r33 = _mm_mul_ps(rsq33,rinv33);
719
720 /* EWALD ELECTROSTATICS */
721
722 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
723 ewrt = _mm_mul_ps(r33,ewtabscale);
724 ewitab = _mm_cvttps_epi32(ewrt);
725 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
726 ewitab = _mm_slli_epi32(ewitab,2);
727 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
728 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
729 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
730 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
731 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
732 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
733 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
734 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
735 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
736
737 /* Update potential sum for this i atom from the interaction with this j atom. */
738 velecsum = _mm_add_ps(velecsum,velec);
739
740 fscal = felec;
741
742 /* Calculate temporary vectorial force */
743 tx = _mm_mul_ps(fscal,dx33);
744 ty = _mm_mul_ps(fscal,dy33);
745 tz = _mm_mul_ps(fscal,dz33);
746
747 /* Update vectorial force */
748 fix3 = _mm_add_ps(fix3,tx);
749 fiy3 = _mm_add_ps(fiy3,ty);
750 fiz3 = _mm_add_ps(fiz3,tz);
751
752 fjx3 = _mm_add_ps(fjx3,tx);
753 fjy3 = _mm_add_ps(fjy3,ty);
754 fjz3 = _mm_add_ps(fjz3,tz);
755
756 fjptrA = f+j_coord_offsetA;
757 fjptrB = f+j_coord_offsetB;
758 fjptrC = f+j_coord_offsetC;
759 fjptrD = f+j_coord_offsetD;
760
761 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
762 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
763 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
764
765 /* Inner loop uses 423 flops */
766 }
767
768 if(jidx<j_index_end)
769 {
770
771 /* Get j neighbor index, and coordinate index */
772 jnrlistA = jjnr[jidx];
773 jnrlistB = jjnr[jidx+1];
774 jnrlistC = jjnr[jidx+2];
775 jnrlistD = jjnr[jidx+3];
776 /* Sign of each element will be negative for non-real atoms.
777 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
778 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
779 */
780 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
781 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
782 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
783 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
784 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
785 j_coord_offsetA = DIM3*jnrA;
786 j_coord_offsetB = DIM3*jnrB;
787 j_coord_offsetC = DIM3*jnrC;
788 j_coord_offsetD = DIM3*jnrD;
789
790 /* load j atom coordinates */
791 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
792 x+j_coord_offsetC,x+j_coord_offsetD,
793 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
794 &jy2,&jz2,&jx3,&jy3,&jz3);
795
796 /* Calculate displacement vector */
797 dx00 = _mm_sub_ps(ix0,jx0);
798 dy00 = _mm_sub_ps(iy0,jy0);
799 dz00 = _mm_sub_ps(iz0,jz0);
800 dx11 = _mm_sub_ps(ix1,jx1);
801 dy11 = _mm_sub_ps(iy1,jy1);
802 dz11 = _mm_sub_ps(iz1,jz1);
803 dx12 = _mm_sub_ps(ix1,jx2);
804 dy12 = _mm_sub_ps(iy1,jy2);
805 dz12 = _mm_sub_ps(iz1,jz2);
806 dx13 = _mm_sub_ps(ix1,jx3);
807 dy13 = _mm_sub_ps(iy1,jy3);
808 dz13 = _mm_sub_ps(iz1,jz3);
809 dx21 = _mm_sub_ps(ix2,jx1);
810 dy21 = _mm_sub_ps(iy2,jy1);
811 dz21 = _mm_sub_ps(iz2,jz1);
812 dx22 = _mm_sub_ps(ix2,jx2);
813 dy22 = _mm_sub_ps(iy2,jy2);
814 dz22 = _mm_sub_ps(iz2,jz2);
815 dx23 = _mm_sub_ps(ix2,jx3);
816 dy23 = _mm_sub_ps(iy2,jy3);
817 dz23 = _mm_sub_ps(iz2,jz3);
818 dx31 = _mm_sub_ps(ix3,jx1);
819 dy31 = _mm_sub_ps(iy3,jy1);
820 dz31 = _mm_sub_ps(iz3,jz1);
821 dx32 = _mm_sub_ps(ix3,jx2);
822 dy32 = _mm_sub_ps(iy3,jy2);
823 dz32 = _mm_sub_ps(iz3,jz2);
824 dx33 = _mm_sub_ps(ix3,jx3);
825 dy33 = _mm_sub_ps(iy3,jy3);
826 dz33 = _mm_sub_ps(iz3,jz3);
827
828 /* Calculate squared distance and things based on it */
829 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
830 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
831 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
832 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
833 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
834 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
835 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
836 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
837 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
838 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
839
840 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
841 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
842 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
843 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
844 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
845 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
846 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
847 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
848 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
849 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
850
851 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
852 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
853 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
854 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
855 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
856 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
857 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
858 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
859 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
860 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
861
862 fjx0 = _mm_setzero_ps();
863 fjy0 = _mm_setzero_ps();
864 fjz0 = _mm_setzero_ps();
865 fjx1 = _mm_setzero_ps();
866 fjy1 = _mm_setzero_ps();
867 fjz1 = _mm_setzero_ps();
868 fjx2 = _mm_setzero_ps();
869 fjy2 = _mm_setzero_ps();
870 fjz2 = _mm_setzero_ps();
871 fjx3 = _mm_setzero_ps();
872 fjy3 = _mm_setzero_ps();
873 fjz3 = _mm_setzero_ps();
874
875 /**************************
876 * CALCULATE INTERACTIONS *
877 **************************/
878
879 r00 = _mm_mul_ps(rsq00,rinv00);
880 r00 = _mm_andnot_ps(dummy_mask,r00);
881
882 /* Analytical LJ-PME */
883 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
884 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
885 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
886 exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
887 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
888 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
889 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
890 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
891 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
892 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
893 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
894 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
895
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
898 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
899
900 fscal = fvdw;
901
902 fscal = _mm_andnot_ps(dummy_mask,fscal);
903
904 /* Calculate temporary vectorial force */
905 tx = _mm_mul_ps(fscal,dx00);
906 ty = _mm_mul_ps(fscal,dy00);
907 tz = _mm_mul_ps(fscal,dz00);
908
909 /* Update vectorial force */
910 fix0 = _mm_add_ps(fix0,tx);
911 fiy0 = _mm_add_ps(fiy0,ty);
912 fiz0 = _mm_add_ps(fiz0,tz);
913
914 fjx0 = _mm_add_ps(fjx0,tx);
915 fjy0 = _mm_add_ps(fjy0,ty);
916 fjz0 = _mm_add_ps(fjz0,tz);
917
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
921
922 r11 = _mm_mul_ps(rsq11,rinv11);
923 r11 = _mm_andnot_ps(dummy_mask,r11);
924
925 /* EWALD ELECTROSTATICS */
926
927 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
928 ewrt = _mm_mul_ps(r11,ewtabscale);
929 ewitab = _mm_cvttps_epi32(ewrt);
930 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
931 ewitab = _mm_slli_epi32(ewitab,2);
932 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
933 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
934 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
935 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
936 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
937 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
938 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
939 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
940 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
941
942 /* Update potential sum for this i atom from the interaction with this j atom. */
943 velec = _mm_andnot_ps(dummy_mask,velec);
944 velecsum = _mm_add_ps(velecsum,velec);
945
946 fscal = felec;
947
948 fscal = _mm_andnot_ps(dummy_mask,fscal);
949
950 /* Calculate temporary vectorial force */
951 tx = _mm_mul_ps(fscal,dx11);
952 ty = _mm_mul_ps(fscal,dy11);
953 tz = _mm_mul_ps(fscal,dz11);
954
955 /* Update vectorial force */
956 fix1 = _mm_add_ps(fix1,tx);
957 fiy1 = _mm_add_ps(fiy1,ty);
958 fiz1 = _mm_add_ps(fiz1,tz);
959
960 fjx1 = _mm_add_ps(fjx1,tx);
961 fjy1 = _mm_add_ps(fjy1,ty);
962 fjz1 = _mm_add_ps(fjz1,tz);
963
964 /**************************
965 * CALCULATE INTERACTIONS *
966 **************************/
967
968 r12 = _mm_mul_ps(rsq12,rinv12);
969 r12 = _mm_andnot_ps(dummy_mask,r12);
970
971 /* EWALD ELECTROSTATICS */
972
973 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
974 ewrt = _mm_mul_ps(r12,ewtabscale);
975 ewitab = _mm_cvttps_epi32(ewrt);
976 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
977 ewitab = _mm_slli_epi32(ewitab,2);
978 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
979 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
980 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
981 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
982 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
983 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
984 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
985 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
986 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
987
988 /* Update potential sum for this i atom from the interaction with this j atom. */
989 velec = _mm_andnot_ps(dummy_mask,velec);
990 velecsum = _mm_add_ps(velecsum,velec);
991
992 fscal = felec;
993
994 fscal = _mm_andnot_ps(dummy_mask,fscal);
995
996 /* Calculate temporary vectorial force */
997 tx = _mm_mul_ps(fscal,dx12);
998 ty = _mm_mul_ps(fscal,dy12);
999 tz = _mm_mul_ps(fscal,dz12);
1000
1001 /* Update vectorial force */
1002 fix1 = _mm_add_ps(fix1,tx);
1003 fiy1 = _mm_add_ps(fiy1,ty);
1004 fiz1 = _mm_add_ps(fiz1,tz);
1005
1006 fjx2 = _mm_add_ps(fjx2,tx);
1007 fjy2 = _mm_add_ps(fjy2,ty);
1008 fjz2 = _mm_add_ps(fjz2,tz);
1009
1010 /**************************
1011 * CALCULATE INTERACTIONS *
1012 **************************/
1013
1014 r13 = _mm_mul_ps(rsq13,rinv13);
1015 r13 = _mm_andnot_ps(dummy_mask,r13);
1016
1017 /* EWALD ELECTROSTATICS */
1018
1019 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1020 ewrt = _mm_mul_ps(r13,ewtabscale);
1021 ewitab = _mm_cvttps_epi32(ewrt);
1022 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1023 ewitab = _mm_slli_epi32(ewitab,2);
1024 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1025 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1026 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1027 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1028 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1029 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1030 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1031 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1032 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1033
1034 /* Update potential sum for this i atom from the interaction with this j atom. */
1035 velec = _mm_andnot_ps(dummy_mask,velec);
1036 velecsum = _mm_add_ps(velecsum,velec);
1037
1038 fscal = felec;
1039
1040 fscal = _mm_andnot_ps(dummy_mask,fscal);
1041
1042 /* Calculate temporary vectorial force */
1043 tx = _mm_mul_ps(fscal,dx13);
1044 ty = _mm_mul_ps(fscal,dy13);
1045 tz = _mm_mul_ps(fscal,dz13);
1046
1047 /* Update vectorial force */
1048 fix1 = _mm_add_ps(fix1,tx);
1049 fiy1 = _mm_add_ps(fiy1,ty);
1050 fiz1 = _mm_add_ps(fiz1,tz);
1051
1052 fjx3 = _mm_add_ps(fjx3,tx);
1053 fjy3 = _mm_add_ps(fjy3,ty);
1054 fjz3 = _mm_add_ps(fjz3,tz);
1055
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1059
1060 r21 = _mm_mul_ps(rsq21,rinv21);
1061 r21 = _mm_andnot_ps(dummy_mask,r21);
1062
1063 /* EWALD ELECTROSTATICS */
1064
1065 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1066 ewrt = _mm_mul_ps(r21,ewtabscale);
1067 ewitab = _mm_cvttps_epi32(ewrt);
1068 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1069 ewitab = _mm_slli_epi32(ewitab,2);
1070 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1071 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1072 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1073 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1074 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1075 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1076 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1077 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1078 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1079
1080 /* Update potential sum for this i atom from the interaction with this j atom. */
1081 velec = _mm_andnot_ps(dummy_mask,velec);
1082 velecsum = _mm_add_ps(velecsum,velec);
1083
1084 fscal = felec;
1085
1086 fscal = _mm_andnot_ps(dummy_mask,fscal);
1087
1088 /* Calculate temporary vectorial force */
1089 tx = _mm_mul_ps(fscal,dx21);
1090 ty = _mm_mul_ps(fscal,dy21);
1091 tz = _mm_mul_ps(fscal,dz21);
1092
1093 /* Update vectorial force */
1094 fix2 = _mm_add_ps(fix2,tx);
1095 fiy2 = _mm_add_ps(fiy2,ty);
1096 fiz2 = _mm_add_ps(fiz2,tz);
1097
1098 fjx1 = _mm_add_ps(fjx1,tx);
1099 fjy1 = _mm_add_ps(fjy1,ty);
1100 fjz1 = _mm_add_ps(fjz1,tz);
1101
1102 /**************************
1103 * CALCULATE INTERACTIONS *
1104 **************************/
1105
1106 r22 = _mm_mul_ps(rsq22,rinv22);
1107 r22 = _mm_andnot_ps(dummy_mask,r22);
1108
1109 /* EWALD ELECTROSTATICS */
1110
1111 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1112 ewrt = _mm_mul_ps(r22,ewtabscale);
1113 ewitab = _mm_cvttps_epi32(ewrt);
1114 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1115 ewitab = _mm_slli_epi32(ewitab,2);
1116 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1117 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1118 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1119 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1120 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1121 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1122 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1123 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1124 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1125
1126 /* Update potential sum for this i atom from the interaction with this j atom. */
1127 velec = _mm_andnot_ps(dummy_mask,velec);
1128 velecsum = _mm_add_ps(velecsum,velec);
1129
1130 fscal = felec;
1131
1132 fscal = _mm_andnot_ps(dummy_mask,fscal);
1133
1134 /* Calculate temporary vectorial force */
1135 tx = _mm_mul_ps(fscal,dx22);
1136 ty = _mm_mul_ps(fscal,dy22);
1137 tz = _mm_mul_ps(fscal,dz22);
1138
1139 /* Update vectorial force */
1140 fix2 = _mm_add_ps(fix2,tx);
1141 fiy2 = _mm_add_ps(fiy2,ty);
1142 fiz2 = _mm_add_ps(fiz2,tz);
1143
1144 fjx2 = _mm_add_ps(fjx2,tx);
1145 fjy2 = _mm_add_ps(fjy2,ty);
1146 fjz2 = _mm_add_ps(fjz2,tz);
1147
1148 /**************************
1149 * CALCULATE INTERACTIONS *
1150 **************************/
1151
1152 r23 = _mm_mul_ps(rsq23,rinv23);
1153 r23 = _mm_andnot_ps(dummy_mask,r23);
1154
1155 /* EWALD ELECTROSTATICS */
1156
1157 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1158 ewrt = _mm_mul_ps(r23,ewtabscale);
1159 ewitab = _mm_cvttps_epi32(ewrt);
1160 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1161 ewitab = _mm_slli_epi32(ewitab,2);
1162 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1163 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1164 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1165 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1166 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1167 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1168 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1169 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1170 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1171
1172 /* Update potential sum for this i atom from the interaction with this j atom. */
1173 velec = _mm_andnot_ps(dummy_mask,velec);
1174 velecsum = _mm_add_ps(velecsum,velec);
1175
1176 fscal = felec;
1177
1178 fscal = _mm_andnot_ps(dummy_mask,fscal);
1179
1180 /* Calculate temporary vectorial force */
1181 tx = _mm_mul_ps(fscal,dx23);
1182 ty = _mm_mul_ps(fscal,dy23);
1183 tz = _mm_mul_ps(fscal,dz23);
1184
1185 /* Update vectorial force */
1186 fix2 = _mm_add_ps(fix2,tx);
1187 fiy2 = _mm_add_ps(fiy2,ty);
1188 fiz2 = _mm_add_ps(fiz2,tz);
1189
1190 fjx3 = _mm_add_ps(fjx3,tx);
1191 fjy3 = _mm_add_ps(fjy3,ty);
1192 fjz3 = _mm_add_ps(fjz3,tz);
1193
1194 /**************************
1195 * CALCULATE INTERACTIONS *
1196 **************************/
1197
1198 r31 = _mm_mul_ps(rsq31,rinv31);
1199 r31 = _mm_andnot_ps(dummy_mask,r31);
1200
1201 /* EWALD ELECTROSTATICS */
1202
1203 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1204 ewrt = _mm_mul_ps(r31,ewtabscale);
1205 ewitab = _mm_cvttps_epi32(ewrt);
1206 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1207 ewitab = _mm_slli_epi32(ewitab,2);
1208 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1209 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1210 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1211 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1212 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1213 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1214 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1215 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1216 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1217
1218 /* Update potential sum for this i atom from the interaction with this j atom. */
1219 velec = _mm_andnot_ps(dummy_mask,velec);
1220 velecsum = _mm_add_ps(velecsum,velec);
1221
1222 fscal = felec;
1223
1224 fscal = _mm_andnot_ps(dummy_mask,fscal);
1225
1226 /* Calculate temporary vectorial force */
1227 tx = _mm_mul_ps(fscal,dx31);
1228 ty = _mm_mul_ps(fscal,dy31);
1229 tz = _mm_mul_ps(fscal,dz31);
1230
1231 /* Update vectorial force */
1232 fix3 = _mm_add_ps(fix3,tx);
1233 fiy3 = _mm_add_ps(fiy3,ty);
1234 fiz3 = _mm_add_ps(fiz3,tz);
1235
1236 fjx1 = _mm_add_ps(fjx1,tx);
1237 fjy1 = _mm_add_ps(fjy1,ty);
1238 fjz1 = _mm_add_ps(fjz1,tz);
1239
1240 /**************************
1241 * CALCULATE INTERACTIONS *
1242 **************************/
1243
1244 r32 = _mm_mul_ps(rsq32,rinv32);
1245 r32 = _mm_andnot_ps(dummy_mask,r32);
1246
1247 /* EWALD ELECTROSTATICS */
1248
1249 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1250 ewrt = _mm_mul_ps(r32,ewtabscale);
1251 ewitab = _mm_cvttps_epi32(ewrt);
1252 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1253 ewitab = _mm_slli_epi32(ewitab,2);
1254 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1255 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1256 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1257 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1258 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1259 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1260 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1261 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1262 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1263
1264 /* Update potential sum for this i atom from the interaction with this j atom. */
1265 velec = _mm_andnot_ps(dummy_mask,velec);
1266 velecsum = _mm_add_ps(velecsum,velec);
1267
1268 fscal = felec;
1269
1270 fscal = _mm_andnot_ps(dummy_mask,fscal);
1271
1272 /* Calculate temporary vectorial force */
1273 tx = _mm_mul_ps(fscal,dx32);
1274 ty = _mm_mul_ps(fscal,dy32);
1275 tz = _mm_mul_ps(fscal,dz32);
1276
1277 /* Update vectorial force */
1278 fix3 = _mm_add_ps(fix3,tx);
1279 fiy3 = _mm_add_ps(fiy3,ty);
1280 fiz3 = _mm_add_ps(fiz3,tz);
1281
1282 fjx2 = _mm_add_ps(fjx2,tx);
1283 fjy2 = _mm_add_ps(fjy2,ty);
1284 fjz2 = _mm_add_ps(fjz2,tz);
1285
1286 /**************************
1287 * CALCULATE INTERACTIONS *
1288 **************************/
1289
1290 r33 = _mm_mul_ps(rsq33,rinv33);
1291 r33 = _mm_andnot_ps(dummy_mask,r33);
1292
1293 /* EWALD ELECTROSTATICS */
1294
1295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1296 ewrt = _mm_mul_ps(r33,ewtabscale);
1297 ewitab = _mm_cvttps_epi32(ewrt);
1298 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1299 ewitab = _mm_slli_epi32(ewitab,2);
1300 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1301 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1302 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1303 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1304 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1305 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1306 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1307 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1308 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1309
1310 /* Update potential sum for this i atom from the interaction with this j atom. */
1311 velec = _mm_andnot_ps(dummy_mask,velec);
1312 velecsum = _mm_add_ps(velecsum,velec);
1313
1314 fscal = felec;
1315
1316 fscal = _mm_andnot_ps(dummy_mask,fscal);
1317
1318 /* Calculate temporary vectorial force */
1319 tx = _mm_mul_ps(fscal,dx33);
1320 ty = _mm_mul_ps(fscal,dy33);
1321 tz = _mm_mul_ps(fscal,dz33);
1322
1323 /* Update vectorial force */
1324 fix3 = _mm_add_ps(fix3,tx);
1325 fiy3 = _mm_add_ps(fiy3,ty);
1326 fiz3 = _mm_add_ps(fiz3,tz);
1327
1328 fjx3 = _mm_add_ps(fjx3,tx);
1329 fjy3 = _mm_add_ps(fjy3,ty);
1330 fjz3 = _mm_add_ps(fjz3,tz);
1331
1332 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1333 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1334 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1335 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1336
1337 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1338 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1339 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1340
1341 /* Inner loop uses 433 flops */
1342 }
1343
1344 /* End of innermost loop */
1345
1346 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1347 f+i_coord_offset,fshift+i_shift_offset);
1348
1349 ggid = gid[iidx];
1350 /* Update potential energies */
1351 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1352 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1353
1354 /* Increment number of inner iterations */
1355 inneriter += j_index_end - j_index_start;
1356
1357 /* Outer loop uses 26 flops */
1358 }
1359
1360 /* Increment number of outer iterations */
1361 outeriter += nri;
1362
1363 /* Update outer/inner flops */
1364
1365 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*433)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4W4_VF] += outeriter*26 +
inneriter*433
;
1366}
1367/*
1368 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
1369 * Electrostatics interaction: Ewald
1370 * VdW interaction: LJEwald
1371 * Geometry: Water4-Water4
1372 * Calculate force/pot: Force
1373 */
1374void
1375nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
1376 (t_nblist * gmx_restrict nlist,
1377 rvec * gmx_restrict xx,
1378 rvec * gmx_restrict ff,
1379 t_forcerec * gmx_restrict fr,
1380 t_mdatoms * gmx_restrict mdatoms,
1381 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1382 t_nrnb * gmx_restrict nrnb)
1383{
1384 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1385 * just 0 for non-waters.
1386 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1387 * jnr indices corresponding to data put in the four positions in the SIMD register.
1388 */
1389 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1390 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1391 int jnrA,jnrB,jnrC,jnrD;
1392 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1393 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1394 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1395 real rcutoff_scalar;
1396 real *shiftvec,*fshift,*x,*f;
1397 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1398 real scratch[4*DIM3];
1399 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1400 int vdwioffset0;
1401 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1402 int vdwioffset1;
1403 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1404 int vdwioffset2;
1405 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1406 int vdwioffset3;
1407 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1408 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1409 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1410 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1411 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1412 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1413 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1414 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1415 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1416 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1417 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1418 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1419 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1420 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1421 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1422 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1423 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1424 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1425 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1426 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1427 real *charge;
1428 int nvdwtype;
1429 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1430 int *vdwtype;
1431 real *vdwparam;
1432 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1433 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1434 __m128 c6grid_00;
1435 __m128 c6grid_11;
1436 __m128 c6grid_12;
1437 __m128 c6grid_13;
1438 __m128 c6grid_21;
1439 __m128 c6grid_22;
1440 __m128 c6grid_23;
1441 __m128 c6grid_31;
1442 __m128 c6grid_32;
1443 __m128 c6grid_33;
1444 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1445 real *vdwgridparam;
1446 __m128 one_half = _mm_set1_ps(0.5);
1447 __m128 minus_one = _mm_set1_ps(-1.0);
1448 __m128i ewitab;
1449 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1450 real *ewtab;
1451 __m128 dummy_mask,cutoff_mask;
1452 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1453 __m128 one = _mm_set1_ps(1.0);
1454 __m128 two = _mm_set1_ps(2.0);
1455 x = xx[0];
1456 f = ff[0];
1457
1458 nri = nlist->nri;
1459 iinr = nlist->iinr;
1460 jindex = nlist->jindex;
1461 jjnr = nlist->jjnr;
1462 shiftidx = nlist->shift;
1463 gid = nlist->gid;
1464 shiftvec = fr->shift_vec[0];
1465 fshift = fr->fshift[0];
1466 facel = _mm_set1_ps(fr->epsfac);
1467 charge = mdatoms->chargeA;
1468 nvdwtype = fr->ntype;
1469 vdwparam = fr->nbfp;
1470 vdwtype = mdatoms->typeA;
1471 vdwgridparam = fr->ljpme_c6grid;
1472 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1473 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
1474 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1475
1476 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1477 ewtab = fr->ic->tabq_coul_F;
1478 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1479 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1480
1481 /* Setup water-specific parameters */
1482 inr = nlist->iinr[0];
1483 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1484 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1485 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1486 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1487
1488 jq1 = _mm_set1_ps(charge[inr+1]);
1489 jq2 = _mm_set1_ps(charge[inr+2]);
1490 jq3 = _mm_set1_ps(charge[inr+3]);
1491 vdwjidx0A = 2*vdwtype[inr+0];
1492 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1493 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1494 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1495 qq11 = _mm_mul_ps(iq1,jq1);
1496 qq12 = _mm_mul_ps(iq1,jq2);
1497 qq13 = _mm_mul_ps(iq1,jq3);
1498 qq21 = _mm_mul_ps(iq2,jq1);
1499 qq22 = _mm_mul_ps(iq2,jq2);
1500 qq23 = _mm_mul_ps(iq2,jq3);
1501 qq31 = _mm_mul_ps(iq3,jq1);
1502 qq32 = _mm_mul_ps(iq3,jq2);
1503 qq33 = _mm_mul_ps(iq3,jq3);
1504
1505 /* Avoid stupid compiler warnings */
1506 jnrA = jnrB = jnrC = jnrD = 0;
1507 j_coord_offsetA = 0;
1508 j_coord_offsetB = 0;
1509 j_coord_offsetC = 0;
1510 j_coord_offsetD = 0;
1511
1512 outeriter = 0;
1513 inneriter = 0;
1514
1515 for(iidx=0;iidx<4*DIM3;iidx++)
1516 {
1517 scratch[iidx] = 0.0;
1518 }
1519
1520 /* Start outer loop over neighborlists */
1521 for(iidx=0; iidx<nri; iidx++)
1522 {
1523 /* Load shift vector for this list */
1524 i_shift_offset = DIM3*shiftidx[iidx];
1525
1526 /* Load limits for loop over neighbors */
1527 j_index_start = jindex[iidx];
1528 j_index_end = jindex[iidx+1];
1529
1530 /* Get outer coordinate index */
1531 inr = iinr[iidx];
1532 i_coord_offset = DIM3*inr;
1533
1534 /* Load i particle coords and add shift vector */
1535 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1536 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1537
1538 fix0 = _mm_setzero_ps();
1539 fiy0 = _mm_setzero_ps();
1540 fiz0 = _mm_setzero_ps();
1541 fix1 = _mm_setzero_ps();
1542 fiy1 = _mm_setzero_ps();
1543 fiz1 = _mm_setzero_ps();
1544 fix2 = _mm_setzero_ps();
1545 fiy2 = _mm_setzero_ps();
1546 fiz2 = _mm_setzero_ps();
1547 fix3 = _mm_setzero_ps();
1548 fiy3 = _mm_setzero_ps();
1549 fiz3 = _mm_setzero_ps();
1550
1551 /* Start inner kernel loop */
1552 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1553 {
1554
1555 /* Get j neighbor index, and coordinate index */
1556 jnrA = jjnr[jidx];
1557 jnrB = jjnr[jidx+1];
1558 jnrC = jjnr[jidx+2];
1559 jnrD = jjnr[jidx+3];
1560 j_coord_offsetA = DIM3*jnrA;
1561 j_coord_offsetB = DIM3*jnrB;
1562 j_coord_offsetC = DIM3*jnrC;
1563 j_coord_offsetD = DIM3*jnrD;
1564
1565 /* load j atom coordinates */
1566 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1567 x+j_coord_offsetC,x+j_coord_offsetD,
1568 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1569 &jy2,&jz2,&jx3,&jy3,&jz3);
1570
1571 /* Calculate displacement vector */
1572 dx00 = _mm_sub_ps(ix0,jx0);
1573 dy00 = _mm_sub_ps(iy0,jy0);
1574 dz00 = _mm_sub_ps(iz0,jz0);
1575 dx11 = _mm_sub_ps(ix1,jx1);
1576 dy11 = _mm_sub_ps(iy1,jy1);
1577 dz11 = _mm_sub_ps(iz1,jz1);
1578 dx12 = _mm_sub_ps(ix1,jx2);
1579 dy12 = _mm_sub_ps(iy1,jy2);
1580 dz12 = _mm_sub_ps(iz1,jz2);
1581 dx13 = _mm_sub_ps(ix1,jx3);
1582 dy13 = _mm_sub_ps(iy1,jy3);
1583 dz13 = _mm_sub_ps(iz1,jz3);
1584 dx21 = _mm_sub_ps(ix2,jx1);
1585 dy21 = _mm_sub_ps(iy2,jy1);
1586 dz21 = _mm_sub_ps(iz2,jz1);
1587 dx22 = _mm_sub_ps(ix2,jx2);
1588 dy22 = _mm_sub_ps(iy2,jy2);
1589 dz22 = _mm_sub_ps(iz2,jz2);
1590 dx23 = _mm_sub_ps(ix2,jx3);
1591 dy23 = _mm_sub_ps(iy2,jy3);
1592 dz23 = _mm_sub_ps(iz2,jz3);
1593 dx31 = _mm_sub_ps(ix3,jx1);
1594 dy31 = _mm_sub_ps(iy3,jy1);
1595 dz31 = _mm_sub_ps(iz3,jz1);
1596 dx32 = _mm_sub_ps(ix3,jx2);
1597 dy32 = _mm_sub_ps(iy3,jy2);
1598 dz32 = _mm_sub_ps(iz3,jz2);
1599 dx33 = _mm_sub_ps(ix3,jx3);
1600 dy33 = _mm_sub_ps(iy3,jy3);
1601 dz33 = _mm_sub_ps(iz3,jz3);
1602
1603 /* Calculate squared distance and things based on it */
1604 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1605 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1606 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1607 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1608 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1609 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1610 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1611 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1612 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1613 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1614
1615 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1616 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1617 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1618 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
1619 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1620 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1621 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
1622 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
1623 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
1624 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
1625
1626 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1627 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1628 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1629 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1630 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1631 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1632 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1633 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1634 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1635 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1636
1637 fjx0 = _mm_setzero_ps();
1638 fjy0 = _mm_setzero_ps();
1639 fjz0 = _mm_setzero_ps();
1640 fjx1 = _mm_setzero_ps();
1641 fjy1 = _mm_setzero_ps();
1642 fjz1 = _mm_setzero_ps();
1643 fjx2 = _mm_setzero_ps();
1644 fjy2 = _mm_setzero_ps();
1645 fjz2 = _mm_setzero_ps();
1646 fjx3 = _mm_setzero_ps();
1647 fjy3 = _mm_setzero_ps();
1648 fjz3 = _mm_setzero_ps();
1649
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1653
1654 r00 = _mm_mul_ps(rsq00,rinv00);
1655
1656 /* Analytical LJ-PME */
1657 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1658 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1659 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1660 exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
1661 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1662 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1663 /* f6A = 6 * C6grid * (1 - poly) */
1664 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1665 /* f6B = C6grid * exponent * beta^6 */
1666 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1667 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1668 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1669
1670 fscal = fvdw;
1671
1672 /* Calculate temporary vectorial force */
1673 tx = _mm_mul_ps(fscal,dx00);
1674 ty = _mm_mul_ps(fscal,dy00);
1675 tz = _mm_mul_ps(fscal,dz00);
1676
1677 /* Update vectorial force */
1678 fix0 = _mm_add_ps(fix0,tx);
1679 fiy0 = _mm_add_ps(fiy0,ty);
1680 fiz0 = _mm_add_ps(fiz0,tz);
1681
1682 fjx0 = _mm_add_ps(fjx0,tx);
1683 fjy0 = _mm_add_ps(fjy0,ty);
1684 fjz0 = _mm_add_ps(fjz0,tz);
1685
1686 /**************************
1687 * CALCULATE INTERACTIONS *
1688 **************************/
1689
1690 r11 = _mm_mul_ps(rsq11,rinv11);
1691
1692 /* EWALD ELECTROSTATICS */
1693
1694 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1695 ewrt = _mm_mul_ps(r11,ewtabscale);
1696 ewitab = _mm_cvttps_epi32(ewrt);
1697 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1698 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1699 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1700 &ewtabF,&ewtabFn);
1701 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1702 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1703
1704 fscal = felec;
1705
1706 /* Calculate temporary vectorial force */
1707 tx = _mm_mul_ps(fscal,dx11);
1708 ty = _mm_mul_ps(fscal,dy11);
1709 tz = _mm_mul_ps(fscal,dz11);
1710
1711 /* Update vectorial force */
1712 fix1 = _mm_add_ps(fix1,tx);
1713 fiy1 = _mm_add_ps(fiy1,ty);
1714 fiz1 = _mm_add_ps(fiz1,tz);
1715
1716 fjx1 = _mm_add_ps(fjx1,tx);
1717 fjy1 = _mm_add_ps(fjy1,ty);
1718 fjz1 = _mm_add_ps(fjz1,tz);
1719
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1723
1724 r12 = _mm_mul_ps(rsq12,rinv12);
1725
1726 /* EWALD ELECTROSTATICS */
1727
1728 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1729 ewrt = _mm_mul_ps(r12,ewtabscale);
1730 ewitab = _mm_cvttps_epi32(ewrt);
1731 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1732 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1733 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1734 &ewtabF,&ewtabFn);
1735 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1736 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1737
1738 fscal = felec;
1739
1740 /* Calculate temporary vectorial force */
1741 tx = _mm_mul_ps(fscal,dx12);
1742 ty = _mm_mul_ps(fscal,dy12);
1743 tz = _mm_mul_ps(fscal,dz12);
1744
1745 /* Update vectorial force */
1746 fix1 = _mm_add_ps(fix1,tx);
1747 fiy1 = _mm_add_ps(fiy1,ty);
1748 fiz1 = _mm_add_ps(fiz1,tz);
1749
1750 fjx2 = _mm_add_ps(fjx2,tx);
1751 fjy2 = _mm_add_ps(fjy2,ty);
1752 fjz2 = _mm_add_ps(fjz2,tz);
1753
1754 /**************************
1755 * CALCULATE INTERACTIONS *
1756 **************************/
1757
1758 r13 = _mm_mul_ps(rsq13,rinv13);
1759
1760 /* EWALD ELECTROSTATICS */
1761
1762 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1763 ewrt = _mm_mul_ps(r13,ewtabscale);
1764 ewitab = _mm_cvttps_epi32(ewrt);
1765 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1766 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1767 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1768 &ewtabF,&ewtabFn);
1769 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1770 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1771
1772 fscal = felec;
1773
1774 /* Calculate temporary vectorial force */
1775 tx = _mm_mul_ps(fscal,dx13);
1776 ty = _mm_mul_ps(fscal,dy13);
1777 tz = _mm_mul_ps(fscal,dz13);
1778
1779 /* Update vectorial force */
1780 fix1 = _mm_add_ps(fix1,tx);
1781 fiy1 = _mm_add_ps(fiy1,ty);
1782 fiz1 = _mm_add_ps(fiz1,tz);
1783
1784 fjx3 = _mm_add_ps(fjx3,tx);
1785 fjy3 = _mm_add_ps(fjy3,ty);
1786 fjz3 = _mm_add_ps(fjz3,tz);
1787
1788 /**************************
1789 * CALCULATE INTERACTIONS *
1790 **************************/
1791
1792 r21 = _mm_mul_ps(rsq21,rinv21);
1793
1794 /* EWALD ELECTROSTATICS */
1795
1796 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1797 ewrt = _mm_mul_ps(r21,ewtabscale);
1798 ewitab = _mm_cvttps_epi32(ewrt);
1799 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1800 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1801 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1802 &ewtabF,&ewtabFn);
1803 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1804 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1805
1806 fscal = felec;
1807
1808 /* Calculate temporary vectorial force */
1809 tx = _mm_mul_ps(fscal,dx21);
1810 ty = _mm_mul_ps(fscal,dy21);
1811 tz = _mm_mul_ps(fscal,dz21);
1812
1813 /* Update vectorial force */
1814 fix2 = _mm_add_ps(fix2,tx);
1815 fiy2 = _mm_add_ps(fiy2,ty);
1816 fiz2 = _mm_add_ps(fiz2,tz);
1817
1818 fjx1 = _mm_add_ps(fjx1,tx);
1819 fjy1 = _mm_add_ps(fjy1,ty);
1820 fjz1 = _mm_add_ps(fjz1,tz);
1821
1822 /**************************
1823 * CALCULATE INTERACTIONS *
1824 **************************/
1825
1826 r22 = _mm_mul_ps(rsq22,rinv22);
1827
1828 /* EWALD ELECTROSTATICS */
1829
1830 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1831 ewrt = _mm_mul_ps(r22,ewtabscale);
1832 ewitab = _mm_cvttps_epi32(ewrt);
1833 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1834 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1835 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1836 &ewtabF,&ewtabFn);
1837 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1838 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1839
1840 fscal = felec;
1841
1842 /* Calculate temporary vectorial force */
1843 tx = _mm_mul_ps(fscal,dx22);
1844 ty = _mm_mul_ps(fscal,dy22);
1845 tz = _mm_mul_ps(fscal,dz22);
1846
1847 /* Update vectorial force */
1848 fix2 = _mm_add_ps(fix2,tx);
1849 fiy2 = _mm_add_ps(fiy2,ty);
1850 fiz2 = _mm_add_ps(fiz2,tz);
1851
1852 fjx2 = _mm_add_ps(fjx2,tx);
1853 fjy2 = _mm_add_ps(fjy2,ty);
1854 fjz2 = _mm_add_ps(fjz2,tz);
1855
1856 /**************************
1857 * CALCULATE INTERACTIONS *
1858 **************************/
1859
1860 r23 = _mm_mul_ps(rsq23,rinv23);
1861
1862 /* EWALD ELECTROSTATICS */
1863
1864 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1865 ewrt = _mm_mul_ps(r23,ewtabscale);
1866 ewitab = _mm_cvttps_epi32(ewrt);
1867 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1868 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1869 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1870 &ewtabF,&ewtabFn);
1871 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1872 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1873
1874 fscal = felec;
1875
1876 /* Calculate temporary vectorial force */
1877 tx = _mm_mul_ps(fscal,dx23);
1878 ty = _mm_mul_ps(fscal,dy23);
1879 tz = _mm_mul_ps(fscal,dz23);
1880
1881 /* Update vectorial force */
1882 fix2 = _mm_add_ps(fix2,tx);
1883 fiy2 = _mm_add_ps(fiy2,ty);
1884 fiz2 = _mm_add_ps(fiz2,tz);
1885
1886 fjx3 = _mm_add_ps(fjx3,tx);
1887 fjy3 = _mm_add_ps(fjy3,ty);
1888 fjz3 = _mm_add_ps(fjz3,tz);
1889
1890 /**************************
1891 * CALCULATE INTERACTIONS *
1892 **************************/
1893
1894 r31 = _mm_mul_ps(rsq31,rinv31);
1895
1896 /* EWALD ELECTROSTATICS */
1897
1898 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1899 ewrt = _mm_mul_ps(r31,ewtabscale);
1900 ewitab = _mm_cvttps_epi32(ewrt);
1901 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1902 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1903 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1904 &ewtabF,&ewtabFn);
1905 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1906 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1907
1908 fscal = felec;
1909
1910 /* Calculate temporary vectorial force */
1911 tx = _mm_mul_ps(fscal,dx31);
1912 ty = _mm_mul_ps(fscal,dy31);
1913 tz = _mm_mul_ps(fscal,dz31);
1914
1915 /* Update vectorial force */
1916 fix3 = _mm_add_ps(fix3,tx);
1917 fiy3 = _mm_add_ps(fiy3,ty);
1918 fiz3 = _mm_add_ps(fiz3,tz);
1919
1920 fjx1 = _mm_add_ps(fjx1,tx);
1921 fjy1 = _mm_add_ps(fjy1,ty);
1922 fjz1 = _mm_add_ps(fjz1,tz);
1923
1924 /**************************
1925 * CALCULATE INTERACTIONS *
1926 **************************/
1927
1928 r32 = _mm_mul_ps(rsq32,rinv32);
1929
1930 /* EWALD ELECTROSTATICS */
1931
1932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1933 ewrt = _mm_mul_ps(r32,ewtabscale);
1934 ewitab = _mm_cvttps_epi32(ewrt);
1935 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1936 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1937 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1938 &ewtabF,&ewtabFn);
1939 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1940 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1941
1942 fscal = felec;
1943
1944 /* Calculate temporary vectorial force */
1945 tx = _mm_mul_ps(fscal,dx32);
1946 ty = _mm_mul_ps(fscal,dy32);
1947 tz = _mm_mul_ps(fscal,dz32);
1948
1949 /* Update vectorial force */
1950 fix3 = _mm_add_ps(fix3,tx);
1951 fiy3 = _mm_add_ps(fiy3,ty);
1952 fiz3 = _mm_add_ps(fiz3,tz);
1953
1954 fjx2 = _mm_add_ps(fjx2,tx);
1955 fjy2 = _mm_add_ps(fjy2,ty);
1956 fjz2 = _mm_add_ps(fjz2,tz);
1957
1958 /**************************
1959 * CALCULATE INTERACTIONS *
1960 **************************/
1961
1962 r33 = _mm_mul_ps(rsq33,rinv33);
1963
1964 /* EWALD ELECTROSTATICS */
1965
1966 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1967 ewrt = _mm_mul_ps(r33,ewtabscale);
1968 ewitab = _mm_cvttps_epi32(ewrt);
1969 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1970 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1971 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1972 &ewtabF,&ewtabFn);
1973 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1974 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1975
1976 fscal = felec;
1977
1978 /* Calculate temporary vectorial force */
1979 tx = _mm_mul_ps(fscal,dx33);
1980 ty = _mm_mul_ps(fscal,dy33);
1981 tz = _mm_mul_ps(fscal,dz33);
1982
1983 /* Update vectorial force */
1984 fix3 = _mm_add_ps(fix3,tx);
1985 fiy3 = _mm_add_ps(fiy3,ty);
1986 fiz3 = _mm_add_ps(fiz3,tz);
1987
1988 fjx3 = _mm_add_ps(fjx3,tx);
1989 fjy3 = _mm_add_ps(fjy3,ty);
1990 fjz3 = _mm_add_ps(fjz3,tz);
1991
1992 fjptrA = f+j_coord_offsetA;
1993 fjptrB = f+j_coord_offsetB;
1994 fjptrC = f+j_coord_offsetC;
1995 fjptrD = f+j_coord_offsetD;
1996
1997 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1998 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1999 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2000
2001 /* Inner loop uses 373 flops */
2002 }
2003
2004 if(jidx<j_index_end)
2005 {
2006
2007 /* Get j neighbor index, and coordinate index */
2008 jnrlistA = jjnr[jidx];
2009 jnrlistB = jjnr[jidx+1];
2010 jnrlistC = jjnr[jidx+2];
2011 jnrlistD = jjnr[jidx+3];
2012 /* Sign of each element will be negative for non-real atoms.
2013 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2014 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2015 */
2016 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2017 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2018 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2019 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2020 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2021 j_coord_offsetA = DIM3*jnrA;
2022 j_coord_offsetB = DIM3*jnrB;
2023 j_coord_offsetC = DIM3*jnrC;
2024 j_coord_offsetD = DIM3*jnrD;
2025
2026 /* load j atom coordinates */
2027 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2028 x+j_coord_offsetC,x+j_coord_offsetD,
2029 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2030 &jy2,&jz2,&jx3,&jy3,&jz3);
2031
2032 /* Calculate displacement vector */
2033 dx00 = _mm_sub_ps(ix0,jx0);
2034 dy00 = _mm_sub_ps(iy0,jy0);
2035 dz00 = _mm_sub_ps(iz0,jz0);
2036 dx11 = _mm_sub_ps(ix1,jx1);
2037 dy11 = _mm_sub_ps(iy1,jy1);
2038 dz11 = _mm_sub_ps(iz1,jz1);
2039 dx12 = _mm_sub_ps(ix1,jx2);
2040 dy12 = _mm_sub_ps(iy1,jy2);
2041 dz12 = _mm_sub_ps(iz1,jz2);
2042 dx13 = _mm_sub_ps(ix1,jx3);
2043 dy13 = _mm_sub_ps(iy1,jy3);
2044 dz13 = _mm_sub_ps(iz1,jz3);
2045 dx21 = _mm_sub_ps(ix2,jx1);
2046 dy21 = _mm_sub_ps(iy2,jy1);
2047 dz21 = _mm_sub_ps(iz2,jz1);
2048 dx22 = _mm_sub_ps(ix2,jx2);
2049 dy22 = _mm_sub_ps(iy2,jy2);
2050 dz22 = _mm_sub_ps(iz2,jz2);
2051 dx23 = _mm_sub_ps(ix2,jx3);
2052 dy23 = _mm_sub_ps(iy2,jy3);
2053 dz23 = _mm_sub_ps(iz2,jz3);
2054 dx31 = _mm_sub_ps(ix3,jx1);
2055 dy31 = _mm_sub_ps(iy3,jy1);
2056 dz31 = _mm_sub_ps(iz3,jz1);
2057 dx32 = _mm_sub_ps(ix3,jx2);
2058 dy32 = _mm_sub_ps(iy3,jy2);
2059 dz32 = _mm_sub_ps(iz3,jz2);
2060 dx33 = _mm_sub_ps(ix3,jx3);
2061 dy33 = _mm_sub_ps(iy3,jy3);
2062 dz33 = _mm_sub_ps(iz3,jz3);
2063
2064 /* Calculate squared distance and things based on it */
2065 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2066 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2067 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2068 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2069 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2070 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2071 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2072 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2073 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2074 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2075
2076 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
2077 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
2078 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
2079 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
2080 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
2081 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
2082 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
2083 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
2084 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
2085 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
2086
2087 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2088 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2089 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2090 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2091 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2092 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2093 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2094 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2095 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2096 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2097
2098 fjx0 = _mm_setzero_ps();
2099 fjy0 = _mm_setzero_ps();
2100 fjz0 = _mm_setzero_ps();
2101 fjx1 = _mm_setzero_ps();
2102 fjy1 = _mm_setzero_ps();
2103 fjz1 = _mm_setzero_ps();
2104 fjx2 = _mm_setzero_ps();
2105 fjy2 = _mm_setzero_ps();
2106 fjz2 = _mm_setzero_ps();
2107 fjx3 = _mm_setzero_ps();
2108 fjy3 = _mm_setzero_ps();
2109 fjz3 = _mm_setzero_ps();
2110
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2114
2115 r00 = _mm_mul_ps(rsq00,rinv00);
2116 r00 = _mm_andnot_ps(dummy_mask,r00);
2117
2118 /* Analytical LJ-PME */
2119 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2120 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
2121 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2122 exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
2123 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2124 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2125 /* f6A = 6 * C6grid * (1 - poly) */
2126 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2127 /* f6B = C6grid * exponent * beta^6 */
2128 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2129 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2130 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2131
2132 fscal = fvdw;
2133
2134 fscal = _mm_andnot_ps(dummy_mask,fscal);
2135
2136 /* Calculate temporary vectorial force */
2137 tx = _mm_mul_ps(fscal,dx00);
2138 ty = _mm_mul_ps(fscal,dy00);
2139 tz = _mm_mul_ps(fscal,dz00);
2140
2141 /* Update vectorial force */
2142 fix0 = _mm_add_ps(fix0,tx);
2143 fiy0 = _mm_add_ps(fiy0,ty);
2144 fiz0 = _mm_add_ps(fiz0,tz);
2145
2146 fjx0 = _mm_add_ps(fjx0,tx);
2147 fjy0 = _mm_add_ps(fjy0,ty);
2148 fjz0 = _mm_add_ps(fjz0,tz);
2149
2150 /**************************
2151 * CALCULATE INTERACTIONS *
2152 **************************/
2153
2154 r11 = _mm_mul_ps(rsq11,rinv11);
2155 r11 = _mm_andnot_ps(dummy_mask,r11);
2156
2157 /* EWALD ELECTROSTATICS */
2158
2159 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2160 ewrt = _mm_mul_ps(r11,ewtabscale);
2161 ewitab = _mm_cvttps_epi32(ewrt);
2162 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2163 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2164 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2165 &ewtabF,&ewtabFn);
2166 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2167 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2168
2169 fscal = felec;
2170
2171 fscal = _mm_andnot_ps(dummy_mask,fscal);
2172
2173 /* Calculate temporary vectorial force */
2174 tx = _mm_mul_ps(fscal,dx11);
2175 ty = _mm_mul_ps(fscal,dy11);
2176 tz = _mm_mul_ps(fscal,dz11);
2177
2178 /* Update vectorial force */
2179 fix1 = _mm_add_ps(fix1,tx);
2180 fiy1 = _mm_add_ps(fiy1,ty);
2181 fiz1 = _mm_add_ps(fiz1,tz);
2182
2183 fjx1 = _mm_add_ps(fjx1,tx);
2184 fjy1 = _mm_add_ps(fjy1,ty);
2185 fjz1 = _mm_add_ps(fjz1,tz);
2186
2187 /**************************
2188 * CALCULATE INTERACTIONS *
2189 **************************/
2190
2191 r12 = _mm_mul_ps(rsq12,rinv12);
2192 r12 = _mm_andnot_ps(dummy_mask,r12);
2193
2194 /* EWALD ELECTROSTATICS */
2195
2196 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2197 ewrt = _mm_mul_ps(r12,ewtabscale);
2198 ewitab = _mm_cvttps_epi32(ewrt);
2199 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2200 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2201 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2202 &ewtabF,&ewtabFn);
2203 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2204 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2205
2206 fscal = felec;
2207
2208 fscal = _mm_andnot_ps(dummy_mask,fscal);
2209
2210 /* Calculate temporary vectorial force */
2211 tx = _mm_mul_ps(fscal,dx12);
2212 ty = _mm_mul_ps(fscal,dy12);
2213 tz = _mm_mul_ps(fscal,dz12);
2214
2215 /* Update vectorial force */
2216 fix1 = _mm_add_ps(fix1,tx);
2217 fiy1 = _mm_add_ps(fiy1,ty);
2218 fiz1 = _mm_add_ps(fiz1,tz);
2219
2220 fjx2 = _mm_add_ps(fjx2,tx);
2221 fjy2 = _mm_add_ps(fjy2,ty);
2222 fjz2 = _mm_add_ps(fjz2,tz);
2223
2224 /**************************
2225 * CALCULATE INTERACTIONS *
2226 **************************/
2227
2228 r13 = _mm_mul_ps(rsq13,rinv13);
2229 r13 = _mm_andnot_ps(dummy_mask,r13);
2230
2231 /* EWALD ELECTROSTATICS */
2232
2233 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2234 ewrt = _mm_mul_ps(r13,ewtabscale);
2235 ewitab = _mm_cvttps_epi32(ewrt);
2236 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2237 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2238 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2239 &ewtabF,&ewtabFn);
2240 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2241 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2242
2243 fscal = felec;
2244
2245 fscal = _mm_andnot_ps(dummy_mask,fscal);
2246
2247 /* Calculate temporary vectorial force */
2248 tx = _mm_mul_ps(fscal,dx13);
2249 ty = _mm_mul_ps(fscal,dy13);
2250 tz = _mm_mul_ps(fscal,dz13);
2251
2252 /* Update vectorial force */
2253 fix1 = _mm_add_ps(fix1,tx);
2254 fiy1 = _mm_add_ps(fiy1,ty);
2255 fiz1 = _mm_add_ps(fiz1,tz);
2256
2257 fjx3 = _mm_add_ps(fjx3,tx);
2258 fjy3 = _mm_add_ps(fjy3,ty);
2259 fjz3 = _mm_add_ps(fjz3,tz);
2260
2261 /**************************
2262 * CALCULATE INTERACTIONS *
2263 **************************/
2264
2265 r21 = _mm_mul_ps(rsq21,rinv21);
2266 r21 = _mm_andnot_ps(dummy_mask,r21);
2267
2268 /* EWALD ELECTROSTATICS */
2269
2270 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2271 ewrt = _mm_mul_ps(r21,ewtabscale);
2272 ewitab = _mm_cvttps_epi32(ewrt);
2273 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2274 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2275 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2276 &ewtabF,&ewtabFn);
2277 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2278 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2279
2280 fscal = felec;
2281
2282 fscal = _mm_andnot_ps(dummy_mask,fscal);
2283
2284 /* Calculate temporary vectorial force */
2285 tx = _mm_mul_ps(fscal,dx21);
2286 ty = _mm_mul_ps(fscal,dy21);
2287 tz = _mm_mul_ps(fscal,dz21);
2288
2289 /* Update vectorial force */
2290 fix2 = _mm_add_ps(fix2,tx);
2291 fiy2 = _mm_add_ps(fiy2,ty);
2292 fiz2 = _mm_add_ps(fiz2,tz);
2293
2294 fjx1 = _mm_add_ps(fjx1,tx);
2295 fjy1 = _mm_add_ps(fjy1,ty);
2296 fjz1 = _mm_add_ps(fjz1,tz);
2297
2298 /**************************
2299 * CALCULATE INTERACTIONS *
2300 **************************/
2301
2302 r22 = _mm_mul_ps(rsq22,rinv22);
2303 r22 = _mm_andnot_ps(dummy_mask,r22);
2304
2305 /* EWALD ELECTROSTATICS */
2306
2307 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2308 ewrt = _mm_mul_ps(r22,ewtabscale);
2309 ewitab = _mm_cvttps_epi32(ewrt);
2310 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2311 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2312 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2313 &ewtabF,&ewtabFn);
2314 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2315 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2316
2317 fscal = felec;
2318
2319 fscal = _mm_andnot_ps(dummy_mask,fscal);
2320
2321 /* Calculate temporary vectorial force */
2322 tx = _mm_mul_ps(fscal,dx22);
2323 ty = _mm_mul_ps(fscal,dy22);
2324 tz = _mm_mul_ps(fscal,dz22);
2325
2326 /* Update vectorial force */
2327 fix2 = _mm_add_ps(fix2,tx);
2328 fiy2 = _mm_add_ps(fiy2,ty);
2329 fiz2 = _mm_add_ps(fiz2,tz);
2330
2331 fjx2 = _mm_add_ps(fjx2,tx);
2332 fjy2 = _mm_add_ps(fjy2,ty);
2333 fjz2 = _mm_add_ps(fjz2,tz);
2334
2335 /**************************
2336 * CALCULATE INTERACTIONS *
2337 **************************/
2338
2339 r23 = _mm_mul_ps(rsq23,rinv23);
2340 r23 = _mm_andnot_ps(dummy_mask,r23);
2341
2342 /* EWALD ELECTROSTATICS */
2343
2344 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2345 ewrt = _mm_mul_ps(r23,ewtabscale);
2346 ewitab = _mm_cvttps_epi32(ewrt);
2347 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2348 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2349 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2350 &ewtabF,&ewtabFn);
2351 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2352 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2353
2354 fscal = felec;
2355
2356 fscal = _mm_andnot_ps(dummy_mask,fscal);
2357
2358 /* Calculate temporary vectorial force */
2359 tx = _mm_mul_ps(fscal,dx23);
2360 ty = _mm_mul_ps(fscal,dy23);
2361 tz = _mm_mul_ps(fscal,dz23);
2362
2363 /* Update vectorial force */
2364 fix2 = _mm_add_ps(fix2,tx);
2365 fiy2 = _mm_add_ps(fiy2,ty);
2366 fiz2 = _mm_add_ps(fiz2,tz);
2367
2368 fjx3 = _mm_add_ps(fjx3,tx);
2369 fjy3 = _mm_add_ps(fjy3,ty);
2370 fjz3 = _mm_add_ps(fjz3,tz);
2371
2372 /**************************
2373 * CALCULATE INTERACTIONS *
2374 **************************/
2375
2376 r31 = _mm_mul_ps(rsq31,rinv31);
2377 r31 = _mm_andnot_ps(dummy_mask,r31);
2378
2379 /* EWALD ELECTROSTATICS */
2380
2381 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2382 ewrt = _mm_mul_ps(r31,ewtabscale);
2383 ewitab = _mm_cvttps_epi32(ewrt);
2384 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2385 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2386 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2387 &ewtabF,&ewtabFn);
2388 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2389 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2390
2391 fscal = felec;
2392
2393 fscal = _mm_andnot_ps(dummy_mask,fscal);
2394
2395 /* Calculate temporary vectorial force */
2396 tx = _mm_mul_ps(fscal,dx31);
2397 ty = _mm_mul_ps(fscal,dy31);
2398 tz = _mm_mul_ps(fscal,dz31);
2399
2400 /* Update vectorial force */
2401 fix3 = _mm_add_ps(fix3,tx);
2402 fiy3 = _mm_add_ps(fiy3,ty);
2403 fiz3 = _mm_add_ps(fiz3,tz);
2404
2405 fjx1 = _mm_add_ps(fjx1,tx);
2406 fjy1 = _mm_add_ps(fjy1,ty);
2407 fjz1 = _mm_add_ps(fjz1,tz);
2408
2409 /**************************
2410 * CALCULATE INTERACTIONS *
2411 **************************/
2412
2413 r32 = _mm_mul_ps(rsq32,rinv32);
2414 r32 = _mm_andnot_ps(dummy_mask,r32);
2415
2416 /* EWALD ELECTROSTATICS */
2417
2418 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2419 ewrt = _mm_mul_ps(r32,ewtabscale);
2420 ewitab = _mm_cvttps_epi32(ewrt);
2421 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2422 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2423 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2424 &ewtabF,&ewtabFn);
2425 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2426 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2427
2428 fscal = felec;
2429
2430 fscal = _mm_andnot_ps(dummy_mask,fscal);
2431
2432 /* Calculate temporary vectorial force */
2433 tx = _mm_mul_ps(fscal,dx32);
2434 ty = _mm_mul_ps(fscal,dy32);
2435 tz = _mm_mul_ps(fscal,dz32);
2436
2437 /* Update vectorial force */
2438 fix3 = _mm_add_ps(fix3,tx);
2439 fiy3 = _mm_add_ps(fiy3,ty);
2440 fiz3 = _mm_add_ps(fiz3,tz);
2441
2442 fjx2 = _mm_add_ps(fjx2,tx);
2443 fjy2 = _mm_add_ps(fjy2,ty);
2444 fjz2 = _mm_add_ps(fjz2,tz);
2445
2446 /**************************
2447 * CALCULATE INTERACTIONS *
2448 **************************/
2449
2450 r33 = _mm_mul_ps(rsq33,rinv33);
2451 r33 = _mm_andnot_ps(dummy_mask,r33);
2452
2453 /* EWALD ELECTROSTATICS */
2454
2455 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2456 ewrt = _mm_mul_ps(r33,ewtabscale);
2457 ewitab = _mm_cvttps_epi32(ewrt);
2458 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2459 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2460 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2461 &ewtabF,&ewtabFn);
2462 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2463 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2464
2465 fscal = felec;
2466
2467 fscal = _mm_andnot_ps(dummy_mask,fscal);
2468
2469 /* Calculate temporary vectorial force */
2470 tx = _mm_mul_ps(fscal,dx33);
2471 ty = _mm_mul_ps(fscal,dy33);
2472 tz = _mm_mul_ps(fscal,dz33);
2473
2474 /* Update vectorial force */
2475 fix3 = _mm_add_ps(fix3,tx);
2476 fiy3 = _mm_add_ps(fiy3,ty);
2477 fiz3 = _mm_add_ps(fiz3,tz);
2478
2479 fjx3 = _mm_add_ps(fjx3,tx);
2480 fjy3 = _mm_add_ps(fjy3,ty);
2481 fjz3 = _mm_add_ps(fjz3,tz);
2482
2483 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2484 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2485 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2486 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2487
2488 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2489 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2490 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2491
2492 /* Inner loop uses 383 flops */
2493 }
2494
2495 /* End of innermost loop */
2496
2497 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2498 f+i_coord_offset,fshift+i_shift_offset);
2499
2500 /* Increment number of inner iterations */
2501 inneriter += j_index_end - j_index_start;
2502
2503 /* Outer loop uses 24 flops */
2504 }
2505
2506 /* Increment number of outer iterations */
2507 outeriter += nri;
2508
2509 /* Update outer/inner flops */
2510
2511 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4W4_F] += outeriter*24 + inneriter
*383
;
2512}