Bug Summary

File:gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse4_1_single.c
Location:line 1570, column 5
Description:Value stored to 'jnrA' is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwioffset1;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 int vdwioffset2;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 real *charge;
108 int nvdwtype;
109 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 int *vdwtype;
111 real *vdwparam;
112 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
113 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
114 __m128i ewitab;
115 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
116 real *ewtab;
117 __m128 dummy_mask,cutoff_mask;
118 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119 __m128 one = _mm_set1_ps(1.0);
120 __m128 two = _mm_set1_ps(2.0);
121 x = xx[0];
122 f = ff[0];
123
124 nri = nlist->nri;
125 iinr = nlist->iinr;
126 jindex = nlist->jindex;
127 jjnr = nlist->jjnr;
128 shiftidx = nlist->shift;
129 gid = nlist->gid;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_ps(fr->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
135 vdwparam = fr->nbfp;
136 vdwtype = mdatoms->typeA;
137
138 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
139 ewtab = fr->ic->tabq_coul_FDV0;
140 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
141 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
142
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
146 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
147 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
148 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
149
150 jq0 = _mm_set1_ps(charge[inr+0]);
151 jq1 = _mm_set1_ps(charge[inr+1]);
152 jq2 = _mm_set1_ps(charge[inr+2]);
153 vdwjidx0A = 2*vdwtype[inr+0];
154 qq00 = _mm_mul_ps(iq0,jq0);
155 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
156 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
157 qq01 = _mm_mul_ps(iq0,jq1);
158 qq02 = _mm_mul_ps(iq0,jq2);
159 qq10 = _mm_mul_ps(iq1,jq0);
160 qq11 = _mm_mul_ps(iq1,jq1);
161 qq12 = _mm_mul_ps(iq1,jq2);
162 qq20 = _mm_mul_ps(iq2,jq0);
163 qq21 = _mm_mul_ps(iq2,jq1);
164 qq22 = _mm_mul_ps(iq2,jq2);
165
166 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
167 rcutoff_scalar = fr->rcoulomb;
168 rcutoff = _mm_set1_ps(rcutoff_scalar);
169 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
170
171 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
172 rvdw = _mm_set1_ps(fr->rvdw);
173
174 /* Avoid stupid compiler warnings */
175 jnrA = jnrB = jnrC = jnrD = 0;
176 j_coord_offsetA = 0;
177 j_coord_offsetB = 0;
178 j_coord_offsetC = 0;
179 j_coord_offsetD = 0;
180
181 outeriter = 0;
182 inneriter = 0;
183
184 for(iidx=0;iidx<4*DIM3;iidx++)
185 {
186 scratch[iidx] = 0.0;
187 }
188
189 /* Start outer loop over neighborlists */
190 for(iidx=0; iidx<nri; iidx++)
191 {
192 /* Load shift vector for this list */
193 i_shift_offset = DIM3*shiftidx[iidx];
194
195 /* Load limits for loop over neighbors */
196 j_index_start = jindex[iidx];
197 j_index_end = jindex[iidx+1];
198
199 /* Get outer coordinate index */
200 inr = iinr[iidx];
201 i_coord_offset = DIM3*inr;
202
203 /* Load i particle coords and add shift vector */
204 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
205 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
206
207 fix0 = _mm_setzero_ps();
208 fiy0 = _mm_setzero_ps();
209 fiz0 = _mm_setzero_ps();
210 fix1 = _mm_setzero_ps();
211 fiy1 = _mm_setzero_ps();
212 fiz1 = _mm_setzero_ps();
213 fix2 = _mm_setzero_ps();
214 fiy2 = _mm_setzero_ps();
215 fiz2 = _mm_setzero_ps();
216
217 /* Reset potential sums */
218 velecsum = _mm_setzero_ps();
219 vvdwsum = _mm_setzero_ps();
220
221 /* Start inner kernel loop */
222 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
223 {
224
225 /* Get j neighbor index, and coordinate index */
226 jnrA = jjnr[jidx];
227 jnrB = jjnr[jidx+1];
228 jnrC = jjnr[jidx+2];
229 jnrD = jjnr[jidx+3];
230 j_coord_offsetA = DIM3*jnrA;
231 j_coord_offsetB = DIM3*jnrB;
232 j_coord_offsetC = DIM3*jnrC;
233 j_coord_offsetD = DIM3*jnrD;
234
235 /* load j atom coordinates */
236 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
237 x+j_coord_offsetC,x+j_coord_offsetD,
238 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
239
240 /* Calculate displacement vector */
241 dx00 = _mm_sub_ps(ix0,jx0);
242 dy00 = _mm_sub_ps(iy0,jy0);
243 dz00 = _mm_sub_ps(iz0,jz0);
244 dx01 = _mm_sub_ps(ix0,jx1);
245 dy01 = _mm_sub_ps(iy0,jy1);
246 dz01 = _mm_sub_ps(iz0,jz1);
247 dx02 = _mm_sub_ps(ix0,jx2);
248 dy02 = _mm_sub_ps(iy0,jy2);
249 dz02 = _mm_sub_ps(iz0,jz2);
250 dx10 = _mm_sub_ps(ix1,jx0);
251 dy10 = _mm_sub_ps(iy1,jy0);
252 dz10 = _mm_sub_ps(iz1,jz0);
253 dx11 = _mm_sub_ps(ix1,jx1);
254 dy11 = _mm_sub_ps(iy1,jy1);
255 dz11 = _mm_sub_ps(iz1,jz1);
256 dx12 = _mm_sub_ps(ix1,jx2);
257 dy12 = _mm_sub_ps(iy1,jy2);
258 dz12 = _mm_sub_ps(iz1,jz2);
259 dx20 = _mm_sub_ps(ix2,jx0);
260 dy20 = _mm_sub_ps(iy2,jy0);
261 dz20 = _mm_sub_ps(iz2,jz0);
262 dx21 = _mm_sub_ps(ix2,jx1);
263 dy21 = _mm_sub_ps(iy2,jy1);
264 dz21 = _mm_sub_ps(iz2,jz1);
265 dx22 = _mm_sub_ps(ix2,jx2);
266 dy22 = _mm_sub_ps(iy2,jy2);
267 dz22 = _mm_sub_ps(iz2,jz2);
268
269 /* Calculate squared distance and things based on it */
270 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
271 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
272 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
273 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
274 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
275 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
276 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
277 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
278 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
279
280 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
281 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
282 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
283 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
284 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
285 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
286 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
287 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
288 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
289
290 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
291 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
292 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
293 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
294 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
295 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
296 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
297 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
298 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
299
300 fjx0 = _mm_setzero_ps();
301 fjy0 = _mm_setzero_ps();
302 fjz0 = _mm_setzero_ps();
303 fjx1 = _mm_setzero_ps();
304 fjy1 = _mm_setzero_ps();
305 fjz1 = _mm_setzero_ps();
306 fjx2 = _mm_setzero_ps();
307 fjy2 = _mm_setzero_ps();
308 fjz2 = _mm_setzero_ps();
309
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
313
314 if (gmx_mm_any_lt(rsq00,rcutoff2))
315 {
316
317 r00 = _mm_mul_ps(rsq00,rinv00);
318
319 /* EWALD ELECTROSTATICS */
320
321 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
322 ewrt = _mm_mul_ps(r00,ewtabscale);
323 ewitab = _mm_cvttps_epi32(ewrt);
324 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
325 ewitab = _mm_slli_epi32(ewitab,2);
326 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
327 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
328 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
329 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
330 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
331 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
332 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
333 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
334 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
335
336 /* LENNARD-JONES DISPERSION/REPULSION */
337
338 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
339 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
340 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
341 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
342 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
343 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
344
345 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
346
347 /* Update potential sum for this i atom from the interaction with this j atom. */
348 velec = _mm_and_ps(velec,cutoff_mask);
349 velecsum = _mm_add_ps(velecsum,velec);
350 vvdw = _mm_and_ps(vvdw,cutoff_mask);
351 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
352
353 fscal = _mm_add_ps(felec,fvdw);
354
355 fscal = _mm_and_ps(fscal,cutoff_mask);
356
357 /* Calculate temporary vectorial force */
358 tx = _mm_mul_ps(fscal,dx00);
359 ty = _mm_mul_ps(fscal,dy00);
360 tz = _mm_mul_ps(fscal,dz00);
361
362 /* Update vectorial force */
363 fix0 = _mm_add_ps(fix0,tx);
364 fiy0 = _mm_add_ps(fiy0,ty);
365 fiz0 = _mm_add_ps(fiz0,tz);
366
367 fjx0 = _mm_add_ps(fjx0,tx);
368 fjy0 = _mm_add_ps(fjy0,ty);
369 fjz0 = _mm_add_ps(fjz0,tz);
370
371 }
372
373 /**************************
374 * CALCULATE INTERACTIONS *
375 **************************/
376
377 if (gmx_mm_any_lt(rsq01,rcutoff2))
378 {
379
380 r01 = _mm_mul_ps(rsq01,rinv01);
381
382 /* EWALD ELECTROSTATICS */
383
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt = _mm_mul_ps(r01,ewtabscale);
386 ewitab = _mm_cvttps_epi32(ewrt);
387 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
388 ewitab = _mm_slli_epi32(ewitab,2);
389 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
390 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
391 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
392 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
393 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
394 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
395 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
396 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
397 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
398
399 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
400
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velec = _mm_and_ps(velec,cutoff_mask);
403 velecsum = _mm_add_ps(velecsum,velec);
404
405 fscal = felec;
406
407 fscal = _mm_and_ps(fscal,cutoff_mask);
408
409 /* Calculate temporary vectorial force */
410 tx = _mm_mul_ps(fscal,dx01);
411 ty = _mm_mul_ps(fscal,dy01);
412 tz = _mm_mul_ps(fscal,dz01);
413
414 /* Update vectorial force */
415 fix0 = _mm_add_ps(fix0,tx);
416 fiy0 = _mm_add_ps(fiy0,ty);
417 fiz0 = _mm_add_ps(fiz0,tz);
418
419 fjx1 = _mm_add_ps(fjx1,tx);
420 fjy1 = _mm_add_ps(fjy1,ty);
421 fjz1 = _mm_add_ps(fjz1,tz);
422
423 }
424
425 /**************************
426 * CALCULATE INTERACTIONS *
427 **************************/
428
429 if (gmx_mm_any_lt(rsq02,rcutoff2))
430 {
431
432 r02 = _mm_mul_ps(rsq02,rinv02);
433
434 /* EWALD ELECTROSTATICS */
435
436 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
437 ewrt = _mm_mul_ps(r02,ewtabscale);
438 ewitab = _mm_cvttps_epi32(ewrt);
439 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
440 ewitab = _mm_slli_epi32(ewitab,2);
441 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
442 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
443 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
444 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
445 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
446 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
447 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
448 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
449 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
450
451 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
452
453 /* Update potential sum for this i atom from the interaction with this j atom. */
454 velec = _mm_and_ps(velec,cutoff_mask);
455 velecsum = _mm_add_ps(velecsum,velec);
456
457 fscal = felec;
458
459 fscal = _mm_and_ps(fscal,cutoff_mask);
460
461 /* Calculate temporary vectorial force */
462 tx = _mm_mul_ps(fscal,dx02);
463 ty = _mm_mul_ps(fscal,dy02);
464 tz = _mm_mul_ps(fscal,dz02);
465
466 /* Update vectorial force */
467 fix0 = _mm_add_ps(fix0,tx);
468 fiy0 = _mm_add_ps(fiy0,ty);
469 fiz0 = _mm_add_ps(fiz0,tz);
470
471 fjx2 = _mm_add_ps(fjx2,tx);
472 fjy2 = _mm_add_ps(fjy2,ty);
473 fjz2 = _mm_add_ps(fjz2,tz);
474
475 }
476
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
480
481 if (gmx_mm_any_lt(rsq10,rcutoff2))
482 {
483
484 r10 = _mm_mul_ps(rsq10,rinv10);
485
486 /* EWALD ELECTROSTATICS */
487
488 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
489 ewrt = _mm_mul_ps(r10,ewtabscale);
490 ewitab = _mm_cvttps_epi32(ewrt);
491 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
492 ewitab = _mm_slli_epi32(ewitab,2);
493 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
494 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
495 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
496 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
497 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
498 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
499 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
500 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
501 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
502
503 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
504
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velec = _mm_and_ps(velec,cutoff_mask);
507 velecsum = _mm_add_ps(velecsum,velec);
508
509 fscal = felec;
510
511 fscal = _mm_and_ps(fscal,cutoff_mask);
512
513 /* Calculate temporary vectorial force */
514 tx = _mm_mul_ps(fscal,dx10);
515 ty = _mm_mul_ps(fscal,dy10);
516 tz = _mm_mul_ps(fscal,dz10);
517
518 /* Update vectorial force */
519 fix1 = _mm_add_ps(fix1,tx);
520 fiy1 = _mm_add_ps(fiy1,ty);
521 fiz1 = _mm_add_ps(fiz1,tz);
522
523 fjx0 = _mm_add_ps(fjx0,tx);
524 fjy0 = _mm_add_ps(fjy0,ty);
525 fjz0 = _mm_add_ps(fjz0,tz);
526
527 }
528
529 /**************************
530 * CALCULATE INTERACTIONS *
531 **************************/
532
533 if (gmx_mm_any_lt(rsq11,rcutoff2))
534 {
535
536 r11 = _mm_mul_ps(rsq11,rinv11);
537
538 /* EWALD ELECTROSTATICS */
539
540 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
541 ewrt = _mm_mul_ps(r11,ewtabscale);
542 ewitab = _mm_cvttps_epi32(ewrt);
543 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
544 ewitab = _mm_slli_epi32(ewitab,2);
545 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
546 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
547 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
548 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
549 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
550 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
551 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
552 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
553 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
554
555 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
556
557 /* Update potential sum for this i atom from the interaction with this j atom. */
558 velec = _mm_and_ps(velec,cutoff_mask);
559 velecsum = _mm_add_ps(velecsum,velec);
560
561 fscal = felec;
562
563 fscal = _mm_and_ps(fscal,cutoff_mask);
564
565 /* Calculate temporary vectorial force */
566 tx = _mm_mul_ps(fscal,dx11);
567 ty = _mm_mul_ps(fscal,dy11);
568 tz = _mm_mul_ps(fscal,dz11);
569
570 /* Update vectorial force */
571 fix1 = _mm_add_ps(fix1,tx);
572 fiy1 = _mm_add_ps(fiy1,ty);
573 fiz1 = _mm_add_ps(fiz1,tz);
574
575 fjx1 = _mm_add_ps(fjx1,tx);
576 fjy1 = _mm_add_ps(fjy1,ty);
577 fjz1 = _mm_add_ps(fjz1,tz);
578
579 }
580
581 /**************************
582 * CALCULATE INTERACTIONS *
583 **************************/
584
585 if (gmx_mm_any_lt(rsq12,rcutoff2))
586 {
587
588 r12 = _mm_mul_ps(rsq12,rinv12);
589
590 /* EWALD ELECTROSTATICS */
591
592 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
593 ewrt = _mm_mul_ps(r12,ewtabscale);
594 ewitab = _mm_cvttps_epi32(ewrt);
595 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
596 ewitab = _mm_slli_epi32(ewitab,2);
597 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
598 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
599 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
600 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
601 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
602 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
603 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
604 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
605 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
606
607 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
608
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velec = _mm_and_ps(velec,cutoff_mask);
611 velecsum = _mm_add_ps(velecsum,velec);
612
613 fscal = felec;
614
615 fscal = _mm_and_ps(fscal,cutoff_mask);
616
617 /* Calculate temporary vectorial force */
618 tx = _mm_mul_ps(fscal,dx12);
619 ty = _mm_mul_ps(fscal,dy12);
620 tz = _mm_mul_ps(fscal,dz12);
621
622 /* Update vectorial force */
623 fix1 = _mm_add_ps(fix1,tx);
624 fiy1 = _mm_add_ps(fiy1,ty);
625 fiz1 = _mm_add_ps(fiz1,tz);
626
627 fjx2 = _mm_add_ps(fjx2,tx);
628 fjy2 = _mm_add_ps(fjy2,ty);
629 fjz2 = _mm_add_ps(fjz2,tz);
630
631 }
632
633 /**************************
634 * CALCULATE INTERACTIONS *
635 **************************/
636
637 if (gmx_mm_any_lt(rsq20,rcutoff2))
638 {
639
640 r20 = _mm_mul_ps(rsq20,rinv20);
641
642 /* EWALD ELECTROSTATICS */
643
644 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
645 ewrt = _mm_mul_ps(r20,ewtabscale);
646 ewitab = _mm_cvttps_epi32(ewrt);
647 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
648 ewitab = _mm_slli_epi32(ewitab,2);
649 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
650 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
651 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
652 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
653 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
654 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
655 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
656 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
657 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
658
659 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
660
661 /* Update potential sum for this i atom from the interaction with this j atom. */
662 velec = _mm_and_ps(velec,cutoff_mask);
663 velecsum = _mm_add_ps(velecsum,velec);
664
665 fscal = felec;
666
667 fscal = _mm_and_ps(fscal,cutoff_mask);
668
669 /* Calculate temporary vectorial force */
670 tx = _mm_mul_ps(fscal,dx20);
671 ty = _mm_mul_ps(fscal,dy20);
672 tz = _mm_mul_ps(fscal,dz20);
673
674 /* Update vectorial force */
675 fix2 = _mm_add_ps(fix2,tx);
676 fiy2 = _mm_add_ps(fiy2,ty);
677 fiz2 = _mm_add_ps(fiz2,tz);
678
679 fjx0 = _mm_add_ps(fjx0,tx);
680 fjy0 = _mm_add_ps(fjy0,ty);
681 fjz0 = _mm_add_ps(fjz0,tz);
682
683 }
684
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
688
689 if (gmx_mm_any_lt(rsq21,rcutoff2))
690 {
691
692 r21 = _mm_mul_ps(rsq21,rinv21);
693
694 /* EWALD ELECTROSTATICS */
695
696 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
697 ewrt = _mm_mul_ps(r21,ewtabscale);
698 ewitab = _mm_cvttps_epi32(ewrt);
699 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
700 ewitab = _mm_slli_epi32(ewitab,2);
701 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
702 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
703 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
704 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
705 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
706 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
707 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
708 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
709 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
710
711 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
712
713 /* Update potential sum for this i atom from the interaction with this j atom. */
714 velec = _mm_and_ps(velec,cutoff_mask);
715 velecsum = _mm_add_ps(velecsum,velec);
716
717 fscal = felec;
718
719 fscal = _mm_and_ps(fscal,cutoff_mask);
720
721 /* Calculate temporary vectorial force */
722 tx = _mm_mul_ps(fscal,dx21);
723 ty = _mm_mul_ps(fscal,dy21);
724 tz = _mm_mul_ps(fscal,dz21);
725
726 /* Update vectorial force */
727 fix2 = _mm_add_ps(fix2,tx);
728 fiy2 = _mm_add_ps(fiy2,ty);
729 fiz2 = _mm_add_ps(fiz2,tz);
730
731 fjx1 = _mm_add_ps(fjx1,tx);
732 fjy1 = _mm_add_ps(fjy1,ty);
733 fjz1 = _mm_add_ps(fjz1,tz);
734
735 }
736
737 /**************************
738 * CALCULATE INTERACTIONS *
739 **************************/
740
741 if (gmx_mm_any_lt(rsq22,rcutoff2))
742 {
743
744 r22 = _mm_mul_ps(rsq22,rinv22);
745
746 /* EWALD ELECTROSTATICS */
747
748 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
749 ewrt = _mm_mul_ps(r22,ewtabscale);
750 ewitab = _mm_cvttps_epi32(ewrt);
751 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
752 ewitab = _mm_slli_epi32(ewitab,2);
753 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
754 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
755 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
756 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
757 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
758 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
759 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
760 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
761 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
762
763 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
764
765 /* Update potential sum for this i atom from the interaction with this j atom. */
766 velec = _mm_and_ps(velec,cutoff_mask);
767 velecsum = _mm_add_ps(velecsum,velec);
768
769 fscal = felec;
770
771 fscal = _mm_and_ps(fscal,cutoff_mask);
772
773 /* Calculate temporary vectorial force */
774 tx = _mm_mul_ps(fscal,dx22);
775 ty = _mm_mul_ps(fscal,dy22);
776 tz = _mm_mul_ps(fscal,dz22);
777
778 /* Update vectorial force */
779 fix2 = _mm_add_ps(fix2,tx);
780 fiy2 = _mm_add_ps(fiy2,ty);
781 fiz2 = _mm_add_ps(fiz2,tz);
782
783 fjx2 = _mm_add_ps(fjx2,tx);
784 fjy2 = _mm_add_ps(fjy2,ty);
785 fjz2 = _mm_add_ps(fjz2,tz);
786
787 }
788
789 fjptrA = f+j_coord_offsetA;
790 fjptrB = f+j_coord_offsetB;
791 fjptrC = f+j_coord_offsetC;
792 fjptrD = f+j_coord_offsetD;
793
794 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
795 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
796
797 /* Inner loop uses 432 flops */
798 }
799
800 if(jidx<j_index_end)
801 {
802
803 /* Get j neighbor index, and coordinate index */
804 jnrlistA = jjnr[jidx];
805 jnrlistB = jjnr[jidx+1];
806 jnrlistC = jjnr[jidx+2];
807 jnrlistD = jjnr[jidx+3];
808 /* Sign of each element will be negative for non-real atoms.
809 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
810 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
811 */
812 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
813 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
814 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
815 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
816 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
817 j_coord_offsetA = DIM3*jnrA;
818 j_coord_offsetB = DIM3*jnrB;
819 j_coord_offsetC = DIM3*jnrC;
820 j_coord_offsetD = DIM3*jnrD;
821
822 /* load j atom coordinates */
823 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
824 x+j_coord_offsetC,x+j_coord_offsetD,
825 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
826
827 /* Calculate displacement vector */
828 dx00 = _mm_sub_ps(ix0,jx0);
829 dy00 = _mm_sub_ps(iy0,jy0);
830 dz00 = _mm_sub_ps(iz0,jz0);
831 dx01 = _mm_sub_ps(ix0,jx1);
832 dy01 = _mm_sub_ps(iy0,jy1);
833 dz01 = _mm_sub_ps(iz0,jz1);
834 dx02 = _mm_sub_ps(ix0,jx2);
835 dy02 = _mm_sub_ps(iy0,jy2);
836 dz02 = _mm_sub_ps(iz0,jz2);
837 dx10 = _mm_sub_ps(ix1,jx0);
838 dy10 = _mm_sub_ps(iy1,jy0);
839 dz10 = _mm_sub_ps(iz1,jz0);
840 dx11 = _mm_sub_ps(ix1,jx1);
841 dy11 = _mm_sub_ps(iy1,jy1);
842 dz11 = _mm_sub_ps(iz1,jz1);
843 dx12 = _mm_sub_ps(ix1,jx2);
844 dy12 = _mm_sub_ps(iy1,jy2);
845 dz12 = _mm_sub_ps(iz1,jz2);
846 dx20 = _mm_sub_ps(ix2,jx0);
847 dy20 = _mm_sub_ps(iy2,jy0);
848 dz20 = _mm_sub_ps(iz2,jz0);
849 dx21 = _mm_sub_ps(ix2,jx1);
850 dy21 = _mm_sub_ps(iy2,jy1);
851 dz21 = _mm_sub_ps(iz2,jz1);
852 dx22 = _mm_sub_ps(ix2,jx2);
853 dy22 = _mm_sub_ps(iy2,jy2);
854 dz22 = _mm_sub_ps(iz2,jz2);
855
856 /* Calculate squared distance and things based on it */
857 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
858 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
859 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
860 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
861 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
862 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
863 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
864 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
865 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
866
867 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
868 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
869 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
870 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
871 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
872 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
873 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
874 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
875 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
876
877 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
878 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
879 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
880 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
881 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
882 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
883 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
884 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
885 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
886
887 fjx0 = _mm_setzero_ps();
888 fjy0 = _mm_setzero_ps();
889 fjz0 = _mm_setzero_ps();
890 fjx1 = _mm_setzero_ps();
891 fjy1 = _mm_setzero_ps();
892 fjz1 = _mm_setzero_ps();
893 fjx2 = _mm_setzero_ps();
894 fjy2 = _mm_setzero_ps();
895 fjz2 = _mm_setzero_ps();
896
897 /**************************
898 * CALCULATE INTERACTIONS *
899 **************************/
900
901 if (gmx_mm_any_lt(rsq00,rcutoff2))
902 {
903
904 r00 = _mm_mul_ps(rsq00,rinv00);
905 r00 = _mm_andnot_ps(dummy_mask,r00);
906
907 /* EWALD ELECTROSTATICS */
908
909 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
910 ewrt = _mm_mul_ps(r00,ewtabscale);
911 ewitab = _mm_cvttps_epi32(ewrt);
912 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
913 ewitab = _mm_slli_epi32(ewitab,2);
914 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
915 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
916 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
917 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
918 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
919 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
920 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
921 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
922 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
923
924 /* LENNARD-JONES DISPERSION/REPULSION */
925
926 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
927 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
928 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
929 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
930 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
931 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
932
933 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
934
935 /* Update potential sum for this i atom from the interaction with this j atom. */
936 velec = _mm_and_ps(velec,cutoff_mask);
937 velec = _mm_andnot_ps(dummy_mask,velec);
938 velecsum = _mm_add_ps(velecsum,velec);
939 vvdw = _mm_and_ps(vvdw,cutoff_mask);
940 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
941 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
942
943 fscal = _mm_add_ps(felec,fvdw);
944
945 fscal = _mm_and_ps(fscal,cutoff_mask);
946
947 fscal = _mm_andnot_ps(dummy_mask,fscal);
948
949 /* Calculate temporary vectorial force */
950 tx = _mm_mul_ps(fscal,dx00);
951 ty = _mm_mul_ps(fscal,dy00);
952 tz = _mm_mul_ps(fscal,dz00);
953
954 /* Update vectorial force */
955 fix0 = _mm_add_ps(fix0,tx);
956 fiy0 = _mm_add_ps(fiy0,ty);
957 fiz0 = _mm_add_ps(fiz0,tz);
958
959 fjx0 = _mm_add_ps(fjx0,tx);
960 fjy0 = _mm_add_ps(fjy0,ty);
961 fjz0 = _mm_add_ps(fjz0,tz);
962
963 }
964
965 /**************************
966 * CALCULATE INTERACTIONS *
967 **************************/
968
969 if (gmx_mm_any_lt(rsq01,rcutoff2))
970 {
971
972 r01 = _mm_mul_ps(rsq01,rinv01);
973 r01 = _mm_andnot_ps(dummy_mask,r01);
974
975 /* EWALD ELECTROSTATICS */
976
977 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
978 ewrt = _mm_mul_ps(r01,ewtabscale);
979 ewitab = _mm_cvttps_epi32(ewrt);
980 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
981 ewitab = _mm_slli_epi32(ewitab,2);
982 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
983 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
984 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
985 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
986 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
987 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
988 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
989 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
990 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
991
992 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
993
994 /* Update potential sum for this i atom from the interaction with this j atom. */
995 velec = _mm_and_ps(velec,cutoff_mask);
996 velec = _mm_andnot_ps(dummy_mask,velec);
997 velecsum = _mm_add_ps(velecsum,velec);
998
999 fscal = felec;
1000
1001 fscal = _mm_and_ps(fscal,cutoff_mask);
1002
1003 fscal = _mm_andnot_ps(dummy_mask,fscal);
1004
1005 /* Calculate temporary vectorial force */
1006 tx = _mm_mul_ps(fscal,dx01);
1007 ty = _mm_mul_ps(fscal,dy01);
1008 tz = _mm_mul_ps(fscal,dz01);
1009
1010 /* Update vectorial force */
1011 fix0 = _mm_add_ps(fix0,tx);
1012 fiy0 = _mm_add_ps(fiy0,ty);
1013 fiz0 = _mm_add_ps(fiz0,tz);
1014
1015 fjx1 = _mm_add_ps(fjx1,tx);
1016 fjy1 = _mm_add_ps(fjy1,ty);
1017 fjz1 = _mm_add_ps(fjz1,tz);
1018
1019 }
1020
1021 /**************************
1022 * CALCULATE INTERACTIONS *
1023 **************************/
1024
1025 if (gmx_mm_any_lt(rsq02,rcutoff2))
1026 {
1027
1028 r02 = _mm_mul_ps(rsq02,rinv02);
1029 r02 = _mm_andnot_ps(dummy_mask,r02);
1030
1031 /* EWALD ELECTROSTATICS */
1032
1033 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1034 ewrt = _mm_mul_ps(r02,ewtabscale);
1035 ewitab = _mm_cvttps_epi32(ewrt);
1036 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1037 ewitab = _mm_slli_epi32(ewitab,2);
1038 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1039 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1040 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1041 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1042 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1043 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1044 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1045 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1046 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1047
1048 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1049
1050 /* Update potential sum for this i atom from the interaction with this j atom. */
1051 velec = _mm_and_ps(velec,cutoff_mask);
1052 velec = _mm_andnot_ps(dummy_mask,velec);
1053 velecsum = _mm_add_ps(velecsum,velec);
1054
1055 fscal = felec;
1056
1057 fscal = _mm_and_ps(fscal,cutoff_mask);
1058
1059 fscal = _mm_andnot_ps(dummy_mask,fscal);
1060
1061 /* Calculate temporary vectorial force */
1062 tx = _mm_mul_ps(fscal,dx02);
1063 ty = _mm_mul_ps(fscal,dy02);
1064 tz = _mm_mul_ps(fscal,dz02);
1065
1066 /* Update vectorial force */
1067 fix0 = _mm_add_ps(fix0,tx);
1068 fiy0 = _mm_add_ps(fiy0,ty);
1069 fiz0 = _mm_add_ps(fiz0,tz);
1070
1071 fjx2 = _mm_add_ps(fjx2,tx);
1072 fjy2 = _mm_add_ps(fjy2,ty);
1073 fjz2 = _mm_add_ps(fjz2,tz);
1074
1075 }
1076
1077 /**************************
1078 * CALCULATE INTERACTIONS *
1079 **************************/
1080
1081 if (gmx_mm_any_lt(rsq10,rcutoff2))
1082 {
1083
1084 r10 = _mm_mul_ps(rsq10,rinv10);
1085 r10 = _mm_andnot_ps(dummy_mask,r10);
1086
1087 /* EWALD ELECTROSTATICS */
1088
1089 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1090 ewrt = _mm_mul_ps(r10,ewtabscale);
1091 ewitab = _mm_cvttps_epi32(ewrt);
1092 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1093 ewitab = _mm_slli_epi32(ewitab,2);
1094 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1095 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1096 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1097 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1098 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1099 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1100 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1101 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1102 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1103
1104 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1105
1106 /* Update potential sum for this i atom from the interaction with this j atom. */
1107 velec = _mm_and_ps(velec,cutoff_mask);
1108 velec = _mm_andnot_ps(dummy_mask,velec);
1109 velecsum = _mm_add_ps(velecsum,velec);
1110
1111 fscal = felec;
1112
1113 fscal = _mm_and_ps(fscal,cutoff_mask);
1114
1115 fscal = _mm_andnot_ps(dummy_mask,fscal);
1116
1117 /* Calculate temporary vectorial force */
1118 tx = _mm_mul_ps(fscal,dx10);
1119 ty = _mm_mul_ps(fscal,dy10);
1120 tz = _mm_mul_ps(fscal,dz10);
1121
1122 /* Update vectorial force */
1123 fix1 = _mm_add_ps(fix1,tx);
1124 fiy1 = _mm_add_ps(fiy1,ty);
1125 fiz1 = _mm_add_ps(fiz1,tz);
1126
1127 fjx0 = _mm_add_ps(fjx0,tx);
1128 fjy0 = _mm_add_ps(fjy0,ty);
1129 fjz0 = _mm_add_ps(fjz0,tz);
1130
1131 }
1132
1133 /**************************
1134 * CALCULATE INTERACTIONS *
1135 **************************/
1136
1137 if (gmx_mm_any_lt(rsq11,rcutoff2))
1138 {
1139
1140 r11 = _mm_mul_ps(rsq11,rinv11);
1141 r11 = _mm_andnot_ps(dummy_mask,r11);
1142
1143 /* EWALD ELECTROSTATICS */
1144
1145 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1146 ewrt = _mm_mul_ps(r11,ewtabscale);
1147 ewitab = _mm_cvttps_epi32(ewrt);
1148 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1149 ewitab = _mm_slli_epi32(ewitab,2);
1150 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1151 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1152 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1153 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1154 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1155 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1156 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1157 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1158 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1159
1160 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1161
1162 /* Update potential sum for this i atom from the interaction with this j atom. */
1163 velec = _mm_and_ps(velec,cutoff_mask);
1164 velec = _mm_andnot_ps(dummy_mask,velec);
1165 velecsum = _mm_add_ps(velecsum,velec);
1166
1167 fscal = felec;
1168
1169 fscal = _mm_and_ps(fscal,cutoff_mask);
1170
1171 fscal = _mm_andnot_ps(dummy_mask,fscal);
1172
1173 /* Calculate temporary vectorial force */
1174 tx = _mm_mul_ps(fscal,dx11);
1175 ty = _mm_mul_ps(fscal,dy11);
1176 tz = _mm_mul_ps(fscal,dz11);
1177
1178 /* Update vectorial force */
1179 fix1 = _mm_add_ps(fix1,tx);
1180 fiy1 = _mm_add_ps(fiy1,ty);
1181 fiz1 = _mm_add_ps(fiz1,tz);
1182
1183 fjx1 = _mm_add_ps(fjx1,tx);
1184 fjy1 = _mm_add_ps(fjy1,ty);
1185 fjz1 = _mm_add_ps(fjz1,tz);
1186
1187 }
1188
1189 /**************************
1190 * CALCULATE INTERACTIONS *
1191 **************************/
1192
1193 if (gmx_mm_any_lt(rsq12,rcutoff2))
1194 {
1195
1196 r12 = _mm_mul_ps(rsq12,rinv12);
1197 r12 = _mm_andnot_ps(dummy_mask,r12);
1198
1199 /* EWALD ELECTROSTATICS */
1200
1201 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1202 ewrt = _mm_mul_ps(r12,ewtabscale);
1203 ewitab = _mm_cvttps_epi32(ewrt);
1204 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1205 ewitab = _mm_slli_epi32(ewitab,2);
1206 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1207 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1208 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1209 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1210 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1211 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1212 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1213 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1214 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1215
1216 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1217
1218 /* Update potential sum for this i atom from the interaction with this j atom. */
1219 velec = _mm_and_ps(velec,cutoff_mask);
1220 velec = _mm_andnot_ps(dummy_mask,velec);
1221 velecsum = _mm_add_ps(velecsum,velec);
1222
1223 fscal = felec;
1224
1225 fscal = _mm_and_ps(fscal,cutoff_mask);
1226
1227 fscal = _mm_andnot_ps(dummy_mask,fscal);
1228
1229 /* Calculate temporary vectorial force */
1230 tx = _mm_mul_ps(fscal,dx12);
1231 ty = _mm_mul_ps(fscal,dy12);
1232 tz = _mm_mul_ps(fscal,dz12);
1233
1234 /* Update vectorial force */
1235 fix1 = _mm_add_ps(fix1,tx);
1236 fiy1 = _mm_add_ps(fiy1,ty);
1237 fiz1 = _mm_add_ps(fiz1,tz);
1238
1239 fjx2 = _mm_add_ps(fjx2,tx);
1240 fjy2 = _mm_add_ps(fjy2,ty);
1241 fjz2 = _mm_add_ps(fjz2,tz);
1242
1243 }
1244
1245 /**************************
1246 * CALCULATE INTERACTIONS *
1247 **************************/
1248
1249 if (gmx_mm_any_lt(rsq20,rcutoff2))
1250 {
1251
1252 r20 = _mm_mul_ps(rsq20,rinv20);
1253 r20 = _mm_andnot_ps(dummy_mask,r20);
1254
1255 /* EWALD ELECTROSTATICS */
1256
1257 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1258 ewrt = _mm_mul_ps(r20,ewtabscale);
1259 ewitab = _mm_cvttps_epi32(ewrt);
1260 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1261 ewitab = _mm_slli_epi32(ewitab,2);
1262 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1263 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1264 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1265 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1266 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1267 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1268 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1269 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1270 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1271
1272 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1273
1274 /* Update potential sum for this i atom from the interaction with this j atom. */
1275 velec = _mm_and_ps(velec,cutoff_mask);
1276 velec = _mm_andnot_ps(dummy_mask,velec);
1277 velecsum = _mm_add_ps(velecsum,velec);
1278
1279 fscal = felec;
1280
1281 fscal = _mm_and_ps(fscal,cutoff_mask);
1282
1283 fscal = _mm_andnot_ps(dummy_mask,fscal);
1284
1285 /* Calculate temporary vectorial force */
1286 tx = _mm_mul_ps(fscal,dx20);
1287 ty = _mm_mul_ps(fscal,dy20);
1288 tz = _mm_mul_ps(fscal,dz20);
1289
1290 /* Update vectorial force */
1291 fix2 = _mm_add_ps(fix2,tx);
1292 fiy2 = _mm_add_ps(fiy2,ty);
1293 fiz2 = _mm_add_ps(fiz2,tz);
1294
1295 fjx0 = _mm_add_ps(fjx0,tx);
1296 fjy0 = _mm_add_ps(fjy0,ty);
1297 fjz0 = _mm_add_ps(fjz0,tz);
1298
1299 }
1300
1301 /**************************
1302 * CALCULATE INTERACTIONS *
1303 **************************/
1304
1305 if (gmx_mm_any_lt(rsq21,rcutoff2))
1306 {
1307
1308 r21 = _mm_mul_ps(rsq21,rinv21);
1309 r21 = _mm_andnot_ps(dummy_mask,r21);
1310
1311 /* EWALD ELECTROSTATICS */
1312
1313 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1314 ewrt = _mm_mul_ps(r21,ewtabscale);
1315 ewitab = _mm_cvttps_epi32(ewrt);
1316 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1317 ewitab = _mm_slli_epi32(ewitab,2);
1318 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1319 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1320 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1321 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1322 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1323 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1324 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1325 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1326 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1327
1328 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1329
1330 /* Update potential sum for this i atom from the interaction with this j atom. */
1331 velec = _mm_and_ps(velec,cutoff_mask);
1332 velec = _mm_andnot_ps(dummy_mask,velec);
1333 velecsum = _mm_add_ps(velecsum,velec);
1334
1335 fscal = felec;
1336
1337 fscal = _mm_and_ps(fscal,cutoff_mask);
1338
1339 fscal = _mm_andnot_ps(dummy_mask,fscal);
1340
1341 /* Calculate temporary vectorial force */
1342 tx = _mm_mul_ps(fscal,dx21);
1343 ty = _mm_mul_ps(fscal,dy21);
1344 tz = _mm_mul_ps(fscal,dz21);
1345
1346 /* Update vectorial force */
1347 fix2 = _mm_add_ps(fix2,tx);
1348 fiy2 = _mm_add_ps(fiy2,ty);
1349 fiz2 = _mm_add_ps(fiz2,tz);
1350
1351 fjx1 = _mm_add_ps(fjx1,tx);
1352 fjy1 = _mm_add_ps(fjy1,ty);
1353 fjz1 = _mm_add_ps(fjz1,tz);
1354
1355 }
1356
1357 /**************************
1358 * CALCULATE INTERACTIONS *
1359 **************************/
1360
1361 if (gmx_mm_any_lt(rsq22,rcutoff2))
1362 {
1363
1364 r22 = _mm_mul_ps(rsq22,rinv22);
1365 r22 = _mm_andnot_ps(dummy_mask,r22);
1366
1367 /* EWALD ELECTROSTATICS */
1368
1369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1370 ewrt = _mm_mul_ps(r22,ewtabscale);
1371 ewitab = _mm_cvttps_epi32(ewrt);
1372 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1373 ewitab = _mm_slli_epi32(ewitab,2);
1374 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1375 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1376 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1377 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1378 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1379 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1380 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1381 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1382 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1383
1384 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1385
1386 /* Update potential sum for this i atom from the interaction with this j atom. */
1387 velec = _mm_and_ps(velec,cutoff_mask);
1388 velec = _mm_andnot_ps(dummy_mask,velec);
1389 velecsum = _mm_add_ps(velecsum,velec);
1390
1391 fscal = felec;
1392
1393 fscal = _mm_and_ps(fscal,cutoff_mask);
1394
1395 fscal = _mm_andnot_ps(dummy_mask,fscal);
1396
1397 /* Calculate temporary vectorial force */
1398 tx = _mm_mul_ps(fscal,dx22);
1399 ty = _mm_mul_ps(fscal,dy22);
1400 tz = _mm_mul_ps(fscal,dz22);
1401
1402 /* Update vectorial force */
1403 fix2 = _mm_add_ps(fix2,tx);
1404 fiy2 = _mm_add_ps(fiy2,ty);
1405 fiz2 = _mm_add_ps(fiz2,tz);
1406
1407 fjx2 = _mm_add_ps(fjx2,tx);
1408 fjy2 = _mm_add_ps(fjy2,ty);
1409 fjz2 = _mm_add_ps(fjz2,tz);
1410
1411 }
1412
1413 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1414 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1415 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1416 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1417
1418 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1419 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1420
1421 /* Inner loop uses 441 flops */
1422 }
1423
1424 /* End of innermost loop */
1425
1426 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1427 f+i_coord_offset,fshift+i_shift_offset);
1428
1429 ggid = gid[iidx];
1430 /* Update potential energies */
1431 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1432 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1433
1434 /* Increment number of inner iterations */
1435 inneriter += j_index_end - j_index_start;
1436
1437 /* Outer loop uses 20 flops */
1438 }
1439
1440 /* Increment number of outer iterations */
1441 outeriter += nri;
1442
1443 /* Update outer/inner flops */
1444
1445 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_VF] += outeriter*20 +
inneriter*441
;
1446}
1447/*
1448 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1449 * Electrostatics interaction: Ewald
1450 * VdW interaction: LennardJones
1451 * Geometry: Water3-Water3
1452 * Calculate force/pot: Force
1453 */
1454void
1455nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1456 (t_nblist * gmx_restrict nlist,
1457 rvec * gmx_restrict xx,
1458 rvec * gmx_restrict ff,
1459 t_forcerec * gmx_restrict fr,
1460 t_mdatoms * gmx_restrict mdatoms,
1461 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1462 t_nrnb * gmx_restrict nrnb)
1463{
1464 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1465 * just 0 for non-waters.
1466 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1467 * jnr indices corresponding to data put in the four positions in the SIMD register.
1468 */
1469 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1470 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1471 int jnrA,jnrB,jnrC,jnrD;
1472 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1473 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1474 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1475 real rcutoff_scalar;
1476 real *shiftvec,*fshift,*x,*f;
1477 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1478 real scratch[4*DIM3];
1479 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1480 int vdwioffset0;
1481 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1482 int vdwioffset1;
1483 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1484 int vdwioffset2;
1485 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1486 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1487 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1488 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1489 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1490 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1491 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1492 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1493 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1494 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1495 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1496 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1497 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1498 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1499 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1500 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1501 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1502 real *charge;
1503 int nvdwtype;
1504 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1505 int *vdwtype;
1506 real *vdwparam;
1507 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1508 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1509 __m128i ewitab;
1510 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1511 real *ewtab;
1512 __m128 dummy_mask,cutoff_mask;
1513 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1514 __m128 one = _mm_set1_ps(1.0);
1515 __m128 two = _mm_set1_ps(2.0);
1516 x = xx[0];
1517 f = ff[0];
1518
1519 nri = nlist->nri;
1520 iinr = nlist->iinr;
1521 jindex = nlist->jindex;
1522 jjnr = nlist->jjnr;
1523 shiftidx = nlist->shift;
1524 gid = nlist->gid;
1525 shiftvec = fr->shift_vec[0];
1526 fshift = fr->fshift[0];
1527 facel = _mm_set1_ps(fr->epsfac);
1528 charge = mdatoms->chargeA;
1529 nvdwtype = fr->ntype;
1530 vdwparam = fr->nbfp;
1531 vdwtype = mdatoms->typeA;
1532
1533 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1534 ewtab = fr->ic->tabq_coul_F;
1535 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1536 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1537
1538 /* Setup water-specific parameters */
1539 inr = nlist->iinr[0];
1540 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1541 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1542 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1543 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1544
1545 jq0 = _mm_set1_ps(charge[inr+0]);
1546 jq1 = _mm_set1_ps(charge[inr+1]);
1547 jq2 = _mm_set1_ps(charge[inr+2]);
1548 vdwjidx0A = 2*vdwtype[inr+0];
1549 qq00 = _mm_mul_ps(iq0,jq0);
1550 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1551 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1552 qq01 = _mm_mul_ps(iq0,jq1);
1553 qq02 = _mm_mul_ps(iq0,jq2);
1554 qq10 = _mm_mul_ps(iq1,jq0);
1555 qq11 = _mm_mul_ps(iq1,jq1);
1556 qq12 = _mm_mul_ps(iq1,jq2);
1557 qq20 = _mm_mul_ps(iq2,jq0);
1558 qq21 = _mm_mul_ps(iq2,jq1);
1559 qq22 = _mm_mul_ps(iq2,jq2);
1560
1561 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1562 rcutoff_scalar = fr->rcoulomb;
1563 rcutoff = _mm_set1_ps(rcutoff_scalar);
1564 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1565
1566 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1567 rvdw = _mm_set1_ps(fr->rvdw);
1568
1569 /* Avoid stupid compiler warnings */
1570 jnrA = jnrB = jnrC = jnrD = 0;
Value stored to 'jnrA' is never read
1571 j_coord_offsetA = 0;
1572 j_coord_offsetB = 0;
1573 j_coord_offsetC = 0;
1574 j_coord_offsetD = 0;
1575
1576 outeriter = 0;
1577 inneriter = 0;
1578
1579 for(iidx=0;iidx<4*DIM3;iidx++)
1580 {
1581 scratch[iidx] = 0.0;
1582 }
1583
1584 /* Start outer loop over neighborlists */
1585 for(iidx=0; iidx<nri; iidx++)
1586 {
1587 /* Load shift vector for this list */
1588 i_shift_offset = DIM3*shiftidx[iidx];
1589
1590 /* Load limits for loop over neighbors */
1591 j_index_start = jindex[iidx];
1592 j_index_end = jindex[iidx+1];
1593
1594 /* Get outer coordinate index */
1595 inr = iinr[iidx];
1596 i_coord_offset = DIM3*inr;
1597
1598 /* Load i particle coords and add shift vector */
1599 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1600 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1601
1602 fix0 = _mm_setzero_ps();
1603 fiy0 = _mm_setzero_ps();
1604 fiz0 = _mm_setzero_ps();
1605 fix1 = _mm_setzero_ps();
1606 fiy1 = _mm_setzero_ps();
1607 fiz1 = _mm_setzero_ps();
1608 fix2 = _mm_setzero_ps();
1609 fiy2 = _mm_setzero_ps();
1610 fiz2 = _mm_setzero_ps();
1611
1612 /* Start inner kernel loop */
1613 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1614 {
1615
1616 /* Get j neighbor index, and coordinate index */
1617 jnrA = jjnr[jidx];
1618 jnrB = jjnr[jidx+1];
1619 jnrC = jjnr[jidx+2];
1620 jnrD = jjnr[jidx+3];
1621 j_coord_offsetA = DIM3*jnrA;
1622 j_coord_offsetB = DIM3*jnrB;
1623 j_coord_offsetC = DIM3*jnrC;
1624 j_coord_offsetD = DIM3*jnrD;
1625
1626 /* load j atom coordinates */
1627 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1628 x+j_coord_offsetC,x+j_coord_offsetD,
1629 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1630
1631 /* Calculate displacement vector */
1632 dx00 = _mm_sub_ps(ix0,jx0);
1633 dy00 = _mm_sub_ps(iy0,jy0);
1634 dz00 = _mm_sub_ps(iz0,jz0);
1635 dx01 = _mm_sub_ps(ix0,jx1);
1636 dy01 = _mm_sub_ps(iy0,jy1);
1637 dz01 = _mm_sub_ps(iz0,jz1);
1638 dx02 = _mm_sub_ps(ix0,jx2);
1639 dy02 = _mm_sub_ps(iy0,jy2);
1640 dz02 = _mm_sub_ps(iz0,jz2);
1641 dx10 = _mm_sub_ps(ix1,jx0);
1642 dy10 = _mm_sub_ps(iy1,jy0);
1643 dz10 = _mm_sub_ps(iz1,jz0);
1644 dx11 = _mm_sub_ps(ix1,jx1);
1645 dy11 = _mm_sub_ps(iy1,jy1);
1646 dz11 = _mm_sub_ps(iz1,jz1);
1647 dx12 = _mm_sub_ps(ix1,jx2);
1648 dy12 = _mm_sub_ps(iy1,jy2);
1649 dz12 = _mm_sub_ps(iz1,jz2);
1650 dx20 = _mm_sub_ps(ix2,jx0);
1651 dy20 = _mm_sub_ps(iy2,jy0);
1652 dz20 = _mm_sub_ps(iz2,jz0);
1653 dx21 = _mm_sub_ps(ix2,jx1);
1654 dy21 = _mm_sub_ps(iy2,jy1);
1655 dz21 = _mm_sub_ps(iz2,jz1);
1656 dx22 = _mm_sub_ps(ix2,jx2);
1657 dy22 = _mm_sub_ps(iy2,jy2);
1658 dz22 = _mm_sub_ps(iz2,jz2);
1659
1660 /* Calculate squared distance and things based on it */
1661 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1662 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1663 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1664 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1665 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1666 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1667 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1668 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1669 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1670
1671 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1672 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
1673 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
1674 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1675 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1676 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1677 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1678 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1679 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1680
1681 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1682 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1683 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1684 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1685 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1686 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1687 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1688 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1689 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1690
1691 fjx0 = _mm_setzero_ps();
1692 fjy0 = _mm_setzero_ps();
1693 fjz0 = _mm_setzero_ps();
1694 fjx1 = _mm_setzero_ps();
1695 fjy1 = _mm_setzero_ps();
1696 fjz1 = _mm_setzero_ps();
1697 fjx2 = _mm_setzero_ps();
1698 fjy2 = _mm_setzero_ps();
1699 fjz2 = _mm_setzero_ps();
1700
1701 /**************************
1702 * CALCULATE INTERACTIONS *
1703 **************************/
1704
1705 if (gmx_mm_any_lt(rsq00,rcutoff2))
1706 {
1707
1708 r00 = _mm_mul_ps(rsq00,rinv00);
1709
1710 /* EWALD ELECTROSTATICS */
1711
1712 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1713 ewrt = _mm_mul_ps(r00,ewtabscale);
1714 ewitab = _mm_cvttps_epi32(ewrt);
1715 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1716 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1717 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1718 &ewtabF,&ewtabFn);
1719 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1720 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1721
1722 /* LENNARD-JONES DISPERSION/REPULSION */
1723
1724 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1725 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1726
1727 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1728
1729 fscal = _mm_add_ps(felec,fvdw);
1730
1731 fscal = _mm_and_ps(fscal,cutoff_mask);
1732
1733 /* Calculate temporary vectorial force */
1734 tx = _mm_mul_ps(fscal,dx00);
1735 ty = _mm_mul_ps(fscal,dy00);
1736 tz = _mm_mul_ps(fscal,dz00);
1737
1738 /* Update vectorial force */
1739 fix0 = _mm_add_ps(fix0,tx);
1740 fiy0 = _mm_add_ps(fiy0,ty);
1741 fiz0 = _mm_add_ps(fiz0,tz);
1742
1743 fjx0 = _mm_add_ps(fjx0,tx);
1744 fjy0 = _mm_add_ps(fjy0,ty);
1745 fjz0 = _mm_add_ps(fjz0,tz);
1746
1747 }
1748
1749 /**************************
1750 * CALCULATE INTERACTIONS *
1751 **************************/
1752
1753 if (gmx_mm_any_lt(rsq01,rcutoff2))
1754 {
1755
1756 r01 = _mm_mul_ps(rsq01,rinv01);
1757
1758 /* EWALD ELECTROSTATICS */
1759
1760 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1761 ewrt = _mm_mul_ps(r01,ewtabscale);
1762 ewitab = _mm_cvttps_epi32(ewrt);
1763 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1764 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1765 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1766 &ewtabF,&ewtabFn);
1767 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1768 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1769
1770 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1771
1772 fscal = felec;
1773
1774 fscal = _mm_and_ps(fscal,cutoff_mask);
1775
1776 /* Calculate temporary vectorial force */
1777 tx = _mm_mul_ps(fscal,dx01);
1778 ty = _mm_mul_ps(fscal,dy01);
1779 tz = _mm_mul_ps(fscal,dz01);
1780
1781 /* Update vectorial force */
1782 fix0 = _mm_add_ps(fix0,tx);
1783 fiy0 = _mm_add_ps(fiy0,ty);
1784 fiz0 = _mm_add_ps(fiz0,tz);
1785
1786 fjx1 = _mm_add_ps(fjx1,tx);
1787 fjy1 = _mm_add_ps(fjy1,ty);
1788 fjz1 = _mm_add_ps(fjz1,tz);
1789
1790 }
1791
1792 /**************************
1793 * CALCULATE INTERACTIONS *
1794 **************************/
1795
1796 if (gmx_mm_any_lt(rsq02,rcutoff2))
1797 {
1798
1799 r02 = _mm_mul_ps(rsq02,rinv02);
1800
1801 /* EWALD ELECTROSTATICS */
1802
1803 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1804 ewrt = _mm_mul_ps(r02,ewtabscale);
1805 ewitab = _mm_cvttps_epi32(ewrt);
1806 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1807 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1808 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1809 &ewtabF,&ewtabFn);
1810 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1811 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1812
1813 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1814
1815 fscal = felec;
1816
1817 fscal = _mm_and_ps(fscal,cutoff_mask);
1818
1819 /* Calculate temporary vectorial force */
1820 tx = _mm_mul_ps(fscal,dx02);
1821 ty = _mm_mul_ps(fscal,dy02);
1822 tz = _mm_mul_ps(fscal,dz02);
1823
1824 /* Update vectorial force */
1825 fix0 = _mm_add_ps(fix0,tx);
1826 fiy0 = _mm_add_ps(fiy0,ty);
1827 fiz0 = _mm_add_ps(fiz0,tz);
1828
1829 fjx2 = _mm_add_ps(fjx2,tx);
1830 fjy2 = _mm_add_ps(fjy2,ty);
1831 fjz2 = _mm_add_ps(fjz2,tz);
1832
1833 }
1834
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1838
1839 if (gmx_mm_any_lt(rsq10,rcutoff2))
1840 {
1841
1842 r10 = _mm_mul_ps(rsq10,rinv10);
1843
1844 /* EWALD ELECTROSTATICS */
1845
1846 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1847 ewrt = _mm_mul_ps(r10,ewtabscale);
1848 ewitab = _mm_cvttps_epi32(ewrt);
1849 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1850 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1851 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1852 &ewtabF,&ewtabFn);
1853 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1854 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1855
1856 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1857
1858 fscal = felec;
1859
1860 fscal = _mm_and_ps(fscal,cutoff_mask);
1861
1862 /* Calculate temporary vectorial force */
1863 tx = _mm_mul_ps(fscal,dx10);
1864 ty = _mm_mul_ps(fscal,dy10);
1865 tz = _mm_mul_ps(fscal,dz10);
1866
1867 /* Update vectorial force */
1868 fix1 = _mm_add_ps(fix1,tx);
1869 fiy1 = _mm_add_ps(fiy1,ty);
1870 fiz1 = _mm_add_ps(fiz1,tz);
1871
1872 fjx0 = _mm_add_ps(fjx0,tx);
1873 fjy0 = _mm_add_ps(fjy0,ty);
1874 fjz0 = _mm_add_ps(fjz0,tz);
1875
1876 }
1877
1878 /**************************
1879 * CALCULATE INTERACTIONS *
1880 **************************/
1881
1882 if (gmx_mm_any_lt(rsq11,rcutoff2))
1883 {
1884
1885 r11 = _mm_mul_ps(rsq11,rinv11);
1886
1887 /* EWALD ELECTROSTATICS */
1888
1889 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1890 ewrt = _mm_mul_ps(r11,ewtabscale);
1891 ewitab = _mm_cvttps_epi32(ewrt);
1892 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1893 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1894 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1895 &ewtabF,&ewtabFn);
1896 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1897 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1898
1899 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1900
1901 fscal = felec;
1902
1903 fscal = _mm_and_ps(fscal,cutoff_mask);
1904
1905 /* Calculate temporary vectorial force */
1906 tx = _mm_mul_ps(fscal,dx11);
1907 ty = _mm_mul_ps(fscal,dy11);
1908 tz = _mm_mul_ps(fscal,dz11);
1909
1910 /* Update vectorial force */
1911 fix1 = _mm_add_ps(fix1,tx);
1912 fiy1 = _mm_add_ps(fiy1,ty);
1913 fiz1 = _mm_add_ps(fiz1,tz);
1914
1915 fjx1 = _mm_add_ps(fjx1,tx);
1916 fjy1 = _mm_add_ps(fjy1,ty);
1917 fjz1 = _mm_add_ps(fjz1,tz);
1918
1919 }
1920
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1924
1925 if (gmx_mm_any_lt(rsq12,rcutoff2))
1926 {
1927
1928 r12 = _mm_mul_ps(rsq12,rinv12);
1929
1930 /* EWALD ELECTROSTATICS */
1931
1932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1933 ewrt = _mm_mul_ps(r12,ewtabscale);
1934 ewitab = _mm_cvttps_epi32(ewrt);
1935 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1936 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1937 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1938 &ewtabF,&ewtabFn);
1939 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1940 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1941
1942 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1943
1944 fscal = felec;
1945
1946 fscal = _mm_and_ps(fscal,cutoff_mask);
1947
1948 /* Calculate temporary vectorial force */
1949 tx = _mm_mul_ps(fscal,dx12);
1950 ty = _mm_mul_ps(fscal,dy12);
1951 tz = _mm_mul_ps(fscal,dz12);
1952
1953 /* Update vectorial force */
1954 fix1 = _mm_add_ps(fix1,tx);
1955 fiy1 = _mm_add_ps(fiy1,ty);
1956 fiz1 = _mm_add_ps(fiz1,tz);
1957
1958 fjx2 = _mm_add_ps(fjx2,tx);
1959 fjy2 = _mm_add_ps(fjy2,ty);
1960 fjz2 = _mm_add_ps(fjz2,tz);
1961
1962 }
1963
1964 /**************************
1965 * CALCULATE INTERACTIONS *
1966 **************************/
1967
1968 if (gmx_mm_any_lt(rsq20,rcutoff2))
1969 {
1970
1971 r20 = _mm_mul_ps(rsq20,rinv20);
1972
1973 /* EWALD ELECTROSTATICS */
1974
1975 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1976 ewrt = _mm_mul_ps(r20,ewtabscale);
1977 ewitab = _mm_cvttps_epi32(ewrt);
1978 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1979 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
1980 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
1981 &ewtabF,&ewtabFn);
1982 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1983 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1984
1985 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1986
1987 fscal = felec;
1988
1989 fscal = _mm_and_ps(fscal,cutoff_mask);
1990
1991 /* Calculate temporary vectorial force */
1992 tx = _mm_mul_ps(fscal,dx20);
1993 ty = _mm_mul_ps(fscal,dy20);
1994 tz = _mm_mul_ps(fscal,dz20);
1995
1996 /* Update vectorial force */
1997 fix2 = _mm_add_ps(fix2,tx);
1998 fiy2 = _mm_add_ps(fiy2,ty);
1999 fiz2 = _mm_add_ps(fiz2,tz);
2000
2001 fjx0 = _mm_add_ps(fjx0,tx);
2002 fjy0 = _mm_add_ps(fjy0,ty);
2003 fjz0 = _mm_add_ps(fjz0,tz);
2004
2005 }
2006
2007 /**************************
2008 * CALCULATE INTERACTIONS *
2009 **************************/
2010
2011 if (gmx_mm_any_lt(rsq21,rcutoff2))
2012 {
2013
2014 r21 = _mm_mul_ps(rsq21,rinv21);
2015
2016 /* EWALD ELECTROSTATICS */
2017
2018 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2019 ewrt = _mm_mul_ps(r21,ewtabscale);
2020 ewitab = _mm_cvttps_epi32(ewrt);
2021 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2022 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2023 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2024 &ewtabF,&ewtabFn);
2025 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2026 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2027
2028 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2029
2030 fscal = felec;
2031
2032 fscal = _mm_and_ps(fscal,cutoff_mask);
2033
2034 /* Calculate temporary vectorial force */
2035 tx = _mm_mul_ps(fscal,dx21);
2036 ty = _mm_mul_ps(fscal,dy21);
2037 tz = _mm_mul_ps(fscal,dz21);
2038
2039 /* Update vectorial force */
2040 fix2 = _mm_add_ps(fix2,tx);
2041 fiy2 = _mm_add_ps(fiy2,ty);
2042 fiz2 = _mm_add_ps(fiz2,tz);
2043
2044 fjx1 = _mm_add_ps(fjx1,tx);
2045 fjy1 = _mm_add_ps(fjy1,ty);
2046 fjz1 = _mm_add_ps(fjz1,tz);
2047
2048 }
2049
2050 /**************************
2051 * CALCULATE INTERACTIONS *
2052 **************************/
2053
2054 if (gmx_mm_any_lt(rsq22,rcutoff2))
2055 {
2056
2057 r22 = _mm_mul_ps(rsq22,rinv22);
2058
2059 /* EWALD ELECTROSTATICS */
2060
2061 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2062 ewrt = _mm_mul_ps(r22,ewtabscale);
2063 ewitab = _mm_cvttps_epi32(ewrt);
2064 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2065 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2066 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2067 &ewtabF,&ewtabFn);
2068 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2069 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2070
2071 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2072
2073 fscal = felec;
2074
2075 fscal = _mm_and_ps(fscal,cutoff_mask);
2076
2077 /* Calculate temporary vectorial force */
2078 tx = _mm_mul_ps(fscal,dx22);
2079 ty = _mm_mul_ps(fscal,dy22);
2080 tz = _mm_mul_ps(fscal,dz22);
2081
2082 /* Update vectorial force */
2083 fix2 = _mm_add_ps(fix2,tx);
2084 fiy2 = _mm_add_ps(fiy2,ty);
2085 fiz2 = _mm_add_ps(fiz2,tz);
2086
2087 fjx2 = _mm_add_ps(fjx2,tx);
2088 fjy2 = _mm_add_ps(fjy2,ty);
2089 fjz2 = _mm_add_ps(fjz2,tz);
2090
2091 }
2092
2093 fjptrA = f+j_coord_offsetA;
2094 fjptrB = f+j_coord_offsetB;
2095 fjptrC = f+j_coord_offsetC;
2096 fjptrD = f+j_coord_offsetD;
2097
2098 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2099 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2100
2101 /* Inner loop uses 358 flops */
2102 }
2103
2104 if(jidx<j_index_end)
2105 {
2106
2107 /* Get j neighbor index, and coordinate index */
2108 jnrlistA = jjnr[jidx];
2109 jnrlistB = jjnr[jidx+1];
2110 jnrlistC = jjnr[jidx+2];
2111 jnrlistD = jjnr[jidx+3];
2112 /* Sign of each element will be negative for non-real atoms.
2113 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2114 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2115 */
2116 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2117 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2118 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2119 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2120 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2121 j_coord_offsetA = DIM3*jnrA;
2122 j_coord_offsetB = DIM3*jnrB;
2123 j_coord_offsetC = DIM3*jnrC;
2124 j_coord_offsetD = DIM3*jnrD;
2125
2126 /* load j atom coordinates */
2127 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2128 x+j_coord_offsetC,x+j_coord_offsetD,
2129 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2130
2131 /* Calculate displacement vector */
2132 dx00 = _mm_sub_ps(ix0,jx0);
2133 dy00 = _mm_sub_ps(iy0,jy0);
2134 dz00 = _mm_sub_ps(iz0,jz0);
2135 dx01 = _mm_sub_ps(ix0,jx1);
2136 dy01 = _mm_sub_ps(iy0,jy1);
2137 dz01 = _mm_sub_ps(iz0,jz1);
2138 dx02 = _mm_sub_ps(ix0,jx2);
2139 dy02 = _mm_sub_ps(iy0,jy2);
2140 dz02 = _mm_sub_ps(iz0,jz2);
2141 dx10 = _mm_sub_ps(ix1,jx0);
2142 dy10 = _mm_sub_ps(iy1,jy0);
2143 dz10 = _mm_sub_ps(iz1,jz0);
2144 dx11 = _mm_sub_ps(ix1,jx1);
2145 dy11 = _mm_sub_ps(iy1,jy1);
2146 dz11 = _mm_sub_ps(iz1,jz1);
2147 dx12 = _mm_sub_ps(ix1,jx2);
2148 dy12 = _mm_sub_ps(iy1,jy2);
2149 dz12 = _mm_sub_ps(iz1,jz2);
2150 dx20 = _mm_sub_ps(ix2,jx0);
2151 dy20 = _mm_sub_ps(iy2,jy0);
2152 dz20 = _mm_sub_ps(iz2,jz0);
2153 dx21 = _mm_sub_ps(ix2,jx1);
2154 dy21 = _mm_sub_ps(iy2,jy1);
2155 dz21 = _mm_sub_ps(iz2,jz1);
2156 dx22 = _mm_sub_ps(ix2,jx2);
2157 dy22 = _mm_sub_ps(iy2,jy2);
2158 dz22 = _mm_sub_ps(iz2,jz2);
2159
2160 /* Calculate squared distance and things based on it */
2161 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2162 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2163 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2164 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2165 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2166 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2167 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2168 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2169 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2170
2171 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
2172 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
2173 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
2174 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
2175 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
2176 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
2177 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
2178 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
2179 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
2180
2181 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2182 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2183 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2184 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2185 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2186 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2187 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2188 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2189 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2190
2191 fjx0 = _mm_setzero_ps();
2192 fjy0 = _mm_setzero_ps();
2193 fjz0 = _mm_setzero_ps();
2194 fjx1 = _mm_setzero_ps();
2195 fjy1 = _mm_setzero_ps();
2196 fjz1 = _mm_setzero_ps();
2197 fjx2 = _mm_setzero_ps();
2198 fjy2 = _mm_setzero_ps();
2199 fjz2 = _mm_setzero_ps();
2200
2201 /**************************
2202 * CALCULATE INTERACTIONS *
2203 **************************/
2204
2205 if (gmx_mm_any_lt(rsq00,rcutoff2))
2206 {
2207
2208 r00 = _mm_mul_ps(rsq00,rinv00);
2209 r00 = _mm_andnot_ps(dummy_mask,r00);
2210
2211 /* EWALD ELECTROSTATICS */
2212
2213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2214 ewrt = _mm_mul_ps(r00,ewtabscale);
2215 ewitab = _mm_cvttps_epi32(ewrt);
2216 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2217 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2218 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2219 &ewtabF,&ewtabFn);
2220 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2221 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2222
2223 /* LENNARD-JONES DISPERSION/REPULSION */
2224
2225 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2226 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2227
2228 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2229
2230 fscal = _mm_add_ps(felec,fvdw);
2231
2232 fscal = _mm_and_ps(fscal,cutoff_mask);
2233
2234 fscal = _mm_andnot_ps(dummy_mask,fscal);
2235
2236 /* Calculate temporary vectorial force */
2237 tx = _mm_mul_ps(fscal,dx00);
2238 ty = _mm_mul_ps(fscal,dy00);
2239 tz = _mm_mul_ps(fscal,dz00);
2240
2241 /* Update vectorial force */
2242 fix0 = _mm_add_ps(fix0,tx);
2243 fiy0 = _mm_add_ps(fiy0,ty);
2244 fiz0 = _mm_add_ps(fiz0,tz);
2245
2246 fjx0 = _mm_add_ps(fjx0,tx);
2247 fjy0 = _mm_add_ps(fjy0,ty);
2248 fjz0 = _mm_add_ps(fjz0,tz);
2249
2250 }
2251
2252 /**************************
2253 * CALCULATE INTERACTIONS *
2254 **************************/
2255
2256 if (gmx_mm_any_lt(rsq01,rcutoff2))
2257 {
2258
2259 r01 = _mm_mul_ps(rsq01,rinv01);
2260 r01 = _mm_andnot_ps(dummy_mask,r01);
2261
2262 /* EWALD ELECTROSTATICS */
2263
2264 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2265 ewrt = _mm_mul_ps(r01,ewtabscale);
2266 ewitab = _mm_cvttps_epi32(ewrt);
2267 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2268 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2269 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2270 &ewtabF,&ewtabFn);
2271 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2272 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2273
2274 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2275
2276 fscal = felec;
2277
2278 fscal = _mm_and_ps(fscal,cutoff_mask);
2279
2280 fscal = _mm_andnot_ps(dummy_mask,fscal);
2281
2282 /* Calculate temporary vectorial force */
2283 tx = _mm_mul_ps(fscal,dx01);
2284 ty = _mm_mul_ps(fscal,dy01);
2285 tz = _mm_mul_ps(fscal,dz01);
2286
2287 /* Update vectorial force */
2288 fix0 = _mm_add_ps(fix0,tx);
2289 fiy0 = _mm_add_ps(fiy0,ty);
2290 fiz0 = _mm_add_ps(fiz0,tz);
2291
2292 fjx1 = _mm_add_ps(fjx1,tx);
2293 fjy1 = _mm_add_ps(fjy1,ty);
2294 fjz1 = _mm_add_ps(fjz1,tz);
2295
2296 }
2297
2298 /**************************
2299 * CALCULATE INTERACTIONS *
2300 **************************/
2301
2302 if (gmx_mm_any_lt(rsq02,rcutoff2))
2303 {
2304
2305 r02 = _mm_mul_ps(rsq02,rinv02);
2306 r02 = _mm_andnot_ps(dummy_mask,r02);
2307
2308 /* EWALD ELECTROSTATICS */
2309
2310 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2311 ewrt = _mm_mul_ps(r02,ewtabscale);
2312 ewitab = _mm_cvttps_epi32(ewrt);
2313 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2314 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2315 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2316 &ewtabF,&ewtabFn);
2317 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2318 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2319
2320 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2321
2322 fscal = felec;
2323
2324 fscal = _mm_and_ps(fscal,cutoff_mask);
2325
2326 fscal = _mm_andnot_ps(dummy_mask,fscal);
2327
2328 /* Calculate temporary vectorial force */
2329 tx = _mm_mul_ps(fscal,dx02);
2330 ty = _mm_mul_ps(fscal,dy02);
2331 tz = _mm_mul_ps(fscal,dz02);
2332
2333 /* Update vectorial force */
2334 fix0 = _mm_add_ps(fix0,tx);
2335 fiy0 = _mm_add_ps(fiy0,ty);
2336 fiz0 = _mm_add_ps(fiz0,tz);
2337
2338 fjx2 = _mm_add_ps(fjx2,tx);
2339 fjy2 = _mm_add_ps(fjy2,ty);
2340 fjz2 = _mm_add_ps(fjz2,tz);
2341
2342 }
2343
2344 /**************************
2345 * CALCULATE INTERACTIONS *
2346 **************************/
2347
2348 if (gmx_mm_any_lt(rsq10,rcutoff2))
2349 {
2350
2351 r10 = _mm_mul_ps(rsq10,rinv10);
2352 r10 = _mm_andnot_ps(dummy_mask,r10);
2353
2354 /* EWALD ELECTROSTATICS */
2355
2356 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2357 ewrt = _mm_mul_ps(r10,ewtabscale);
2358 ewitab = _mm_cvttps_epi32(ewrt);
2359 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2360 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2361 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2362 &ewtabF,&ewtabFn);
2363 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2364 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2365
2366 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2367
2368 fscal = felec;
2369
2370 fscal = _mm_and_ps(fscal,cutoff_mask);
2371
2372 fscal = _mm_andnot_ps(dummy_mask,fscal);
2373
2374 /* Calculate temporary vectorial force */
2375 tx = _mm_mul_ps(fscal,dx10);
2376 ty = _mm_mul_ps(fscal,dy10);
2377 tz = _mm_mul_ps(fscal,dz10);
2378
2379 /* Update vectorial force */
2380 fix1 = _mm_add_ps(fix1,tx);
2381 fiy1 = _mm_add_ps(fiy1,ty);
2382 fiz1 = _mm_add_ps(fiz1,tz);
2383
2384 fjx0 = _mm_add_ps(fjx0,tx);
2385 fjy0 = _mm_add_ps(fjy0,ty);
2386 fjz0 = _mm_add_ps(fjz0,tz);
2387
2388 }
2389
2390 /**************************
2391 * CALCULATE INTERACTIONS *
2392 **************************/
2393
2394 if (gmx_mm_any_lt(rsq11,rcutoff2))
2395 {
2396
2397 r11 = _mm_mul_ps(rsq11,rinv11);
2398 r11 = _mm_andnot_ps(dummy_mask,r11);
2399
2400 /* EWALD ELECTROSTATICS */
2401
2402 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2403 ewrt = _mm_mul_ps(r11,ewtabscale);
2404 ewitab = _mm_cvttps_epi32(ewrt);
2405 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2406 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2407 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2408 &ewtabF,&ewtabFn);
2409 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2410 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2411
2412 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2413
2414 fscal = felec;
2415
2416 fscal = _mm_and_ps(fscal,cutoff_mask);
2417
2418 fscal = _mm_andnot_ps(dummy_mask,fscal);
2419
2420 /* Calculate temporary vectorial force */
2421 tx = _mm_mul_ps(fscal,dx11);
2422 ty = _mm_mul_ps(fscal,dy11);
2423 tz = _mm_mul_ps(fscal,dz11);
2424
2425 /* Update vectorial force */
2426 fix1 = _mm_add_ps(fix1,tx);
2427 fiy1 = _mm_add_ps(fiy1,ty);
2428 fiz1 = _mm_add_ps(fiz1,tz);
2429
2430 fjx1 = _mm_add_ps(fjx1,tx);
2431 fjy1 = _mm_add_ps(fjy1,ty);
2432 fjz1 = _mm_add_ps(fjz1,tz);
2433
2434 }
2435
2436 /**************************
2437 * CALCULATE INTERACTIONS *
2438 **************************/
2439
2440 if (gmx_mm_any_lt(rsq12,rcutoff2))
2441 {
2442
2443 r12 = _mm_mul_ps(rsq12,rinv12);
2444 r12 = _mm_andnot_ps(dummy_mask,r12);
2445
2446 /* EWALD ELECTROSTATICS */
2447
2448 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2449 ewrt = _mm_mul_ps(r12,ewtabscale);
2450 ewitab = _mm_cvttps_epi32(ewrt);
2451 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2452 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2453 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2454 &ewtabF,&ewtabFn);
2455 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2456 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2457
2458 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2459
2460 fscal = felec;
2461
2462 fscal = _mm_and_ps(fscal,cutoff_mask);
2463
2464 fscal = _mm_andnot_ps(dummy_mask,fscal);
2465
2466 /* Calculate temporary vectorial force */
2467 tx = _mm_mul_ps(fscal,dx12);
2468 ty = _mm_mul_ps(fscal,dy12);
2469 tz = _mm_mul_ps(fscal,dz12);
2470
2471 /* Update vectorial force */
2472 fix1 = _mm_add_ps(fix1,tx);
2473 fiy1 = _mm_add_ps(fiy1,ty);
2474 fiz1 = _mm_add_ps(fiz1,tz);
2475
2476 fjx2 = _mm_add_ps(fjx2,tx);
2477 fjy2 = _mm_add_ps(fjy2,ty);
2478 fjz2 = _mm_add_ps(fjz2,tz);
2479
2480 }
2481
2482 /**************************
2483 * CALCULATE INTERACTIONS *
2484 **************************/
2485
2486 if (gmx_mm_any_lt(rsq20,rcutoff2))
2487 {
2488
2489 r20 = _mm_mul_ps(rsq20,rinv20);
2490 r20 = _mm_andnot_ps(dummy_mask,r20);
2491
2492 /* EWALD ELECTROSTATICS */
2493
2494 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2495 ewrt = _mm_mul_ps(r20,ewtabscale);
2496 ewitab = _mm_cvttps_epi32(ewrt);
2497 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2498 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2499 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2500 &ewtabF,&ewtabFn);
2501 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2502 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2503
2504 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2505
2506 fscal = felec;
2507
2508 fscal = _mm_and_ps(fscal,cutoff_mask);
2509
2510 fscal = _mm_andnot_ps(dummy_mask,fscal);
2511
2512 /* Calculate temporary vectorial force */
2513 tx = _mm_mul_ps(fscal,dx20);
2514 ty = _mm_mul_ps(fscal,dy20);
2515 tz = _mm_mul_ps(fscal,dz20);
2516
2517 /* Update vectorial force */
2518 fix2 = _mm_add_ps(fix2,tx);
2519 fiy2 = _mm_add_ps(fiy2,ty);
2520 fiz2 = _mm_add_ps(fiz2,tz);
2521
2522 fjx0 = _mm_add_ps(fjx0,tx);
2523 fjy0 = _mm_add_ps(fjy0,ty);
2524 fjz0 = _mm_add_ps(fjz0,tz);
2525
2526 }
2527
2528 /**************************
2529 * CALCULATE INTERACTIONS *
2530 **************************/
2531
2532 if (gmx_mm_any_lt(rsq21,rcutoff2))
2533 {
2534
2535 r21 = _mm_mul_ps(rsq21,rinv21);
2536 r21 = _mm_andnot_ps(dummy_mask,r21);
2537
2538 /* EWALD ELECTROSTATICS */
2539
2540 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2541 ewrt = _mm_mul_ps(r21,ewtabscale);
2542 ewitab = _mm_cvttps_epi32(ewrt);
2543 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2544 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2545 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2546 &ewtabF,&ewtabFn);
2547 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2548 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2549
2550 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2551
2552 fscal = felec;
2553
2554 fscal = _mm_and_ps(fscal,cutoff_mask);
2555
2556 fscal = _mm_andnot_ps(dummy_mask,fscal);
2557
2558 /* Calculate temporary vectorial force */
2559 tx = _mm_mul_ps(fscal,dx21);
2560 ty = _mm_mul_ps(fscal,dy21);
2561 tz = _mm_mul_ps(fscal,dz21);
2562
2563 /* Update vectorial force */
2564 fix2 = _mm_add_ps(fix2,tx);
2565 fiy2 = _mm_add_ps(fiy2,ty);
2566 fiz2 = _mm_add_ps(fiz2,tz);
2567
2568 fjx1 = _mm_add_ps(fjx1,tx);
2569 fjy1 = _mm_add_ps(fjy1,ty);
2570 fjz1 = _mm_add_ps(fjz1,tz);
2571
2572 }
2573
2574 /**************************
2575 * CALCULATE INTERACTIONS *
2576 **************************/
2577
2578 if (gmx_mm_any_lt(rsq22,rcutoff2))
2579 {
2580
2581 r22 = _mm_mul_ps(rsq22,rinv22);
2582 r22 = _mm_andnot_ps(dummy_mask,r22);
2583
2584 /* EWALD ELECTROSTATICS */
2585
2586 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2587 ewrt = _mm_mul_ps(r22,ewtabscale);
2588 ewitab = _mm_cvttps_epi32(ewrt);
2589 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2590 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
,
2591 ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
,ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
,
2592 &ewtabF,&ewtabFn);
2593 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2594 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2595
2596 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2597
2598 fscal = felec;
2599
2600 fscal = _mm_and_ps(fscal,cutoff_mask);
2601
2602 fscal = _mm_andnot_ps(dummy_mask,fscal);
2603
2604 /* Calculate temporary vectorial force */
2605 tx = _mm_mul_ps(fscal,dx22);
2606 ty = _mm_mul_ps(fscal,dy22);
2607 tz = _mm_mul_ps(fscal,dz22);
2608
2609 /* Update vectorial force */
2610 fix2 = _mm_add_ps(fix2,tx);
2611 fiy2 = _mm_add_ps(fiy2,ty);
2612 fiz2 = _mm_add_ps(fiz2,tz);
2613
2614 fjx2 = _mm_add_ps(fjx2,tx);
2615 fjy2 = _mm_add_ps(fjy2,ty);
2616 fjz2 = _mm_add_ps(fjz2,tz);
2617
2618 }
2619
2620 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2621 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2622 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2623 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2624
2625 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2626 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2627
2628 /* Inner loop uses 367 flops */
2629 }
2630
2631 /* End of innermost loop */
2632
2633 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2634 f+i_coord_offset,fshift+i_shift_offset);
2635
2636 /* Increment number of inner iterations */
2637 inneriter += j_index_end - j_index_start;
2638
2639 /* Outer loop uses 18 flops */
2640 }
2641
2642 /* Increment number of outer iterations */
2643 outeriter += nri;
2644
2645 /* Update outer/inner flops */
2646
2647 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_F] += outeriter*18 + inneriter
*367
;
2648}