Bug Summary

File:gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse4_1_single.c
Location:line 164, column 5
Description:Value stored to 'd' is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: None
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset1;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 int vdwioffset2;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwioffset3;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
96 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 real *charge;
108 __m128i ewitab;
109 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
110 real *ewtab;
111 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
112 real rswitch_scalar,d_scalar;
113 __m128 dummy_mask,cutoff_mask;
114 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
115 __m128 one = _mm_set1_ps(1.0);
116 __m128 two = _mm_set1_ps(2.0);
117 x = xx[0];
118 f = ff[0];
119
120 nri = nlist->nri;
121 iinr = nlist->iinr;
122 jindex = nlist->jindex;
123 jjnr = nlist->jjnr;
124 shiftidx = nlist->shift;
125 gid = nlist->gid;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm_set1_ps(fr->epsfac);
129 charge = mdatoms->chargeA;
130
131 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
132 ewtab = fr->ic->tabq_coul_FDV0;
133 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
134 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
135
136 /* Setup water-specific parameters */
137 inr = nlist->iinr[0];
138 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
139 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
140 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
141
142 jq1 = _mm_set1_ps(charge[inr+1]);
143 jq2 = _mm_set1_ps(charge[inr+2]);
144 jq3 = _mm_set1_ps(charge[inr+3]);
145 qq11 = _mm_mul_ps(iq1,jq1);
146 qq12 = _mm_mul_ps(iq1,jq2);
147 qq13 = _mm_mul_ps(iq1,jq3);
148 qq21 = _mm_mul_ps(iq2,jq1);
149 qq22 = _mm_mul_ps(iq2,jq2);
150 qq23 = _mm_mul_ps(iq2,jq3);
151 qq31 = _mm_mul_ps(iq3,jq1);
152 qq32 = _mm_mul_ps(iq3,jq2);
153 qq33 = _mm_mul_ps(iq3,jq3);
154
155 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
156 rcutoff_scalar = fr->rcoulomb;
157 rcutoff = _mm_set1_ps(rcutoff_scalar);
158 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
159
160 rswitch_scalar = fr->rcoulomb_switch;
161 rswitch = _mm_set1_ps(rswitch_scalar);
162 /* Setup switch parameters */
163 d_scalar = rcutoff_scalar-rswitch_scalar;
164 d = _mm_set1_ps(d_scalar);
Value stored to 'd' is never read
165 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
166 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
167 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
168 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
169 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
170 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
171
172 /* Avoid stupid compiler warnings */
173 jnrA = jnrB = jnrC = jnrD = 0;
174 j_coord_offsetA = 0;
175 j_coord_offsetB = 0;
176 j_coord_offsetC = 0;
177 j_coord_offsetD = 0;
178
179 outeriter = 0;
180 inneriter = 0;
181
182 for(iidx=0;iidx<4*DIM3;iidx++)
183 {
184 scratch[iidx] = 0.0;
185 }
186
187 /* Start outer loop over neighborlists */
188 for(iidx=0; iidx<nri; iidx++)
189 {
190 /* Load shift vector for this list */
191 i_shift_offset = DIM3*shiftidx[iidx];
192
193 /* Load limits for loop over neighbors */
194 j_index_start = jindex[iidx];
195 j_index_end = jindex[iidx+1];
196
197 /* Get outer coordinate index */
198 inr = iinr[iidx];
199 i_coord_offset = DIM3*inr;
200
201 /* Load i particle coords and add shift vector */
202 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM3,
203 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
204
205 fix1 = _mm_setzero_ps();
206 fiy1 = _mm_setzero_ps();
207 fiz1 = _mm_setzero_ps();
208 fix2 = _mm_setzero_ps();
209 fiy2 = _mm_setzero_ps();
210 fiz2 = _mm_setzero_ps();
211 fix3 = _mm_setzero_ps();
212 fiy3 = _mm_setzero_ps();
213 fiz3 = _mm_setzero_ps();
214
215 /* Reset potential sums */
216 velecsum = _mm_setzero_ps();
217
218 /* Start inner kernel loop */
219 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
220 {
221
222 /* Get j neighbor index, and coordinate index */
223 jnrA = jjnr[jidx];
224 jnrB = jjnr[jidx+1];
225 jnrC = jjnr[jidx+2];
226 jnrD = jjnr[jidx+3];
227 j_coord_offsetA = DIM3*jnrA;
228 j_coord_offsetB = DIM3*jnrB;
229 j_coord_offsetC = DIM3*jnrC;
230 j_coord_offsetD = DIM3*jnrD;
231
232 /* load j atom coordinates */
233 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM3,x+j_coord_offsetB+DIM3,
234 x+j_coord_offsetC+DIM3,x+j_coord_offsetD+DIM3,
235 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
236
237 /* Calculate displacement vector */
238 dx11 = _mm_sub_ps(ix1,jx1);
239 dy11 = _mm_sub_ps(iy1,jy1);
240 dz11 = _mm_sub_ps(iz1,jz1);
241 dx12 = _mm_sub_ps(ix1,jx2);
242 dy12 = _mm_sub_ps(iy1,jy2);
243 dz12 = _mm_sub_ps(iz1,jz2);
244 dx13 = _mm_sub_ps(ix1,jx3);
245 dy13 = _mm_sub_ps(iy1,jy3);
246 dz13 = _mm_sub_ps(iz1,jz3);
247 dx21 = _mm_sub_ps(ix2,jx1);
248 dy21 = _mm_sub_ps(iy2,jy1);
249 dz21 = _mm_sub_ps(iz2,jz1);
250 dx22 = _mm_sub_ps(ix2,jx2);
251 dy22 = _mm_sub_ps(iy2,jy2);
252 dz22 = _mm_sub_ps(iz2,jz2);
253 dx23 = _mm_sub_ps(ix2,jx3);
254 dy23 = _mm_sub_ps(iy2,jy3);
255 dz23 = _mm_sub_ps(iz2,jz3);
256 dx31 = _mm_sub_ps(ix3,jx1);
257 dy31 = _mm_sub_ps(iy3,jy1);
258 dz31 = _mm_sub_ps(iz3,jz1);
259 dx32 = _mm_sub_ps(ix3,jx2);
260 dy32 = _mm_sub_ps(iy3,jy2);
261 dz32 = _mm_sub_ps(iz3,jz2);
262 dx33 = _mm_sub_ps(ix3,jx3);
263 dy33 = _mm_sub_ps(iy3,jy3);
264 dz33 = _mm_sub_ps(iz3,jz3);
265
266 /* Calculate squared distance and things based on it */
267 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
270 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
272 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
273 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
274 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
275 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
276
277 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
278 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
279 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
280 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
281 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
282 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
283 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
284 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
285 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
286
287 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
288 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
289 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
290 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
291 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
292 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
293 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
294 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
295 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
296
297 fjx1 = _mm_setzero_ps();
298 fjy1 = _mm_setzero_ps();
299 fjz1 = _mm_setzero_ps();
300 fjx2 = _mm_setzero_ps();
301 fjy2 = _mm_setzero_ps();
302 fjz2 = _mm_setzero_ps();
303 fjx3 = _mm_setzero_ps();
304 fjy3 = _mm_setzero_ps();
305 fjz3 = _mm_setzero_ps();
306
307 /**************************
308 * CALCULATE INTERACTIONS *
309 **************************/
310
311 if (gmx_mm_any_lt(rsq11,rcutoff2))
312 {
313
314 r11 = _mm_mul_ps(rsq11,rinv11);
315
316 /* EWALD ELECTROSTATICS */
317
318 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
319 ewrt = _mm_mul_ps(r11,ewtabscale);
320 ewitab = _mm_cvttps_epi32(ewrt);
321 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
322 ewitab = _mm_slli_epi32(ewitab,2);
323 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
324 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
325 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
326 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
327 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
328 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
329 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
330 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
331 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
332
333 d = _mm_sub_ps(r11,rswitch);
334 d = _mm_max_ps(d,_mm_setzero_ps());
335 d2 = _mm_mul_ps(d,d);
336 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
337
338 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
339
340 /* Evaluate switch function */
341 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
342 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
343 velec = _mm_mul_ps(velec,sw);
344 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
345
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 velec = _mm_and_ps(velec,cutoff_mask);
348 velecsum = _mm_add_ps(velecsum,velec);
349
350 fscal = felec;
351
352 fscal = _mm_and_ps(fscal,cutoff_mask);
353
354 /* Calculate temporary vectorial force */
355 tx = _mm_mul_ps(fscal,dx11);
356 ty = _mm_mul_ps(fscal,dy11);
357 tz = _mm_mul_ps(fscal,dz11);
358
359 /* Update vectorial force */
360 fix1 = _mm_add_ps(fix1,tx);
361 fiy1 = _mm_add_ps(fiy1,ty);
362 fiz1 = _mm_add_ps(fiz1,tz);
363
364 fjx1 = _mm_add_ps(fjx1,tx);
365 fjy1 = _mm_add_ps(fjy1,ty);
366 fjz1 = _mm_add_ps(fjz1,tz);
367
368 }
369
370 /**************************
371 * CALCULATE INTERACTIONS *
372 **************************/
373
374 if (gmx_mm_any_lt(rsq12,rcutoff2))
375 {
376
377 r12 = _mm_mul_ps(rsq12,rinv12);
378
379 /* EWALD ELECTROSTATICS */
380
381 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
382 ewrt = _mm_mul_ps(r12,ewtabscale);
383 ewitab = _mm_cvttps_epi32(ewrt);
384 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
385 ewitab = _mm_slli_epi32(ewitab,2);
386 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
387 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
388 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
389 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
390 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
391 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
392 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
393 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
394 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
395
396 d = _mm_sub_ps(r12,rswitch);
397 d = _mm_max_ps(d,_mm_setzero_ps());
398 d2 = _mm_mul_ps(d,d);
399 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
400
401 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
402
403 /* Evaluate switch function */
404 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
405 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
406 velec = _mm_mul_ps(velec,sw);
407 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
408
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velec = _mm_and_ps(velec,cutoff_mask);
411 velecsum = _mm_add_ps(velecsum,velec);
412
413 fscal = felec;
414
415 fscal = _mm_and_ps(fscal,cutoff_mask);
416
417 /* Calculate temporary vectorial force */
418 tx = _mm_mul_ps(fscal,dx12);
419 ty = _mm_mul_ps(fscal,dy12);
420 tz = _mm_mul_ps(fscal,dz12);
421
422 /* Update vectorial force */
423 fix1 = _mm_add_ps(fix1,tx);
424 fiy1 = _mm_add_ps(fiy1,ty);
425 fiz1 = _mm_add_ps(fiz1,tz);
426
427 fjx2 = _mm_add_ps(fjx2,tx);
428 fjy2 = _mm_add_ps(fjy2,ty);
429 fjz2 = _mm_add_ps(fjz2,tz);
430
431 }
432
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
436
437 if (gmx_mm_any_lt(rsq13,rcutoff2))
438 {
439
440 r13 = _mm_mul_ps(rsq13,rinv13);
441
442 /* EWALD ELECTROSTATICS */
443
444 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
445 ewrt = _mm_mul_ps(r13,ewtabscale);
446 ewitab = _mm_cvttps_epi32(ewrt);
447 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
448 ewitab = _mm_slli_epi32(ewitab,2);
449 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
450 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
451 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
452 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
453 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
454 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
455 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
456 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
457 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
458
459 d = _mm_sub_ps(r13,rswitch);
460 d = _mm_max_ps(d,_mm_setzero_ps());
461 d2 = _mm_mul_ps(d,d);
462 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
463
464 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
465
466 /* Evaluate switch function */
467 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
468 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
469 velec = _mm_mul_ps(velec,sw);
470 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
471
472 /* Update potential sum for this i atom from the interaction with this j atom. */
473 velec = _mm_and_ps(velec,cutoff_mask);
474 velecsum = _mm_add_ps(velecsum,velec);
475
476 fscal = felec;
477
478 fscal = _mm_and_ps(fscal,cutoff_mask);
479
480 /* Calculate temporary vectorial force */
481 tx = _mm_mul_ps(fscal,dx13);
482 ty = _mm_mul_ps(fscal,dy13);
483 tz = _mm_mul_ps(fscal,dz13);
484
485 /* Update vectorial force */
486 fix1 = _mm_add_ps(fix1,tx);
487 fiy1 = _mm_add_ps(fiy1,ty);
488 fiz1 = _mm_add_ps(fiz1,tz);
489
490 fjx3 = _mm_add_ps(fjx3,tx);
491 fjy3 = _mm_add_ps(fjy3,ty);
492 fjz3 = _mm_add_ps(fjz3,tz);
493
494 }
495
496 /**************************
497 * CALCULATE INTERACTIONS *
498 **************************/
499
500 if (gmx_mm_any_lt(rsq21,rcutoff2))
501 {
502
503 r21 = _mm_mul_ps(rsq21,rinv21);
504
505 /* EWALD ELECTROSTATICS */
506
507 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
508 ewrt = _mm_mul_ps(r21,ewtabscale);
509 ewitab = _mm_cvttps_epi32(ewrt);
510 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
511 ewitab = _mm_slli_epi32(ewitab,2);
512 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
513 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
514 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
515 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
516 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
517 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
518 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
519 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
520 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
521
522 d = _mm_sub_ps(r21,rswitch);
523 d = _mm_max_ps(d,_mm_setzero_ps());
524 d2 = _mm_mul_ps(d,d);
525 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
526
527 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
528
529 /* Evaluate switch function */
530 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
531 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
532 velec = _mm_mul_ps(velec,sw);
533 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
534
535 /* Update potential sum for this i atom from the interaction with this j atom. */
536 velec = _mm_and_ps(velec,cutoff_mask);
537 velecsum = _mm_add_ps(velecsum,velec);
538
539 fscal = felec;
540
541 fscal = _mm_and_ps(fscal,cutoff_mask);
542
543 /* Calculate temporary vectorial force */
544 tx = _mm_mul_ps(fscal,dx21);
545 ty = _mm_mul_ps(fscal,dy21);
546 tz = _mm_mul_ps(fscal,dz21);
547
548 /* Update vectorial force */
549 fix2 = _mm_add_ps(fix2,tx);
550 fiy2 = _mm_add_ps(fiy2,ty);
551 fiz2 = _mm_add_ps(fiz2,tz);
552
553 fjx1 = _mm_add_ps(fjx1,tx);
554 fjy1 = _mm_add_ps(fjy1,ty);
555 fjz1 = _mm_add_ps(fjz1,tz);
556
557 }
558
559 /**************************
560 * CALCULATE INTERACTIONS *
561 **************************/
562
563 if (gmx_mm_any_lt(rsq22,rcutoff2))
564 {
565
566 r22 = _mm_mul_ps(rsq22,rinv22);
567
568 /* EWALD ELECTROSTATICS */
569
570 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
571 ewrt = _mm_mul_ps(r22,ewtabscale);
572 ewitab = _mm_cvttps_epi32(ewrt);
573 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
574 ewitab = _mm_slli_epi32(ewitab,2);
575 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
576 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
577 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
578 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
579 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
580 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
581 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
582 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
583 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
584
585 d = _mm_sub_ps(r22,rswitch);
586 d = _mm_max_ps(d,_mm_setzero_ps());
587 d2 = _mm_mul_ps(d,d);
588 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
589
590 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
591
592 /* Evaluate switch function */
593 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
594 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
595 velec = _mm_mul_ps(velec,sw);
596 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
597
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velec = _mm_and_ps(velec,cutoff_mask);
600 velecsum = _mm_add_ps(velecsum,velec);
601
602 fscal = felec;
603
604 fscal = _mm_and_ps(fscal,cutoff_mask);
605
606 /* Calculate temporary vectorial force */
607 tx = _mm_mul_ps(fscal,dx22);
608 ty = _mm_mul_ps(fscal,dy22);
609 tz = _mm_mul_ps(fscal,dz22);
610
611 /* Update vectorial force */
612 fix2 = _mm_add_ps(fix2,tx);
613 fiy2 = _mm_add_ps(fiy2,ty);
614 fiz2 = _mm_add_ps(fiz2,tz);
615
616 fjx2 = _mm_add_ps(fjx2,tx);
617 fjy2 = _mm_add_ps(fjy2,ty);
618 fjz2 = _mm_add_ps(fjz2,tz);
619
620 }
621
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
625
626 if (gmx_mm_any_lt(rsq23,rcutoff2))
627 {
628
629 r23 = _mm_mul_ps(rsq23,rinv23);
630
631 /* EWALD ELECTROSTATICS */
632
633 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
634 ewrt = _mm_mul_ps(r23,ewtabscale);
635 ewitab = _mm_cvttps_epi32(ewrt);
636 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
637 ewitab = _mm_slli_epi32(ewitab,2);
638 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
639 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
640 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
641 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
642 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
643 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
644 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
645 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
646 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
647
648 d = _mm_sub_ps(r23,rswitch);
649 d = _mm_max_ps(d,_mm_setzero_ps());
650 d2 = _mm_mul_ps(d,d);
651 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
652
653 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
654
655 /* Evaluate switch function */
656 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
657 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
658 velec = _mm_mul_ps(velec,sw);
659 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
660
661 /* Update potential sum for this i atom from the interaction with this j atom. */
662 velec = _mm_and_ps(velec,cutoff_mask);
663 velecsum = _mm_add_ps(velecsum,velec);
664
665 fscal = felec;
666
667 fscal = _mm_and_ps(fscal,cutoff_mask);
668
669 /* Calculate temporary vectorial force */
670 tx = _mm_mul_ps(fscal,dx23);
671 ty = _mm_mul_ps(fscal,dy23);
672 tz = _mm_mul_ps(fscal,dz23);
673
674 /* Update vectorial force */
675 fix2 = _mm_add_ps(fix2,tx);
676 fiy2 = _mm_add_ps(fiy2,ty);
677 fiz2 = _mm_add_ps(fiz2,tz);
678
679 fjx3 = _mm_add_ps(fjx3,tx);
680 fjy3 = _mm_add_ps(fjy3,ty);
681 fjz3 = _mm_add_ps(fjz3,tz);
682
683 }
684
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
688
689 if (gmx_mm_any_lt(rsq31,rcutoff2))
690 {
691
692 r31 = _mm_mul_ps(rsq31,rinv31);
693
694 /* EWALD ELECTROSTATICS */
695
696 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
697 ewrt = _mm_mul_ps(r31,ewtabscale);
698 ewitab = _mm_cvttps_epi32(ewrt);
699 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
700 ewitab = _mm_slli_epi32(ewitab,2);
701 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
702 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
703 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
704 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
705 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
706 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
707 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
708 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
709 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
710
711 d = _mm_sub_ps(r31,rswitch);
712 d = _mm_max_ps(d,_mm_setzero_ps());
713 d2 = _mm_mul_ps(d,d);
714 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
715
716 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
717
718 /* Evaluate switch function */
719 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
720 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
721 velec = _mm_mul_ps(velec,sw);
722 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
723
724 /* Update potential sum for this i atom from the interaction with this j atom. */
725 velec = _mm_and_ps(velec,cutoff_mask);
726 velecsum = _mm_add_ps(velecsum,velec);
727
728 fscal = felec;
729
730 fscal = _mm_and_ps(fscal,cutoff_mask);
731
732 /* Calculate temporary vectorial force */
733 tx = _mm_mul_ps(fscal,dx31);
734 ty = _mm_mul_ps(fscal,dy31);
735 tz = _mm_mul_ps(fscal,dz31);
736
737 /* Update vectorial force */
738 fix3 = _mm_add_ps(fix3,tx);
739 fiy3 = _mm_add_ps(fiy3,ty);
740 fiz3 = _mm_add_ps(fiz3,tz);
741
742 fjx1 = _mm_add_ps(fjx1,tx);
743 fjy1 = _mm_add_ps(fjy1,ty);
744 fjz1 = _mm_add_ps(fjz1,tz);
745
746 }
747
748 /**************************
749 * CALCULATE INTERACTIONS *
750 **************************/
751
752 if (gmx_mm_any_lt(rsq32,rcutoff2))
753 {
754
755 r32 = _mm_mul_ps(rsq32,rinv32);
756
757 /* EWALD ELECTROSTATICS */
758
759 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
760 ewrt = _mm_mul_ps(r32,ewtabscale);
761 ewitab = _mm_cvttps_epi32(ewrt);
762 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
763 ewitab = _mm_slli_epi32(ewitab,2);
764 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
765 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
766 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
767 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
768 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
769 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
770 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
771 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
772 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
773
774 d = _mm_sub_ps(r32,rswitch);
775 d = _mm_max_ps(d,_mm_setzero_ps());
776 d2 = _mm_mul_ps(d,d);
777 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
778
779 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
780
781 /* Evaluate switch function */
782 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
783 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
784 velec = _mm_mul_ps(velec,sw);
785 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
786
787 /* Update potential sum for this i atom from the interaction with this j atom. */
788 velec = _mm_and_ps(velec,cutoff_mask);
789 velecsum = _mm_add_ps(velecsum,velec);
790
791 fscal = felec;
792
793 fscal = _mm_and_ps(fscal,cutoff_mask);
794
795 /* Calculate temporary vectorial force */
796 tx = _mm_mul_ps(fscal,dx32);
797 ty = _mm_mul_ps(fscal,dy32);
798 tz = _mm_mul_ps(fscal,dz32);
799
800 /* Update vectorial force */
801 fix3 = _mm_add_ps(fix3,tx);
802 fiy3 = _mm_add_ps(fiy3,ty);
803 fiz3 = _mm_add_ps(fiz3,tz);
804
805 fjx2 = _mm_add_ps(fjx2,tx);
806 fjy2 = _mm_add_ps(fjy2,ty);
807 fjz2 = _mm_add_ps(fjz2,tz);
808
809 }
810
811 /**************************
812 * CALCULATE INTERACTIONS *
813 **************************/
814
815 if (gmx_mm_any_lt(rsq33,rcutoff2))
816 {
817
818 r33 = _mm_mul_ps(rsq33,rinv33);
819
820 /* EWALD ELECTROSTATICS */
821
822 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
823 ewrt = _mm_mul_ps(r33,ewtabscale);
824 ewitab = _mm_cvttps_epi32(ewrt);
825 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
826 ewitab = _mm_slli_epi32(ewitab,2);
827 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
828 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
829 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
830 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
831 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
832 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
833 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
834 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
835 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
836
837 d = _mm_sub_ps(r33,rswitch);
838 d = _mm_max_ps(d,_mm_setzero_ps());
839 d2 = _mm_mul_ps(d,d);
840 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
841
842 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
843
844 /* Evaluate switch function */
845 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
846 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
847 velec = _mm_mul_ps(velec,sw);
848 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
849
850 /* Update potential sum for this i atom from the interaction with this j atom. */
851 velec = _mm_and_ps(velec,cutoff_mask);
852 velecsum = _mm_add_ps(velecsum,velec);
853
854 fscal = felec;
855
856 fscal = _mm_and_ps(fscal,cutoff_mask);
857
858 /* Calculate temporary vectorial force */
859 tx = _mm_mul_ps(fscal,dx33);
860 ty = _mm_mul_ps(fscal,dy33);
861 tz = _mm_mul_ps(fscal,dz33);
862
863 /* Update vectorial force */
864 fix3 = _mm_add_ps(fix3,tx);
865 fiy3 = _mm_add_ps(fiy3,ty);
866 fiz3 = _mm_add_ps(fiz3,tz);
867
868 fjx3 = _mm_add_ps(fjx3,tx);
869 fjy3 = _mm_add_ps(fjy3,ty);
870 fjz3 = _mm_add_ps(fjz3,tz);
871
872 }
873
874 fjptrA = f+j_coord_offsetA;
875 fjptrB = f+j_coord_offsetB;
876 fjptrC = f+j_coord_offsetC;
877 fjptrD = f+j_coord_offsetD;
878
879 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM3,fjptrB+DIM3,fjptrC+DIM3,fjptrD+DIM3,
880 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
881
882 /* Inner loop uses 585 flops */
883 }
884
885 if(jidx<j_index_end)
886 {
887
888 /* Get j neighbor index, and coordinate index */
889 jnrlistA = jjnr[jidx];
890 jnrlistB = jjnr[jidx+1];
891 jnrlistC = jjnr[jidx+2];
892 jnrlistD = jjnr[jidx+3];
893 /* Sign of each element will be negative for non-real atoms.
894 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
895 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
896 */
897 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
898 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
899 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
900 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
901 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
902 j_coord_offsetA = DIM3*jnrA;
903 j_coord_offsetB = DIM3*jnrB;
904 j_coord_offsetC = DIM3*jnrC;
905 j_coord_offsetD = DIM3*jnrD;
906
907 /* load j atom coordinates */
908 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM3,x+j_coord_offsetB+DIM3,
909 x+j_coord_offsetC+DIM3,x+j_coord_offsetD+DIM3,
910 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
911
912 /* Calculate displacement vector */
913 dx11 = _mm_sub_ps(ix1,jx1);
914 dy11 = _mm_sub_ps(iy1,jy1);
915 dz11 = _mm_sub_ps(iz1,jz1);
916 dx12 = _mm_sub_ps(ix1,jx2);
917 dy12 = _mm_sub_ps(iy1,jy2);
918 dz12 = _mm_sub_ps(iz1,jz2);
919 dx13 = _mm_sub_ps(ix1,jx3);
920 dy13 = _mm_sub_ps(iy1,jy3);
921 dz13 = _mm_sub_ps(iz1,jz3);
922 dx21 = _mm_sub_ps(ix2,jx1);
923 dy21 = _mm_sub_ps(iy2,jy1);
924 dz21 = _mm_sub_ps(iz2,jz1);
925 dx22 = _mm_sub_ps(ix2,jx2);
926 dy22 = _mm_sub_ps(iy2,jy2);
927 dz22 = _mm_sub_ps(iz2,jz2);
928 dx23 = _mm_sub_ps(ix2,jx3);
929 dy23 = _mm_sub_ps(iy2,jy3);
930 dz23 = _mm_sub_ps(iz2,jz3);
931 dx31 = _mm_sub_ps(ix3,jx1);
932 dy31 = _mm_sub_ps(iy3,jy1);
933 dz31 = _mm_sub_ps(iz3,jz1);
934 dx32 = _mm_sub_ps(ix3,jx2);
935 dy32 = _mm_sub_ps(iy3,jy2);
936 dz32 = _mm_sub_ps(iz3,jz2);
937 dx33 = _mm_sub_ps(ix3,jx3);
938 dy33 = _mm_sub_ps(iy3,jy3);
939 dz33 = _mm_sub_ps(iz3,jz3);
940
941 /* Calculate squared distance and things based on it */
942 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
943 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
944 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
945 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
946 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
947 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
948 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
949 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
950 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
951
952 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
953 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
954 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
955 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
956 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
957 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
958 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
959 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
960 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
961
962 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
963 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
964 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
965 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
966 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
967 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
968 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
969 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
970 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
971
972 fjx1 = _mm_setzero_ps();
973 fjy1 = _mm_setzero_ps();
974 fjz1 = _mm_setzero_ps();
975 fjx2 = _mm_setzero_ps();
976 fjy2 = _mm_setzero_ps();
977 fjz2 = _mm_setzero_ps();
978 fjx3 = _mm_setzero_ps();
979 fjy3 = _mm_setzero_ps();
980 fjz3 = _mm_setzero_ps();
981
982 /**************************
983 * CALCULATE INTERACTIONS *
984 **************************/
985
986 if (gmx_mm_any_lt(rsq11,rcutoff2))
987 {
988
989 r11 = _mm_mul_ps(rsq11,rinv11);
990 r11 = _mm_andnot_ps(dummy_mask,r11);
991
992 /* EWALD ELECTROSTATICS */
993
994 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
995 ewrt = _mm_mul_ps(r11,ewtabscale);
996 ewitab = _mm_cvttps_epi32(ewrt);
997 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
998 ewitab = _mm_slli_epi32(ewitab,2);
999 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1000 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1001 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1002 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1003 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1004 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1005 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1006 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1007 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1008
1009 d = _mm_sub_ps(r11,rswitch);
1010 d = _mm_max_ps(d,_mm_setzero_ps());
1011 d2 = _mm_mul_ps(d,d);
1012 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1013
1014 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1015
1016 /* Evaluate switch function */
1017 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1018 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1019 velec = _mm_mul_ps(velec,sw);
1020 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1021
1022 /* Update potential sum for this i atom from the interaction with this j atom. */
1023 velec = _mm_and_ps(velec,cutoff_mask);
1024 velec = _mm_andnot_ps(dummy_mask,velec);
1025 velecsum = _mm_add_ps(velecsum,velec);
1026
1027 fscal = felec;
1028
1029 fscal = _mm_and_ps(fscal,cutoff_mask);
1030
1031 fscal = _mm_andnot_ps(dummy_mask,fscal);
1032
1033 /* Calculate temporary vectorial force */
1034 tx = _mm_mul_ps(fscal,dx11);
1035 ty = _mm_mul_ps(fscal,dy11);
1036 tz = _mm_mul_ps(fscal,dz11);
1037
1038 /* Update vectorial force */
1039 fix1 = _mm_add_ps(fix1,tx);
1040 fiy1 = _mm_add_ps(fiy1,ty);
1041 fiz1 = _mm_add_ps(fiz1,tz);
1042
1043 fjx1 = _mm_add_ps(fjx1,tx);
1044 fjy1 = _mm_add_ps(fjy1,ty);
1045 fjz1 = _mm_add_ps(fjz1,tz);
1046
1047 }
1048
1049 /**************************
1050 * CALCULATE INTERACTIONS *
1051 **************************/
1052
1053 if (gmx_mm_any_lt(rsq12,rcutoff2))
1054 {
1055
1056 r12 = _mm_mul_ps(rsq12,rinv12);
1057 r12 = _mm_andnot_ps(dummy_mask,r12);
1058
1059 /* EWALD ELECTROSTATICS */
1060
1061 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1062 ewrt = _mm_mul_ps(r12,ewtabscale);
1063 ewitab = _mm_cvttps_epi32(ewrt);
1064 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1065 ewitab = _mm_slli_epi32(ewitab,2);
1066 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1067 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1068 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1069 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1070 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1071 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1072 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1073 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1074 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1075
1076 d = _mm_sub_ps(r12,rswitch);
1077 d = _mm_max_ps(d,_mm_setzero_ps());
1078 d2 = _mm_mul_ps(d,d);
1079 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1080
1081 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1082
1083 /* Evaluate switch function */
1084 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1085 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1086 velec = _mm_mul_ps(velec,sw);
1087 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1088
1089 /* Update potential sum for this i atom from the interaction with this j atom. */
1090 velec = _mm_and_ps(velec,cutoff_mask);
1091 velec = _mm_andnot_ps(dummy_mask,velec);
1092 velecsum = _mm_add_ps(velecsum,velec);
1093
1094 fscal = felec;
1095
1096 fscal = _mm_and_ps(fscal,cutoff_mask);
1097
1098 fscal = _mm_andnot_ps(dummy_mask,fscal);
1099
1100 /* Calculate temporary vectorial force */
1101 tx = _mm_mul_ps(fscal,dx12);
1102 ty = _mm_mul_ps(fscal,dy12);
1103 tz = _mm_mul_ps(fscal,dz12);
1104
1105 /* Update vectorial force */
1106 fix1 = _mm_add_ps(fix1,tx);
1107 fiy1 = _mm_add_ps(fiy1,ty);
1108 fiz1 = _mm_add_ps(fiz1,tz);
1109
1110 fjx2 = _mm_add_ps(fjx2,tx);
1111 fjy2 = _mm_add_ps(fjy2,ty);
1112 fjz2 = _mm_add_ps(fjz2,tz);
1113
1114 }
1115
1116 /**************************
1117 * CALCULATE INTERACTIONS *
1118 **************************/
1119
1120 if (gmx_mm_any_lt(rsq13,rcutoff2))
1121 {
1122
1123 r13 = _mm_mul_ps(rsq13,rinv13);
1124 r13 = _mm_andnot_ps(dummy_mask,r13);
1125
1126 /* EWALD ELECTROSTATICS */
1127
1128 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1129 ewrt = _mm_mul_ps(r13,ewtabscale);
1130 ewitab = _mm_cvttps_epi32(ewrt);
1131 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1132 ewitab = _mm_slli_epi32(ewitab,2);
1133 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1134 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1135 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1136 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1137 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1138 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1139 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1140 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1141 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1142
1143 d = _mm_sub_ps(r13,rswitch);
1144 d = _mm_max_ps(d,_mm_setzero_ps());
1145 d2 = _mm_mul_ps(d,d);
1146 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1147
1148 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1149
1150 /* Evaluate switch function */
1151 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1152 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1153 velec = _mm_mul_ps(velec,sw);
1154 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1155
1156 /* Update potential sum for this i atom from the interaction with this j atom. */
1157 velec = _mm_and_ps(velec,cutoff_mask);
1158 velec = _mm_andnot_ps(dummy_mask,velec);
1159 velecsum = _mm_add_ps(velecsum,velec);
1160
1161 fscal = felec;
1162
1163 fscal = _mm_and_ps(fscal,cutoff_mask);
1164
1165 fscal = _mm_andnot_ps(dummy_mask,fscal);
1166
1167 /* Calculate temporary vectorial force */
1168 tx = _mm_mul_ps(fscal,dx13);
1169 ty = _mm_mul_ps(fscal,dy13);
1170 tz = _mm_mul_ps(fscal,dz13);
1171
1172 /* Update vectorial force */
1173 fix1 = _mm_add_ps(fix1,tx);
1174 fiy1 = _mm_add_ps(fiy1,ty);
1175 fiz1 = _mm_add_ps(fiz1,tz);
1176
1177 fjx3 = _mm_add_ps(fjx3,tx);
1178 fjy3 = _mm_add_ps(fjy3,ty);
1179 fjz3 = _mm_add_ps(fjz3,tz);
1180
1181 }
1182
1183 /**************************
1184 * CALCULATE INTERACTIONS *
1185 **************************/
1186
1187 if (gmx_mm_any_lt(rsq21,rcutoff2))
1188 {
1189
1190 r21 = _mm_mul_ps(rsq21,rinv21);
1191 r21 = _mm_andnot_ps(dummy_mask,r21);
1192
1193 /* EWALD ELECTROSTATICS */
1194
1195 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1196 ewrt = _mm_mul_ps(r21,ewtabscale);
1197 ewitab = _mm_cvttps_epi32(ewrt);
1198 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1199 ewitab = _mm_slli_epi32(ewitab,2);
1200 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1201 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1202 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1203 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1204 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1205 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1206 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1207 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1208 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1209
1210 d = _mm_sub_ps(r21,rswitch);
1211 d = _mm_max_ps(d,_mm_setzero_ps());
1212 d2 = _mm_mul_ps(d,d);
1213 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1214
1215 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1216
1217 /* Evaluate switch function */
1218 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1219 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1220 velec = _mm_mul_ps(velec,sw);
1221 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1222
1223 /* Update potential sum for this i atom from the interaction with this j atom. */
1224 velec = _mm_and_ps(velec,cutoff_mask);
1225 velec = _mm_andnot_ps(dummy_mask,velec);
1226 velecsum = _mm_add_ps(velecsum,velec);
1227
1228 fscal = felec;
1229
1230 fscal = _mm_and_ps(fscal,cutoff_mask);
1231
1232 fscal = _mm_andnot_ps(dummy_mask,fscal);
1233
1234 /* Calculate temporary vectorial force */
1235 tx = _mm_mul_ps(fscal,dx21);
1236 ty = _mm_mul_ps(fscal,dy21);
1237 tz = _mm_mul_ps(fscal,dz21);
1238
1239 /* Update vectorial force */
1240 fix2 = _mm_add_ps(fix2,tx);
1241 fiy2 = _mm_add_ps(fiy2,ty);
1242 fiz2 = _mm_add_ps(fiz2,tz);
1243
1244 fjx1 = _mm_add_ps(fjx1,tx);
1245 fjy1 = _mm_add_ps(fjy1,ty);
1246 fjz1 = _mm_add_ps(fjz1,tz);
1247
1248 }
1249
1250 /**************************
1251 * CALCULATE INTERACTIONS *
1252 **************************/
1253
1254 if (gmx_mm_any_lt(rsq22,rcutoff2))
1255 {
1256
1257 r22 = _mm_mul_ps(rsq22,rinv22);
1258 r22 = _mm_andnot_ps(dummy_mask,r22);
1259
1260 /* EWALD ELECTROSTATICS */
1261
1262 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1263 ewrt = _mm_mul_ps(r22,ewtabscale);
1264 ewitab = _mm_cvttps_epi32(ewrt);
1265 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1266 ewitab = _mm_slli_epi32(ewitab,2);
1267 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1268 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1269 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1270 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1271 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1272 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1273 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1274 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1275 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1276
1277 d = _mm_sub_ps(r22,rswitch);
1278 d = _mm_max_ps(d,_mm_setzero_ps());
1279 d2 = _mm_mul_ps(d,d);
1280 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1281
1282 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1283
1284 /* Evaluate switch function */
1285 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1286 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1287 velec = _mm_mul_ps(velec,sw);
1288 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1289
1290 /* Update potential sum for this i atom from the interaction with this j atom. */
1291 velec = _mm_and_ps(velec,cutoff_mask);
1292 velec = _mm_andnot_ps(dummy_mask,velec);
1293 velecsum = _mm_add_ps(velecsum,velec);
1294
1295 fscal = felec;
1296
1297 fscal = _mm_and_ps(fscal,cutoff_mask);
1298
1299 fscal = _mm_andnot_ps(dummy_mask,fscal);
1300
1301 /* Calculate temporary vectorial force */
1302 tx = _mm_mul_ps(fscal,dx22);
1303 ty = _mm_mul_ps(fscal,dy22);
1304 tz = _mm_mul_ps(fscal,dz22);
1305
1306 /* Update vectorial force */
1307 fix2 = _mm_add_ps(fix2,tx);
1308 fiy2 = _mm_add_ps(fiy2,ty);
1309 fiz2 = _mm_add_ps(fiz2,tz);
1310
1311 fjx2 = _mm_add_ps(fjx2,tx);
1312 fjy2 = _mm_add_ps(fjy2,ty);
1313 fjz2 = _mm_add_ps(fjz2,tz);
1314
1315 }
1316
1317 /**************************
1318 * CALCULATE INTERACTIONS *
1319 **************************/
1320
1321 if (gmx_mm_any_lt(rsq23,rcutoff2))
1322 {
1323
1324 r23 = _mm_mul_ps(rsq23,rinv23);
1325 r23 = _mm_andnot_ps(dummy_mask,r23);
1326
1327 /* EWALD ELECTROSTATICS */
1328
1329 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1330 ewrt = _mm_mul_ps(r23,ewtabscale);
1331 ewitab = _mm_cvttps_epi32(ewrt);
1332 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1333 ewitab = _mm_slli_epi32(ewitab,2);
1334 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1335 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1336 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1337 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1338 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1339 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1340 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1341 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1342 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1343
1344 d = _mm_sub_ps(r23,rswitch);
1345 d = _mm_max_ps(d,_mm_setzero_ps());
1346 d2 = _mm_mul_ps(d,d);
1347 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1348
1349 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1350
1351 /* Evaluate switch function */
1352 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1353 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1354 velec = _mm_mul_ps(velec,sw);
1355 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1356
1357 /* Update potential sum for this i atom from the interaction with this j atom. */
1358 velec = _mm_and_ps(velec,cutoff_mask);
1359 velec = _mm_andnot_ps(dummy_mask,velec);
1360 velecsum = _mm_add_ps(velecsum,velec);
1361
1362 fscal = felec;
1363
1364 fscal = _mm_and_ps(fscal,cutoff_mask);
1365
1366 fscal = _mm_andnot_ps(dummy_mask,fscal);
1367
1368 /* Calculate temporary vectorial force */
1369 tx = _mm_mul_ps(fscal,dx23);
1370 ty = _mm_mul_ps(fscal,dy23);
1371 tz = _mm_mul_ps(fscal,dz23);
1372
1373 /* Update vectorial force */
1374 fix2 = _mm_add_ps(fix2,tx);
1375 fiy2 = _mm_add_ps(fiy2,ty);
1376 fiz2 = _mm_add_ps(fiz2,tz);
1377
1378 fjx3 = _mm_add_ps(fjx3,tx);
1379 fjy3 = _mm_add_ps(fjy3,ty);
1380 fjz3 = _mm_add_ps(fjz3,tz);
1381
1382 }
1383
1384 /**************************
1385 * CALCULATE INTERACTIONS *
1386 **************************/
1387
1388 if (gmx_mm_any_lt(rsq31,rcutoff2))
1389 {
1390
1391 r31 = _mm_mul_ps(rsq31,rinv31);
1392 r31 = _mm_andnot_ps(dummy_mask,r31);
1393
1394 /* EWALD ELECTROSTATICS */
1395
1396 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1397 ewrt = _mm_mul_ps(r31,ewtabscale);
1398 ewitab = _mm_cvttps_epi32(ewrt);
1399 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1400 ewitab = _mm_slli_epi32(ewitab,2);
1401 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1402 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1403 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1404 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1405 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1406 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1407 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1408 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1409 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1410
1411 d = _mm_sub_ps(r31,rswitch);
1412 d = _mm_max_ps(d,_mm_setzero_ps());
1413 d2 = _mm_mul_ps(d,d);
1414 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1415
1416 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1417
1418 /* Evaluate switch function */
1419 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1420 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1421 velec = _mm_mul_ps(velec,sw);
1422 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1423
1424 /* Update potential sum for this i atom from the interaction with this j atom. */
1425 velec = _mm_and_ps(velec,cutoff_mask);
1426 velec = _mm_andnot_ps(dummy_mask,velec);
1427 velecsum = _mm_add_ps(velecsum,velec);
1428
1429 fscal = felec;
1430
1431 fscal = _mm_and_ps(fscal,cutoff_mask);
1432
1433 fscal = _mm_andnot_ps(dummy_mask,fscal);
1434
1435 /* Calculate temporary vectorial force */
1436 tx = _mm_mul_ps(fscal,dx31);
1437 ty = _mm_mul_ps(fscal,dy31);
1438 tz = _mm_mul_ps(fscal,dz31);
1439
1440 /* Update vectorial force */
1441 fix3 = _mm_add_ps(fix3,tx);
1442 fiy3 = _mm_add_ps(fiy3,ty);
1443 fiz3 = _mm_add_ps(fiz3,tz);
1444
1445 fjx1 = _mm_add_ps(fjx1,tx);
1446 fjy1 = _mm_add_ps(fjy1,ty);
1447 fjz1 = _mm_add_ps(fjz1,tz);
1448
1449 }
1450
1451 /**************************
1452 * CALCULATE INTERACTIONS *
1453 **************************/
1454
1455 if (gmx_mm_any_lt(rsq32,rcutoff2))
1456 {
1457
1458 r32 = _mm_mul_ps(rsq32,rinv32);
1459 r32 = _mm_andnot_ps(dummy_mask,r32);
1460
1461 /* EWALD ELECTROSTATICS */
1462
1463 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1464 ewrt = _mm_mul_ps(r32,ewtabscale);
1465 ewitab = _mm_cvttps_epi32(ewrt);
1466 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1467 ewitab = _mm_slli_epi32(ewitab,2);
1468 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1469 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1470 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1471 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1472 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1473 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1474 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1475 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1476 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1477
1478 d = _mm_sub_ps(r32,rswitch);
1479 d = _mm_max_ps(d,_mm_setzero_ps());
1480 d2 = _mm_mul_ps(d,d);
1481 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1482
1483 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1484
1485 /* Evaluate switch function */
1486 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1487 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1488 velec = _mm_mul_ps(velec,sw);
1489 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1490
1491 /* Update potential sum for this i atom from the interaction with this j atom. */
1492 velec = _mm_and_ps(velec,cutoff_mask);
1493 velec = _mm_andnot_ps(dummy_mask,velec);
1494 velecsum = _mm_add_ps(velecsum,velec);
1495
1496 fscal = felec;
1497
1498 fscal = _mm_and_ps(fscal,cutoff_mask);
1499
1500 fscal = _mm_andnot_ps(dummy_mask,fscal);
1501
1502 /* Calculate temporary vectorial force */
1503 tx = _mm_mul_ps(fscal,dx32);
1504 ty = _mm_mul_ps(fscal,dy32);
1505 tz = _mm_mul_ps(fscal,dz32);
1506
1507 /* Update vectorial force */
1508 fix3 = _mm_add_ps(fix3,tx);
1509 fiy3 = _mm_add_ps(fiy3,ty);
1510 fiz3 = _mm_add_ps(fiz3,tz);
1511
1512 fjx2 = _mm_add_ps(fjx2,tx);
1513 fjy2 = _mm_add_ps(fjy2,ty);
1514 fjz2 = _mm_add_ps(fjz2,tz);
1515
1516 }
1517
1518 /**************************
1519 * CALCULATE INTERACTIONS *
1520 **************************/
1521
1522 if (gmx_mm_any_lt(rsq33,rcutoff2))
1523 {
1524
1525 r33 = _mm_mul_ps(rsq33,rinv33);
1526 r33 = _mm_andnot_ps(dummy_mask,r33);
1527
1528 /* EWALD ELECTROSTATICS */
1529
1530 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1531 ewrt = _mm_mul_ps(r33,ewtabscale);
1532 ewitab = _mm_cvttps_epi32(ewrt);
1533 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1534 ewitab = _mm_slli_epi32(ewitab,2);
1535 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1536 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1537 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1538 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1539 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1540 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1541 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1542 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1543 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1544
1545 d = _mm_sub_ps(r33,rswitch);
1546 d = _mm_max_ps(d,_mm_setzero_ps());
1547 d2 = _mm_mul_ps(d,d);
1548 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1549
1550 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1551
1552 /* Evaluate switch function */
1553 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1554 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1555 velec = _mm_mul_ps(velec,sw);
1556 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1557
1558 /* Update potential sum for this i atom from the interaction with this j atom. */
1559 velec = _mm_and_ps(velec,cutoff_mask);
1560 velec = _mm_andnot_ps(dummy_mask,velec);
1561 velecsum = _mm_add_ps(velecsum,velec);
1562
1563 fscal = felec;
1564
1565 fscal = _mm_and_ps(fscal,cutoff_mask);
1566
1567 fscal = _mm_andnot_ps(dummy_mask,fscal);
1568
1569 /* Calculate temporary vectorial force */
1570 tx = _mm_mul_ps(fscal,dx33);
1571 ty = _mm_mul_ps(fscal,dy33);
1572 tz = _mm_mul_ps(fscal,dz33);
1573
1574 /* Update vectorial force */
1575 fix3 = _mm_add_ps(fix3,tx);
1576 fiy3 = _mm_add_ps(fiy3,ty);
1577 fiz3 = _mm_add_ps(fiz3,tz);
1578
1579 fjx3 = _mm_add_ps(fjx3,tx);
1580 fjy3 = _mm_add_ps(fjy3,ty);
1581 fjz3 = _mm_add_ps(fjz3,tz);
1582
1583 }
1584
1585 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1586 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1587 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1588 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1589
1590 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM3,fjptrB+DIM3,fjptrC+DIM3,fjptrD+DIM3,
1591 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1592
1593 /* Inner loop uses 594 flops */
1594 }
1595
1596 /* End of innermost loop */
1597
1598 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1599 f+i_coord_offset+DIM3,fshift+i_shift_offset);
1600
1601 ggid = gid[iidx];
1602 /* Update potential energies */
1603 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1604
1605 /* Increment number of inner iterations */
1606 inneriter += j_index_end - j_index_start;
1607
1608 /* Outer loop uses 19 flops */
1609 }
1610
1611 /* Increment number of outer iterations */
1612 outeriter += nri;
1613
1614 /* Update outer/inner flops */
1615
1616 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594)(nrnb)->n[eNR_NBKERNEL_ELEC_W4W4_VF] += outeriter*19 + inneriter
*594
;
1617}
1618/*
1619 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single
1620 * Electrostatics interaction: Ewald
1621 * VdW interaction: None
1622 * Geometry: Water4-Water4
1623 * Calculate force/pot: Force
1624 */
1625void
1626nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single
1627 (t_nblist * gmx_restrict nlist,
1628 rvec * gmx_restrict xx,
1629 rvec * gmx_restrict ff,
1630 t_forcerec * gmx_restrict fr,
1631 t_mdatoms * gmx_restrict mdatoms,
1632 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1633 t_nrnb * gmx_restrict nrnb)
1634{
1635 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1636 * just 0 for non-waters.
1637 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1638 * jnr indices corresponding to data put in the four positions in the SIMD register.
1639 */
1640 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1641 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1642 int jnrA,jnrB,jnrC,jnrD;
1643 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1644 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1645 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1646 real rcutoff_scalar;
1647 real *shiftvec,*fshift,*x,*f;
1648 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1649 real scratch[4*DIM3];
1650 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1651 int vdwioffset1;
1652 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1653 int vdwioffset2;
1654 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1655 int vdwioffset3;
1656 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1657 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1658 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1659 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1660 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1661 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1662 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1663 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1664 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1665 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1666 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1667 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1668 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1669 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1670 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1671 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1672 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1673 real *charge;
1674 __m128i ewitab;
1675 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1676 real *ewtab;
1677 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1678 real rswitch_scalar,d_scalar;
1679 __m128 dummy_mask,cutoff_mask;
1680 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1681 __m128 one = _mm_set1_ps(1.0);
1682 __m128 two = _mm_set1_ps(2.0);
1683 x = xx[0];
1684 f = ff[0];
1685
1686 nri = nlist->nri;
1687 iinr = nlist->iinr;
1688 jindex = nlist->jindex;
1689 jjnr = nlist->jjnr;
1690 shiftidx = nlist->shift;
1691 gid = nlist->gid;
1692 shiftvec = fr->shift_vec[0];
1693 fshift = fr->fshift[0];
1694 facel = _mm_set1_ps(fr->epsfac);
1695 charge = mdatoms->chargeA;
1696
1697 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1698 ewtab = fr->ic->tabq_coul_FDV0;
1699 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1700 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1701
1702 /* Setup water-specific parameters */
1703 inr = nlist->iinr[0];
1704 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1705 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1706 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1707
1708 jq1 = _mm_set1_ps(charge[inr+1]);
1709 jq2 = _mm_set1_ps(charge[inr+2]);
1710 jq3 = _mm_set1_ps(charge[inr+3]);
1711 qq11 = _mm_mul_ps(iq1,jq1);
1712 qq12 = _mm_mul_ps(iq1,jq2);
1713 qq13 = _mm_mul_ps(iq1,jq3);
1714 qq21 = _mm_mul_ps(iq2,jq1);
1715 qq22 = _mm_mul_ps(iq2,jq2);
1716 qq23 = _mm_mul_ps(iq2,jq3);
1717 qq31 = _mm_mul_ps(iq3,jq1);
1718 qq32 = _mm_mul_ps(iq3,jq2);
1719 qq33 = _mm_mul_ps(iq3,jq3);
1720
1721 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1722 rcutoff_scalar = fr->rcoulomb;
1723 rcutoff = _mm_set1_ps(rcutoff_scalar);
1724 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1725
1726 rswitch_scalar = fr->rcoulomb_switch;
1727 rswitch = _mm_set1_ps(rswitch_scalar);
1728 /* Setup switch parameters */
1729 d_scalar = rcutoff_scalar-rswitch_scalar;
1730 d = _mm_set1_ps(d_scalar);
1731 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1732 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1733 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1734 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1735 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1736 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1737
1738 /* Avoid stupid compiler warnings */
1739 jnrA = jnrB = jnrC = jnrD = 0;
1740 j_coord_offsetA = 0;
1741 j_coord_offsetB = 0;
1742 j_coord_offsetC = 0;
1743 j_coord_offsetD = 0;
1744
1745 outeriter = 0;
1746 inneriter = 0;
1747
1748 for(iidx=0;iidx<4*DIM3;iidx++)
1749 {
1750 scratch[iidx] = 0.0;
1751 }
1752
1753 /* Start outer loop over neighborlists */
1754 for(iidx=0; iidx<nri; iidx++)
1755 {
1756 /* Load shift vector for this list */
1757 i_shift_offset = DIM3*shiftidx[iidx];
1758
1759 /* Load limits for loop over neighbors */
1760 j_index_start = jindex[iidx];
1761 j_index_end = jindex[iidx+1];
1762
1763 /* Get outer coordinate index */
1764 inr = iinr[iidx];
1765 i_coord_offset = DIM3*inr;
1766
1767 /* Load i particle coords and add shift vector */
1768 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM3,
1769 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1770
1771 fix1 = _mm_setzero_ps();
1772 fiy1 = _mm_setzero_ps();
1773 fiz1 = _mm_setzero_ps();
1774 fix2 = _mm_setzero_ps();
1775 fiy2 = _mm_setzero_ps();
1776 fiz2 = _mm_setzero_ps();
1777 fix3 = _mm_setzero_ps();
1778 fiy3 = _mm_setzero_ps();
1779 fiz3 = _mm_setzero_ps();
1780
1781 /* Start inner kernel loop */
1782 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1783 {
1784
1785 /* Get j neighbor index, and coordinate index */
1786 jnrA = jjnr[jidx];
1787 jnrB = jjnr[jidx+1];
1788 jnrC = jjnr[jidx+2];
1789 jnrD = jjnr[jidx+3];
1790 j_coord_offsetA = DIM3*jnrA;
1791 j_coord_offsetB = DIM3*jnrB;
1792 j_coord_offsetC = DIM3*jnrC;
1793 j_coord_offsetD = DIM3*jnrD;
1794
1795 /* load j atom coordinates */
1796 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM3,x+j_coord_offsetB+DIM3,
1797 x+j_coord_offsetC+DIM3,x+j_coord_offsetD+DIM3,
1798 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1799
1800 /* Calculate displacement vector */
1801 dx11 = _mm_sub_ps(ix1,jx1);
1802 dy11 = _mm_sub_ps(iy1,jy1);
1803 dz11 = _mm_sub_ps(iz1,jz1);
1804 dx12 = _mm_sub_ps(ix1,jx2);
1805 dy12 = _mm_sub_ps(iy1,jy2);
1806 dz12 = _mm_sub_ps(iz1,jz2);
1807 dx13 = _mm_sub_ps(ix1,jx3);
1808 dy13 = _mm_sub_ps(iy1,jy3);
1809 dz13 = _mm_sub_ps(iz1,jz3);
1810 dx21 = _mm_sub_ps(ix2,jx1);
1811 dy21 = _mm_sub_ps(iy2,jy1);
1812 dz21 = _mm_sub_ps(iz2,jz1);
1813 dx22 = _mm_sub_ps(ix2,jx2);
1814 dy22 = _mm_sub_ps(iy2,jy2);
1815 dz22 = _mm_sub_ps(iz2,jz2);
1816 dx23 = _mm_sub_ps(ix2,jx3);
1817 dy23 = _mm_sub_ps(iy2,jy3);
1818 dz23 = _mm_sub_ps(iz2,jz3);
1819 dx31 = _mm_sub_ps(ix3,jx1);
1820 dy31 = _mm_sub_ps(iy3,jy1);
1821 dz31 = _mm_sub_ps(iz3,jz1);
1822 dx32 = _mm_sub_ps(ix3,jx2);
1823 dy32 = _mm_sub_ps(iy3,jy2);
1824 dz32 = _mm_sub_ps(iz3,jz2);
1825 dx33 = _mm_sub_ps(ix3,jx3);
1826 dy33 = _mm_sub_ps(iy3,jy3);
1827 dz33 = _mm_sub_ps(iz3,jz3);
1828
1829 /* Calculate squared distance and things based on it */
1830 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1831 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1832 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1833 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1834 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1835 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1836 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1837 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1838 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1839
1840 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1841 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1842 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
1843 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1844 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1845 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
1846 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
1847 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
1848 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
1849
1850 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1851 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1852 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1853 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1854 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1855 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1856 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1857 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1858 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1859
1860 fjx1 = _mm_setzero_ps();
1861 fjy1 = _mm_setzero_ps();
1862 fjz1 = _mm_setzero_ps();
1863 fjx2 = _mm_setzero_ps();
1864 fjy2 = _mm_setzero_ps();
1865 fjz2 = _mm_setzero_ps();
1866 fjx3 = _mm_setzero_ps();
1867 fjy3 = _mm_setzero_ps();
1868 fjz3 = _mm_setzero_ps();
1869
1870 /**************************
1871 * CALCULATE INTERACTIONS *
1872 **************************/
1873
1874 if (gmx_mm_any_lt(rsq11,rcutoff2))
1875 {
1876
1877 r11 = _mm_mul_ps(rsq11,rinv11);
1878
1879 /* EWALD ELECTROSTATICS */
1880
1881 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1882 ewrt = _mm_mul_ps(r11,ewtabscale);
1883 ewitab = _mm_cvttps_epi32(ewrt);
1884 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1885 ewitab = _mm_slli_epi32(ewitab,2);
1886 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1887 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1888 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1889 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1890 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1891 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1892 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1893 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1894 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1895
1896 d = _mm_sub_ps(r11,rswitch);
1897 d = _mm_max_ps(d,_mm_setzero_ps());
1898 d2 = _mm_mul_ps(d,d);
1899 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1900
1901 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1902
1903 /* Evaluate switch function */
1904 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1905 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1906 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1907
1908 fscal = felec;
1909
1910 fscal = _mm_and_ps(fscal,cutoff_mask);
1911
1912 /* Calculate temporary vectorial force */
1913 tx = _mm_mul_ps(fscal,dx11);
1914 ty = _mm_mul_ps(fscal,dy11);
1915 tz = _mm_mul_ps(fscal,dz11);
1916
1917 /* Update vectorial force */
1918 fix1 = _mm_add_ps(fix1,tx);
1919 fiy1 = _mm_add_ps(fiy1,ty);
1920 fiz1 = _mm_add_ps(fiz1,tz);
1921
1922 fjx1 = _mm_add_ps(fjx1,tx);
1923 fjy1 = _mm_add_ps(fjy1,ty);
1924 fjz1 = _mm_add_ps(fjz1,tz);
1925
1926 }
1927
1928 /**************************
1929 * CALCULATE INTERACTIONS *
1930 **************************/
1931
1932 if (gmx_mm_any_lt(rsq12,rcutoff2))
1933 {
1934
1935 r12 = _mm_mul_ps(rsq12,rinv12);
1936
1937 /* EWALD ELECTROSTATICS */
1938
1939 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1940 ewrt = _mm_mul_ps(r12,ewtabscale);
1941 ewitab = _mm_cvttps_epi32(ewrt);
1942 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1943 ewitab = _mm_slli_epi32(ewitab,2);
1944 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1945 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1946 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1947 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1948 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1949 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1950 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1951 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1952 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1953
1954 d = _mm_sub_ps(r12,rswitch);
1955 d = _mm_max_ps(d,_mm_setzero_ps());
1956 d2 = _mm_mul_ps(d,d);
1957 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1958
1959 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1960
1961 /* Evaluate switch function */
1962 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1963 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1964 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1965
1966 fscal = felec;
1967
1968 fscal = _mm_and_ps(fscal,cutoff_mask);
1969
1970 /* Calculate temporary vectorial force */
1971 tx = _mm_mul_ps(fscal,dx12);
1972 ty = _mm_mul_ps(fscal,dy12);
1973 tz = _mm_mul_ps(fscal,dz12);
1974
1975 /* Update vectorial force */
1976 fix1 = _mm_add_ps(fix1,tx);
1977 fiy1 = _mm_add_ps(fiy1,ty);
1978 fiz1 = _mm_add_ps(fiz1,tz);
1979
1980 fjx2 = _mm_add_ps(fjx2,tx);
1981 fjy2 = _mm_add_ps(fjy2,ty);
1982 fjz2 = _mm_add_ps(fjz2,tz);
1983
1984 }
1985
1986 /**************************
1987 * CALCULATE INTERACTIONS *
1988 **************************/
1989
1990 if (gmx_mm_any_lt(rsq13,rcutoff2))
1991 {
1992
1993 r13 = _mm_mul_ps(rsq13,rinv13);
1994
1995 /* EWALD ELECTROSTATICS */
1996
1997 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1998 ewrt = _mm_mul_ps(r13,ewtabscale);
1999 ewitab = _mm_cvttps_epi32(ewrt);
2000 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2001 ewitab = _mm_slli_epi32(ewitab,2);
2002 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2003 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2004 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2005 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2006 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2007 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2008 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2009 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2010 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2011
2012 d = _mm_sub_ps(r13,rswitch);
2013 d = _mm_max_ps(d,_mm_setzero_ps());
2014 d2 = _mm_mul_ps(d,d);
2015 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2016
2017 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2018
2019 /* Evaluate switch function */
2020 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2021 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2022 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2023
2024 fscal = felec;
2025
2026 fscal = _mm_and_ps(fscal,cutoff_mask);
2027
2028 /* Calculate temporary vectorial force */
2029 tx = _mm_mul_ps(fscal,dx13);
2030 ty = _mm_mul_ps(fscal,dy13);
2031 tz = _mm_mul_ps(fscal,dz13);
2032
2033 /* Update vectorial force */
2034 fix1 = _mm_add_ps(fix1,tx);
2035 fiy1 = _mm_add_ps(fiy1,ty);
2036 fiz1 = _mm_add_ps(fiz1,tz);
2037
2038 fjx3 = _mm_add_ps(fjx3,tx);
2039 fjy3 = _mm_add_ps(fjy3,ty);
2040 fjz3 = _mm_add_ps(fjz3,tz);
2041
2042 }
2043
2044 /**************************
2045 * CALCULATE INTERACTIONS *
2046 **************************/
2047
2048 if (gmx_mm_any_lt(rsq21,rcutoff2))
2049 {
2050
2051 r21 = _mm_mul_ps(rsq21,rinv21);
2052
2053 /* EWALD ELECTROSTATICS */
2054
2055 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2056 ewrt = _mm_mul_ps(r21,ewtabscale);
2057 ewitab = _mm_cvttps_epi32(ewrt);
2058 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2059 ewitab = _mm_slli_epi32(ewitab,2);
2060 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2061 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2062 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2063 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2064 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2065 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2066 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2067 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2068 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2069
2070 d = _mm_sub_ps(r21,rswitch);
2071 d = _mm_max_ps(d,_mm_setzero_ps());
2072 d2 = _mm_mul_ps(d,d);
2073 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2074
2075 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2076
2077 /* Evaluate switch function */
2078 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2079 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2080 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2081
2082 fscal = felec;
2083
2084 fscal = _mm_and_ps(fscal,cutoff_mask);
2085
2086 /* Calculate temporary vectorial force */
2087 tx = _mm_mul_ps(fscal,dx21);
2088 ty = _mm_mul_ps(fscal,dy21);
2089 tz = _mm_mul_ps(fscal,dz21);
2090
2091 /* Update vectorial force */
2092 fix2 = _mm_add_ps(fix2,tx);
2093 fiy2 = _mm_add_ps(fiy2,ty);
2094 fiz2 = _mm_add_ps(fiz2,tz);
2095
2096 fjx1 = _mm_add_ps(fjx1,tx);
2097 fjy1 = _mm_add_ps(fjy1,ty);
2098 fjz1 = _mm_add_ps(fjz1,tz);
2099
2100 }
2101
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2105
2106 if (gmx_mm_any_lt(rsq22,rcutoff2))
2107 {
2108
2109 r22 = _mm_mul_ps(rsq22,rinv22);
2110
2111 /* EWALD ELECTROSTATICS */
2112
2113 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2114 ewrt = _mm_mul_ps(r22,ewtabscale);
2115 ewitab = _mm_cvttps_epi32(ewrt);
2116 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2117 ewitab = _mm_slli_epi32(ewitab,2);
2118 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2119 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2120 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2121 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2122 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2123 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2124 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2125 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2126 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2127
2128 d = _mm_sub_ps(r22,rswitch);
2129 d = _mm_max_ps(d,_mm_setzero_ps());
2130 d2 = _mm_mul_ps(d,d);
2131 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2132
2133 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2134
2135 /* Evaluate switch function */
2136 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2137 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2138 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2139
2140 fscal = felec;
2141
2142 fscal = _mm_and_ps(fscal,cutoff_mask);
2143
2144 /* Calculate temporary vectorial force */
2145 tx = _mm_mul_ps(fscal,dx22);
2146 ty = _mm_mul_ps(fscal,dy22);
2147 tz = _mm_mul_ps(fscal,dz22);
2148
2149 /* Update vectorial force */
2150 fix2 = _mm_add_ps(fix2,tx);
2151 fiy2 = _mm_add_ps(fiy2,ty);
2152 fiz2 = _mm_add_ps(fiz2,tz);
2153
2154 fjx2 = _mm_add_ps(fjx2,tx);
2155 fjy2 = _mm_add_ps(fjy2,ty);
2156 fjz2 = _mm_add_ps(fjz2,tz);
2157
2158 }
2159
2160 /**************************
2161 * CALCULATE INTERACTIONS *
2162 **************************/
2163
2164 if (gmx_mm_any_lt(rsq23,rcutoff2))
2165 {
2166
2167 r23 = _mm_mul_ps(rsq23,rinv23);
2168
2169 /* EWALD ELECTROSTATICS */
2170
2171 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2172 ewrt = _mm_mul_ps(r23,ewtabscale);
2173 ewitab = _mm_cvttps_epi32(ewrt);
2174 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2175 ewitab = _mm_slli_epi32(ewitab,2);
2176 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2177 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2178 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2179 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2180 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2181 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2182 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2183 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2184 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2185
2186 d = _mm_sub_ps(r23,rswitch);
2187 d = _mm_max_ps(d,_mm_setzero_ps());
2188 d2 = _mm_mul_ps(d,d);
2189 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2190
2191 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2192
2193 /* Evaluate switch function */
2194 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2195 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2196 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2197
2198 fscal = felec;
2199
2200 fscal = _mm_and_ps(fscal,cutoff_mask);
2201
2202 /* Calculate temporary vectorial force */
2203 tx = _mm_mul_ps(fscal,dx23);
2204 ty = _mm_mul_ps(fscal,dy23);
2205 tz = _mm_mul_ps(fscal,dz23);
2206
2207 /* Update vectorial force */
2208 fix2 = _mm_add_ps(fix2,tx);
2209 fiy2 = _mm_add_ps(fiy2,ty);
2210 fiz2 = _mm_add_ps(fiz2,tz);
2211
2212 fjx3 = _mm_add_ps(fjx3,tx);
2213 fjy3 = _mm_add_ps(fjy3,ty);
2214 fjz3 = _mm_add_ps(fjz3,tz);
2215
2216 }
2217
2218 /**************************
2219 * CALCULATE INTERACTIONS *
2220 **************************/
2221
2222 if (gmx_mm_any_lt(rsq31,rcutoff2))
2223 {
2224
2225 r31 = _mm_mul_ps(rsq31,rinv31);
2226
2227 /* EWALD ELECTROSTATICS */
2228
2229 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2230 ewrt = _mm_mul_ps(r31,ewtabscale);
2231 ewitab = _mm_cvttps_epi32(ewrt);
2232 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2233 ewitab = _mm_slli_epi32(ewitab,2);
2234 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2235 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2236 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2237 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2238 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2239 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2240 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2241 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2242 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2243
2244 d = _mm_sub_ps(r31,rswitch);
2245 d = _mm_max_ps(d,_mm_setzero_ps());
2246 d2 = _mm_mul_ps(d,d);
2247 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2248
2249 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2250
2251 /* Evaluate switch function */
2252 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2253 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2254 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2255
2256 fscal = felec;
2257
2258 fscal = _mm_and_ps(fscal,cutoff_mask);
2259
2260 /* Calculate temporary vectorial force */
2261 tx = _mm_mul_ps(fscal,dx31);
2262 ty = _mm_mul_ps(fscal,dy31);
2263 tz = _mm_mul_ps(fscal,dz31);
2264
2265 /* Update vectorial force */
2266 fix3 = _mm_add_ps(fix3,tx);
2267 fiy3 = _mm_add_ps(fiy3,ty);
2268 fiz3 = _mm_add_ps(fiz3,tz);
2269
2270 fjx1 = _mm_add_ps(fjx1,tx);
2271 fjy1 = _mm_add_ps(fjy1,ty);
2272 fjz1 = _mm_add_ps(fjz1,tz);
2273
2274 }
2275
2276 /**************************
2277 * CALCULATE INTERACTIONS *
2278 **************************/
2279
2280 if (gmx_mm_any_lt(rsq32,rcutoff2))
2281 {
2282
2283 r32 = _mm_mul_ps(rsq32,rinv32);
2284
2285 /* EWALD ELECTROSTATICS */
2286
2287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2288 ewrt = _mm_mul_ps(r32,ewtabscale);
2289 ewitab = _mm_cvttps_epi32(ewrt);
2290 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2291 ewitab = _mm_slli_epi32(ewitab,2);
2292 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2293 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2294 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2295 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2296 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2297 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2298 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2299 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2300 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2301
2302 d = _mm_sub_ps(r32,rswitch);
2303 d = _mm_max_ps(d,_mm_setzero_ps());
2304 d2 = _mm_mul_ps(d,d);
2305 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2306
2307 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2308
2309 /* Evaluate switch function */
2310 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2311 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2312 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2313
2314 fscal = felec;
2315
2316 fscal = _mm_and_ps(fscal,cutoff_mask);
2317
2318 /* Calculate temporary vectorial force */
2319 tx = _mm_mul_ps(fscal,dx32);
2320 ty = _mm_mul_ps(fscal,dy32);
2321 tz = _mm_mul_ps(fscal,dz32);
2322
2323 /* Update vectorial force */
2324 fix3 = _mm_add_ps(fix3,tx);
2325 fiy3 = _mm_add_ps(fiy3,ty);
2326 fiz3 = _mm_add_ps(fiz3,tz);
2327
2328 fjx2 = _mm_add_ps(fjx2,tx);
2329 fjy2 = _mm_add_ps(fjy2,ty);
2330 fjz2 = _mm_add_ps(fjz2,tz);
2331
2332 }
2333
2334 /**************************
2335 * CALCULATE INTERACTIONS *
2336 **************************/
2337
2338 if (gmx_mm_any_lt(rsq33,rcutoff2))
2339 {
2340
2341 r33 = _mm_mul_ps(rsq33,rinv33);
2342
2343 /* EWALD ELECTROSTATICS */
2344
2345 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2346 ewrt = _mm_mul_ps(r33,ewtabscale);
2347 ewitab = _mm_cvttps_epi32(ewrt);
2348 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2349 ewitab = _mm_slli_epi32(ewitab,2);
2350 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2351 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2352 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2353 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2354 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2355 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2356 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2357 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2358 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2359
2360 d = _mm_sub_ps(r33,rswitch);
2361 d = _mm_max_ps(d,_mm_setzero_ps());
2362 d2 = _mm_mul_ps(d,d);
2363 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2364
2365 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2366
2367 /* Evaluate switch function */
2368 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2369 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2370 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2371
2372 fscal = felec;
2373
2374 fscal = _mm_and_ps(fscal,cutoff_mask);
2375
2376 /* Calculate temporary vectorial force */
2377 tx = _mm_mul_ps(fscal,dx33);
2378 ty = _mm_mul_ps(fscal,dy33);
2379 tz = _mm_mul_ps(fscal,dz33);
2380
2381 /* Update vectorial force */
2382 fix3 = _mm_add_ps(fix3,tx);
2383 fiy3 = _mm_add_ps(fiy3,ty);
2384 fiz3 = _mm_add_ps(fiz3,tz);
2385
2386 fjx3 = _mm_add_ps(fjx3,tx);
2387 fjy3 = _mm_add_ps(fjy3,ty);
2388 fjz3 = _mm_add_ps(fjz3,tz);
2389
2390 }
2391
2392 fjptrA = f+j_coord_offsetA;
2393 fjptrB = f+j_coord_offsetB;
2394 fjptrC = f+j_coord_offsetC;
2395 fjptrD = f+j_coord_offsetD;
2396
2397 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM3,fjptrB+DIM3,fjptrC+DIM3,fjptrD+DIM3,
2398 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2399
2400 /* Inner loop uses 558 flops */
2401 }
2402
2403 if(jidx<j_index_end)
2404 {
2405
2406 /* Get j neighbor index, and coordinate index */
2407 jnrlistA = jjnr[jidx];
2408 jnrlistB = jjnr[jidx+1];
2409 jnrlistC = jjnr[jidx+2];
2410 jnrlistD = jjnr[jidx+3];
2411 /* Sign of each element will be negative for non-real atoms.
2412 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2413 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2414 */
2415 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2416 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2417 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2418 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2419 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2420 j_coord_offsetA = DIM3*jnrA;
2421 j_coord_offsetB = DIM3*jnrB;
2422 j_coord_offsetC = DIM3*jnrC;
2423 j_coord_offsetD = DIM3*jnrD;
2424
2425 /* load j atom coordinates */
2426 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM3,x+j_coord_offsetB+DIM3,
2427 x+j_coord_offsetC+DIM3,x+j_coord_offsetD+DIM3,
2428 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2429
2430 /* Calculate displacement vector */
2431 dx11 = _mm_sub_ps(ix1,jx1);
2432 dy11 = _mm_sub_ps(iy1,jy1);
2433 dz11 = _mm_sub_ps(iz1,jz1);
2434 dx12 = _mm_sub_ps(ix1,jx2);
2435 dy12 = _mm_sub_ps(iy1,jy2);
2436 dz12 = _mm_sub_ps(iz1,jz2);
2437 dx13 = _mm_sub_ps(ix1,jx3);
2438 dy13 = _mm_sub_ps(iy1,jy3);
2439 dz13 = _mm_sub_ps(iz1,jz3);
2440 dx21 = _mm_sub_ps(ix2,jx1);
2441 dy21 = _mm_sub_ps(iy2,jy1);
2442 dz21 = _mm_sub_ps(iz2,jz1);
2443 dx22 = _mm_sub_ps(ix2,jx2);
2444 dy22 = _mm_sub_ps(iy2,jy2);
2445 dz22 = _mm_sub_ps(iz2,jz2);
2446 dx23 = _mm_sub_ps(ix2,jx3);
2447 dy23 = _mm_sub_ps(iy2,jy3);
2448 dz23 = _mm_sub_ps(iz2,jz3);
2449 dx31 = _mm_sub_ps(ix3,jx1);
2450 dy31 = _mm_sub_ps(iy3,jy1);
2451 dz31 = _mm_sub_ps(iz3,jz1);
2452 dx32 = _mm_sub_ps(ix3,jx2);
2453 dy32 = _mm_sub_ps(iy3,jy2);
2454 dz32 = _mm_sub_ps(iz3,jz2);
2455 dx33 = _mm_sub_ps(ix3,jx3);
2456 dy33 = _mm_sub_ps(iy3,jy3);
2457 dz33 = _mm_sub_ps(iz3,jz3);
2458
2459 /* Calculate squared distance and things based on it */
2460 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2461 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2462 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2463 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2464 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2465 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2466 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2467 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2468 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2469
2470 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
2471 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
2472 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
2473 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
2474 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
2475 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
2476 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
2477 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
2478 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
2479
2480 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2481 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2482 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2483 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2484 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2485 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2486 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2487 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2488 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2489
2490 fjx1 = _mm_setzero_ps();
2491 fjy1 = _mm_setzero_ps();
2492 fjz1 = _mm_setzero_ps();
2493 fjx2 = _mm_setzero_ps();
2494 fjy2 = _mm_setzero_ps();
2495 fjz2 = _mm_setzero_ps();
2496 fjx3 = _mm_setzero_ps();
2497 fjy3 = _mm_setzero_ps();
2498 fjz3 = _mm_setzero_ps();
2499
2500 /**************************
2501 * CALCULATE INTERACTIONS *
2502 **************************/
2503
2504 if (gmx_mm_any_lt(rsq11,rcutoff2))
2505 {
2506
2507 r11 = _mm_mul_ps(rsq11,rinv11);
2508 r11 = _mm_andnot_ps(dummy_mask,r11);
2509
2510 /* EWALD ELECTROSTATICS */
2511
2512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2513 ewrt = _mm_mul_ps(r11,ewtabscale);
2514 ewitab = _mm_cvttps_epi32(ewrt);
2515 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2516 ewitab = _mm_slli_epi32(ewitab,2);
2517 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2518 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2519 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2520 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2521 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2522 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2523 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2524 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2525 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2526
2527 d = _mm_sub_ps(r11,rswitch);
2528 d = _mm_max_ps(d,_mm_setzero_ps());
2529 d2 = _mm_mul_ps(d,d);
2530 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2531
2532 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2533
2534 /* Evaluate switch function */
2535 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2536 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2537 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2538
2539 fscal = felec;
2540
2541 fscal = _mm_and_ps(fscal,cutoff_mask);
2542
2543 fscal = _mm_andnot_ps(dummy_mask,fscal);
2544
2545 /* Calculate temporary vectorial force */
2546 tx = _mm_mul_ps(fscal,dx11);
2547 ty = _mm_mul_ps(fscal,dy11);
2548 tz = _mm_mul_ps(fscal,dz11);
2549
2550 /* Update vectorial force */
2551 fix1 = _mm_add_ps(fix1,tx);
2552 fiy1 = _mm_add_ps(fiy1,ty);
2553 fiz1 = _mm_add_ps(fiz1,tz);
2554
2555 fjx1 = _mm_add_ps(fjx1,tx);
2556 fjy1 = _mm_add_ps(fjy1,ty);
2557 fjz1 = _mm_add_ps(fjz1,tz);
2558
2559 }
2560
2561 /**************************
2562 * CALCULATE INTERACTIONS *
2563 **************************/
2564
2565 if (gmx_mm_any_lt(rsq12,rcutoff2))
2566 {
2567
2568 r12 = _mm_mul_ps(rsq12,rinv12);
2569 r12 = _mm_andnot_ps(dummy_mask,r12);
2570
2571 /* EWALD ELECTROSTATICS */
2572
2573 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2574 ewrt = _mm_mul_ps(r12,ewtabscale);
2575 ewitab = _mm_cvttps_epi32(ewrt);
2576 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2577 ewitab = _mm_slli_epi32(ewitab,2);
2578 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2579 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2580 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2581 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2582 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2583 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2584 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2585 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2586 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2587
2588 d = _mm_sub_ps(r12,rswitch);
2589 d = _mm_max_ps(d,_mm_setzero_ps());
2590 d2 = _mm_mul_ps(d,d);
2591 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2592
2593 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2594
2595 /* Evaluate switch function */
2596 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2597 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2598 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2599
2600 fscal = felec;
2601
2602 fscal = _mm_and_ps(fscal,cutoff_mask);
2603
2604 fscal = _mm_andnot_ps(dummy_mask,fscal);
2605
2606 /* Calculate temporary vectorial force */
2607 tx = _mm_mul_ps(fscal,dx12);
2608 ty = _mm_mul_ps(fscal,dy12);
2609 tz = _mm_mul_ps(fscal,dz12);
2610
2611 /* Update vectorial force */
2612 fix1 = _mm_add_ps(fix1,tx);
2613 fiy1 = _mm_add_ps(fiy1,ty);
2614 fiz1 = _mm_add_ps(fiz1,tz);
2615
2616 fjx2 = _mm_add_ps(fjx2,tx);
2617 fjy2 = _mm_add_ps(fjy2,ty);
2618 fjz2 = _mm_add_ps(fjz2,tz);
2619
2620 }
2621
2622 /**************************
2623 * CALCULATE INTERACTIONS *
2624 **************************/
2625
2626 if (gmx_mm_any_lt(rsq13,rcutoff2))
2627 {
2628
2629 r13 = _mm_mul_ps(rsq13,rinv13);
2630 r13 = _mm_andnot_ps(dummy_mask,r13);
2631
2632 /* EWALD ELECTROSTATICS */
2633
2634 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2635 ewrt = _mm_mul_ps(r13,ewtabscale);
2636 ewitab = _mm_cvttps_epi32(ewrt);
2637 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2638 ewitab = _mm_slli_epi32(ewitab,2);
2639 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2640 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2641 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2642 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2643 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2644 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2645 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2646 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2647 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2648
2649 d = _mm_sub_ps(r13,rswitch);
2650 d = _mm_max_ps(d,_mm_setzero_ps());
2651 d2 = _mm_mul_ps(d,d);
2652 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2653
2654 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2655
2656 /* Evaluate switch function */
2657 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2658 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2659 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2660
2661 fscal = felec;
2662
2663 fscal = _mm_and_ps(fscal,cutoff_mask);
2664
2665 fscal = _mm_andnot_ps(dummy_mask,fscal);
2666
2667 /* Calculate temporary vectorial force */
2668 tx = _mm_mul_ps(fscal,dx13);
2669 ty = _mm_mul_ps(fscal,dy13);
2670 tz = _mm_mul_ps(fscal,dz13);
2671
2672 /* Update vectorial force */
2673 fix1 = _mm_add_ps(fix1,tx);
2674 fiy1 = _mm_add_ps(fiy1,ty);
2675 fiz1 = _mm_add_ps(fiz1,tz);
2676
2677 fjx3 = _mm_add_ps(fjx3,tx);
2678 fjy3 = _mm_add_ps(fjy3,ty);
2679 fjz3 = _mm_add_ps(fjz3,tz);
2680
2681 }
2682
2683 /**************************
2684 * CALCULATE INTERACTIONS *
2685 **************************/
2686
2687 if (gmx_mm_any_lt(rsq21,rcutoff2))
2688 {
2689
2690 r21 = _mm_mul_ps(rsq21,rinv21);
2691 r21 = _mm_andnot_ps(dummy_mask,r21);
2692
2693 /* EWALD ELECTROSTATICS */
2694
2695 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2696 ewrt = _mm_mul_ps(r21,ewtabscale);
2697 ewitab = _mm_cvttps_epi32(ewrt);
2698 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2699 ewitab = _mm_slli_epi32(ewitab,2);
2700 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2701 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2702 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2703 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2704 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2705 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2706 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2707 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2708 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2709
2710 d = _mm_sub_ps(r21,rswitch);
2711 d = _mm_max_ps(d,_mm_setzero_ps());
2712 d2 = _mm_mul_ps(d,d);
2713 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2714
2715 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2716
2717 /* Evaluate switch function */
2718 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2719 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2720 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2721
2722 fscal = felec;
2723
2724 fscal = _mm_and_ps(fscal,cutoff_mask);
2725
2726 fscal = _mm_andnot_ps(dummy_mask,fscal);
2727
2728 /* Calculate temporary vectorial force */
2729 tx = _mm_mul_ps(fscal,dx21);
2730 ty = _mm_mul_ps(fscal,dy21);
2731 tz = _mm_mul_ps(fscal,dz21);
2732
2733 /* Update vectorial force */
2734 fix2 = _mm_add_ps(fix2,tx);
2735 fiy2 = _mm_add_ps(fiy2,ty);
2736 fiz2 = _mm_add_ps(fiz2,tz);
2737
2738 fjx1 = _mm_add_ps(fjx1,tx);
2739 fjy1 = _mm_add_ps(fjy1,ty);
2740 fjz1 = _mm_add_ps(fjz1,tz);
2741
2742 }
2743
2744 /**************************
2745 * CALCULATE INTERACTIONS *
2746 **************************/
2747
2748 if (gmx_mm_any_lt(rsq22,rcutoff2))
2749 {
2750
2751 r22 = _mm_mul_ps(rsq22,rinv22);
2752 r22 = _mm_andnot_ps(dummy_mask,r22);
2753
2754 /* EWALD ELECTROSTATICS */
2755
2756 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2757 ewrt = _mm_mul_ps(r22,ewtabscale);
2758 ewitab = _mm_cvttps_epi32(ewrt);
2759 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2760 ewitab = _mm_slli_epi32(ewitab,2);
2761 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2762 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2763 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2764 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2765 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2766 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2767 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2768 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2769 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2770
2771 d = _mm_sub_ps(r22,rswitch);
2772 d = _mm_max_ps(d,_mm_setzero_ps());
2773 d2 = _mm_mul_ps(d,d);
2774 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2775
2776 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2777
2778 /* Evaluate switch function */
2779 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2780 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2781 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2782
2783 fscal = felec;
2784
2785 fscal = _mm_and_ps(fscal,cutoff_mask);
2786
2787 fscal = _mm_andnot_ps(dummy_mask,fscal);
2788
2789 /* Calculate temporary vectorial force */
2790 tx = _mm_mul_ps(fscal,dx22);
2791 ty = _mm_mul_ps(fscal,dy22);
2792 tz = _mm_mul_ps(fscal,dz22);
2793
2794 /* Update vectorial force */
2795 fix2 = _mm_add_ps(fix2,tx);
2796 fiy2 = _mm_add_ps(fiy2,ty);
2797 fiz2 = _mm_add_ps(fiz2,tz);
2798
2799 fjx2 = _mm_add_ps(fjx2,tx);
2800 fjy2 = _mm_add_ps(fjy2,ty);
2801 fjz2 = _mm_add_ps(fjz2,tz);
2802
2803 }
2804
2805 /**************************
2806 * CALCULATE INTERACTIONS *
2807 **************************/
2808
2809 if (gmx_mm_any_lt(rsq23,rcutoff2))
2810 {
2811
2812 r23 = _mm_mul_ps(rsq23,rinv23);
2813 r23 = _mm_andnot_ps(dummy_mask,r23);
2814
2815 /* EWALD ELECTROSTATICS */
2816
2817 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2818 ewrt = _mm_mul_ps(r23,ewtabscale);
2819 ewitab = _mm_cvttps_epi32(ewrt);
2820 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2821 ewitab = _mm_slli_epi32(ewitab,2);
2822 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2823 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2824 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2825 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2826 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2827 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2828 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2829 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2830 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2831
2832 d = _mm_sub_ps(r23,rswitch);
2833 d = _mm_max_ps(d,_mm_setzero_ps());
2834 d2 = _mm_mul_ps(d,d);
2835 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2836
2837 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2838
2839 /* Evaluate switch function */
2840 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2841 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2842 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2843
2844 fscal = felec;
2845
2846 fscal = _mm_and_ps(fscal,cutoff_mask);
2847
2848 fscal = _mm_andnot_ps(dummy_mask,fscal);
2849
2850 /* Calculate temporary vectorial force */
2851 tx = _mm_mul_ps(fscal,dx23);
2852 ty = _mm_mul_ps(fscal,dy23);
2853 tz = _mm_mul_ps(fscal,dz23);
2854
2855 /* Update vectorial force */
2856 fix2 = _mm_add_ps(fix2,tx);
2857 fiy2 = _mm_add_ps(fiy2,ty);
2858 fiz2 = _mm_add_ps(fiz2,tz);
2859
2860 fjx3 = _mm_add_ps(fjx3,tx);
2861 fjy3 = _mm_add_ps(fjy3,ty);
2862 fjz3 = _mm_add_ps(fjz3,tz);
2863
2864 }
2865
2866 /**************************
2867 * CALCULATE INTERACTIONS *
2868 **************************/
2869
2870 if (gmx_mm_any_lt(rsq31,rcutoff2))
2871 {
2872
2873 r31 = _mm_mul_ps(rsq31,rinv31);
2874 r31 = _mm_andnot_ps(dummy_mask,r31);
2875
2876 /* EWALD ELECTROSTATICS */
2877
2878 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2879 ewrt = _mm_mul_ps(r31,ewtabscale);
2880 ewitab = _mm_cvttps_epi32(ewrt);
2881 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2882 ewitab = _mm_slli_epi32(ewitab,2);
2883 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2884 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2885 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2886 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2887 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2888 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2889 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2890 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2891 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2892
2893 d = _mm_sub_ps(r31,rswitch);
2894 d = _mm_max_ps(d,_mm_setzero_ps());
2895 d2 = _mm_mul_ps(d,d);
2896 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2897
2898 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2899
2900 /* Evaluate switch function */
2901 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2902 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2903 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2904
2905 fscal = felec;
2906
2907 fscal = _mm_and_ps(fscal,cutoff_mask);
2908
2909 fscal = _mm_andnot_ps(dummy_mask,fscal);
2910
2911 /* Calculate temporary vectorial force */
2912 tx = _mm_mul_ps(fscal,dx31);
2913 ty = _mm_mul_ps(fscal,dy31);
2914 tz = _mm_mul_ps(fscal,dz31);
2915
2916 /* Update vectorial force */
2917 fix3 = _mm_add_ps(fix3,tx);
2918 fiy3 = _mm_add_ps(fiy3,ty);
2919 fiz3 = _mm_add_ps(fiz3,tz);
2920
2921 fjx1 = _mm_add_ps(fjx1,tx);
2922 fjy1 = _mm_add_ps(fjy1,ty);
2923 fjz1 = _mm_add_ps(fjz1,tz);
2924
2925 }
2926
2927 /**************************
2928 * CALCULATE INTERACTIONS *
2929 **************************/
2930
2931 if (gmx_mm_any_lt(rsq32,rcutoff2))
2932 {
2933
2934 r32 = _mm_mul_ps(rsq32,rinv32);
2935 r32 = _mm_andnot_ps(dummy_mask,r32);
2936
2937 /* EWALD ELECTROSTATICS */
2938
2939 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2940 ewrt = _mm_mul_ps(r32,ewtabscale);
2941 ewitab = _mm_cvttps_epi32(ewrt);
2942 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2943 ewitab = _mm_slli_epi32(ewitab,2);
2944 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2945 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2946 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2947 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2948 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2949 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2950 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2951 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2952 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2953
2954 d = _mm_sub_ps(r32,rswitch);
2955 d = _mm_max_ps(d,_mm_setzero_ps());
2956 d2 = _mm_mul_ps(d,d);
2957 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2958
2959 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2960
2961 /* Evaluate switch function */
2962 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2963 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2964 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2965
2966 fscal = felec;
2967
2968 fscal = _mm_and_ps(fscal,cutoff_mask);
2969
2970 fscal = _mm_andnot_ps(dummy_mask,fscal);
2971
2972 /* Calculate temporary vectorial force */
2973 tx = _mm_mul_ps(fscal,dx32);
2974 ty = _mm_mul_ps(fscal,dy32);
2975 tz = _mm_mul_ps(fscal,dz32);
2976
2977 /* Update vectorial force */
2978 fix3 = _mm_add_ps(fix3,tx);
2979 fiy3 = _mm_add_ps(fiy3,ty);
2980 fiz3 = _mm_add_ps(fiz3,tz);
2981
2982 fjx2 = _mm_add_ps(fjx2,tx);
2983 fjy2 = _mm_add_ps(fjy2,ty);
2984 fjz2 = _mm_add_ps(fjz2,tz);
2985
2986 }
2987
2988 /**************************
2989 * CALCULATE INTERACTIONS *
2990 **************************/
2991
2992 if (gmx_mm_any_lt(rsq33,rcutoff2))
2993 {
2994
2995 r33 = _mm_mul_ps(rsq33,rinv33);
2996 r33 = _mm_andnot_ps(dummy_mask,r33);
2997
2998 /* EWALD ELECTROSTATICS */
2999
3000 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3001 ewrt = _mm_mul_ps(r33,ewtabscale);
3002 ewitab = _mm_cvttps_epi32(ewrt);
3003 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
3004 ewitab = _mm_slli_epi32(ewitab,2);
3005 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
3006 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
3007 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
3008 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
3009 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
3010 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3011 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3012 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3013 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3014
3015 d = _mm_sub_ps(r33,rswitch);
3016 d = _mm_max_ps(d,_mm_setzero_ps());
3017 d2 = _mm_mul_ps(d,d);
3018 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3019
3020 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3021
3022 /* Evaluate switch function */
3023 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3024 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3025 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
3026
3027 fscal = felec;
3028
3029 fscal = _mm_and_ps(fscal,cutoff_mask);
3030
3031 fscal = _mm_andnot_ps(dummy_mask,fscal);
3032
3033 /* Calculate temporary vectorial force */
3034 tx = _mm_mul_ps(fscal,dx33);
3035 ty = _mm_mul_ps(fscal,dy33);
3036 tz = _mm_mul_ps(fscal,dz33);
3037
3038 /* Update vectorial force */
3039 fix3 = _mm_add_ps(fix3,tx);
3040 fiy3 = _mm_add_ps(fiy3,ty);
3041 fiz3 = _mm_add_ps(fiz3,tz);
3042
3043 fjx3 = _mm_add_ps(fjx3,tx);
3044 fjy3 = _mm_add_ps(fjy3,ty);
3045 fjz3 = _mm_add_ps(fjz3,tz);
3046
3047 }
3048
3049 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3050 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3051 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3052 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3053
3054 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM3,fjptrB+DIM3,fjptrC+DIM3,fjptrD+DIM3,
3055 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3056
3057 /* Inner loop uses 567 flops */
3058 }
3059
3060 /* End of innermost loop */
3061
3062 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3063 f+i_coord_offset+DIM3,fshift+i_shift_offset);
3064
3065 /* Increment number of inner iterations */
3066 inneriter += j_index_end - j_index_start;
3067
3068 /* Outer loop uses 18 flops */
3069 }
3070
3071 /* Increment number of outer iterations */
3072 outeriter += nri;
3073
3074 /* Update outer/inner flops */
3075
3076 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567)(nrnb)->n[eNR_NBKERNEL_ELEC_W4W4_F] += outeriter*18 + inneriter
*567
;
3077}