Bug Summary

File:gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse4_1_single.c
Location:line 1905, column 5
Description:Value stored to 'd' is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwioffset1;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 int vdwioffset2;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwioffset3;
92 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 real *charge;
113 int nvdwtype;
114 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 int *vdwtype;
116 real *vdwparam;
117 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
118 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
119 __m128i ewitab;
120 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
121 real *ewtab;
122 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
123 real rswitch_scalar,d_scalar;
124 __m128 dummy_mask,cutoff_mask;
125 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
126 __m128 one = _mm_set1_ps(1.0);
127 __m128 two = _mm_set1_ps(2.0);
128 x = xx[0];
129 f = ff[0];
130
131 nri = nlist->nri;
132 iinr = nlist->iinr;
133 jindex = nlist->jindex;
134 jjnr = nlist->jjnr;
135 shiftidx = nlist->shift;
136 gid = nlist->gid;
137 shiftvec = fr->shift_vec[0];
138 fshift = fr->fshift[0];
139 facel = _mm_set1_ps(fr->epsfac);
140 charge = mdatoms->chargeA;
141 nvdwtype = fr->ntype;
142 vdwparam = fr->nbfp;
143 vdwtype = mdatoms->typeA;
144
145 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
146 ewtab = fr->ic->tabq_coul_FDV0;
147 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
148 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
149
150 /* Setup water-specific parameters */
151 inr = nlist->iinr[0];
152 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
153 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
154 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
155 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
156
157 jq1 = _mm_set1_ps(charge[inr+1]);
158 jq2 = _mm_set1_ps(charge[inr+2]);
159 jq3 = _mm_set1_ps(charge[inr+3]);
160 vdwjidx0A = 2*vdwtype[inr+0];
161 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
162 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
163 qq11 = _mm_mul_ps(iq1,jq1);
164 qq12 = _mm_mul_ps(iq1,jq2);
165 qq13 = _mm_mul_ps(iq1,jq3);
166 qq21 = _mm_mul_ps(iq2,jq1);
167 qq22 = _mm_mul_ps(iq2,jq2);
168 qq23 = _mm_mul_ps(iq2,jq3);
169 qq31 = _mm_mul_ps(iq3,jq1);
170 qq32 = _mm_mul_ps(iq3,jq2);
171 qq33 = _mm_mul_ps(iq3,jq3);
172
173 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
174 rcutoff_scalar = fr->rcoulomb;
175 rcutoff = _mm_set1_ps(rcutoff_scalar);
176 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
177
178 rswitch_scalar = fr->rcoulomb_switch;
179 rswitch = _mm_set1_ps(rswitch_scalar);
180 /* Setup switch parameters */
181 d_scalar = rcutoff_scalar-rswitch_scalar;
182 d = _mm_set1_ps(d_scalar);
183 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
184 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
186 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
187 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
188 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
189
190 /* Avoid stupid compiler warnings */
191 jnrA = jnrB = jnrC = jnrD = 0;
192 j_coord_offsetA = 0;
193 j_coord_offsetB = 0;
194 j_coord_offsetC = 0;
195 j_coord_offsetD = 0;
196
197 outeriter = 0;
198 inneriter = 0;
199
200 for(iidx=0;iidx<4*DIM3;iidx++)
201 {
202 scratch[iidx] = 0.0;
203 }
204
205 /* Start outer loop over neighborlists */
206 for(iidx=0; iidx<nri; iidx++)
207 {
208 /* Load shift vector for this list */
209 i_shift_offset = DIM3*shiftidx[iidx];
210
211 /* Load limits for loop over neighbors */
212 j_index_start = jindex[iidx];
213 j_index_end = jindex[iidx+1];
214
215 /* Get outer coordinate index */
216 inr = iinr[iidx];
217 i_coord_offset = DIM3*inr;
218
219 /* Load i particle coords and add shift vector */
220 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
221 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
222
223 fix0 = _mm_setzero_ps();
224 fiy0 = _mm_setzero_ps();
225 fiz0 = _mm_setzero_ps();
226 fix1 = _mm_setzero_ps();
227 fiy1 = _mm_setzero_ps();
228 fiz1 = _mm_setzero_ps();
229 fix2 = _mm_setzero_ps();
230 fiy2 = _mm_setzero_ps();
231 fiz2 = _mm_setzero_ps();
232 fix3 = _mm_setzero_ps();
233 fiy3 = _mm_setzero_ps();
234 fiz3 = _mm_setzero_ps();
235
236 /* Reset potential sums */
237 velecsum = _mm_setzero_ps();
238 vvdwsum = _mm_setzero_ps();
239
240 /* Start inner kernel loop */
241 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
242 {
243
244 /* Get j neighbor index, and coordinate index */
245 jnrA = jjnr[jidx];
246 jnrB = jjnr[jidx+1];
247 jnrC = jjnr[jidx+2];
248 jnrD = jjnr[jidx+3];
249 j_coord_offsetA = DIM3*jnrA;
250 j_coord_offsetB = DIM3*jnrB;
251 j_coord_offsetC = DIM3*jnrC;
252 j_coord_offsetD = DIM3*jnrD;
253
254 /* load j atom coordinates */
255 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
256 x+j_coord_offsetC,x+j_coord_offsetD,
257 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
258 &jy2,&jz2,&jx3,&jy3,&jz3);
259
260 /* Calculate displacement vector */
261 dx00 = _mm_sub_ps(ix0,jx0);
262 dy00 = _mm_sub_ps(iy0,jy0);
263 dz00 = _mm_sub_ps(iz0,jz0);
264 dx11 = _mm_sub_ps(ix1,jx1);
265 dy11 = _mm_sub_ps(iy1,jy1);
266 dz11 = _mm_sub_ps(iz1,jz1);
267 dx12 = _mm_sub_ps(ix1,jx2);
268 dy12 = _mm_sub_ps(iy1,jy2);
269 dz12 = _mm_sub_ps(iz1,jz2);
270 dx13 = _mm_sub_ps(ix1,jx3);
271 dy13 = _mm_sub_ps(iy1,jy3);
272 dz13 = _mm_sub_ps(iz1,jz3);
273 dx21 = _mm_sub_ps(ix2,jx1);
274 dy21 = _mm_sub_ps(iy2,jy1);
275 dz21 = _mm_sub_ps(iz2,jz1);
276 dx22 = _mm_sub_ps(ix2,jx2);
277 dy22 = _mm_sub_ps(iy2,jy2);
278 dz22 = _mm_sub_ps(iz2,jz2);
279 dx23 = _mm_sub_ps(ix2,jx3);
280 dy23 = _mm_sub_ps(iy2,jy3);
281 dz23 = _mm_sub_ps(iz2,jz3);
282 dx31 = _mm_sub_ps(ix3,jx1);
283 dy31 = _mm_sub_ps(iy3,jy1);
284 dz31 = _mm_sub_ps(iz3,jz1);
285 dx32 = _mm_sub_ps(ix3,jx2);
286 dy32 = _mm_sub_ps(iy3,jy2);
287 dz32 = _mm_sub_ps(iz3,jz2);
288 dx33 = _mm_sub_ps(ix3,jx3);
289 dy33 = _mm_sub_ps(iy3,jy3);
290 dz33 = _mm_sub_ps(iz3,jz3);
291
292 /* Calculate squared distance and things based on it */
293 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
294 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
295 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
296 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
297 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
298 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
299 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
300 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
301 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
302 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
303
304 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
305 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
306 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
307 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
308 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
309 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
310 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
311 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
312 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
313 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
314
315 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
316 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
317 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
318 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
319 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
320 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
321 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
322 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
323 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
324 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
325
326 fjx0 = _mm_setzero_ps();
327 fjy0 = _mm_setzero_ps();
328 fjz0 = _mm_setzero_ps();
329 fjx1 = _mm_setzero_ps();
330 fjy1 = _mm_setzero_ps();
331 fjz1 = _mm_setzero_ps();
332 fjx2 = _mm_setzero_ps();
333 fjy2 = _mm_setzero_ps();
334 fjz2 = _mm_setzero_ps();
335 fjx3 = _mm_setzero_ps();
336 fjy3 = _mm_setzero_ps();
337 fjz3 = _mm_setzero_ps();
338
339 /**************************
340 * CALCULATE INTERACTIONS *
341 **************************/
342
343 if (gmx_mm_any_lt(rsq00,rcutoff2))
344 {
345
346 r00 = _mm_mul_ps(rsq00,rinv00);
347
348 /* LENNARD-JONES DISPERSION/REPULSION */
349
350 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
351 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
352 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
353 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
354 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
355
356 d = _mm_sub_ps(r00,rswitch);
357 d = _mm_max_ps(d,_mm_setzero_ps());
358 d2 = _mm_mul_ps(d,d);
359 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
360
361 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
362
363 /* Evaluate switch function */
364 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
365 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
366 vvdw = _mm_mul_ps(vvdw,sw);
367 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
368
369 /* Update potential sum for this i atom from the interaction with this j atom. */
370 vvdw = _mm_and_ps(vvdw,cutoff_mask);
371 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
372
373 fscal = fvdw;
374
375 fscal = _mm_and_ps(fscal,cutoff_mask);
376
377 /* Calculate temporary vectorial force */
378 tx = _mm_mul_ps(fscal,dx00);
379 ty = _mm_mul_ps(fscal,dy00);
380 tz = _mm_mul_ps(fscal,dz00);
381
382 /* Update vectorial force */
383 fix0 = _mm_add_ps(fix0,tx);
384 fiy0 = _mm_add_ps(fiy0,ty);
385 fiz0 = _mm_add_ps(fiz0,tz);
386
387 fjx0 = _mm_add_ps(fjx0,tx);
388 fjy0 = _mm_add_ps(fjy0,ty);
389 fjz0 = _mm_add_ps(fjz0,tz);
390
391 }
392
393 /**************************
394 * CALCULATE INTERACTIONS *
395 **************************/
396
397 if (gmx_mm_any_lt(rsq11,rcutoff2))
398 {
399
400 r11 = _mm_mul_ps(rsq11,rinv11);
401
402 /* EWALD ELECTROSTATICS */
403
404 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
405 ewrt = _mm_mul_ps(r11,ewtabscale);
406 ewitab = _mm_cvttps_epi32(ewrt);
407 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
408 ewitab = _mm_slli_epi32(ewitab,2);
409 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
410 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
411 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
412 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
413 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
414 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
415 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
416 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
417 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
418
419 d = _mm_sub_ps(r11,rswitch);
420 d = _mm_max_ps(d,_mm_setzero_ps());
421 d2 = _mm_mul_ps(d,d);
422 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
423
424 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
425
426 /* Evaluate switch function */
427 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
428 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
429 velec = _mm_mul_ps(velec,sw);
430 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
431
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velec = _mm_and_ps(velec,cutoff_mask);
434 velecsum = _mm_add_ps(velecsum,velec);
435
436 fscal = felec;
437
438 fscal = _mm_and_ps(fscal,cutoff_mask);
439
440 /* Calculate temporary vectorial force */
441 tx = _mm_mul_ps(fscal,dx11);
442 ty = _mm_mul_ps(fscal,dy11);
443 tz = _mm_mul_ps(fscal,dz11);
444
445 /* Update vectorial force */
446 fix1 = _mm_add_ps(fix1,tx);
447 fiy1 = _mm_add_ps(fiy1,ty);
448 fiz1 = _mm_add_ps(fiz1,tz);
449
450 fjx1 = _mm_add_ps(fjx1,tx);
451 fjy1 = _mm_add_ps(fjy1,ty);
452 fjz1 = _mm_add_ps(fjz1,tz);
453
454 }
455
456 /**************************
457 * CALCULATE INTERACTIONS *
458 **************************/
459
460 if (gmx_mm_any_lt(rsq12,rcutoff2))
461 {
462
463 r12 = _mm_mul_ps(rsq12,rinv12);
464
465 /* EWALD ELECTROSTATICS */
466
467 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
468 ewrt = _mm_mul_ps(r12,ewtabscale);
469 ewitab = _mm_cvttps_epi32(ewrt);
470 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
471 ewitab = _mm_slli_epi32(ewitab,2);
472 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
473 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
474 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
475 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
476 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
477 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
478 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
479 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
480 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
481
482 d = _mm_sub_ps(r12,rswitch);
483 d = _mm_max_ps(d,_mm_setzero_ps());
484 d2 = _mm_mul_ps(d,d);
485 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
486
487 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
488
489 /* Evaluate switch function */
490 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
491 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
492 velec = _mm_mul_ps(velec,sw);
493 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
494
495 /* Update potential sum for this i atom from the interaction with this j atom. */
496 velec = _mm_and_ps(velec,cutoff_mask);
497 velecsum = _mm_add_ps(velecsum,velec);
498
499 fscal = felec;
500
501 fscal = _mm_and_ps(fscal,cutoff_mask);
502
503 /* Calculate temporary vectorial force */
504 tx = _mm_mul_ps(fscal,dx12);
505 ty = _mm_mul_ps(fscal,dy12);
506 tz = _mm_mul_ps(fscal,dz12);
507
508 /* Update vectorial force */
509 fix1 = _mm_add_ps(fix1,tx);
510 fiy1 = _mm_add_ps(fiy1,ty);
511 fiz1 = _mm_add_ps(fiz1,tz);
512
513 fjx2 = _mm_add_ps(fjx2,tx);
514 fjy2 = _mm_add_ps(fjy2,ty);
515 fjz2 = _mm_add_ps(fjz2,tz);
516
517 }
518
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
522
523 if (gmx_mm_any_lt(rsq13,rcutoff2))
524 {
525
526 r13 = _mm_mul_ps(rsq13,rinv13);
527
528 /* EWALD ELECTROSTATICS */
529
530 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
531 ewrt = _mm_mul_ps(r13,ewtabscale);
532 ewitab = _mm_cvttps_epi32(ewrt);
533 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
534 ewitab = _mm_slli_epi32(ewitab,2);
535 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
536 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
537 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
538 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
539 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
540 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
541 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
542 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
543 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
544
545 d = _mm_sub_ps(r13,rswitch);
546 d = _mm_max_ps(d,_mm_setzero_ps());
547 d2 = _mm_mul_ps(d,d);
548 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
549
550 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
551
552 /* Evaluate switch function */
553 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
554 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
555 velec = _mm_mul_ps(velec,sw);
556 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
557
558 /* Update potential sum for this i atom from the interaction with this j atom. */
559 velec = _mm_and_ps(velec,cutoff_mask);
560 velecsum = _mm_add_ps(velecsum,velec);
561
562 fscal = felec;
563
564 fscal = _mm_and_ps(fscal,cutoff_mask);
565
566 /* Calculate temporary vectorial force */
567 tx = _mm_mul_ps(fscal,dx13);
568 ty = _mm_mul_ps(fscal,dy13);
569 tz = _mm_mul_ps(fscal,dz13);
570
571 /* Update vectorial force */
572 fix1 = _mm_add_ps(fix1,tx);
573 fiy1 = _mm_add_ps(fiy1,ty);
574 fiz1 = _mm_add_ps(fiz1,tz);
575
576 fjx3 = _mm_add_ps(fjx3,tx);
577 fjy3 = _mm_add_ps(fjy3,ty);
578 fjz3 = _mm_add_ps(fjz3,tz);
579
580 }
581
582 /**************************
583 * CALCULATE INTERACTIONS *
584 **************************/
585
586 if (gmx_mm_any_lt(rsq21,rcutoff2))
587 {
588
589 r21 = _mm_mul_ps(rsq21,rinv21);
590
591 /* EWALD ELECTROSTATICS */
592
593 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
594 ewrt = _mm_mul_ps(r21,ewtabscale);
595 ewitab = _mm_cvttps_epi32(ewrt);
596 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
597 ewitab = _mm_slli_epi32(ewitab,2);
598 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
599 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
600 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
601 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
602 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
603 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
604 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
605 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
606 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
607
608 d = _mm_sub_ps(r21,rswitch);
609 d = _mm_max_ps(d,_mm_setzero_ps());
610 d2 = _mm_mul_ps(d,d);
611 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
612
613 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
614
615 /* Evaluate switch function */
616 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
617 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
618 velec = _mm_mul_ps(velec,sw);
619 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
620
621 /* Update potential sum for this i atom from the interaction with this j atom. */
622 velec = _mm_and_ps(velec,cutoff_mask);
623 velecsum = _mm_add_ps(velecsum,velec);
624
625 fscal = felec;
626
627 fscal = _mm_and_ps(fscal,cutoff_mask);
628
629 /* Calculate temporary vectorial force */
630 tx = _mm_mul_ps(fscal,dx21);
631 ty = _mm_mul_ps(fscal,dy21);
632 tz = _mm_mul_ps(fscal,dz21);
633
634 /* Update vectorial force */
635 fix2 = _mm_add_ps(fix2,tx);
636 fiy2 = _mm_add_ps(fiy2,ty);
637 fiz2 = _mm_add_ps(fiz2,tz);
638
639 fjx1 = _mm_add_ps(fjx1,tx);
640 fjy1 = _mm_add_ps(fjy1,ty);
641 fjz1 = _mm_add_ps(fjz1,tz);
642
643 }
644
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
648
649 if (gmx_mm_any_lt(rsq22,rcutoff2))
650 {
651
652 r22 = _mm_mul_ps(rsq22,rinv22);
653
654 /* EWALD ELECTROSTATICS */
655
656 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
657 ewrt = _mm_mul_ps(r22,ewtabscale);
658 ewitab = _mm_cvttps_epi32(ewrt);
659 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
660 ewitab = _mm_slli_epi32(ewitab,2);
661 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
662 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
663 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
664 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
665 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
666 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
667 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
668 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
669 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
670
671 d = _mm_sub_ps(r22,rswitch);
672 d = _mm_max_ps(d,_mm_setzero_ps());
673 d2 = _mm_mul_ps(d,d);
674 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
675
676 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
677
678 /* Evaluate switch function */
679 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
680 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
681 velec = _mm_mul_ps(velec,sw);
682 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
683
684 /* Update potential sum for this i atom from the interaction with this j atom. */
685 velec = _mm_and_ps(velec,cutoff_mask);
686 velecsum = _mm_add_ps(velecsum,velec);
687
688 fscal = felec;
689
690 fscal = _mm_and_ps(fscal,cutoff_mask);
691
692 /* Calculate temporary vectorial force */
693 tx = _mm_mul_ps(fscal,dx22);
694 ty = _mm_mul_ps(fscal,dy22);
695 tz = _mm_mul_ps(fscal,dz22);
696
697 /* Update vectorial force */
698 fix2 = _mm_add_ps(fix2,tx);
699 fiy2 = _mm_add_ps(fiy2,ty);
700 fiz2 = _mm_add_ps(fiz2,tz);
701
702 fjx2 = _mm_add_ps(fjx2,tx);
703 fjy2 = _mm_add_ps(fjy2,ty);
704 fjz2 = _mm_add_ps(fjz2,tz);
705
706 }
707
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
711
712 if (gmx_mm_any_lt(rsq23,rcutoff2))
713 {
714
715 r23 = _mm_mul_ps(rsq23,rinv23);
716
717 /* EWALD ELECTROSTATICS */
718
719 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
720 ewrt = _mm_mul_ps(r23,ewtabscale);
721 ewitab = _mm_cvttps_epi32(ewrt);
722 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
723 ewitab = _mm_slli_epi32(ewitab,2);
724 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
725 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
726 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
727 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
728 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
729 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
730 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
731 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
732 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
733
734 d = _mm_sub_ps(r23,rswitch);
735 d = _mm_max_ps(d,_mm_setzero_ps());
736 d2 = _mm_mul_ps(d,d);
737 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
738
739 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
740
741 /* Evaluate switch function */
742 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
743 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
744 velec = _mm_mul_ps(velec,sw);
745 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
746
747 /* Update potential sum for this i atom from the interaction with this j atom. */
748 velec = _mm_and_ps(velec,cutoff_mask);
749 velecsum = _mm_add_ps(velecsum,velec);
750
751 fscal = felec;
752
753 fscal = _mm_and_ps(fscal,cutoff_mask);
754
755 /* Calculate temporary vectorial force */
756 tx = _mm_mul_ps(fscal,dx23);
757 ty = _mm_mul_ps(fscal,dy23);
758 tz = _mm_mul_ps(fscal,dz23);
759
760 /* Update vectorial force */
761 fix2 = _mm_add_ps(fix2,tx);
762 fiy2 = _mm_add_ps(fiy2,ty);
763 fiz2 = _mm_add_ps(fiz2,tz);
764
765 fjx3 = _mm_add_ps(fjx3,tx);
766 fjy3 = _mm_add_ps(fjy3,ty);
767 fjz3 = _mm_add_ps(fjz3,tz);
768
769 }
770
771 /**************************
772 * CALCULATE INTERACTIONS *
773 **************************/
774
775 if (gmx_mm_any_lt(rsq31,rcutoff2))
776 {
777
778 r31 = _mm_mul_ps(rsq31,rinv31);
779
780 /* EWALD ELECTROSTATICS */
781
782 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
783 ewrt = _mm_mul_ps(r31,ewtabscale);
784 ewitab = _mm_cvttps_epi32(ewrt);
785 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
786 ewitab = _mm_slli_epi32(ewitab,2);
787 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
788 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
789 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
790 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
791 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
792 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
793 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
794 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
795 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
796
797 d = _mm_sub_ps(r31,rswitch);
798 d = _mm_max_ps(d,_mm_setzero_ps());
799 d2 = _mm_mul_ps(d,d);
800 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
801
802 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
803
804 /* Evaluate switch function */
805 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
806 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
807 velec = _mm_mul_ps(velec,sw);
808 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
809
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec = _mm_and_ps(velec,cutoff_mask);
812 velecsum = _mm_add_ps(velecsum,velec);
813
814 fscal = felec;
815
816 fscal = _mm_and_ps(fscal,cutoff_mask);
817
818 /* Calculate temporary vectorial force */
819 tx = _mm_mul_ps(fscal,dx31);
820 ty = _mm_mul_ps(fscal,dy31);
821 tz = _mm_mul_ps(fscal,dz31);
822
823 /* Update vectorial force */
824 fix3 = _mm_add_ps(fix3,tx);
825 fiy3 = _mm_add_ps(fiy3,ty);
826 fiz3 = _mm_add_ps(fiz3,tz);
827
828 fjx1 = _mm_add_ps(fjx1,tx);
829 fjy1 = _mm_add_ps(fjy1,ty);
830 fjz1 = _mm_add_ps(fjz1,tz);
831
832 }
833
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
837
838 if (gmx_mm_any_lt(rsq32,rcutoff2))
839 {
840
841 r32 = _mm_mul_ps(rsq32,rinv32);
842
843 /* EWALD ELECTROSTATICS */
844
845 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
846 ewrt = _mm_mul_ps(r32,ewtabscale);
847 ewitab = _mm_cvttps_epi32(ewrt);
848 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
849 ewitab = _mm_slli_epi32(ewitab,2);
850 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
851 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
852 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
853 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
854 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
855 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
856 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
857 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
858 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
859
860 d = _mm_sub_ps(r32,rswitch);
861 d = _mm_max_ps(d,_mm_setzero_ps());
862 d2 = _mm_mul_ps(d,d);
863 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
864
865 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
866
867 /* Evaluate switch function */
868 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
869 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
870 velec = _mm_mul_ps(velec,sw);
871 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
872
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm_and_ps(velec,cutoff_mask);
875 velecsum = _mm_add_ps(velecsum,velec);
876
877 fscal = felec;
878
879 fscal = _mm_and_ps(fscal,cutoff_mask);
880
881 /* Calculate temporary vectorial force */
882 tx = _mm_mul_ps(fscal,dx32);
883 ty = _mm_mul_ps(fscal,dy32);
884 tz = _mm_mul_ps(fscal,dz32);
885
886 /* Update vectorial force */
887 fix3 = _mm_add_ps(fix3,tx);
888 fiy3 = _mm_add_ps(fiy3,ty);
889 fiz3 = _mm_add_ps(fiz3,tz);
890
891 fjx2 = _mm_add_ps(fjx2,tx);
892 fjy2 = _mm_add_ps(fjy2,ty);
893 fjz2 = _mm_add_ps(fjz2,tz);
894
895 }
896
897 /**************************
898 * CALCULATE INTERACTIONS *
899 **************************/
900
901 if (gmx_mm_any_lt(rsq33,rcutoff2))
902 {
903
904 r33 = _mm_mul_ps(rsq33,rinv33);
905
906 /* EWALD ELECTROSTATICS */
907
908 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
909 ewrt = _mm_mul_ps(r33,ewtabscale);
910 ewitab = _mm_cvttps_epi32(ewrt);
911 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
912 ewitab = _mm_slli_epi32(ewitab,2);
913 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
914 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
915 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
916 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
917 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
918 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
919 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
920 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
921 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
922
923 d = _mm_sub_ps(r33,rswitch);
924 d = _mm_max_ps(d,_mm_setzero_ps());
925 d2 = _mm_mul_ps(d,d);
926 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
927
928 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
929
930 /* Evaluate switch function */
931 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
932 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
933 velec = _mm_mul_ps(velec,sw);
934 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
935
936 /* Update potential sum for this i atom from the interaction with this j atom. */
937 velec = _mm_and_ps(velec,cutoff_mask);
938 velecsum = _mm_add_ps(velecsum,velec);
939
940 fscal = felec;
941
942 fscal = _mm_and_ps(fscal,cutoff_mask);
943
944 /* Calculate temporary vectorial force */
945 tx = _mm_mul_ps(fscal,dx33);
946 ty = _mm_mul_ps(fscal,dy33);
947 tz = _mm_mul_ps(fscal,dz33);
948
949 /* Update vectorial force */
950 fix3 = _mm_add_ps(fix3,tx);
951 fiy3 = _mm_add_ps(fiy3,ty);
952 fiz3 = _mm_add_ps(fiz3,tz);
953
954 fjx3 = _mm_add_ps(fjx3,tx);
955 fjy3 = _mm_add_ps(fjy3,ty);
956 fjz3 = _mm_add_ps(fjz3,tz);
957
958 }
959
960 fjptrA = f+j_coord_offsetA;
961 fjptrB = f+j_coord_offsetB;
962 fjptrC = f+j_coord_offsetC;
963 fjptrD = f+j_coord_offsetD;
964
965 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
966 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
967 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
968
969 /* Inner loop uses 647 flops */
970 }
971
972 if(jidx<j_index_end)
973 {
974
975 /* Get j neighbor index, and coordinate index */
976 jnrlistA = jjnr[jidx];
977 jnrlistB = jjnr[jidx+1];
978 jnrlistC = jjnr[jidx+2];
979 jnrlistD = jjnr[jidx+3];
980 /* Sign of each element will be negative for non-real atoms.
981 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
982 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
983 */
984 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
985 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
986 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
987 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
988 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
989 j_coord_offsetA = DIM3*jnrA;
990 j_coord_offsetB = DIM3*jnrB;
991 j_coord_offsetC = DIM3*jnrC;
992 j_coord_offsetD = DIM3*jnrD;
993
994 /* load j atom coordinates */
995 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
996 x+j_coord_offsetC,x+j_coord_offsetD,
997 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
998 &jy2,&jz2,&jx3,&jy3,&jz3);
999
1000 /* Calculate displacement vector */
1001 dx00 = _mm_sub_ps(ix0,jx0);
1002 dy00 = _mm_sub_ps(iy0,jy0);
1003 dz00 = _mm_sub_ps(iz0,jz0);
1004 dx11 = _mm_sub_ps(ix1,jx1);
1005 dy11 = _mm_sub_ps(iy1,jy1);
1006 dz11 = _mm_sub_ps(iz1,jz1);
1007 dx12 = _mm_sub_ps(ix1,jx2);
1008 dy12 = _mm_sub_ps(iy1,jy2);
1009 dz12 = _mm_sub_ps(iz1,jz2);
1010 dx13 = _mm_sub_ps(ix1,jx3);
1011 dy13 = _mm_sub_ps(iy1,jy3);
1012 dz13 = _mm_sub_ps(iz1,jz3);
1013 dx21 = _mm_sub_ps(ix2,jx1);
1014 dy21 = _mm_sub_ps(iy2,jy1);
1015 dz21 = _mm_sub_ps(iz2,jz1);
1016 dx22 = _mm_sub_ps(ix2,jx2);
1017 dy22 = _mm_sub_ps(iy2,jy2);
1018 dz22 = _mm_sub_ps(iz2,jz2);
1019 dx23 = _mm_sub_ps(ix2,jx3);
1020 dy23 = _mm_sub_ps(iy2,jy3);
1021 dz23 = _mm_sub_ps(iz2,jz3);
1022 dx31 = _mm_sub_ps(ix3,jx1);
1023 dy31 = _mm_sub_ps(iy3,jy1);
1024 dz31 = _mm_sub_ps(iz3,jz1);
1025 dx32 = _mm_sub_ps(ix3,jx2);
1026 dy32 = _mm_sub_ps(iy3,jy2);
1027 dz32 = _mm_sub_ps(iz3,jz2);
1028 dx33 = _mm_sub_ps(ix3,jx3);
1029 dy33 = _mm_sub_ps(iy3,jy3);
1030 dz33 = _mm_sub_ps(iz3,jz3);
1031
1032 /* Calculate squared distance and things based on it */
1033 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1034 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1035 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1036 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1037 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1038 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1039 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1040 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1041 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1042 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1043
1044 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1045 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1046 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1047 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
1048 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1049 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1050 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
1051 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
1052 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
1053 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
1054
1055 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1056 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1057 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1058 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1059 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1060 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1061 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1062 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1063 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1064 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1065
1066 fjx0 = _mm_setzero_ps();
1067 fjy0 = _mm_setzero_ps();
1068 fjz0 = _mm_setzero_ps();
1069 fjx1 = _mm_setzero_ps();
1070 fjy1 = _mm_setzero_ps();
1071 fjz1 = _mm_setzero_ps();
1072 fjx2 = _mm_setzero_ps();
1073 fjy2 = _mm_setzero_ps();
1074 fjz2 = _mm_setzero_ps();
1075 fjx3 = _mm_setzero_ps();
1076 fjy3 = _mm_setzero_ps();
1077 fjz3 = _mm_setzero_ps();
1078
1079 /**************************
1080 * CALCULATE INTERACTIONS *
1081 **************************/
1082
1083 if (gmx_mm_any_lt(rsq00,rcutoff2))
1084 {
1085
1086 r00 = _mm_mul_ps(rsq00,rinv00);
1087 r00 = _mm_andnot_ps(dummy_mask,r00);
1088
1089 /* LENNARD-JONES DISPERSION/REPULSION */
1090
1091 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1092 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1093 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1094 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1095 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1096
1097 d = _mm_sub_ps(r00,rswitch);
1098 d = _mm_max_ps(d,_mm_setzero_ps());
1099 d2 = _mm_mul_ps(d,d);
1100 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1101
1102 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1103
1104 /* Evaluate switch function */
1105 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1106 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1107 vvdw = _mm_mul_ps(vvdw,sw);
1108 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1109
1110 /* Update potential sum for this i atom from the interaction with this j atom. */
1111 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1112 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1113 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1114
1115 fscal = fvdw;
1116
1117 fscal = _mm_and_ps(fscal,cutoff_mask);
1118
1119 fscal = _mm_andnot_ps(dummy_mask,fscal);
1120
1121 /* Calculate temporary vectorial force */
1122 tx = _mm_mul_ps(fscal,dx00);
1123 ty = _mm_mul_ps(fscal,dy00);
1124 tz = _mm_mul_ps(fscal,dz00);
1125
1126 /* Update vectorial force */
1127 fix0 = _mm_add_ps(fix0,tx);
1128 fiy0 = _mm_add_ps(fiy0,ty);
1129 fiz0 = _mm_add_ps(fiz0,tz);
1130
1131 fjx0 = _mm_add_ps(fjx0,tx);
1132 fjy0 = _mm_add_ps(fjy0,ty);
1133 fjz0 = _mm_add_ps(fjz0,tz);
1134
1135 }
1136
1137 /**************************
1138 * CALCULATE INTERACTIONS *
1139 **************************/
1140
1141 if (gmx_mm_any_lt(rsq11,rcutoff2))
1142 {
1143
1144 r11 = _mm_mul_ps(rsq11,rinv11);
1145 r11 = _mm_andnot_ps(dummy_mask,r11);
1146
1147 /* EWALD ELECTROSTATICS */
1148
1149 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150 ewrt = _mm_mul_ps(r11,ewtabscale);
1151 ewitab = _mm_cvttps_epi32(ewrt);
1152 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1153 ewitab = _mm_slli_epi32(ewitab,2);
1154 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1155 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1156 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1157 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1158 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1159 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1160 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1161 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1162 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1163
1164 d = _mm_sub_ps(r11,rswitch);
1165 d = _mm_max_ps(d,_mm_setzero_ps());
1166 d2 = _mm_mul_ps(d,d);
1167 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1168
1169 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1170
1171 /* Evaluate switch function */
1172 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1173 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1174 velec = _mm_mul_ps(velec,sw);
1175 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1176
1177 /* Update potential sum for this i atom from the interaction with this j atom. */
1178 velec = _mm_and_ps(velec,cutoff_mask);
1179 velec = _mm_andnot_ps(dummy_mask,velec);
1180 velecsum = _mm_add_ps(velecsum,velec);
1181
1182 fscal = felec;
1183
1184 fscal = _mm_and_ps(fscal,cutoff_mask);
1185
1186 fscal = _mm_andnot_ps(dummy_mask,fscal);
1187
1188 /* Calculate temporary vectorial force */
1189 tx = _mm_mul_ps(fscal,dx11);
1190 ty = _mm_mul_ps(fscal,dy11);
1191 tz = _mm_mul_ps(fscal,dz11);
1192
1193 /* Update vectorial force */
1194 fix1 = _mm_add_ps(fix1,tx);
1195 fiy1 = _mm_add_ps(fiy1,ty);
1196 fiz1 = _mm_add_ps(fiz1,tz);
1197
1198 fjx1 = _mm_add_ps(fjx1,tx);
1199 fjy1 = _mm_add_ps(fjy1,ty);
1200 fjz1 = _mm_add_ps(fjz1,tz);
1201
1202 }
1203
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1207
1208 if (gmx_mm_any_lt(rsq12,rcutoff2))
1209 {
1210
1211 r12 = _mm_mul_ps(rsq12,rinv12);
1212 r12 = _mm_andnot_ps(dummy_mask,r12);
1213
1214 /* EWALD ELECTROSTATICS */
1215
1216 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217 ewrt = _mm_mul_ps(r12,ewtabscale);
1218 ewitab = _mm_cvttps_epi32(ewrt);
1219 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1220 ewitab = _mm_slli_epi32(ewitab,2);
1221 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1222 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1223 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1224 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1225 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1226 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1227 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1228 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1229 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1230
1231 d = _mm_sub_ps(r12,rswitch);
1232 d = _mm_max_ps(d,_mm_setzero_ps());
1233 d2 = _mm_mul_ps(d,d);
1234 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1235
1236 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1237
1238 /* Evaluate switch function */
1239 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1240 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1241 velec = _mm_mul_ps(velec,sw);
1242 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1243
1244 /* Update potential sum for this i atom from the interaction with this j atom. */
1245 velec = _mm_and_ps(velec,cutoff_mask);
1246 velec = _mm_andnot_ps(dummy_mask,velec);
1247 velecsum = _mm_add_ps(velecsum,velec);
1248
1249 fscal = felec;
1250
1251 fscal = _mm_and_ps(fscal,cutoff_mask);
1252
1253 fscal = _mm_andnot_ps(dummy_mask,fscal);
1254
1255 /* Calculate temporary vectorial force */
1256 tx = _mm_mul_ps(fscal,dx12);
1257 ty = _mm_mul_ps(fscal,dy12);
1258 tz = _mm_mul_ps(fscal,dz12);
1259
1260 /* Update vectorial force */
1261 fix1 = _mm_add_ps(fix1,tx);
1262 fiy1 = _mm_add_ps(fiy1,ty);
1263 fiz1 = _mm_add_ps(fiz1,tz);
1264
1265 fjx2 = _mm_add_ps(fjx2,tx);
1266 fjy2 = _mm_add_ps(fjy2,ty);
1267 fjz2 = _mm_add_ps(fjz2,tz);
1268
1269 }
1270
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1274
1275 if (gmx_mm_any_lt(rsq13,rcutoff2))
1276 {
1277
1278 r13 = _mm_mul_ps(rsq13,rinv13);
1279 r13 = _mm_andnot_ps(dummy_mask,r13);
1280
1281 /* EWALD ELECTROSTATICS */
1282
1283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284 ewrt = _mm_mul_ps(r13,ewtabscale);
1285 ewitab = _mm_cvttps_epi32(ewrt);
1286 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1287 ewitab = _mm_slli_epi32(ewitab,2);
1288 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1289 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1290 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1291 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1292 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1293 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1294 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1295 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1296 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1297
1298 d = _mm_sub_ps(r13,rswitch);
1299 d = _mm_max_ps(d,_mm_setzero_ps());
1300 d2 = _mm_mul_ps(d,d);
1301 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1302
1303 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1304
1305 /* Evaluate switch function */
1306 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1307 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1308 velec = _mm_mul_ps(velec,sw);
1309 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1310
1311 /* Update potential sum for this i atom from the interaction with this j atom. */
1312 velec = _mm_and_ps(velec,cutoff_mask);
1313 velec = _mm_andnot_ps(dummy_mask,velec);
1314 velecsum = _mm_add_ps(velecsum,velec);
1315
1316 fscal = felec;
1317
1318 fscal = _mm_and_ps(fscal,cutoff_mask);
1319
1320 fscal = _mm_andnot_ps(dummy_mask,fscal);
1321
1322 /* Calculate temporary vectorial force */
1323 tx = _mm_mul_ps(fscal,dx13);
1324 ty = _mm_mul_ps(fscal,dy13);
1325 tz = _mm_mul_ps(fscal,dz13);
1326
1327 /* Update vectorial force */
1328 fix1 = _mm_add_ps(fix1,tx);
1329 fiy1 = _mm_add_ps(fiy1,ty);
1330 fiz1 = _mm_add_ps(fiz1,tz);
1331
1332 fjx3 = _mm_add_ps(fjx3,tx);
1333 fjy3 = _mm_add_ps(fjy3,ty);
1334 fjz3 = _mm_add_ps(fjz3,tz);
1335
1336 }
1337
1338 /**************************
1339 * CALCULATE INTERACTIONS *
1340 **************************/
1341
1342 if (gmx_mm_any_lt(rsq21,rcutoff2))
1343 {
1344
1345 r21 = _mm_mul_ps(rsq21,rinv21);
1346 r21 = _mm_andnot_ps(dummy_mask,r21);
1347
1348 /* EWALD ELECTROSTATICS */
1349
1350 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1351 ewrt = _mm_mul_ps(r21,ewtabscale);
1352 ewitab = _mm_cvttps_epi32(ewrt);
1353 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1354 ewitab = _mm_slli_epi32(ewitab,2);
1355 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1356 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1357 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1358 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1359 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1360 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1361 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1362 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1363 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1364
1365 d = _mm_sub_ps(r21,rswitch);
1366 d = _mm_max_ps(d,_mm_setzero_ps());
1367 d2 = _mm_mul_ps(d,d);
1368 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1369
1370 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1371
1372 /* Evaluate switch function */
1373 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1374 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1375 velec = _mm_mul_ps(velec,sw);
1376 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1377
1378 /* Update potential sum for this i atom from the interaction with this j atom. */
1379 velec = _mm_and_ps(velec,cutoff_mask);
1380 velec = _mm_andnot_ps(dummy_mask,velec);
1381 velecsum = _mm_add_ps(velecsum,velec);
1382
1383 fscal = felec;
1384
1385 fscal = _mm_and_ps(fscal,cutoff_mask);
1386
1387 fscal = _mm_andnot_ps(dummy_mask,fscal);
1388
1389 /* Calculate temporary vectorial force */
1390 tx = _mm_mul_ps(fscal,dx21);
1391 ty = _mm_mul_ps(fscal,dy21);
1392 tz = _mm_mul_ps(fscal,dz21);
1393
1394 /* Update vectorial force */
1395 fix2 = _mm_add_ps(fix2,tx);
1396 fiy2 = _mm_add_ps(fiy2,ty);
1397 fiz2 = _mm_add_ps(fiz2,tz);
1398
1399 fjx1 = _mm_add_ps(fjx1,tx);
1400 fjy1 = _mm_add_ps(fjy1,ty);
1401 fjz1 = _mm_add_ps(fjz1,tz);
1402
1403 }
1404
1405 /**************************
1406 * CALCULATE INTERACTIONS *
1407 **************************/
1408
1409 if (gmx_mm_any_lt(rsq22,rcutoff2))
1410 {
1411
1412 r22 = _mm_mul_ps(rsq22,rinv22);
1413 r22 = _mm_andnot_ps(dummy_mask,r22);
1414
1415 /* EWALD ELECTROSTATICS */
1416
1417 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1418 ewrt = _mm_mul_ps(r22,ewtabscale);
1419 ewitab = _mm_cvttps_epi32(ewrt);
1420 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1421 ewitab = _mm_slli_epi32(ewitab,2);
1422 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1423 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1424 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1425 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1426 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1427 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1428 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1429 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1430 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1431
1432 d = _mm_sub_ps(r22,rswitch);
1433 d = _mm_max_ps(d,_mm_setzero_ps());
1434 d2 = _mm_mul_ps(d,d);
1435 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1436
1437 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1438
1439 /* Evaluate switch function */
1440 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1441 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1442 velec = _mm_mul_ps(velec,sw);
1443 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1444
1445 /* Update potential sum for this i atom from the interaction with this j atom. */
1446 velec = _mm_and_ps(velec,cutoff_mask);
1447 velec = _mm_andnot_ps(dummy_mask,velec);
1448 velecsum = _mm_add_ps(velecsum,velec);
1449
1450 fscal = felec;
1451
1452 fscal = _mm_and_ps(fscal,cutoff_mask);
1453
1454 fscal = _mm_andnot_ps(dummy_mask,fscal);
1455
1456 /* Calculate temporary vectorial force */
1457 tx = _mm_mul_ps(fscal,dx22);
1458 ty = _mm_mul_ps(fscal,dy22);
1459 tz = _mm_mul_ps(fscal,dz22);
1460
1461 /* Update vectorial force */
1462 fix2 = _mm_add_ps(fix2,tx);
1463 fiy2 = _mm_add_ps(fiy2,ty);
1464 fiz2 = _mm_add_ps(fiz2,tz);
1465
1466 fjx2 = _mm_add_ps(fjx2,tx);
1467 fjy2 = _mm_add_ps(fjy2,ty);
1468 fjz2 = _mm_add_ps(fjz2,tz);
1469
1470 }
1471
1472 /**************************
1473 * CALCULATE INTERACTIONS *
1474 **************************/
1475
1476 if (gmx_mm_any_lt(rsq23,rcutoff2))
1477 {
1478
1479 r23 = _mm_mul_ps(rsq23,rinv23);
1480 r23 = _mm_andnot_ps(dummy_mask,r23);
1481
1482 /* EWALD ELECTROSTATICS */
1483
1484 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1485 ewrt = _mm_mul_ps(r23,ewtabscale);
1486 ewitab = _mm_cvttps_epi32(ewrt);
1487 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1488 ewitab = _mm_slli_epi32(ewitab,2);
1489 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1490 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1491 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1492 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1493 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1494 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1495 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1496 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1497 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1498
1499 d = _mm_sub_ps(r23,rswitch);
1500 d = _mm_max_ps(d,_mm_setzero_ps());
1501 d2 = _mm_mul_ps(d,d);
1502 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1503
1504 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1505
1506 /* Evaluate switch function */
1507 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1508 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1509 velec = _mm_mul_ps(velec,sw);
1510 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1511
1512 /* Update potential sum for this i atom from the interaction with this j atom. */
1513 velec = _mm_and_ps(velec,cutoff_mask);
1514 velec = _mm_andnot_ps(dummy_mask,velec);
1515 velecsum = _mm_add_ps(velecsum,velec);
1516
1517 fscal = felec;
1518
1519 fscal = _mm_and_ps(fscal,cutoff_mask);
1520
1521 fscal = _mm_andnot_ps(dummy_mask,fscal);
1522
1523 /* Calculate temporary vectorial force */
1524 tx = _mm_mul_ps(fscal,dx23);
1525 ty = _mm_mul_ps(fscal,dy23);
1526 tz = _mm_mul_ps(fscal,dz23);
1527
1528 /* Update vectorial force */
1529 fix2 = _mm_add_ps(fix2,tx);
1530 fiy2 = _mm_add_ps(fiy2,ty);
1531 fiz2 = _mm_add_ps(fiz2,tz);
1532
1533 fjx3 = _mm_add_ps(fjx3,tx);
1534 fjy3 = _mm_add_ps(fjy3,ty);
1535 fjz3 = _mm_add_ps(fjz3,tz);
1536
1537 }
1538
1539 /**************************
1540 * CALCULATE INTERACTIONS *
1541 **************************/
1542
1543 if (gmx_mm_any_lt(rsq31,rcutoff2))
1544 {
1545
1546 r31 = _mm_mul_ps(rsq31,rinv31);
1547 r31 = _mm_andnot_ps(dummy_mask,r31);
1548
1549 /* EWALD ELECTROSTATICS */
1550
1551 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1552 ewrt = _mm_mul_ps(r31,ewtabscale);
1553 ewitab = _mm_cvttps_epi32(ewrt);
1554 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1555 ewitab = _mm_slli_epi32(ewitab,2);
1556 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1557 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1558 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1559 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1560 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1561 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1562 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1563 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1564 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1565
1566 d = _mm_sub_ps(r31,rswitch);
1567 d = _mm_max_ps(d,_mm_setzero_ps());
1568 d2 = _mm_mul_ps(d,d);
1569 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1570
1571 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1572
1573 /* Evaluate switch function */
1574 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1575 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1576 velec = _mm_mul_ps(velec,sw);
1577 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1578
1579 /* Update potential sum for this i atom from the interaction with this j atom. */
1580 velec = _mm_and_ps(velec,cutoff_mask);
1581 velec = _mm_andnot_ps(dummy_mask,velec);
1582 velecsum = _mm_add_ps(velecsum,velec);
1583
1584 fscal = felec;
1585
1586 fscal = _mm_and_ps(fscal,cutoff_mask);
1587
1588 fscal = _mm_andnot_ps(dummy_mask,fscal);
1589
1590 /* Calculate temporary vectorial force */
1591 tx = _mm_mul_ps(fscal,dx31);
1592 ty = _mm_mul_ps(fscal,dy31);
1593 tz = _mm_mul_ps(fscal,dz31);
1594
1595 /* Update vectorial force */
1596 fix3 = _mm_add_ps(fix3,tx);
1597 fiy3 = _mm_add_ps(fiy3,ty);
1598 fiz3 = _mm_add_ps(fiz3,tz);
1599
1600 fjx1 = _mm_add_ps(fjx1,tx);
1601 fjy1 = _mm_add_ps(fjy1,ty);
1602 fjz1 = _mm_add_ps(fjz1,tz);
1603
1604 }
1605
1606 /**************************
1607 * CALCULATE INTERACTIONS *
1608 **************************/
1609
1610 if (gmx_mm_any_lt(rsq32,rcutoff2))
1611 {
1612
1613 r32 = _mm_mul_ps(rsq32,rinv32);
1614 r32 = _mm_andnot_ps(dummy_mask,r32);
1615
1616 /* EWALD ELECTROSTATICS */
1617
1618 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1619 ewrt = _mm_mul_ps(r32,ewtabscale);
1620 ewitab = _mm_cvttps_epi32(ewrt);
1621 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1622 ewitab = _mm_slli_epi32(ewitab,2);
1623 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1624 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1625 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1626 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1627 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1628 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1629 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1630 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1631 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1632
1633 d = _mm_sub_ps(r32,rswitch);
1634 d = _mm_max_ps(d,_mm_setzero_ps());
1635 d2 = _mm_mul_ps(d,d);
1636 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1637
1638 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1639
1640 /* Evaluate switch function */
1641 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1642 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1643 velec = _mm_mul_ps(velec,sw);
1644 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1645
1646 /* Update potential sum for this i atom from the interaction with this j atom. */
1647 velec = _mm_and_ps(velec,cutoff_mask);
1648 velec = _mm_andnot_ps(dummy_mask,velec);
1649 velecsum = _mm_add_ps(velecsum,velec);
1650
1651 fscal = felec;
1652
1653 fscal = _mm_and_ps(fscal,cutoff_mask);
1654
1655 fscal = _mm_andnot_ps(dummy_mask,fscal);
1656
1657 /* Calculate temporary vectorial force */
1658 tx = _mm_mul_ps(fscal,dx32);
1659 ty = _mm_mul_ps(fscal,dy32);
1660 tz = _mm_mul_ps(fscal,dz32);
1661
1662 /* Update vectorial force */
1663 fix3 = _mm_add_ps(fix3,tx);
1664 fiy3 = _mm_add_ps(fiy3,ty);
1665 fiz3 = _mm_add_ps(fiz3,tz);
1666
1667 fjx2 = _mm_add_ps(fjx2,tx);
1668 fjy2 = _mm_add_ps(fjy2,ty);
1669 fjz2 = _mm_add_ps(fjz2,tz);
1670
1671 }
1672
1673 /**************************
1674 * CALCULATE INTERACTIONS *
1675 **************************/
1676
1677 if (gmx_mm_any_lt(rsq33,rcutoff2))
1678 {
1679
1680 r33 = _mm_mul_ps(rsq33,rinv33);
1681 r33 = _mm_andnot_ps(dummy_mask,r33);
1682
1683 /* EWALD ELECTROSTATICS */
1684
1685 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1686 ewrt = _mm_mul_ps(r33,ewtabscale);
1687 ewitab = _mm_cvttps_epi32(ewrt);
1688 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1689 ewitab = _mm_slli_epi32(ewitab,2);
1690 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1691 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1692 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1693 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1694 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1695 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1696 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1697 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1698 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1699
1700 d = _mm_sub_ps(r33,rswitch);
1701 d = _mm_max_ps(d,_mm_setzero_ps());
1702 d2 = _mm_mul_ps(d,d);
1703 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1704
1705 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1706
1707 /* Evaluate switch function */
1708 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1709 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1710 velec = _mm_mul_ps(velec,sw);
1711 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1712
1713 /* Update potential sum for this i atom from the interaction with this j atom. */
1714 velec = _mm_and_ps(velec,cutoff_mask);
1715 velec = _mm_andnot_ps(dummy_mask,velec);
1716 velecsum = _mm_add_ps(velecsum,velec);
1717
1718 fscal = felec;
1719
1720 fscal = _mm_and_ps(fscal,cutoff_mask);
1721
1722 fscal = _mm_andnot_ps(dummy_mask,fscal);
1723
1724 /* Calculate temporary vectorial force */
1725 tx = _mm_mul_ps(fscal,dx33);
1726 ty = _mm_mul_ps(fscal,dy33);
1727 tz = _mm_mul_ps(fscal,dz33);
1728
1729 /* Update vectorial force */
1730 fix3 = _mm_add_ps(fix3,tx);
1731 fiy3 = _mm_add_ps(fiy3,ty);
1732 fiz3 = _mm_add_ps(fiz3,tz);
1733
1734 fjx3 = _mm_add_ps(fjx3,tx);
1735 fjy3 = _mm_add_ps(fjy3,ty);
1736 fjz3 = _mm_add_ps(fjz3,tz);
1737
1738 }
1739
1740 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1741 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1742 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1743 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1744
1745 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1746 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1747 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1748
1749 /* Inner loop uses 657 flops */
1750 }
1751
1752 /* End of innermost loop */
1753
1754 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1755 f+i_coord_offset,fshift+i_shift_offset);
1756
1757 ggid = gid[iidx];
1758 /* Update potential energies */
1759 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1760 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1761
1762 /* Increment number of inner iterations */
1763 inneriter += j_index_end - j_index_start;
1764
1765 /* Outer loop uses 26 flops */
1766 }
1767
1768 /* Increment number of outer iterations */
1769 outeriter += nri;
1770
1771 /* Update outer/inner flops */
1772
1773 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4W4_VF] += outeriter*26 +
inneriter*657
;
1774}
1775/*
1776 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1777 * Electrostatics interaction: Ewald
1778 * VdW interaction: LennardJones
1779 * Geometry: Water4-Water4
1780 * Calculate force/pot: Force
1781 */
1782void
1783nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1784 (t_nblist * gmx_restrict nlist,
1785 rvec * gmx_restrict xx,
1786 rvec * gmx_restrict ff,
1787 t_forcerec * gmx_restrict fr,
1788 t_mdatoms * gmx_restrict mdatoms,
1789 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1790 t_nrnb * gmx_restrict nrnb)
1791{
1792 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1793 * just 0 for non-waters.
1794 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1795 * jnr indices corresponding to data put in the four positions in the SIMD register.
1796 */
1797 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1798 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1799 int jnrA,jnrB,jnrC,jnrD;
1800 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1801 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1802 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1803 real rcutoff_scalar;
1804 real *shiftvec,*fshift,*x,*f;
1805 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1806 real scratch[4*DIM3];
1807 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1808 int vdwioffset0;
1809 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1810 int vdwioffset1;
1811 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1812 int vdwioffset2;
1813 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1814 int vdwioffset3;
1815 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1816 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1817 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1818 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1819 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1820 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1821 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1822 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1823 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1824 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1825 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1826 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1827 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1828 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1829 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1830 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1831 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1832 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1833 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1834 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1835 real *charge;
1836 int nvdwtype;
1837 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1838 int *vdwtype;
1839 real *vdwparam;
1840 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1841 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1842 __m128i ewitab;
1843 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1844 real *ewtab;
1845 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1846 real rswitch_scalar,d_scalar;
1847 __m128 dummy_mask,cutoff_mask;
1848 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1849 __m128 one = _mm_set1_ps(1.0);
1850 __m128 two = _mm_set1_ps(2.0);
1851 x = xx[0];
1852 f = ff[0];
1853
1854 nri = nlist->nri;
1855 iinr = nlist->iinr;
1856 jindex = nlist->jindex;
1857 jjnr = nlist->jjnr;
1858 shiftidx = nlist->shift;
1859 gid = nlist->gid;
1860 shiftvec = fr->shift_vec[0];
1861 fshift = fr->fshift[0];
1862 facel = _mm_set1_ps(fr->epsfac);
1863 charge = mdatoms->chargeA;
1864 nvdwtype = fr->ntype;
1865 vdwparam = fr->nbfp;
1866 vdwtype = mdatoms->typeA;
1867
1868 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1869 ewtab = fr->ic->tabq_coul_FDV0;
1870 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1871 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1872
1873 /* Setup water-specific parameters */
1874 inr = nlist->iinr[0];
1875 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1876 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1877 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1878 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1879
1880 jq1 = _mm_set1_ps(charge[inr+1]);
1881 jq2 = _mm_set1_ps(charge[inr+2]);
1882 jq3 = _mm_set1_ps(charge[inr+3]);
1883 vdwjidx0A = 2*vdwtype[inr+0];
1884 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1885 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1886 qq11 = _mm_mul_ps(iq1,jq1);
1887 qq12 = _mm_mul_ps(iq1,jq2);
1888 qq13 = _mm_mul_ps(iq1,jq3);
1889 qq21 = _mm_mul_ps(iq2,jq1);
1890 qq22 = _mm_mul_ps(iq2,jq2);
1891 qq23 = _mm_mul_ps(iq2,jq3);
1892 qq31 = _mm_mul_ps(iq3,jq1);
1893 qq32 = _mm_mul_ps(iq3,jq2);
1894 qq33 = _mm_mul_ps(iq3,jq3);
1895
1896 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1897 rcutoff_scalar = fr->rcoulomb;
1898 rcutoff = _mm_set1_ps(rcutoff_scalar);
1899 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1900
1901 rswitch_scalar = fr->rcoulomb_switch;
1902 rswitch = _mm_set1_ps(rswitch_scalar);
1903 /* Setup switch parameters */
1904 d_scalar = rcutoff_scalar-rswitch_scalar;
1905 d = _mm_set1_ps(d_scalar);
Value stored to 'd' is never read
1906 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1907 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1908 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1909 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1910 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1911 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1912
1913 /* Avoid stupid compiler warnings */
1914 jnrA = jnrB = jnrC = jnrD = 0;
1915 j_coord_offsetA = 0;
1916 j_coord_offsetB = 0;
1917 j_coord_offsetC = 0;
1918 j_coord_offsetD = 0;
1919
1920 outeriter = 0;
1921 inneriter = 0;
1922
1923 for(iidx=0;iidx<4*DIM3;iidx++)
1924 {
1925 scratch[iidx] = 0.0;
1926 }
1927
1928 /* Start outer loop over neighborlists */
1929 for(iidx=0; iidx<nri; iidx++)
1930 {
1931 /* Load shift vector for this list */
1932 i_shift_offset = DIM3*shiftidx[iidx];
1933
1934 /* Load limits for loop over neighbors */
1935 j_index_start = jindex[iidx];
1936 j_index_end = jindex[iidx+1];
1937
1938 /* Get outer coordinate index */
1939 inr = iinr[iidx];
1940 i_coord_offset = DIM3*inr;
1941
1942 /* Load i particle coords and add shift vector */
1943 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1944 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1945
1946 fix0 = _mm_setzero_ps();
1947 fiy0 = _mm_setzero_ps();
1948 fiz0 = _mm_setzero_ps();
1949 fix1 = _mm_setzero_ps();
1950 fiy1 = _mm_setzero_ps();
1951 fiz1 = _mm_setzero_ps();
1952 fix2 = _mm_setzero_ps();
1953 fiy2 = _mm_setzero_ps();
1954 fiz2 = _mm_setzero_ps();
1955 fix3 = _mm_setzero_ps();
1956 fiy3 = _mm_setzero_ps();
1957 fiz3 = _mm_setzero_ps();
1958
1959 /* Start inner kernel loop */
1960 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1961 {
1962
1963 /* Get j neighbor index, and coordinate index */
1964 jnrA = jjnr[jidx];
1965 jnrB = jjnr[jidx+1];
1966 jnrC = jjnr[jidx+2];
1967 jnrD = jjnr[jidx+3];
1968 j_coord_offsetA = DIM3*jnrA;
1969 j_coord_offsetB = DIM3*jnrB;
1970 j_coord_offsetC = DIM3*jnrC;
1971 j_coord_offsetD = DIM3*jnrD;
1972
1973 /* load j atom coordinates */
1974 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1975 x+j_coord_offsetC,x+j_coord_offsetD,
1976 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1977 &jy2,&jz2,&jx3,&jy3,&jz3);
1978
1979 /* Calculate displacement vector */
1980 dx00 = _mm_sub_ps(ix0,jx0);
1981 dy00 = _mm_sub_ps(iy0,jy0);
1982 dz00 = _mm_sub_ps(iz0,jz0);
1983 dx11 = _mm_sub_ps(ix1,jx1);
1984 dy11 = _mm_sub_ps(iy1,jy1);
1985 dz11 = _mm_sub_ps(iz1,jz1);
1986 dx12 = _mm_sub_ps(ix1,jx2);
1987 dy12 = _mm_sub_ps(iy1,jy2);
1988 dz12 = _mm_sub_ps(iz1,jz2);
1989 dx13 = _mm_sub_ps(ix1,jx3);
1990 dy13 = _mm_sub_ps(iy1,jy3);
1991 dz13 = _mm_sub_ps(iz1,jz3);
1992 dx21 = _mm_sub_ps(ix2,jx1);
1993 dy21 = _mm_sub_ps(iy2,jy1);
1994 dz21 = _mm_sub_ps(iz2,jz1);
1995 dx22 = _mm_sub_ps(ix2,jx2);
1996 dy22 = _mm_sub_ps(iy2,jy2);
1997 dz22 = _mm_sub_ps(iz2,jz2);
1998 dx23 = _mm_sub_ps(ix2,jx3);
1999 dy23 = _mm_sub_ps(iy2,jy3);
2000 dz23 = _mm_sub_ps(iz2,jz3);
2001 dx31 = _mm_sub_ps(ix3,jx1);
2002 dy31 = _mm_sub_ps(iy3,jy1);
2003 dz31 = _mm_sub_ps(iz3,jz1);
2004 dx32 = _mm_sub_ps(ix3,jx2);
2005 dy32 = _mm_sub_ps(iy3,jy2);
2006 dz32 = _mm_sub_ps(iz3,jz2);
2007 dx33 = _mm_sub_ps(ix3,jx3);
2008 dy33 = _mm_sub_ps(iy3,jy3);
2009 dz33 = _mm_sub_ps(iz3,jz3);
2010
2011 /* Calculate squared distance and things based on it */
2012 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2013 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2014 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2015 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2016 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2017 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2018 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2019 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2020 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2021 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2022
2023 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
2024 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
2025 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
2026 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
2027 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
2028 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
2029 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
2030 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
2031 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
2032 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
2033
2034 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2035 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2036 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2037 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2038 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2039 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2040 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2041 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2042 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2043 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2044
2045 fjx0 = _mm_setzero_ps();
2046 fjy0 = _mm_setzero_ps();
2047 fjz0 = _mm_setzero_ps();
2048 fjx1 = _mm_setzero_ps();
2049 fjy1 = _mm_setzero_ps();
2050 fjz1 = _mm_setzero_ps();
2051 fjx2 = _mm_setzero_ps();
2052 fjy2 = _mm_setzero_ps();
2053 fjz2 = _mm_setzero_ps();
2054 fjx3 = _mm_setzero_ps();
2055 fjy3 = _mm_setzero_ps();
2056 fjz3 = _mm_setzero_ps();
2057
2058 /**************************
2059 * CALCULATE INTERACTIONS *
2060 **************************/
2061
2062 if (gmx_mm_any_lt(rsq00,rcutoff2))
2063 {
2064
2065 r00 = _mm_mul_ps(rsq00,rinv00);
2066
2067 /* LENNARD-JONES DISPERSION/REPULSION */
2068
2069 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2070 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2071 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2072 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2073 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2074
2075 d = _mm_sub_ps(r00,rswitch);
2076 d = _mm_max_ps(d,_mm_setzero_ps());
2077 d2 = _mm_mul_ps(d,d);
2078 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2079
2080 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2081
2082 /* Evaluate switch function */
2083 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2084 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2085 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2086
2087 fscal = fvdw;
2088
2089 fscal = _mm_and_ps(fscal,cutoff_mask);
2090
2091 /* Calculate temporary vectorial force */
2092 tx = _mm_mul_ps(fscal,dx00);
2093 ty = _mm_mul_ps(fscal,dy00);
2094 tz = _mm_mul_ps(fscal,dz00);
2095
2096 /* Update vectorial force */
2097 fix0 = _mm_add_ps(fix0,tx);
2098 fiy0 = _mm_add_ps(fiy0,ty);
2099 fiz0 = _mm_add_ps(fiz0,tz);
2100
2101 fjx0 = _mm_add_ps(fjx0,tx);
2102 fjy0 = _mm_add_ps(fjy0,ty);
2103 fjz0 = _mm_add_ps(fjz0,tz);
2104
2105 }
2106
2107 /**************************
2108 * CALCULATE INTERACTIONS *
2109 **************************/
2110
2111 if (gmx_mm_any_lt(rsq11,rcutoff2))
2112 {
2113
2114 r11 = _mm_mul_ps(rsq11,rinv11);
2115
2116 /* EWALD ELECTROSTATICS */
2117
2118 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2119 ewrt = _mm_mul_ps(r11,ewtabscale);
2120 ewitab = _mm_cvttps_epi32(ewrt);
2121 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2122 ewitab = _mm_slli_epi32(ewitab,2);
2123 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2124 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2125 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2126 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2127 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2128 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2129 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2130 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2131 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2132
2133 d = _mm_sub_ps(r11,rswitch);
2134 d = _mm_max_ps(d,_mm_setzero_ps());
2135 d2 = _mm_mul_ps(d,d);
2136 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2137
2138 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2139
2140 /* Evaluate switch function */
2141 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2142 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2143 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2144
2145 fscal = felec;
2146
2147 fscal = _mm_and_ps(fscal,cutoff_mask);
2148
2149 /* Calculate temporary vectorial force */
2150 tx = _mm_mul_ps(fscal,dx11);
2151 ty = _mm_mul_ps(fscal,dy11);
2152 tz = _mm_mul_ps(fscal,dz11);
2153
2154 /* Update vectorial force */
2155 fix1 = _mm_add_ps(fix1,tx);
2156 fiy1 = _mm_add_ps(fiy1,ty);
2157 fiz1 = _mm_add_ps(fiz1,tz);
2158
2159 fjx1 = _mm_add_ps(fjx1,tx);
2160 fjy1 = _mm_add_ps(fjy1,ty);
2161 fjz1 = _mm_add_ps(fjz1,tz);
2162
2163 }
2164
2165 /**************************
2166 * CALCULATE INTERACTIONS *
2167 **************************/
2168
2169 if (gmx_mm_any_lt(rsq12,rcutoff2))
2170 {
2171
2172 r12 = _mm_mul_ps(rsq12,rinv12);
2173
2174 /* EWALD ELECTROSTATICS */
2175
2176 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2177 ewrt = _mm_mul_ps(r12,ewtabscale);
2178 ewitab = _mm_cvttps_epi32(ewrt);
2179 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2180 ewitab = _mm_slli_epi32(ewitab,2);
2181 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2182 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2183 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2184 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2185 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2186 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2187 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2188 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2189 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2190
2191 d = _mm_sub_ps(r12,rswitch);
2192 d = _mm_max_ps(d,_mm_setzero_ps());
2193 d2 = _mm_mul_ps(d,d);
2194 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2195
2196 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2197
2198 /* Evaluate switch function */
2199 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2200 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2201 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2202
2203 fscal = felec;
2204
2205 fscal = _mm_and_ps(fscal,cutoff_mask);
2206
2207 /* Calculate temporary vectorial force */
2208 tx = _mm_mul_ps(fscal,dx12);
2209 ty = _mm_mul_ps(fscal,dy12);
2210 tz = _mm_mul_ps(fscal,dz12);
2211
2212 /* Update vectorial force */
2213 fix1 = _mm_add_ps(fix1,tx);
2214 fiy1 = _mm_add_ps(fiy1,ty);
2215 fiz1 = _mm_add_ps(fiz1,tz);
2216
2217 fjx2 = _mm_add_ps(fjx2,tx);
2218 fjy2 = _mm_add_ps(fjy2,ty);
2219 fjz2 = _mm_add_ps(fjz2,tz);
2220
2221 }
2222
2223 /**************************
2224 * CALCULATE INTERACTIONS *
2225 **************************/
2226
2227 if (gmx_mm_any_lt(rsq13,rcutoff2))
2228 {
2229
2230 r13 = _mm_mul_ps(rsq13,rinv13);
2231
2232 /* EWALD ELECTROSTATICS */
2233
2234 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2235 ewrt = _mm_mul_ps(r13,ewtabscale);
2236 ewitab = _mm_cvttps_epi32(ewrt);
2237 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2238 ewitab = _mm_slli_epi32(ewitab,2);
2239 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2240 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2241 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2242 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2243 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2244 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2245 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2246 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2247 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2248
2249 d = _mm_sub_ps(r13,rswitch);
2250 d = _mm_max_ps(d,_mm_setzero_ps());
2251 d2 = _mm_mul_ps(d,d);
2252 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2253
2254 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2255
2256 /* Evaluate switch function */
2257 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2258 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2259 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2260
2261 fscal = felec;
2262
2263 fscal = _mm_and_ps(fscal,cutoff_mask);
2264
2265 /* Calculate temporary vectorial force */
2266 tx = _mm_mul_ps(fscal,dx13);
2267 ty = _mm_mul_ps(fscal,dy13);
2268 tz = _mm_mul_ps(fscal,dz13);
2269
2270 /* Update vectorial force */
2271 fix1 = _mm_add_ps(fix1,tx);
2272 fiy1 = _mm_add_ps(fiy1,ty);
2273 fiz1 = _mm_add_ps(fiz1,tz);
2274
2275 fjx3 = _mm_add_ps(fjx3,tx);
2276 fjy3 = _mm_add_ps(fjy3,ty);
2277 fjz3 = _mm_add_ps(fjz3,tz);
2278
2279 }
2280
2281 /**************************
2282 * CALCULATE INTERACTIONS *
2283 **************************/
2284
2285 if (gmx_mm_any_lt(rsq21,rcutoff2))
2286 {
2287
2288 r21 = _mm_mul_ps(rsq21,rinv21);
2289
2290 /* EWALD ELECTROSTATICS */
2291
2292 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2293 ewrt = _mm_mul_ps(r21,ewtabscale);
2294 ewitab = _mm_cvttps_epi32(ewrt);
2295 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2296 ewitab = _mm_slli_epi32(ewitab,2);
2297 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2298 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2299 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2300 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2301 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2302 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2303 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2304 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2305 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2306
2307 d = _mm_sub_ps(r21,rswitch);
2308 d = _mm_max_ps(d,_mm_setzero_ps());
2309 d2 = _mm_mul_ps(d,d);
2310 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2311
2312 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2313
2314 /* Evaluate switch function */
2315 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2316 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2317 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2318
2319 fscal = felec;
2320
2321 fscal = _mm_and_ps(fscal,cutoff_mask);
2322
2323 /* Calculate temporary vectorial force */
2324 tx = _mm_mul_ps(fscal,dx21);
2325 ty = _mm_mul_ps(fscal,dy21);
2326 tz = _mm_mul_ps(fscal,dz21);
2327
2328 /* Update vectorial force */
2329 fix2 = _mm_add_ps(fix2,tx);
2330 fiy2 = _mm_add_ps(fiy2,ty);
2331 fiz2 = _mm_add_ps(fiz2,tz);
2332
2333 fjx1 = _mm_add_ps(fjx1,tx);
2334 fjy1 = _mm_add_ps(fjy1,ty);
2335 fjz1 = _mm_add_ps(fjz1,tz);
2336
2337 }
2338
2339 /**************************
2340 * CALCULATE INTERACTIONS *
2341 **************************/
2342
2343 if (gmx_mm_any_lt(rsq22,rcutoff2))
2344 {
2345
2346 r22 = _mm_mul_ps(rsq22,rinv22);
2347
2348 /* EWALD ELECTROSTATICS */
2349
2350 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2351 ewrt = _mm_mul_ps(r22,ewtabscale);
2352 ewitab = _mm_cvttps_epi32(ewrt);
2353 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2354 ewitab = _mm_slli_epi32(ewitab,2);
2355 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2356 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2357 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2358 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2359 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2360 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2361 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2362 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2363 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2364
2365 d = _mm_sub_ps(r22,rswitch);
2366 d = _mm_max_ps(d,_mm_setzero_ps());
2367 d2 = _mm_mul_ps(d,d);
2368 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2369
2370 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2371
2372 /* Evaluate switch function */
2373 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2374 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2375 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2376
2377 fscal = felec;
2378
2379 fscal = _mm_and_ps(fscal,cutoff_mask);
2380
2381 /* Calculate temporary vectorial force */
2382 tx = _mm_mul_ps(fscal,dx22);
2383 ty = _mm_mul_ps(fscal,dy22);
2384 tz = _mm_mul_ps(fscal,dz22);
2385
2386 /* Update vectorial force */
2387 fix2 = _mm_add_ps(fix2,tx);
2388 fiy2 = _mm_add_ps(fiy2,ty);
2389 fiz2 = _mm_add_ps(fiz2,tz);
2390
2391 fjx2 = _mm_add_ps(fjx2,tx);
2392 fjy2 = _mm_add_ps(fjy2,ty);
2393 fjz2 = _mm_add_ps(fjz2,tz);
2394
2395 }
2396
2397 /**************************
2398 * CALCULATE INTERACTIONS *
2399 **************************/
2400
2401 if (gmx_mm_any_lt(rsq23,rcutoff2))
2402 {
2403
2404 r23 = _mm_mul_ps(rsq23,rinv23);
2405
2406 /* EWALD ELECTROSTATICS */
2407
2408 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2409 ewrt = _mm_mul_ps(r23,ewtabscale);
2410 ewitab = _mm_cvttps_epi32(ewrt);
2411 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2412 ewitab = _mm_slli_epi32(ewitab,2);
2413 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2414 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2415 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2416 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2417 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2418 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2419 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2420 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2421 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2422
2423 d = _mm_sub_ps(r23,rswitch);
2424 d = _mm_max_ps(d,_mm_setzero_ps());
2425 d2 = _mm_mul_ps(d,d);
2426 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2427
2428 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2429
2430 /* Evaluate switch function */
2431 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2432 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2433 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2434
2435 fscal = felec;
2436
2437 fscal = _mm_and_ps(fscal,cutoff_mask);
2438
2439 /* Calculate temporary vectorial force */
2440 tx = _mm_mul_ps(fscal,dx23);
2441 ty = _mm_mul_ps(fscal,dy23);
2442 tz = _mm_mul_ps(fscal,dz23);
2443
2444 /* Update vectorial force */
2445 fix2 = _mm_add_ps(fix2,tx);
2446 fiy2 = _mm_add_ps(fiy2,ty);
2447 fiz2 = _mm_add_ps(fiz2,tz);
2448
2449 fjx3 = _mm_add_ps(fjx3,tx);
2450 fjy3 = _mm_add_ps(fjy3,ty);
2451 fjz3 = _mm_add_ps(fjz3,tz);
2452
2453 }
2454
2455 /**************************
2456 * CALCULATE INTERACTIONS *
2457 **************************/
2458
2459 if (gmx_mm_any_lt(rsq31,rcutoff2))
2460 {
2461
2462 r31 = _mm_mul_ps(rsq31,rinv31);
2463
2464 /* EWALD ELECTROSTATICS */
2465
2466 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2467 ewrt = _mm_mul_ps(r31,ewtabscale);
2468 ewitab = _mm_cvttps_epi32(ewrt);
2469 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2470 ewitab = _mm_slli_epi32(ewitab,2);
2471 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2472 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2473 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2474 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2475 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2476 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2477 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2478 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2479 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2480
2481 d = _mm_sub_ps(r31,rswitch);
2482 d = _mm_max_ps(d,_mm_setzero_ps());
2483 d2 = _mm_mul_ps(d,d);
2484 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2485
2486 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2487
2488 /* Evaluate switch function */
2489 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2490 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2491 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2492
2493 fscal = felec;
2494
2495 fscal = _mm_and_ps(fscal,cutoff_mask);
2496
2497 /* Calculate temporary vectorial force */
2498 tx = _mm_mul_ps(fscal,dx31);
2499 ty = _mm_mul_ps(fscal,dy31);
2500 tz = _mm_mul_ps(fscal,dz31);
2501
2502 /* Update vectorial force */
2503 fix3 = _mm_add_ps(fix3,tx);
2504 fiy3 = _mm_add_ps(fiy3,ty);
2505 fiz3 = _mm_add_ps(fiz3,tz);
2506
2507 fjx1 = _mm_add_ps(fjx1,tx);
2508 fjy1 = _mm_add_ps(fjy1,ty);
2509 fjz1 = _mm_add_ps(fjz1,tz);
2510
2511 }
2512
2513 /**************************
2514 * CALCULATE INTERACTIONS *
2515 **************************/
2516
2517 if (gmx_mm_any_lt(rsq32,rcutoff2))
2518 {
2519
2520 r32 = _mm_mul_ps(rsq32,rinv32);
2521
2522 /* EWALD ELECTROSTATICS */
2523
2524 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2525 ewrt = _mm_mul_ps(r32,ewtabscale);
2526 ewitab = _mm_cvttps_epi32(ewrt);
2527 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2528 ewitab = _mm_slli_epi32(ewitab,2);
2529 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2530 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2531 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2532 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2533 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2534 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2535 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2536 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2537 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2538
2539 d = _mm_sub_ps(r32,rswitch);
2540 d = _mm_max_ps(d,_mm_setzero_ps());
2541 d2 = _mm_mul_ps(d,d);
2542 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2543
2544 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2545
2546 /* Evaluate switch function */
2547 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2548 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2549 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2550
2551 fscal = felec;
2552
2553 fscal = _mm_and_ps(fscal,cutoff_mask);
2554
2555 /* Calculate temporary vectorial force */
2556 tx = _mm_mul_ps(fscal,dx32);
2557 ty = _mm_mul_ps(fscal,dy32);
2558 tz = _mm_mul_ps(fscal,dz32);
2559
2560 /* Update vectorial force */
2561 fix3 = _mm_add_ps(fix3,tx);
2562 fiy3 = _mm_add_ps(fiy3,ty);
2563 fiz3 = _mm_add_ps(fiz3,tz);
2564
2565 fjx2 = _mm_add_ps(fjx2,tx);
2566 fjy2 = _mm_add_ps(fjy2,ty);
2567 fjz2 = _mm_add_ps(fjz2,tz);
2568
2569 }
2570
2571 /**************************
2572 * CALCULATE INTERACTIONS *
2573 **************************/
2574
2575 if (gmx_mm_any_lt(rsq33,rcutoff2))
2576 {
2577
2578 r33 = _mm_mul_ps(rsq33,rinv33);
2579
2580 /* EWALD ELECTROSTATICS */
2581
2582 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2583 ewrt = _mm_mul_ps(r33,ewtabscale);
2584 ewitab = _mm_cvttps_epi32(ewrt);
2585 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2586 ewitab = _mm_slli_epi32(ewitab,2);
2587 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2588 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2589 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2590 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2591 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2592 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2593 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2594 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2595 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2596
2597 d = _mm_sub_ps(r33,rswitch);
2598 d = _mm_max_ps(d,_mm_setzero_ps());
2599 d2 = _mm_mul_ps(d,d);
2600 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2601
2602 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2603
2604 /* Evaluate switch function */
2605 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2606 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2607 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2608
2609 fscal = felec;
2610
2611 fscal = _mm_and_ps(fscal,cutoff_mask);
2612
2613 /* Calculate temporary vectorial force */
2614 tx = _mm_mul_ps(fscal,dx33);
2615 ty = _mm_mul_ps(fscal,dy33);
2616 tz = _mm_mul_ps(fscal,dz33);
2617
2618 /* Update vectorial force */
2619 fix3 = _mm_add_ps(fix3,tx);
2620 fiy3 = _mm_add_ps(fiy3,ty);
2621 fiz3 = _mm_add_ps(fiz3,tz);
2622
2623 fjx3 = _mm_add_ps(fjx3,tx);
2624 fjy3 = _mm_add_ps(fjy3,ty);
2625 fjz3 = _mm_add_ps(fjz3,tz);
2626
2627 }
2628
2629 fjptrA = f+j_coord_offsetA;
2630 fjptrB = f+j_coord_offsetB;
2631 fjptrC = f+j_coord_offsetC;
2632 fjptrD = f+j_coord_offsetD;
2633
2634 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2635 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2636 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2637
2638 /* Inner loop uses 617 flops */
2639 }
2640
2641 if(jidx<j_index_end)
2642 {
2643
2644 /* Get j neighbor index, and coordinate index */
2645 jnrlistA = jjnr[jidx];
2646 jnrlistB = jjnr[jidx+1];
2647 jnrlistC = jjnr[jidx+2];
2648 jnrlistD = jjnr[jidx+3];
2649 /* Sign of each element will be negative for non-real atoms.
2650 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2651 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2652 */
2653 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2654 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2655 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2656 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2657 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2658 j_coord_offsetA = DIM3*jnrA;
2659 j_coord_offsetB = DIM3*jnrB;
2660 j_coord_offsetC = DIM3*jnrC;
2661 j_coord_offsetD = DIM3*jnrD;
2662
2663 /* load j atom coordinates */
2664 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2665 x+j_coord_offsetC,x+j_coord_offsetD,
2666 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2667 &jy2,&jz2,&jx3,&jy3,&jz3);
2668
2669 /* Calculate displacement vector */
2670 dx00 = _mm_sub_ps(ix0,jx0);
2671 dy00 = _mm_sub_ps(iy0,jy0);
2672 dz00 = _mm_sub_ps(iz0,jz0);
2673 dx11 = _mm_sub_ps(ix1,jx1);
2674 dy11 = _mm_sub_ps(iy1,jy1);
2675 dz11 = _mm_sub_ps(iz1,jz1);
2676 dx12 = _mm_sub_ps(ix1,jx2);
2677 dy12 = _mm_sub_ps(iy1,jy2);
2678 dz12 = _mm_sub_ps(iz1,jz2);
2679 dx13 = _mm_sub_ps(ix1,jx3);
2680 dy13 = _mm_sub_ps(iy1,jy3);
2681 dz13 = _mm_sub_ps(iz1,jz3);
2682 dx21 = _mm_sub_ps(ix2,jx1);
2683 dy21 = _mm_sub_ps(iy2,jy1);
2684 dz21 = _mm_sub_ps(iz2,jz1);
2685 dx22 = _mm_sub_ps(ix2,jx2);
2686 dy22 = _mm_sub_ps(iy2,jy2);
2687 dz22 = _mm_sub_ps(iz2,jz2);
2688 dx23 = _mm_sub_ps(ix2,jx3);
2689 dy23 = _mm_sub_ps(iy2,jy3);
2690 dz23 = _mm_sub_ps(iz2,jz3);
2691 dx31 = _mm_sub_ps(ix3,jx1);
2692 dy31 = _mm_sub_ps(iy3,jy1);
2693 dz31 = _mm_sub_ps(iz3,jz1);
2694 dx32 = _mm_sub_ps(ix3,jx2);
2695 dy32 = _mm_sub_ps(iy3,jy2);
2696 dz32 = _mm_sub_ps(iz3,jz2);
2697 dx33 = _mm_sub_ps(ix3,jx3);
2698 dy33 = _mm_sub_ps(iy3,jy3);
2699 dz33 = _mm_sub_ps(iz3,jz3);
2700
2701 /* Calculate squared distance and things based on it */
2702 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2703 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2704 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2705 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2706 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2707 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2708 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2709 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2710 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2711 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2712
2713 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
2714 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
2715 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
2716 rinv13 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq13);
2717 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
2718 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
2719 rinv23 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq23);
2720 rinv31 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq31);
2721 rinv32 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq32);
2722 rinv33 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq33);
2723
2724 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2725 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2726 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2727 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2728 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2729 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2730 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2731 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2732 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2733 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2734
2735 fjx0 = _mm_setzero_ps();
2736 fjy0 = _mm_setzero_ps();
2737 fjz0 = _mm_setzero_ps();
2738 fjx1 = _mm_setzero_ps();
2739 fjy1 = _mm_setzero_ps();
2740 fjz1 = _mm_setzero_ps();
2741 fjx2 = _mm_setzero_ps();
2742 fjy2 = _mm_setzero_ps();
2743 fjz2 = _mm_setzero_ps();
2744 fjx3 = _mm_setzero_ps();
2745 fjy3 = _mm_setzero_ps();
2746 fjz3 = _mm_setzero_ps();
2747
2748 /**************************
2749 * CALCULATE INTERACTIONS *
2750 **************************/
2751
2752 if (gmx_mm_any_lt(rsq00,rcutoff2))
2753 {
2754
2755 r00 = _mm_mul_ps(rsq00,rinv00);
2756 r00 = _mm_andnot_ps(dummy_mask,r00);
2757
2758 /* LENNARD-JONES DISPERSION/REPULSION */
2759
2760 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2761 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2762 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2763 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2764 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2765
2766 d = _mm_sub_ps(r00,rswitch);
2767 d = _mm_max_ps(d,_mm_setzero_ps());
2768 d2 = _mm_mul_ps(d,d);
2769 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2770
2771 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2772
2773 /* Evaluate switch function */
2774 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2775 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2776 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2777
2778 fscal = fvdw;
2779
2780 fscal = _mm_and_ps(fscal,cutoff_mask);
2781
2782 fscal = _mm_andnot_ps(dummy_mask,fscal);
2783
2784 /* Calculate temporary vectorial force */
2785 tx = _mm_mul_ps(fscal,dx00);
2786 ty = _mm_mul_ps(fscal,dy00);
2787 tz = _mm_mul_ps(fscal,dz00);
2788
2789 /* Update vectorial force */
2790 fix0 = _mm_add_ps(fix0,tx);
2791 fiy0 = _mm_add_ps(fiy0,ty);
2792 fiz0 = _mm_add_ps(fiz0,tz);
2793
2794 fjx0 = _mm_add_ps(fjx0,tx);
2795 fjy0 = _mm_add_ps(fjy0,ty);
2796 fjz0 = _mm_add_ps(fjz0,tz);
2797
2798 }
2799
2800 /**************************
2801 * CALCULATE INTERACTIONS *
2802 **************************/
2803
2804 if (gmx_mm_any_lt(rsq11,rcutoff2))
2805 {
2806
2807 r11 = _mm_mul_ps(rsq11,rinv11);
2808 r11 = _mm_andnot_ps(dummy_mask,r11);
2809
2810 /* EWALD ELECTROSTATICS */
2811
2812 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2813 ewrt = _mm_mul_ps(r11,ewtabscale);
2814 ewitab = _mm_cvttps_epi32(ewrt);
2815 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2816 ewitab = _mm_slli_epi32(ewitab,2);
2817 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2818 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2819 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2820 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2821 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2822 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2823 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2824 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2825 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2826
2827 d = _mm_sub_ps(r11,rswitch);
2828 d = _mm_max_ps(d,_mm_setzero_ps());
2829 d2 = _mm_mul_ps(d,d);
2830 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2831
2832 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2833
2834 /* Evaluate switch function */
2835 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2836 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2837 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2838
2839 fscal = felec;
2840
2841 fscal = _mm_and_ps(fscal,cutoff_mask);
2842
2843 fscal = _mm_andnot_ps(dummy_mask,fscal);
2844
2845 /* Calculate temporary vectorial force */
2846 tx = _mm_mul_ps(fscal,dx11);
2847 ty = _mm_mul_ps(fscal,dy11);
2848 tz = _mm_mul_ps(fscal,dz11);
2849
2850 /* Update vectorial force */
2851 fix1 = _mm_add_ps(fix1,tx);
2852 fiy1 = _mm_add_ps(fiy1,ty);
2853 fiz1 = _mm_add_ps(fiz1,tz);
2854
2855 fjx1 = _mm_add_ps(fjx1,tx);
2856 fjy1 = _mm_add_ps(fjy1,ty);
2857 fjz1 = _mm_add_ps(fjz1,tz);
2858
2859 }
2860
2861 /**************************
2862 * CALCULATE INTERACTIONS *
2863 **************************/
2864
2865 if (gmx_mm_any_lt(rsq12,rcutoff2))
2866 {
2867
2868 r12 = _mm_mul_ps(rsq12,rinv12);
2869 r12 = _mm_andnot_ps(dummy_mask,r12);
2870
2871 /* EWALD ELECTROSTATICS */
2872
2873 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2874 ewrt = _mm_mul_ps(r12,ewtabscale);
2875 ewitab = _mm_cvttps_epi32(ewrt);
2876 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2877 ewitab = _mm_slli_epi32(ewitab,2);
2878 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2879 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2880 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2881 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2882 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2883 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2884 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2885 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2886 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2887
2888 d = _mm_sub_ps(r12,rswitch);
2889 d = _mm_max_ps(d,_mm_setzero_ps());
2890 d2 = _mm_mul_ps(d,d);
2891 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2892
2893 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2894
2895 /* Evaluate switch function */
2896 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2897 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2898 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2899
2900 fscal = felec;
2901
2902 fscal = _mm_and_ps(fscal,cutoff_mask);
2903
2904 fscal = _mm_andnot_ps(dummy_mask,fscal);
2905
2906 /* Calculate temporary vectorial force */
2907 tx = _mm_mul_ps(fscal,dx12);
2908 ty = _mm_mul_ps(fscal,dy12);
2909 tz = _mm_mul_ps(fscal,dz12);
2910
2911 /* Update vectorial force */
2912 fix1 = _mm_add_ps(fix1,tx);
2913 fiy1 = _mm_add_ps(fiy1,ty);
2914 fiz1 = _mm_add_ps(fiz1,tz);
2915
2916 fjx2 = _mm_add_ps(fjx2,tx);
2917 fjy2 = _mm_add_ps(fjy2,ty);
2918 fjz2 = _mm_add_ps(fjz2,tz);
2919
2920 }
2921
2922 /**************************
2923 * CALCULATE INTERACTIONS *
2924 **************************/
2925
2926 if (gmx_mm_any_lt(rsq13,rcutoff2))
2927 {
2928
2929 r13 = _mm_mul_ps(rsq13,rinv13);
2930 r13 = _mm_andnot_ps(dummy_mask,r13);
2931
2932 /* EWALD ELECTROSTATICS */
2933
2934 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2935 ewrt = _mm_mul_ps(r13,ewtabscale);
2936 ewitab = _mm_cvttps_epi32(ewrt);
2937 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2938 ewitab = _mm_slli_epi32(ewitab,2);
2939 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2940 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2941 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2942 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2943 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2944 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2945 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2946 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2947 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2948
2949 d = _mm_sub_ps(r13,rswitch);
2950 d = _mm_max_ps(d,_mm_setzero_ps());
2951 d2 = _mm_mul_ps(d,d);
2952 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2953
2954 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2955
2956 /* Evaluate switch function */
2957 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2958 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2959 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2960
2961 fscal = felec;
2962
2963 fscal = _mm_and_ps(fscal,cutoff_mask);
2964
2965 fscal = _mm_andnot_ps(dummy_mask,fscal);
2966
2967 /* Calculate temporary vectorial force */
2968 tx = _mm_mul_ps(fscal,dx13);
2969 ty = _mm_mul_ps(fscal,dy13);
2970 tz = _mm_mul_ps(fscal,dz13);
2971
2972 /* Update vectorial force */
2973 fix1 = _mm_add_ps(fix1,tx);
2974 fiy1 = _mm_add_ps(fiy1,ty);
2975 fiz1 = _mm_add_ps(fiz1,tz);
2976
2977 fjx3 = _mm_add_ps(fjx3,tx);
2978 fjy3 = _mm_add_ps(fjy3,ty);
2979 fjz3 = _mm_add_ps(fjz3,tz);
2980
2981 }
2982
2983 /**************************
2984 * CALCULATE INTERACTIONS *
2985 **************************/
2986
2987 if (gmx_mm_any_lt(rsq21,rcutoff2))
2988 {
2989
2990 r21 = _mm_mul_ps(rsq21,rinv21);
2991 r21 = _mm_andnot_ps(dummy_mask,r21);
2992
2993 /* EWALD ELECTROSTATICS */
2994
2995 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2996 ewrt = _mm_mul_ps(r21,ewtabscale);
2997 ewitab = _mm_cvttps_epi32(ewrt);
2998 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2999 ewitab = _mm_slli_epi32(ewitab,2);
3000 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
3001 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
3002 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
3003 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
3004 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
3005 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3006 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3007 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3008 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3009
3010 d = _mm_sub_ps(r21,rswitch);
3011 d = _mm_max_ps(d,_mm_setzero_ps());
3012 d2 = _mm_mul_ps(d,d);
3013 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3014
3015 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3016
3017 /* Evaluate switch function */
3018 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3019 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3020 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3021
3022 fscal = felec;
3023
3024 fscal = _mm_and_ps(fscal,cutoff_mask);
3025
3026 fscal = _mm_andnot_ps(dummy_mask,fscal);
3027
3028 /* Calculate temporary vectorial force */
3029 tx = _mm_mul_ps(fscal,dx21);
3030 ty = _mm_mul_ps(fscal,dy21);
3031 tz = _mm_mul_ps(fscal,dz21);
3032
3033 /* Update vectorial force */
3034 fix2 = _mm_add_ps(fix2,tx);
3035 fiy2 = _mm_add_ps(fiy2,ty);
3036 fiz2 = _mm_add_ps(fiz2,tz);
3037
3038 fjx1 = _mm_add_ps(fjx1,tx);
3039 fjy1 = _mm_add_ps(fjy1,ty);
3040 fjz1 = _mm_add_ps(fjz1,tz);
3041
3042 }
3043
3044 /**************************
3045 * CALCULATE INTERACTIONS *
3046 **************************/
3047
3048 if (gmx_mm_any_lt(rsq22,rcutoff2))
3049 {
3050
3051 r22 = _mm_mul_ps(rsq22,rinv22);
3052 r22 = _mm_andnot_ps(dummy_mask,r22);
3053
3054 /* EWALD ELECTROSTATICS */
3055
3056 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3057 ewrt = _mm_mul_ps(r22,ewtabscale);
3058 ewitab = _mm_cvttps_epi32(ewrt);
3059 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
3060 ewitab = _mm_slli_epi32(ewitab,2);
3061 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
3062 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
3063 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
3064 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
3065 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
3066 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3067 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3068 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3069 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3070
3071 d = _mm_sub_ps(r22,rswitch);
3072 d = _mm_max_ps(d,_mm_setzero_ps());
3073 d2 = _mm_mul_ps(d,d);
3074 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3075
3076 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3077
3078 /* Evaluate switch function */
3079 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3080 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3081 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3082
3083 fscal = felec;
3084
3085 fscal = _mm_and_ps(fscal,cutoff_mask);
3086
3087 fscal = _mm_andnot_ps(dummy_mask,fscal);
3088
3089 /* Calculate temporary vectorial force */
3090 tx = _mm_mul_ps(fscal,dx22);
3091 ty = _mm_mul_ps(fscal,dy22);
3092 tz = _mm_mul_ps(fscal,dz22);
3093
3094 /* Update vectorial force */
3095 fix2 = _mm_add_ps(fix2,tx);
3096 fiy2 = _mm_add_ps(fiy2,ty);
3097 fiz2 = _mm_add_ps(fiz2,tz);
3098
3099 fjx2 = _mm_add_ps(fjx2,tx);
3100 fjy2 = _mm_add_ps(fjy2,ty);
3101 fjz2 = _mm_add_ps(fjz2,tz);
3102
3103 }
3104
3105 /**************************
3106 * CALCULATE INTERACTIONS *
3107 **************************/
3108
3109 if (gmx_mm_any_lt(rsq23,rcutoff2))
3110 {
3111
3112 r23 = _mm_mul_ps(rsq23,rinv23);
3113 r23 = _mm_andnot_ps(dummy_mask,r23);
3114
3115 /* EWALD ELECTROSTATICS */
3116
3117 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3118 ewrt = _mm_mul_ps(r23,ewtabscale);
3119 ewitab = _mm_cvttps_epi32(ewrt);
3120 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
3121 ewitab = _mm_slli_epi32(ewitab,2);
3122 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
3123 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
3124 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
3125 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
3126 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
3127 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3128 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3129 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3130 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3131
3132 d = _mm_sub_ps(r23,rswitch);
3133 d = _mm_max_ps(d,_mm_setzero_ps());
3134 d2 = _mm_mul_ps(d,d);
3135 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3136
3137 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3138
3139 /* Evaluate switch function */
3140 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3141 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3142 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
3143
3144 fscal = felec;
3145
3146 fscal = _mm_and_ps(fscal,cutoff_mask);
3147
3148 fscal = _mm_andnot_ps(dummy_mask,fscal);
3149
3150 /* Calculate temporary vectorial force */
3151 tx = _mm_mul_ps(fscal,dx23);
3152 ty = _mm_mul_ps(fscal,dy23);
3153 tz = _mm_mul_ps(fscal,dz23);
3154
3155 /* Update vectorial force */
3156 fix2 = _mm_add_ps(fix2,tx);
3157 fiy2 = _mm_add_ps(fiy2,ty);
3158 fiz2 = _mm_add_ps(fiz2,tz);
3159
3160 fjx3 = _mm_add_ps(fjx3,tx);
3161 fjy3 = _mm_add_ps(fjy3,ty);
3162 fjz3 = _mm_add_ps(fjz3,tz);
3163
3164 }
3165
3166 /**************************
3167 * CALCULATE INTERACTIONS *
3168 **************************/
3169
3170 if (gmx_mm_any_lt(rsq31,rcutoff2))
3171 {
3172
3173 r31 = _mm_mul_ps(rsq31,rinv31);
3174 r31 = _mm_andnot_ps(dummy_mask,r31);
3175
3176 /* EWALD ELECTROSTATICS */
3177
3178 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3179 ewrt = _mm_mul_ps(r31,ewtabscale);
3180 ewitab = _mm_cvttps_epi32(ewrt);
3181 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
3182 ewitab = _mm_slli_epi32(ewitab,2);
3183 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
3184 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
3185 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
3186 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
3187 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
3188 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3189 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3190 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3191 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3192
3193 d = _mm_sub_ps(r31,rswitch);
3194 d = _mm_max_ps(d,_mm_setzero_ps());
3195 d2 = _mm_mul_ps(d,d);
3196 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3197
3198 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3199
3200 /* Evaluate switch function */
3201 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3202 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3203 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
3204
3205 fscal = felec;
3206
3207 fscal = _mm_and_ps(fscal,cutoff_mask);
3208
3209 fscal = _mm_andnot_ps(dummy_mask,fscal);
3210
3211 /* Calculate temporary vectorial force */
3212 tx = _mm_mul_ps(fscal,dx31);
3213 ty = _mm_mul_ps(fscal,dy31);
3214 tz = _mm_mul_ps(fscal,dz31);
3215
3216 /* Update vectorial force */
3217 fix3 = _mm_add_ps(fix3,tx);
3218 fiy3 = _mm_add_ps(fiy3,ty);
3219 fiz3 = _mm_add_ps(fiz3,tz);
3220
3221 fjx1 = _mm_add_ps(fjx1,tx);
3222 fjy1 = _mm_add_ps(fjy1,ty);
3223 fjz1 = _mm_add_ps(fjz1,tz);
3224
3225 }
3226
3227 /**************************
3228 * CALCULATE INTERACTIONS *
3229 **************************/
3230
3231 if (gmx_mm_any_lt(rsq32,rcutoff2))
3232 {
3233
3234 r32 = _mm_mul_ps(rsq32,rinv32);
3235 r32 = _mm_andnot_ps(dummy_mask,r32);
3236
3237 /* EWALD ELECTROSTATICS */
3238
3239 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3240 ewrt = _mm_mul_ps(r32,ewtabscale);
3241 ewitab = _mm_cvttps_epi32(ewrt);
3242 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
3243 ewitab = _mm_slli_epi32(ewitab,2);
3244 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
3245 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
3246 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
3247 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
3248 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
3249 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3250 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3251 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3252 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3253
3254 d = _mm_sub_ps(r32,rswitch);
3255 d = _mm_max_ps(d,_mm_setzero_ps());
3256 d2 = _mm_mul_ps(d,d);
3257 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3258
3259 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3260
3261 /* Evaluate switch function */
3262 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3263 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3264 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
3265
3266 fscal = felec;
3267
3268 fscal = _mm_and_ps(fscal,cutoff_mask);
3269
3270 fscal = _mm_andnot_ps(dummy_mask,fscal);
3271
3272 /* Calculate temporary vectorial force */
3273 tx = _mm_mul_ps(fscal,dx32);
3274 ty = _mm_mul_ps(fscal,dy32);
3275 tz = _mm_mul_ps(fscal,dz32);
3276
3277 /* Update vectorial force */
3278 fix3 = _mm_add_ps(fix3,tx);
3279 fiy3 = _mm_add_ps(fiy3,ty);
3280 fiz3 = _mm_add_ps(fiz3,tz);
3281
3282 fjx2 = _mm_add_ps(fjx2,tx);
3283 fjy2 = _mm_add_ps(fjy2,ty);
3284 fjz2 = _mm_add_ps(fjz2,tz);
3285
3286 }
3287
3288 /**************************
3289 * CALCULATE INTERACTIONS *
3290 **************************/
3291
3292 if (gmx_mm_any_lt(rsq33,rcutoff2))
3293 {
3294
3295 r33 = _mm_mul_ps(rsq33,rinv33);
3296 r33 = _mm_andnot_ps(dummy_mask,r33);
3297
3298 /* EWALD ELECTROSTATICS */
3299
3300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3301 ewrt = _mm_mul_ps(r33,ewtabscale);
3302 ewitab = _mm_cvttps_epi32(ewrt);
3303 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
3304 ewitab = _mm_slli_epi32(ewitab,2);
3305 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
3306 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
3307 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
3308 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
3309 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
3310 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3311 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3312 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3313 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3314
3315 d = _mm_sub_ps(r33,rswitch);
3316 d = _mm_max_ps(d,_mm_setzero_ps());
3317 d2 = _mm_mul_ps(d,d);
3318 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3319
3320 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3321
3322 /* Evaluate switch function */
3323 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3324 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3325 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
3326
3327 fscal = felec;
3328
3329 fscal = _mm_and_ps(fscal,cutoff_mask);
3330
3331 fscal = _mm_andnot_ps(dummy_mask,fscal);
3332
3333 /* Calculate temporary vectorial force */
3334 tx = _mm_mul_ps(fscal,dx33);
3335 ty = _mm_mul_ps(fscal,dy33);
3336 tz = _mm_mul_ps(fscal,dz33);
3337
3338 /* Update vectorial force */
3339 fix3 = _mm_add_ps(fix3,tx);
3340 fiy3 = _mm_add_ps(fiy3,ty);
3341 fiz3 = _mm_add_ps(fiz3,tz);
3342
3343 fjx3 = _mm_add_ps(fjx3,tx);
3344 fjy3 = _mm_add_ps(fjy3,ty);
3345 fjz3 = _mm_add_ps(fjz3,tz);
3346
3347 }
3348
3349 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3350 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3351 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3352 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3353
3354 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3355 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3356 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3357
3358 /* Inner loop uses 627 flops */
3359 }
3360
3361 /* End of innermost loop */
3362
3363 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3364 f+i_coord_offset,fshift+i_shift_offset);
3365
3366 /* Increment number of inner iterations */
3367 inneriter += j_index_end - j_index_start;
3368
3369 /* Outer loop uses 24 flops */
3370 }
3371
3372 /* Increment number of outer iterations */
3373 outeriter += nri;
3374
3375 /* Update outer/inner flops */
3376
3377 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W4W4_F] += outeriter*24 + inneriter
*627
;
3378}