Bug Summary

File:gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse4_1_single.c
Location:line 188, column 5
Description:Value stored to 'j_coord_offsetB' is never read

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35/*
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37 */
38#ifdef HAVE_CONFIG_H1
39#include <config.h>
40#endif
41
42#include <math.h>
43
44#include "../nb_kernel.h"
45#include "types/simple.h"
46#include "gromacs/math/vec.h"
47#include "nrnb.h"
48
49#include "gromacs/simd/math_x86_sse4_1_single.h"
50#include "kernelutil_x86_sse4_1_single.h"
51
52/*
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
58 */
59void
60nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
68{
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 */
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real rcutoff_scalar;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 real scratch[4*DIM3];
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 int vdwioffset0;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 int vdwioffset1;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 int vdwioffset2;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 real *charge;
108 int nvdwtype;
109 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 int *vdwtype;
111 real *vdwparam;
112 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
113 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
114 __m128i ewitab;
115 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
116 real *ewtab;
117 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
118 real rswitch_scalar,d_scalar;
119 __m128 dummy_mask,cutoff_mask;
120 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
121 __m128 one = _mm_set1_ps(1.0);
122 __m128 two = _mm_set1_ps(2.0);
123 x = xx[0];
124 f = ff[0];
125
126 nri = nlist->nri;
127 iinr = nlist->iinr;
128 jindex = nlist->jindex;
129 jjnr = nlist->jjnr;
130 shiftidx = nlist->shift;
131 gid = nlist->gid;
132 shiftvec = fr->shift_vec[0];
133 fshift = fr->fshift[0];
134 facel = _mm_set1_ps(fr->epsfac);
135 charge = mdatoms->chargeA;
136 nvdwtype = fr->ntype;
137 vdwparam = fr->nbfp;
138 vdwtype = mdatoms->typeA;
139
140 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
141 ewtab = fr->ic->tabq_coul_FDV0;
142 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
143 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
144
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
148 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
149 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
150 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
151
152 jq0 = _mm_set1_ps(charge[inr+0]);
153 jq1 = _mm_set1_ps(charge[inr+1]);
154 jq2 = _mm_set1_ps(charge[inr+2]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 qq00 = _mm_mul_ps(iq0,jq0);
157 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
158 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
159 qq01 = _mm_mul_ps(iq0,jq1);
160 qq02 = _mm_mul_ps(iq0,jq2);
161 qq10 = _mm_mul_ps(iq1,jq0);
162 qq11 = _mm_mul_ps(iq1,jq1);
163 qq12 = _mm_mul_ps(iq1,jq2);
164 qq20 = _mm_mul_ps(iq2,jq0);
165 qq21 = _mm_mul_ps(iq2,jq1);
166 qq22 = _mm_mul_ps(iq2,jq2);
167
168 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169 rcutoff_scalar = fr->rcoulomb;
170 rcutoff = _mm_set1_ps(rcutoff_scalar);
171 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
172
173 rswitch_scalar = fr->rcoulomb_switch;
174 rswitch = _mm_set1_ps(rswitch_scalar);
175 /* Setup switch parameters */
176 d_scalar = rcutoff_scalar-rswitch_scalar;
177 d = _mm_set1_ps(d_scalar);
178 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
179 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
182 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
184
185 /* Avoid stupid compiler warnings */
186 jnrA = jnrB = jnrC = jnrD = 0;
187 j_coord_offsetA = 0;
188 j_coord_offsetB = 0;
Value stored to 'j_coord_offsetB' is never read
189 j_coord_offsetC = 0;
190 j_coord_offsetD = 0;
191
192 outeriter = 0;
193 inneriter = 0;
194
195 for(iidx=0;iidx<4*DIM3;iidx++)
196 {
197 scratch[iidx] = 0.0;
198 }
199
200 /* Start outer loop over neighborlists */
201 for(iidx=0; iidx<nri; iidx++)
202 {
203 /* Load shift vector for this list */
204 i_shift_offset = DIM3*shiftidx[iidx];
205
206 /* Load limits for loop over neighbors */
207 j_index_start = jindex[iidx];
208 j_index_end = jindex[iidx+1];
209
210 /* Get outer coordinate index */
211 inr = iinr[iidx];
212 i_coord_offset = DIM3*inr;
213
214 /* Load i particle coords and add shift vector */
215 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
216 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
217
218 fix0 = _mm_setzero_ps();
219 fiy0 = _mm_setzero_ps();
220 fiz0 = _mm_setzero_ps();
221 fix1 = _mm_setzero_ps();
222 fiy1 = _mm_setzero_ps();
223 fiz1 = _mm_setzero_ps();
224 fix2 = _mm_setzero_ps();
225 fiy2 = _mm_setzero_ps();
226 fiz2 = _mm_setzero_ps();
227
228 /* Reset potential sums */
229 velecsum = _mm_setzero_ps();
230 vvdwsum = _mm_setzero_ps();
231
232 /* Start inner kernel loop */
233 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
234 {
235
236 /* Get j neighbor index, and coordinate index */
237 jnrA = jjnr[jidx];
238 jnrB = jjnr[jidx+1];
239 jnrC = jjnr[jidx+2];
240 jnrD = jjnr[jidx+3];
241 j_coord_offsetA = DIM3*jnrA;
242 j_coord_offsetB = DIM3*jnrB;
243 j_coord_offsetC = DIM3*jnrC;
244 j_coord_offsetD = DIM3*jnrD;
245
246 /* load j atom coordinates */
247 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
248 x+j_coord_offsetC,x+j_coord_offsetD,
249 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
250
251 /* Calculate displacement vector */
252 dx00 = _mm_sub_ps(ix0,jx0);
253 dy00 = _mm_sub_ps(iy0,jy0);
254 dz00 = _mm_sub_ps(iz0,jz0);
255 dx01 = _mm_sub_ps(ix0,jx1);
256 dy01 = _mm_sub_ps(iy0,jy1);
257 dz01 = _mm_sub_ps(iz0,jz1);
258 dx02 = _mm_sub_ps(ix0,jx2);
259 dy02 = _mm_sub_ps(iy0,jy2);
260 dz02 = _mm_sub_ps(iz0,jz2);
261 dx10 = _mm_sub_ps(ix1,jx0);
262 dy10 = _mm_sub_ps(iy1,jy0);
263 dz10 = _mm_sub_ps(iz1,jz0);
264 dx11 = _mm_sub_ps(ix1,jx1);
265 dy11 = _mm_sub_ps(iy1,jy1);
266 dz11 = _mm_sub_ps(iz1,jz1);
267 dx12 = _mm_sub_ps(ix1,jx2);
268 dy12 = _mm_sub_ps(iy1,jy2);
269 dz12 = _mm_sub_ps(iz1,jz2);
270 dx20 = _mm_sub_ps(ix2,jx0);
271 dy20 = _mm_sub_ps(iy2,jy0);
272 dz20 = _mm_sub_ps(iz2,jz0);
273 dx21 = _mm_sub_ps(ix2,jx1);
274 dy21 = _mm_sub_ps(iy2,jy1);
275 dz21 = _mm_sub_ps(iz2,jz1);
276 dx22 = _mm_sub_ps(ix2,jx2);
277 dy22 = _mm_sub_ps(iy2,jy2);
278 dz22 = _mm_sub_ps(iz2,jz2);
279
280 /* Calculate squared distance and things based on it */
281 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
282 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
283 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
284 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
285 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
286 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
287 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
288 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
289 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
290
291 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
292 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
293 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
294 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
295 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
296 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
297 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
298 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
299 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
300
301 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
302 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
303 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
304 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
305 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
306 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
307 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
308 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
309 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
310
311 fjx0 = _mm_setzero_ps();
312 fjy0 = _mm_setzero_ps();
313 fjz0 = _mm_setzero_ps();
314 fjx1 = _mm_setzero_ps();
315 fjy1 = _mm_setzero_ps();
316 fjz1 = _mm_setzero_ps();
317 fjx2 = _mm_setzero_ps();
318 fjy2 = _mm_setzero_ps();
319 fjz2 = _mm_setzero_ps();
320
321 /**************************
322 * CALCULATE INTERACTIONS *
323 **************************/
324
325 if (gmx_mm_any_lt(rsq00,rcutoff2))
326 {
327
328 r00 = _mm_mul_ps(rsq00,rinv00);
329
330 /* EWALD ELECTROSTATICS */
331
332 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
333 ewrt = _mm_mul_ps(r00,ewtabscale);
334 ewitab = _mm_cvttps_epi32(ewrt);
335 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
336 ewitab = _mm_slli_epi32(ewitab,2);
337 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
338 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
339 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
340 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
341 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
342 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
343 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
344 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
345 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
346
347 /* LENNARD-JONES DISPERSION/REPULSION */
348
349 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
350 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
351 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
352 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
353 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
354
355 d = _mm_sub_ps(r00,rswitch);
356 d = _mm_max_ps(d,_mm_setzero_ps());
357 d2 = _mm_mul_ps(d,d);
358 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
359
360 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
361
362 /* Evaluate switch function */
363 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
364 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
365 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
366 velec = _mm_mul_ps(velec,sw);
367 vvdw = _mm_mul_ps(vvdw,sw);
368 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
369
370 /* Update potential sum for this i atom from the interaction with this j atom. */
371 velec = _mm_and_ps(velec,cutoff_mask);
372 velecsum = _mm_add_ps(velecsum,velec);
373 vvdw = _mm_and_ps(vvdw,cutoff_mask);
374 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
375
376 fscal = _mm_add_ps(felec,fvdw);
377
378 fscal = _mm_and_ps(fscal,cutoff_mask);
379
380 /* Calculate temporary vectorial force */
381 tx = _mm_mul_ps(fscal,dx00);
382 ty = _mm_mul_ps(fscal,dy00);
383 tz = _mm_mul_ps(fscal,dz00);
384
385 /* Update vectorial force */
386 fix0 = _mm_add_ps(fix0,tx);
387 fiy0 = _mm_add_ps(fiy0,ty);
388 fiz0 = _mm_add_ps(fiz0,tz);
389
390 fjx0 = _mm_add_ps(fjx0,tx);
391 fjy0 = _mm_add_ps(fjy0,ty);
392 fjz0 = _mm_add_ps(fjz0,tz);
393
394 }
395
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
399
400 if (gmx_mm_any_lt(rsq01,rcutoff2))
401 {
402
403 r01 = _mm_mul_ps(rsq01,rinv01);
404
405 /* EWALD ELECTROSTATICS */
406
407 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
408 ewrt = _mm_mul_ps(r01,ewtabscale);
409 ewitab = _mm_cvttps_epi32(ewrt);
410 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
411 ewitab = _mm_slli_epi32(ewitab,2);
412 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
413 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
414 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
415 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
416 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
417 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
418 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
419 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
420 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
421
422 d = _mm_sub_ps(r01,rswitch);
423 d = _mm_max_ps(d,_mm_setzero_ps());
424 d2 = _mm_mul_ps(d,d);
425 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
426
427 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
428
429 /* Evaluate switch function */
430 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
431 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
432 velec = _mm_mul_ps(velec,sw);
433 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
434
435 /* Update potential sum for this i atom from the interaction with this j atom. */
436 velec = _mm_and_ps(velec,cutoff_mask);
437 velecsum = _mm_add_ps(velecsum,velec);
438
439 fscal = felec;
440
441 fscal = _mm_and_ps(fscal,cutoff_mask);
442
443 /* Calculate temporary vectorial force */
444 tx = _mm_mul_ps(fscal,dx01);
445 ty = _mm_mul_ps(fscal,dy01);
446 tz = _mm_mul_ps(fscal,dz01);
447
448 /* Update vectorial force */
449 fix0 = _mm_add_ps(fix0,tx);
450 fiy0 = _mm_add_ps(fiy0,ty);
451 fiz0 = _mm_add_ps(fiz0,tz);
452
453 fjx1 = _mm_add_ps(fjx1,tx);
454 fjy1 = _mm_add_ps(fjy1,ty);
455 fjz1 = _mm_add_ps(fjz1,tz);
456
457 }
458
459 /**************************
460 * CALCULATE INTERACTIONS *
461 **************************/
462
463 if (gmx_mm_any_lt(rsq02,rcutoff2))
464 {
465
466 r02 = _mm_mul_ps(rsq02,rinv02);
467
468 /* EWALD ELECTROSTATICS */
469
470 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
471 ewrt = _mm_mul_ps(r02,ewtabscale);
472 ewitab = _mm_cvttps_epi32(ewrt);
473 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
474 ewitab = _mm_slli_epi32(ewitab,2);
475 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
476 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
477 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
478 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
479 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
480 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
481 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
482 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
483 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
484
485 d = _mm_sub_ps(r02,rswitch);
486 d = _mm_max_ps(d,_mm_setzero_ps());
487 d2 = _mm_mul_ps(d,d);
488 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
489
490 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
491
492 /* Evaluate switch function */
493 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
494 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
495 velec = _mm_mul_ps(velec,sw);
496 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
497
498 /* Update potential sum for this i atom from the interaction with this j atom. */
499 velec = _mm_and_ps(velec,cutoff_mask);
500 velecsum = _mm_add_ps(velecsum,velec);
501
502 fscal = felec;
503
504 fscal = _mm_and_ps(fscal,cutoff_mask);
505
506 /* Calculate temporary vectorial force */
507 tx = _mm_mul_ps(fscal,dx02);
508 ty = _mm_mul_ps(fscal,dy02);
509 tz = _mm_mul_ps(fscal,dz02);
510
511 /* Update vectorial force */
512 fix0 = _mm_add_ps(fix0,tx);
513 fiy0 = _mm_add_ps(fiy0,ty);
514 fiz0 = _mm_add_ps(fiz0,tz);
515
516 fjx2 = _mm_add_ps(fjx2,tx);
517 fjy2 = _mm_add_ps(fjy2,ty);
518 fjz2 = _mm_add_ps(fjz2,tz);
519
520 }
521
522 /**************************
523 * CALCULATE INTERACTIONS *
524 **************************/
525
526 if (gmx_mm_any_lt(rsq10,rcutoff2))
527 {
528
529 r10 = _mm_mul_ps(rsq10,rinv10);
530
531 /* EWALD ELECTROSTATICS */
532
533 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
534 ewrt = _mm_mul_ps(r10,ewtabscale);
535 ewitab = _mm_cvttps_epi32(ewrt);
536 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
537 ewitab = _mm_slli_epi32(ewitab,2);
538 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
539 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
540 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
541 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
542 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
543 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
544 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
545 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
546 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
547
548 d = _mm_sub_ps(r10,rswitch);
549 d = _mm_max_ps(d,_mm_setzero_ps());
550 d2 = _mm_mul_ps(d,d);
551 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
552
553 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
554
555 /* Evaluate switch function */
556 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
557 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
558 velec = _mm_mul_ps(velec,sw);
559 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
560
561 /* Update potential sum for this i atom from the interaction with this j atom. */
562 velec = _mm_and_ps(velec,cutoff_mask);
563 velecsum = _mm_add_ps(velecsum,velec);
564
565 fscal = felec;
566
567 fscal = _mm_and_ps(fscal,cutoff_mask);
568
569 /* Calculate temporary vectorial force */
570 tx = _mm_mul_ps(fscal,dx10);
571 ty = _mm_mul_ps(fscal,dy10);
572 tz = _mm_mul_ps(fscal,dz10);
573
574 /* Update vectorial force */
575 fix1 = _mm_add_ps(fix1,tx);
576 fiy1 = _mm_add_ps(fiy1,ty);
577 fiz1 = _mm_add_ps(fiz1,tz);
578
579 fjx0 = _mm_add_ps(fjx0,tx);
580 fjy0 = _mm_add_ps(fjy0,ty);
581 fjz0 = _mm_add_ps(fjz0,tz);
582
583 }
584
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
588
589 if (gmx_mm_any_lt(rsq11,rcutoff2))
590 {
591
592 r11 = _mm_mul_ps(rsq11,rinv11);
593
594 /* EWALD ELECTROSTATICS */
595
596 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
597 ewrt = _mm_mul_ps(r11,ewtabscale);
598 ewitab = _mm_cvttps_epi32(ewrt);
599 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
600 ewitab = _mm_slli_epi32(ewitab,2);
601 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
602 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
603 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
604 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
605 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
606 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
607 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
608 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
609 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
610
611 d = _mm_sub_ps(r11,rswitch);
612 d = _mm_max_ps(d,_mm_setzero_ps());
613 d2 = _mm_mul_ps(d,d);
614 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
615
616 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
617
618 /* Evaluate switch function */
619 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
620 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
621 velec = _mm_mul_ps(velec,sw);
622 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
623
624 /* Update potential sum for this i atom from the interaction with this j atom. */
625 velec = _mm_and_ps(velec,cutoff_mask);
626 velecsum = _mm_add_ps(velecsum,velec);
627
628 fscal = felec;
629
630 fscal = _mm_and_ps(fscal,cutoff_mask);
631
632 /* Calculate temporary vectorial force */
633 tx = _mm_mul_ps(fscal,dx11);
634 ty = _mm_mul_ps(fscal,dy11);
635 tz = _mm_mul_ps(fscal,dz11);
636
637 /* Update vectorial force */
638 fix1 = _mm_add_ps(fix1,tx);
639 fiy1 = _mm_add_ps(fiy1,ty);
640 fiz1 = _mm_add_ps(fiz1,tz);
641
642 fjx1 = _mm_add_ps(fjx1,tx);
643 fjy1 = _mm_add_ps(fjy1,ty);
644 fjz1 = _mm_add_ps(fjz1,tz);
645
646 }
647
648 /**************************
649 * CALCULATE INTERACTIONS *
650 **************************/
651
652 if (gmx_mm_any_lt(rsq12,rcutoff2))
653 {
654
655 r12 = _mm_mul_ps(rsq12,rinv12);
656
657 /* EWALD ELECTROSTATICS */
658
659 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
660 ewrt = _mm_mul_ps(r12,ewtabscale);
661 ewitab = _mm_cvttps_epi32(ewrt);
662 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
663 ewitab = _mm_slli_epi32(ewitab,2);
664 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
665 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
666 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
667 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
668 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
669 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
670 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
671 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
672 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
673
674 d = _mm_sub_ps(r12,rswitch);
675 d = _mm_max_ps(d,_mm_setzero_ps());
676 d2 = _mm_mul_ps(d,d);
677 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
678
679 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
680
681 /* Evaluate switch function */
682 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
683 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
684 velec = _mm_mul_ps(velec,sw);
685 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
686
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velec = _mm_and_ps(velec,cutoff_mask);
689 velecsum = _mm_add_ps(velecsum,velec);
690
691 fscal = felec;
692
693 fscal = _mm_and_ps(fscal,cutoff_mask);
694
695 /* Calculate temporary vectorial force */
696 tx = _mm_mul_ps(fscal,dx12);
697 ty = _mm_mul_ps(fscal,dy12);
698 tz = _mm_mul_ps(fscal,dz12);
699
700 /* Update vectorial force */
701 fix1 = _mm_add_ps(fix1,tx);
702 fiy1 = _mm_add_ps(fiy1,ty);
703 fiz1 = _mm_add_ps(fiz1,tz);
704
705 fjx2 = _mm_add_ps(fjx2,tx);
706 fjy2 = _mm_add_ps(fjy2,ty);
707 fjz2 = _mm_add_ps(fjz2,tz);
708
709 }
710
711 /**************************
712 * CALCULATE INTERACTIONS *
713 **************************/
714
715 if (gmx_mm_any_lt(rsq20,rcutoff2))
716 {
717
718 r20 = _mm_mul_ps(rsq20,rinv20);
719
720 /* EWALD ELECTROSTATICS */
721
722 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
723 ewrt = _mm_mul_ps(r20,ewtabscale);
724 ewitab = _mm_cvttps_epi32(ewrt);
725 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
726 ewitab = _mm_slli_epi32(ewitab,2);
727 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
728 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
729 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
730 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
731 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
732 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
733 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
734 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
735 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
736
737 d = _mm_sub_ps(r20,rswitch);
738 d = _mm_max_ps(d,_mm_setzero_ps());
739 d2 = _mm_mul_ps(d,d);
740 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
741
742 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
743
744 /* Evaluate switch function */
745 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
746 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
747 velec = _mm_mul_ps(velec,sw);
748 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
749
750 /* Update potential sum for this i atom from the interaction with this j atom. */
751 velec = _mm_and_ps(velec,cutoff_mask);
752 velecsum = _mm_add_ps(velecsum,velec);
753
754 fscal = felec;
755
756 fscal = _mm_and_ps(fscal,cutoff_mask);
757
758 /* Calculate temporary vectorial force */
759 tx = _mm_mul_ps(fscal,dx20);
760 ty = _mm_mul_ps(fscal,dy20);
761 tz = _mm_mul_ps(fscal,dz20);
762
763 /* Update vectorial force */
764 fix2 = _mm_add_ps(fix2,tx);
765 fiy2 = _mm_add_ps(fiy2,ty);
766 fiz2 = _mm_add_ps(fiz2,tz);
767
768 fjx0 = _mm_add_ps(fjx0,tx);
769 fjy0 = _mm_add_ps(fjy0,ty);
770 fjz0 = _mm_add_ps(fjz0,tz);
771
772 }
773
774 /**************************
775 * CALCULATE INTERACTIONS *
776 **************************/
777
778 if (gmx_mm_any_lt(rsq21,rcutoff2))
779 {
780
781 r21 = _mm_mul_ps(rsq21,rinv21);
782
783 /* EWALD ELECTROSTATICS */
784
785 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
786 ewrt = _mm_mul_ps(r21,ewtabscale);
787 ewitab = _mm_cvttps_epi32(ewrt);
788 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
789 ewitab = _mm_slli_epi32(ewitab,2);
790 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
791 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
792 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
793 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
794 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
795 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
796 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
797 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
798 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
799
800 d = _mm_sub_ps(r21,rswitch);
801 d = _mm_max_ps(d,_mm_setzero_ps());
802 d2 = _mm_mul_ps(d,d);
803 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
804
805 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
806
807 /* Evaluate switch function */
808 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
809 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
810 velec = _mm_mul_ps(velec,sw);
811 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
812
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm_and_ps(velec,cutoff_mask);
815 velecsum = _mm_add_ps(velecsum,velec);
816
817 fscal = felec;
818
819 fscal = _mm_and_ps(fscal,cutoff_mask);
820
821 /* Calculate temporary vectorial force */
822 tx = _mm_mul_ps(fscal,dx21);
823 ty = _mm_mul_ps(fscal,dy21);
824 tz = _mm_mul_ps(fscal,dz21);
825
826 /* Update vectorial force */
827 fix2 = _mm_add_ps(fix2,tx);
828 fiy2 = _mm_add_ps(fiy2,ty);
829 fiz2 = _mm_add_ps(fiz2,tz);
830
831 fjx1 = _mm_add_ps(fjx1,tx);
832 fjy1 = _mm_add_ps(fjy1,ty);
833 fjz1 = _mm_add_ps(fjz1,tz);
834
835 }
836
837 /**************************
838 * CALCULATE INTERACTIONS *
839 **************************/
840
841 if (gmx_mm_any_lt(rsq22,rcutoff2))
842 {
843
844 r22 = _mm_mul_ps(rsq22,rinv22);
845
846 /* EWALD ELECTROSTATICS */
847
848 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
849 ewrt = _mm_mul_ps(r22,ewtabscale);
850 ewitab = _mm_cvttps_epi32(ewrt);
851 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
852 ewitab = _mm_slli_epi32(ewitab,2);
853 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
854 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
855 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
856 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
857 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
858 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
859 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
860 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
861 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
862
863 d = _mm_sub_ps(r22,rswitch);
864 d = _mm_max_ps(d,_mm_setzero_ps());
865 d2 = _mm_mul_ps(d,d);
866 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
867
868 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
869
870 /* Evaluate switch function */
871 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
872 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
873 velec = _mm_mul_ps(velec,sw);
874 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
875
876 /* Update potential sum for this i atom from the interaction with this j atom. */
877 velec = _mm_and_ps(velec,cutoff_mask);
878 velecsum = _mm_add_ps(velecsum,velec);
879
880 fscal = felec;
881
882 fscal = _mm_and_ps(fscal,cutoff_mask);
883
884 /* Calculate temporary vectorial force */
885 tx = _mm_mul_ps(fscal,dx22);
886 ty = _mm_mul_ps(fscal,dy22);
887 tz = _mm_mul_ps(fscal,dz22);
888
889 /* Update vectorial force */
890 fix2 = _mm_add_ps(fix2,tx);
891 fiy2 = _mm_add_ps(fiy2,ty);
892 fiz2 = _mm_add_ps(fiz2,tz);
893
894 fjx2 = _mm_add_ps(fjx2,tx);
895 fjy2 = _mm_add_ps(fjy2,ty);
896 fjz2 = _mm_add_ps(fjz2,tz);
897
898 }
899
900 fjptrA = f+j_coord_offsetA;
901 fjptrB = f+j_coord_offsetB;
902 fjptrC = f+j_coord_offsetC;
903 fjptrD = f+j_coord_offsetD;
904
905 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
906 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
907
908 /* Inner loop uses 603 flops */
909 }
910
911 if(jidx<j_index_end)
912 {
913
914 /* Get j neighbor index, and coordinate index */
915 jnrlistA = jjnr[jidx];
916 jnrlistB = jjnr[jidx+1];
917 jnrlistC = jjnr[jidx+2];
918 jnrlistD = jjnr[jidx+3];
919 /* Sign of each element will be negative for non-real atoms.
920 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
921 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
922 */
923 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
924 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
925 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
926 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
927 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
928 j_coord_offsetA = DIM3*jnrA;
929 j_coord_offsetB = DIM3*jnrB;
930 j_coord_offsetC = DIM3*jnrC;
931 j_coord_offsetD = DIM3*jnrD;
932
933 /* load j atom coordinates */
934 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
935 x+j_coord_offsetC,x+j_coord_offsetD,
936 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
937
938 /* Calculate displacement vector */
939 dx00 = _mm_sub_ps(ix0,jx0);
940 dy00 = _mm_sub_ps(iy0,jy0);
941 dz00 = _mm_sub_ps(iz0,jz0);
942 dx01 = _mm_sub_ps(ix0,jx1);
943 dy01 = _mm_sub_ps(iy0,jy1);
944 dz01 = _mm_sub_ps(iz0,jz1);
945 dx02 = _mm_sub_ps(ix0,jx2);
946 dy02 = _mm_sub_ps(iy0,jy2);
947 dz02 = _mm_sub_ps(iz0,jz2);
948 dx10 = _mm_sub_ps(ix1,jx0);
949 dy10 = _mm_sub_ps(iy1,jy0);
950 dz10 = _mm_sub_ps(iz1,jz0);
951 dx11 = _mm_sub_ps(ix1,jx1);
952 dy11 = _mm_sub_ps(iy1,jy1);
953 dz11 = _mm_sub_ps(iz1,jz1);
954 dx12 = _mm_sub_ps(ix1,jx2);
955 dy12 = _mm_sub_ps(iy1,jy2);
956 dz12 = _mm_sub_ps(iz1,jz2);
957 dx20 = _mm_sub_ps(ix2,jx0);
958 dy20 = _mm_sub_ps(iy2,jy0);
959 dz20 = _mm_sub_ps(iz2,jz0);
960 dx21 = _mm_sub_ps(ix2,jx1);
961 dy21 = _mm_sub_ps(iy2,jy1);
962 dz21 = _mm_sub_ps(iz2,jz1);
963 dx22 = _mm_sub_ps(ix2,jx2);
964 dy22 = _mm_sub_ps(iy2,jy2);
965 dz22 = _mm_sub_ps(iz2,jz2);
966
967 /* Calculate squared distance and things based on it */
968 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
969 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
970 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
971 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
972 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
973 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
974 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
975 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
976 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
977
978 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
979 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
980 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
981 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
982 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
983 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
984 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
985 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
986 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
987
988 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
989 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
990 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
991 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
992 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
993 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
994 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
995 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
996 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
997
998 fjx0 = _mm_setzero_ps();
999 fjy0 = _mm_setzero_ps();
1000 fjz0 = _mm_setzero_ps();
1001 fjx1 = _mm_setzero_ps();
1002 fjy1 = _mm_setzero_ps();
1003 fjz1 = _mm_setzero_ps();
1004 fjx2 = _mm_setzero_ps();
1005 fjy2 = _mm_setzero_ps();
1006 fjz2 = _mm_setzero_ps();
1007
1008 /**************************
1009 * CALCULATE INTERACTIONS *
1010 **************************/
1011
1012 if (gmx_mm_any_lt(rsq00,rcutoff2))
1013 {
1014
1015 r00 = _mm_mul_ps(rsq00,rinv00);
1016 r00 = _mm_andnot_ps(dummy_mask,r00);
1017
1018 /* EWALD ELECTROSTATICS */
1019
1020 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1021 ewrt = _mm_mul_ps(r00,ewtabscale);
1022 ewitab = _mm_cvttps_epi32(ewrt);
1023 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1024 ewitab = _mm_slli_epi32(ewitab,2);
1025 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1026 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1027 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1028 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1029 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1030 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1031 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1032 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1033 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1034
1035 /* LENNARD-JONES DISPERSION/REPULSION */
1036
1037 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1038 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1039 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1040 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1041 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1042
1043 d = _mm_sub_ps(r00,rswitch);
1044 d = _mm_max_ps(d,_mm_setzero_ps());
1045 d2 = _mm_mul_ps(d,d);
1046 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1047
1048 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1049
1050 /* Evaluate switch function */
1051 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1052 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1053 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1054 velec = _mm_mul_ps(velec,sw);
1055 vvdw = _mm_mul_ps(vvdw,sw);
1056 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1057
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _mm_and_ps(velec,cutoff_mask);
1060 velec = _mm_andnot_ps(dummy_mask,velec);
1061 velecsum = _mm_add_ps(velecsum,velec);
1062 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1063 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1064 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1065
1066 fscal = _mm_add_ps(felec,fvdw);
1067
1068 fscal = _mm_and_ps(fscal,cutoff_mask);
1069
1070 fscal = _mm_andnot_ps(dummy_mask,fscal);
1071
1072 /* Calculate temporary vectorial force */
1073 tx = _mm_mul_ps(fscal,dx00);
1074 ty = _mm_mul_ps(fscal,dy00);
1075 tz = _mm_mul_ps(fscal,dz00);
1076
1077 /* Update vectorial force */
1078 fix0 = _mm_add_ps(fix0,tx);
1079 fiy0 = _mm_add_ps(fiy0,ty);
1080 fiz0 = _mm_add_ps(fiz0,tz);
1081
1082 fjx0 = _mm_add_ps(fjx0,tx);
1083 fjy0 = _mm_add_ps(fjy0,ty);
1084 fjz0 = _mm_add_ps(fjz0,tz);
1085
1086 }
1087
1088 /**************************
1089 * CALCULATE INTERACTIONS *
1090 **************************/
1091
1092 if (gmx_mm_any_lt(rsq01,rcutoff2))
1093 {
1094
1095 r01 = _mm_mul_ps(rsq01,rinv01);
1096 r01 = _mm_andnot_ps(dummy_mask,r01);
1097
1098 /* EWALD ELECTROSTATICS */
1099
1100 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1101 ewrt = _mm_mul_ps(r01,ewtabscale);
1102 ewitab = _mm_cvttps_epi32(ewrt);
1103 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1104 ewitab = _mm_slli_epi32(ewitab,2);
1105 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1106 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1107 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1108 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1109 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1110 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1111 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1112 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1113 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1114
1115 d = _mm_sub_ps(r01,rswitch);
1116 d = _mm_max_ps(d,_mm_setzero_ps());
1117 d2 = _mm_mul_ps(d,d);
1118 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1119
1120 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1121
1122 /* Evaluate switch function */
1123 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1124 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1125 velec = _mm_mul_ps(velec,sw);
1126 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1127
1128 /* Update potential sum for this i atom from the interaction with this j atom. */
1129 velec = _mm_and_ps(velec,cutoff_mask);
1130 velec = _mm_andnot_ps(dummy_mask,velec);
1131 velecsum = _mm_add_ps(velecsum,velec);
1132
1133 fscal = felec;
1134
1135 fscal = _mm_and_ps(fscal,cutoff_mask);
1136
1137 fscal = _mm_andnot_ps(dummy_mask,fscal);
1138
1139 /* Calculate temporary vectorial force */
1140 tx = _mm_mul_ps(fscal,dx01);
1141 ty = _mm_mul_ps(fscal,dy01);
1142 tz = _mm_mul_ps(fscal,dz01);
1143
1144 /* Update vectorial force */
1145 fix0 = _mm_add_ps(fix0,tx);
1146 fiy0 = _mm_add_ps(fiy0,ty);
1147 fiz0 = _mm_add_ps(fiz0,tz);
1148
1149 fjx1 = _mm_add_ps(fjx1,tx);
1150 fjy1 = _mm_add_ps(fjy1,ty);
1151 fjz1 = _mm_add_ps(fjz1,tz);
1152
1153 }
1154
1155 /**************************
1156 * CALCULATE INTERACTIONS *
1157 **************************/
1158
1159 if (gmx_mm_any_lt(rsq02,rcutoff2))
1160 {
1161
1162 r02 = _mm_mul_ps(rsq02,rinv02);
1163 r02 = _mm_andnot_ps(dummy_mask,r02);
1164
1165 /* EWALD ELECTROSTATICS */
1166
1167 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1168 ewrt = _mm_mul_ps(r02,ewtabscale);
1169 ewitab = _mm_cvttps_epi32(ewrt);
1170 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1171 ewitab = _mm_slli_epi32(ewitab,2);
1172 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1173 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1174 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1175 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1176 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1177 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1178 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1179 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1180 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1181
1182 d = _mm_sub_ps(r02,rswitch);
1183 d = _mm_max_ps(d,_mm_setzero_ps());
1184 d2 = _mm_mul_ps(d,d);
1185 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1186
1187 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1188
1189 /* Evaluate switch function */
1190 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1191 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1192 velec = _mm_mul_ps(velec,sw);
1193 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1194
1195 /* Update potential sum for this i atom from the interaction with this j atom. */
1196 velec = _mm_and_ps(velec,cutoff_mask);
1197 velec = _mm_andnot_ps(dummy_mask,velec);
1198 velecsum = _mm_add_ps(velecsum,velec);
1199
1200 fscal = felec;
1201
1202 fscal = _mm_and_ps(fscal,cutoff_mask);
1203
1204 fscal = _mm_andnot_ps(dummy_mask,fscal);
1205
1206 /* Calculate temporary vectorial force */
1207 tx = _mm_mul_ps(fscal,dx02);
1208 ty = _mm_mul_ps(fscal,dy02);
1209 tz = _mm_mul_ps(fscal,dz02);
1210
1211 /* Update vectorial force */
1212 fix0 = _mm_add_ps(fix0,tx);
1213 fiy0 = _mm_add_ps(fiy0,ty);
1214 fiz0 = _mm_add_ps(fiz0,tz);
1215
1216 fjx2 = _mm_add_ps(fjx2,tx);
1217 fjy2 = _mm_add_ps(fjy2,ty);
1218 fjz2 = _mm_add_ps(fjz2,tz);
1219
1220 }
1221
1222 /**************************
1223 * CALCULATE INTERACTIONS *
1224 **************************/
1225
1226 if (gmx_mm_any_lt(rsq10,rcutoff2))
1227 {
1228
1229 r10 = _mm_mul_ps(rsq10,rinv10);
1230 r10 = _mm_andnot_ps(dummy_mask,r10);
1231
1232 /* EWALD ELECTROSTATICS */
1233
1234 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1235 ewrt = _mm_mul_ps(r10,ewtabscale);
1236 ewitab = _mm_cvttps_epi32(ewrt);
1237 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1238 ewitab = _mm_slli_epi32(ewitab,2);
1239 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1240 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1241 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1242 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1243 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1244 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1245 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1246 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1247 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1248
1249 d = _mm_sub_ps(r10,rswitch);
1250 d = _mm_max_ps(d,_mm_setzero_ps());
1251 d2 = _mm_mul_ps(d,d);
1252 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1253
1254 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1255
1256 /* Evaluate switch function */
1257 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1258 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1259 velec = _mm_mul_ps(velec,sw);
1260 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1261
1262 /* Update potential sum for this i atom from the interaction with this j atom. */
1263 velec = _mm_and_ps(velec,cutoff_mask);
1264 velec = _mm_andnot_ps(dummy_mask,velec);
1265 velecsum = _mm_add_ps(velecsum,velec);
1266
1267 fscal = felec;
1268
1269 fscal = _mm_and_ps(fscal,cutoff_mask);
1270
1271 fscal = _mm_andnot_ps(dummy_mask,fscal);
1272
1273 /* Calculate temporary vectorial force */
1274 tx = _mm_mul_ps(fscal,dx10);
1275 ty = _mm_mul_ps(fscal,dy10);
1276 tz = _mm_mul_ps(fscal,dz10);
1277
1278 /* Update vectorial force */
1279 fix1 = _mm_add_ps(fix1,tx);
1280 fiy1 = _mm_add_ps(fiy1,ty);
1281 fiz1 = _mm_add_ps(fiz1,tz);
1282
1283 fjx0 = _mm_add_ps(fjx0,tx);
1284 fjy0 = _mm_add_ps(fjy0,ty);
1285 fjz0 = _mm_add_ps(fjz0,tz);
1286
1287 }
1288
1289 /**************************
1290 * CALCULATE INTERACTIONS *
1291 **************************/
1292
1293 if (gmx_mm_any_lt(rsq11,rcutoff2))
1294 {
1295
1296 r11 = _mm_mul_ps(rsq11,rinv11);
1297 r11 = _mm_andnot_ps(dummy_mask,r11);
1298
1299 /* EWALD ELECTROSTATICS */
1300
1301 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1302 ewrt = _mm_mul_ps(r11,ewtabscale);
1303 ewitab = _mm_cvttps_epi32(ewrt);
1304 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1305 ewitab = _mm_slli_epi32(ewitab,2);
1306 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1307 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1308 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1309 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1310 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1311 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1312 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1313 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1314 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1315
1316 d = _mm_sub_ps(r11,rswitch);
1317 d = _mm_max_ps(d,_mm_setzero_ps());
1318 d2 = _mm_mul_ps(d,d);
1319 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1320
1321 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1322
1323 /* Evaluate switch function */
1324 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1325 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1326 velec = _mm_mul_ps(velec,sw);
1327 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1328
1329 /* Update potential sum for this i atom from the interaction with this j atom. */
1330 velec = _mm_and_ps(velec,cutoff_mask);
1331 velec = _mm_andnot_ps(dummy_mask,velec);
1332 velecsum = _mm_add_ps(velecsum,velec);
1333
1334 fscal = felec;
1335
1336 fscal = _mm_and_ps(fscal,cutoff_mask);
1337
1338 fscal = _mm_andnot_ps(dummy_mask,fscal);
1339
1340 /* Calculate temporary vectorial force */
1341 tx = _mm_mul_ps(fscal,dx11);
1342 ty = _mm_mul_ps(fscal,dy11);
1343 tz = _mm_mul_ps(fscal,dz11);
1344
1345 /* Update vectorial force */
1346 fix1 = _mm_add_ps(fix1,tx);
1347 fiy1 = _mm_add_ps(fiy1,ty);
1348 fiz1 = _mm_add_ps(fiz1,tz);
1349
1350 fjx1 = _mm_add_ps(fjx1,tx);
1351 fjy1 = _mm_add_ps(fjy1,ty);
1352 fjz1 = _mm_add_ps(fjz1,tz);
1353
1354 }
1355
1356 /**************************
1357 * CALCULATE INTERACTIONS *
1358 **************************/
1359
1360 if (gmx_mm_any_lt(rsq12,rcutoff2))
1361 {
1362
1363 r12 = _mm_mul_ps(rsq12,rinv12);
1364 r12 = _mm_andnot_ps(dummy_mask,r12);
1365
1366 /* EWALD ELECTROSTATICS */
1367
1368 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1369 ewrt = _mm_mul_ps(r12,ewtabscale);
1370 ewitab = _mm_cvttps_epi32(ewrt);
1371 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1372 ewitab = _mm_slli_epi32(ewitab,2);
1373 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1374 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1375 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1376 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1377 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1378 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1379 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1380 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1381 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1382
1383 d = _mm_sub_ps(r12,rswitch);
1384 d = _mm_max_ps(d,_mm_setzero_ps());
1385 d2 = _mm_mul_ps(d,d);
1386 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1387
1388 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1389
1390 /* Evaluate switch function */
1391 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1392 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1393 velec = _mm_mul_ps(velec,sw);
1394 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1395
1396 /* Update potential sum for this i atom from the interaction with this j atom. */
1397 velec = _mm_and_ps(velec,cutoff_mask);
1398 velec = _mm_andnot_ps(dummy_mask,velec);
1399 velecsum = _mm_add_ps(velecsum,velec);
1400
1401 fscal = felec;
1402
1403 fscal = _mm_and_ps(fscal,cutoff_mask);
1404
1405 fscal = _mm_andnot_ps(dummy_mask,fscal);
1406
1407 /* Calculate temporary vectorial force */
1408 tx = _mm_mul_ps(fscal,dx12);
1409 ty = _mm_mul_ps(fscal,dy12);
1410 tz = _mm_mul_ps(fscal,dz12);
1411
1412 /* Update vectorial force */
1413 fix1 = _mm_add_ps(fix1,tx);
1414 fiy1 = _mm_add_ps(fiy1,ty);
1415 fiz1 = _mm_add_ps(fiz1,tz);
1416
1417 fjx2 = _mm_add_ps(fjx2,tx);
1418 fjy2 = _mm_add_ps(fjy2,ty);
1419 fjz2 = _mm_add_ps(fjz2,tz);
1420
1421 }
1422
1423 /**************************
1424 * CALCULATE INTERACTIONS *
1425 **************************/
1426
1427 if (gmx_mm_any_lt(rsq20,rcutoff2))
1428 {
1429
1430 r20 = _mm_mul_ps(rsq20,rinv20);
1431 r20 = _mm_andnot_ps(dummy_mask,r20);
1432
1433 /* EWALD ELECTROSTATICS */
1434
1435 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1436 ewrt = _mm_mul_ps(r20,ewtabscale);
1437 ewitab = _mm_cvttps_epi32(ewrt);
1438 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1439 ewitab = _mm_slli_epi32(ewitab,2);
1440 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1441 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1442 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1443 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1444 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1445 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1446 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1447 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1448 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1449
1450 d = _mm_sub_ps(r20,rswitch);
1451 d = _mm_max_ps(d,_mm_setzero_ps());
1452 d2 = _mm_mul_ps(d,d);
1453 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1454
1455 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1456
1457 /* Evaluate switch function */
1458 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1459 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1460 velec = _mm_mul_ps(velec,sw);
1461 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1462
1463 /* Update potential sum for this i atom from the interaction with this j atom. */
1464 velec = _mm_and_ps(velec,cutoff_mask);
1465 velec = _mm_andnot_ps(dummy_mask,velec);
1466 velecsum = _mm_add_ps(velecsum,velec);
1467
1468 fscal = felec;
1469
1470 fscal = _mm_and_ps(fscal,cutoff_mask);
1471
1472 fscal = _mm_andnot_ps(dummy_mask,fscal);
1473
1474 /* Calculate temporary vectorial force */
1475 tx = _mm_mul_ps(fscal,dx20);
1476 ty = _mm_mul_ps(fscal,dy20);
1477 tz = _mm_mul_ps(fscal,dz20);
1478
1479 /* Update vectorial force */
1480 fix2 = _mm_add_ps(fix2,tx);
1481 fiy2 = _mm_add_ps(fiy2,ty);
1482 fiz2 = _mm_add_ps(fiz2,tz);
1483
1484 fjx0 = _mm_add_ps(fjx0,tx);
1485 fjy0 = _mm_add_ps(fjy0,ty);
1486 fjz0 = _mm_add_ps(fjz0,tz);
1487
1488 }
1489
1490 /**************************
1491 * CALCULATE INTERACTIONS *
1492 **************************/
1493
1494 if (gmx_mm_any_lt(rsq21,rcutoff2))
1495 {
1496
1497 r21 = _mm_mul_ps(rsq21,rinv21);
1498 r21 = _mm_andnot_ps(dummy_mask,r21);
1499
1500 /* EWALD ELECTROSTATICS */
1501
1502 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1503 ewrt = _mm_mul_ps(r21,ewtabscale);
1504 ewitab = _mm_cvttps_epi32(ewrt);
1505 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1506 ewitab = _mm_slli_epi32(ewitab,2);
1507 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1508 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1509 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1510 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1511 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1512 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1513 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1514 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1515 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1516
1517 d = _mm_sub_ps(r21,rswitch);
1518 d = _mm_max_ps(d,_mm_setzero_ps());
1519 d2 = _mm_mul_ps(d,d);
1520 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1521
1522 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1523
1524 /* Evaluate switch function */
1525 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1526 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1527 velec = _mm_mul_ps(velec,sw);
1528 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1529
1530 /* Update potential sum for this i atom from the interaction with this j atom. */
1531 velec = _mm_and_ps(velec,cutoff_mask);
1532 velec = _mm_andnot_ps(dummy_mask,velec);
1533 velecsum = _mm_add_ps(velecsum,velec);
1534
1535 fscal = felec;
1536
1537 fscal = _mm_and_ps(fscal,cutoff_mask);
1538
1539 fscal = _mm_andnot_ps(dummy_mask,fscal);
1540
1541 /* Calculate temporary vectorial force */
1542 tx = _mm_mul_ps(fscal,dx21);
1543 ty = _mm_mul_ps(fscal,dy21);
1544 tz = _mm_mul_ps(fscal,dz21);
1545
1546 /* Update vectorial force */
1547 fix2 = _mm_add_ps(fix2,tx);
1548 fiy2 = _mm_add_ps(fiy2,ty);
1549 fiz2 = _mm_add_ps(fiz2,tz);
1550
1551 fjx1 = _mm_add_ps(fjx1,tx);
1552 fjy1 = _mm_add_ps(fjy1,ty);
1553 fjz1 = _mm_add_ps(fjz1,tz);
1554
1555 }
1556
1557 /**************************
1558 * CALCULATE INTERACTIONS *
1559 **************************/
1560
1561 if (gmx_mm_any_lt(rsq22,rcutoff2))
1562 {
1563
1564 r22 = _mm_mul_ps(rsq22,rinv22);
1565 r22 = _mm_andnot_ps(dummy_mask,r22);
1566
1567 /* EWALD ELECTROSTATICS */
1568
1569 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1570 ewrt = _mm_mul_ps(r22,ewtabscale);
1571 ewitab = _mm_cvttps_epi32(ewrt);
1572 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1573 ewitab = _mm_slli_epi32(ewitab,2);
1574 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1575 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1576 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1577 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1578 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1579 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1580 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1581 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1582 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1583
1584 d = _mm_sub_ps(r22,rswitch);
1585 d = _mm_max_ps(d,_mm_setzero_ps());
1586 d2 = _mm_mul_ps(d,d);
1587 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1588
1589 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1590
1591 /* Evaluate switch function */
1592 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1593 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1594 velec = _mm_mul_ps(velec,sw);
1595 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1596
1597 /* Update potential sum for this i atom from the interaction with this j atom. */
1598 velec = _mm_and_ps(velec,cutoff_mask);
1599 velec = _mm_andnot_ps(dummy_mask,velec);
1600 velecsum = _mm_add_ps(velecsum,velec);
1601
1602 fscal = felec;
1603
1604 fscal = _mm_and_ps(fscal,cutoff_mask);
1605
1606 fscal = _mm_andnot_ps(dummy_mask,fscal);
1607
1608 /* Calculate temporary vectorial force */
1609 tx = _mm_mul_ps(fscal,dx22);
1610 ty = _mm_mul_ps(fscal,dy22);
1611 tz = _mm_mul_ps(fscal,dz22);
1612
1613 /* Update vectorial force */
1614 fix2 = _mm_add_ps(fix2,tx);
1615 fiy2 = _mm_add_ps(fiy2,ty);
1616 fiz2 = _mm_add_ps(fiz2,tz);
1617
1618 fjx2 = _mm_add_ps(fjx2,tx);
1619 fjy2 = _mm_add_ps(fjy2,ty);
1620 fjz2 = _mm_add_ps(fjz2,tz);
1621
1622 }
1623
1624 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1625 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1626 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1627 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1628
1629 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1630 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1631
1632 /* Inner loop uses 612 flops */
1633 }
1634
1635 /* End of innermost loop */
1636
1637 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1638 f+i_coord_offset,fshift+i_shift_offset);
1639
1640 ggid = gid[iidx];
1641 /* Update potential energies */
1642 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1643 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1644
1645 /* Increment number of inner iterations */
1646 inneriter += j_index_end - j_index_start;
1647
1648 /* Outer loop uses 20 flops */
1649 }
1650
1651 /* Increment number of outer iterations */
1652 outeriter += nri;
1653
1654 /* Update outer/inner flops */
1655
1656 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_VF] += outeriter*20 +
inneriter*612
;
1657}
1658/*
1659 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single
1660 * Electrostatics interaction: Ewald
1661 * VdW interaction: LennardJones
1662 * Geometry: Water3-Water3
1663 * Calculate force/pot: Force
1664 */
1665void
1666nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single
1667 (t_nblist * gmx_restrict nlist,
1668 rvec * gmx_restrict xx,
1669 rvec * gmx_restrict ff,
1670 t_forcerec * gmx_restrict fr,
1671 t_mdatoms * gmx_restrict mdatoms,
1672 nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1673 t_nrnb * gmx_restrict nrnb)
1674{
1675 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1676 * just 0 for non-waters.
1677 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1678 * jnr indices corresponding to data put in the four positions in the SIMD register.
1679 */
1680 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1681 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1682 int jnrA,jnrB,jnrC,jnrD;
1683 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1684 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1685 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1686 real rcutoff_scalar;
1687 real *shiftvec,*fshift,*x,*f;
1688 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1689 real scratch[4*DIM3];
1690 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1691 int vdwioffset0;
1692 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1693 int vdwioffset1;
1694 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1695 int vdwioffset2;
1696 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1697 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1698 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1699 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1700 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1701 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1702 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1703 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1704 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1705 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1706 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1707 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1708 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1709 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1710 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1711 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1712 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1713 real *charge;
1714 int nvdwtype;
1715 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1716 int *vdwtype;
1717 real *vdwparam;
1718 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1719 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1720 __m128i ewitab;
1721 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1722 real *ewtab;
1723 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1724 real rswitch_scalar,d_scalar;
1725 __m128 dummy_mask,cutoff_mask;
1726 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1727 __m128 one = _mm_set1_ps(1.0);
1728 __m128 two = _mm_set1_ps(2.0);
1729 x = xx[0];
1730 f = ff[0];
1731
1732 nri = nlist->nri;
1733 iinr = nlist->iinr;
1734 jindex = nlist->jindex;
1735 jjnr = nlist->jjnr;
1736 shiftidx = nlist->shift;
1737 gid = nlist->gid;
1738 shiftvec = fr->shift_vec[0];
1739 fshift = fr->fshift[0];
1740 facel = _mm_set1_ps(fr->epsfac);
1741 charge = mdatoms->chargeA;
1742 nvdwtype = fr->ntype;
1743 vdwparam = fr->nbfp;
1744 vdwtype = mdatoms->typeA;
1745
1746 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1747 ewtab = fr->ic->tabq_coul_FDV0;
1748 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1749 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1750
1751 /* Setup water-specific parameters */
1752 inr = nlist->iinr[0];
1753 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1754 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1755 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1756 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1757
1758 jq0 = _mm_set1_ps(charge[inr+0]);
1759 jq1 = _mm_set1_ps(charge[inr+1]);
1760 jq2 = _mm_set1_ps(charge[inr+2]);
1761 vdwjidx0A = 2*vdwtype[inr+0];
1762 qq00 = _mm_mul_ps(iq0,jq0);
1763 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1764 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1765 qq01 = _mm_mul_ps(iq0,jq1);
1766 qq02 = _mm_mul_ps(iq0,jq2);
1767 qq10 = _mm_mul_ps(iq1,jq0);
1768 qq11 = _mm_mul_ps(iq1,jq1);
1769 qq12 = _mm_mul_ps(iq1,jq2);
1770 qq20 = _mm_mul_ps(iq2,jq0);
1771 qq21 = _mm_mul_ps(iq2,jq1);
1772 qq22 = _mm_mul_ps(iq2,jq2);
1773
1774 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1775 rcutoff_scalar = fr->rcoulomb;
1776 rcutoff = _mm_set1_ps(rcutoff_scalar);
1777 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1778
1779 rswitch_scalar = fr->rcoulomb_switch;
1780 rswitch = _mm_set1_ps(rswitch_scalar);
1781 /* Setup switch parameters */
1782 d_scalar = rcutoff_scalar-rswitch_scalar;
1783 d = _mm_set1_ps(d_scalar);
1784 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1785 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1786 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1787 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1788 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1789 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1790
1791 /* Avoid stupid compiler warnings */
1792 jnrA = jnrB = jnrC = jnrD = 0;
1793 j_coord_offsetA = 0;
1794 j_coord_offsetB = 0;
1795 j_coord_offsetC = 0;
1796 j_coord_offsetD = 0;
1797
1798 outeriter = 0;
1799 inneriter = 0;
1800
1801 for(iidx=0;iidx<4*DIM3;iidx++)
1802 {
1803 scratch[iidx] = 0.0;
1804 }
1805
1806 /* Start outer loop over neighborlists */
1807 for(iidx=0; iidx<nri; iidx++)
1808 {
1809 /* Load shift vector for this list */
1810 i_shift_offset = DIM3*shiftidx[iidx];
1811
1812 /* Load limits for loop over neighbors */
1813 j_index_start = jindex[iidx];
1814 j_index_end = jindex[iidx+1];
1815
1816 /* Get outer coordinate index */
1817 inr = iinr[iidx];
1818 i_coord_offset = DIM3*inr;
1819
1820 /* Load i particle coords and add shift vector */
1821 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1822 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1823
1824 fix0 = _mm_setzero_ps();
1825 fiy0 = _mm_setzero_ps();
1826 fiz0 = _mm_setzero_ps();
1827 fix1 = _mm_setzero_ps();
1828 fiy1 = _mm_setzero_ps();
1829 fiz1 = _mm_setzero_ps();
1830 fix2 = _mm_setzero_ps();
1831 fiy2 = _mm_setzero_ps();
1832 fiz2 = _mm_setzero_ps();
1833
1834 /* Start inner kernel loop */
1835 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1836 {
1837
1838 /* Get j neighbor index, and coordinate index */
1839 jnrA = jjnr[jidx];
1840 jnrB = jjnr[jidx+1];
1841 jnrC = jjnr[jidx+2];
1842 jnrD = jjnr[jidx+3];
1843 j_coord_offsetA = DIM3*jnrA;
1844 j_coord_offsetB = DIM3*jnrB;
1845 j_coord_offsetC = DIM3*jnrC;
1846 j_coord_offsetD = DIM3*jnrD;
1847
1848 /* load j atom coordinates */
1849 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1850 x+j_coord_offsetC,x+j_coord_offsetD,
1851 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1852
1853 /* Calculate displacement vector */
1854 dx00 = _mm_sub_ps(ix0,jx0);
1855 dy00 = _mm_sub_ps(iy0,jy0);
1856 dz00 = _mm_sub_ps(iz0,jz0);
1857 dx01 = _mm_sub_ps(ix0,jx1);
1858 dy01 = _mm_sub_ps(iy0,jy1);
1859 dz01 = _mm_sub_ps(iz0,jz1);
1860 dx02 = _mm_sub_ps(ix0,jx2);
1861 dy02 = _mm_sub_ps(iy0,jy2);
1862 dz02 = _mm_sub_ps(iz0,jz2);
1863 dx10 = _mm_sub_ps(ix1,jx0);
1864 dy10 = _mm_sub_ps(iy1,jy0);
1865 dz10 = _mm_sub_ps(iz1,jz0);
1866 dx11 = _mm_sub_ps(ix1,jx1);
1867 dy11 = _mm_sub_ps(iy1,jy1);
1868 dz11 = _mm_sub_ps(iz1,jz1);
1869 dx12 = _mm_sub_ps(ix1,jx2);
1870 dy12 = _mm_sub_ps(iy1,jy2);
1871 dz12 = _mm_sub_ps(iz1,jz2);
1872 dx20 = _mm_sub_ps(ix2,jx0);
1873 dy20 = _mm_sub_ps(iy2,jy0);
1874 dz20 = _mm_sub_ps(iz2,jz0);
1875 dx21 = _mm_sub_ps(ix2,jx1);
1876 dy21 = _mm_sub_ps(iy2,jy1);
1877 dz21 = _mm_sub_ps(iz2,jz1);
1878 dx22 = _mm_sub_ps(ix2,jx2);
1879 dy22 = _mm_sub_ps(iy2,jy2);
1880 dz22 = _mm_sub_ps(iz2,jz2);
1881
1882 /* Calculate squared distance and things based on it */
1883 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1884 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1885 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1886 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1887 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1888 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1889 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1890 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1891 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1892
1893 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1894 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
1895 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
1896 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1897 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1898 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1899 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1900 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1901 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1902
1903 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1904 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1905 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1906 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1907 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1908 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1909 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1910 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1911 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1912
1913 fjx0 = _mm_setzero_ps();
1914 fjy0 = _mm_setzero_ps();
1915 fjz0 = _mm_setzero_ps();
1916 fjx1 = _mm_setzero_ps();
1917 fjy1 = _mm_setzero_ps();
1918 fjz1 = _mm_setzero_ps();
1919 fjx2 = _mm_setzero_ps();
1920 fjy2 = _mm_setzero_ps();
1921 fjz2 = _mm_setzero_ps();
1922
1923 /**************************
1924 * CALCULATE INTERACTIONS *
1925 **************************/
1926
1927 if (gmx_mm_any_lt(rsq00,rcutoff2))
1928 {
1929
1930 r00 = _mm_mul_ps(rsq00,rinv00);
1931
1932 /* EWALD ELECTROSTATICS */
1933
1934 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1935 ewrt = _mm_mul_ps(r00,ewtabscale);
1936 ewitab = _mm_cvttps_epi32(ewrt);
1937 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
1938 ewitab = _mm_slli_epi32(ewitab,2);
1939 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
1940 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
1941 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
1942 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
1943 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
1944 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1945 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1946 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1947 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1948
1949 /* LENNARD-JONES DISPERSION/REPULSION */
1950
1951 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1952 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1953 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1954 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1955 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1956
1957 d = _mm_sub_ps(r00,rswitch);
1958 d = _mm_max_ps(d,_mm_setzero_ps());
1959 d2 = _mm_mul_ps(d,d);
1960 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1961
1962 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1963
1964 /* Evaluate switch function */
1965 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1966 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1967 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1968 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1969
1970 fscal = _mm_add_ps(felec,fvdw);
1971
1972 fscal = _mm_and_ps(fscal,cutoff_mask);
1973
1974 /* Calculate temporary vectorial force */
1975 tx = _mm_mul_ps(fscal,dx00);
1976 ty = _mm_mul_ps(fscal,dy00);
1977 tz = _mm_mul_ps(fscal,dz00);
1978
1979 /* Update vectorial force */
1980 fix0 = _mm_add_ps(fix0,tx);
1981 fiy0 = _mm_add_ps(fiy0,ty);
1982 fiz0 = _mm_add_ps(fiz0,tz);
1983
1984 fjx0 = _mm_add_ps(fjx0,tx);
1985 fjy0 = _mm_add_ps(fjy0,ty);
1986 fjz0 = _mm_add_ps(fjz0,tz);
1987
1988 }
1989
1990 /**************************
1991 * CALCULATE INTERACTIONS *
1992 **************************/
1993
1994 if (gmx_mm_any_lt(rsq01,rcutoff2))
1995 {
1996
1997 r01 = _mm_mul_ps(rsq01,rinv01);
1998
1999 /* EWALD ELECTROSTATICS */
2000
2001 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2002 ewrt = _mm_mul_ps(r01,ewtabscale);
2003 ewitab = _mm_cvttps_epi32(ewrt);
2004 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2005 ewitab = _mm_slli_epi32(ewitab,2);
2006 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2007 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2008 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2009 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2010 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2011 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2012 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2013 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2014 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2015
2016 d = _mm_sub_ps(r01,rswitch);
2017 d = _mm_max_ps(d,_mm_setzero_ps());
2018 d2 = _mm_mul_ps(d,d);
2019 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2020
2021 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2022
2023 /* Evaluate switch function */
2024 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2025 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2026 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2027
2028 fscal = felec;
2029
2030 fscal = _mm_and_ps(fscal,cutoff_mask);
2031
2032 /* Calculate temporary vectorial force */
2033 tx = _mm_mul_ps(fscal,dx01);
2034 ty = _mm_mul_ps(fscal,dy01);
2035 tz = _mm_mul_ps(fscal,dz01);
2036
2037 /* Update vectorial force */
2038 fix0 = _mm_add_ps(fix0,tx);
2039 fiy0 = _mm_add_ps(fiy0,ty);
2040 fiz0 = _mm_add_ps(fiz0,tz);
2041
2042 fjx1 = _mm_add_ps(fjx1,tx);
2043 fjy1 = _mm_add_ps(fjy1,ty);
2044 fjz1 = _mm_add_ps(fjz1,tz);
2045
2046 }
2047
2048 /**************************
2049 * CALCULATE INTERACTIONS *
2050 **************************/
2051
2052 if (gmx_mm_any_lt(rsq02,rcutoff2))
2053 {
2054
2055 r02 = _mm_mul_ps(rsq02,rinv02);
2056
2057 /* EWALD ELECTROSTATICS */
2058
2059 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2060 ewrt = _mm_mul_ps(r02,ewtabscale);
2061 ewitab = _mm_cvttps_epi32(ewrt);
2062 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2063 ewitab = _mm_slli_epi32(ewitab,2);
2064 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2065 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2066 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2067 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2068 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2069 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2070 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2071 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2072 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2073
2074 d = _mm_sub_ps(r02,rswitch);
2075 d = _mm_max_ps(d,_mm_setzero_ps());
2076 d2 = _mm_mul_ps(d,d);
2077 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2078
2079 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2080
2081 /* Evaluate switch function */
2082 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2083 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2084 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2085
2086 fscal = felec;
2087
2088 fscal = _mm_and_ps(fscal,cutoff_mask);
2089
2090 /* Calculate temporary vectorial force */
2091 tx = _mm_mul_ps(fscal,dx02);
2092 ty = _mm_mul_ps(fscal,dy02);
2093 tz = _mm_mul_ps(fscal,dz02);
2094
2095 /* Update vectorial force */
2096 fix0 = _mm_add_ps(fix0,tx);
2097 fiy0 = _mm_add_ps(fiy0,ty);
2098 fiz0 = _mm_add_ps(fiz0,tz);
2099
2100 fjx2 = _mm_add_ps(fjx2,tx);
2101 fjy2 = _mm_add_ps(fjy2,ty);
2102 fjz2 = _mm_add_ps(fjz2,tz);
2103
2104 }
2105
2106 /**************************
2107 * CALCULATE INTERACTIONS *
2108 **************************/
2109
2110 if (gmx_mm_any_lt(rsq10,rcutoff2))
2111 {
2112
2113 r10 = _mm_mul_ps(rsq10,rinv10);
2114
2115 /* EWALD ELECTROSTATICS */
2116
2117 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2118 ewrt = _mm_mul_ps(r10,ewtabscale);
2119 ewitab = _mm_cvttps_epi32(ewrt);
2120 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2121 ewitab = _mm_slli_epi32(ewitab,2);
2122 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2123 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2124 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2125 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2126 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2127 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2128 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2129 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2130 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2131
2132 d = _mm_sub_ps(r10,rswitch);
2133 d = _mm_max_ps(d,_mm_setzero_ps());
2134 d2 = _mm_mul_ps(d,d);
2135 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2136
2137 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2138
2139 /* Evaluate switch function */
2140 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2141 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2142 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2143
2144 fscal = felec;
2145
2146 fscal = _mm_and_ps(fscal,cutoff_mask);
2147
2148 /* Calculate temporary vectorial force */
2149 tx = _mm_mul_ps(fscal,dx10);
2150 ty = _mm_mul_ps(fscal,dy10);
2151 tz = _mm_mul_ps(fscal,dz10);
2152
2153 /* Update vectorial force */
2154 fix1 = _mm_add_ps(fix1,tx);
2155 fiy1 = _mm_add_ps(fiy1,ty);
2156 fiz1 = _mm_add_ps(fiz1,tz);
2157
2158 fjx0 = _mm_add_ps(fjx0,tx);
2159 fjy0 = _mm_add_ps(fjy0,ty);
2160 fjz0 = _mm_add_ps(fjz0,tz);
2161
2162 }
2163
2164 /**************************
2165 * CALCULATE INTERACTIONS *
2166 **************************/
2167
2168 if (gmx_mm_any_lt(rsq11,rcutoff2))
2169 {
2170
2171 r11 = _mm_mul_ps(rsq11,rinv11);
2172
2173 /* EWALD ELECTROSTATICS */
2174
2175 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2176 ewrt = _mm_mul_ps(r11,ewtabscale);
2177 ewitab = _mm_cvttps_epi32(ewrt);
2178 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2179 ewitab = _mm_slli_epi32(ewitab,2);
2180 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2181 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2182 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2183 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2184 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2185 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2186 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2187 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2188 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2189
2190 d = _mm_sub_ps(r11,rswitch);
2191 d = _mm_max_ps(d,_mm_setzero_ps());
2192 d2 = _mm_mul_ps(d,d);
2193 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2194
2195 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2196
2197 /* Evaluate switch function */
2198 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2199 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2200 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2201
2202 fscal = felec;
2203
2204 fscal = _mm_and_ps(fscal,cutoff_mask);
2205
2206 /* Calculate temporary vectorial force */
2207 tx = _mm_mul_ps(fscal,dx11);
2208 ty = _mm_mul_ps(fscal,dy11);
2209 tz = _mm_mul_ps(fscal,dz11);
2210
2211 /* Update vectorial force */
2212 fix1 = _mm_add_ps(fix1,tx);
2213 fiy1 = _mm_add_ps(fiy1,ty);
2214 fiz1 = _mm_add_ps(fiz1,tz);
2215
2216 fjx1 = _mm_add_ps(fjx1,tx);
2217 fjy1 = _mm_add_ps(fjy1,ty);
2218 fjz1 = _mm_add_ps(fjz1,tz);
2219
2220 }
2221
2222 /**************************
2223 * CALCULATE INTERACTIONS *
2224 **************************/
2225
2226 if (gmx_mm_any_lt(rsq12,rcutoff2))
2227 {
2228
2229 r12 = _mm_mul_ps(rsq12,rinv12);
2230
2231 /* EWALD ELECTROSTATICS */
2232
2233 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2234 ewrt = _mm_mul_ps(r12,ewtabscale);
2235 ewitab = _mm_cvttps_epi32(ewrt);
2236 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2237 ewitab = _mm_slli_epi32(ewitab,2);
2238 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2239 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2240 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2241 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2242 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2243 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2244 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2245 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2246 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2247
2248 d = _mm_sub_ps(r12,rswitch);
2249 d = _mm_max_ps(d,_mm_setzero_ps());
2250 d2 = _mm_mul_ps(d,d);
2251 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2252
2253 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2254
2255 /* Evaluate switch function */
2256 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2257 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2258 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2259
2260 fscal = felec;
2261
2262 fscal = _mm_and_ps(fscal,cutoff_mask);
2263
2264 /* Calculate temporary vectorial force */
2265 tx = _mm_mul_ps(fscal,dx12);
2266 ty = _mm_mul_ps(fscal,dy12);
2267 tz = _mm_mul_ps(fscal,dz12);
2268
2269 /* Update vectorial force */
2270 fix1 = _mm_add_ps(fix1,tx);
2271 fiy1 = _mm_add_ps(fiy1,ty);
2272 fiz1 = _mm_add_ps(fiz1,tz);
2273
2274 fjx2 = _mm_add_ps(fjx2,tx);
2275 fjy2 = _mm_add_ps(fjy2,ty);
2276 fjz2 = _mm_add_ps(fjz2,tz);
2277
2278 }
2279
2280 /**************************
2281 * CALCULATE INTERACTIONS *
2282 **************************/
2283
2284 if (gmx_mm_any_lt(rsq20,rcutoff2))
2285 {
2286
2287 r20 = _mm_mul_ps(rsq20,rinv20);
2288
2289 /* EWALD ELECTROSTATICS */
2290
2291 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2292 ewrt = _mm_mul_ps(r20,ewtabscale);
2293 ewitab = _mm_cvttps_epi32(ewrt);
2294 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2295 ewitab = _mm_slli_epi32(ewitab,2);
2296 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2297 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2298 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2299 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2300 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2301 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2302 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2303 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2304 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2305
2306 d = _mm_sub_ps(r20,rswitch);
2307 d = _mm_max_ps(d,_mm_setzero_ps());
2308 d2 = _mm_mul_ps(d,d);
2309 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2310
2311 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2312
2313 /* Evaluate switch function */
2314 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2315 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2316 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2317
2318 fscal = felec;
2319
2320 fscal = _mm_and_ps(fscal,cutoff_mask);
2321
2322 /* Calculate temporary vectorial force */
2323 tx = _mm_mul_ps(fscal,dx20);
2324 ty = _mm_mul_ps(fscal,dy20);
2325 tz = _mm_mul_ps(fscal,dz20);
2326
2327 /* Update vectorial force */
2328 fix2 = _mm_add_ps(fix2,tx);
2329 fiy2 = _mm_add_ps(fiy2,ty);
2330 fiz2 = _mm_add_ps(fiz2,tz);
2331
2332 fjx0 = _mm_add_ps(fjx0,tx);
2333 fjy0 = _mm_add_ps(fjy0,ty);
2334 fjz0 = _mm_add_ps(fjz0,tz);
2335
2336 }
2337
2338 /**************************
2339 * CALCULATE INTERACTIONS *
2340 **************************/
2341
2342 if (gmx_mm_any_lt(rsq21,rcutoff2))
2343 {
2344
2345 r21 = _mm_mul_ps(rsq21,rinv21);
2346
2347 /* EWALD ELECTROSTATICS */
2348
2349 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2350 ewrt = _mm_mul_ps(r21,ewtabscale);
2351 ewitab = _mm_cvttps_epi32(ewrt);
2352 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2353 ewitab = _mm_slli_epi32(ewitab,2);
2354 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2355 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2356 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2357 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2358 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2359 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2360 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2361 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2362 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2363
2364 d = _mm_sub_ps(r21,rswitch);
2365 d = _mm_max_ps(d,_mm_setzero_ps());
2366 d2 = _mm_mul_ps(d,d);
2367 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2368
2369 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2370
2371 /* Evaluate switch function */
2372 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2373 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2374 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2375
2376 fscal = felec;
2377
2378 fscal = _mm_and_ps(fscal,cutoff_mask);
2379
2380 /* Calculate temporary vectorial force */
2381 tx = _mm_mul_ps(fscal,dx21);
2382 ty = _mm_mul_ps(fscal,dy21);
2383 tz = _mm_mul_ps(fscal,dz21);
2384
2385 /* Update vectorial force */
2386 fix2 = _mm_add_ps(fix2,tx);
2387 fiy2 = _mm_add_ps(fiy2,ty);
2388 fiz2 = _mm_add_ps(fiz2,tz);
2389
2390 fjx1 = _mm_add_ps(fjx1,tx);
2391 fjy1 = _mm_add_ps(fjy1,ty);
2392 fjz1 = _mm_add_ps(fjz1,tz);
2393
2394 }
2395
2396 /**************************
2397 * CALCULATE INTERACTIONS *
2398 **************************/
2399
2400 if (gmx_mm_any_lt(rsq22,rcutoff2))
2401 {
2402
2403 r22 = _mm_mul_ps(rsq22,rinv22);
2404
2405 /* EWALD ELECTROSTATICS */
2406
2407 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2408 ewrt = _mm_mul_ps(r22,ewtabscale);
2409 ewitab = _mm_cvttps_epi32(ewrt);
2410 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2411 ewitab = _mm_slli_epi32(ewitab,2);
2412 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2413 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2414 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2415 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2416 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2417 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2418 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2419 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2420 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2421
2422 d = _mm_sub_ps(r22,rswitch);
2423 d = _mm_max_ps(d,_mm_setzero_ps());
2424 d2 = _mm_mul_ps(d,d);
2425 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2426
2427 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2428
2429 /* Evaluate switch function */
2430 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2431 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2432 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2433
2434 fscal = felec;
2435
2436 fscal = _mm_and_ps(fscal,cutoff_mask);
2437
2438 /* Calculate temporary vectorial force */
2439 tx = _mm_mul_ps(fscal,dx22);
2440 ty = _mm_mul_ps(fscal,dy22);
2441 tz = _mm_mul_ps(fscal,dz22);
2442
2443 /* Update vectorial force */
2444 fix2 = _mm_add_ps(fix2,tx);
2445 fiy2 = _mm_add_ps(fiy2,ty);
2446 fiz2 = _mm_add_ps(fiz2,tz);
2447
2448 fjx2 = _mm_add_ps(fjx2,tx);
2449 fjy2 = _mm_add_ps(fjy2,ty);
2450 fjz2 = _mm_add_ps(fjz2,tz);
2451
2452 }
2453
2454 fjptrA = f+j_coord_offsetA;
2455 fjptrB = f+j_coord_offsetB;
2456 fjptrC = f+j_coord_offsetC;
2457 fjptrD = f+j_coord_offsetD;
2458
2459 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2460 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2461
2462 /* Inner loop uses 573 flops */
2463 }
2464
2465 if(jidx<j_index_end)
2466 {
2467
2468 /* Get j neighbor index, and coordinate index */
2469 jnrlistA = jjnr[jidx];
2470 jnrlistB = jjnr[jidx+1];
2471 jnrlistC = jjnr[jidx+2];
2472 jnrlistD = jjnr[jidx+3];
2473 /* Sign of each element will be negative for non-real atoms.
2474 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2475 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2476 */
2477 dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2478 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2479 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2480 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2481 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2482 j_coord_offsetA = DIM3*jnrA;
2483 j_coord_offsetB = DIM3*jnrB;
2484 j_coord_offsetC = DIM3*jnrC;
2485 j_coord_offsetD = DIM3*jnrD;
2486
2487 /* load j atom coordinates */
2488 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2489 x+j_coord_offsetC,x+j_coord_offsetD,
2490 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2491
2492 /* Calculate displacement vector */
2493 dx00 = _mm_sub_ps(ix0,jx0);
2494 dy00 = _mm_sub_ps(iy0,jy0);
2495 dz00 = _mm_sub_ps(iz0,jz0);
2496 dx01 = _mm_sub_ps(ix0,jx1);
2497 dy01 = _mm_sub_ps(iy0,jy1);
2498 dz01 = _mm_sub_ps(iz0,jz1);
2499 dx02 = _mm_sub_ps(ix0,jx2);
2500 dy02 = _mm_sub_ps(iy0,jy2);
2501 dz02 = _mm_sub_ps(iz0,jz2);
2502 dx10 = _mm_sub_ps(ix1,jx0);
2503 dy10 = _mm_sub_ps(iy1,jy0);
2504 dz10 = _mm_sub_ps(iz1,jz0);
2505 dx11 = _mm_sub_ps(ix1,jx1);
2506 dy11 = _mm_sub_ps(iy1,jy1);
2507 dz11 = _mm_sub_ps(iz1,jz1);
2508 dx12 = _mm_sub_ps(ix1,jx2);
2509 dy12 = _mm_sub_ps(iy1,jy2);
2510 dz12 = _mm_sub_ps(iz1,jz2);
2511 dx20 = _mm_sub_ps(ix2,jx0);
2512 dy20 = _mm_sub_ps(iy2,jy0);
2513 dz20 = _mm_sub_ps(iz2,jz0);
2514 dx21 = _mm_sub_ps(ix2,jx1);
2515 dy21 = _mm_sub_ps(iy2,jy1);
2516 dz21 = _mm_sub_ps(iz2,jz1);
2517 dx22 = _mm_sub_ps(ix2,jx2);
2518 dy22 = _mm_sub_ps(iy2,jy2);
2519 dz22 = _mm_sub_ps(iz2,jz2);
2520
2521 /* Calculate squared distance and things based on it */
2522 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2523 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2524 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2525 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2526 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2527 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2528 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2529 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2530 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2531
2532 rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
2533 rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
2534 rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
2535 rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
2536 rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
2537 rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
2538 rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
2539 rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
2540 rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
2541
2542 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2543 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2544 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2545 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2546 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2547 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2548 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2549 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2550 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2551
2552 fjx0 = _mm_setzero_ps();
2553 fjy0 = _mm_setzero_ps();
2554 fjz0 = _mm_setzero_ps();
2555 fjx1 = _mm_setzero_ps();
2556 fjy1 = _mm_setzero_ps();
2557 fjz1 = _mm_setzero_ps();
2558 fjx2 = _mm_setzero_ps();
2559 fjy2 = _mm_setzero_ps();
2560 fjz2 = _mm_setzero_ps();
2561
2562 /**************************
2563 * CALCULATE INTERACTIONS *
2564 **************************/
2565
2566 if (gmx_mm_any_lt(rsq00,rcutoff2))
2567 {
2568
2569 r00 = _mm_mul_ps(rsq00,rinv00);
2570 r00 = _mm_andnot_ps(dummy_mask,r00);
2571
2572 /* EWALD ELECTROSTATICS */
2573
2574 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2575 ewrt = _mm_mul_ps(r00,ewtabscale);
2576 ewitab = _mm_cvttps_epi32(ewrt);
2577 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2578 ewitab = _mm_slli_epi32(ewitab,2);
2579 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2580 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2581 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2582 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2583 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2584 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2585 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2586 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2587 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2588
2589 /* LENNARD-JONES DISPERSION/REPULSION */
2590
2591 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2592 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2593 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2594 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2595 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2596
2597 d = _mm_sub_ps(r00,rswitch);
2598 d = _mm_max_ps(d,_mm_setzero_ps());
2599 d2 = _mm_mul_ps(d,d);
2600 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2601
2602 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2603
2604 /* Evaluate switch function */
2605 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2606 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2607 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2608 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2609
2610 fscal = _mm_add_ps(felec,fvdw);
2611
2612 fscal = _mm_and_ps(fscal,cutoff_mask);
2613
2614 fscal = _mm_andnot_ps(dummy_mask,fscal);
2615
2616 /* Calculate temporary vectorial force */
2617 tx = _mm_mul_ps(fscal,dx00);
2618 ty = _mm_mul_ps(fscal,dy00);
2619 tz = _mm_mul_ps(fscal,dz00);
2620
2621 /* Update vectorial force */
2622 fix0 = _mm_add_ps(fix0,tx);
2623 fiy0 = _mm_add_ps(fiy0,ty);
2624 fiz0 = _mm_add_ps(fiz0,tz);
2625
2626 fjx0 = _mm_add_ps(fjx0,tx);
2627 fjy0 = _mm_add_ps(fjy0,ty);
2628 fjz0 = _mm_add_ps(fjz0,tz);
2629
2630 }
2631
2632 /**************************
2633 * CALCULATE INTERACTIONS *
2634 **************************/
2635
2636 if (gmx_mm_any_lt(rsq01,rcutoff2))
2637 {
2638
2639 r01 = _mm_mul_ps(rsq01,rinv01);
2640 r01 = _mm_andnot_ps(dummy_mask,r01);
2641
2642 /* EWALD ELECTROSTATICS */
2643
2644 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2645 ewrt = _mm_mul_ps(r01,ewtabscale);
2646 ewitab = _mm_cvttps_epi32(ewrt);
2647 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2648 ewitab = _mm_slli_epi32(ewitab,2);
2649 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2650 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2651 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2652 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2653 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2654 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2655 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2656 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2657 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2658
2659 d = _mm_sub_ps(r01,rswitch);
2660 d = _mm_max_ps(d,_mm_setzero_ps());
2661 d2 = _mm_mul_ps(d,d);
2662 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2663
2664 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2665
2666 /* Evaluate switch function */
2667 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2668 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2669 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2670
2671 fscal = felec;
2672
2673 fscal = _mm_and_ps(fscal,cutoff_mask);
2674
2675 fscal = _mm_andnot_ps(dummy_mask,fscal);
2676
2677 /* Calculate temporary vectorial force */
2678 tx = _mm_mul_ps(fscal,dx01);
2679 ty = _mm_mul_ps(fscal,dy01);
2680 tz = _mm_mul_ps(fscal,dz01);
2681
2682 /* Update vectorial force */
2683 fix0 = _mm_add_ps(fix0,tx);
2684 fiy0 = _mm_add_ps(fiy0,ty);
2685 fiz0 = _mm_add_ps(fiz0,tz);
2686
2687 fjx1 = _mm_add_ps(fjx1,tx);
2688 fjy1 = _mm_add_ps(fjy1,ty);
2689 fjz1 = _mm_add_ps(fjz1,tz);
2690
2691 }
2692
2693 /**************************
2694 * CALCULATE INTERACTIONS *
2695 **************************/
2696
2697 if (gmx_mm_any_lt(rsq02,rcutoff2))
2698 {
2699
2700 r02 = _mm_mul_ps(rsq02,rinv02);
2701 r02 = _mm_andnot_ps(dummy_mask,r02);
2702
2703 /* EWALD ELECTROSTATICS */
2704
2705 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2706 ewrt = _mm_mul_ps(r02,ewtabscale);
2707 ewitab = _mm_cvttps_epi32(ewrt);
2708 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2709 ewitab = _mm_slli_epi32(ewitab,2);
2710 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2711 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2712 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2713 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2714 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2715 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2716 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2717 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2718 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2719
2720 d = _mm_sub_ps(r02,rswitch);
2721 d = _mm_max_ps(d,_mm_setzero_ps());
2722 d2 = _mm_mul_ps(d,d);
2723 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2724
2725 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2726
2727 /* Evaluate switch function */
2728 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2729 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2730 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2731
2732 fscal = felec;
2733
2734 fscal = _mm_and_ps(fscal,cutoff_mask);
2735
2736 fscal = _mm_andnot_ps(dummy_mask,fscal);
2737
2738 /* Calculate temporary vectorial force */
2739 tx = _mm_mul_ps(fscal,dx02);
2740 ty = _mm_mul_ps(fscal,dy02);
2741 tz = _mm_mul_ps(fscal,dz02);
2742
2743 /* Update vectorial force */
2744 fix0 = _mm_add_ps(fix0,tx);
2745 fiy0 = _mm_add_ps(fiy0,ty);
2746 fiz0 = _mm_add_ps(fiz0,tz);
2747
2748 fjx2 = _mm_add_ps(fjx2,tx);
2749 fjy2 = _mm_add_ps(fjy2,ty);
2750 fjz2 = _mm_add_ps(fjz2,tz);
2751
2752 }
2753
2754 /**************************
2755 * CALCULATE INTERACTIONS *
2756 **************************/
2757
2758 if (gmx_mm_any_lt(rsq10,rcutoff2))
2759 {
2760
2761 r10 = _mm_mul_ps(rsq10,rinv10);
2762 r10 = _mm_andnot_ps(dummy_mask,r10);
2763
2764 /* EWALD ELECTROSTATICS */
2765
2766 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2767 ewrt = _mm_mul_ps(r10,ewtabscale);
2768 ewitab = _mm_cvttps_epi32(ewrt);
2769 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2770 ewitab = _mm_slli_epi32(ewitab,2);
2771 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2772 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2773 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2774 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2775 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2776 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2777 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2778 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2779 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2780
2781 d = _mm_sub_ps(r10,rswitch);
2782 d = _mm_max_ps(d,_mm_setzero_ps());
2783 d2 = _mm_mul_ps(d,d);
2784 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2785
2786 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2787
2788 /* Evaluate switch function */
2789 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2790 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2791 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2792
2793 fscal = felec;
2794
2795 fscal = _mm_and_ps(fscal,cutoff_mask);
2796
2797 fscal = _mm_andnot_ps(dummy_mask,fscal);
2798
2799 /* Calculate temporary vectorial force */
2800 tx = _mm_mul_ps(fscal,dx10);
2801 ty = _mm_mul_ps(fscal,dy10);
2802 tz = _mm_mul_ps(fscal,dz10);
2803
2804 /* Update vectorial force */
2805 fix1 = _mm_add_ps(fix1,tx);
2806 fiy1 = _mm_add_ps(fiy1,ty);
2807 fiz1 = _mm_add_ps(fiz1,tz);
2808
2809 fjx0 = _mm_add_ps(fjx0,tx);
2810 fjy0 = _mm_add_ps(fjy0,ty);
2811 fjz0 = _mm_add_ps(fjz0,tz);
2812
2813 }
2814
2815 /**************************
2816 * CALCULATE INTERACTIONS *
2817 **************************/
2818
2819 if (gmx_mm_any_lt(rsq11,rcutoff2))
2820 {
2821
2822 r11 = _mm_mul_ps(rsq11,rinv11);
2823 r11 = _mm_andnot_ps(dummy_mask,r11);
2824
2825 /* EWALD ELECTROSTATICS */
2826
2827 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2828 ewrt = _mm_mul_ps(r11,ewtabscale);
2829 ewitab = _mm_cvttps_epi32(ewrt);
2830 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2831 ewitab = _mm_slli_epi32(ewitab,2);
2832 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2833 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2834 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2835 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2836 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2837 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2838 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2839 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2840 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2841
2842 d = _mm_sub_ps(r11,rswitch);
2843 d = _mm_max_ps(d,_mm_setzero_ps());
2844 d2 = _mm_mul_ps(d,d);
2845 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2846
2847 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2848
2849 /* Evaluate switch function */
2850 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2851 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2852 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2853
2854 fscal = felec;
2855
2856 fscal = _mm_and_ps(fscal,cutoff_mask);
2857
2858 fscal = _mm_andnot_ps(dummy_mask,fscal);
2859
2860 /* Calculate temporary vectorial force */
2861 tx = _mm_mul_ps(fscal,dx11);
2862 ty = _mm_mul_ps(fscal,dy11);
2863 tz = _mm_mul_ps(fscal,dz11);
2864
2865 /* Update vectorial force */
2866 fix1 = _mm_add_ps(fix1,tx);
2867 fiy1 = _mm_add_ps(fiy1,ty);
2868 fiz1 = _mm_add_ps(fiz1,tz);
2869
2870 fjx1 = _mm_add_ps(fjx1,tx);
2871 fjy1 = _mm_add_ps(fjy1,ty);
2872 fjz1 = _mm_add_ps(fjz1,tz);
2873
2874 }
2875
2876 /**************************
2877 * CALCULATE INTERACTIONS *
2878 **************************/
2879
2880 if (gmx_mm_any_lt(rsq12,rcutoff2))
2881 {
2882
2883 r12 = _mm_mul_ps(rsq12,rinv12);
2884 r12 = _mm_andnot_ps(dummy_mask,r12);
2885
2886 /* EWALD ELECTROSTATICS */
2887
2888 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2889 ewrt = _mm_mul_ps(r12,ewtabscale);
2890 ewitab = _mm_cvttps_epi32(ewrt);
2891 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2892 ewitab = _mm_slli_epi32(ewitab,2);
2893 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2894 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2895 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2896 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2897 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2898 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2899 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2900 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2901 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2902
2903 d = _mm_sub_ps(r12,rswitch);
2904 d = _mm_max_ps(d,_mm_setzero_ps());
2905 d2 = _mm_mul_ps(d,d);
2906 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2907
2908 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2909
2910 /* Evaluate switch function */
2911 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2912 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2913 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2914
2915 fscal = felec;
2916
2917 fscal = _mm_and_ps(fscal,cutoff_mask);
2918
2919 fscal = _mm_andnot_ps(dummy_mask,fscal);
2920
2921 /* Calculate temporary vectorial force */
2922 tx = _mm_mul_ps(fscal,dx12);
2923 ty = _mm_mul_ps(fscal,dy12);
2924 tz = _mm_mul_ps(fscal,dz12);
2925
2926 /* Update vectorial force */
2927 fix1 = _mm_add_ps(fix1,tx);
2928 fiy1 = _mm_add_ps(fiy1,ty);
2929 fiz1 = _mm_add_ps(fiz1,tz);
2930
2931 fjx2 = _mm_add_ps(fjx2,tx);
2932 fjy2 = _mm_add_ps(fjy2,ty);
2933 fjz2 = _mm_add_ps(fjz2,tz);
2934
2935 }
2936
2937 /**************************
2938 * CALCULATE INTERACTIONS *
2939 **************************/
2940
2941 if (gmx_mm_any_lt(rsq20,rcutoff2))
2942 {
2943
2944 r20 = _mm_mul_ps(rsq20,rinv20);
2945 r20 = _mm_andnot_ps(dummy_mask,r20);
2946
2947 /* EWALD ELECTROSTATICS */
2948
2949 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2950 ewrt = _mm_mul_ps(r20,ewtabscale);
2951 ewitab = _mm_cvttps_epi32(ewrt);
2952 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
2953 ewitab = _mm_slli_epi32(ewitab,2);
2954 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
2955 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
2956 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
2957 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
2958 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
2959 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2960 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2961 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2962 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2963
2964 d = _mm_sub_ps(r20,rswitch);
2965 d = _mm_max_ps(d,_mm_setzero_ps());
2966 d2 = _mm_mul_ps(d,d);
2967 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2968
2969 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2970
2971 /* Evaluate switch function */
2972 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2973 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2974 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2975
2976 fscal = felec;
2977
2978 fscal = _mm_and_ps(fscal,cutoff_mask);
2979
2980 fscal = _mm_andnot_ps(dummy_mask,fscal);
2981
2982 /* Calculate temporary vectorial force */
2983 tx = _mm_mul_ps(fscal,dx20);
2984 ty = _mm_mul_ps(fscal,dy20);
2985 tz = _mm_mul_ps(fscal,dz20);
2986
2987 /* Update vectorial force */
2988 fix2 = _mm_add_ps(fix2,tx);
2989 fiy2 = _mm_add_ps(fiy2,ty);
2990 fiz2 = _mm_add_ps(fiz2,tz);
2991
2992 fjx0 = _mm_add_ps(fjx0,tx);
2993 fjy0 = _mm_add_ps(fjy0,ty);
2994 fjz0 = _mm_add_ps(fjz0,tz);
2995
2996 }
2997
2998 /**************************
2999 * CALCULATE INTERACTIONS *
3000 **************************/
3001
3002 if (gmx_mm_any_lt(rsq21,rcutoff2))
3003 {
3004
3005 r21 = _mm_mul_ps(rsq21,rinv21);
3006 r21 = _mm_andnot_ps(dummy_mask,r21);
3007
3008 /* EWALD ELECTROSTATICS */
3009
3010 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3011 ewrt = _mm_mul_ps(r21,ewtabscale);
3012 ewitab = _mm_cvttps_epi32(ewrt);
3013 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
3014 ewitab = _mm_slli_epi32(ewitab,2);
3015 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
3016 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
3017 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
3018 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
3019 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
3020 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3021 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3022 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3023 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3024
3025 d = _mm_sub_ps(r21,rswitch);
3026 d = _mm_max_ps(d,_mm_setzero_ps());
3027 d2 = _mm_mul_ps(d,d);
3028 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3029
3030 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3031
3032 /* Evaluate switch function */
3033 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3034 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3035 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3036
3037 fscal = felec;
3038
3039 fscal = _mm_and_ps(fscal,cutoff_mask);
3040
3041 fscal = _mm_andnot_ps(dummy_mask,fscal);
3042
3043 /* Calculate temporary vectorial force */
3044 tx = _mm_mul_ps(fscal,dx21);
3045 ty = _mm_mul_ps(fscal,dy21);
3046 tz = _mm_mul_ps(fscal,dz21);
3047
3048 /* Update vectorial force */
3049 fix2 = _mm_add_ps(fix2,tx);
3050 fiy2 = _mm_add_ps(fiy2,ty);
3051 fiz2 = _mm_add_ps(fiz2,tz);
3052
3053 fjx1 = _mm_add_ps(fjx1,tx);
3054 fjy1 = _mm_add_ps(fjy1,ty);
3055 fjz1 = _mm_add_ps(fjz1,tz);
3056
3057 }
3058
3059 /**************************
3060 * CALCULATE INTERACTIONS *
3061 **************************/
3062
3063 if (gmx_mm_any_lt(rsq22,rcutoff2))
3064 {
3065
3066 r22 = _mm_mul_ps(rsq22,rinv22);
3067 r22 = _mm_andnot_ps(dummy_mask,r22);
3068
3069 /* EWALD ELECTROSTATICS */
3070
3071 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3072 ewrt = _mm_mul_ps(r22,ewtabscale);
3073 ewitab = _mm_cvttps_epi32(ewrt);
3074 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps
((__v4sf)__X, ((0x00 | 0x01))); })
);
3075 ewitab = _mm_slli_epi32(ewitab,2);
3076 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) &
3];}))
);
3077 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) &
3];}))
);
3078 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) &
3];}))
);
3079 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) &
3];}))
);
3080 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF
), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1
= _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps
((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); (
ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps
(tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while (
0)
;
3081 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3082 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3083 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3084 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3085
3086 d = _mm_sub_ps(r22,rswitch);
3087 d = _mm_max_ps(d,_mm_setzero_ps());
3088 d2 = _mm_mul_ps(d,d);
3089 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3090
3091 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3092
3093 /* Evaluate switch function */
3094 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3095 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3096 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3097
3098 fscal = felec;
3099
3100 fscal = _mm_and_ps(fscal,cutoff_mask);
3101
3102 fscal = _mm_andnot_ps(dummy_mask,fscal);
3103
3104 /* Calculate temporary vectorial force */
3105 tx = _mm_mul_ps(fscal,dx22);
3106 ty = _mm_mul_ps(fscal,dy22);
3107 tz = _mm_mul_ps(fscal,dz22);
3108
3109 /* Update vectorial force */
3110 fix2 = _mm_add_ps(fix2,tx);
3111 fiy2 = _mm_add_ps(fiy2,ty);
3112 fiz2 = _mm_add_ps(fiz2,tz);
3113
3114 fjx2 = _mm_add_ps(fjx2,tx);
3115 fjy2 = _mm_add_ps(fjy2,ty);
3116 fjz2 = _mm_add_ps(fjz2,tz);
3117
3118 }
3119
3120 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3121 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3122 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3123 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3124
3125 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3126 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3127
3128 /* Inner loop uses 582 flops */
3129 }
3130
3131 /* End of innermost loop */
3132
3133 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3134 f+i_coord_offset,fshift+i_shift_offset);
3135
3136 /* Increment number of inner iterations */
3137 inneriter += j_index_end - j_index_start;
3138
3139 /* Outer loop uses 18 flops */
3140 }
3141
3142 /* Increment number of outer iterations */
3143 outeriter += nri;
3144
3145 /* Update outer/inner flops */
3146
3147 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_F] += outeriter*18 + inneriter
*582
;
3148}