2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
99 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
101 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
103 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
104 real rswitch_scalar,d_scalar;
105 __m128 dummy_mask,cutoff_mask;
106 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
107 __m128 one = _mm_set1_ps(1.0);
108 __m128 two = _mm_set1_ps(2.0);
114 jindex = nlist->jindex;
116 shiftidx = nlist->shift;
118 shiftvec = fr->shift_vec[0];
119 fshift = fr->fshift[0];
120 facel = _mm_set1_ps(fr->epsfac);
121 charge = mdatoms->chargeA;
122 nvdwtype = fr->ntype;
124 vdwtype = mdatoms->typeA;
126 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
127 ewtab = fr->ic->tabq_coul_FDV0;
128 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
129 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
136 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
138 jq1 = _mm_set1_ps(charge[inr+1]);
139 jq2 = _mm_set1_ps(charge[inr+2]);
140 jq3 = _mm_set1_ps(charge[inr+3]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
143 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
144 qq11 = _mm_mul_ps(iq1,jq1);
145 qq12 = _mm_mul_ps(iq1,jq2);
146 qq13 = _mm_mul_ps(iq1,jq3);
147 qq21 = _mm_mul_ps(iq2,jq1);
148 qq22 = _mm_mul_ps(iq2,jq2);
149 qq23 = _mm_mul_ps(iq2,jq3);
150 qq31 = _mm_mul_ps(iq3,jq1);
151 qq32 = _mm_mul_ps(iq3,jq2);
152 qq33 = _mm_mul_ps(iq3,jq3);
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->rcoulomb;
156 rcutoff = _mm_set1_ps(rcutoff_scalar);
157 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
159 rswitch_scalar = fr->rcoulomb_switch;
160 rswitch = _mm_set1_ps(rswitch_scalar);
161 /* Setup switch parameters */
162 d_scalar = rcutoff_scalar-rswitch_scalar;
163 d = _mm_set1_ps(d_scalar);
164 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
165 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
166 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
167 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
168 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
171 /* Avoid stupid compiler warnings */
172 jnrA = jnrB = jnrC = jnrD = 0;
181 /* Start outer loop over neighborlists */
182 for(iidx=0; iidx<nri; iidx++)
184 /* Load shift vector for this list */
185 i_shift_offset = DIM*shiftidx[iidx];
186 shX = shiftvec[i_shift_offset+XX];
187 shY = shiftvec[i_shift_offset+YY];
188 shZ = shiftvec[i_shift_offset+ZZ];
190 /* Load limits for loop over neighbors */
191 j_index_start = jindex[iidx];
192 j_index_end = jindex[iidx+1];
194 /* Get outer coordinate index */
196 i_coord_offset = DIM*inr;
198 /* Load i particle coords and add shift vector */
199 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
200 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
201 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
202 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
203 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
204 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
205 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
206 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
207 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
208 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
209 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
210 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
212 fix0 = _mm_setzero_ps();
213 fiy0 = _mm_setzero_ps();
214 fiz0 = _mm_setzero_ps();
215 fix1 = _mm_setzero_ps();
216 fiy1 = _mm_setzero_ps();
217 fiz1 = _mm_setzero_ps();
218 fix2 = _mm_setzero_ps();
219 fiy2 = _mm_setzero_ps();
220 fiz2 = _mm_setzero_ps();
221 fix3 = _mm_setzero_ps();
222 fiy3 = _mm_setzero_ps();
223 fiz3 = _mm_setzero_ps();
225 /* Reset potential sums */
226 velecsum = _mm_setzero_ps();
227 vvdwsum = _mm_setzero_ps();
229 /* Start inner kernel loop */
230 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
233 /* Get j neighbor index, and coordinate index */
239 j_coord_offsetA = DIM*jnrA;
240 j_coord_offsetB = DIM*jnrB;
241 j_coord_offsetC = DIM*jnrC;
242 j_coord_offsetD = DIM*jnrD;
244 /* load j atom coordinates */
245 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
246 x+j_coord_offsetC,x+j_coord_offsetD,
247 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
248 &jy2,&jz2,&jx3,&jy3,&jz3);
250 /* Calculate displacement vector */
251 dx00 = _mm_sub_ps(ix0,jx0);
252 dy00 = _mm_sub_ps(iy0,jy0);
253 dz00 = _mm_sub_ps(iz0,jz0);
254 dx11 = _mm_sub_ps(ix1,jx1);
255 dy11 = _mm_sub_ps(iy1,jy1);
256 dz11 = _mm_sub_ps(iz1,jz1);
257 dx12 = _mm_sub_ps(ix1,jx2);
258 dy12 = _mm_sub_ps(iy1,jy2);
259 dz12 = _mm_sub_ps(iz1,jz2);
260 dx13 = _mm_sub_ps(ix1,jx3);
261 dy13 = _mm_sub_ps(iy1,jy3);
262 dz13 = _mm_sub_ps(iz1,jz3);
263 dx21 = _mm_sub_ps(ix2,jx1);
264 dy21 = _mm_sub_ps(iy2,jy1);
265 dz21 = _mm_sub_ps(iz2,jz1);
266 dx22 = _mm_sub_ps(ix2,jx2);
267 dy22 = _mm_sub_ps(iy2,jy2);
268 dz22 = _mm_sub_ps(iz2,jz2);
269 dx23 = _mm_sub_ps(ix2,jx3);
270 dy23 = _mm_sub_ps(iy2,jy3);
271 dz23 = _mm_sub_ps(iz2,jz3);
272 dx31 = _mm_sub_ps(ix3,jx1);
273 dy31 = _mm_sub_ps(iy3,jy1);
274 dz31 = _mm_sub_ps(iz3,jz1);
275 dx32 = _mm_sub_ps(ix3,jx2);
276 dy32 = _mm_sub_ps(iy3,jy2);
277 dz32 = _mm_sub_ps(iz3,jz2);
278 dx33 = _mm_sub_ps(ix3,jx3);
279 dy33 = _mm_sub_ps(iy3,jy3);
280 dz33 = _mm_sub_ps(iz3,jz3);
282 /* Calculate squared distance and things based on it */
283 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
284 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
285 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
286 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
287 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
288 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
289 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
290 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
291 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
292 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
294 rinv00 = gmx_mm_invsqrt_ps(rsq00);
295 rinv11 = gmx_mm_invsqrt_ps(rsq11);
296 rinv12 = gmx_mm_invsqrt_ps(rsq12);
297 rinv13 = gmx_mm_invsqrt_ps(rsq13);
298 rinv21 = gmx_mm_invsqrt_ps(rsq21);
299 rinv22 = gmx_mm_invsqrt_ps(rsq22);
300 rinv23 = gmx_mm_invsqrt_ps(rsq23);
301 rinv31 = gmx_mm_invsqrt_ps(rsq31);
302 rinv32 = gmx_mm_invsqrt_ps(rsq32);
303 rinv33 = gmx_mm_invsqrt_ps(rsq33);
305 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
306 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
307 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
308 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
309 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
310 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
311 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
312 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
313 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
314 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
316 fjx0 = _mm_setzero_ps();
317 fjy0 = _mm_setzero_ps();
318 fjz0 = _mm_setzero_ps();
319 fjx1 = _mm_setzero_ps();
320 fjy1 = _mm_setzero_ps();
321 fjz1 = _mm_setzero_ps();
322 fjx2 = _mm_setzero_ps();
323 fjy2 = _mm_setzero_ps();
324 fjz2 = _mm_setzero_ps();
325 fjx3 = _mm_setzero_ps();
326 fjy3 = _mm_setzero_ps();
327 fjz3 = _mm_setzero_ps();
329 /**************************
330 * CALCULATE INTERACTIONS *
331 **************************/
333 if (gmx_mm_any_lt(rsq00,rcutoff2))
336 r00 = _mm_mul_ps(rsq00,rinv00);
338 /* LENNARD-JONES DISPERSION/REPULSION */
340 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
341 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
342 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
343 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
344 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
346 d = _mm_sub_ps(r00,rswitch);
347 d = _mm_max_ps(d,_mm_setzero_ps());
348 d2 = _mm_mul_ps(d,d);
349 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
351 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
353 /* Evaluate switch function */
354 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
355 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
356 vvdw = _mm_mul_ps(vvdw,sw);
357 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
359 /* Update potential sum for this i atom from the interaction with this j atom. */
360 vvdw = _mm_and_ps(vvdw,cutoff_mask);
361 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
365 fscal = _mm_and_ps(fscal,cutoff_mask);
367 /* Calculate temporary vectorial force */
368 tx = _mm_mul_ps(fscal,dx00);
369 ty = _mm_mul_ps(fscal,dy00);
370 tz = _mm_mul_ps(fscal,dz00);
372 /* Update vectorial force */
373 fix0 = _mm_add_ps(fix0,tx);
374 fiy0 = _mm_add_ps(fiy0,ty);
375 fiz0 = _mm_add_ps(fiz0,tz);
377 fjx0 = _mm_add_ps(fjx0,tx);
378 fjy0 = _mm_add_ps(fjy0,ty);
379 fjz0 = _mm_add_ps(fjz0,tz);
383 /**************************
384 * CALCULATE INTERACTIONS *
385 **************************/
387 if (gmx_mm_any_lt(rsq11,rcutoff2))
390 r11 = _mm_mul_ps(rsq11,rinv11);
392 /* EWALD ELECTROSTATICS */
394 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
395 ewrt = _mm_mul_ps(r11,ewtabscale);
396 ewitab = _mm_cvttps_epi32(ewrt);
397 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
398 ewitab = _mm_slli_epi32(ewitab,2);
399 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
400 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
401 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
402 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
403 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
404 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
405 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
406 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
407 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
409 d = _mm_sub_ps(r11,rswitch);
410 d = _mm_max_ps(d,_mm_setzero_ps());
411 d2 = _mm_mul_ps(d,d);
412 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
414 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
416 /* Evaluate switch function */
417 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
418 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
419 velec = _mm_mul_ps(velec,sw);
420 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velec = _mm_and_ps(velec,cutoff_mask);
424 velecsum = _mm_add_ps(velecsum,velec);
428 fscal = _mm_and_ps(fscal,cutoff_mask);
430 /* Calculate temporary vectorial force */
431 tx = _mm_mul_ps(fscal,dx11);
432 ty = _mm_mul_ps(fscal,dy11);
433 tz = _mm_mul_ps(fscal,dz11);
435 /* Update vectorial force */
436 fix1 = _mm_add_ps(fix1,tx);
437 fiy1 = _mm_add_ps(fiy1,ty);
438 fiz1 = _mm_add_ps(fiz1,tz);
440 fjx1 = _mm_add_ps(fjx1,tx);
441 fjy1 = _mm_add_ps(fjy1,ty);
442 fjz1 = _mm_add_ps(fjz1,tz);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 if (gmx_mm_any_lt(rsq12,rcutoff2))
453 r12 = _mm_mul_ps(rsq12,rinv12);
455 /* EWALD ELECTROSTATICS */
457 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
458 ewrt = _mm_mul_ps(r12,ewtabscale);
459 ewitab = _mm_cvttps_epi32(ewrt);
460 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
461 ewitab = _mm_slli_epi32(ewitab,2);
462 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
463 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
464 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
465 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
466 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
467 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
468 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
469 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
470 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
472 d = _mm_sub_ps(r12,rswitch);
473 d = _mm_max_ps(d,_mm_setzero_ps());
474 d2 = _mm_mul_ps(d,d);
475 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
477 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
479 /* Evaluate switch function */
480 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
481 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
482 velec = _mm_mul_ps(velec,sw);
483 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velec = _mm_and_ps(velec,cutoff_mask);
487 velecsum = _mm_add_ps(velecsum,velec);
491 fscal = _mm_and_ps(fscal,cutoff_mask);
493 /* Calculate temporary vectorial force */
494 tx = _mm_mul_ps(fscal,dx12);
495 ty = _mm_mul_ps(fscal,dy12);
496 tz = _mm_mul_ps(fscal,dz12);
498 /* Update vectorial force */
499 fix1 = _mm_add_ps(fix1,tx);
500 fiy1 = _mm_add_ps(fiy1,ty);
501 fiz1 = _mm_add_ps(fiz1,tz);
503 fjx2 = _mm_add_ps(fjx2,tx);
504 fjy2 = _mm_add_ps(fjy2,ty);
505 fjz2 = _mm_add_ps(fjz2,tz);
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 if (gmx_mm_any_lt(rsq13,rcutoff2))
516 r13 = _mm_mul_ps(rsq13,rinv13);
518 /* EWALD ELECTROSTATICS */
520 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521 ewrt = _mm_mul_ps(r13,ewtabscale);
522 ewitab = _mm_cvttps_epi32(ewrt);
523 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
524 ewitab = _mm_slli_epi32(ewitab,2);
525 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
526 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
527 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
528 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
529 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
530 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
531 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
532 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
533 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
535 d = _mm_sub_ps(r13,rswitch);
536 d = _mm_max_ps(d,_mm_setzero_ps());
537 d2 = _mm_mul_ps(d,d);
538 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
540 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
542 /* Evaluate switch function */
543 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
544 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
545 velec = _mm_mul_ps(velec,sw);
546 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velec = _mm_and_ps(velec,cutoff_mask);
550 velecsum = _mm_add_ps(velecsum,velec);
554 fscal = _mm_and_ps(fscal,cutoff_mask);
556 /* Calculate temporary vectorial force */
557 tx = _mm_mul_ps(fscal,dx13);
558 ty = _mm_mul_ps(fscal,dy13);
559 tz = _mm_mul_ps(fscal,dz13);
561 /* Update vectorial force */
562 fix1 = _mm_add_ps(fix1,tx);
563 fiy1 = _mm_add_ps(fiy1,ty);
564 fiz1 = _mm_add_ps(fiz1,tz);
566 fjx3 = _mm_add_ps(fjx3,tx);
567 fjy3 = _mm_add_ps(fjy3,ty);
568 fjz3 = _mm_add_ps(fjz3,tz);
572 /**************************
573 * CALCULATE INTERACTIONS *
574 **************************/
576 if (gmx_mm_any_lt(rsq21,rcutoff2))
579 r21 = _mm_mul_ps(rsq21,rinv21);
581 /* EWALD ELECTROSTATICS */
583 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
584 ewrt = _mm_mul_ps(r21,ewtabscale);
585 ewitab = _mm_cvttps_epi32(ewrt);
586 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
587 ewitab = _mm_slli_epi32(ewitab,2);
588 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
589 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
590 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
591 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
592 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
593 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
594 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
595 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
596 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
598 d = _mm_sub_ps(r21,rswitch);
599 d = _mm_max_ps(d,_mm_setzero_ps());
600 d2 = _mm_mul_ps(d,d);
601 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
603 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
605 /* Evaluate switch function */
606 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
607 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
608 velec = _mm_mul_ps(velec,sw);
609 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velec = _mm_and_ps(velec,cutoff_mask);
613 velecsum = _mm_add_ps(velecsum,velec);
617 fscal = _mm_and_ps(fscal,cutoff_mask);
619 /* Calculate temporary vectorial force */
620 tx = _mm_mul_ps(fscal,dx21);
621 ty = _mm_mul_ps(fscal,dy21);
622 tz = _mm_mul_ps(fscal,dz21);
624 /* Update vectorial force */
625 fix2 = _mm_add_ps(fix2,tx);
626 fiy2 = _mm_add_ps(fiy2,ty);
627 fiz2 = _mm_add_ps(fiz2,tz);
629 fjx1 = _mm_add_ps(fjx1,tx);
630 fjy1 = _mm_add_ps(fjy1,ty);
631 fjz1 = _mm_add_ps(fjz1,tz);
635 /**************************
636 * CALCULATE INTERACTIONS *
637 **************************/
639 if (gmx_mm_any_lt(rsq22,rcutoff2))
642 r22 = _mm_mul_ps(rsq22,rinv22);
644 /* EWALD ELECTROSTATICS */
646 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
647 ewrt = _mm_mul_ps(r22,ewtabscale);
648 ewitab = _mm_cvttps_epi32(ewrt);
649 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
650 ewitab = _mm_slli_epi32(ewitab,2);
651 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
652 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
653 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
654 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
655 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
656 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
657 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
658 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
659 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
661 d = _mm_sub_ps(r22,rswitch);
662 d = _mm_max_ps(d,_mm_setzero_ps());
663 d2 = _mm_mul_ps(d,d);
664 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
666 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
668 /* Evaluate switch function */
669 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
670 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
671 velec = _mm_mul_ps(velec,sw);
672 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
674 /* Update potential sum for this i atom from the interaction with this j atom. */
675 velec = _mm_and_ps(velec,cutoff_mask);
676 velecsum = _mm_add_ps(velecsum,velec);
680 fscal = _mm_and_ps(fscal,cutoff_mask);
682 /* Calculate temporary vectorial force */
683 tx = _mm_mul_ps(fscal,dx22);
684 ty = _mm_mul_ps(fscal,dy22);
685 tz = _mm_mul_ps(fscal,dz22);
687 /* Update vectorial force */
688 fix2 = _mm_add_ps(fix2,tx);
689 fiy2 = _mm_add_ps(fiy2,ty);
690 fiz2 = _mm_add_ps(fiz2,tz);
692 fjx2 = _mm_add_ps(fjx2,tx);
693 fjy2 = _mm_add_ps(fjy2,ty);
694 fjz2 = _mm_add_ps(fjz2,tz);
698 /**************************
699 * CALCULATE INTERACTIONS *
700 **************************/
702 if (gmx_mm_any_lt(rsq23,rcutoff2))
705 r23 = _mm_mul_ps(rsq23,rinv23);
707 /* EWALD ELECTROSTATICS */
709 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
710 ewrt = _mm_mul_ps(r23,ewtabscale);
711 ewitab = _mm_cvttps_epi32(ewrt);
712 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
713 ewitab = _mm_slli_epi32(ewitab,2);
714 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
715 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
716 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
717 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
718 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
719 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
720 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
721 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
722 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
724 d = _mm_sub_ps(r23,rswitch);
725 d = _mm_max_ps(d,_mm_setzero_ps());
726 d2 = _mm_mul_ps(d,d);
727 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
729 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
731 /* Evaluate switch function */
732 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
733 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
734 velec = _mm_mul_ps(velec,sw);
735 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
737 /* Update potential sum for this i atom from the interaction with this j atom. */
738 velec = _mm_and_ps(velec,cutoff_mask);
739 velecsum = _mm_add_ps(velecsum,velec);
743 fscal = _mm_and_ps(fscal,cutoff_mask);
745 /* Calculate temporary vectorial force */
746 tx = _mm_mul_ps(fscal,dx23);
747 ty = _mm_mul_ps(fscal,dy23);
748 tz = _mm_mul_ps(fscal,dz23);
750 /* Update vectorial force */
751 fix2 = _mm_add_ps(fix2,tx);
752 fiy2 = _mm_add_ps(fiy2,ty);
753 fiz2 = _mm_add_ps(fiz2,tz);
755 fjx3 = _mm_add_ps(fjx3,tx);
756 fjy3 = _mm_add_ps(fjy3,ty);
757 fjz3 = _mm_add_ps(fjz3,tz);
761 /**************************
762 * CALCULATE INTERACTIONS *
763 **************************/
765 if (gmx_mm_any_lt(rsq31,rcutoff2))
768 r31 = _mm_mul_ps(rsq31,rinv31);
770 /* EWALD ELECTROSTATICS */
772 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
773 ewrt = _mm_mul_ps(r31,ewtabscale);
774 ewitab = _mm_cvttps_epi32(ewrt);
775 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
776 ewitab = _mm_slli_epi32(ewitab,2);
777 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
778 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
779 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
780 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
781 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
782 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
783 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
784 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
785 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
787 d = _mm_sub_ps(r31,rswitch);
788 d = _mm_max_ps(d,_mm_setzero_ps());
789 d2 = _mm_mul_ps(d,d);
790 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
792 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
794 /* Evaluate switch function */
795 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
796 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
797 velec = _mm_mul_ps(velec,sw);
798 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
800 /* Update potential sum for this i atom from the interaction with this j atom. */
801 velec = _mm_and_ps(velec,cutoff_mask);
802 velecsum = _mm_add_ps(velecsum,velec);
806 fscal = _mm_and_ps(fscal,cutoff_mask);
808 /* Calculate temporary vectorial force */
809 tx = _mm_mul_ps(fscal,dx31);
810 ty = _mm_mul_ps(fscal,dy31);
811 tz = _mm_mul_ps(fscal,dz31);
813 /* Update vectorial force */
814 fix3 = _mm_add_ps(fix3,tx);
815 fiy3 = _mm_add_ps(fiy3,ty);
816 fiz3 = _mm_add_ps(fiz3,tz);
818 fjx1 = _mm_add_ps(fjx1,tx);
819 fjy1 = _mm_add_ps(fjy1,ty);
820 fjz1 = _mm_add_ps(fjz1,tz);
824 /**************************
825 * CALCULATE INTERACTIONS *
826 **************************/
828 if (gmx_mm_any_lt(rsq32,rcutoff2))
831 r32 = _mm_mul_ps(rsq32,rinv32);
833 /* EWALD ELECTROSTATICS */
835 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
836 ewrt = _mm_mul_ps(r32,ewtabscale);
837 ewitab = _mm_cvttps_epi32(ewrt);
838 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
839 ewitab = _mm_slli_epi32(ewitab,2);
840 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
841 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
842 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
843 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
844 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
845 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
846 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
847 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
848 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
850 d = _mm_sub_ps(r32,rswitch);
851 d = _mm_max_ps(d,_mm_setzero_ps());
852 d2 = _mm_mul_ps(d,d);
853 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
855 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
857 /* Evaluate switch function */
858 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
859 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
860 velec = _mm_mul_ps(velec,sw);
861 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
863 /* Update potential sum for this i atom from the interaction with this j atom. */
864 velec = _mm_and_ps(velec,cutoff_mask);
865 velecsum = _mm_add_ps(velecsum,velec);
869 fscal = _mm_and_ps(fscal,cutoff_mask);
871 /* Calculate temporary vectorial force */
872 tx = _mm_mul_ps(fscal,dx32);
873 ty = _mm_mul_ps(fscal,dy32);
874 tz = _mm_mul_ps(fscal,dz32);
876 /* Update vectorial force */
877 fix3 = _mm_add_ps(fix3,tx);
878 fiy3 = _mm_add_ps(fiy3,ty);
879 fiz3 = _mm_add_ps(fiz3,tz);
881 fjx2 = _mm_add_ps(fjx2,tx);
882 fjy2 = _mm_add_ps(fjy2,ty);
883 fjz2 = _mm_add_ps(fjz2,tz);
887 /**************************
888 * CALCULATE INTERACTIONS *
889 **************************/
891 if (gmx_mm_any_lt(rsq33,rcutoff2))
894 r33 = _mm_mul_ps(rsq33,rinv33);
896 /* EWALD ELECTROSTATICS */
898 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
899 ewrt = _mm_mul_ps(r33,ewtabscale);
900 ewitab = _mm_cvttps_epi32(ewrt);
901 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
902 ewitab = _mm_slli_epi32(ewitab,2);
903 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
904 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
905 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
906 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
907 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
908 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
909 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
910 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
911 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
913 d = _mm_sub_ps(r33,rswitch);
914 d = _mm_max_ps(d,_mm_setzero_ps());
915 d2 = _mm_mul_ps(d,d);
916 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
918 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
920 /* Evaluate switch function */
921 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
922 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
923 velec = _mm_mul_ps(velec,sw);
924 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
926 /* Update potential sum for this i atom from the interaction with this j atom. */
927 velec = _mm_and_ps(velec,cutoff_mask);
928 velecsum = _mm_add_ps(velecsum,velec);
932 fscal = _mm_and_ps(fscal,cutoff_mask);
934 /* Calculate temporary vectorial force */
935 tx = _mm_mul_ps(fscal,dx33);
936 ty = _mm_mul_ps(fscal,dy33);
937 tz = _mm_mul_ps(fscal,dz33);
939 /* Update vectorial force */
940 fix3 = _mm_add_ps(fix3,tx);
941 fiy3 = _mm_add_ps(fiy3,ty);
942 fiz3 = _mm_add_ps(fiz3,tz);
944 fjx3 = _mm_add_ps(fjx3,tx);
945 fjy3 = _mm_add_ps(fjy3,ty);
946 fjz3 = _mm_add_ps(fjz3,tz);
950 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
951 f+j_coord_offsetC,f+j_coord_offsetD,
952 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
953 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
955 /* Inner loop uses 647 flops */
961 /* Get j neighbor index, and coordinate index */
967 /* Sign of each element will be negative for non-real atoms.
968 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
969 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
971 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
972 jnrA = (jnrA>=0) ? jnrA : 0;
973 jnrB = (jnrB>=0) ? jnrB : 0;
974 jnrC = (jnrC>=0) ? jnrC : 0;
975 jnrD = (jnrD>=0) ? jnrD : 0;
977 j_coord_offsetA = DIM*jnrA;
978 j_coord_offsetB = DIM*jnrB;
979 j_coord_offsetC = DIM*jnrC;
980 j_coord_offsetD = DIM*jnrD;
982 /* load j atom coordinates */
983 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
984 x+j_coord_offsetC,x+j_coord_offsetD,
985 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
986 &jy2,&jz2,&jx3,&jy3,&jz3);
988 /* Calculate displacement vector */
989 dx00 = _mm_sub_ps(ix0,jx0);
990 dy00 = _mm_sub_ps(iy0,jy0);
991 dz00 = _mm_sub_ps(iz0,jz0);
992 dx11 = _mm_sub_ps(ix1,jx1);
993 dy11 = _mm_sub_ps(iy1,jy1);
994 dz11 = _mm_sub_ps(iz1,jz1);
995 dx12 = _mm_sub_ps(ix1,jx2);
996 dy12 = _mm_sub_ps(iy1,jy2);
997 dz12 = _mm_sub_ps(iz1,jz2);
998 dx13 = _mm_sub_ps(ix1,jx3);
999 dy13 = _mm_sub_ps(iy1,jy3);
1000 dz13 = _mm_sub_ps(iz1,jz3);
1001 dx21 = _mm_sub_ps(ix2,jx1);
1002 dy21 = _mm_sub_ps(iy2,jy1);
1003 dz21 = _mm_sub_ps(iz2,jz1);
1004 dx22 = _mm_sub_ps(ix2,jx2);
1005 dy22 = _mm_sub_ps(iy2,jy2);
1006 dz22 = _mm_sub_ps(iz2,jz2);
1007 dx23 = _mm_sub_ps(ix2,jx3);
1008 dy23 = _mm_sub_ps(iy2,jy3);
1009 dz23 = _mm_sub_ps(iz2,jz3);
1010 dx31 = _mm_sub_ps(ix3,jx1);
1011 dy31 = _mm_sub_ps(iy3,jy1);
1012 dz31 = _mm_sub_ps(iz3,jz1);
1013 dx32 = _mm_sub_ps(ix3,jx2);
1014 dy32 = _mm_sub_ps(iy3,jy2);
1015 dz32 = _mm_sub_ps(iz3,jz2);
1016 dx33 = _mm_sub_ps(ix3,jx3);
1017 dy33 = _mm_sub_ps(iy3,jy3);
1018 dz33 = _mm_sub_ps(iz3,jz3);
1020 /* Calculate squared distance and things based on it */
1021 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1022 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1023 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1024 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1025 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1026 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1027 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1028 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1029 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1030 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1032 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1033 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1034 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1035 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1036 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1037 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1038 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1039 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1040 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1041 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1043 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1044 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1045 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1046 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1047 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1048 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1049 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1050 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1051 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1052 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1054 fjx0 = _mm_setzero_ps();
1055 fjy0 = _mm_setzero_ps();
1056 fjz0 = _mm_setzero_ps();
1057 fjx1 = _mm_setzero_ps();
1058 fjy1 = _mm_setzero_ps();
1059 fjz1 = _mm_setzero_ps();
1060 fjx2 = _mm_setzero_ps();
1061 fjy2 = _mm_setzero_ps();
1062 fjz2 = _mm_setzero_ps();
1063 fjx3 = _mm_setzero_ps();
1064 fjy3 = _mm_setzero_ps();
1065 fjz3 = _mm_setzero_ps();
1067 /**************************
1068 * CALCULATE INTERACTIONS *
1069 **************************/
1071 if (gmx_mm_any_lt(rsq00,rcutoff2))
1074 r00 = _mm_mul_ps(rsq00,rinv00);
1075 r00 = _mm_andnot_ps(dummy_mask,r00);
1077 /* LENNARD-JONES DISPERSION/REPULSION */
1079 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1080 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1081 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1082 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1083 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1085 d = _mm_sub_ps(r00,rswitch);
1086 d = _mm_max_ps(d,_mm_setzero_ps());
1087 d2 = _mm_mul_ps(d,d);
1088 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1090 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1092 /* Evaluate switch function */
1093 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1094 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1095 vvdw = _mm_mul_ps(vvdw,sw);
1096 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1098 /* Update potential sum for this i atom from the interaction with this j atom. */
1099 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1100 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1101 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1105 fscal = _mm_and_ps(fscal,cutoff_mask);
1107 fscal = _mm_andnot_ps(dummy_mask,fscal);
1109 /* Calculate temporary vectorial force */
1110 tx = _mm_mul_ps(fscal,dx00);
1111 ty = _mm_mul_ps(fscal,dy00);
1112 tz = _mm_mul_ps(fscal,dz00);
1114 /* Update vectorial force */
1115 fix0 = _mm_add_ps(fix0,tx);
1116 fiy0 = _mm_add_ps(fiy0,ty);
1117 fiz0 = _mm_add_ps(fiz0,tz);
1119 fjx0 = _mm_add_ps(fjx0,tx);
1120 fjy0 = _mm_add_ps(fjy0,ty);
1121 fjz0 = _mm_add_ps(fjz0,tz);
1125 /**************************
1126 * CALCULATE INTERACTIONS *
1127 **************************/
1129 if (gmx_mm_any_lt(rsq11,rcutoff2))
1132 r11 = _mm_mul_ps(rsq11,rinv11);
1133 r11 = _mm_andnot_ps(dummy_mask,r11);
1135 /* EWALD ELECTROSTATICS */
1137 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1138 ewrt = _mm_mul_ps(r11,ewtabscale);
1139 ewitab = _mm_cvttps_epi32(ewrt);
1140 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1141 ewitab = _mm_slli_epi32(ewitab,2);
1142 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1143 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1144 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1145 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1146 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1147 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1148 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1149 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1150 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1152 d = _mm_sub_ps(r11,rswitch);
1153 d = _mm_max_ps(d,_mm_setzero_ps());
1154 d2 = _mm_mul_ps(d,d);
1155 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1157 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1159 /* Evaluate switch function */
1160 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1161 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1162 velec = _mm_mul_ps(velec,sw);
1163 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1165 /* Update potential sum for this i atom from the interaction with this j atom. */
1166 velec = _mm_and_ps(velec,cutoff_mask);
1167 velec = _mm_andnot_ps(dummy_mask,velec);
1168 velecsum = _mm_add_ps(velecsum,velec);
1172 fscal = _mm_and_ps(fscal,cutoff_mask);
1174 fscal = _mm_andnot_ps(dummy_mask,fscal);
1176 /* Calculate temporary vectorial force */
1177 tx = _mm_mul_ps(fscal,dx11);
1178 ty = _mm_mul_ps(fscal,dy11);
1179 tz = _mm_mul_ps(fscal,dz11);
1181 /* Update vectorial force */
1182 fix1 = _mm_add_ps(fix1,tx);
1183 fiy1 = _mm_add_ps(fiy1,ty);
1184 fiz1 = _mm_add_ps(fiz1,tz);
1186 fjx1 = _mm_add_ps(fjx1,tx);
1187 fjy1 = _mm_add_ps(fjy1,ty);
1188 fjz1 = _mm_add_ps(fjz1,tz);
1192 /**************************
1193 * CALCULATE INTERACTIONS *
1194 **************************/
1196 if (gmx_mm_any_lt(rsq12,rcutoff2))
1199 r12 = _mm_mul_ps(rsq12,rinv12);
1200 r12 = _mm_andnot_ps(dummy_mask,r12);
1202 /* EWALD ELECTROSTATICS */
1204 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1205 ewrt = _mm_mul_ps(r12,ewtabscale);
1206 ewitab = _mm_cvttps_epi32(ewrt);
1207 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1208 ewitab = _mm_slli_epi32(ewitab,2);
1209 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1210 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1211 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1212 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1213 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1214 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1215 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1216 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1217 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1219 d = _mm_sub_ps(r12,rswitch);
1220 d = _mm_max_ps(d,_mm_setzero_ps());
1221 d2 = _mm_mul_ps(d,d);
1222 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1224 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1226 /* Evaluate switch function */
1227 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1228 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1229 velec = _mm_mul_ps(velec,sw);
1230 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1232 /* Update potential sum for this i atom from the interaction with this j atom. */
1233 velec = _mm_and_ps(velec,cutoff_mask);
1234 velec = _mm_andnot_ps(dummy_mask,velec);
1235 velecsum = _mm_add_ps(velecsum,velec);
1239 fscal = _mm_and_ps(fscal,cutoff_mask);
1241 fscal = _mm_andnot_ps(dummy_mask,fscal);
1243 /* Calculate temporary vectorial force */
1244 tx = _mm_mul_ps(fscal,dx12);
1245 ty = _mm_mul_ps(fscal,dy12);
1246 tz = _mm_mul_ps(fscal,dz12);
1248 /* Update vectorial force */
1249 fix1 = _mm_add_ps(fix1,tx);
1250 fiy1 = _mm_add_ps(fiy1,ty);
1251 fiz1 = _mm_add_ps(fiz1,tz);
1253 fjx2 = _mm_add_ps(fjx2,tx);
1254 fjy2 = _mm_add_ps(fjy2,ty);
1255 fjz2 = _mm_add_ps(fjz2,tz);
1259 /**************************
1260 * CALCULATE INTERACTIONS *
1261 **************************/
1263 if (gmx_mm_any_lt(rsq13,rcutoff2))
1266 r13 = _mm_mul_ps(rsq13,rinv13);
1267 r13 = _mm_andnot_ps(dummy_mask,r13);
1269 /* EWALD ELECTROSTATICS */
1271 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1272 ewrt = _mm_mul_ps(r13,ewtabscale);
1273 ewitab = _mm_cvttps_epi32(ewrt);
1274 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1275 ewitab = _mm_slli_epi32(ewitab,2);
1276 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1277 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1278 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1279 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1280 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1281 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1282 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1283 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1284 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1286 d = _mm_sub_ps(r13,rswitch);
1287 d = _mm_max_ps(d,_mm_setzero_ps());
1288 d2 = _mm_mul_ps(d,d);
1289 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1291 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1293 /* Evaluate switch function */
1294 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1295 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1296 velec = _mm_mul_ps(velec,sw);
1297 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1299 /* Update potential sum for this i atom from the interaction with this j atom. */
1300 velec = _mm_and_ps(velec,cutoff_mask);
1301 velec = _mm_andnot_ps(dummy_mask,velec);
1302 velecsum = _mm_add_ps(velecsum,velec);
1306 fscal = _mm_and_ps(fscal,cutoff_mask);
1308 fscal = _mm_andnot_ps(dummy_mask,fscal);
1310 /* Calculate temporary vectorial force */
1311 tx = _mm_mul_ps(fscal,dx13);
1312 ty = _mm_mul_ps(fscal,dy13);
1313 tz = _mm_mul_ps(fscal,dz13);
1315 /* Update vectorial force */
1316 fix1 = _mm_add_ps(fix1,tx);
1317 fiy1 = _mm_add_ps(fiy1,ty);
1318 fiz1 = _mm_add_ps(fiz1,tz);
1320 fjx3 = _mm_add_ps(fjx3,tx);
1321 fjy3 = _mm_add_ps(fjy3,ty);
1322 fjz3 = _mm_add_ps(fjz3,tz);
1326 /**************************
1327 * CALCULATE INTERACTIONS *
1328 **************************/
1330 if (gmx_mm_any_lt(rsq21,rcutoff2))
1333 r21 = _mm_mul_ps(rsq21,rinv21);
1334 r21 = _mm_andnot_ps(dummy_mask,r21);
1336 /* EWALD ELECTROSTATICS */
1338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1339 ewrt = _mm_mul_ps(r21,ewtabscale);
1340 ewitab = _mm_cvttps_epi32(ewrt);
1341 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1342 ewitab = _mm_slli_epi32(ewitab,2);
1343 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1344 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1345 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1346 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1347 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1348 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1349 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1350 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1351 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1353 d = _mm_sub_ps(r21,rswitch);
1354 d = _mm_max_ps(d,_mm_setzero_ps());
1355 d2 = _mm_mul_ps(d,d);
1356 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1358 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1360 /* Evaluate switch function */
1361 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1362 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1363 velec = _mm_mul_ps(velec,sw);
1364 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1366 /* Update potential sum for this i atom from the interaction with this j atom. */
1367 velec = _mm_and_ps(velec,cutoff_mask);
1368 velec = _mm_andnot_ps(dummy_mask,velec);
1369 velecsum = _mm_add_ps(velecsum,velec);
1373 fscal = _mm_and_ps(fscal,cutoff_mask);
1375 fscal = _mm_andnot_ps(dummy_mask,fscal);
1377 /* Calculate temporary vectorial force */
1378 tx = _mm_mul_ps(fscal,dx21);
1379 ty = _mm_mul_ps(fscal,dy21);
1380 tz = _mm_mul_ps(fscal,dz21);
1382 /* Update vectorial force */
1383 fix2 = _mm_add_ps(fix2,tx);
1384 fiy2 = _mm_add_ps(fiy2,ty);
1385 fiz2 = _mm_add_ps(fiz2,tz);
1387 fjx1 = _mm_add_ps(fjx1,tx);
1388 fjy1 = _mm_add_ps(fjy1,ty);
1389 fjz1 = _mm_add_ps(fjz1,tz);
1393 /**************************
1394 * CALCULATE INTERACTIONS *
1395 **************************/
1397 if (gmx_mm_any_lt(rsq22,rcutoff2))
1400 r22 = _mm_mul_ps(rsq22,rinv22);
1401 r22 = _mm_andnot_ps(dummy_mask,r22);
1403 /* EWALD ELECTROSTATICS */
1405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1406 ewrt = _mm_mul_ps(r22,ewtabscale);
1407 ewitab = _mm_cvttps_epi32(ewrt);
1408 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1409 ewitab = _mm_slli_epi32(ewitab,2);
1410 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1411 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1412 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1413 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1414 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1415 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1416 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1417 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1418 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1420 d = _mm_sub_ps(r22,rswitch);
1421 d = _mm_max_ps(d,_mm_setzero_ps());
1422 d2 = _mm_mul_ps(d,d);
1423 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1425 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1427 /* Evaluate switch function */
1428 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1429 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1430 velec = _mm_mul_ps(velec,sw);
1431 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1433 /* Update potential sum for this i atom from the interaction with this j atom. */
1434 velec = _mm_and_ps(velec,cutoff_mask);
1435 velec = _mm_andnot_ps(dummy_mask,velec);
1436 velecsum = _mm_add_ps(velecsum,velec);
1440 fscal = _mm_and_ps(fscal,cutoff_mask);
1442 fscal = _mm_andnot_ps(dummy_mask,fscal);
1444 /* Calculate temporary vectorial force */
1445 tx = _mm_mul_ps(fscal,dx22);
1446 ty = _mm_mul_ps(fscal,dy22);
1447 tz = _mm_mul_ps(fscal,dz22);
1449 /* Update vectorial force */
1450 fix2 = _mm_add_ps(fix2,tx);
1451 fiy2 = _mm_add_ps(fiy2,ty);
1452 fiz2 = _mm_add_ps(fiz2,tz);
1454 fjx2 = _mm_add_ps(fjx2,tx);
1455 fjy2 = _mm_add_ps(fjy2,ty);
1456 fjz2 = _mm_add_ps(fjz2,tz);
1460 /**************************
1461 * CALCULATE INTERACTIONS *
1462 **************************/
1464 if (gmx_mm_any_lt(rsq23,rcutoff2))
1467 r23 = _mm_mul_ps(rsq23,rinv23);
1468 r23 = _mm_andnot_ps(dummy_mask,r23);
1470 /* EWALD ELECTROSTATICS */
1472 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1473 ewrt = _mm_mul_ps(r23,ewtabscale);
1474 ewitab = _mm_cvttps_epi32(ewrt);
1475 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1476 ewitab = _mm_slli_epi32(ewitab,2);
1477 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1478 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1479 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1480 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1481 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1482 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1483 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1484 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1485 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1487 d = _mm_sub_ps(r23,rswitch);
1488 d = _mm_max_ps(d,_mm_setzero_ps());
1489 d2 = _mm_mul_ps(d,d);
1490 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1492 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1494 /* Evaluate switch function */
1495 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1496 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1497 velec = _mm_mul_ps(velec,sw);
1498 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1500 /* Update potential sum for this i atom from the interaction with this j atom. */
1501 velec = _mm_and_ps(velec,cutoff_mask);
1502 velec = _mm_andnot_ps(dummy_mask,velec);
1503 velecsum = _mm_add_ps(velecsum,velec);
1507 fscal = _mm_and_ps(fscal,cutoff_mask);
1509 fscal = _mm_andnot_ps(dummy_mask,fscal);
1511 /* Calculate temporary vectorial force */
1512 tx = _mm_mul_ps(fscal,dx23);
1513 ty = _mm_mul_ps(fscal,dy23);
1514 tz = _mm_mul_ps(fscal,dz23);
1516 /* Update vectorial force */
1517 fix2 = _mm_add_ps(fix2,tx);
1518 fiy2 = _mm_add_ps(fiy2,ty);
1519 fiz2 = _mm_add_ps(fiz2,tz);
1521 fjx3 = _mm_add_ps(fjx3,tx);
1522 fjy3 = _mm_add_ps(fjy3,ty);
1523 fjz3 = _mm_add_ps(fjz3,tz);
1527 /**************************
1528 * CALCULATE INTERACTIONS *
1529 **************************/
1531 if (gmx_mm_any_lt(rsq31,rcutoff2))
1534 r31 = _mm_mul_ps(rsq31,rinv31);
1535 r31 = _mm_andnot_ps(dummy_mask,r31);
1537 /* EWALD ELECTROSTATICS */
1539 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1540 ewrt = _mm_mul_ps(r31,ewtabscale);
1541 ewitab = _mm_cvttps_epi32(ewrt);
1542 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1543 ewitab = _mm_slli_epi32(ewitab,2);
1544 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1545 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1546 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1547 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1548 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1549 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1550 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1551 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1552 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1554 d = _mm_sub_ps(r31,rswitch);
1555 d = _mm_max_ps(d,_mm_setzero_ps());
1556 d2 = _mm_mul_ps(d,d);
1557 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1559 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1561 /* Evaluate switch function */
1562 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1563 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1564 velec = _mm_mul_ps(velec,sw);
1565 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1567 /* Update potential sum for this i atom from the interaction with this j atom. */
1568 velec = _mm_and_ps(velec,cutoff_mask);
1569 velec = _mm_andnot_ps(dummy_mask,velec);
1570 velecsum = _mm_add_ps(velecsum,velec);
1574 fscal = _mm_and_ps(fscal,cutoff_mask);
1576 fscal = _mm_andnot_ps(dummy_mask,fscal);
1578 /* Calculate temporary vectorial force */
1579 tx = _mm_mul_ps(fscal,dx31);
1580 ty = _mm_mul_ps(fscal,dy31);
1581 tz = _mm_mul_ps(fscal,dz31);
1583 /* Update vectorial force */
1584 fix3 = _mm_add_ps(fix3,tx);
1585 fiy3 = _mm_add_ps(fiy3,ty);
1586 fiz3 = _mm_add_ps(fiz3,tz);
1588 fjx1 = _mm_add_ps(fjx1,tx);
1589 fjy1 = _mm_add_ps(fjy1,ty);
1590 fjz1 = _mm_add_ps(fjz1,tz);
1594 /**************************
1595 * CALCULATE INTERACTIONS *
1596 **************************/
1598 if (gmx_mm_any_lt(rsq32,rcutoff2))
1601 r32 = _mm_mul_ps(rsq32,rinv32);
1602 r32 = _mm_andnot_ps(dummy_mask,r32);
1604 /* EWALD ELECTROSTATICS */
1606 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1607 ewrt = _mm_mul_ps(r32,ewtabscale);
1608 ewitab = _mm_cvttps_epi32(ewrt);
1609 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1610 ewitab = _mm_slli_epi32(ewitab,2);
1611 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1612 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1613 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1614 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1615 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1616 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1617 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1618 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1619 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1621 d = _mm_sub_ps(r32,rswitch);
1622 d = _mm_max_ps(d,_mm_setzero_ps());
1623 d2 = _mm_mul_ps(d,d);
1624 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1626 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1628 /* Evaluate switch function */
1629 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1630 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1631 velec = _mm_mul_ps(velec,sw);
1632 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1634 /* Update potential sum for this i atom from the interaction with this j atom. */
1635 velec = _mm_and_ps(velec,cutoff_mask);
1636 velec = _mm_andnot_ps(dummy_mask,velec);
1637 velecsum = _mm_add_ps(velecsum,velec);
1641 fscal = _mm_and_ps(fscal,cutoff_mask);
1643 fscal = _mm_andnot_ps(dummy_mask,fscal);
1645 /* Calculate temporary vectorial force */
1646 tx = _mm_mul_ps(fscal,dx32);
1647 ty = _mm_mul_ps(fscal,dy32);
1648 tz = _mm_mul_ps(fscal,dz32);
1650 /* Update vectorial force */
1651 fix3 = _mm_add_ps(fix3,tx);
1652 fiy3 = _mm_add_ps(fiy3,ty);
1653 fiz3 = _mm_add_ps(fiz3,tz);
1655 fjx2 = _mm_add_ps(fjx2,tx);
1656 fjy2 = _mm_add_ps(fjy2,ty);
1657 fjz2 = _mm_add_ps(fjz2,tz);
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 if (gmx_mm_any_lt(rsq33,rcutoff2))
1668 r33 = _mm_mul_ps(rsq33,rinv33);
1669 r33 = _mm_andnot_ps(dummy_mask,r33);
1671 /* EWALD ELECTROSTATICS */
1673 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1674 ewrt = _mm_mul_ps(r33,ewtabscale);
1675 ewitab = _mm_cvttps_epi32(ewrt);
1676 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1677 ewitab = _mm_slli_epi32(ewitab,2);
1678 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1679 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1680 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1681 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1682 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1683 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1684 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1685 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1686 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1688 d = _mm_sub_ps(r33,rswitch);
1689 d = _mm_max_ps(d,_mm_setzero_ps());
1690 d2 = _mm_mul_ps(d,d);
1691 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1693 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1695 /* Evaluate switch function */
1696 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1697 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1698 velec = _mm_mul_ps(velec,sw);
1699 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1701 /* Update potential sum for this i atom from the interaction with this j atom. */
1702 velec = _mm_and_ps(velec,cutoff_mask);
1703 velec = _mm_andnot_ps(dummy_mask,velec);
1704 velecsum = _mm_add_ps(velecsum,velec);
1708 fscal = _mm_and_ps(fscal,cutoff_mask);
1710 fscal = _mm_andnot_ps(dummy_mask,fscal);
1712 /* Calculate temporary vectorial force */
1713 tx = _mm_mul_ps(fscal,dx33);
1714 ty = _mm_mul_ps(fscal,dy33);
1715 tz = _mm_mul_ps(fscal,dz33);
1717 /* Update vectorial force */
1718 fix3 = _mm_add_ps(fix3,tx);
1719 fiy3 = _mm_add_ps(fiy3,ty);
1720 fiz3 = _mm_add_ps(fiz3,tz);
1722 fjx3 = _mm_add_ps(fjx3,tx);
1723 fjy3 = _mm_add_ps(fjy3,ty);
1724 fjz3 = _mm_add_ps(fjz3,tz);
1728 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1729 f+j_coord_offsetC,f+j_coord_offsetD,
1730 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1731 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1733 /* Inner loop uses 657 flops */
1736 /* End of innermost loop */
1738 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1739 f+i_coord_offset,fshift+i_shift_offset);
1742 /* Update potential energies */
1743 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1744 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1746 /* Increment number of inner iterations */
1747 inneriter += j_index_end - j_index_start;
1749 /* Outer loop uses 38 flops */
1752 /* Increment number of outer iterations */
1755 /* Update outer/inner flops */
1757 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*657);
1760 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1761 * Electrostatics interaction: Ewald
1762 * VdW interaction: LennardJones
1763 * Geometry: Water4-Water4
1764 * Calculate force/pot: Force
1767 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1768 (t_nblist * gmx_restrict nlist,
1769 rvec * gmx_restrict xx,
1770 rvec * gmx_restrict ff,
1771 t_forcerec * gmx_restrict fr,
1772 t_mdatoms * gmx_restrict mdatoms,
1773 nb_kernel_data_t * gmx_restrict kernel_data,
1774 t_nrnb * gmx_restrict nrnb)
1776 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1777 * just 0 for non-waters.
1778 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1779 * jnr indices corresponding to data put in the four positions in the SIMD register.
1781 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1782 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1783 int jnrA,jnrB,jnrC,jnrD;
1784 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1785 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1786 real shX,shY,shZ,rcutoff_scalar;
1787 real *shiftvec,*fshift,*x,*f;
1788 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1790 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1792 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1794 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1796 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1797 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1798 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1799 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1800 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1801 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1802 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1803 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1804 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1805 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1806 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1807 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1808 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1809 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1810 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1811 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1812 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1813 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1814 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1815 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1818 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1821 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1822 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1824 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1826 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1827 real rswitch_scalar,d_scalar;
1828 __m128 dummy_mask,cutoff_mask;
1829 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1830 __m128 one = _mm_set1_ps(1.0);
1831 __m128 two = _mm_set1_ps(2.0);
1837 jindex = nlist->jindex;
1839 shiftidx = nlist->shift;
1841 shiftvec = fr->shift_vec[0];
1842 fshift = fr->fshift[0];
1843 facel = _mm_set1_ps(fr->epsfac);
1844 charge = mdatoms->chargeA;
1845 nvdwtype = fr->ntype;
1846 vdwparam = fr->nbfp;
1847 vdwtype = mdatoms->typeA;
1849 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1850 ewtab = fr->ic->tabq_coul_FDV0;
1851 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1852 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1854 /* Setup water-specific parameters */
1855 inr = nlist->iinr[0];
1856 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1857 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1858 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1859 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1861 jq1 = _mm_set1_ps(charge[inr+1]);
1862 jq2 = _mm_set1_ps(charge[inr+2]);
1863 jq3 = _mm_set1_ps(charge[inr+3]);
1864 vdwjidx0A = 2*vdwtype[inr+0];
1865 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1866 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1867 qq11 = _mm_mul_ps(iq1,jq1);
1868 qq12 = _mm_mul_ps(iq1,jq2);
1869 qq13 = _mm_mul_ps(iq1,jq3);
1870 qq21 = _mm_mul_ps(iq2,jq1);
1871 qq22 = _mm_mul_ps(iq2,jq2);
1872 qq23 = _mm_mul_ps(iq2,jq3);
1873 qq31 = _mm_mul_ps(iq3,jq1);
1874 qq32 = _mm_mul_ps(iq3,jq2);
1875 qq33 = _mm_mul_ps(iq3,jq3);
1877 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1878 rcutoff_scalar = fr->rcoulomb;
1879 rcutoff = _mm_set1_ps(rcutoff_scalar);
1880 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1882 rswitch_scalar = fr->rcoulomb_switch;
1883 rswitch = _mm_set1_ps(rswitch_scalar);
1884 /* Setup switch parameters */
1885 d_scalar = rcutoff_scalar-rswitch_scalar;
1886 d = _mm_set1_ps(d_scalar);
1887 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1888 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1889 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1890 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1891 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1892 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1894 /* Avoid stupid compiler warnings */
1895 jnrA = jnrB = jnrC = jnrD = 0;
1896 j_coord_offsetA = 0;
1897 j_coord_offsetB = 0;
1898 j_coord_offsetC = 0;
1899 j_coord_offsetD = 0;
1904 /* Start outer loop over neighborlists */
1905 for(iidx=0; iidx<nri; iidx++)
1907 /* Load shift vector for this list */
1908 i_shift_offset = DIM*shiftidx[iidx];
1909 shX = shiftvec[i_shift_offset+XX];
1910 shY = shiftvec[i_shift_offset+YY];
1911 shZ = shiftvec[i_shift_offset+ZZ];
1913 /* Load limits for loop over neighbors */
1914 j_index_start = jindex[iidx];
1915 j_index_end = jindex[iidx+1];
1917 /* Get outer coordinate index */
1919 i_coord_offset = DIM*inr;
1921 /* Load i particle coords and add shift vector */
1922 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1923 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1924 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1925 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1926 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1927 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1928 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1929 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1930 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1931 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1932 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1933 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1935 fix0 = _mm_setzero_ps();
1936 fiy0 = _mm_setzero_ps();
1937 fiz0 = _mm_setzero_ps();
1938 fix1 = _mm_setzero_ps();
1939 fiy1 = _mm_setzero_ps();
1940 fiz1 = _mm_setzero_ps();
1941 fix2 = _mm_setzero_ps();
1942 fiy2 = _mm_setzero_ps();
1943 fiz2 = _mm_setzero_ps();
1944 fix3 = _mm_setzero_ps();
1945 fiy3 = _mm_setzero_ps();
1946 fiz3 = _mm_setzero_ps();
1948 /* Start inner kernel loop */
1949 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1952 /* Get j neighbor index, and coordinate index */
1954 jnrB = jjnr[jidx+1];
1955 jnrC = jjnr[jidx+2];
1956 jnrD = jjnr[jidx+3];
1958 j_coord_offsetA = DIM*jnrA;
1959 j_coord_offsetB = DIM*jnrB;
1960 j_coord_offsetC = DIM*jnrC;
1961 j_coord_offsetD = DIM*jnrD;
1963 /* load j atom coordinates */
1964 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1965 x+j_coord_offsetC,x+j_coord_offsetD,
1966 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1967 &jy2,&jz2,&jx3,&jy3,&jz3);
1969 /* Calculate displacement vector */
1970 dx00 = _mm_sub_ps(ix0,jx0);
1971 dy00 = _mm_sub_ps(iy0,jy0);
1972 dz00 = _mm_sub_ps(iz0,jz0);
1973 dx11 = _mm_sub_ps(ix1,jx1);
1974 dy11 = _mm_sub_ps(iy1,jy1);
1975 dz11 = _mm_sub_ps(iz1,jz1);
1976 dx12 = _mm_sub_ps(ix1,jx2);
1977 dy12 = _mm_sub_ps(iy1,jy2);
1978 dz12 = _mm_sub_ps(iz1,jz2);
1979 dx13 = _mm_sub_ps(ix1,jx3);
1980 dy13 = _mm_sub_ps(iy1,jy3);
1981 dz13 = _mm_sub_ps(iz1,jz3);
1982 dx21 = _mm_sub_ps(ix2,jx1);
1983 dy21 = _mm_sub_ps(iy2,jy1);
1984 dz21 = _mm_sub_ps(iz2,jz1);
1985 dx22 = _mm_sub_ps(ix2,jx2);
1986 dy22 = _mm_sub_ps(iy2,jy2);
1987 dz22 = _mm_sub_ps(iz2,jz2);
1988 dx23 = _mm_sub_ps(ix2,jx3);
1989 dy23 = _mm_sub_ps(iy2,jy3);
1990 dz23 = _mm_sub_ps(iz2,jz3);
1991 dx31 = _mm_sub_ps(ix3,jx1);
1992 dy31 = _mm_sub_ps(iy3,jy1);
1993 dz31 = _mm_sub_ps(iz3,jz1);
1994 dx32 = _mm_sub_ps(ix3,jx2);
1995 dy32 = _mm_sub_ps(iy3,jy2);
1996 dz32 = _mm_sub_ps(iz3,jz2);
1997 dx33 = _mm_sub_ps(ix3,jx3);
1998 dy33 = _mm_sub_ps(iy3,jy3);
1999 dz33 = _mm_sub_ps(iz3,jz3);
2001 /* Calculate squared distance and things based on it */
2002 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2003 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2004 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2005 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2006 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2007 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2008 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2009 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2010 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2011 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2013 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2014 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2015 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2016 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2017 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2018 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2019 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2020 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2021 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2022 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2024 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2025 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2026 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2027 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2028 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2029 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2030 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2031 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2032 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2033 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2035 fjx0 = _mm_setzero_ps();
2036 fjy0 = _mm_setzero_ps();
2037 fjz0 = _mm_setzero_ps();
2038 fjx1 = _mm_setzero_ps();
2039 fjy1 = _mm_setzero_ps();
2040 fjz1 = _mm_setzero_ps();
2041 fjx2 = _mm_setzero_ps();
2042 fjy2 = _mm_setzero_ps();
2043 fjz2 = _mm_setzero_ps();
2044 fjx3 = _mm_setzero_ps();
2045 fjy3 = _mm_setzero_ps();
2046 fjz3 = _mm_setzero_ps();
2048 /**************************
2049 * CALCULATE INTERACTIONS *
2050 **************************/
2052 if (gmx_mm_any_lt(rsq00,rcutoff2))
2055 r00 = _mm_mul_ps(rsq00,rinv00);
2057 /* LENNARD-JONES DISPERSION/REPULSION */
2059 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2060 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2061 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2062 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2063 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2065 d = _mm_sub_ps(r00,rswitch);
2066 d = _mm_max_ps(d,_mm_setzero_ps());
2067 d2 = _mm_mul_ps(d,d);
2068 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2070 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2072 /* Evaluate switch function */
2073 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2074 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2075 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2079 fscal = _mm_and_ps(fscal,cutoff_mask);
2081 /* Calculate temporary vectorial force */
2082 tx = _mm_mul_ps(fscal,dx00);
2083 ty = _mm_mul_ps(fscal,dy00);
2084 tz = _mm_mul_ps(fscal,dz00);
2086 /* Update vectorial force */
2087 fix0 = _mm_add_ps(fix0,tx);
2088 fiy0 = _mm_add_ps(fiy0,ty);
2089 fiz0 = _mm_add_ps(fiz0,tz);
2091 fjx0 = _mm_add_ps(fjx0,tx);
2092 fjy0 = _mm_add_ps(fjy0,ty);
2093 fjz0 = _mm_add_ps(fjz0,tz);
2097 /**************************
2098 * CALCULATE INTERACTIONS *
2099 **************************/
2101 if (gmx_mm_any_lt(rsq11,rcutoff2))
2104 r11 = _mm_mul_ps(rsq11,rinv11);
2106 /* EWALD ELECTROSTATICS */
2108 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2109 ewrt = _mm_mul_ps(r11,ewtabscale);
2110 ewitab = _mm_cvttps_epi32(ewrt);
2111 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2112 ewitab = _mm_slli_epi32(ewitab,2);
2113 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2114 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2115 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2116 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2117 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2118 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2119 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2120 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2121 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2123 d = _mm_sub_ps(r11,rswitch);
2124 d = _mm_max_ps(d,_mm_setzero_ps());
2125 d2 = _mm_mul_ps(d,d);
2126 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2128 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2130 /* Evaluate switch function */
2131 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2132 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2133 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2137 fscal = _mm_and_ps(fscal,cutoff_mask);
2139 /* Calculate temporary vectorial force */
2140 tx = _mm_mul_ps(fscal,dx11);
2141 ty = _mm_mul_ps(fscal,dy11);
2142 tz = _mm_mul_ps(fscal,dz11);
2144 /* Update vectorial force */
2145 fix1 = _mm_add_ps(fix1,tx);
2146 fiy1 = _mm_add_ps(fiy1,ty);
2147 fiz1 = _mm_add_ps(fiz1,tz);
2149 fjx1 = _mm_add_ps(fjx1,tx);
2150 fjy1 = _mm_add_ps(fjy1,ty);
2151 fjz1 = _mm_add_ps(fjz1,tz);
2155 /**************************
2156 * CALCULATE INTERACTIONS *
2157 **************************/
2159 if (gmx_mm_any_lt(rsq12,rcutoff2))
2162 r12 = _mm_mul_ps(rsq12,rinv12);
2164 /* EWALD ELECTROSTATICS */
2166 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2167 ewrt = _mm_mul_ps(r12,ewtabscale);
2168 ewitab = _mm_cvttps_epi32(ewrt);
2169 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2170 ewitab = _mm_slli_epi32(ewitab,2);
2171 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2172 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2173 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2174 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2175 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2176 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2177 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2178 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2179 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2181 d = _mm_sub_ps(r12,rswitch);
2182 d = _mm_max_ps(d,_mm_setzero_ps());
2183 d2 = _mm_mul_ps(d,d);
2184 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2186 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2188 /* Evaluate switch function */
2189 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2190 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2191 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2195 fscal = _mm_and_ps(fscal,cutoff_mask);
2197 /* Calculate temporary vectorial force */
2198 tx = _mm_mul_ps(fscal,dx12);
2199 ty = _mm_mul_ps(fscal,dy12);
2200 tz = _mm_mul_ps(fscal,dz12);
2202 /* Update vectorial force */
2203 fix1 = _mm_add_ps(fix1,tx);
2204 fiy1 = _mm_add_ps(fiy1,ty);
2205 fiz1 = _mm_add_ps(fiz1,tz);
2207 fjx2 = _mm_add_ps(fjx2,tx);
2208 fjy2 = _mm_add_ps(fjy2,ty);
2209 fjz2 = _mm_add_ps(fjz2,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 if (gmx_mm_any_lt(rsq13,rcutoff2))
2220 r13 = _mm_mul_ps(rsq13,rinv13);
2222 /* EWALD ELECTROSTATICS */
2224 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2225 ewrt = _mm_mul_ps(r13,ewtabscale);
2226 ewitab = _mm_cvttps_epi32(ewrt);
2227 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2228 ewitab = _mm_slli_epi32(ewitab,2);
2229 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2230 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2231 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2232 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2233 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2234 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2235 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2236 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2237 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2239 d = _mm_sub_ps(r13,rswitch);
2240 d = _mm_max_ps(d,_mm_setzero_ps());
2241 d2 = _mm_mul_ps(d,d);
2242 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2244 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2246 /* Evaluate switch function */
2247 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2248 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2249 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2253 fscal = _mm_and_ps(fscal,cutoff_mask);
2255 /* Calculate temporary vectorial force */
2256 tx = _mm_mul_ps(fscal,dx13);
2257 ty = _mm_mul_ps(fscal,dy13);
2258 tz = _mm_mul_ps(fscal,dz13);
2260 /* Update vectorial force */
2261 fix1 = _mm_add_ps(fix1,tx);
2262 fiy1 = _mm_add_ps(fiy1,ty);
2263 fiz1 = _mm_add_ps(fiz1,tz);
2265 fjx3 = _mm_add_ps(fjx3,tx);
2266 fjy3 = _mm_add_ps(fjy3,ty);
2267 fjz3 = _mm_add_ps(fjz3,tz);
2271 /**************************
2272 * CALCULATE INTERACTIONS *
2273 **************************/
2275 if (gmx_mm_any_lt(rsq21,rcutoff2))
2278 r21 = _mm_mul_ps(rsq21,rinv21);
2280 /* EWALD ELECTROSTATICS */
2282 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2283 ewrt = _mm_mul_ps(r21,ewtabscale);
2284 ewitab = _mm_cvttps_epi32(ewrt);
2285 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2286 ewitab = _mm_slli_epi32(ewitab,2);
2287 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2288 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2289 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2290 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2291 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2292 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2293 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2294 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2295 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2297 d = _mm_sub_ps(r21,rswitch);
2298 d = _mm_max_ps(d,_mm_setzero_ps());
2299 d2 = _mm_mul_ps(d,d);
2300 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2302 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2304 /* Evaluate switch function */
2305 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2306 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2307 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2311 fscal = _mm_and_ps(fscal,cutoff_mask);
2313 /* Calculate temporary vectorial force */
2314 tx = _mm_mul_ps(fscal,dx21);
2315 ty = _mm_mul_ps(fscal,dy21);
2316 tz = _mm_mul_ps(fscal,dz21);
2318 /* Update vectorial force */
2319 fix2 = _mm_add_ps(fix2,tx);
2320 fiy2 = _mm_add_ps(fiy2,ty);
2321 fiz2 = _mm_add_ps(fiz2,tz);
2323 fjx1 = _mm_add_ps(fjx1,tx);
2324 fjy1 = _mm_add_ps(fjy1,ty);
2325 fjz1 = _mm_add_ps(fjz1,tz);
2329 /**************************
2330 * CALCULATE INTERACTIONS *
2331 **************************/
2333 if (gmx_mm_any_lt(rsq22,rcutoff2))
2336 r22 = _mm_mul_ps(rsq22,rinv22);
2338 /* EWALD ELECTROSTATICS */
2340 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2341 ewrt = _mm_mul_ps(r22,ewtabscale);
2342 ewitab = _mm_cvttps_epi32(ewrt);
2343 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2344 ewitab = _mm_slli_epi32(ewitab,2);
2345 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2346 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2347 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2348 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2349 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2350 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2351 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2352 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2353 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2355 d = _mm_sub_ps(r22,rswitch);
2356 d = _mm_max_ps(d,_mm_setzero_ps());
2357 d2 = _mm_mul_ps(d,d);
2358 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2360 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2362 /* Evaluate switch function */
2363 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2364 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2365 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2369 fscal = _mm_and_ps(fscal,cutoff_mask);
2371 /* Calculate temporary vectorial force */
2372 tx = _mm_mul_ps(fscal,dx22);
2373 ty = _mm_mul_ps(fscal,dy22);
2374 tz = _mm_mul_ps(fscal,dz22);
2376 /* Update vectorial force */
2377 fix2 = _mm_add_ps(fix2,tx);
2378 fiy2 = _mm_add_ps(fiy2,ty);
2379 fiz2 = _mm_add_ps(fiz2,tz);
2381 fjx2 = _mm_add_ps(fjx2,tx);
2382 fjy2 = _mm_add_ps(fjy2,ty);
2383 fjz2 = _mm_add_ps(fjz2,tz);
2387 /**************************
2388 * CALCULATE INTERACTIONS *
2389 **************************/
2391 if (gmx_mm_any_lt(rsq23,rcutoff2))
2394 r23 = _mm_mul_ps(rsq23,rinv23);
2396 /* EWALD ELECTROSTATICS */
2398 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2399 ewrt = _mm_mul_ps(r23,ewtabscale);
2400 ewitab = _mm_cvttps_epi32(ewrt);
2401 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2402 ewitab = _mm_slli_epi32(ewitab,2);
2403 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2404 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2405 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2406 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2407 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2408 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2409 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2410 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2411 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2413 d = _mm_sub_ps(r23,rswitch);
2414 d = _mm_max_ps(d,_mm_setzero_ps());
2415 d2 = _mm_mul_ps(d,d);
2416 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2418 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2420 /* Evaluate switch function */
2421 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2422 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2423 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2427 fscal = _mm_and_ps(fscal,cutoff_mask);
2429 /* Calculate temporary vectorial force */
2430 tx = _mm_mul_ps(fscal,dx23);
2431 ty = _mm_mul_ps(fscal,dy23);
2432 tz = _mm_mul_ps(fscal,dz23);
2434 /* Update vectorial force */
2435 fix2 = _mm_add_ps(fix2,tx);
2436 fiy2 = _mm_add_ps(fiy2,ty);
2437 fiz2 = _mm_add_ps(fiz2,tz);
2439 fjx3 = _mm_add_ps(fjx3,tx);
2440 fjy3 = _mm_add_ps(fjy3,ty);
2441 fjz3 = _mm_add_ps(fjz3,tz);
2445 /**************************
2446 * CALCULATE INTERACTIONS *
2447 **************************/
2449 if (gmx_mm_any_lt(rsq31,rcutoff2))
2452 r31 = _mm_mul_ps(rsq31,rinv31);
2454 /* EWALD ELECTROSTATICS */
2456 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2457 ewrt = _mm_mul_ps(r31,ewtabscale);
2458 ewitab = _mm_cvttps_epi32(ewrt);
2459 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2460 ewitab = _mm_slli_epi32(ewitab,2);
2461 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2462 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2463 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2464 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2465 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2466 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2467 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2468 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2469 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2471 d = _mm_sub_ps(r31,rswitch);
2472 d = _mm_max_ps(d,_mm_setzero_ps());
2473 d2 = _mm_mul_ps(d,d);
2474 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2476 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2478 /* Evaluate switch function */
2479 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2480 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2481 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2485 fscal = _mm_and_ps(fscal,cutoff_mask);
2487 /* Calculate temporary vectorial force */
2488 tx = _mm_mul_ps(fscal,dx31);
2489 ty = _mm_mul_ps(fscal,dy31);
2490 tz = _mm_mul_ps(fscal,dz31);
2492 /* Update vectorial force */
2493 fix3 = _mm_add_ps(fix3,tx);
2494 fiy3 = _mm_add_ps(fiy3,ty);
2495 fiz3 = _mm_add_ps(fiz3,tz);
2497 fjx1 = _mm_add_ps(fjx1,tx);
2498 fjy1 = _mm_add_ps(fjy1,ty);
2499 fjz1 = _mm_add_ps(fjz1,tz);
2503 /**************************
2504 * CALCULATE INTERACTIONS *
2505 **************************/
2507 if (gmx_mm_any_lt(rsq32,rcutoff2))
2510 r32 = _mm_mul_ps(rsq32,rinv32);
2512 /* EWALD ELECTROSTATICS */
2514 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2515 ewrt = _mm_mul_ps(r32,ewtabscale);
2516 ewitab = _mm_cvttps_epi32(ewrt);
2517 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2518 ewitab = _mm_slli_epi32(ewitab,2);
2519 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2520 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2521 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2522 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2523 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2524 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2525 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2526 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2527 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2529 d = _mm_sub_ps(r32,rswitch);
2530 d = _mm_max_ps(d,_mm_setzero_ps());
2531 d2 = _mm_mul_ps(d,d);
2532 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2534 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2536 /* Evaluate switch function */
2537 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2538 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2539 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2543 fscal = _mm_and_ps(fscal,cutoff_mask);
2545 /* Calculate temporary vectorial force */
2546 tx = _mm_mul_ps(fscal,dx32);
2547 ty = _mm_mul_ps(fscal,dy32);
2548 tz = _mm_mul_ps(fscal,dz32);
2550 /* Update vectorial force */
2551 fix3 = _mm_add_ps(fix3,tx);
2552 fiy3 = _mm_add_ps(fiy3,ty);
2553 fiz3 = _mm_add_ps(fiz3,tz);
2555 fjx2 = _mm_add_ps(fjx2,tx);
2556 fjy2 = _mm_add_ps(fjy2,ty);
2557 fjz2 = _mm_add_ps(fjz2,tz);
2561 /**************************
2562 * CALCULATE INTERACTIONS *
2563 **************************/
2565 if (gmx_mm_any_lt(rsq33,rcutoff2))
2568 r33 = _mm_mul_ps(rsq33,rinv33);
2570 /* EWALD ELECTROSTATICS */
2572 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2573 ewrt = _mm_mul_ps(r33,ewtabscale);
2574 ewitab = _mm_cvttps_epi32(ewrt);
2575 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2576 ewitab = _mm_slli_epi32(ewitab,2);
2577 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2578 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2579 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2580 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2581 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2582 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2583 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2584 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2585 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2587 d = _mm_sub_ps(r33,rswitch);
2588 d = _mm_max_ps(d,_mm_setzero_ps());
2589 d2 = _mm_mul_ps(d,d);
2590 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2592 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2594 /* Evaluate switch function */
2595 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2596 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2597 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2601 fscal = _mm_and_ps(fscal,cutoff_mask);
2603 /* Calculate temporary vectorial force */
2604 tx = _mm_mul_ps(fscal,dx33);
2605 ty = _mm_mul_ps(fscal,dy33);
2606 tz = _mm_mul_ps(fscal,dz33);
2608 /* Update vectorial force */
2609 fix3 = _mm_add_ps(fix3,tx);
2610 fiy3 = _mm_add_ps(fiy3,ty);
2611 fiz3 = _mm_add_ps(fiz3,tz);
2613 fjx3 = _mm_add_ps(fjx3,tx);
2614 fjy3 = _mm_add_ps(fjy3,ty);
2615 fjz3 = _mm_add_ps(fjz3,tz);
2619 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2620 f+j_coord_offsetC,f+j_coord_offsetD,
2621 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2622 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2624 /* Inner loop uses 617 flops */
2627 if(jidx<j_index_end)
2630 /* Get j neighbor index, and coordinate index */
2632 jnrB = jjnr[jidx+1];
2633 jnrC = jjnr[jidx+2];
2634 jnrD = jjnr[jidx+3];
2636 /* Sign of each element will be negative for non-real atoms.
2637 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2638 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2640 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2641 jnrA = (jnrA>=0) ? jnrA : 0;
2642 jnrB = (jnrB>=0) ? jnrB : 0;
2643 jnrC = (jnrC>=0) ? jnrC : 0;
2644 jnrD = (jnrD>=0) ? jnrD : 0;
2646 j_coord_offsetA = DIM*jnrA;
2647 j_coord_offsetB = DIM*jnrB;
2648 j_coord_offsetC = DIM*jnrC;
2649 j_coord_offsetD = DIM*jnrD;
2651 /* load j atom coordinates */
2652 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2653 x+j_coord_offsetC,x+j_coord_offsetD,
2654 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2655 &jy2,&jz2,&jx3,&jy3,&jz3);
2657 /* Calculate displacement vector */
2658 dx00 = _mm_sub_ps(ix0,jx0);
2659 dy00 = _mm_sub_ps(iy0,jy0);
2660 dz00 = _mm_sub_ps(iz0,jz0);
2661 dx11 = _mm_sub_ps(ix1,jx1);
2662 dy11 = _mm_sub_ps(iy1,jy1);
2663 dz11 = _mm_sub_ps(iz1,jz1);
2664 dx12 = _mm_sub_ps(ix1,jx2);
2665 dy12 = _mm_sub_ps(iy1,jy2);
2666 dz12 = _mm_sub_ps(iz1,jz2);
2667 dx13 = _mm_sub_ps(ix1,jx3);
2668 dy13 = _mm_sub_ps(iy1,jy3);
2669 dz13 = _mm_sub_ps(iz1,jz3);
2670 dx21 = _mm_sub_ps(ix2,jx1);
2671 dy21 = _mm_sub_ps(iy2,jy1);
2672 dz21 = _mm_sub_ps(iz2,jz1);
2673 dx22 = _mm_sub_ps(ix2,jx2);
2674 dy22 = _mm_sub_ps(iy2,jy2);
2675 dz22 = _mm_sub_ps(iz2,jz2);
2676 dx23 = _mm_sub_ps(ix2,jx3);
2677 dy23 = _mm_sub_ps(iy2,jy3);
2678 dz23 = _mm_sub_ps(iz2,jz3);
2679 dx31 = _mm_sub_ps(ix3,jx1);
2680 dy31 = _mm_sub_ps(iy3,jy1);
2681 dz31 = _mm_sub_ps(iz3,jz1);
2682 dx32 = _mm_sub_ps(ix3,jx2);
2683 dy32 = _mm_sub_ps(iy3,jy2);
2684 dz32 = _mm_sub_ps(iz3,jz2);
2685 dx33 = _mm_sub_ps(ix3,jx3);
2686 dy33 = _mm_sub_ps(iy3,jy3);
2687 dz33 = _mm_sub_ps(iz3,jz3);
2689 /* Calculate squared distance and things based on it */
2690 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2691 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2692 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2693 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2694 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2695 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2696 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2697 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2698 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2699 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2701 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2702 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2703 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2704 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2705 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2706 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2707 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2708 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2709 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2710 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2712 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2713 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2714 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2715 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2716 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2717 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2718 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2719 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2720 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2721 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2723 fjx0 = _mm_setzero_ps();
2724 fjy0 = _mm_setzero_ps();
2725 fjz0 = _mm_setzero_ps();
2726 fjx1 = _mm_setzero_ps();
2727 fjy1 = _mm_setzero_ps();
2728 fjz1 = _mm_setzero_ps();
2729 fjx2 = _mm_setzero_ps();
2730 fjy2 = _mm_setzero_ps();
2731 fjz2 = _mm_setzero_ps();
2732 fjx3 = _mm_setzero_ps();
2733 fjy3 = _mm_setzero_ps();
2734 fjz3 = _mm_setzero_ps();
2736 /**************************
2737 * CALCULATE INTERACTIONS *
2738 **************************/
2740 if (gmx_mm_any_lt(rsq00,rcutoff2))
2743 r00 = _mm_mul_ps(rsq00,rinv00);
2744 r00 = _mm_andnot_ps(dummy_mask,r00);
2746 /* LENNARD-JONES DISPERSION/REPULSION */
2748 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2749 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2750 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2751 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2752 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2754 d = _mm_sub_ps(r00,rswitch);
2755 d = _mm_max_ps(d,_mm_setzero_ps());
2756 d2 = _mm_mul_ps(d,d);
2757 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2759 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2761 /* Evaluate switch function */
2762 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2763 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2764 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2768 fscal = _mm_and_ps(fscal,cutoff_mask);
2770 fscal = _mm_andnot_ps(dummy_mask,fscal);
2772 /* Calculate temporary vectorial force */
2773 tx = _mm_mul_ps(fscal,dx00);
2774 ty = _mm_mul_ps(fscal,dy00);
2775 tz = _mm_mul_ps(fscal,dz00);
2777 /* Update vectorial force */
2778 fix0 = _mm_add_ps(fix0,tx);
2779 fiy0 = _mm_add_ps(fiy0,ty);
2780 fiz0 = _mm_add_ps(fiz0,tz);
2782 fjx0 = _mm_add_ps(fjx0,tx);
2783 fjy0 = _mm_add_ps(fjy0,ty);
2784 fjz0 = _mm_add_ps(fjz0,tz);
2788 /**************************
2789 * CALCULATE INTERACTIONS *
2790 **************************/
2792 if (gmx_mm_any_lt(rsq11,rcutoff2))
2795 r11 = _mm_mul_ps(rsq11,rinv11);
2796 r11 = _mm_andnot_ps(dummy_mask,r11);
2798 /* EWALD ELECTROSTATICS */
2800 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2801 ewrt = _mm_mul_ps(r11,ewtabscale);
2802 ewitab = _mm_cvttps_epi32(ewrt);
2803 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2804 ewitab = _mm_slli_epi32(ewitab,2);
2805 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2806 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2807 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2808 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2809 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2810 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2811 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2812 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2813 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2815 d = _mm_sub_ps(r11,rswitch);
2816 d = _mm_max_ps(d,_mm_setzero_ps());
2817 d2 = _mm_mul_ps(d,d);
2818 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2820 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2822 /* Evaluate switch function */
2823 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2824 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2825 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2829 fscal = _mm_and_ps(fscal,cutoff_mask);
2831 fscal = _mm_andnot_ps(dummy_mask,fscal);
2833 /* Calculate temporary vectorial force */
2834 tx = _mm_mul_ps(fscal,dx11);
2835 ty = _mm_mul_ps(fscal,dy11);
2836 tz = _mm_mul_ps(fscal,dz11);
2838 /* Update vectorial force */
2839 fix1 = _mm_add_ps(fix1,tx);
2840 fiy1 = _mm_add_ps(fiy1,ty);
2841 fiz1 = _mm_add_ps(fiz1,tz);
2843 fjx1 = _mm_add_ps(fjx1,tx);
2844 fjy1 = _mm_add_ps(fjy1,ty);
2845 fjz1 = _mm_add_ps(fjz1,tz);
2849 /**************************
2850 * CALCULATE INTERACTIONS *
2851 **************************/
2853 if (gmx_mm_any_lt(rsq12,rcutoff2))
2856 r12 = _mm_mul_ps(rsq12,rinv12);
2857 r12 = _mm_andnot_ps(dummy_mask,r12);
2859 /* EWALD ELECTROSTATICS */
2861 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2862 ewrt = _mm_mul_ps(r12,ewtabscale);
2863 ewitab = _mm_cvttps_epi32(ewrt);
2864 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2865 ewitab = _mm_slli_epi32(ewitab,2);
2866 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2867 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2868 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2869 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2870 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2871 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2872 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2873 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2874 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2876 d = _mm_sub_ps(r12,rswitch);
2877 d = _mm_max_ps(d,_mm_setzero_ps());
2878 d2 = _mm_mul_ps(d,d);
2879 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2881 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2883 /* Evaluate switch function */
2884 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2885 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2886 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2890 fscal = _mm_and_ps(fscal,cutoff_mask);
2892 fscal = _mm_andnot_ps(dummy_mask,fscal);
2894 /* Calculate temporary vectorial force */
2895 tx = _mm_mul_ps(fscal,dx12);
2896 ty = _mm_mul_ps(fscal,dy12);
2897 tz = _mm_mul_ps(fscal,dz12);
2899 /* Update vectorial force */
2900 fix1 = _mm_add_ps(fix1,tx);
2901 fiy1 = _mm_add_ps(fiy1,ty);
2902 fiz1 = _mm_add_ps(fiz1,tz);
2904 fjx2 = _mm_add_ps(fjx2,tx);
2905 fjy2 = _mm_add_ps(fjy2,ty);
2906 fjz2 = _mm_add_ps(fjz2,tz);
2910 /**************************
2911 * CALCULATE INTERACTIONS *
2912 **************************/
2914 if (gmx_mm_any_lt(rsq13,rcutoff2))
2917 r13 = _mm_mul_ps(rsq13,rinv13);
2918 r13 = _mm_andnot_ps(dummy_mask,r13);
2920 /* EWALD ELECTROSTATICS */
2922 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2923 ewrt = _mm_mul_ps(r13,ewtabscale);
2924 ewitab = _mm_cvttps_epi32(ewrt);
2925 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2926 ewitab = _mm_slli_epi32(ewitab,2);
2927 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2928 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2929 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2930 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2931 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2932 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2933 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2934 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2935 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2937 d = _mm_sub_ps(r13,rswitch);
2938 d = _mm_max_ps(d,_mm_setzero_ps());
2939 d2 = _mm_mul_ps(d,d);
2940 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2942 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2944 /* Evaluate switch function */
2945 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2946 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2947 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2951 fscal = _mm_and_ps(fscal,cutoff_mask);
2953 fscal = _mm_andnot_ps(dummy_mask,fscal);
2955 /* Calculate temporary vectorial force */
2956 tx = _mm_mul_ps(fscal,dx13);
2957 ty = _mm_mul_ps(fscal,dy13);
2958 tz = _mm_mul_ps(fscal,dz13);
2960 /* Update vectorial force */
2961 fix1 = _mm_add_ps(fix1,tx);
2962 fiy1 = _mm_add_ps(fiy1,ty);
2963 fiz1 = _mm_add_ps(fiz1,tz);
2965 fjx3 = _mm_add_ps(fjx3,tx);
2966 fjy3 = _mm_add_ps(fjy3,ty);
2967 fjz3 = _mm_add_ps(fjz3,tz);
2971 /**************************
2972 * CALCULATE INTERACTIONS *
2973 **************************/
2975 if (gmx_mm_any_lt(rsq21,rcutoff2))
2978 r21 = _mm_mul_ps(rsq21,rinv21);
2979 r21 = _mm_andnot_ps(dummy_mask,r21);
2981 /* EWALD ELECTROSTATICS */
2983 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2984 ewrt = _mm_mul_ps(r21,ewtabscale);
2985 ewitab = _mm_cvttps_epi32(ewrt);
2986 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2987 ewitab = _mm_slli_epi32(ewitab,2);
2988 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2989 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2990 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2991 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2992 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2993 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2994 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2995 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2996 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2998 d = _mm_sub_ps(r21,rswitch);
2999 d = _mm_max_ps(d,_mm_setzero_ps());
3000 d2 = _mm_mul_ps(d,d);
3001 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3003 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3005 /* Evaluate switch function */
3006 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3007 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3008 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3012 fscal = _mm_and_ps(fscal,cutoff_mask);
3014 fscal = _mm_andnot_ps(dummy_mask,fscal);
3016 /* Calculate temporary vectorial force */
3017 tx = _mm_mul_ps(fscal,dx21);
3018 ty = _mm_mul_ps(fscal,dy21);
3019 tz = _mm_mul_ps(fscal,dz21);
3021 /* Update vectorial force */
3022 fix2 = _mm_add_ps(fix2,tx);
3023 fiy2 = _mm_add_ps(fiy2,ty);
3024 fiz2 = _mm_add_ps(fiz2,tz);
3026 fjx1 = _mm_add_ps(fjx1,tx);
3027 fjy1 = _mm_add_ps(fjy1,ty);
3028 fjz1 = _mm_add_ps(fjz1,tz);
3032 /**************************
3033 * CALCULATE INTERACTIONS *
3034 **************************/
3036 if (gmx_mm_any_lt(rsq22,rcutoff2))
3039 r22 = _mm_mul_ps(rsq22,rinv22);
3040 r22 = _mm_andnot_ps(dummy_mask,r22);
3042 /* EWALD ELECTROSTATICS */
3044 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3045 ewrt = _mm_mul_ps(r22,ewtabscale);
3046 ewitab = _mm_cvttps_epi32(ewrt);
3047 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3048 ewitab = _mm_slli_epi32(ewitab,2);
3049 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3050 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3051 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3052 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3053 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3054 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3055 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3056 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3057 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3059 d = _mm_sub_ps(r22,rswitch);
3060 d = _mm_max_ps(d,_mm_setzero_ps());
3061 d2 = _mm_mul_ps(d,d);
3062 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3064 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3066 /* Evaluate switch function */
3067 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3068 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3069 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3073 fscal = _mm_and_ps(fscal,cutoff_mask);
3075 fscal = _mm_andnot_ps(dummy_mask,fscal);
3077 /* Calculate temporary vectorial force */
3078 tx = _mm_mul_ps(fscal,dx22);
3079 ty = _mm_mul_ps(fscal,dy22);
3080 tz = _mm_mul_ps(fscal,dz22);
3082 /* Update vectorial force */
3083 fix2 = _mm_add_ps(fix2,tx);
3084 fiy2 = _mm_add_ps(fiy2,ty);
3085 fiz2 = _mm_add_ps(fiz2,tz);
3087 fjx2 = _mm_add_ps(fjx2,tx);
3088 fjy2 = _mm_add_ps(fjy2,ty);
3089 fjz2 = _mm_add_ps(fjz2,tz);
3093 /**************************
3094 * CALCULATE INTERACTIONS *
3095 **************************/
3097 if (gmx_mm_any_lt(rsq23,rcutoff2))
3100 r23 = _mm_mul_ps(rsq23,rinv23);
3101 r23 = _mm_andnot_ps(dummy_mask,r23);
3103 /* EWALD ELECTROSTATICS */
3105 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3106 ewrt = _mm_mul_ps(r23,ewtabscale);
3107 ewitab = _mm_cvttps_epi32(ewrt);
3108 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3109 ewitab = _mm_slli_epi32(ewitab,2);
3110 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3111 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3112 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3113 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3114 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3115 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3116 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3117 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3118 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3120 d = _mm_sub_ps(r23,rswitch);
3121 d = _mm_max_ps(d,_mm_setzero_ps());
3122 d2 = _mm_mul_ps(d,d);
3123 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3125 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3127 /* Evaluate switch function */
3128 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3129 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3130 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
3134 fscal = _mm_and_ps(fscal,cutoff_mask);
3136 fscal = _mm_andnot_ps(dummy_mask,fscal);
3138 /* Calculate temporary vectorial force */
3139 tx = _mm_mul_ps(fscal,dx23);
3140 ty = _mm_mul_ps(fscal,dy23);
3141 tz = _mm_mul_ps(fscal,dz23);
3143 /* Update vectorial force */
3144 fix2 = _mm_add_ps(fix2,tx);
3145 fiy2 = _mm_add_ps(fiy2,ty);
3146 fiz2 = _mm_add_ps(fiz2,tz);
3148 fjx3 = _mm_add_ps(fjx3,tx);
3149 fjy3 = _mm_add_ps(fjy3,ty);
3150 fjz3 = _mm_add_ps(fjz3,tz);
3154 /**************************
3155 * CALCULATE INTERACTIONS *
3156 **************************/
3158 if (gmx_mm_any_lt(rsq31,rcutoff2))
3161 r31 = _mm_mul_ps(rsq31,rinv31);
3162 r31 = _mm_andnot_ps(dummy_mask,r31);
3164 /* EWALD ELECTROSTATICS */
3166 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3167 ewrt = _mm_mul_ps(r31,ewtabscale);
3168 ewitab = _mm_cvttps_epi32(ewrt);
3169 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3170 ewitab = _mm_slli_epi32(ewitab,2);
3171 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3172 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3173 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3174 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3175 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3176 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3177 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3178 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3179 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3181 d = _mm_sub_ps(r31,rswitch);
3182 d = _mm_max_ps(d,_mm_setzero_ps());
3183 d2 = _mm_mul_ps(d,d);
3184 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3186 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3188 /* Evaluate switch function */
3189 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3190 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3191 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
3195 fscal = _mm_and_ps(fscal,cutoff_mask);
3197 fscal = _mm_andnot_ps(dummy_mask,fscal);
3199 /* Calculate temporary vectorial force */
3200 tx = _mm_mul_ps(fscal,dx31);
3201 ty = _mm_mul_ps(fscal,dy31);
3202 tz = _mm_mul_ps(fscal,dz31);
3204 /* Update vectorial force */
3205 fix3 = _mm_add_ps(fix3,tx);
3206 fiy3 = _mm_add_ps(fiy3,ty);
3207 fiz3 = _mm_add_ps(fiz3,tz);
3209 fjx1 = _mm_add_ps(fjx1,tx);
3210 fjy1 = _mm_add_ps(fjy1,ty);
3211 fjz1 = _mm_add_ps(fjz1,tz);
3215 /**************************
3216 * CALCULATE INTERACTIONS *
3217 **************************/
3219 if (gmx_mm_any_lt(rsq32,rcutoff2))
3222 r32 = _mm_mul_ps(rsq32,rinv32);
3223 r32 = _mm_andnot_ps(dummy_mask,r32);
3225 /* EWALD ELECTROSTATICS */
3227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3228 ewrt = _mm_mul_ps(r32,ewtabscale);
3229 ewitab = _mm_cvttps_epi32(ewrt);
3230 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3231 ewitab = _mm_slli_epi32(ewitab,2);
3232 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3233 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3234 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3235 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3236 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3237 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3238 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3239 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3240 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3242 d = _mm_sub_ps(r32,rswitch);
3243 d = _mm_max_ps(d,_mm_setzero_ps());
3244 d2 = _mm_mul_ps(d,d);
3245 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3247 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3249 /* Evaluate switch function */
3250 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3251 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3252 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
3256 fscal = _mm_and_ps(fscal,cutoff_mask);
3258 fscal = _mm_andnot_ps(dummy_mask,fscal);
3260 /* Calculate temporary vectorial force */
3261 tx = _mm_mul_ps(fscal,dx32);
3262 ty = _mm_mul_ps(fscal,dy32);
3263 tz = _mm_mul_ps(fscal,dz32);
3265 /* Update vectorial force */
3266 fix3 = _mm_add_ps(fix3,tx);
3267 fiy3 = _mm_add_ps(fiy3,ty);
3268 fiz3 = _mm_add_ps(fiz3,tz);
3270 fjx2 = _mm_add_ps(fjx2,tx);
3271 fjy2 = _mm_add_ps(fjy2,ty);
3272 fjz2 = _mm_add_ps(fjz2,tz);
3276 /**************************
3277 * CALCULATE INTERACTIONS *
3278 **************************/
3280 if (gmx_mm_any_lt(rsq33,rcutoff2))
3283 r33 = _mm_mul_ps(rsq33,rinv33);
3284 r33 = _mm_andnot_ps(dummy_mask,r33);
3286 /* EWALD ELECTROSTATICS */
3288 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3289 ewrt = _mm_mul_ps(r33,ewtabscale);
3290 ewitab = _mm_cvttps_epi32(ewrt);
3291 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3292 ewitab = _mm_slli_epi32(ewitab,2);
3293 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3294 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3295 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3296 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3297 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3298 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3299 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3300 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3301 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3303 d = _mm_sub_ps(r33,rswitch);
3304 d = _mm_max_ps(d,_mm_setzero_ps());
3305 d2 = _mm_mul_ps(d,d);
3306 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3308 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3310 /* Evaluate switch function */
3311 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3312 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3313 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
3317 fscal = _mm_and_ps(fscal,cutoff_mask);
3319 fscal = _mm_andnot_ps(dummy_mask,fscal);
3321 /* Calculate temporary vectorial force */
3322 tx = _mm_mul_ps(fscal,dx33);
3323 ty = _mm_mul_ps(fscal,dy33);
3324 tz = _mm_mul_ps(fscal,dz33);
3326 /* Update vectorial force */
3327 fix3 = _mm_add_ps(fix3,tx);
3328 fiy3 = _mm_add_ps(fiy3,ty);
3329 fiz3 = _mm_add_ps(fiz3,tz);
3331 fjx3 = _mm_add_ps(fjx3,tx);
3332 fjy3 = _mm_add_ps(fjy3,ty);
3333 fjz3 = _mm_add_ps(fjz3,tz);
3337 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
3338 f+j_coord_offsetC,f+j_coord_offsetD,
3339 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3340 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3342 /* Inner loop uses 627 flops */
3345 /* End of innermost loop */
3347 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3348 f+i_coord_offset,fshift+i_shift_offset);
3350 /* Increment number of inner iterations */
3351 inneriter += j_index_end - j_index_start;
3353 /* Outer loop uses 36 flops */
3356 /* Increment number of outer iterations */
3359 /* Update outer/inner flops */
3361 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*627);