2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
101 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
102 real rswitch_scalar,d_scalar;
103 __m128 dummy_mask,cutoff_mask;
104 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
105 __m128 one = _mm_set1_ps(1.0);
106 __m128 two = _mm_set1_ps(2.0);
112 jindex = nlist->jindex;
114 shiftidx = nlist->shift;
116 shiftvec = fr->shift_vec[0];
117 fshift = fr->fshift[0];
118 facel = _mm_set1_ps(fr->epsfac);
119 charge = mdatoms->chargeA;
120 nvdwtype = fr->ntype;
122 vdwtype = mdatoms->typeA;
124 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
125 ewtab = fr->ic->tabq_coul_FDV0;
126 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
127 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
129 /* Setup water-specific parameters */
130 inr = nlist->iinr[0];
131 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
132 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
133 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
134 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
136 jq0 = _mm_set1_ps(charge[inr+0]);
137 jq1 = _mm_set1_ps(charge[inr+1]);
138 jq2 = _mm_set1_ps(charge[inr+2]);
139 vdwjidx0A = 2*vdwtype[inr+0];
140 qq00 = _mm_mul_ps(iq0,jq0);
141 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
142 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
143 qq01 = _mm_mul_ps(iq0,jq1);
144 qq02 = _mm_mul_ps(iq0,jq2);
145 qq10 = _mm_mul_ps(iq1,jq0);
146 qq11 = _mm_mul_ps(iq1,jq1);
147 qq12 = _mm_mul_ps(iq1,jq2);
148 qq20 = _mm_mul_ps(iq2,jq0);
149 qq21 = _mm_mul_ps(iq2,jq1);
150 qq22 = _mm_mul_ps(iq2,jq2);
152 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
153 rcutoff_scalar = fr->rcoulomb;
154 rcutoff = _mm_set1_ps(rcutoff_scalar);
155 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
157 rswitch_scalar = fr->rcoulomb_switch;
158 rswitch = _mm_set1_ps(rswitch_scalar);
159 /* Setup switch parameters */
160 d_scalar = rcutoff_scalar-rswitch_scalar;
161 d = _mm_set1_ps(d_scalar);
162 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
163 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
166 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
167 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
169 /* Avoid stupid compiler warnings */
170 jnrA = jnrB = jnrC = jnrD = 0;
179 for(iidx=0;iidx<4*DIM;iidx++)
184 /* Start outer loop over neighborlists */
185 for(iidx=0; iidx<nri; iidx++)
187 /* Load shift vector for this list */
188 i_shift_offset = DIM*shiftidx[iidx];
190 /* Load limits for loop over neighbors */
191 j_index_start = jindex[iidx];
192 j_index_end = jindex[iidx+1];
194 /* Get outer coordinate index */
196 i_coord_offset = DIM*inr;
198 /* Load i particle coords and add shift vector */
199 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
200 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
202 fix0 = _mm_setzero_ps();
203 fiy0 = _mm_setzero_ps();
204 fiz0 = _mm_setzero_ps();
205 fix1 = _mm_setzero_ps();
206 fiy1 = _mm_setzero_ps();
207 fiz1 = _mm_setzero_ps();
208 fix2 = _mm_setzero_ps();
209 fiy2 = _mm_setzero_ps();
210 fiz2 = _mm_setzero_ps();
212 /* Reset potential sums */
213 velecsum = _mm_setzero_ps();
214 vvdwsum = _mm_setzero_ps();
216 /* Start inner kernel loop */
217 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
220 /* Get j neighbor index, and coordinate index */
225 j_coord_offsetA = DIM*jnrA;
226 j_coord_offsetB = DIM*jnrB;
227 j_coord_offsetC = DIM*jnrC;
228 j_coord_offsetD = DIM*jnrD;
230 /* load j atom coordinates */
231 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
232 x+j_coord_offsetC,x+j_coord_offsetD,
233 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
235 /* Calculate displacement vector */
236 dx00 = _mm_sub_ps(ix0,jx0);
237 dy00 = _mm_sub_ps(iy0,jy0);
238 dz00 = _mm_sub_ps(iz0,jz0);
239 dx01 = _mm_sub_ps(ix0,jx1);
240 dy01 = _mm_sub_ps(iy0,jy1);
241 dz01 = _mm_sub_ps(iz0,jz1);
242 dx02 = _mm_sub_ps(ix0,jx2);
243 dy02 = _mm_sub_ps(iy0,jy2);
244 dz02 = _mm_sub_ps(iz0,jz2);
245 dx10 = _mm_sub_ps(ix1,jx0);
246 dy10 = _mm_sub_ps(iy1,jy0);
247 dz10 = _mm_sub_ps(iz1,jz0);
248 dx11 = _mm_sub_ps(ix1,jx1);
249 dy11 = _mm_sub_ps(iy1,jy1);
250 dz11 = _mm_sub_ps(iz1,jz1);
251 dx12 = _mm_sub_ps(ix1,jx2);
252 dy12 = _mm_sub_ps(iy1,jy2);
253 dz12 = _mm_sub_ps(iz1,jz2);
254 dx20 = _mm_sub_ps(ix2,jx0);
255 dy20 = _mm_sub_ps(iy2,jy0);
256 dz20 = _mm_sub_ps(iz2,jz0);
257 dx21 = _mm_sub_ps(ix2,jx1);
258 dy21 = _mm_sub_ps(iy2,jy1);
259 dz21 = _mm_sub_ps(iz2,jz1);
260 dx22 = _mm_sub_ps(ix2,jx2);
261 dy22 = _mm_sub_ps(iy2,jy2);
262 dz22 = _mm_sub_ps(iz2,jz2);
264 /* Calculate squared distance and things based on it */
265 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
266 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
267 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
268 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
269 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
270 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
271 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
272 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
273 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
275 rinv00 = gmx_mm_invsqrt_ps(rsq00);
276 rinv01 = gmx_mm_invsqrt_ps(rsq01);
277 rinv02 = gmx_mm_invsqrt_ps(rsq02);
278 rinv10 = gmx_mm_invsqrt_ps(rsq10);
279 rinv11 = gmx_mm_invsqrt_ps(rsq11);
280 rinv12 = gmx_mm_invsqrt_ps(rsq12);
281 rinv20 = gmx_mm_invsqrt_ps(rsq20);
282 rinv21 = gmx_mm_invsqrt_ps(rsq21);
283 rinv22 = gmx_mm_invsqrt_ps(rsq22);
285 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
286 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
287 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
288 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
289 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
290 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
291 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
292 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
293 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
295 fjx0 = _mm_setzero_ps();
296 fjy0 = _mm_setzero_ps();
297 fjz0 = _mm_setzero_ps();
298 fjx1 = _mm_setzero_ps();
299 fjy1 = _mm_setzero_ps();
300 fjz1 = _mm_setzero_ps();
301 fjx2 = _mm_setzero_ps();
302 fjy2 = _mm_setzero_ps();
303 fjz2 = _mm_setzero_ps();
305 /**************************
306 * CALCULATE INTERACTIONS *
307 **************************/
309 if (gmx_mm_any_lt(rsq00,rcutoff2))
312 r00 = _mm_mul_ps(rsq00,rinv00);
314 /* EWALD ELECTROSTATICS */
316 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
317 ewrt = _mm_mul_ps(r00,ewtabscale);
318 ewitab = _mm_cvttps_epi32(ewrt);
319 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
320 ewitab = _mm_slli_epi32(ewitab,2);
321 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
322 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
323 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
324 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
325 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
326 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
327 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
328 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
329 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
331 /* LENNARD-JONES DISPERSION/REPULSION */
333 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
334 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
335 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
336 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
337 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
339 d = _mm_sub_ps(r00,rswitch);
340 d = _mm_max_ps(d,_mm_setzero_ps());
341 d2 = _mm_mul_ps(d,d);
342 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
344 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
346 /* Evaluate switch function */
347 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
349 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
350 velec = _mm_mul_ps(velec,sw);
351 vvdw = _mm_mul_ps(vvdw,sw);
352 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
354 /* Update potential sum for this i atom from the interaction with this j atom. */
355 velec = _mm_and_ps(velec,cutoff_mask);
356 velecsum = _mm_add_ps(velecsum,velec);
357 vvdw = _mm_and_ps(vvdw,cutoff_mask);
358 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
360 fscal = _mm_add_ps(felec,fvdw);
362 fscal = _mm_and_ps(fscal,cutoff_mask);
364 /* Calculate temporary vectorial force */
365 tx = _mm_mul_ps(fscal,dx00);
366 ty = _mm_mul_ps(fscal,dy00);
367 tz = _mm_mul_ps(fscal,dz00);
369 /* Update vectorial force */
370 fix0 = _mm_add_ps(fix0,tx);
371 fiy0 = _mm_add_ps(fiy0,ty);
372 fiz0 = _mm_add_ps(fiz0,tz);
374 fjx0 = _mm_add_ps(fjx0,tx);
375 fjy0 = _mm_add_ps(fjy0,ty);
376 fjz0 = _mm_add_ps(fjz0,tz);
380 /**************************
381 * CALCULATE INTERACTIONS *
382 **************************/
384 if (gmx_mm_any_lt(rsq01,rcutoff2))
387 r01 = _mm_mul_ps(rsq01,rinv01);
389 /* EWALD ELECTROSTATICS */
391 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
392 ewrt = _mm_mul_ps(r01,ewtabscale);
393 ewitab = _mm_cvttps_epi32(ewrt);
394 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
395 ewitab = _mm_slli_epi32(ewitab,2);
396 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
397 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
398 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
399 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
400 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
401 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
402 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
403 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
404 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
406 d = _mm_sub_ps(r01,rswitch);
407 d = _mm_max_ps(d,_mm_setzero_ps());
408 d2 = _mm_mul_ps(d,d);
409 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
411 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
413 /* Evaluate switch function */
414 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
415 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
416 velec = _mm_mul_ps(velec,sw);
417 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velec = _mm_and_ps(velec,cutoff_mask);
421 velecsum = _mm_add_ps(velecsum,velec);
425 fscal = _mm_and_ps(fscal,cutoff_mask);
427 /* Calculate temporary vectorial force */
428 tx = _mm_mul_ps(fscal,dx01);
429 ty = _mm_mul_ps(fscal,dy01);
430 tz = _mm_mul_ps(fscal,dz01);
432 /* Update vectorial force */
433 fix0 = _mm_add_ps(fix0,tx);
434 fiy0 = _mm_add_ps(fiy0,ty);
435 fiz0 = _mm_add_ps(fiz0,tz);
437 fjx1 = _mm_add_ps(fjx1,tx);
438 fjy1 = _mm_add_ps(fjy1,ty);
439 fjz1 = _mm_add_ps(fjz1,tz);
443 /**************************
444 * CALCULATE INTERACTIONS *
445 **************************/
447 if (gmx_mm_any_lt(rsq02,rcutoff2))
450 r02 = _mm_mul_ps(rsq02,rinv02);
452 /* EWALD ELECTROSTATICS */
454 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
455 ewrt = _mm_mul_ps(r02,ewtabscale);
456 ewitab = _mm_cvttps_epi32(ewrt);
457 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
458 ewitab = _mm_slli_epi32(ewitab,2);
459 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
460 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
461 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
462 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
463 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
464 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
465 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
466 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
467 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
469 d = _mm_sub_ps(r02,rswitch);
470 d = _mm_max_ps(d,_mm_setzero_ps());
471 d2 = _mm_mul_ps(d,d);
472 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
474 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
476 /* Evaluate switch function */
477 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
478 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
479 velec = _mm_mul_ps(velec,sw);
480 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velec = _mm_and_ps(velec,cutoff_mask);
484 velecsum = _mm_add_ps(velecsum,velec);
488 fscal = _mm_and_ps(fscal,cutoff_mask);
490 /* Calculate temporary vectorial force */
491 tx = _mm_mul_ps(fscal,dx02);
492 ty = _mm_mul_ps(fscal,dy02);
493 tz = _mm_mul_ps(fscal,dz02);
495 /* Update vectorial force */
496 fix0 = _mm_add_ps(fix0,tx);
497 fiy0 = _mm_add_ps(fiy0,ty);
498 fiz0 = _mm_add_ps(fiz0,tz);
500 fjx2 = _mm_add_ps(fjx2,tx);
501 fjy2 = _mm_add_ps(fjy2,ty);
502 fjz2 = _mm_add_ps(fjz2,tz);
506 /**************************
507 * CALCULATE INTERACTIONS *
508 **************************/
510 if (gmx_mm_any_lt(rsq10,rcutoff2))
513 r10 = _mm_mul_ps(rsq10,rinv10);
515 /* EWALD ELECTROSTATICS */
517 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
518 ewrt = _mm_mul_ps(r10,ewtabscale);
519 ewitab = _mm_cvttps_epi32(ewrt);
520 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
521 ewitab = _mm_slli_epi32(ewitab,2);
522 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
523 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
524 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
525 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
526 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
527 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
528 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
529 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
530 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
532 d = _mm_sub_ps(r10,rswitch);
533 d = _mm_max_ps(d,_mm_setzero_ps());
534 d2 = _mm_mul_ps(d,d);
535 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
537 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
539 /* Evaluate switch function */
540 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
541 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
542 velec = _mm_mul_ps(velec,sw);
543 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
545 /* Update potential sum for this i atom from the interaction with this j atom. */
546 velec = _mm_and_ps(velec,cutoff_mask);
547 velecsum = _mm_add_ps(velecsum,velec);
551 fscal = _mm_and_ps(fscal,cutoff_mask);
553 /* Calculate temporary vectorial force */
554 tx = _mm_mul_ps(fscal,dx10);
555 ty = _mm_mul_ps(fscal,dy10);
556 tz = _mm_mul_ps(fscal,dz10);
558 /* Update vectorial force */
559 fix1 = _mm_add_ps(fix1,tx);
560 fiy1 = _mm_add_ps(fiy1,ty);
561 fiz1 = _mm_add_ps(fiz1,tz);
563 fjx0 = _mm_add_ps(fjx0,tx);
564 fjy0 = _mm_add_ps(fjy0,ty);
565 fjz0 = _mm_add_ps(fjz0,tz);
569 /**************************
570 * CALCULATE INTERACTIONS *
571 **************************/
573 if (gmx_mm_any_lt(rsq11,rcutoff2))
576 r11 = _mm_mul_ps(rsq11,rinv11);
578 /* EWALD ELECTROSTATICS */
580 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
581 ewrt = _mm_mul_ps(r11,ewtabscale);
582 ewitab = _mm_cvttps_epi32(ewrt);
583 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
584 ewitab = _mm_slli_epi32(ewitab,2);
585 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
586 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
587 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
588 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
589 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
590 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
591 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
592 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
593 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
595 d = _mm_sub_ps(r11,rswitch);
596 d = _mm_max_ps(d,_mm_setzero_ps());
597 d2 = _mm_mul_ps(d,d);
598 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
600 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
602 /* Evaluate switch function */
603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
604 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
605 velec = _mm_mul_ps(velec,sw);
606 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velec = _mm_and_ps(velec,cutoff_mask);
610 velecsum = _mm_add_ps(velecsum,velec);
614 fscal = _mm_and_ps(fscal,cutoff_mask);
616 /* Calculate temporary vectorial force */
617 tx = _mm_mul_ps(fscal,dx11);
618 ty = _mm_mul_ps(fscal,dy11);
619 tz = _mm_mul_ps(fscal,dz11);
621 /* Update vectorial force */
622 fix1 = _mm_add_ps(fix1,tx);
623 fiy1 = _mm_add_ps(fiy1,ty);
624 fiz1 = _mm_add_ps(fiz1,tz);
626 fjx1 = _mm_add_ps(fjx1,tx);
627 fjy1 = _mm_add_ps(fjy1,ty);
628 fjz1 = _mm_add_ps(fjz1,tz);
632 /**************************
633 * CALCULATE INTERACTIONS *
634 **************************/
636 if (gmx_mm_any_lt(rsq12,rcutoff2))
639 r12 = _mm_mul_ps(rsq12,rinv12);
641 /* EWALD ELECTROSTATICS */
643 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
644 ewrt = _mm_mul_ps(r12,ewtabscale);
645 ewitab = _mm_cvttps_epi32(ewrt);
646 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
647 ewitab = _mm_slli_epi32(ewitab,2);
648 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
649 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
650 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
651 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
652 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
653 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
654 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
655 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
656 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
658 d = _mm_sub_ps(r12,rswitch);
659 d = _mm_max_ps(d,_mm_setzero_ps());
660 d2 = _mm_mul_ps(d,d);
661 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
663 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
665 /* Evaluate switch function */
666 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
667 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
668 velec = _mm_mul_ps(velec,sw);
669 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
671 /* Update potential sum for this i atom from the interaction with this j atom. */
672 velec = _mm_and_ps(velec,cutoff_mask);
673 velecsum = _mm_add_ps(velecsum,velec);
677 fscal = _mm_and_ps(fscal,cutoff_mask);
679 /* Calculate temporary vectorial force */
680 tx = _mm_mul_ps(fscal,dx12);
681 ty = _mm_mul_ps(fscal,dy12);
682 tz = _mm_mul_ps(fscal,dz12);
684 /* Update vectorial force */
685 fix1 = _mm_add_ps(fix1,tx);
686 fiy1 = _mm_add_ps(fiy1,ty);
687 fiz1 = _mm_add_ps(fiz1,tz);
689 fjx2 = _mm_add_ps(fjx2,tx);
690 fjy2 = _mm_add_ps(fjy2,ty);
691 fjz2 = _mm_add_ps(fjz2,tz);
695 /**************************
696 * CALCULATE INTERACTIONS *
697 **************************/
699 if (gmx_mm_any_lt(rsq20,rcutoff2))
702 r20 = _mm_mul_ps(rsq20,rinv20);
704 /* EWALD ELECTROSTATICS */
706 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
707 ewrt = _mm_mul_ps(r20,ewtabscale);
708 ewitab = _mm_cvttps_epi32(ewrt);
709 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
710 ewitab = _mm_slli_epi32(ewitab,2);
711 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
712 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
713 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
714 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
715 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
716 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
717 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
718 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
719 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
721 d = _mm_sub_ps(r20,rswitch);
722 d = _mm_max_ps(d,_mm_setzero_ps());
723 d2 = _mm_mul_ps(d,d);
724 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
726 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
728 /* Evaluate switch function */
729 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
730 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
731 velec = _mm_mul_ps(velec,sw);
732 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
734 /* Update potential sum for this i atom from the interaction with this j atom. */
735 velec = _mm_and_ps(velec,cutoff_mask);
736 velecsum = _mm_add_ps(velecsum,velec);
740 fscal = _mm_and_ps(fscal,cutoff_mask);
742 /* Calculate temporary vectorial force */
743 tx = _mm_mul_ps(fscal,dx20);
744 ty = _mm_mul_ps(fscal,dy20);
745 tz = _mm_mul_ps(fscal,dz20);
747 /* Update vectorial force */
748 fix2 = _mm_add_ps(fix2,tx);
749 fiy2 = _mm_add_ps(fiy2,ty);
750 fiz2 = _mm_add_ps(fiz2,tz);
752 fjx0 = _mm_add_ps(fjx0,tx);
753 fjy0 = _mm_add_ps(fjy0,ty);
754 fjz0 = _mm_add_ps(fjz0,tz);
758 /**************************
759 * CALCULATE INTERACTIONS *
760 **************************/
762 if (gmx_mm_any_lt(rsq21,rcutoff2))
765 r21 = _mm_mul_ps(rsq21,rinv21);
767 /* EWALD ELECTROSTATICS */
769 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
770 ewrt = _mm_mul_ps(r21,ewtabscale);
771 ewitab = _mm_cvttps_epi32(ewrt);
772 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
773 ewitab = _mm_slli_epi32(ewitab,2);
774 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
775 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
776 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
777 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
778 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
779 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
780 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
781 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
782 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
784 d = _mm_sub_ps(r21,rswitch);
785 d = _mm_max_ps(d,_mm_setzero_ps());
786 d2 = _mm_mul_ps(d,d);
787 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
789 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
791 /* Evaluate switch function */
792 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
793 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
794 velec = _mm_mul_ps(velec,sw);
795 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
797 /* Update potential sum for this i atom from the interaction with this j atom. */
798 velec = _mm_and_ps(velec,cutoff_mask);
799 velecsum = _mm_add_ps(velecsum,velec);
803 fscal = _mm_and_ps(fscal,cutoff_mask);
805 /* Calculate temporary vectorial force */
806 tx = _mm_mul_ps(fscal,dx21);
807 ty = _mm_mul_ps(fscal,dy21);
808 tz = _mm_mul_ps(fscal,dz21);
810 /* Update vectorial force */
811 fix2 = _mm_add_ps(fix2,tx);
812 fiy2 = _mm_add_ps(fiy2,ty);
813 fiz2 = _mm_add_ps(fiz2,tz);
815 fjx1 = _mm_add_ps(fjx1,tx);
816 fjy1 = _mm_add_ps(fjy1,ty);
817 fjz1 = _mm_add_ps(fjz1,tz);
821 /**************************
822 * CALCULATE INTERACTIONS *
823 **************************/
825 if (gmx_mm_any_lt(rsq22,rcutoff2))
828 r22 = _mm_mul_ps(rsq22,rinv22);
830 /* EWALD ELECTROSTATICS */
832 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
833 ewrt = _mm_mul_ps(r22,ewtabscale);
834 ewitab = _mm_cvttps_epi32(ewrt);
835 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
836 ewitab = _mm_slli_epi32(ewitab,2);
837 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
838 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
839 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
840 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
841 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
842 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
843 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
844 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
845 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
847 d = _mm_sub_ps(r22,rswitch);
848 d = _mm_max_ps(d,_mm_setzero_ps());
849 d2 = _mm_mul_ps(d,d);
850 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
852 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
854 /* Evaluate switch function */
855 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
856 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
857 velec = _mm_mul_ps(velec,sw);
858 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
860 /* Update potential sum for this i atom from the interaction with this j atom. */
861 velec = _mm_and_ps(velec,cutoff_mask);
862 velecsum = _mm_add_ps(velecsum,velec);
866 fscal = _mm_and_ps(fscal,cutoff_mask);
868 /* Calculate temporary vectorial force */
869 tx = _mm_mul_ps(fscal,dx22);
870 ty = _mm_mul_ps(fscal,dy22);
871 tz = _mm_mul_ps(fscal,dz22);
873 /* Update vectorial force */
874 fix2 = _mm_add_ps(fix2,tx);
875 fiy2 = _mm_add_ps(fiy2,ty);
876 fiz2 = _mm_add_ps(fiz2,tz);
878 fjx2 = _mm_add_ps(fjx2,tx);
879 fjy2 = _mm_add_ps(fjy2,ty);
880 fjz2 = _mm_add_ps(fjz2,tz);
884 fjptrA = f+j_coord_offsetA;
885 fjptrB = f+j_coord_offsetB;
886 fjptrC = f+j_coord_offsetC;
887 fjptrD = f+j_coord_offsetD;
889 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
890 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
892 /* Inner loop uses 603 flops */
898 /* Get j neighbor index, and coordinate index */
899 jnrlistA = jjnr[jidx];
900 jnrlistB = jjnr[jidx+1];
901 jnrlistC = jjnr[jidx+2];
902 jnrlistD = jjnr[jidx+3];
903 /* Sign of each element will be negative for non-real atoms.
904 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
905 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
907 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
908 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
909 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
910 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
911 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
912 j_coord_offsetA = DIM*jnrA;
913 j_coord_offsetB = DIM*jnrB;
914 j_coord_offsetC = DIM*jnrC;
915 j_coord_offsetD = DIM*jnrD;
917 /* load j atom coordinates */
918 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
919 x+j_coord_offsetC,x+j_coord_offsetD,
920 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
922 /* Calculate displacement vector */
923 dx00 = _mm_sub_ps(ix0,jx0);
924 dy00 = _mm_sub_ps(iy0,jy0);
925 dz00 = _mm_sub_ps(iz0,jz0);
926 dx01 = _mm_sub_ps(ix0,jx1);
927 dy01 = _mm_sub_ps(iy0,jy1);
928 dz01 = _mm_sub_ps(iz0,jz1);
929 dx02 = _mm_sub_ps(ix0,jx2);
930 dy02 = _mm_sub_ps(iy0,jy2);
931 dz02 = _mm_sub_ps(iz0,jz2);
932 dx10 = _mm_sub_ps(ix1,jx0);
933 dy10 = _mm_sub_ps(iy1,jy0);
934 dz10 = _mm_sub_ps(iz1,jz0);
935 dx11 = _mm_sub_ps(ix1,jx1);
936 dy11 = _mm_sub_ps(iy1,jy1);
937 dz11 = _mm_sub_ps(iz1,jz1);
938 dx12 = _mm_sub_ps(ix1,jx2);
939 dy12 = _mm_sub_ps(iy1,jy2);
940 dz12 = _mm_sub_ps(iz1,jz2);
941 dx20 = _mm_sub_ps(ix2,jx0);
942 dy20 = _mm_sub_ps(iy2,jy0);
943 dz20 = _mm_sub_ps(iz2,jz0);
944 dx21 = _mm_sub_ps(ix2,jx1);
945 dy21 = _mm_sub_ps(iy2,jy1);
946 dz21 = _mm_sub_ps(iz2,jz1);
947 dx22 = _mm_sub_ps(ix2,jx2);
948 dy22 = _mm_sub_ps(iy2,jy2);
949 dz22 = _mm_sub_ps(iz2,jz2);
951 /* Calculate squared distance and things based on it */
952 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
953 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
954 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
955 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
956 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
957 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
958 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
959 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
960 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
962 rinv00 = gmx_mm_invsqrt_ps(rsq00);
963 rinv01 = gmx_mm_invsqrt_ps(rsq01);
964 rinv02 = gmx_mm_invsqrt_ps(rsq02);
965 rinv10 = gmx_mm_invsqrt_ps(rsq10);
966 rinv11 = gmx_mm_invsqrt_ps(rsq11);
967 rinv12 = gmx_mm_invsqrt_ps(rsq12);
968 rinv20 = gmx_mm_invsqrt_ps(rsq20);
969 rinv21 = gmx_mm_invsqrt_ps(rsq21);
970 rinv22 = gmx_mm_invsqrt_ps(rsq22);
972 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
973 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
974 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
975 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
976 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
977 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
978 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
979 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
980 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
982 fjx0 = _mm_setzero_ps();
983 fjy0 = _mm_setzero_ps();
984 fjz0 = _mm_setzero_ps();
985 fjx1 = _mm_setzero_ps();
986 fjy1 = _mm_setzero_ps();
987 fjz1 = _mm_setzero_ps();
988 fjx2 = _mm_setzero_ps();
989 fjy2 = _mm_setzero_ps();
990 fjz2 = _mm_setzero_ps();
992 /**************************
993 * CALCULATE INTERACTIONS *
994 **************************/
996 if (gmx_mm_any_lt(rsq00,rcutoff2))
999 r00 = _mm_mul_ps(rsq00,rinv00);
1000 r00 = _mm_andnot_ps(dummy_mask,r00);
1002 /* EWALD ELECTROSTATICS */
1004 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1005 ewrt = _mm_mul_ps(r00,ewtabscale);
1006 ewitab = _mm_cvttps_epi32(ewrt);
1007 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1008 ewitab = _mm_slli_epi32(ewitab,2);
1009 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1010 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1011 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1012 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1013 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1014 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1015 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1016 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1017 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1019 /* LENNARD-JONES DISPERSION/REPULSION */
1021 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1022 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1023 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1024 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1025 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1027 d = _mm_sub_ps(r00,rswitch);
1028 d = _mm_max_ps(d,_mm_setzero_ps());
1029 d2 = _mm_mul_ps(d,d);
1030 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1032 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1034 /* Evaluate switch function */
1035 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1036 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1037 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1038 velec = _mm_mul_ps(velec,sw);
1039 vvdw = _mm_mul_ps(vvdw,sw);
1040 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1042 /* Update potential sum for this i atom from the interaction with this j atom. */
1043 velec = _mm_and_ps(velec,cutoff_mask);
1044 velec = _mm_andnot_ps(dummy_mask,velec);
1045 velecsum = _mm_add_ps(velecsum,velec);
1046 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1047 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1048 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1050 fscal = _mm_add_ps(felec,fvdw);
1052 fscal = _mm_and_ps(fscal,cutoff_mask);
1054 fscal = _mm_andnot_ps(dummy_mask,fscal);
1056 /* Calculate temporary vectorial force */
1057 tx = _mm_mul_ps(fscal,dx00);
1058 ty = _mm_mul_ps(fscal,dy00);
1059 tz = _mm_mul_ps(fscal,dz00);
1061 /* Update vectorial force */
1062 fix0 = _mm_add_ps(fix0,tx);
1063 fiy0 = _mm_add_ps(fiy0,ty);
1064 fiz0 = _mm_add_ps(fiz0,tz);
1066 fjx0 = _mm_add_ps(fjx0,tx);
1067 fjy0 = _mm_add_ps(fjy0,ty);
1068 fjz0 = _mm_add_ps(fjz0,tz);
1072 /**************************
1073 * CALCULATE INTERACTIONS *
1074 **************************/
1076 if (gmx_mm_any_lt(rsq01,rcutoff2))
1079 r01 = _mm_mul_ps(rsq01,rinv01);
1080 r01 = _mm_andnot_ps(dummy_mask,r01);
1082 /* EWALD ELECTROSTATICS */
1084 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1085 ewrt = _mm_mul_ps(r01,ewtabscale);
1086 ewitab = _mm_cvttps_epi32(ewrt);
1087 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1088 ewitab = _mm_slli_epi32(ewitab,2);
1089 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1090 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1091 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1092 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1093 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1094 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1095 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1096 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1097 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1099 d = _mm_sub_ps(r01,rswitch);
1100 d = _mm_max_ps(d,_mm_setzero_ps());
1101 d2 = _mm_mul_ps(d,d);
1102 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1104 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1106 /* Evaluate switch function */
1107 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1108 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1109 velec = _mm_mul_ps(velec,sw);
1110 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1112 /* Update potential sum for this i atom from the interaction with this j atom. */
1113 velec = _mm_and_ps(velec,cutoff_mask);
1114 velec = _mm_andnot_ps(dummy_mask,velec);
1115 velecsum = _mm_add_ps(velecsum,velec);
1119 fscal = _mm_and_ps(fscal,cutoff_mask);
1121 fscal = _mm_andnot_ps(dummy_mask,fscal);
1123 /* Calculate temporary vectorial force */
1124 tx = _mm_mul_ps(fscal,dx01);
1125 ty = _mm_mul_ps(fscal,dy01);
1126 tz = _mm_mul_ps(fscal,dz01);
1128 /* Update vectorial force */
1129 fix0 = _mm_add_ps(fix0,tx);
1130 fiy0 = _mm_add_ps(fiy0,ty);
1131 fiz0 = _mm_add_ps(fiz0,tz);
1133 fjx1 = _mm_add_ps(fjx1,tx);
1134 fjy1 = _mm_add_ps(fjy1,ty);
1135 fjz1 = _mm_add_ps(fjz1,tz);
1139 /**************************
1140 * CALCULATE INTERACTIONS *
1141 **************************/
1143 if (gmx_mm_any_lt(rsq02,rcutoff2))
1146 r02 = _mm_mul_ps(rsq02,rinv02);
1147 r02 = _mm_andnot_ps(dummy_mask,r02);
1149 /* EWALD ELECTROSTATICS */
1151 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1152 ewrt = _mm_mul_ps(r02,ewtabscale);
1153 ewitab = _mm_cvttps_epi32(ewrt);
1154 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1155 ewitab = _mm_slli_epi32(ewitab,2);
1156 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1157 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1158 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1159 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1160 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1161 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1162 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1163 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1164 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1166 d = _mm_sub_ps(r02,rswitch);
1167 d = _mm_max_ps(d,_mm_setzero_ps());
1168 d2 = _mm_mul_ps(d,d);
1169 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1171 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1173 /* Evaluate switch function */
1174 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1175 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1176 velec = _mm_mul_ps(velec,sw);
1177 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1179 /* Update potential sum for this i atom from the interaction with this j atom. */
1180 velec = _mm_and_ps(velec,cutoff_mask);
1181 velec = _mm_andnot_ps(dummy_mask,velec);
1182 velecsum = _mm_add_ps(velecsum,velec);
1186 fscal = _mm_and_ps(fscal,cutoff_mask);
1188 fscal = _mm_andnot_ps(dummy_mask,fscal);
1190 /* Calculate temporary vectorial force */
1191 tx = _mm_mul_ps(fscal,dx02);
1192 ty = _mm_mul_ps(fscal,dy02);
1193 tz = _mm_mul_ps(fscal,dz02);
1195 /* Update vectorial force */
1196 fix0 = _mm_add_ps(fix0,tx);
1197 fiy0 = _mm_add_ps(fiy0,ty);
1198 fiz0 = _mm_add_ps(fiz0,tz);
1200 fjx2 = _mm_add_ps(fjx2,tx);
1201 fjy2 = _mm_add_ps(fjy2,ty);
1202 fjz2 = _mm_add_ps(fjz2,tz);
1206 /**************************
1207 * CALCULATE INTERACTIONS *
1208 **************************/
1210 if (gmx_mm_any_lt(rsq10,rcutoff2))
1213 r10 = _mm_mul_ps(rsq10,rinv10);
1214 r10 = _mm_andnot_ps(dummy_mask,r10);
1216 /* EWALD ELECTROSTATICS */
1218 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1219 ewrt = _mm_mul_ps(r10,ewtabscale);
1220 ewitab = _mm_cvttps_epi32(ewrt);
1221 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1222 ewitab = _mm_slli_epi32(ewitab,2);
1223 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1224 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1225 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1226 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1227 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1228 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1229 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1230 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1231 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1233 d = _mm_sub_ps(r10,rswitch);
1234 d = _mm_max_ps(d,_mm_setzero_ps());
1235 d2 = _mm_mul_ps(d,d);
1236 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1238 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1240 /* Evaluate switch function */
1241 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1242 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1243 velec = _mm_mul_ps(velec,sw);
1244 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1246 /* Update potential sum for this i atom from the interaction with this j atom. */
1247 velec = _mm_and_ps(velec,cutoff_mask);
1248 velec = _mm_andnot_ps(dummy_mask,velec);
1249 velecsum = _mm_add_ps(velecsum,velec);
1253 fscal = _mm_and_ps(fscal,cutoff_mask);
1255 fscal = _mm_andnot_ps(dummy_mask,fscal);
1257 /* Calculate temporary vectorial force */
1258 tx = _mm_mul_ps(fscal,dx10);
1259 ty = _mm_mul_ps(fscal,dy10);
1260 tz = _mm_mul_ps(fscal,dz10);
1262 /* Update vectorial force */
1263 fix1 = _mm_add_ps(fix1,tx);
1264 fiy1 = _mm_add_ps(fiy1,ty);
1265 fiz1 = _mm_add_ps(fiz1,tz);
1267 fjx0 = _mm_add_ps(fjx0,tx);
1268 fjy0 = _mm_add_ps(fjy0,ty);
1269 fjz0 = _mm_add_ps(fjz0,tz);
1273 /**************************
1274 * CALCULATE INTERACTIONS *
1275 **************************/
1277 if (gmx_mm_any_lt(rsq11,rcutoff2))
1280 r11 = _mm_mul_ps(rsq11,rinv11);
1281 r11 = _mm_andnot_ps(dummy_mask,r11);
1283 /* EWALD ELECTROSTATICS */
1285 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1286 ewrt = _mm_mul_ps(r11,ewtabscale);
1287 ewitab = _mm_cvttps_epi32(ewrt);
1288 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1289 ewitab = _mm_slli_epi32(ewitab,2);
1290 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1291 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1292 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1293 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1294 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1295 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1296 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1297 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1298 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1300 d = _mm_sub_ps(r11,rswitch);
1301 d = _mm_max_ps(d,_mm_setzero_ps());
1302 d2 = _mm_mul_ps(d,d);
1303 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1305 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1307 /* Evaluate switch function */
1308 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1309 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1310 velec = _mm_mul_ps(velec,sw);
1311 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1313 /* Update potential sum for this i atom from the interaction with this j atom. */
1314 velec = _mm_and_ps(velec,cutoff_mask);
1315 velec = _mm_andnot_ps(dummy_mask,velec);
1316 velecsum = _mm_add_ps(velecsum,velec);
1320 fscal = _mm_and_ps(fscal,cutoff_mask);
1322 fscal = _mm_andnot_ps(dummy_mask,fscal);
1324 /* Calculate temporary vectorial force */
1325 tx = _mm_mul_ps(fscal,dx11);
1326 ty = _mm_mul_ps(fscal,dy11);
1327 tz = _mm_mul_ps(fscal,dz11);
1329 /* Update vectorial force */
1330 fix1 = _mm_add_ps(fix1,tx);
1331 fiy1 = _mm_add_ps(fiy1,ty);
1332 fiz1 = _mm_add_ps(fiz1,tz);
1334 fjx1 = _mm_add_ps(fjx1,tx);
1335 fjy1 = _mm_add_ps(fjy1,ty);
1336 fjz1 = _mm_add_ps(fjz1,tz);
1340 /**************************
1341 * CALCULATE INTERACTIONS *
1342 **************************/
1344 if (gmx_mm_any_lt(rsq12,rcutoff2))
1347 r12 = _mm_mul_ps(rsq12,rinv12);
1348 r12 = _mm_andnot_ps(dummy_mask,r12);
1350 /* EWALD ELECTROSTATICS */
1352 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1353 ewrt = _mm_mul_ps(r12,ewtabscale);
1354 ewitab = _mm_cvttps_epi32(ewrt);
1355 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1356 ewitab = _mm_slli_epi32(ewitab,2);
1357 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1358 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1359 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1360 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1361 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1362 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1363 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1364 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1365 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1367 d = _mm_sub_ps(r12,rswitch);
1368 d = _mm_max_ps(d,_mm_setzero_ps());
1369 d2 = _mm_mul_ps(d,d);
1370 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1372 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1374 /* Evaluate switch function */
1375 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1376 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1377 velec = _mm_mul_ps(velec,sw);
1378 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1380 /* Update potential sum for this i atom from the interaction with this j atom. */
1381 velec = _mm_and_ps(velec,cutoff_mask);
1382 velec = _mm_andnot_ps(dummy_mask,velec);
1383 velecsum = _mm_add_ps(velecsum,velec);
1387 fscal = _mm_and_ps(fscal,cutoff_mask);
1389 fscal = _mm_andnot_ps(dummy_mask,fscal);
1391 /* Calculate temporary vectorial force */
1392 tx = _mm_mul_ps(fscal,dx12);
1393 ty = _mm_mul_ps(fscal,dy12);
1394 tz = _mm_mul_ps(fscal,dz12);
1396 /* Update vectorial force */
1397 fix1 = _mm_add_ps(fix1,tx);
1398 fiy1 = _mm_add_ps(fiy1,ty);
1399 fiz1 = _mm_add_ps(fiz1,tz);
1401 fjx2 = _mm_add_ps(fjx2,tx);
1402 fjy2 = _mm_add_ps(fjy2,ty);
1403 fjz2 = _mm_add_ps(fjz2,tz);
1407 /**************************
1408 * CALCULATE INTERACTIONS *
1409 **************************/
1411 if (gmx_mm_any_lt(rsq20,rcutoff2))
1414 r20 = _mm_mul_ps(rsq20,rinv20);
1415 r20 = _mm_andnot_ps(dummy_mask,r20);
1417 /* EWALD ELECTROSTATICS */
1419 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1420 ewrt = _mm_mul_ps(r20,ewtabscale);
1421 ewitab = _mm_cvttps_epi32(ewrt);
1422 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1423 ewitab = _mm_slli_epi32(ewitab,2);
1424 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1425 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1426 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1427 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1428 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1429 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1430 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1431 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1432 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1434 d = _mm_sub_ps(r20,rswitch);
1435 d = _mm_max_ps(d,_mm_setzero_ps());
1436 d2 = _mm_mul_ps(d,d);
1437 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1439 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1441 /* Evaluate switch function */
1442 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1443 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1444 velec = _mm_mul_ps(velec,sw);
1445 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1447 /* Update potential sum for this i atom from the interaction with this j atom. */
1448 velec = _mm_and_ps(velec,cutoff_mask);
1449 velec = _mm_andnot_ps(dummy_mask,velec);
1450 velecsum = _mm_add_ps(velecsum,velec);
1454 fscal = _mm_and_ps(fscal,cutoff_mask);
1456 fscal = _mm_andnot_ps(dummy_mask,fscal);
1458 /* Calculate temporary vectorial force */
1459 tx = _mm_mul_ps(fscal,dx20);
1460 ty = _mm_mul_ps(fscal,dy20);
1461 tz = _mm_mul_ps(fscal,dz20);
1463 /* Update vectorial force */
1464 fix2 = _mm_add_ps(fix2,tx);
1465 fiy2 = _mm_add_ps(fiy2,ty);
1466 fiz2 = _mm_add_ps(fiz2,tz);
1468 fjx0 = _mm_add_ps(fjx0,tx);
1469 fjy0 = _mm_add_ps(fjy0,ty);
1470 fjz0 = _mm_add_ps(fjz0,tz);
1474 /**************************
1475 * CALCULATE INTERACTIONS *
1476 **************************/
1478 if (gmx_mm_any_lt(rsq21,rcutoff2))
1481 r21 = _mm_mul_ps(rsq21,rinv21);
1482 r21 = _mm_andnot_ps(dummy_mask,r21);
1484 /* EWALD ELECTROSTATICS */
1486 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1487 ewrt = _mm_mul_ps(r21,ewtabscale);
1488 ewitab = _mm_cvttps_epi32(ewrt);
1489 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1490 ewitab = _mm_slli_epi32(ewitab,2);
1491 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1492 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1493 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1494 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1495 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1496 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1497 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1498 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1499 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1501 d = _mm_sub_ps(r21,rswitch);
1502 d = _mm_max_ps(d,_mm_setzero_ps());
1503 d2 = _mm_mul_ps(d,d);
1504 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1506 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1508 /* Evaluate switch function */
1509 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1510 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1511 velec = _mm_mul_ps(velec,sw);
1512 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1514 /* Update potential sum for this i atom from the interaction with this j atom. */
1515 velec = _mm_and_ps(velec,cutoff_mask);
1516 velec = _mm_andnot_ps(dummy_mask,velec);
1517 velecsum = _mm_add_ps(velecsum,velec);
1521 fscal = _mm_and_ps(fscal,cutoff_mask);
1523 fscal = _mm_andnot_ps(dummy_mask,fscal);
1525 /* Calculate temporary vectorial force */
1526 tx = _mm_mul_ps(fscal,dx21);
1527 ty = _mm_mul_ps(fscal,dy21);
1528 tz = _mm_mul_ps(fscal,dz21);
1530 /* Update vectorial force */
1531 fix2 = _mm_add_ps(fix2,tx);
1532 fiy2 = _mm_add_ps(fiy2,ty);
1533 fiz2 = _mm_add_ps(fiz2,tz);
1535 fjx1 = _mm_add_ps(fjx1,tx);
1536 fjy1 = _mm_add_ps(fjy1,ty);
1537 fjz1 = _mm_add_ps(fjz1,tz);
1541 /**************************
1542 * CALCULATE INTERACTIONS *
1543 **************************/
1545 if (gmx_mm_any_lt(rsq22,rcutoff2))
1548 r22 = _mm_mul_ps(rsq22,rinv22);
1549 r22 = _mm_andnot_ps(dummy_mask,r22);
1551 /* EWALD ELECTROSTATICS */
1553 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1554 ewrt = _mm_mul_ps(r22,ewtabscale);
1555 ewitab = _mm_cvttps_epi32(ewrt);
1556 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1557 ewitab = _mm_slli_epi32(ewitab,2);
1558 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1559 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1560 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1561 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1562 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1563 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1564 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1565 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1566 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1568 d = _mm_sub_ps(r22,rswitch);
1569 d = _mm_max_ps(d,_mm_setzero_ps());
1570 d2 = _mm_mul_ps(d,d);
1571 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1573 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1575 /* Evaluate switch function */
1576 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1577 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1578 velec = _mm_mul_ps(velec,sw);
1579 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1581 /* Update potential sum for this i atom from the interaction with this j atom. */
1582 velec = _mm_and_ps(velec,cutoff_mask);
1583 velec = _mm_andnot_ps(dummy_mask,velec);
1584 velecsum = _mm_add_ps(velecsum,velec);
1588 fscal = _mm_and_ps(fscal,cutoff_mask);
1590 fscal = _mm_andnot_ps(dummy_mask,fscal);
1592 /* Calculate temporary vectorial force */
1593 tx = _mm_mul_ps(fscal,dx22);
1594 ty = _mm_mul_ps(fscal,dy22);
1595 tz = _mm_mul_ps(fscal,dz22);
1597 /* Update vectorial force */
1598 fix2 = _mm_add_ps(fix2,tx);
1599 fiy2 = _mm_add_ps(fiy2,ty);
1600 fiz2 = _mm_add_ps(fiz2,tz);
1602 fjx2 = _mm_add_ps(fjx2,tx);
1603 fjy2 = _mm_add_ps(fjy2,ty);
1604 fjz2 = _mm_add_ps(fjz2,tz);
1608 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1609 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1610 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1611 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1613 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1614 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1616 /* Inner loop uses 612 flops */
1619 /* End of innermost loop */
1621 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1622 f+i_coord_offset,fshift+i_shift_offset);
1625 /* Update potential energies */
1626 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1627 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1629 /* Increment number of inner iterations */
1630 inneriter += j_index_end - j_index_start;
1632 /* Outer loop uses 20 flops */
1635 /* Increment number of outer iterations */
1638 /* Update outer/inner flops */
1640 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
1643 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1644 * Electrostatics interaction: Ewald
1645 * VdW interaction: LennardJones
1646 * Geometry: Water3-Water3
1647 * Calculate force/pot: Force
1650 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1651 (t_nblist * gmx_restrict nlist,
1652 rvec * gmx_restrict xx,
1653 rvec * gmx_restrict ff,
1654 t_forcerec * gmx_restrict fr,
1655 t_mdatoms * gmx_restrict mdatoms,
1656 nb_kernel_data_t * gmx_restrict kernel_data,
1657 t_nrnb * gmx_restrict nrnb)
1659 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1660 * just 0 for non-waters.
1661 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1662 * jnr indices corresponding to data put in the four positions in the SIMD register.
1664 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1665 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1666 int jnrA,jnrB,jnrC,jnrD;
1667 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1668 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1669 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1670 real rcutoff_scalar;
1671 real *shiftvec,*fshift,*x,*f;
1672 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1673 real scratch[4*DIM];
1674 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1676 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1678 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1680 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1681 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1682 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1683 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1684 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1685 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1686 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1687 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1688 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1689 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1690 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1691 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1692 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1693 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1694 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1695 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1696 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1699 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1702 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1703 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1705 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1707 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1708 real rswitch_scalar,d_scalar;
1709 __m128 dummy_mask,cutoff_mask;
1710 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1711 __m128 one = _mm_set1_ps(1.0);
1712 __m128 two = _mm_set1_ps(2.0);
1718 jindex = nlist->jindex;
1720 shiftidx = nlist->shift;
1722 shiftvec = fr->shift_vec[0];
1723 fshift = fr->fshift[0];
1724 facel = _mm_set1_ps(fr->epsfac);
1725 charge = mdatoms->chargeA;
1726 nvdwtype = fr->ntype;
1727 vdwparam = fr->nbfp;
1728 vdwtype = mdatoms->typeA;
1730 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1731 ewtab = fr->ic->tabq_coul_FDV0;
1732 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1733 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1735 /* Setup water-specific parameters */
1736 inr = nlist->iinr[0];
1737 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1738 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1739 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1740 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1742 jq0 = _mm_set1_ps(charge[inr+0]);
1743 jq1 = _mm_set1_ps(charge[inr+1]);
1744 jq2 = _mm_set1_ps(charge[inr+2]);
1745 vdwjidx0A = 2*vdwtype[inr+0];
1746 qq00 = _mm_mul_ps(iq0,jq0);
1747 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1748 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1749 qq01 = _mm_mul_ps(iq0,jq1);
1750 qq02 = _mm_mul_ps(iq0,jq2);
1751 qq10 = _mm_mul_ps(iq1,jq0);
1752 qq11 = _mm_mul_ps(iq1,jq1);
1753 qq12 = _mm_mul_ps(iq1,jq2);
1754 qq20 = _mm_mul_ps(iq2,jq0);
1755 qq21 = _mm_mul_ps(iq2,jq1);
1756 qq22 = _mm_mul_ps(iq2,jq2);
1758 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1759 rcutoff_scalar = fr->rcoulomb;
1760 rcutoff = _mm_set1_ps(rcutoff_scalar);
1761 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1763 rswitch_scalar = fr->rcoulomb_switch;
1764 rswitch = _mm_set1_ps(rswitch_scalar);
1765 /* Setup switch parameters */
1766 d_scalar = rcutoff_scalar-rswitch_scalar;
1767 d = _mm_set1_ps(d_scalar);
1768 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1769 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1770 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1771 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1772 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1773 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1775 /* Avoid stupid compiler warnings */
1776 jnrA = jnrB = jnrC = jnrD = 0;
1777 j_coord_offsetA = 0;
1778 j_coord_offsetB = 0;
1779 j_coord_offsetC = 0;
1780 j_coord_offsetD = 0;
1785 for(iidx=0;iidx<4*DIM;iidx++)
1787 scratch[iidx] = 0.0;
1790 /* Start outer loop over neighborlists */
1791 for(iidx=0; iidx<nri; iidx++)
1793 /* Load shift vector for this list */
1794 i_shift_offset = DIM*shiftidx[iidx];
1796 /* Load limits for loop over neighbors */
1797 j_index_start = jindex[iidx];
1798 j_index_end = jindex[iidx+1];
1800 /* Get outer coordinate index */
1802 i_coord_offset = DIM*inr;
1804 /* Load i particle coords and add shift vector */
1805 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1806 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1808 fix0 = _mm_setzero_ps();
1809 fiy0 = _mm_setzero_ps();
1810 fiz0 = _mm_setzero_ps();
1811 fix1 = _mm_setzero_ps();
1812 fiy1 = _mm_setzero_ps();
1813 fiz1 = _mm_setzero_ps();
1814 fix2 = _mm_setzero_ps();
1815 fiy2 = _mm_setzero_ps();
1816 fiz2 = _mm_setzero_ps();
1818 /* Start inner kernel loop */
1819 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1822 /* Get j neighbor index, and coordinate index */
1824 jnrB = jjnr[jidx+1];
1825 jnrC = jjnr[jidx+2];
1826 jnrD = jjnr[jidx+3];
1827 j_coord_offsetA = DIM*jnrA;
1828 j_coord_offsetB = DIM*jnrB;
1829 j_coord_offsetC = DIM*jnrC;
1830 j_coord_offsetD = DIM*jnrD;
1832 /* load j atom coordinates */
1833 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1834 x+j_coord_offsetC,x+j_coord_offsetD,
1835 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1837 /* Calculate displacement vector */
1838 dx00 = _mm_sub_ps(ix0,jx0);
1839 dy00 = _mm_sub_ps(iy0,jy0);
1840 dz00 = _mm_sub_ps(iz0,jz0);
1841 dx01 = _mm_sub_ps(ix0,jx1);
1842 dy01 = _mm_sub_ps(iy0,jy1);
1843 dz01 = _mm_sub_ps(iz0,jz1);
1844 dx02 = _mm_sub_ps(ix0,jx2);
1845 dy02 = _mm_sub_ps(iy0,jy2);
1846 dz02 = _mm_sub_ps(iz0,jz2);
1847 dx10 = _mm_sub_ps(ix1,jx0);
1848 dy10 = _mm_sub_ps(iy1,jy0);
1849 dz10 = _mm_sub_ps(iz1,jz0);
1850 dx11 = _mm_sub_ps(ix1,jx1);
1851 dy11 = _mm_sub_ps(iy1,jy1);
1852 dz11 = _mm_sub_ps(iz1,jz1);
1853 dx12 = _mm_sub_ps(ix1,jx2);
1854 dy12 = _mm_sub_ps(iy1,jy2);
1855 dz12 = _mm_sub_ps(iz1,jz2);
1856 dx20 = _mm_sub_ps(ix2,jx0);
1857 dy20 = _mm_sub_ps(iy2,jy0);
1858 dz20 = _mm_sub_ps(iz2,jz0);
1859 dx21 = _mm_sub_ps(ix2,jx1);
1860 dy21 = _mm_sub_ps(iy2,jy1);
1861 dz21 = _mm_sub_ps(iz2,jz1);
1862 dx22 = _mm_sub_ps(ix2,jx2);
1863 dy22 = _mm_sub_ps(iy2,jy2);
1864 dz22 = _mm_sub_ps(iz2,jz2);
1866 /* Calculate squared distance and things based on it */
1867 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1868 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1869 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1870 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1871 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1872 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1873 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1874 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1875 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1877 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1878 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1879 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1880 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1881 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1882 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1883 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1884 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1885 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1887 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1888 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1889 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1890 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1891 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1892 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1893 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1894 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1895 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1897 fjx0 = _mm_setzero_ps();
1898 fjy0 = _mm_setzero_ps();
1899 fjz0 = _mm_setzero_ps();
1900 fjx1 = _mm_setzero_ps();
1901 fjy1 = _mm_setzero_ps();
1902 fjz1 = _mm_setzero_ps();
1903 fjx2 = _mm_setzero_ps();
1904 fjy2 = _mm_setzero_ps();
1905 fjz2 = _mm_setzero_ps();
1907 /**************************
1908 * CALCULATE INTERACTIONS *
1909 **************************/
1911 if (gmx_mm_any_lt(rsq00,rcutoff2))
1914 r00 = _mm_mul_ps(rsq00,rinv00);
1916 /* EWALD ELECTROSTATICS */
1918 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1919 ewrt = _mm_mul_ps(r00,ewtabscale);
1920 ewitab = _mm_cvttps_epi32(ewrt);
1921 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1922 ewitab = _mm_slli_epi32(ewitab,2);
1923 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1924 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1925 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1926 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1927 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1928 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1929 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1930 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1931 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1933 /* LENNARD-JONES DISPERSION/REPULSION */
1935 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1936 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1937 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1938 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1939 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1941 d = _mm_sub_ps(r00,rswitch);
1942 d = _mm_max_ps(d,_mm_setzero_ps());
1943 d2 = _mm_mul_ps(d,d);
1944 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1946 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1948 /* Evaluate switch function */
1949 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1950 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1951 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1952 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1954 fscal = _mm_add_ps(felec,fvdw);
1956 fscal = _mm_and_ps(fscal,cutoff_mask);
1958 /* Calculate temporary vectorial force */
1959 tx = _mm_mul_ps(fscal,dx00);
1960 ty = _mm_mul_ps(fscal,dy00);
1961 tz = _mm_mul_ps(fscal,dz00);
1963 /* Update vectorial force */
1964 fix0 = _mm_add_ps(fix0,tx);
1965 fiy0 = _mm_add_ps(fiy0,ty);
1966 fiz0 = _mm_add_ps(fiz0,tz);
1968 fjx0 = _mm_add_ps(fjx0,tx);
1969 fjy0 = _mm_add_ps(fjy0,ty);
1970 fjz0 = _mm_add_ps(fjz0,tz);
1974 /**************************
1975 * CALCULATE INTERACTIONS *
1976 **************************/
1978 if (gmx_mm_any_lt(rsq01,rcutoff2))
1981 r01 = _mm_mul_ps(rsq01,rinv01);
1983 /* EWALD ELECTROSTATICS */
1985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1986 ewrt = _mm_mul_ps(r01,ewtabscale);
1987 ewitab = _mm_cvttps_epi32(ewrt);
1988 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1989 ewitab = _mm_slli_epi32(ewitab,2);
1990 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1991 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1992 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1993 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1994 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1995 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1996 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1997 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1998 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2000 d = _mm_sub_ps(r01,rswitch);
2001 d = _mm_max_ps(d,_mm_setzero_ps());
2002 d2 = _mm_mul_ps(d,d);
2003 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2005 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2007 /* Evaluate switch function */
2008 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2009 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2010 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2014 fscal = _mm_and_ps(fscal,cutoff_mask);
2016 /* Calculate temporary vectorial force */
2017 tx = _mm_mul_ps(fscal,dx01);
2018 ty = _mm_mul_ps(fscal,dy01);
2019 tz = _mm_mul_ps(fscal,dz01);
2021 /* Update vectorial force */
2022 fix0 = _mm_add_ps(fix0,tx);
2023 fiy0 = _mm_add_ps(fiy0,ty);
2024 fiz0 = _mm_add_ps(fiz0,tz);
2026 fjx1 = _mm_add_ps(fjx1,tx);
2027 fjy1 = _mm_add_ps(fjy1,ty);
2028 fjz1 = _mm_add_ps(fjz1,tz);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 if (gmx_mm_any_lt(rsq02,rcutoff2))
2039 r02 = _mm_mul_ps(rsq02,rinv02);
2041 /* EWALD ELECTROSTATICS */
2043 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2044 ewrt = _mm_mul_ps(r02,ewtabscale);
2045 ewitab = _mm_cvttps_epi32(ewrt);
2046 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2047 ewitab = _mm_slli_epi32(ewitab,2);
2048 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2049 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2050 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2051 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2052 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2053 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2054 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2055 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2056 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2058 d = _mm_sub_ps(r02,rswitch);
2059 d = _mm_max_ps(d,_mm_setzero_ps());
2060 d2 = _mm_mul_ps(d,d);
2061 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2063 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2065 /* Evaluate switch function */
2066 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2067 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2068 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2072 fscal = _mm_and_ps(fscal,cutoff_mask);
2074 /* Calculate temporary vectorial force */
2075 tx = _mm_mul_ps(fscal,dx02);
2076 ty = _mm_mul_ps(fscal,dy02);
2077 tz = _mm_mul_ps(fscal,dz02);
2079 /* Update vectorial force */
2080 fix0 = _mm_add_ps(fix0,tx);
2081 fiy0 = _mm_add_ps(fiy0,ty);
2082 fiz0 = _mm_add_ps(fiz0,tz);
2084 fjx2 = _mm_add_ps(fjx2,tx);
2085 fjy2 = _mm_add_ps(fjy2,ty);
2086 fjz2 = _mm_add_ps(fjz2,tz);
2090 /**************************
2091 * CALCULATE INTERACTIONS *
2092 **************************/
2094 if (gmx_mm_any_lt(rsq10,rcutoff2))
2097 r10 = _mm_mul_ps(rsq10,rinv10);
2099 /* EWALD ELECTROSTATICS */
2101 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2102 ewrt = _mm_mul_ps(r10,ewtabscale);
2103 ewitab = _mm_cvttps_epi32(ewrt);
2104 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2105 ewitab = _mm_slli_epi32(ewitab,2);
2106 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2107 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2108 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2109 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2110 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2111 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2112 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2113 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2114 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2116 d = _mm_sub_ps(r10,rswitch);
2117 d = _mm_max_ps(d,_mm_setzero_ps());
2118 d2 = _mm_mul_ps(d,d);
2119 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2121 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2123 /* Evaluate switch function */
2124 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2125 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2126 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2130 fscal = _mm_and_ps(fscal,cutoff_mask);
2132 /* Calculate temporary vectorial force */
2133 tx = _mm_mul_ps(fscal,dx10);
2134 ty = _mm_mul_ps(fscal,dy10);
2135 tz = _mm_mul_ps(fscal,dz10);
2137 /* Update vectorial force */
2138 fix1 = _mm_add_ps(fix1,tx);
2139 fiy1 = _mm_add_ps(fiy1,ty);
2140 fiz1 = _mm_add_ps(fiz1,tz);
2142 fjx0 = _mm_add_ps(fjx0,tx);
2143 fjy0 = _mm_add_ps(fjy0,ty);
2144 fjz0 = _mm_add_ps(fjz0,tz);
2148 /**************************
2149 * CALCULATE INTERACTIONS *
2150 **************************/
2152 if (gmx_mm_any_lt(rsq11,rcutoff2))
2155 r11 = _mm_mul_ps(rsq11,rinv11);
2157 /* EWALD ELECTROSTATICS */
2159 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2160 ewrt = _mm_mul_ps(r11,ewtabscale);
2161 ewitab = _mm_cvttps_epi32(ewrt);
2162 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2163 ewitab = _mm_slli_epi32(ewitab,2);
2164 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2165 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2166 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2167 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2168 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2169 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2170 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2171 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2172 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2174 d = _mm_sub_ps(r11,rswitch);
2175 d = _mm_max_ps(d,_mm_setzero_ps());
2176 d2 = _mm_mul_ps(d,d);
2177 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2179 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2181 /* Evaluate switch function */
2182 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2183 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2184 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2188 fscal = _mm_and_ps(fscal,cutoff_mask);
2190 /* Calculate temporary vectorial force */
2191 tx = _mm_mul_ps(fscal,dx11);
2192 ty = _mm_mul_ps(fscal,dy11);
2193 tz = _mm_mul_ps(fscal,dz11);
2195 /* Update vectorial force */
2196 fix1 = _mm_add_ps(fix1,tx);
2197 fiy1 = _mm_add_ps(fiy1,ty);
2198 fiz1 = _mm_add_ps(fiz1,tz);
2200 fjx1 = _mm_add_ps(fjx1,tx);
2201 fjy1 = _mm_add_ps(fjy1,ty);
2202 fjz1 = _mm_add_ps(fjz1,tz);
2206 /**************************
2207 * CALCULATE INTERACTIONS *
2208 **************************/
2210 if (gmx_mm_any_lt(rsq12,rcutoff2))
2213 r12 = _mm_mul_ps(rsq12,rinv12);
2215 /* EWALD ELECTROSTATICS */
2217 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2218 ewrt = _mm_mul_ps(r12,ewtabscale);
2219 ewitab = _mm_cvttps_epi32(ewrt);
2220 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2221 ewitab = _mm_slli_epi32(ewitab,2);
2222 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2223 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2224 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2225 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2226 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2227 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2228 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2229 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2230 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2232 d = _mm_sub_ps(r12,rswitch);
2233 d = _mm_max_ps(d,_mm_setzero_ps());
2234 d2 = _mm_mul_ps(d,d);
2235 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2237 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2239 /* Evaluate switch function */
2240 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2241 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2242 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2246 fscal = _mm_and_ps(fscal,cutoff_mask);
2248 /* Calculate temporary vectorial force */
2249 tx = _mm_mul_ps(fscal,dx12);
2250 ty = _mm_mul_ps(fscal,dy12);
2251 tz = _mm_mul_ps(fscal,dz12);
2253 /* Update vectorial force */
2254 fix1 = _mm_add_ps(fix1,tx);
2255 fiy1 = _mm_add_ps(fiy1,ty);
2256 fiz1 = _mm_add_ps(fiz1,tz);
2258 fjx2 = _mm_add_ps(fjx2,tx);
2259 fjy2 = _mm_add_ps(fjy2,ty);
2260 fjz2 = _mm_add_ps(fjz2,tz);
2264 /**************************
2265 * CALCULATE INTERACTIONS *
2266 **************************/
2268 if (gmx_mm_any_lt(rsq20,rcutoff2))
2271 r20 = _mm_mul_ps(rsq20,rinv20);
2273 /* EWALD ELECTROSTATICS */
2275 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2276 ewrt = _mm_mul_ps(r20,ewtabscale);
2277 ewitab = _mm_cvttps_epi32(ewrt);
2278 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2279 ewitab = _mm_slli_epi32(ewitab,2);
2280 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2281 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2282 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2283 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2284 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2285 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2286 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2287 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2288 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2290 d = _mm_sub_ps(r20,rswitch);
2291 d = _mm_max_ps(d,_mm_setzero_ps());
2292 d2 = _mm_mul_ps(d,d);
2293 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2295 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2297 /* Evaluate switch function */
2298 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2299 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2300 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2304 fscal = _mm_and_ps(fscal,cutoff_mask);
2306 /* Calculate temporary vectorial force */
2307 tx = _mm_mul_ps(fscal,dx20);
2308 ty = _mm_mul_ps(fscal,dy20);
2309 tz = _mm_mul_ps(fscal,dz20);
2311 /* Update vectorial force */
2312 fix2 = _mm_add_ps(fix2,tx);
2313 fiy2 = _mm_add_ps(fiy2,ty);
2314 fiz2 = _mm_add_ps(fiz2,tz);
2316 fjx0 = _mm_add_ps(fjx0,tx);
2317 fjy0 = _mm_add_ps(fjy0,ty);
2318 fjz0 = _mm_add_ps(fjz0,tz);
2322 /**************************
2323 * CALCULATE INTERACTIONS *
2324 **************************/
2326 if (gmx_mm_any_lt(rsq21,rcutoff2))
2329 r21 = _mm_mul_ps(rsq21,rinv21);
2331 /* EWALD ELECTROSTATICS */
2333 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2334 ewrt = _mm_mul_ps(r21,ewtabscale);
2335 ewitab = _mm_cvttps_epi32(ewrt);
2336 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2337 ewitab = _mm_slli_epi32(ewitab,2);
2338 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2339 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2340 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2341 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2342 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2343 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2344 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2345 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2346 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2348 d = _mm_sub_ps(r21,rswitch);
2349 d = _mm_max_ps(d,_mm_setzero_ps());
2350 d2 = _mm_mul_ps(d,d);
2351 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2353 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2355 /* Evaluate switch function */
2356 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2357 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2358 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2362 fscal = _mm_and_ps(fscal,cutoff_mask);
2364 /* Calculate temporary vectorial force */
2365 tx = _mm_mul_ps(fscal,dx21);
2366 ty = _mm_mul_ps(fscal,dy21);
2367 tz = _mm_mul_ps(fscal,dz21);
2369 /* Update vectorial force */
2370 fix2 = _mm_add_ps(fix2,tx);
2371 fiy2 = _mm_add_ps(fiy2,ty);
2372 fiz2 = _mm_add_ps(fiz2,tz);
2374 fjx1 = _mm_add_ps(fjx1,tx);
2375 fjy1 = _mm_add_ps(fjy1,ty);
2376 fjz1 = _mm_add_ps(fjz1,tz);
2380 /**************************
2381 * CALCULATE INTERACTIONS *
2382 **************************/
2384 if (gmx_mm_any_lt(rsq22,rcutoff2))
2387 r22 = _mm_mul_ps(rsq22,rinv22);
2389 /* EWALD ELECTROSTATICS */
2391 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2392 ewrt = _mm_mul_ps(r22,ewtabscale);
2393 ewitab = _mm_cvttps_epi32(ewrt);
2394 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2395 ewitab = _mm_slli_epi32(ewitab,2);
2396 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2397 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2398 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2399 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2400 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2401 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2402 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2403 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2404 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2406 d = _mm_sub_ps(r22,rswitch);
2407 d = _mm_max_ps(d,_mm_setzero_ps());
2408 d2 = _mm_mul_ps(d,d);
2409 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2411 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2413 /* Evaluate switch function */
2414 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2415 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2416 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2420 fscal = _mm_and_ps(fscal,cutoff_mask);
2422 /* Calculate temporary vectorial force */
2423 tx = _mm_mul_ps(fscal,dx22);
2424 ty = _mm_mul_ps(fscal,dy22);
2425 tz = _mm_mul_ps(fscal,dz22);
2427 /* Update vectorial force */
2428 fix2 = _mm_add_ps(fix2,tx);
2429 fiy2 = _mm_add_ps(fiy2,ty);
2430 fiz2 = _mm_add_ps(fiz2,tz);
2432 fjx2 = _mm_add_ps(fjx2,tx);
2433 fjy2 = _mm_add_ps(fjy2,ty);
2434 fjz2 = _mm_add_ps(fjz2,tz);
2438 fjptrA = f+j_coord_offsetA;
2439 fjptrB = f+j_coord_offsetB;
2440 fjptrC = f+j_coord_offsetC;
2441 fjptrD = f+j_coord_offsetD;
2443 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2444 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2446 /* Inner loop uses 573 flops */
2449 if(jidx<j_index_end)
2452 /* Get j neighbor index, and coordinate index */
2453 jnrlistA = jjnr[jidx];
2454 jnrlistB = jjnr[jidx+1];
2455 jnrlistC = jjnr[jidx+2];
2456 jnrlistD = jjnr[jidx+3];
2457 /* Sign of each element will be negative for non-real atoms.
2458 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2459 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2461 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2462 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2463 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2464 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2465 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2466 j_coord_offsetA = DIM*jnrA;
2467 j_coord_offsetB = DIM*jnrB;
2468 j_coord_offsetC = DIM*jnrC;
2469 j_coord_offsetD = DIM*jnrD;
2471 /* load j atom coordinates */
2472 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2473 x+j_coord_offsetC,x+j_coord_offsetD,
2474 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2476 /* Calculate displacement vector */
2477 dx00 = _mm_sub_ps(ix0,jx0);
2478 dy00 = _mm_sub_ps(iy0,jy0);
2479 dz00 = _mm_sub_ps(iz0,jz0);
2480 dx01 = _mm_sub_ps(ix0,jx1);
2481 dy01 = _mm_sub_ps(iy0,jy1);
2482 dz01 = _mm_sub_ps(iz0,jz1);
2483 dx02 = _mm_sub_ps(ix0,jx2);
2484 dy02 = _mm_sub_ps(iy0,jy2);
2485 dz02 = _mm_sub_ps(iz0,jz2);
2486 dx10 = _mm_sub_ps(ix1,jx0);
2487 dy10 = _mm_sub_ps(iy1,jy0);
2488 dz10 = _mm_sub_ps(iz1,jz0);
2489 dx11 = _mm_sub_ps(ix1,jx1);
2490 dy11 = _mm_sub_ps(iy1,jy1);
2491 dz11 = _mm_sub_ps(iz1,jz1);
2492 dx12 = _mm_sub_ps(ix1,jx2);
2493 dy12 = _mm_sub_ps(iy1,jy2);
2494 dz12 = _mm_sub_ps(iz1,jz2);
2495 dx20 = _mm_sub_ps(ix2,jx0);
2496 dy20 = _mm_sub_ps(iy2,jy0);
2497 dz20 = _mm_sub_ps(iz2,jz0);
2498 dx21 = _mm_sub_ps(ix2,jx1);
2499 dy21 = _mm_sub_ps(iy2,jy1);
2500 dz21 = _mm_sub_ps(iz2,jz1);
2501 dx22 = _mm_sub_ps(ix2,jx2);
2502 dy22 = _mm_sub_ps(iy2,jy2);
2503 dz22 = _mm_sub_ps(iz2,jz2);
2505 /* Calculate squared distance and things based on it */
2506 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2507 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2508 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2509 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2510 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2511 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2512 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2513 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2514 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2516 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2517 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2518 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2519 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2520 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2521 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2522 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2523 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2524 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2526 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2527 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2528 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2529 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2530 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2531 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2532 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2533 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2534 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2536 fjx0 = _mm_setzero_ps();
2537 fjy0 = _mm_setzero_ps();
2538 fjz0 = _mm_setzero_ps();
2539 fjx1 = _mm_setzero_ps();
2540 fjy1 = _mm_setzero_ps();
2541 fjz1 = _mm_setzero_ps();
2542 fjx2 = _mm_setzero_ps();
2543 fjy2 = _mm_setzero_ps();
2544 fjz2 = _mm_setzero_ps();
2546 /**************************
2547 * CALCULATE INTERACTIONS *
2548 **************************/
2550 if (gmx_mm_any_lt(rsq00,rcutoff2))
2553 r00 = _mm_mul_ps(rsq00,rinv00);
2554 r00 = _mm_andnot_ps(dummy_mask,r00);
2556 /* EWALD ELECTROSTATICS */
2558 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2559 ewrt = _mm_mul_ps(r00,ewtabscale);
2560 ewitab = _mm_cvttps_epi32(ewrt);
2561 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2562 ewitab = _mm_slli_epi32(ewitab,2);
2563 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2564 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2565 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2566 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2567 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2568 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2569 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2570 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2571 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2573 /* LENNARD-JONES DISPERSION/REPULSION */
2575 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2576 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2577 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2578 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2579 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2581 d = _mm_sub_ps(r00,rswitch);
2582 d = _mm_max_ps(d,_mm_setzero_ps());
2583 d2 = _mm_mul_ps(d,d);
2584 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2586 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2588 /* Evaluate switch function */
2589 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2590 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2591 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2592 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2594 fscal = _mm_add_ps(felec,fvdw);
2596 fscal = _mm_and_ps(fscal,cutoff_mask);
2598 fscal = _mm_andnot_ps(dummy_mask,fscal);
2600 /* Calculate temporary vectorial force */
2601 tx = _mm_mul_ps(fscal,dx00);
2602 ty = _mm_mul_ps(fscal,dy00);
2603 tz = _mm_mul_ps(fscal,dz00);
2605 /* Update vectorial force */
2606 fix0 = _mm_add_ps(fix0,tx);
2607 fiy0 = _mm_add_ps(fiy0,ty);
2608 fiz0 = _mm_add_ps(fiz0,tz);
2610 fjx0 = _mm_add_ps(fjx0,tx);
2611 fjy0 = _mm_add_ps(fjy0,ty);
2612 fjz0 = _mm_add_ps(fjz0,tz);
2616 /**************************
2617 * CALCULATE INTERACTIONS *
2618 **************************/
2620 if (gmx_mm_any_lt(rsq01,rcutoff2))
2623 r01 = _mm_mul_ps(rsq01,rinv01);
2624 r01 = _mm_andnot_ps(dummy_mask,r01);
2626 /* EWALD ELECTROSTATICS */
2628 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2629 ewrt = _mm_mul_ps(r01,ewtabscale);
2630 ewitab = _mm_cvttps_epi32(ewrt);
2631 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2632 ewitab = _mm_slli_epi32(ewitab,2);
2633 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2634 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2635 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2636 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2637 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2638 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2639 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2640 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2641 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2643 d = _mm_sub_ps(r01,rswitch);
2644 d = _mm_max_ps(d,_mm_setzero_ps());
2645 d2 = _mm_mul_ps(d,d);
2646 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2648 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2650 /* Evaluate switch function */
2651 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2652 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2653 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2657 fscal = _mm_and_ps(fscal,cutoff_mask);
2659 fscal = _mm_andnot_ps(dummy_mask,fscal);
2661 /* Calculate temporary vectorial force */
2662 tx = _mm_mul_ps(fscal,dx01);
2663 ty = _mm_mul_ps(fscal,dy01);
2664 tz = _mm_mul_ps(fscal,dz01);
2666 /* Update vectorial force */
2667 fix0 = _mm_add_ps(fix0,tx);
2668 fiy0 = _mm_add_ps(fiy0,ty);
2669 fiz0 = _mm_add_ps(fiz0,tz);
2671 fjx1 = _mm_add_ps(fjx1,tx);
2672 fjy1 = _mm_add_ps(fjy1,ty);
2673 fjz1 = _mm_add_ps(fjz1,tz);
2677 /**************************
2678 * CALCULATE INTERACTIONS *
2679 **************************/
2681 if (gmx_mm_any_lt(rsq02,rcutoff2))
2684 r02 = _mm_mul_ps(rsq02,rinv02);
2685 r02 = _mm_andnot_ps(dummy_mask,r02);
2687 /* EWALD ELECTROSTATICS */
2689 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2690 ewrt = _mm_mul_ps(r02,ewtabscale);
2691 ewitab = _mm_cvttps_epi32(ewrt);
2692 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2693 ewitab = _mm_slli_epi32(ewitab,2);
2694 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2695 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2696 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2697 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2698 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2699 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2700 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2701 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2702 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2704 d = _mm_sub_ps(r02,rswitch);
2705 d = _mm_max_ps(d,_mm_setzero_ps());
2706 d2 = _mm_mul_ps(d,d);
2707 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2709 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2711 /* Evaluate switch function */
2712 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2713 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2714 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2718 fscal = _mm_and_ps(fscal,cutoff_mask);
2720 fscal = _mm_andnot_ps(dummy_mask,fscal);
2722 /* Calculate temporary vectorial force */
2723 tx = _mm_mul_ps(fscal,dx02);
2724 ty = _mm_mul_ps(fscal,dy02);
2725 tz = _mm_mul_ps(fscal,dz02);
2727 /* Update vectorial force */
2728 fix0 = _mm_add_ps(fix0,tx);
2729 fiy0 = _mm_add_ps(fiy0,ty);
2730 fiz0 = _mm_add_ps(fiz0,tz);
2732 fjx2 = _mm_add_ps(fjx2,tx);
2733 fjy2 = _mm_add_ps(fjy2,ty);
2734 fjz2 = _mm_add_ps(fjz2,tz);
2738 /**************************
2739 * CALCULATE INTERACTIONS *
2740 **************************/
2742 if (gmx_mm_any_lt(rsq10,rcutoff2))
2745 r10 = _mm_mul_ps(rsq10,rinv10);
2746 r10 = _mm_andnot_ps(dummy_mask,r10);
2748 /* EWALD ELECTROSTATICS */
2750 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2751 ewrt = _mm_mul_ps(r10,ewtabscale);
2752 ewitab = _mm_cvttps_epi32(ewrt);
2753 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2754 ewitab = _mm_slli_epi32(ewitab,2);
2755 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2756 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2757 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2758 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2759 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2760 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2761 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2762 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2763 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2765 d = _mm_sub_ps(r10,rswitch);
2766 d = _mm_max_ps(d,_mm_setzero_ps());
2767 d2 = _mm_mul_ps(d,d);
2768 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2770 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2772 /* Evaluate switch function */
2773 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2774 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2775 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2779 fscal = _mm_and_ps(fscal,cutoff_mask);
2781 fscal = _mm_andnot_ps(dummy_mask,fscal);
2783 /* Calculate temporary vectorial force */
2784 tx = _mm_mul_ps(fscal,dx10);
2785 ty = _mm_mul_ps(fscal,dy10);
2786 tz = _mm_mul_ps(fscal,dz10);
2788 /* Update vectorial force */
2789 fix1 = _mm_add_ps(fix1,tx);
2790 fiy1 = _mm_add_ps(fiy1,ty);
2791 fiz1 = _mm_add_ps(fiz1,tz);
2793 fjx0 = _mm_add_ps(fjx0,tx);
2794 fjy0 = _mm_add_ps(fjy0,ty);
2795 fjz0 = _mm_add_ps(fjz0,tz);
2799 /**************************
2800 * CALCULATE INTERACTIONS *
2801 **************************/
2803 if (gmx_mm_any_lt(rsq11,rcutoff2))
2806 r11 = _mm_mul_ps(rsq11,rinv11);
2807 r11 = _mm_andnot_ps(dummy_mask,r11);
2809 /* EWALD ELECTROSTATICS */
2811 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2812 ewrt = _mm_mul_ps(r11,ewtabscale);
2813 ewitab = _mm_cvttps_epi32(ewrt);
2814 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2815 ewitab = _mm_slli_epi32(ewitab,2);
2816 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2817 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2818 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2819 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2820 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2821 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2822 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2823 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2824 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2826 d = _mm_sub_ps(r11,rswitch);
2827 d = _mm_max_ps(d,_mm_setzero_ps());
2828 d2 = _mm_mul_ps(d,d);
2829 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2831 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2833 /* Evaluate switch function */
2834 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2835 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2836 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2840 fscal = _mm_and_ps(fscal,cutoff_mask);
2842 fscal = _mm_andnot_ps(dummy_mask,fscal);
2844 /* Calculate temporary vectorial force */
2845 tx = _mm_mul_ps(fscal,dx11);
2846 ty = _mm_mul_ps(fscal,dy11);
2847 tz = _mm_mul_ps(fscal,dz11);
2849 /* Update vectorial force */
2850 fix1 = _mm_add_ps(fix1,tx);
2851 fiy1 = _mm_add_ps(fiy1,ty);
2852 fiz1 = _mm_add_ps(fiz1,tz);
2854 fjx1 = _mm_add_ps(fjx1,tx);
2855 fjy1 = _mm_add_ps(fjy1,ty);
2856 fjz1 = _mm_add_ps(fjz1,tz);
2860 /**************************
2861 * CALCULATE INTERACTIONS *
2862 **************************/
2864 if (gmx_mm_any_lt(rsq12,rcutoff2))
2867 r12 = _mm_mul_ps(rsq12,rinv12);
2868 r12 = _mm_andnot_ps(dummy_mask,r12);
2870 /* EWALD ELECTROSTATICS */
2872 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2873 ewrt = _mm_mul_ps(r12,ewtabscale);
2874 ewitab = _mm_cvttps_epi32(ewrt);
2875 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2876 ewitab = _mm_slli_epi32(ewitab,2);
2877 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2878 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2879 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2880 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2881 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2882 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2883 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2884 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2885 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2887 d = _mm_sub_ps(r12,rswitch);
2888 d = _mm_max_ps(d,_mm_setzero_ps());
2889 d2 = _mm_mul_ps(d,d);
2890 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2892 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2894 /* Evaluate switch function */
2895 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2896 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2897 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2901 fscal = _mm_and_ps(fscal,cutoff_mask);
2903 fscal = _mm_andnot_ps(dummy_mask,fscal);
2905 /* Calculate temporary vectorial force */
2906 tx = _mm_mul_ps(fscal,dx12);
2907 ty = _mm_mul_ps(fscal,dy12);
2908 tz = _mm_mul_ps(fscal,dz12);
2910 /* Update vectorial force */
2911 fix1 = _mm_add_ps(fix1,tx);
2912 fiy1 = _mm_add_ps(fiy1,ty);
2913 fiz1 = _mm_add_ps(fiz1,tz);
2915 fjx2 = _mm_add_ps(fjx2,tx);
2916 fjy2 = _mm_add_ps(fjy2,ty);
2917 fjz2 = _mm_add_ps(fjz2,tz);
2921 /**************************
2922 * CALCULATE INTERACTIONS *
2923 **************************/
2925 if (gmx_mm_any_lt(rsq20,rcutoff2))
2928 r20 = _mm_mul_ps(rsq20,rinv20);
2929 r20 = _mm_andnot_ps(dummy_mask,r20);
2931 /* EWALD ELECTROSTATICS */
2933 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2934 ewrt = _mm_mul_ps(r20,ewtabscale);
2935 ewitab = _mm_cvttps_epi32(ewrt);
2936 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2937 ewitab = _mm_slli_epi32(ewitab,2);
2938 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2939 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2940 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2941 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2942 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2943 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2944 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2945 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2946 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2948 d = _mm_sub_ps(r20,rswitch);
2949 d = _mm_max_ps(d,_mm_setzero_ps());
2950 d2 = _mm_mul_ps(d,d);
2951 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2953 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2955 /* Evaluate switch function */
2956 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2957 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2958 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2962 fscal = _mm_and_ps(fscal,cutoff_mask);
2964 fscal = _mm_andnot_ps(dummy_mask,fscal);
2966 /* Calculate temporary vectorial force */
2967 tx = _mm_mul_ps(fscal,dx20);
2968 ty = _mm_mul_ps(fscal,dy20);
2969 tz = _mm_mul_ps(fscal,dz20);
2971 /* Update vectorial force */
2972 fix2 = _mm_add_ps(fix2,tx);
2973 fiy2 = _mm_add_ps(fiy2,ty);
2974 fiz2 = _mm_add_ps(fiz2,tz);
2976 fjx0 = _mm_add_ps(fjx0,tx);
2977 fjy0 = _mm_add_ps(fjy0,ty);
2978 fjz0 = _mm_add_ps(fjz0,tz);
2982 /**************************
2983 * CALCULATE INTERACTIONS *
2984 **************************/
2986 if (gmx_mm_any_lt(rsq21,rcutoff2))
2989 r21 = _mm_mul_ps(rsq21,rinv21);
2990 r21 = _mm_andnot_ps(dummy_mask,r21);
2992 /* EWALD ELECTROSTATICS */
2994 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2995 ewrt = _mm_mul_ps(r21,ewtabscale);
2996 ewitab = _mm_cvttps_epi32(ewrt);
2997 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2998 ewitab = _mm_slli_epi32(ewitab,2);
2999 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3000 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3001 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3002 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3003 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3004 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3005 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3006 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3007 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3009 d = _mm_sub_ps(r21,rswitch);
3010 d = _mm_max_ps(d,_mm_setzero_ps());
3011 d2 = _mm_mul_ps(d,d);
3012 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3014 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3016 /* Evaluate switch function */
3017 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3018 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3019 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3023 fscal = _mm_and_ps(fscal,cutoff_mask);
3025 fscal = _mm_andnot_ps(dummy_mask,fscal);
3027 /* Calculate temporary vectorial force */
3028 tx = _mm_mul_ps(fscal,dx21);
3029 ty = _mm_mul_ps(fscal,dy21);
3030 tz = _mm_mul_ps(fscal,dz21);
3032 /* Update vectorial force */
3033 fix2 = _mm_add_ps(fix2,tx);
3034 fiy2 = _mm_add_ps(fiy2,ty);
3035 fiz2 = _mm_add_ps(fiz2,tz);
3037 fjx1 = _mm_add_ps(fjx1,tx);
3038 fjy1 = _mm_add_ps(fjy1,ty);
3039 fjz1 = _mm_add_ps(fjz1,tz);
3043 /**************************
3044 * CALCULATE INTERACTIONS *
3045 **************************/
3047 if (gmx_mm_any_lt(rsq22,rcutoff2))
3050 r22 = _mm_mul_ps(rsq22,rinv22);
3051 r22 = _mm_andnot_ps(dummy_mask,r22);
3053 /* EWALD ELECTROSTATICS */
3055 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3056 ewrt = _mm_mul_ps(r22,ewtabscale);
3057 ewitab = _mm_cvttps_epi32(ewrt);
3058 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3059 ewitab = _mm_slli_epi32(ewitab,2);
3060 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3061 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3062 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3063 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3064 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3065 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3066 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3067 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3068 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3070 d = _mm_sub_ps(r22,rswitch);
3071 d = _mm_max_ps(d,_mm_setzero_ps());
3072 d2 = _mm_mul_ps(d,d);
3073 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3075 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3077 /* Evaluate switch function */
3078 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3079 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3080 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3084 fscal = _mm_and_ps(fscal,cutoff_mask);
3086 fscal = _mm_andnot_ps(dummy_mask,fscal);
3088 /* Calculate temporary vectorial force */
3089 tx = _mm_mul_ps(fscal,dx22);
3090 ty = _mm_mul_ps(fscal,dy22);
3091 tz = _mm_mul_ps(fscal,dz22);
3093 /* Update vectorial force */
3094 fix2 = _mm_add_ps(fix2,tx);
3095 fiy2 = _mm_add_ps(fiy2,ty);
3096 fiz2 = _mm_add_ps(fiz2,tz);
3098 fjx2 = _mm_add_ps(fjx2,tx);
3099 fjy2 = _mm_add_ps(fjy2,ty);
3100 fjz2 = _mm_add_ps(fjz2,tz);
3104 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3105 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3106 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3107 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3109 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3110 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3112 /* Inner loop uses 582 flops */
3115 /* End of innermost loop */
3117 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3118 f+i_coord_offset,fshift+i_shift_offset);
3120 /* Increment number of inner iterations */
3121 inneriter += j_index_end - j_index_start;
3123 /* Outer loop uses 18 flops */
3126 /* Increment number of outer iterations */
3129 /* Update outer/inner flops */
3131 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);