2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
94 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
96 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
99 real rswitch_scalar,d_scalar;
100 __m128 dummy_mask,cutoff_mask;
101 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
102 __m128 one = _mm_set1_ps(1.0);
103 __m128 two = _mm_set1_ps(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm_set1_ps(fr->epsfac);
116 charge = mdatoms->chargeA;
117 nvdwtype = fr->ntype;
119 vdwtype = mdatoms->typeA;
121 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
122 ewtab = fr->ic->tabq_coul_FDV0;
123 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
124 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
129 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
130 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
133 jq0 = _mm_set1_ps(charge[inr+0]);
134 jq1 = _mm_set1_ps(charge[inr+1]);
135 jq2 = _mm_set1_ps(charge[inr+2]);
136 vdwjidx0A = 2*vdwtype[inr+0];
137 qq00 = _mm_mul_ps(iq0,jq0);
138 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
139 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
140 qq01 = _mm_mul_ps(iq0,jq1);
141 qq02 = _mm_mul_ps(iq0,jq2);
142 qq10 = _mm_mul_ps(iq1,jq0);
143 qq11 = _mm_mul_ps(iq1,jq1);
144 qq12 = _mm_mul_ps(iq1,jq2);
145 qq20 = _mm_mul_ps(iq2,jq0);
146 qq21 = _mm_mul_ps(iq2,jq1);
147 qq22 = _mm_mul_ps(iq2,jq2);
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar = fr->rcoulomb;
151 rcutoff = _mm_set1_ps(rcutoff_scalar);
152 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
154 rswitch_scalar = fr->rcoulomb_switch;
155 rswitch = _mm_set1_ps(rswitch_scalar);
156 /* Setup switch parameters */
157 d_scalar = rcutoff_scalar-rswitch_scalar;
158 d = _mm_set1_ps(d_scalar);
159 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
160 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
162 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
163 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
166 /* Avoid stupid compiler warnings */
167 jnrA = jnrB = jnrC = jnrD = 0;
176 /* Start outer loop over neighborlists */
177 for(iidx=0; iidx<nri; iidx++)
179 /* Load shift vector for this list */
180 i_shift_offset = DIM*shiftidx[iidx];
181 shX = shiftvec[i_shift_offset+XX];
182 shY = shiftvec[i_shift_offset+YY];
183 shZ = shiftvec[i_shift_offset+ZZ];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
195 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
196 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
197 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
198 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
199 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
200 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
201 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
202 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
204 fix0 = _mm_setzero_ps();
205 fiy0 = _mm_setzero_ps();
206 fiz0 = _mm_setzero_ps();
207 fix1 = _mm_setzero_ps();
208 fiy1 = _mm_setzero_ps();
209 fiz1 = _mm_setzero_ps();
210 fix2 = _mm_setzero_ps();
211 fiy2 = _mm_setzero_ps();
212 fiz2 = _mm_setzero_ps();
214 /* Reset potential sums */
215 velecsum = _mm_setzero_ps();
216 vvdwsum = _mm_setzero_ps();
218 /* Start inner kernel loop */
219 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
222 /* Get j neighbor index, and coordinate index */
228 j_coord_offsetA = DIM*jnrA;
229 j_coord_offsetB = DIM*jnrB;
230 j_coord_offsetC = DIM*jnrC;
231 j_coord_offsetD = DIM*jnrD;
233 /* load j atom coordinates */
234 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
235 x+j_coord_offsetC,x+j_coord_offsetD,
236 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
238 /* Calculate displacement vector */
239 dx00 = _mm_sub_ps(ix0,jx0);
240 dy00 = _mm_sub_ps(iy0,jy0);
241 dz00 = _mm_sub_ps(iz0,jz0);
242 dx01 = _mm_sub_ps(ix0,jx1);
243 dy01 = _mm_sub_ps(iy0,jy1);
244 dz01 = _mm_sub_ps(iz0,jz1);
245 dx02 = _mm_sub_ps(ix0,jx2);
246 dy02 = _mm_sub_ps(iy0,jy2);
247 dz02 = _mm_sub_ps(iz0,jz2);
248 dx10 = _mm_sub_ps(ix1,jx0);
249 dy10 = _mm_sub_ps(iy1,jy0);
250 dz10 = _mm_sub_ps(iz1,jz0);
251 dx11 = _mm_sub_ps(ix1,jx1);
252 dy11 = _mm_sub_ps(iy1,jy1);
253 dz11 = _mm_sub_ps(iz1,jz1);
254 dx12 = _mm_sub_ps(ix1,jx2);
255 dy12 = _mm_sub_ps(iy1,jy2);
256 dz12 = _mm_sub_ps(iz1,jz2);
257 dx20 = _mm_sub_ps(ix2,jx0);
258 dy20 = _mm_sub_ps(iy2,jy0);
259 dz20 = _mm_sub_ps(iz2,jz0);
260 dx21 = _mm_sub_ps(ix2,jx1);
261 dy21 = _mm_sub_ps(iy2,jy1);
262 dz21 = _mm_sub_ps(iz2,jz1);
263 dx22 = _mm_sub_ps(ix2,jx2);
264 dy22 = _mm_sub_ps(iy2,jy2);
265 dz22 = _mm_sub_ps(iz2,jz2);
267 /* Calculate squared distance and things based on it */
268 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
269 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
270 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
271 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
272 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
273 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
274 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
275 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
276 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
278 rinv00 = gmx_mm_invsqrt_ps(rsq00);
279 rinv01 = gmx_mm_invsqrt_ps(rsq01);
280 rinv02 = gmx_mm_invsqrt_ps(rsq02);
281 rinv10 = gmx_mm_invsqrt_ps(rsq10);
282 rinv11 = gmx_mm_invsqrt_ps(rsq11);
283 rinv12 = gmx_mm_invsqrt_ps(rsq12);
284 rinv20 = gmx_mm_invsqrt_ps(rsq20);
285 rinv21 = gmx_mm_invsqrt_ps(rsq21);
286 rinv22 = gmx_mm_invsqrt_ps(rsq22);
288 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
289 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
290 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
291 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
292 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
293 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
294 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
295 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
296 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
298 fjx0 = _mm_setzero_ps();
299 fjy0 = _mm_setzero_ps();
300 fjz0 = _mm_setzero_ps();
301 fjx1 = _mm_setzero_ps();
302 fjy1 = _mm_setzero_ps();
303 fjz1 = _mm_setzero_ps();
304 fjx2 = _mm_setzero_ps();
305 fjy2 = _mm_setzero_ps();
306 fjz2 = _mm_setzero_ps();
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_mm_any_lt(rsq00,rcutoff2))
315 r00 = _mm_mul_ps(rsq00,rinv00);
317 /* EWALD ELECTROSTATICS */
319 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
320 ewrt = _mm_mul_ps(r00,ewtabscale);
321 ewitab = _mm_cvttps_epi32(ewrt);
322 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
323 ewitab = _mm_slli_epi32(ewitab,2);
324 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
325 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
326 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
327 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
328 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
329 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
330 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
331 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
332 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
334 /* LENNARD-JONES DISPERSION/REPULSION */
336 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
337 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
338 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
339 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
340 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
342 d = _mm_sub_ps(r00,rswitch);
343 d = _mm_max_ps(d,_mm_setzero_ps());
344 d2 = _mm_mul_ps(d,d);
345 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
347 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
349 /* Evaluate switch function */
350 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
351 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
352 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
353 velec = _mm_mul_ps(velec,sw);
354 vvdw = _mm_mul_ps(vvdw,sw);
355 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velec = _mm_and_ps(velec,cutoff_mask);
359 velecsum = _mm_add_ps(velecsum,velec);
360 vvdw = _mm_and_ps(vvdw,cutoff_mask);
361 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
363 fscal = _mm_add_ps(felec,fvdw);
365 fscal = _mm_and_ps(fscal,cutoff_mask);
367 /* Calculate temporary vectorial force */
368 tx = _mm_mul_ps(fscal,dx00);
369 ty = _mm_mul_ps(fscal,dy00);
370 tz = _mm_mul_ps(fscal,dz00);
372 /* Update vectorial force */
373 fix0 = _mm_add_ps(fix0,tx);
374 fiy0 = _mm_add_ps(fiy0,ty);
375 fiz0 = _mm_add_ps(fiz0,tz);
377 fjx0 = _mm_add_ps(fjx0,tx);
378 fjy0 = _mm_add_ps(fjy0,ty);
379 fjz0 = _mm_add_ps(fjz0,tz);
383 /**************************
384 * CALCULATE INTERACTIONS *
385 **************************/
387 if (gmx_mm_any_lt(rsq01,rcutoff2))
390 r01 = _mm_mul_ps(rsq01,rinv01);
392 /* EWALD ELECTROSTATICS */
394 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
395 ewrt = _mm_mul_ps(r01,ewtabscale);
396 ewitab = _mm_cvttps_epi32(ewrt);
397 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
398 ewitab = _mm_slli_epi32(ewitab,2);
399 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
400 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
401 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
402 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
403 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
404 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
405 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
406 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
407 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
409 d = _mm_sub_ps(r01,rswitch);
410 d = _mm_max_ps(d,_mm_setzero_ps());
411 d2 = _mm_mul_ps(d,d);
412 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
414 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
416 /* Evaluate switch function */
417 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
418 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
419 velec = _mm_mul_ps(velec,sw);
420 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velec = _mm_and_ps(velec,cutoff_mask);
424 velecsum = _mm_add_ps(velecsum,velec);
428 fscal = _mm_and_ps(fscal,cutoff_mask);
430 /* Calculate temporary vectorial force */
431 tx = _mm_mul_ps(fscal,dx01);
432 ty = _mm_mul_ps(fscal,dy01);
433 tz = _mm_mul_ps(fscal,dz01);
435 /* Update vectorial force */
436 fix0 = _mm_add_ps(fix0,tx);
437 fiy0 = _mm_add_ps(fiy0,ty);
438 fiz0 = _mm_add_ps(fiz0,tz);
440 fjx1 = _mm_add_ps(fjx1,tx);
441 fjy1 = _mm_add_ps(fjy1,ty);
442 fjz1 = _mm_add_ps(fjz1,tz);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 if (gmx_mm_any_lt(rsq02,rcutoff2))
453 r02 = _mm_mul_ps(rsq02,rinv02);
455 /* EWALD ELECTROSTATICS */
457 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
458 ewrt = _mm_mul_ps(r02,ewtabscale);
459 ewitab = _mm_cvttps_epi32(ewrt);
460 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
461 ewitab = _mm_slli_epi32(ewitab,2);
462 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
463 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
464 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
465 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
466 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
467 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
468 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
469 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
470 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
472 d = _mm_sub_ps(r02,rswitch);
473 d = _mm_max_ps(d,_mm_setzero_ps());
474 d2 = _mm_mul_ps(d,d);
475 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
477 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
479 /* Evaluate switch function */
480 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
481 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
482 velec = _mm_mul_ps(velec,sw);
483 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velec = _mm_and_ps(velec,cutoff_mask);
487 velecsum = _mm_add_ps(velecsum,velec);
491 fscal = _mm_and_ps(fscal,cutoff_mask);
493 /* Calculate temporary vectorial force */
494 tx = _mm_mul_ps(fscal,dx02);
495 ty = _mm_mul_ps(fscal,dy02);
496 tz = _mm_mul_ps(fscal,dz02);
498 /* Update vectorial force */
499 fix0 = _mm_add_ps(fix0,tx);
500 fiy0 = _mm_add_ps(fiy0,ty);
501 fiz0 = _mm_add_ps(fiz0,tz);
503 fjx2 = _mm_add_ps(fjx2,tx);
504 fjy2 = _mm_add_ps(fjy2,ty);
505 fjz2 = _mm_add_ps(fjz2,tz);
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 if (gmx_mm_any_lt(rsq10,rcutoff2))
516 r10 = _mm_mul_ps(rsq10,rinv10);
518 /* EWALD ELECTROSTATICS */
520 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521 ewrt = _mm_mul_ps(r10,ewtabscale);
522 ewitab = _mm_cvttps_epi32(ewrt);
523 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
524 ewitab = _mm_slli_epi32(ewitab,2);
525 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
526 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
527 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
528 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
529 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
530 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
531 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
532 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
533 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
535 d = _mm_sub_ps(r10,rswitch);
536 d = _mm_max_ps(d,_mm_setzero_ps());
537 d2 = _mm_mul_ps(d,d);
538 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
540 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
542 /* Evaluate switch function */
543 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
544 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
545 velec = _mm_mul_ps(velec,sw);
546 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velec = _mm_and_ps(velec,cutoff_mask);
550 velecsum = _mm_add_ps(velecsum,velec);
554 fscal = _mm_and_ps(fscal,cutoff_mask);
556 /* Calculate temporary vectorial force */
557 tx = _mm_mul_ps(fscal,dx10);
558 ty = _mm_mul_ps(fscal,dy10);
559 tz = _mm_mul_ps(fscal,dz10);
561 /* Update vectorial force */
562 fix1 = _mm_add_ps(fix1,tx);
563 fiy1 = _mm_add_ps(fiy1,ty);
564 fiz1 = _mm_add_ps(fiz1,tz);
566 fjx0 = _mm_add_ps(fjx0,tx);
567 fjy0 = _mm_add_ps(fjy0,ty);
568 fjz0 = _mm_add_ps(fjz0,tz);
572 /**************************
573 * CALCULATE INTERACTIONS *
574 **************************/
576 if (gmx_mm_any_lt(rsq11,rcutoff2))
579 r11 = _mm_mul_ps(rsq11,rinv11);
581 /* EWALD ELECTROSTATICS */
583 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
584 ewrt = _mm_mul_ps(r11,ewtabscale);
585 ewitab = _mm_cvttps_epi32(ewrt);
586 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
587 ewitab = _mm_slli_epi32(ewitab,2);
588 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
589 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
590 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
591 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
592 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
593 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
594 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
595 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
596 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
598 d = _mm_sub_ps(r11,rswitch);
599 d = _mm_max_ps(d,_mm_setzero_ps());
600 d2 = _mm_mul_ps(d,d);
601 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
603 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
605 /* Evaluate switch function */
606 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
607 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
608 velec = _mm_mul_ps(velec,sw);
609 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velec = _mm_and_ps(velec,cutoff_mask);
613 velecsum = _mm_add_ps(velecsum,velec);
617 fscal = _mm_and_ps(fscal,cutoff_mask);
619 /* Calculate temporary vectorial force */
620 tx = _mm_mul_ps(fscal,dx11);
621 ty = _mm_mul_ps(fscal,dy11);
622 tz = _mm_mul_ps(fscal,dz11);
624 /* Update vectorial force */
625 fix1 = _mm_add_ps(fix1,tx);
626 fiy1 = _mm_add_ps(fiy1,ty);
627 fiz1 = _mm_add_ps(fiz1,tz);
629 fjx1 = _mm_add_ps(fjx1,tx);
630 fjy1 = _mm_add_ps(fjy1,ty);
631 fjz1 = _mm_add_ps(fjz1,tz);
635 /**************************
636 * CALCULATE INTERACTIONS *
637 **************************/
639 if (gmx_mm_any_lt(rsq12,rcutoff2))
642 r12 = _mm_mul_ps(rsq12,rinv12);
644 /* EWALD ELECTROSTATICS */
646 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
647 ewrt = _mm_mul_ps(r12,ewtabscale);
648 ewitab = _mm_cvttps_epi32(ewrt);
649 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
650 ewitab = _mm_slli_epi32(ewitab,2);
651 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
652 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
653 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
654 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
655 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
656 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
657 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
658 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
659 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
661 d = _mm_sub_ps(r12,rswitch);
662 d = _mm_max_ps(d,_mm_setzero_ps());
663 d2 = _mm_mul_ps(d,d);
664 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
666 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
668 /* Evaluate switch function */
669 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
670 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
671 velec = _mm_mul_ps(velec,sw);
672 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
674 /* Update potential sum for this i atom from the interaction with this j atom. */
675 velec = _mm_and_ps(velec,cutoff_mask);
676 velecsum = _mm_add_ps(velecsum,velec);
680 fscal = _mm_and_ps(fscal,cutoff_mask);
682 /* Calculate temporary vectorial force */
683 tx = _mm_mul_ps(fscal,dx12);
684 ty = _mm_mul_ps(fscal,dy12);
685 tz = _mm_mul_ps(fscal,dz12);
687 /* Update vectorial force */
688 fix1 = _mm_add_ps(fix1,tx);
689 fiy1 = _mm_add_ps(fiy1,ty);
690 fiz1 = _mm_add_ps(fiz1,tz);
692 fjx2 = _mm_add_ps(fjx2,tx);
693 fjy2 = _mm_add_ps(fjy2,ty);
694 fjz2 = _mm_add_ps(fjz2,tz);
698 /**************************
699 * CALCULATE INTERACTIONS *
700 **************************/
702 if (gmx_mm_any_lt(rsq20,rcutoff2))
705 r20 = _mm_mul_ps(rsq20,rinv20);
707 /* EWALD ELECTROSTATICS */
709 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
710 ewrt = _mm_mul_ps(r20,ewtabscale);
711 ewitab = _mm_cvttps_epi32(ewrt);
712 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
713 ewitab = _mm_slli_epi32(ewitab,2);
714 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
715 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
716 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
717 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
718 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
719 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
720 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
721 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
722 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
724 d = _mm_sub_ps(r20,rswitch);
725 d = _mm_max_ps(d,_mm_setzero_ps());
726 d2 = _mm_mul_ps(d,d);
727 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
729 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
731 /* Evaluate switch function */
732 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
733 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
734 velec = _mm_mul_ps(velec,sw);
735 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
737 /* Update potential sum for this i atom from the interaction with this j atom. */
738 velec = _mm_and_ps(velec,cutoff_mask);
739 velecsum = _mm_add_ps(velecsum,velec);
743 fscal = _mm_and_ps(fscal,cutoff_mask);
745 /* Calculate temporary vectorial force */
746 tx = _mm_mul_ps(fscal,dx20);
747 ty = _mm_mul_ps(fscal,dy20);
748 tz = _mm_mul_ps(fscal,dz20);
750 /* Update vectorial force */
751 fix2 = _mm_add_ps(fix2,tx);
752 fiy2 = _mm_add_ps(fiy2,ty);
753 fiz2 = _mm_add_ps(fiz2,tz);
755 fjx0 = _mm_add_ps(fjx0,tx);
756 fjy0 = _mm_add_ps(fjy0,ty);
757 fjz0 = _mm_add_ps(fjz0,tz);
761 /**************************
762 * CALCULATE INTERACTIONS *
763 **************************/
765 if (gmx_mm_any_lt(rsq21,rcutoff2))
768 r21 = _mm_mul_ps(rsq21,rinv21);
770 /* EWALD ELECTROSTATICS */
772 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
773 ewrt = _mm_mul_ps(r21,ewtabscale);
774 ewitab = _mm_cvttps_epi32(ewrt);
775 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
776 ewitab = _mm_slli_epi32(ewitab,2);
777 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
778 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
779 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
780 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
781 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
782 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
783 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
784 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
785 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
787 d = _mm_sub_ps(r21,rswitch);
788 d = _mm_max_ps(d,_mm_setzero_ps());
789 d2 = _mm_mul_ps(d,d);
790 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
792 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
794 /* Evaluate switch function */
795 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
796 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
797 velec = _mm_mul_ps(velec,sw);
798 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
800 /* Update potential sum for this i atom from the interaction with this j atom. */
801 velec = _mm_and_ps(velec,cutoff_mask);
802 velecsum = _mm_add_ps(velecsum,velec);
806 fscal = _mm_and_ps(fscal,cutoff_mask);
808 /* Calculate temporary vectorial force */
809 tx = _mm_mul_ps(fscal,dx21);
810 ty = _mm_mul_ps(fscal,dy21);
811 tz = _mm_mul_ps(fscal,dz21);
813 /* Update vectorial force */
814 fix2 = _mm_add_ps(fix2,tx);
815 fiy2 = _mm_add_ps(fiy2,ty);
816 fiz2 = _mm_add_ps(fiz2,tz);
818 fjx1 = _mm_add_ps(fjx1,tx);
819 fjy1 = _mm_add_ps(fjy1,ty);
820 fjz1 = _mm_add_ps(fjz1,tz);
824 /**************************
825 * CALCULATE INTERACTIONS *
826 **************************/
828 if (gmx_mm_any_lt(rsq22,rcutoff2))
831 r22 = _mm_mul_ps(rsq22,rinv22);
833 /* EWALD ELECTROSTATICS */
835 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
836 ewrt = _mm_mul_ps(r22,ewtabscale);
837 ewitab = _mm_cvttps_epi32(ewrt);
838 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
839 ewitab = _mm_slli_epi32(ewitab,2);
840 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
841 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
842 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
843 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
844 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
845 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
846 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
847 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
848 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
850 d = _mm_sub_ps(r22,rswitch);
851 d = _mm_max_ps(d,_mm_setzero_ps());
852 d2 = _mm_mul_ps(d,d);
853 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
855 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
857 /* Evaluate switch function */
858 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
859 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
860 velec = _mm_mul_ps(velec,sw);
861 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
863 /* Update potential sum for this i atom from the interaction with this j atom. */
864 velec = _mm_and_ps(velec,cutoff_mask);
865 velecsum = _mm_add_ps(velecsum,velec);
869 fscal = _mm_and_ps(fscal,cutoff_mask);
871 /* Calculate temporary vectorial force */
872 tx = _mm_mul_ps(fscal,dx22);
873 ty = _mm_mul_ps(fscal,dy22);
874 tz = _mm_mul_ps(fscal,dz22);
876 /* Update vectorial force */
877 fix2 = _mm_add_ps(fix2,tx);
878 fiy2 = _mm_add_ps(fiy2,ty);
879 fiz2 = _mm_add_ps(fiz2,tz);
881 fjx2 = _mm_add_ps(fjx2,tx);
882 fjy2 = _mm_add_ps(fjy2,ty);
883 fjz2 = _mm_add_ps(fjz2,tz);
887 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
888 f+j_coord_offsetC,f+j_coord_offsetD,
889 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
891 /* Inner loop uses 603 flops */
897 /* Get j neighbor index, and coordinate index */
903 /* Sign of each element will be negative for non-real atoms.
904 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
905 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
907 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
908 jnrA = (jnrA>=0) ? jnrA : 0;
909 jnrB = (jnrB>=0) ? jnrB : 0;
910 jnrC = (jnrC>=0) ? jnrC : 0;
911 jnrD = (jnrD>=0) ? jnrD : 0;
913 j_coord_offsetA = DIM*jnrA;
914 j_coord_offsetB = DIM*jnrB;
915 j_coord_offsetC = DIM*jnrC;
916 j_coord_offsetD = DIM*jnrD;
918 /* load j atom coordinates */
919 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
920 x+j_coord_offsetC,x+j_coord_offsetD,
921 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
923 /* Calculate displacement vector */
924 dx00 = _mm_sub_ps(ix0,jx0);
925 dy00 = _mm_sub_ps(iy0,jy0);
926 dz00 = _mm_sub_ps(iz0,jz0);
927 dx01 = _mm_sub_ps(ix0,jx1);
928 dy01 = _mm_sub_ps(iy0,jy1);
929 dz01 = _mm_sub_ps(iz0,jz1);
930 dx02 = _mm_sub_ps(ix0,jx2);
931 dy02 = _mm_sub_ps(iy0,jy2);
932 dz02 = _mm_sub_ps(iz0,jz2);
933 dx10 = _mm_sub_ps(ix1,jx0);
934 dy10 = _mm_sub_ps(iy1,jy0);
935 dz10 = _mm_sub_ps(iz1,jz0);
936 dx11 = _mm_sub_ps(ix1,jx1);
937 dy11 = _mm_sub_ps(iy1,jy1);
938 dz11 = _mm_sub_ps(iz1,jz1);
939 dx12 = _mm_sub_ps(ix1,jx2);
940 dy12 = _mm_sub_ps(iy1,jy2);
941 dz12 = _mm_sub_ps(iz1,jz2);
942 dx20 = _mm_sub_ps(ix2,jx0);
943 dy20 = _mm_sub_ps(iy2,jy0);
944 dz20 = _mm_sub_ps(iz2,jz0);
945 dx21 = _mm_sub_ps(ix2,jx1);
946 dy21 = _mm_sub_ps(iy2,jy1);
947 dz21 = _mm_sub_ps(iz2,jz1);
948 dx22 = _mm_sub_ps(ix2,jx2);
949 dy22 = _mm_sub_ps(iy2,jy2);
950 dz22 = _mm_sub_ps(iz2,jz2);
952 /* Calculate squared distance and things based on it */
953 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
954 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
955 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
956 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
957 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
958 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
959 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
960 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
961 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
963 rinv00 = gmx_mm_invsqrt_ps(rsq00);
964 rinv01 = gmx_mm_invsqrt_ps(rsq01);
965 rinv02 = gmx_mm_invsqrt_ps(rsq02);
966 rinv10 = gmx_mm_invsqrt_ps(rsq10);
967 rinv11 = gmx_mm_invsqrt_ps(rsq11);
968 rinv12 = gmx_mm_invsqrt_ps(rsq12);
969 rinv20 = gmx_mm_invsqrt_ps(rsq20);
970 rinv21 = gmx_mm_invsqrt_ps(rsq21);
971 rinv22 = gmx_mm_invsqrt_ps(rsq22);
973 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
974 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
975 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
976 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
977 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
978 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
979 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
980 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
981 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
983 fjx0 = _mm_setzero_ps();
984 fjy0 = _mm_setzero_ps();
985 fjz0 = _mm_setzero_ps();
986 fjx1 = _mm_setzero_ps();
987 fjy1 = _mm_setzero_ps();
988 fjz1 = _mm_setzero_ps();
989 fjx2 = _mm_setzero_ps();
990 fjy2 = _mm_setzero_ps();
991 fjz2 = _mm_setzero_ps();
993 /**************************
994 * CALCULATE INTERACTIONS *
995 **************************/
997 if (gmx_mm_any_lt(rsq00,rcutoff2))
1000 r00 = _mm_mul_ps(rsq00,rinv00);
1001 r00 = _mm_andnot_ps(dummy_mask,r00);
1003 /* EWALD ELECTROSTATICS */
1005 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1006 ewrt = _mm_mul_ps(r00,ewtabscale);
1007 ewitab = _mm_cvttps_epi32(ewrt);
1008 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1009 ewitab = _mm_slli_epi32(ewitab,2);
1010 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1011 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1012 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1013 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1014 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1015 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1016 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1017 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1018 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1020 /* LENNARD-JONES DISPERSION/REPULSION */
1022 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1023 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1024 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1025 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1026 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1028 d = _mm_sub_ps(r00,rswitch);
1029 d = _mm_max_ps(d,_mm_setzero_ps());
1030 d2 = _mm_mul_ps(d,d);
1031 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1033 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1035 /* Evaluate switch function */
1036 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1037 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1038 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1039 velec = _mm_mul_ps(velec,sw);
1040 vvdw = _mm_mul_ps(vvdw,sw);
1041 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1043 /* Update potential sum for this i atom from the interaction with this j atom. */
1044 velec = _mm_and_ps(velec,cutoff_mask);
1045 velec = _mm_andnot_ps(dummy_mask,velec);
1046 velecsum = _mm_add_ps(velecsum,velec);
1047 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1048 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1049 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1051 fscal = _mm_add_ps(felec,fvdw);
1053 fscal = _mm_and_ps(fscal,cutoff_mask);
1055 fscal = _mm_andnot_ps(dummy_mask,fscal);
1057 /* Calculate temporary vectorial force */
1058 tx = _mm_mul_ps(fscal,dx00);
1059 ty = _mm_mul_ps(fscal,dy00);
1060 tz = _mm_mul_ps(fscal,dz00);
1062 /* Update vectorial force */
1063 fix0 = _mm_add_ps(fix0,tx);
1064 fiy0 = _mm_add_ps(fiy0,ty);
1065 fiz0 = _mm_add_ps(fiz0,tz);
1067 fjx0 = _mm_add_ps(fjx0,tx);
1068 fjy0 = _mm_add_ps(fjy0,ty);
1069 fjz0 = _mm_add_ps(fjz0,tz);
1073 /**************************
1074 * CALCULATE INTERACTIONS *
1075 **************************/
1077 if (gmx_mm_any_lt(rsq01,rcutoff2))
1080 r01 = _mm_mul_ps(rsq01,rinv01);
1081 r01 = _mm_andnot_ps(dummy_mask,r01);
1083 /* EWALD ELECTROSTATICS */
1085 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1086 ewrt = _mm_mul_ps(r01,ewtabscale);
1087 ewitab = _mm_cvttps_epi32(ewrt);
1088 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1089 ewitab = _mm_slli_epi32(ewitab,2);
1090 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1091 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1092 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1093 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1094 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1095 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1096 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1097 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1098 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1100 d = _mm_sub_ps(r01,rswitch);
1101 d = _mm_max_ps(d,_mm_setzero_ps());
1102 d2 = _mm_mul_ps(d,d);
1103 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1105 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1107 /* Evaluate switch function */
1108 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1109 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1110 velec = _mm_mul_ps(velec,sw);
1111 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1113 /* Update potential sum for this i atom from the interaction with this j atom. */
1114 velec = _mm_and_ps(velec,cutoff_mask);
1115 velec = _mm_andnot_ps(dummy_mask,velec);
1116 velecsum = _mm_add_ps(velecsum,velec);
1120 fscal = _mm_and_ps(fscal,cutoff_mask);
1122 fscal = _mm_andnot_ps(dummy_mask,fscal);
1124 /* Calculate temporary vectorial force */
1125 tx = _mm_mul_ps(fscal,dx01);
1126 ty = _mm_mul_ps(fscal,dy01);
1127 tz = _mm_mul_ps(fscal,dz01);
1129 /* Update vectorial force */
1130 fix0 = _mm_add_ps(fix0,tx);
1131 fiy0 = _mm_add_ps(fiy0,ty);
1132 fiz0 = _mm_add_ps(fiz0,tz);
1134 fjx1 = _mm_add_ps(fjx1,tx);
1135 fjy1 = _mm_add_ps(fjy1,ty);
1136 fjz1 = _mm_add_ps(fjz1,tz);
1140 /**************************
1141 * CALCULATE INTERACTIONS *
1142 **************************/
1144 if (gmx_mm_any_lt(rsq02,rcutoff2))
1147 r02 = _mm_mul_ps(rsq02,rinv02);
1148 r02 = _mm_andnot_ps(dummy_mask,r02);
1150 /* EWALD ELECTROSTATICS */
1152 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1153 ewrt = _mm_mul_ps(r02,ewtabscale);
1154 ewitab = _mm_cvttps_epi32(ewrt);
1155 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1156 ewitab = _mm_slli_epi32(ewitab,2);
1157 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1158 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1159 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1160 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1161 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1162 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1163 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1164 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1165 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1167 d = _mm_sub_ps(r02,rswitch);
1168 d = _mm_max_ps(d,_mm_setzero_ps());
1169 d2 = _mm_mul_ps(d,d);
1170 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1172 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1174 /* Evaluate switch function */
1175 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1176 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1177 velec = _mm_mul_ps(velec,sw);
1178 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1180 /* Update potential sum for this i atom from the interaction with this j atom. */
1181 velec = _mm_and_ps(velec,cutoff_mask);
1182 velec = _mm_andnot_ps(dummy_mask,velec);
1183 velecsum = _mm_add_ps(velecsum,velec);
1187 fscal = _mm_and_ps(fscal,cutoff_mask);
1189 fscal = _mm_andnot_ps(dummy_mask,fscal);
1191 /* Calculate temporary vectorial force */
1192 tx = _mm_mul_ps(fscal,dx02);
1193 ty = _mm_mul_ps(fscal,dy02);
1194 tz = _mm_mul_ps(fscal,dz02);
1196 /* Update vectorial force */
1197 fix0 = _mm_add_ps(fix0,tx);
1198 fiy0 = _mm_add_ps(fiy0,ty);
1199 fiz0 = _mm_add_ps(fiz0,tz);
1201 fjx2 = _mm_add_ps(fjx2,tx);
1202 fjy2 = _mm_add_ps(fjy2,ty);
1203 fjz2 = _mm_add_ps(fjz2,tz);
1207 /**************************
1208 * CALCULATE INTERACTIONS *
1209 **************************/
1211 if (gmx_mm_any_lt(rsq10,rcutoff2))
1214 r10 = _mm_mul_ps(rsq10,rinv10);
1215 r10 = _mm_andnot_ps(dummy_mask,r10);
1217 /* EWALD ELECTROSTATICS */
1219 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1220 ewrt = _mm_mul_ps(r10,ewtabscale);
1221 ewitab = _mm_cvttps_epi32(ewrt);
1222 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1223 ewitab = _mm_slli_epi32(ewitab,2);
1224 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1225 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1226 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1227 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1228 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1229 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1230 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1231 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1232 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1234 d = _mm_sub_ps(r10,rswitch);
1235 d = _mm_max_ps(d,_mm_setzero_ps());
1236 d2 = _mm_mul_ps(d,d);
1237 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1239 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1241 /* Evaluate switch function */
1242 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1243 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1244 velec = _mm_mul_ps(velec,sw);
1245 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1247 /* Update potential sum for this i atom from the interaction with this j atom. */
1248 velec = _mm_and_ps(velec,cutoff_mask);
1249 velec = _mm_andnot_ps(dummy_mask,velec);
1250 velecsum = _mm_add_ps(velecsum,velec);
1254 fscal = _mm_and_ps(fscal,cutoff_mask);
1256 fscal = _mm_andnot_ps(dummy_mask,fscal);
1258 /* Calculate temporary vectorial force */
1259 tx = _mm_mul_ps(fscal,dx10);
1260 ty = _mm_mul_ps(fscal,dy10);
1261 tz = _mm_mul_ps(fscal,dz10);
1263 /* Update vectorial force */
1264 fix1 = _mm_add_ps(fix1,tx);
1265 fiy1 = _mm_add_ps(fiy1,ty);
1266 fiz1 = _mm_add_ps(fiz1,tz);
1268 fjx0 = _mm_add_ps(fjx0,tx);
1269 fjy0 = _mm_add_ps(fjy0,ty);
1270 fjz0 = _mm_add_ps(fjz0,tz);
1274 /**************************
1275 * CALCULATE INTERACTIONS *
1276 **************************/
1278 if (gmx_mm_any_lt(rsq11,rcutoff2))
1281 r11 = _mm_mul_ps(rsq11,rinv11);
1282 r11 = _mm_andnot_ps(dummy_mask,r11);
1284 /* EWALD ELECTROSTATICS */
1286 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1287 ewrt = _mm_mul_ps(r11,ewtabscale);
1288 ewitab = _mm_cvttps_epi32(ewrt);
1289 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1290 ewitab = _mm_slli_epi32(ewitab,2);
1291 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1292 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1293 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1294 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1295 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1296 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1297 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1298 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1299 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1301 d = _mm_sub_ps(r11,rswitch);
1302 d = _mm_max_ps(d,_mm_setzero_ps());
1303 d2 = _mm_mul_ps(d,d);
1304 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1306 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1308 /* Evaluate switch function */
1309 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1310 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1311 velec = _mm_mul_ps(velec,sw);
1312 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1314 /* Update potential sum for this i atom from the interaction with this j atom. */
1315 velec = _mm_and_ps(velec,cutoff_mask);
1316 velec = _mm_andnot_ps(dummy_mask,velec);
1317 velecsum = _mm_add_ps(velecsum,velec);
1321 fscal = _mm_and_ps(fscal,cutoff_mask);
1323 fscal = _mm_andnot_ps(dummy_mask,fscal);
1325 /* Calculate temporary vectorial force */
1326 tx = _mm_mul_ps(fscal,dx11);
1327 ty = _mm_mul_ps(fscal,dy11);
1328 tz = _mm_mul_ps(fscal,dz11);
1330 /* Update vectorial force */
1331 fix1 = _mm_add_ps(fix1,tx);
1332 fiy1 = _mm_add_ps(fiy1,ty);
1333 fiz1 = _mm_add_ps(fiz1,tz);
1335 fjx1 = _mm_add_ps(fjx1,tx);
1336 fjy1 = _mm_add_ps(fjy1,ty);
1337 fjz1 = _mm_add_ps(fjz1,tz);
1341 /**************************
1342 * CALCULATE INTERACTIONS *
1343 **************************/
1345 if (gmx_mm_any_lt(rsq12,rcutoff2))
1348 r12 = _mm_mul_ps(rsq12,rinv12);
1349 r12 = _mm_andnot_ps(dummy_mask,r12);
1351 /* EWALD ELECTROSTATICS */
1353 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1354 ewrt = _mm_mul_ps(r12,ewtabscale);
1355 ewitab = _mm_cvttps_epi32(ewrt);
1356 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1357 ewitab = _mm_slli_epi32(ewitab,2);
1358 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1359 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1360 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1361 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1362 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1363 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1364 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1365 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1366 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1368 d = _mm_sub_ps(r12,rswitch);
1369 d = _mm_max_ps(d,_mm_setzero_ps());
1370 d2 = _mm_mul_ps(d,d);
1371 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1373 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1375 /* Evaluate switch function */
1376 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1377 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1378 velec = _mm_mul_ps(velec,sw);
1379 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1381 /* Update potential sum for this i atom from the interaction with this j atom. */
1382 velec = _mm_and_ps(velec,cutoff_mask);
1383 velec = _mm_andnot_ps(dummy_mask,velec);
1384 velecsum = _mm_add_ps(velecsum,velec);
1388 fscal = _mm_and_ps(fscal,cutoff_mask);
1390 fscal = _mm_andnot_ps(dummy_mask,fscal);
1392 /* Calculate temporary vectorial force */
1393 tx = _mm_mul_ps(fscal,dx12);
1394 ty = _mm_mul_ps(fscal,dy12);
1395 tz = _mm_mul_ps(fscal,dz12);
1397 /* Update vectorial force */
1398 fix1 = _mm_add_ps(fix1,tx);
1399 fiy1 = _mm_add_ps(fiy1,ty);
1400 fiz1 = _mm_add_ps(fiz1,tz);
1402 fjx2 = _mm_add_ps(fjx2,tx);
1403 fjy2 = _mm_add_ps(fjy2,ty);
1404 fjz2 = _mm_add_ps(fjz2,tz);
1408 /**************************
1409 * CALCULATE INTERACTIONS *
1410 **************************/
1412 if (gmx_mm_any_lt(rsq20,rcutoff2))
1415 r20 = _mm_mul_ps(rsq20,rinv20);
1416 r20 = _mm_andnot_ps(dummy_mask,r20);
1418 /* EWALD ELECTROSTATICS */
1420 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1421 ewrt = _mm_mul_ps(r20,ewtabscale);
1422 ewitab = _mm_cvttps_epi32(ewrt);
1423 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1424 ewitab = _mm_slli_epi32(ewitab,2);
1425 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1426 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1427 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1428 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1429 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1430 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1431 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1432 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1433 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1435 d = _mm_sub_ps(r20,rswitch);
1436 d = _mm_max_ps(d,_mm_setzero_ps());
1437 d2 = _mm_mul_ps(d,d);
1438 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1440 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1442 /* Evaluate switch function */
1443 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1444 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1445 velec = _mm_mul_ps(velec,sw);
1446 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1448 /* Update potential sum for this i atom from the interaction with this j atom. */
1449 velec = _mm_and_ps(velec,cutoff_mask);
1450 velec = _mm_andnot_ps(dummy_mask,velec);
1451 velecsum = _mm_add_ps(velecsum,velec);
1455 fscal = _mm_and_ps(fscal,cutoff_mask);
1457 fscal = _mm_andnot_ps(dummy_mask,fscal);
1459 /* Calculate temporary vectorial force */
1460 tx = _mm_mul_ps(fscal,dx20);
1461 ty = _mm_mul_ps(fscal,dy20);
1462 tz = _mm_mul_ps(fscal,dz20);
1464 /* Update vectorial force */
1465 fix2 = _mm_add_ps(fix2,tx);
1466 fiy2 = _mm_add_ps(fiy2,ty);
1467 fiz2 = _mm_add_ps(fiz2,tz);
1469 fjx0 = _mm_add_ps(fjx0,tx);
1470 fjy0 = _mm_add_ps(fjy0,ty);
1471 fjz0 = _mm_add_ps(fjz0,tz);
1475 /**************************
1476 * CALCULATE INTERACTIONS *
1477 **************************/
1479 if (gmx_mm_any_lt(rsq21,rcutoff2))
1482 r21 = _mm_mul_ps(rsq21,rinv21);
1483 r21 = _mm_andnot_ps(dummy_mask,r21);
1485 /* EWALD ELECTROSTATICS */
1487 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1488 ewrt = _mm_mul_ps(r21,ewtabscale);
1489 ewitab = _mm_cvttps_epi32(ewrt);
1490 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1491 ewitab = _mm_slli_epi32(ewitab,2);
1492 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1493 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1494 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1495 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1496 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1497 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1498 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1499 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1500 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1502 d = _mm_sub_ps(r21,rswitch);
1503 d = _mm_max_ps(d,_mm_setzero_ps());
1504 d2 = _mm_mul_ps(d,d);
1505 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1507 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1509 /* Evaluate switch function */
1510 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1511 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1512 velec = _mm_mul_ps(velec,sw);
1513 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1515 /* Update potential sum for this i atom from the interaction with this j atom. */
1516 velec = _mm_and_ps(velec,cutoff_mask);
1517 velec = _mm_andnot_ps(dummy_mask,velec);
1518 velecsum = _mm_add_ps(velecsum,velec);
1522 fscal = _mm_and_ps(fscal,cutoff_mask);
1524 fscal = _mm_andnot_ps(dummy_mask,fscal);
1526 /* Calculate temporary vectorial force */
1527 tx = _mm_mul_ps(fscal,dx21);
1528 ty = _mm_mul_ps(fscal,dy21);
1529 tz = _mm_mul_ps(fscal,dz21);
1531 /* Update vectorial force */
1532 fix2 = _mm_add_ps(fix2,tx);
1533 fiy2 = _mm_add_ps(fiy2,ty);
1534 fiz2 = _mm_add_ps(fiz2,tz);
1536 fjx1 = _mm_add_ps(fjx1,tx);
1537 fjy1 = _mm_add_ps(fjy1,ty);
1538 fjz1 = _mm_add_ps(fjz1,tz);
1542 /**************************
1543 * CALCULATE INTERACTIONS *
1544 **************************/
1546 if (gmx_mm_any_lt(rsq22,rcutoff2))
1549 r22 = _mm_mul_ps(rsq22,rinv22);
1550 r22 = _mm_andnot_ps(dummy_mask,r22);
1552 /* EWALD ELECTROSTATICS */
1554 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1555 ewrt = _mm_mul_ps(r22,ewtabscale);
1556 ewitab = _mm_cvttps_epi32(ewrt);
1557 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1558 ewitab = _mm_slli_epi32(ewitab,2);
1559 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1560 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1561 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1562 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1563 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1564 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1565 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1566 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1567 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1569 d = _mm_sub_ps(r22,rswitch);
1570 d = _mm_max_ps(d,_mm_setzero_ps());
1571 d2 = _mm_mul_ps(d,d);
1572 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1574 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1576 /* Evaluate switch function */
1577 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1578 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1579 velec = _mm_mul_ps(velec,sw);
1580 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1582 /* Update potential sum for this i atom from the interaction with this j atom. */
1583 velec = _mm_and_ps(velec,cutoff_mask);
1584 velec = _mm_andnot_ps(dummy_mask,velec);
1585 velecsum = _mm_add_ps(velecsum,velec);
1589 fscal = _mm_and_ps(fscal,cutoff_mask);
1591 fscal = _mm_andnot_ps(dummy_mask,fscal);
1593 /* Calculate temporary vectorial force */
1594 tx = _mm_mul_ps(fscal,dx22);
1595 ty = _mm_mul_ps(fscal,dy22);
1596 tz = _mm_mul_ps(fscal,dz22);
1598 /* Update vectorial force */
1599 fix2 = _mm_add_ps(fix2,tx);
1600 fiy2 = _mm_add_ps(fiy2,ty);
1601 fiz2 = _mm_add_ps(fiz2,tz);
1603 fjx2 = _mm_add_ps(fjx2,tx);
1604 fjy2 = _mm_add_ps(fjy2,ty);
1605 fjz2 = _mm_add_ps(fjz2,tz);
1609 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1610 f+j_coord_offsetC,f+j_coord_offsetD,
1611 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1613 /* Inner loop uses 612 flops */
1616 /* End of innermost loop */
1618 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1619 f+i_coord_offset,fshift+i_shift_offset);
1622 /* Update potential energies */
1623 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1624 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1626 /* Increment number of inner iterations */
1627 inneriter += j_index_end - j_index_start;
1629 /* Outer loop uses 29 flops */
1632 /* Increment number of outer iterations */
1635 /* Update outer/inner flops */
1637 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*612);
1640 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1641 * Electrostatics interaction: Ewald
1642 * VdW interaction: LennardJones
1643 * Geometry: Water3-Water3
1644 * Calculate force/pot: Force
1647 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1648 (t_nblist * gmx_restrict nlist,
1649 rvec * gmx_restrict xx,
1650 rvec * gmx_restrict ff,
1651 t_forcerec * gmx_restrict fr,
1652 t_mdatoms * gmx_restrict mdatoms,
1653 nb_kernel_data_t * gmx_restrict kernel_data,
1654 t_nrnb * gmx_restrict nrnb)
1656 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1657 * just 0 for non-waters.
1658 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1659 * jnr indices corresponding to data put in the four positions in the SIMD register.
1661 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1662 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1663 int jnrA,jnrB,jnrC,jnrD;
1664 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1665 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1666 real shX,shY,shZ,rcutoff_scalar;
1667 real *shiftvec,*fshift,*x,*f;
1668 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1670 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1672 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1674 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1675 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1676 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1677 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1678 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1679 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1680 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1681 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1682 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1683 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1684 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1685 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1686 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1687 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1688 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1689 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1690 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1693 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1696 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1697 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1699 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1701 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1702 real rswitch_scalar,d_scalar;
1703 __m128 dummy_mask,cutoff_mask;
1704 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1705 __m128 one = _mm_set1_ps(1.0);
1706 __m128 two = _mm_set1_ps(2.0);
1712 jindex = nlist->jindex;
1714 shiftidx = nlist->shift;
1716 shiftvec = fr->shift_vec[0];
1717 fshift = fr->fshift[0];
1718 facel = _mm_set1_ps(fr->epsfac);
1719 charge = mdatoms->chargeA;
1720 nvdwtype = fr->ntype;
1721 vdwparam = fr->nbfp;
1722 vdwtype = mdatoms->typeA;
1724 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1725 ewtab = fr->ic->tabq_coul_FDV0;
1726 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1727 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1729 /* Setup water-specific parameters */
1730 inr = nlist->iinr[0];
1731 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1732 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1733 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1734 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1736 jq0 = _mm_set1_ps(charge[inr+0]);
1737 jq1 = _mm_set1_ps(charge[inr+1]);
1738 jq2 = _mm_set1_ps(charge[inr+2]);
1739 vdwjidx0A = 2*vdwtype[inr+0];
1740 qq00 = _mm_mul_ps(iq0,jq0);
1741 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1742 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1743 qq01 = _mm_mul_ps(iq0,jq1);
1744 qq02 = _mm_mul_ps(iq0,jq2);
1745 qq10 = _mm_mul_ps(iq1,jq0);
1746 qq11 = _mm_mul_ps(iq1,jq1);
1747 qq12 = _mm_mul_ps(iq1,jq2);
1748 qq20 = _mm_mul_ps(iq2,jq0);
1749 qq21 = _mm_mul_ps(iq2,jq1);
1750 qq22 = _mm_mul_ps(iq2,jq2);
1752 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1753 rcutoff_scalar = fr->rcoulomb;
1754 rcutoff = _mm_set1_ps(rcutoff_scalar);
1755 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1757 rswitch_scalar = fr->rcoulomb_switch;
1758 rswitch = _mm_set1_ps(rswitch_scalar);
1759 /* Setup switch parameters */
1760 d_scalar = rcutoff_scalar-rswitch_scalar;
1761 d = _mm_set1_ps(d_scalar);
1762 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1763 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1764 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1765 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1766 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1767 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1769 /* Avoid stupid compiler warnings */
1770 jnrA = jnrB = jnrC = jnrD = 0;
1771 j_coord_offsetA = 0;
1772 j_coord_offsetB = 0;
1773 j_coord_offsetC = 0;
1774 j_coord_offsetD = 0;
1779 /* Start outer loop over neighborlists */
1780 for(iidx=0; iidx<nri; iidx++)
1782 /* Load shift vector for this list */
1783 i_shift_offset = DIM*shiftidx[iidx];
1784 shX = shiftvec[i_shift_offset+XX];
1785 shY = shiftvec[i_shift_offset+YY];
1786 shZ = shiftvec[i_shift_offset+ZZ];
1788 /* Load limits for loop over neighbors */
1789 j_index_start = jindex[iidx];
1790 j_index_end = jindex[iidx+1];
1792 /* Get outer coordinate index */
1794 i_coord_offset = DIM*inr;
1796 /* Load i particle coords and add shift vector */
1797 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1798 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1799 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1800 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1801 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1802 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1803 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1804 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1805 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1807 fix0 = _mm_setzero_ps();
1808 fiy0 = _mm_setzero_ps();
1809 fiz0 = _mm_setzero_ps();
1810 fix1 = _mm_setzero_ps();
1811 fiy1 = _mm_setzero_ps();
1812 fiz1 = _mm_setzero_ps();
1813 fix2 = _mm_setzero_ps();
1814 fiy2 = _mm_setzero_ps();
1815 fiz2 = _mm_setzero_ps();
1817 /* Start inner kernel loop */
1818 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1821 /* Get j neighbor index, and coordinate index */
1823 jnrB = jjnr[jidx+1];
1824 jnrC = jjnr[jidx+2];
1825 jnrD = jjnr[jidx+3];
1827 j_coord_offsetA = DIM*jnrA;
1828 j_coord_offsetB = DIM*jnrB;
1829 j_coord_offsetC = DIM*jnrC;
1830 j_coord_offsetD = DIM*jnrD;
1832 /* load j atom coordinates */
1833 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1834 x+j_coord_offsetC,x+j_coord_offsetD,
1835 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1837 /* Calculate displacement vector */
1838 dx00 = _mm_sub_ps(ix0,jx0);
1839 dy00 = _mm_sub_ps(iy0,jy0);
1840 dz00 = _mm_sub_ps(iz0,jz0);
1841 dx01 = _mm_sub_ps(ix0,jx1);
1842 dy01 = _mm_sub_ps(iy0,jy1);
1843 dz01 = _mm_sub_ps(iz0,jz1);
1844 dx02 = _mm_sub_ps(ix0,jx2);
1845 dy02 = _mm_sub_ps(iy0,jy2);
1846 dz02 = _mm_sub_ps(iz0,jz2);
1847 dx10 = _mm_sub_ps(ix1,jx0);
1848 dy10 = _mm_sub_ps(iy1,jy0);
1849 dz10 = _mm_sub_ps(iz1,jz0);
1850 dx11 = _mm_sub_ps(ix1,jx1);
1851 dy11 = _mm_sub_ps(iy1,jy1);
1852 dz11 = _mm_sub_ps(iz1,jz1);
1853 dx12 = _mm_sub_ps(ix1,jx2);
1854 dy12 = _mm_sub_ps(iy1,jy2);
1855 dz12 = _mm_sub_ps(iz1,jz2);
1856 dx20 = _mm_sub_ps(ix2,jx0);
1857 dy20 = _mm_sub_ps(iy2,jy0);
1858 dz20 = _mm_sub_ps(iz2,jz0);
1859 dx21 = _mm_sub_ps(ix2,jx1);
1860 dy21 = _mm_sub_ps(iy2,jy1);
1861 dz21 = _mm_sub_ps(iz2,jz1);
1862 dx22 = _mm_sub_ps(ix2,jx2);
1863 dy22 = _mm_sub_ps(iy2,jy2);
1864 dz22 = _mm_sub_ps(iz2,jz2);
1866 /* Calculate squared distance and things based on it */
1867 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1868 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1869 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1870 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1871 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1872 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1873 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1874 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1875 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1877 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1878 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1879 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1880 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1881 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1882 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1883 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1884 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1885 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1887 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1888 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1889 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1890 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1891 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1892 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1893 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1894 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1895 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1897 fjx0 = _mm_setzero_ps();
1898 fjy0 = _mm_setzero_ps();
1899 fjz0 = _mm_setzero_ps();
1900 fjx1 = _mm_setzero_ps();
1901 fjy1 = _mm_setzero_ps();
1902 fjz1 = _mm_setzero_ps();
1903 fjx2 = _mm_setzero_ps();
1904 fjy2 = _mm_setzero_ps();
1905 fjz2 = _mm_setzero_ps();
1907 /**************************
1908 * CALCULATE INTERACTIONS *
1909 **************************/
1911 if (gmx_mm_any_lt(rsq00,rcutoff2))
1914 r00 = _mm_mul_ps(rsq00,rinv00);
1916 /* EWALD ELECTROSTATICS */
1918 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1919 ewrt = _mm_mul_ps(r00,ewtabscale);
1920 ewitab = _mm_cvttps_epi32(ewrt);
1921 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1922 ewitab = _mm_slli_epi32(ewitab,2);
1923 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1924 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1925 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1926 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1927 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1928 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1929 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1930 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1931 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1933 /* LENNARD-JONES DISPERSION/REPULSION */
1935 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1936 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1937 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1938 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1939 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1941 d = _mm_sub_ps(r00,rswitch);
1942 d = _mm_max_ps(d,_mm_setzero_ps());
1943 d2 = _mm_mul_ps(d,d);
1944 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1946 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1948 /* Evaluate switch function */
1949 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1950 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1951 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1952 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1954 fscal = _mm_add_ps(felec,fvdw);
1956 fscal = _mm_and_ps(fscal,cutoff_mask);
1958 /* Calculate temporary vectorial force */
1959 tx = _mm_mul_ps(fscal,dx00);
1960 ty = _mm_mul_ps(fscal,dy00);
1961 tz = _mm_mul_ps(fscal,dz00);
1963 /* Update vectorial force */
1964 fix0 = _mm_add_ps(fix0,tx);
1965 fiy0 = _mm_add_ps(fiy0,ty);
1966 fiz0 = _mm_add_ps(fiz0,tz);
1968 fjx0 = _mm_add_ps(fjx0,tx);
1969 fjy0 = _mm_add_ps(fjy0,ty);
1970 fjz0 = _mm_add_ps(fjz0,tz);
1974 /**************************
1975 * CALCULATE INTERACTIONS *
1976 **************************/
1978 if (gmx_mm_any_lt(rsq01,rcutoff2))
1981 r01 = _mm_mul_ps(rsq01,rinv01);
1983 /* EWALD ELECTROSTATICS */
1985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1986 ewrt = _mm_mul_ps(r01,ewtabscale);
1987 ewitab = _mm_cvttps_epi32(ewrt);
1988 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1989 ewitab = _mm_slli_epi32(ewitab,2);
1990 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1991 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1992 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1993 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1994 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1995 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1996 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1997 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1998 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2000 d = _mm_sub_ps(r01,rswitch);
2001 d = _mm_max_ps(d,_mm_setzero_ps());
2002 d2 = _mm_mul_ps(d,d);
2003 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2005 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2007 /* Evaluate switch function */
2008 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2009 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2010 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2014 fscal = _mm_and_ps(fscal,cutoff_mask);
2016 /* Calculate temporary vectorial force */
2017 tx = _mm_mul_ps(fscal,dx01);
2018 ty = _mm_mul_ps(fscal,dy01);
2019 tz = _mm_mul_ps(fscal,dz01);
2021 /* Update vectorial force */
2022 fix0 = _mm_add_ps(fix0,tx);
2023 fiy0 = _mm_add_ps(fiy0,ty);
2024 fiz0 = _mm_add_ps(fiz0,tz);
2026 fjx1 = _mm_add_ps(fjx1,tx);
2027 fjy1 = _mm_add_ps(fjy1,ty);
2028 fjz1 = _mm_add_ps(fjz1,tz);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 if (gmx_mm_any_lt(rsq02,rcutoff2))
2039 r02 = _mm_mul_ps(rsq02,rinv02);
2041 /* EWALD ELECTROSTATICS */
2043 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2044 ewrt = _mm_mul_ps(r02,ewtabscale);
2045 ewitab = _mm_cvttps_epi32(ewrt);
2046 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2047 ewitab = _mm_slli_epi32(ewitab,2);
2048 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2049 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2050 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2051 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2052 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2053 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2054 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2055 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2056 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2058 d = _mm_sub_ps(r02,rswitch);
2059 d = _mm_max_ps(d,_mm_setzero_ps());
2060 d2 = _mm_mul_ps(d,d);
2061 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2063 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2065 /* Evaluate switch function */
2066 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2067 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2068 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2072 fscal = _mm_and_ps(fscal,cutoff_mask);
2074 /* Calculate temporary vectorial force */
2075 tx = _mm_mul_ps(fscal,dx02);
2076 ty = _mm_mul_ps(fscal,dy02);
2077 tz = _mm_mul_ps(fscal,dz02);
2079 /* Update vectorial force */
2080 fix0 = _mm_add_ps(fix0,tx);
2081 fiy0 = _mm_add_ps(fiy0,ty);
2082 fiz0 = _mm_add_ps(fiz0,tz);
2084 fjx2 = _mm_add_ps(fjx2,tx);
2085 fjy2 = _mm_add_ps(fjy2,ty);
2086 fjz2 = _mm_add_ps(fjz2,tz);
2090 /**************************
2091 * CALCULATE INTERACTIONS *
2092 **************************/
2094 if (gmx_mm_any_lt(rsq10,rcutoff2))
2097 r10 = _mm_mul_ps(rsq10,rinv10);
2099 /* EWALD ELECTROSTATICS */
2101 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2102 ewrt = _mm_mul_ps(r10,ewtabscale);
2103 ewitab = _mm_cvttps_epi32(ewrt);
2104 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2105 ewitab = _mm_slli_epi32(ewitab,2);
2106 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2107 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2108 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2109 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2110 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2111 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2112 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2113 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2114 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2116 d = _mm_sub_ps(r10,rswitch);
2117 d = _mm_max_ps(d,_mm_setzero_ps());
2118 d2 = _mm_mul_ps(d,d);
2119 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2121 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2123 /* Evaluate switch function */
2124 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2125 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2126 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2130 fscal = _mm_and_ps(fscal,cutoff_mask);
2132 /* Calculate temporary vectorial force */
2133 tx = _mm_mul_ps(fscal,dx10);
2134 ty = _mm_mul_ps(fscal,dy10);
2135 tz = _mm_mul_ps(fscal,dz10);
2137 /* Update vectorial force */
2138 fix1 = _mm_add_ps(fix1,tx);
2139 fiy1 = _mm_add_ps(fiy1,ty);
2140 fiz1 = _mm_add_ps(fiz1,tz);
2142 fjx0 = _mm_add_ps(fjx0,tx);
2143 fjy0 = _mm_add_ps(fjy0,ty);
2144 fjz0 = _mm_add_ps(fjz0,tz);
2148 /**************************
2149 * CALCULATE INTERACTIONS *
2150 **************************/
2152 if (gmx_mm_any_lt(rsq11,rcutoff2))
2155 r11 = _mm_mul_ps(rsq11,rinv11);
2157 /* EWALD ELECTROSTATICS */
2159 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2160 ewrt = _mm_mul_ps(r11,ewtabscale);
2161 ewitab = _mm_cvttps_epi32(ewrt);
2162 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2163 ewitab = _mm_slli_epi32(ewitab,2);
2164 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2165 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2166 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2167 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2168 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2169 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2170 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2171 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2172 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2174 d = _mm_sub_ps(r11,rswitch);
2175 d = _mm_max_ps(d,_mm_setzero_ps());
2176 d2 = _mm_mul_ps(d,d);
2177 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2179 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2181 /* Evaluate switch function */
2182 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2183 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2184 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2188 fscal = _mm_and_ps(fscal,cutoff_mask);
2190 /* Calculate temporary vectorial force */
2191 tx = _mm_mul_ps(fscal,dx11);
2192 ty = _mm_mul_ps(fscal,dy11);
2193 tz = _mm_mul_ps(fscal,dz11);
2195 /* Update vectorial force */
2196 fix1 = _mm_add_ps(fix1,tx);
2197 fiy1 = _mm_add_ps(fiy1,ty);
2198 fiz1 = _mm_add_ps(fiz1,tz);
2200 fjx1 = _mm_add_ps(fjx1,tx);
2201 fjy1 = _mm_add_ps(fjy1,ty);
2202 fjz1 = _mm_add_ps(fjz1,tz);
2206 /**************************
2207 * CALCULATE INTERACTIONS *
2208 **************************/
2210 if (gmx_mm_any_lt(rsq12,rcutoff2))
2213 r12 = _mm_mul_ps(rsq12,rinv12);
2215 /* EWALD ELECTROSTATICS */
2217 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2218 ewrt = _mm_mul_ps(r12,ewtabscale);
2219 ewitab = _mm_cvttps_epi32(ewrt);
2220 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2221 ewitab = _mm_slli_epi32(ewitab,2);
2222 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2223 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2224 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2225 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2226 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2227 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2228 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2229 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2230 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2232 d = _mm_sub_ps(r12,rswitch);
2233 d = _mm_max_ps(d,_mm_setzero_ps());
2234 d2 = _mm_mul_ps(d,d);
2235 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2237 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2239 /* Evaluate switch function */
2240 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2241 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2242 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2246 fscal = _mm_and_ps(fscal,cutoff_mask);
2248 /* Calculate temporary vectorial force */
2249 tx = _mm_mul_ps(fscal,dx12);
2250 ty = _mm_mul_ps(fscal,dy12);
2251 tz = _mm_mul_ps(fscal,dz12);
2253 /* Update vectorial force */
2254 fix1 = _mm_add_ps(fix1,tx);
2255 fiy1 = _mm_add_ps(fiy1,ty);
2256 fiz1 = _mm_add_ps(fiz1,tz);
2258 fjx2 = _mm_add_ps(fjx2,tx);
2259 fjy2 = _mm_add_ps(fjy2,ty);
2260 fjz2 = _mm_add_ps(fjz2,tz);
2264 /**************************
2265 * CALCULATE INTERACTIONS *
2266 **************************/
2268 if (gmx_mm_any_lt(rsq20,rcutoff2))
2271 r20 = _mm_mul_ps(rsq20,rinv20);
2273 /* EWALD ELECTROSTATICS */
2275 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2276 ewrt = _mm_mul_ps(r20,ewtabscale);
2277 ewitab = _mm_cvttps_epi32(ewrt);
2278 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2279 ewitab = _mm_slli_epi32(ewitab,2);
2280 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2281 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2282 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2283 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2284 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2285 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2286 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2287 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2288 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2290 d = _mm_sub_ps(r20,rswitch);
2291 d = _mm_max_ps(d,_mm_setzero_ps());
2292 d2 = _mm_mul_ps(d,d);
2293 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2295 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2297 /* Evaluate switch function */
2298 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2299 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2300 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2304 fscal = _mm_and_ps(fscal,cutoff_mask);
2306 /* Calculate temporary vectorial force */
2307 tx = _mm_mul_ps(fscal,dx20);
2308 ty = _mm_mul_ps(fscal,dy20);
2309 tz = _mm_mul_ps(fscal,dz20);
2311 /* Update vectorial force */
2312 fix2 = _mm_add_ps(fix2,tx);
2313 fiy2 = _mm_add_ps(fiy2,ty);
2314 fiz2 = _mm_add_ps(fiz2,tz);
2316 fjx0 = _mm_add_ps(fjx0,tx);
2317 fjy0 = _mm_add_ps(fjy0,ty);
2318 fjz0 = _mm_add_ps(fjz0,tz);
2322 /**************************
2323 * CALCULATE INTERACTIONS *
2324 **************************/
2326 if (gmx_mm_any_lt(rsq21,rcutoff2))
2329 r21 = _mm_mul_ps(rsq21,rinv21);
2331 /* EWALD ELECTROSTATICS */
2333 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2334 ewrt = _mm_mul_ps(r21,ewtabscale);
2335 ewitab = _mm_cvttps_epi32(ewrt);
2336 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2337 ewitab = _mm_slli_epi32(ewitab,2);
2338 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2339 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2340 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2341 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2342 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2343 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2344 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2345 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2346 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2348 d = _mm_sub_ps(r21,rswitch);
2349 d = _mm_max_ps(d,_mm_setzero_ps());
2350 d2 = _mm_mul_ps(d,d);
2351 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2353 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2355 /* Evaluate switch function */
2356 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2357 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2358 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2362 fscal = _mm_and_ps(fscal,cutoff_mask);
2364 /* Calculate temporary vectorial force */
2365 tx = _mm_mul_ps(fscal,dx21);
2366 ty = _mm_mul_ps(fscal,dy21);
2367 tz = _mm_mul_ps(fscal,dz21);
2369 /* Update vectorial force */
2370 fix2 = _mm_add_ps(fix2,tx);
2371 fiy2 = _mm_add_ps(fiy2,ty);
2372 fiz2 = _mm_add_ps(fiz2,tz);
2374 fjx1 = _mm_add_ps(fjx1,tx);
2375 fjy1 = _mm_add_ps(fjy1,ty);
2376 fjz1 = _mm_add_ps(fjz1,tz);
2380 /**************************
2381 * CALCULATE INTERACTIONS *
2382 **************************/
2384 if (gmx_mm_any_lt(rsq22,rcutoff2))
2387 r22 = _mm_mul_ps(rsq22,rinv22);
2389 /* EWALD ELECTROSTATICS */
2391 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2392 ewrt = _mm_mul_ps(r22,ewtabscale);
2393 ewitab = _mm_cvttps_epi32(ewrt);
2394 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2395 ewitab = _mm_slli_epi32(ewitab,2);
2396 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2397 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2398 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2399 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2400 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2401 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2402 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2403 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2404 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2406 d = _mm_sub_ps(r22,rswitch);
2407 d = _mm_max_ps(d,_mm_setzero_ps());
2408 d2 = _mm_mul_ps(d,d);
2409 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2411 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2413 /* Evaluate switch function */
2414 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2415 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2416 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2420 fscal = _mm_and_ps(fscal,cutoff_mask);
2422 /* Calculate temporary vectorial force */
2423 tx = _mm_mul_ps(fscal,dx22);
2424 ty = _mm_mul_ps(fscal,dy22);
2425 tz = _mm_mul_ps(fscal,dz22);
2427 /* Update vectorial force */
2428 fix2 = _mm_add_ps(fix2,tx);
2429 fiy2 = _mm_add_ps(fiy2,ty);
2430 fiz2 = _mm_add_ps(fiz2,tz);
2432 fjx2 = _mm_add_ps(fjx2,tx);
2433 fjy2 = _mm_add_ps(fjy2,ty);
2434 fjz2 = _mm_add_ps(fjz2,tz);
2438 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2439 f+j_coord_offsetC,f+j_coord_offsetD,
2440 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2442 /* Inner loop uses 573 flops */
2445 if(jidx<j_index_end)
2448 /* Get j neighbor index, and coordinate index */
2450 jnrB = jjnr[jidx+1];
2451 jnrC = jjnr[jidx+2];
2452 jnrD = jjnr[jidx+3];
2454 /* Sign of each element will be negative for non-real atoms.
2455 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2456 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2458 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2459 jnrA = (jnrA>=0) ? jnrA : 0;
2460 jnrB = (jnrB>=0) ? jnrB : 0;
2461 jnrC = (jnrC>=0) ? jnrC : 0;
2462 jnrD = (jnrD>=0) ? jnrD : 0;
2464 j_coord_offsetA = DIM*jnrA;
2465 j_coord_offsetB = DIM*jnrB;
2466 j_coord_offsetC = DIM*jnrC;
2467 j_coord_offsetD = DIM*jnrD;
2469 /* load j atom coordinates */
2470 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2471 x+j_coord_offsetC,x+j_coord_offsetD,
2472 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2474 /* Calculate displacement vector */
2475 dx00 = _mm_sub_ps(ix0,jx0);
2476 dy00 = _mm_sub_ps(iy0,jy0);
2477 dz00 = _mm_sub_ps(iz0,jz0);
2478 dx01 = _mm_sub_ps(ix0,jx1);
2479 dy01 = _mm_sub_ps(iy0,jy1);
2480 dz01 = _mm_sub_ps(iz0,jz1);
2481 dx02 = _mm_sub_ps(ix0,jx2);
2482 dy02 = _mm_sub_ps(iy0,jy2);
2483 dz02 = _mm_sub_ps(iz0,jz2);
2484 dx10 = _mm_sub_ps(ix1,jx0);
2485 dy10 = _mm_sub_ps(iy1,jy0);
2486 dz10 = _mm_sub_ps(iz1,jz0);
2487 dx11 = _mm_sub_ps(ix1,jx1);
2488 dy11 = _mm_sub_ps(iy1,jy1);
2489 dz11 = _mm_sub_ps(iz1,jz1);
2490 dx12 = _mm_sub_ps(ix1,jx2);
2491 dy12 = _mm_sub_ps(iy1,jy2);
2492 dz12 = _mm_sub_ps(iz1,jz2);
2493 dx20 = _mm_sub_ps(ix2,jx0);
2494 dy20 = _mm_sub_ps(iy2,jy0);
2495 dz20 = _mm_sub_ps(iz2,jz0);
2496 dx21 = _mm_sub_ps(ix2,jx1);
2497 dy21 = _mm_sub_ps(iy2,jy1);
2498 dz21 = _mm_sub_ps(iz2,jz1);
2499 dx22 = _mm_sub_ps(ix2,jx2);
2500 dy22 = _mm_sub_ps(iy2,jy2);
2501 dz22 = _mm_sub_ps(iz2,jz2);
2503 /* Calculate squared distance and things based on it */
2504 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2505 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2506 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2507 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2508 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2509 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2510 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2511 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2512 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2514 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2515 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2516 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2517 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2518 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2519 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2520 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2521 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2522 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2524 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2525 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2526 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2527 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2528 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2529 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2530 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2531 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2532 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2534 fjx0 = _mm_setzero_ps();
2535 fjy0 = _mm_setzero_ps();
2536 fjz0 = _mm_setzero_ps();
2537 fjx1 = _mm_setzero_ps();
2538 fjy1 = _mm_setzero_ps();
2539 fjz1 = _mm_setzero_ps();
2540 fjx2 = _mm_setzero_ps();
2541 fjy2 = _mm_setzero_ps();
2542 fjz2 = _mm_setzero_ps();
2544 /**************************
2545 * CALCULATE INTERACTIONS *
2546 **************************/
2548 if (gmx_mm_any_lt(rsq00,rcutoff2))
2551 r00 = _mm_mul_ps(rsq00,rinv00);
2552 r00 = _mm_andnot_ps(dummy_mask,r00);
2554 /* EWALD ELECTROSTATICS */
2556 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2557 ewrt = _mm_mul_ps(r00,ewtabscale);
2558 ewitab = _mm_cvttps_epi32(ewrt);
2559 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2560 ewitab = _mm_slli_epi32(ewitab,2);
2561 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2562 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2563 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2564 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2565 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2566 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2567 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2568 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2569 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2571 /* LENNARD-JONES DISPERSION/REPULSION */
2573 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2574 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2575 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2576 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2577 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2579 d = _mm_sub_ps(r00,rswitch);
2580 d = _mm_max_ps(d,_mm_setzero_ps());
2581 d2 = _mm_mul_ps(d,d);
2582 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2584 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2586 /* Evaluate switch function */
2587 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2588 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2589 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2590 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2592 fscal = _mm_add_ps(felec,fvdw);
2594 fscal = _mm_and_ps(fscal,cutoff_mask);
2596 fscal = _mm_andnot_ps(dummy_mask,fscal);
2598 /* Calculate temporary vectorial force */
2599 tx = _mm_mul_ps(fscal,dx00);
2600 ty = _mm_mul_ps(fscal,dy00);
2601 tz = _mm_mul_ps(fscal,dz00);
2603 /* Update vectorial force */
2604 fix0 = _mm_add_ps(fix0,tx);
2605 fiy0 = _mm_add_ps(fiy0,ty);
2606 fiz0 = _mm_add_ps(fiz0,tz);
2608 fjx0 = _mm_add_ps(fjx0,tx);
2609 fjy0 = _mm_add_ps(fjy0,ty);
2610 fjz0 = _mm_add_ps(fjz0,tz);
2614 /**************************
2615 * CALCULATE INTERACTIONS *
2616 **************************/
2618 if (gmx_mm_any_lt(rsq01,rcutoff2))
2621 r01 = _mm_mul_ps(rsq01,rinv01);
2622 r01 = _mm_andnot_ps(dummy_mask,r01);
2624 /* EWALD ELECTROSTATICS */
2626 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2627 ewrt = _mm_mul_ps(r01,ewtabscale);
2628 ewitab = _mm_cvttps_epi32(ewrt);
2629 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2630 ewitab = _mm_slli_epi32(ewitab,2);
2631 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2632 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2633 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2634 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2635 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2636 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2637 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2638 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2639 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2641 d = _mm_sub_ps(r01,rswitch);
2642 d = _mm_max_ps(d,_mm_setzero_ps());
2643 d2 = _mm_mul_ps(d,d);
2644 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2646 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2648 /* Evaluate switch function */
2649 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2650 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2651 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2655 fscal = _mm_and_ps(fscal,cutoff_mask);
2657 fscal = _mm_andnot_ps(dummy_mask,fscal);
2659 /* Calculate temporary vectorial force */
2660 tx = _mm_mul_ps(fscal,dx01);
2661 ty = _mm_mul_ps(fscal,dy01);
2662 tz = _mm_mul_ps(fscal,dz01);
2664 /* Update vectorial force */
2665 fix0 = _mm_add_ps(fix0,tx);
2666 fiy0 = _mm_add_ps(fiy0,ty);
2667 fiz0 = _mm_add_ps(fiz0,tz);
2669 fjx1 = _mm_add_ps(fjx1,tx);
2670 fjy1 = _mm_add_ps(fjy1,ty);
2671 fjz1 = _mm_add_ps(fjz1,tz);
2675 /**************************
2676 * CALCULATE INTERACTIONS *
2677 **************************/
2679 if (gmx_mm_any_lt(rsq02,rcutoff2))
2682 r02 = _mm_mul_ps(rsq02,rinv02);
2683 r02 = _mm_andnot_ps(dummy_mask,r02);
2685 /* EWALD ELECTROSTATICS */
2687 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2688 ewrt = _mm_mul_ps(r02,ewtabscale);
2689 ewitab = _mm_cvttps_epi32(ewrt);
2690 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2691 ewitab = _mm_slli_epi32(ewitab,2);
2692 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2693 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2694 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2695 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2696 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2697 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2698 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2699 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2700 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2702 d = _mm_sub_ps(r02,rswitch);
2703 d = _mm_max_ps(d,_mm_setzero_ps());
2704 d2 = _mm_mul_ps(d,d);
2705 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2707 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2709 /* Evaluate switch function */
2710 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2711 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2712 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2716 fscal = _mm_and_ps(fscal,cutoff_mask);
2718 fscal = _mm_andnot_ps(dummy_mask,fscal);
2720 /* Calculate temporary vectorial force */
2721 tx = _mm_mul_ps(fscal,dx02);
2722 ty = _mm_mul_ps(fscal,dy02);
2723 tz = _mm_mul_ps(fscal,dz02);
2725 /* Update vectorial force */
2726 fix0 = _mm_add_ps(fix0,tx);
2727 fiy0 = _mm_add_ps(fiy0,ty);
2728 fiz0 = _mm_add_ps(fiz0,tz);
2730 fjx2 = _mm_add_ps(fjx2,tx);
2731 fjy2 = _mm_add_ps(fjy2,ty);
2732 fjz2 = _mm_add_ps(fjz2,tz);
2736 /**************************
2737 * CALCULATE INTERACTIONS *
2738 **************************/
2740 if (gmx_mm_any_lt(rsq10,rcutoff2))
2743 r10 = _mm_mul_ps(rsq10,rinv10);
2744 r10 = _mm_andnot_ps(dummy_mask,r10);
2746 /* EWALD ELECTROSTATICS */
2748 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2749 ewrt = _mm_mul_ps(r10,ewtabscale);
2750 ewitab = _mm_cvttps_epi32(ewrt);
2751 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2752 ewitab = _mm_slli_epi32(ewitab,2);
2753 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2754 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2755 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2756 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2757 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2758 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2759 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2760 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2761 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2763 d = _mm_sub_ps(r10,rswitch);
2764 d = _mm_max_ps(d,_mm_setzero_ps());
2765 d2 = _mm_mul_ps(d,d);
2766 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2768 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2770 /* Evaluate switch function */
2771 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2772 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2773 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2777 fscal = _mm_and_ps(fscal,cutoff_mask);
2779 fscal = _mm_andnot_ps(dummy_mask,fscal);
2781 /* Calculate temporary vectorial force */
2782 tx = _mm_mul_ps(fscal,dx10);
2783 ty = _mm_mul_ps(fscal,dy10);
2784 tz = _mm_mul_ps(fscal,dz10);
2786 /* Update vectorial force */
2787 fix1 = _mm_add_ps(fix1,tx);
2788 fiy1 = _mm_add_ps(fiy1,ty);
2789 fiz1 = _mm_add_ps(fiz1,tz);
2791 fjx0 = _mm_add_ps(fjx0,tx);
2792 fjy0 = _mm_add_ps(fjy0,ty);
2793 fjz0 = _mm_add_ps(fjz0,tz);
2797 /**************************
2798 * CALCULATE INTERACTIONS *
2799 **************************/
2801 if (gmx_mm_any_lt(rsq11,rcutoff2))
2804 r11 = _mm_mul_ps(rsq11,rinv11);
2805 r11 = _mm_andnot_ps(dummy_mask,r11);
2807 /* EWALD ELECTROSTATICS */
2809 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2810 ewrt = _mm_mul_ps(r11,ewtabscale);
2811 ewitab = _mm_cvttps_epi32(ewrt);
2812 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2813 ewitab = _mm_slli_epi32(ewitab,2);
2814 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2815 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2816 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2817 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2818 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2819 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2820 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2821 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2822 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2824 d = _mm_sub_ps(r11,rswitch);
2825 d = _mm_max_ps(d,_mm_setzero_ps());
2826 d2 = _mm_mul_ps(d,d);
2827 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2829 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2831 /* Evaluate switch function */
2832 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2833 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2834 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2838 fscal = _mm_and_ps(fscal,cutoff_mask);
2840 fscal = _mm_andnot_ps(dummy_mask,fscal);
2842 /* Calculate temporary vectorial force */
2843 tx = _mm_mul_ps(fscal,dx11);
2844 ty = _mm_mul_ps(fscal,dy11);
2845 tz = _mm_mul_ps(fscal,dz11);
2847 /* Update vectorial force */
2848 fix1 = _mm_add_ps(fix1,tx);
2849 fiy1 = _mm_add_ps(fiy1,ty);
2850 fiz1 = _mm_add_ps(fiz1,tz);
2852 fjx1 = _mm_add_ps(fjx1,tx);
2853 fjy1 = _mm_add_ps(fjy1,ty);
2854 fjz1 = _mm_add_ps(fjz1,tz);
2858 /**************************
2859 * CALCULATE INTERACTIONS *
2860 **************************/
2862 if (gmx_mm_any_lt(rsq12,rcutoff2))
2865 r12 = _mm_mul_ps(rsq12,rinv12);
2866 r12 = _mm_andnot_ps(dummy_mask,r12);
2868 /* EWALD ELECTROSTATICS */
2870 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2871 ewrt = _mm_mul_ps(r12,ewtabscale);
2872 ewitab = _mm_cvttps_epi32(ewrt);
2873 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2874 ewitab = _mm_slli_epi32(ewitab,2);
2875 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2876 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2877 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2878 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2879 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2880 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2881 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2882 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2883 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2885 d = _mm_sub_ps(r12,rswitch);
2886 d = _mm_max_ps(d,_mm_setzero_ps());
2887 d2 = _mm_mul_ps(d,d);
2888 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2890 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2892 /* Evaluate switch function */
2893 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2894 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2895 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2899 fscal = _mm_and_ps(fscal,cutoff_mask);
2901 fscal = _mm_andnot_ps(dummy_mask,fscal);
2903 /* Calculate temporary vectorial force */
2904 tx = _mm_mul_ps(fscal,dx12);
2905 ty = _mm_mul_ps(fscal,dy12);
2906 tz = _mm_mul_ps(fscal,dz12);
2908 /* Update vectorial force */
2909 fix1 = _mm_add_ps(fix1,tx);
2910 fiy1 = _mm_add_ps(fiy1,ty);
2911 fiz1 = _mm_add_ps(fiz1,tz);
2913 fjx2 = _mm_add_ps(fjx2,tx);
2914 fjy2 = _mm_add_ps(fjy2,ty);
2915 fjz2 = _mm_add_ps(fjz2,tz);
2919 /**************************
2920 * CALCULATE INTERACTIONS *
2921 **************************/
2923 if (gmx_mm_any_lt(rsq20,rcutoff2))
2926 r20 = _mm_mul_ps(rsq20,rinv20);
2927 r20 = _mm_andnot_ps(dummy_mask,r20);
2929 /* EWALD ELECTROSTATICS */
2931 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2932 ewrt = _mm_mul_ps(r20,ewtabscale);
2933 ewitab = _mm_cvttps_epi32(ewrt);
2934 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2935 ewitab = _mm_slli_epi32(ewitab,2);
2936 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2937 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2938 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2939 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2940 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2941 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2942 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2943 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2944 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2946 d = _mm_sub_ps(r20,rswitch);
2947 d = _mm_max_ps(d,_mm_setzero_ps());
2948 d2 = _mm_mul_ps(d,d);
2949 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2951 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2953 /* Evaluate switch function */
2954 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2955 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2956 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2960 fscal = _mm_and_ps(fscal,cutoff_mask);
2962 fscal = _mm_andnot_ps(dummy_mask,fscal);
2964 /* Calculate temporary vectorial force */
2965 tx = _mm_mul_ps(fscal,dx20);
2966 ty = _mm_mul_ps(fscal,dy20);
2967 tz = _mm_mul_ps(fscal,dz20);
2969 /* Update vectorial force */
2970 fix2 = _mm_add_ps(fix2,tx);
2971 fiy2 = _mm_add_ps(fiy2,ty);
2972 fiz2 = _mm_add_ps(fiz2,tz);
2974 fjx0 = _mm_add_ps(fjx0,tx);
2975 fjy0 = _mm_add_ps(fjy0,ty);
2976 fjz0 = _mm_add_ps(fjz0,tz);
2980 /**************************
2981 * CALCULATE INTERACTIONS *
2982 **************************/
2984 if (gmx_mm_any_lt(rsq21,rcutoff2))
2987 r21 = _mm_mul_ps(rsq21,rinv21);
2988 r21 = _mm_andnot_ps(dummy_mask,r21);
2990 /* EWALD ELECTROSTATICS */
2992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2993 ewrt = _mm_mul_ps(r21,ewtabscale);
2994 ewitab = _mm_cvttps_epi32(ewrt);
2995 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2996 ewitab = _mm_slli_epi32(ewitab,2);
2997 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2998 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2999 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3000 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3001 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3002 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3003 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3004 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3005 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3007 d = _mm_sub_ps(r21,rswitch);
3008 d = _mm_max_ps(d,_mm_setzero_ps());
3009 d2 = _mm_mul_ps(d,d);
3010 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3012 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3014 /* Evaluate switch function */
3015 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3016 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3017 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3021 fscal = _mm_and_ps(fscal,cutoff_mask);
3023 fscal = _mm_andnot_ps(dummy_mask,fscal);
3025 /* Calculate temporary vectorial force */
3026 tx = _mm_mul_ps(fscal,dx21);
3027 ty = _mm_mul_ps(fscal,dy21);
3028 tz = _mm_mul_ps(fscal,dz21);
3030 /* Update vectorial force */
3031 fix2 = _mm_add_ps(fix2,tx);
3032 fiy2 = _mm_add_ps(fiy2,ty);
3033 fiz2 = _mm_add_ps(fiz2,tz);
3035 fjx1 = _mm_add_ps(fjx1,tx);
3036 fjy1 = _mm_add_ps(fjy1,ty);
3037 fjz1 = _mm_add_ps(fjz1,tz);
3041 /**************************
3042 * CALCULATE INTERACTIONS *
3043 **************************/
3045 if (gmx_mm_any_lt(rsq22,rcutoff2))
3048 r22 = _mm_mul_ps(rsq22,rinv22);
3049 r22 = _mm_andnot_ps(dummy_mask,r22);
3051 /* EWALD ELECTROSTATICS */
3053 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3054 ewrt = _mm_mul_ps(r22,ewtabscale);
3055 ewitab = _mm_cvttps_epi32(ewrt);
3056 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3057 ewitab = _mm_slli_epi32(ewitab,2);
3058 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3059 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3060 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3061 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3062 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3063 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3064 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3065 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3066 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3068 d = _mm_sub_ps(r22,rswitch);
3069 d = _mm_max_ps(d,_mm_setzero_ps());
3070 d2 = _mm_mul_ps(d,d);
3071 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3073 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3075 /* Evaluate switch function */
3076 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3077 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3078 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3082 fscal = _mm_and_ps(fscal,cutoff_mask);
3084 fscal = _mm_andnot_ps(dummy_mask,fscal);
3086 /* Calculate temporary vectorial force */
3087 tx = _mm_mul_ps(fscal,dx22);
3088 ty = _mm_mul_ps(fscal,dy22);
3089 tz = _mm_mul_ps(fscal,dz22);
3091 /* Update vectorial force */
3092 fix2 = _mm_add_ps(fix2,tx);
3093 fiy2 = _mm_add_ps(fiy2,ty);
3094 fiz2 = _mm_add_ps(fiz2,tz);
3096 fjx2 = _mm_add_ps(fjx2,tx);
3097 fjy2 = _mm_add_ps(fjy2,ty);
3098 fjz2 = _mm_add_ps(fjz2,tz);
3102 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
3103 f+j_coord_offsetC,f+j_coord_offsetD,
3104 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3106 /* Inner loop uses 582 flops */
3109 /* End of innermost loop */
3111 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3112 f+i_coord_offset,fshift+i_shift_offset);
3114 /* Increment number of inner iterations */
3115 inneriter += j_index_end - j_index_start;
3117 /* Outer loop uses 27 flops */
3120 /* Increment number of outer iterations */
3123 /* Update outer/inner flops */
3125 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*582);