2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
99 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
100 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
101 real rswitch_scalar,d_scalar;
102 __m128 dummy_mask,cutoff_mask;
103 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
104 __m128 one = _mm_set1_ps(1.0);
105 __m128 two = _mm_set1_ps(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm_set1_ps(fr->epsfac);
118 charge = mdatoms->chargeA;
119 krf = _mm_set1_ps(fr->ic->k_rf);
120 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
121 crf = _mm_set1_ps(fr->ic->c_rf);
122 nvdwtype = fr->ntype;
124 vdwtype = mdatoms->typeA;
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
129 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
130 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
133 jq1 = _mm_set1_ps(charge[inr+1]);
134 jq2 = _mm_set1_ps(charge[inr+2]);
135 jq3 = _mm_set1_ps(charge[inr+3]);
136 vdwjidx0A = 2*vdwtype[inr+0];
137 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
138 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
139 qq11 = _mm_mul_ps(iq1,jq1);
140 qq12 = _mm_mul_ps(iq1,jq2);
141 qq13 = _mm_mul_ps(iq1,jq3);
142 qq21 = _mm_mul_ps(iq2,jq1);
143 qq22 = _mm_mul_ps(iq2,jq2);
144 qq23 = _mm_mul_ps(iq2,jq3);
145 qq31 = _mm_mul_ps(iq3,jq1);
146 qq32 = _mm_mul_ps(iq3,jq2);
147 qq33 = _mm_mul_ps(iq3,jq3);
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar = fr->rcoulomb;
151 rcutoff = _mm_set1_ps(rcutoff_scalar);
152 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
154 rswitch_scalar = fr->rvdw_switch;
155 rswitch = _mm_set1_ps(rswitch_scalar);
156 /* Setup switch parameters */
157 d_scalar = rcutoff_scalar-rswitch_scalar;
158 d = _mm_set1_ps(d_scalar);
159 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
160 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
162 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
163 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
166 /* Avoid stupid compiler warnings */
167 jnrA = jnrB = jnrC = jnrD = 0;
176 /* Start outer loop over neighborlists */
177 for(iidx=0; iidx<nri; iidx++)
179 /* Load shift vector for this list */
180 i_shift_offset = DIM*shiftidx[iidx];
181 shX = shiftvec[i_shift_offset+XX];
182 shY = shiftvec[i_shift_offset+YY];
183 shZ = shiftvec[i_shift_offset+ZZ];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
195 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
196 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
197 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
198 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
199 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
200 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
201 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
202 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
203 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
204 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
205 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
207 fix0 = _mm_setzero_ps();
208 fiy0 = _mm_setzero_ps();
209 fiz0 = _mm_setzero_ps();
210 fix1 = _mm_setzero_ps();
211 fiy1 = _mm_setzero_ps();
212 fiz1 = _mm_setzero_ps();
213 fix2 = _mm_setzero_ps();
214 fiy2 = _mm_setzero_ps();
215 fiz2 = _mm_setzero_ps();
216 fix3 = _mm_setzero_ps();
217 fiy3 = _mm_setzero_ps();
218 fiz3 = _mm_setzero_ps();
220 /* Reset potential sums */
221 velecsum = _mm_setzero_ps();
222 vvdwsum = _mm_setzero_ps();
224 /* Start inner kernel loop */
225 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
228 /* Get j neighbor index, and coordinate index */
234 j_coord_offsetA = DIM*jnrA;
235 j_coord_offsetB = DIM*jnrB;
236 j_coord_offsetC = DIM*jnrC;
237 j_coord_offsetD = DIM*jnrD;
239 /* load j atom coordinates */
240 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
241 x+j_coord_offsetC,x+j_coord_offsetD,
242 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
243 &jy2,&jz2,&jx3,&jy3,&jz3);
245 /* Calculate displacement vector */
246 dx00 = _mm_sub_ps(ix0,jx0);
247 dy00 = _mm_sub_ps(iy0,jy0);
248 dz00 = _mm_sub_ps(iz0,jz0);
249 dx11 = _mm_sub_ps(ix1,jx1);
250 dy11 = _mm_sub_ps(iy1,jy1);
251 dz11 = _mm_sub_ps(iz1,jz1);
252 dx12 = _mm_sub_ps(ix1,jx2);
253 dy12 = _mm_sub_ps(iy1,jy2);
254 dz12 = _mm_sub_ps(iz1,jz2);
255 dx13 = _mm_sub_ps(ix1,jx3);
256 dy13 = _mm_sub_ps(iy1,jy3);
257 dz13 = _mm_sub_ps(iz1,jz3);
258 dx21 = _mm_sub_ps(ix2,jx1);
259 dy21 = _mm_sub_ps(iy2,jy1);
260 dz21 = _mm_sub_ps(iz2,jz1);
261 dx22 = _mm_sub_ps(ix2,jx2);
262 dy22 = _mm_sub_ps(iy2,jy2);
263 dz22 = _mm_sub_ps(iz2,jz2);
264 dx23 = _mm_sub_ps(ix2,jx3);
265 dy23 = _mm_sub_ps(iy2,jy3);
266 dz23 = _mm_sub_ps(iz2,jz3);
267 dx31 = _mm_sub_ps(ix3,jx1);
268 dy31 = _mm_sub_ps(iy3,jy1);
269 dz31 = _mm_sub_ps(iz3,jz1);
270 dx32 = _mm_sub_ps(ix3,jx2);
271 dy32 = _mm_sub_ps(iy3,jy2);
272 dz32 = _mm_sub_ps(iz3,jz2);
273 dx33 = _mm_sub_ps(ix3,jx3);
274 dy33 = _mm_sub_ps(iy3,jy3);
275 dz33 = _mm_sub_ps(iz3,jz3);
277 /* Calculate squared distance and things based on it */
278 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
279 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
280 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
281 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
282 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
283 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
284 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
285 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
286 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
287 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
289 rinv00 = gmx_mm_invsqrt_ps(rsq00);
290 rinv11 = gmx_mm_invsqrt_ps(rsq11);
291 rinv12 = gmx_mm_invsqrt_ps(rsq12);
292 rinv13 = gmx_mm_invsqrt_ps(rsq13);
293 rinv21 = gmx_mm_invsqrt_ps(rsq21);
294 rinv22 = gmx_mm_invsqrt_ps(rsq22);
295 rinv23 = gmx_mm_invsqrt_ps(rsq23);
296 rinv31 = gmx_mm_invsqrt_ps(rsq31);
297 rinv32 = gmx_mm_invsqrt_ps(rsq32);
298 rinv33 = gmx_mm_invsqrt_ps(rsq33);
300 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
301 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
302 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
303 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
304 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
305 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
306 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
307 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
308 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
309 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
311 fjx0 = _mm_setzero_ps();
312 fjy0 = _mm_setzero_ps();
313 fjz0 = _mm_setzero_ps();
314 fjx1 = _mm_setzero_ps();
315 fjy1 = _mm_setzero_ps();
316 fjz1 = _mm_setzero_ps();
317 fjx2 = _mm_setzero_ps();
318 fjy2 = _mm_setzero_ps();
319 fjz2 = _mm_setzero_ps();
320 fjx3 = _mm_setzero_ps();
321 fjy3 = _mm_setzero_ps();
322 fjz3 = _mm_setzero_ps();
324 /**************************
325 * CALCULATE INTERACTIONS *
326 **************************/
328 if (gmx_mm_any_lt(rsq00,rcutoff2))
331 r00 = _mm_mul_ps(rsq00,rinv00);
333 /* LENNARD-JONES DISPERSION/REPULSION */
335 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
336 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
337 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
338 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
339 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
341 d = _mm_sub_ps(r00,rswitch);
342 d = _mm_max_ps(d,_mm_setzero_ps());
343 d2 = _mm_mul_ps(d,d);
344 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
346 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
348 /* Evaluate switch function */
349 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
350 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
351 vvdw = _mm_mul_ps(vvdw,sw);
352 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
354 /* Update potential sum for this i atom from the interaction with this j atom. */
355 vvdw = _mm_and_ps(vvdw,cutoff_mask);
356 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
360 fscal = _mm_and_ps(fscal,cutoff_mask);
362 /* Calculate temporary vectorial force */
363 tx = _mm_mul_ps(fscal,dx00);
364 ty = _mm_mul_ps(fscal,dy00);
365 tz = _mm_mul_ps(fscal,dz00);
367 /* Update vectorial force */
368 fix0 = _mm_add_ps(fix0,tx);
369 fiy0 = _mm_add_ps(fiy0,ty);
370 fiz0 = _mm_add_ps(fiz0,tz);
372 fjx0 = _mm_add_ps(fjx0,tx);
373 fjy0 = _mm_add_ps(fjy0,ty);
374 fjz0 = _mm_add_ps(fjz0,tz);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 if (gmx_mm_any_lt(rsq11,rcutoff2))
385 /* REACTION-FIELD ELECTROSTATICS */
386 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
387 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
389 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
391 /* Update potential sum for this i atom from the interaction with this j atom. */
392 velec = _mm_and_ps(velec,cutoff_mask);
393 velecsum = _mm_add_ps(velecsum,velec);
397 fscal = _mm_and_ps(fscal,cutoff_mask);
399 /* Calculate temporary vectorial force */
400 tx = _mm_mul_ps(fscal,dx11);
401 ty = _mm_mul_ps(fscal,dy11);
402 tz = _mm_mul_ps(fscal,dz11);
404 /* Update vectorial force */
405 fix1 = _mm_add_ps(fix1,tx);
406 fiy1 = _mm_add_ps(fiy1,ty);
407 fiz1 = _mm_add_ps(fiz1,tz);
409 fjx1 = _mm_add_ps(fjx1,tx);
410 fjy1 = _mm_add_ps(fjy1,ty);
411 fjz1 = _mm_add_ps(fjz1,tz);
415 /**************************
416 * CALCULATE INTERACTIONS *
417 **************************/
419 if (gmx_mm_any_lt(rsq12,rcutoff2))
422 /* REACTION-FIELD ELECTROSTATICS */
423 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
424 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
426 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
428 /* Update potential sum for this i atom from the interaction with this j atom. */
429 velec = _mm_and_ps(velec,cutoff_mask);
430 velecsum = _mm_add_ps(velecsum,velec);
434 fscal = _mm_and_ps(fscal,cutoff_mask);
436 /* Calculate temporary vectorial force */
437 tx = _mm_mul_ps(fscal,dx12);
438 ty = _mm_mul_ps(fscal,dy12);
439 tz = _mm_mul_ps(fscal,dz12);
441 /* Update vectorial force */
442 fix1 = _mm_add_ps(fix1,tx);
443 fiy1 = _mm_add_ps(fiy1,ty);
444 fiz1 = _mm_add_ps(fiz1,tz);
446 fjx2 = _mm_add_ps(fjx2,tx);
447 fjy2 = _mm_add_ps(fjy2,ty);
448 fjz2 = _mm_add_ps(fjz2,tz);
452 /**************************
453 * CALCULATE INTERACTIONS *
454 **************************/
456 if (gmx_mm_any_lt(rsq13,rcutoff2))
459 /* REACTION-FIELD ELECTROSTATICS */
460 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
461 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
463 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
465 /* Update potential sum for this i atom from the interaction with this j atom. */
466 velec = _mm_and_ps(velec,cutoff_mask);
467 velecsum = _mm_add_ps(velecsum,velec);
471 fscal = _mm_and_ps(fscal,cutoff_mask);
473 /* Calculate temporary vectorial force */
474 tx = _mm_mul_ps(fscal,dx13);
475 ty = _mm_mul_ps(fscal,dy13);
476 tz = _mm_mul_ps(fscal,dz13);
478 /* Update vectorial force */
479 fix1 = _mm_add_ps(fix1,tx);
480 fiy1 = _mm_add_ps(fiy1,ty);
481 fiz1 = _mm_add_ps(fiz1,tz);
483 fjx3 = _mm_add_ps(fjx3,tx);
484 fjy3 = _mm_add_ps(fjy3,ty);
485 fjz3 = _mm_add_ps(fjz3,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 if (gmx_mm_any_lt(rsq21,rcutoff2))
496 /* REACTION-FIELD ELECTROSTATICS */
497 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
498 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
500 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
502 /* Update potential sum for this i atom from the interaction with this j atom. */
503 velec = _mm_and_ps(velec,cutoff_mask);
504 velecsum = _mm_add_ps(velecsum,velec);
508 fscal = _mm_and_ps(fscal,cutoff_mask);
510 /* Calculate temporary vectorial force */
511 tx = _mm_mul_ps(fscal,dx21);
512 ty = _mm_mul_ps(fscal,dy21);
513 tz = _mm_mul_ps(fscal,dz21);
515 /* Update vectorial force */
516 fix2 = _mm_add_ps(fix2,tx);
517 fiy2 = _mm_add_ps(fiy2,ty);
518 fiz2 = _mm_add_ps(fiz2,tz);
520 fjx1 = _mm_add_ps(fjx1,tx);
521 fjy1 = _mm_add_ps(fjy1,ty);
522 fjz1 = _mm_add_ps(fjz1,tz);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 if (gmx_mm_any_lt(rsq22,rcutoff2))
533 /* REACTION-FIELD ELECTROSTATICS */
534 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
535 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
537 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velec = _mm_and_ps(velec,cutoff_mask);
541 velecsum = _mm_add_ps(velecsum,velec);
545 fscal = _mm_and_ps(fscal,cutoff_mask);
547 /* Calculate temporary vectorial force */
548 tx = _mm_mul_ps(fscal,dx22);
549 ty = _mm_mul_ps(fscal,dy22);
550 tz = _mm_mul_ps(fscal,dz22);
552 /* Update vectorial force */
553 fix2 = _mm_add_ps(fix2,tx);
554 fiy2 = _mm_add_ps(fiy2,ty);
555 fiz2 = _mm_add_ps(fiz2,tz);
557 fjx2 = _mm_add_ps(fjx2,tx);
558 fjy2 = _mm_add_ps(fjy2,ty);
559 fjz2 = _mm_add_ps(fjz2,tz);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 if (gmx_mm_any_lt(rsq23,rcutoff2))
570 /* REACTION-FIELD ELECTROSTATICS */
571 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
572 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
574 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
576 /* Update potential sum for this i atom from the interaction with this j atom. */
577 velec = _mm_and_ps(velec,cutoff_mask);
578 velecsum = _mm_add_ps(velecsum,velec);
582 fscal = _mm_and_ps(fscal,cutoff_mask);
584 /* Calculate temporary vectorial force */
585 tx = _mm_mul_ps(fscal,dx23);
586 ty = _mm_mul_ps(fscal,dy23);
587 tz = _mm_mul_ps(fscal,dz23);
589 /* Update vectorial force */
590 fix2 = _mm_add_ps(fix2,tx);
591 fiy2 = _mm_add_ps(fiy2,ty);
592 fiz2 = _mm_add_ps(fiz2,tz);
594 fjx3 = _mm_add_ps(fjx3,tx);
595 fjy3 = _mm_add_ps(fjy3,ty);
596 fjz3 = _mm_add_ps(fjz3,tz);
600 /**************************
601 * CALCULATE INTERACTIONS *
602 **************************/
604 if (gmx_mm_any_lt(rsq31,rcutoff2))
607 /* REACTION-FIELD ELECTROSTATICS */
608 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
609 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
611 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
613 /* Update potential sum for this i atom from the interaction with this j atom. */
614 velec = _mm_and_ps(velec,cutoff_mask);
615 velecsum = _mm_add_ps(velecsum,velec);
619 fscal = _mm_and_ps(fscal,cutoff_mask);
621 /* Calculate temporary vectorial force */
622 tx = _mm_mul_ps(fscal,dx31);
623 ty = _mm_mul_ps(fscal,dy31);
624 tz = _mm_mul_ps(fscal,dz31);
626 /* Update vectorial force */
627 fix3 = _mm_add_ps(fix3,tx);
628 fiy3 = _mm_add_ps(fiy3,ty);
629 fiz3 = _mm_add_ps(fiz3,tz);
631 fjx1 = _mm_add_ps(fjx1,tx);
632 fjy1 = _mm_add_ps(fjy1,ty);
633 fjz1 = _mm_add_ps(fjz1,tz);
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 if (gmx_mm_any_lt(rsq32,rcutoff2))
644 /* REACTION-FIELD ELECTROSTATICS */
645 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
646 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
648 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
650 /* Update potential sum for this i atom from the interaction with this j atom. */
651 velec = _mm_and_ps(velec,cutoff_mask);
652 velecsum = _mm_add_ps(velecsum,velec);
656 fscal = _mm_and_ps(fscal,cutoff_mask);
658 /* Calculate temporary vectorial force */
659 tx = _mm_mul_ps(fscal,dx32);
660 ty = _mm_mul_ps(fscal,dy32);
661 tz = _mm_mul_ps(fscal,dz32);
663 /* Update vectorial force */
664 fix3 = _mm_add_ps(fix3,tx);
665 fiy3 = _mm_add_ps(fiy3,ty);
666 fiz3 = _mm_add_ps(fiz3,tz);
668 fjx2 = _mm_add_ps(fjx2,tx);
669 fjy2 = _mm_add_ps(fjy2,ty);
670 fjz2 = _mm_add_ps(fjz2,tz);
674 /**************************
675 * CALCULATE INTERACTIONS *
676 **************************/
678 if (gmx_mm_any_lt(rsq33,rcutoff2))
681 /* REACTION-FIELD ELECTROSTATICS */
682 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
683 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
685 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velec = _mm_and_ps(velec,cutoff_mask);
689 velecsum = _mm_add_ps(velecsum,velec);
693 fscal = _mm_and_ps(fscal,cutoff_mask);
695 /* Calculate temporary vectorial force */
696 tx = _mm_mul_ps(fscal,dx33);
697 ty = _mm_mul_ps(fscal,dy33);
698 tz = _mm_mul_ps(fscal,dz33);
700 /* Update vectorial force */
701 fix3 = _mm_add_ps(fix3,tx);
702 fiy3 = _mm_add_ps(fiy3,ty);
703 fiz3 = _mm_add_ps(fiz3,tz);
705 fjx3 = _mm_add_ps(fjx3,tx);
706 fjy3 = _mm_add_ps(fjy3,ty);
707 fjz3 = _mm_add_ps(fjz3,tz);
711 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
712 f+j_coord_offsetC,f+j_coord_offsetD,
713 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
714 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
716 /* Inner loop uses 386 flops */
722 /* Get j neighbor index, and coordinate index */
728 /* Sign of each element will be negative for non-real atoms.
729 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
730 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
732 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
733 jnrA = (jnrA>=0) ? jnrA : 0;
734 jnrB = (jnrB>=0) ? jnrB : 0;
735 jnrC = (jnrC>=0) ? jnrC : 0;
736 jnrD = (jnrD>=0) ? jnrD : 0;
738 j_coord_offsetA = DIM*jnrA;
739 j_coord_offsetB = DIM*jnrB;
740 j_coord_offsetC = DIM*jnrC;
741 j_coord_offsetD = DIM*jnrD;
743 /* load j atom coordinates */
744 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
745 x+j_coord_offsetC,x+j_coord_offsetD,
746 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
747 &jy2,&jz2,&jx3,&jy3,&jz3);
749 /* Calculate displacement vector */
750 dx00 = _mm_sub_ps(ix0,jx0);
751 dy00 = _mm_sub_ps(iy0,jy0);
752 dz00 = _mm_sub_ps(iz0,jz0);
753 dx11 = _mm_sub_ps(ix1,jx1);
754 dy11 = _mm_sub_ps(iy1,jy1);
755 dz11 = _mm_sub_ps(iz1,jz1);
756 dx12 = _mm_sub_ps(ix1,jx2);
757 dy12 = _mm_sub_ps(iy1,jy2);
758 dz12 = _mm_sub_ps(iz1,jz2);
759 dx13 = _mm_sub_ps(ix1,jx3);
760 dy13 = _mm_sub_ps(iy1,jy3);
761 dz13 = _mm_sub_ps(iz1,jz3);
762 dx21 = _mm_sub_ps(ix2,jx1);
763 dy21 = _mm_sub_ps(iy2,jy1);
764 dz21 = _mm_sub_ps(iz2,jz1);
765 dx22 = _mm_sub_ps(ix2,jx2);
766 dy22 = _mm_sub_ps(iy2,jy2);
767 dz22 = _mm_sub_ps(iz2,jz2);
768 dx23 = _mm_sub_ps(ix2,jx3);
769 dy23 = _mm_sub_ps(iy2,jy3);
770 dz23 = _mm_sub_ps(iz2,jz3);
771 dx31 = _mm_sub_ps(ix3,jx1);
772 dy31 = _mm_sub_ps(iy3,jy1);
773 dz31 = _mm_sub_ps(iz3,jz1);
774 dx32 = _mm_sub_ps(ix3,jx2);
775 dy32 = _mm_sub_ps(iy3,jy2);
776 dz32 = _mm_sub_ps(iz3,jz2);
777 dx33 = _mm_sub_ps(ix3,jx3);
778 dy33 = _mm_sub_ps(iy3,jy3);
779 dz33 = _mm_sub_ps(iz3,jz3);
781 /* Calculate squared distance and things based on it */
782 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
783 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
784 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
785 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
786 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
787 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
788 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
789 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
790 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
791 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
793 rinv00 = gmx_mm_invsqrt_ps(rsq00);
794 rinv11 = gmx_mm_invsqrt_ps(rsq11);
795 rinv12 = gmx_mm_invsqrt_ps(rsq12);
796 rinv13 = gmx_mm_invsqrt_ps(rsq13);
797 rinv21 = gmx_mm_invsqrt_ps(rsq21);
798 rinv22 = gmx_mm_invsqrt_ps(rsq22);
799 rinv23 = gmx_mm_invsqrt_ps(rsq23);
800 rinv31 = gmx_mm_invsqrt_ps(rsq31);
801 rinv32 = gmx_mm_invsqrt_ps(rsq32);
802 rinv33 = gmx_mm_invsqrt_ps(rsq33);
804 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
805 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
806 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
807 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
808 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
809 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
810 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
811 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
812 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
813 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
815 fjx0 = _mm_setzero_ps();
816 fjy0 = _mm_setzero_ps();
817 fjz0 = _mm_setzero_ps();
818 fjx1 = _mm_setzero_ps();
819 fjy1 = _mm_setzero_ps();
820 fjz1 = _mm_setzero_ps();
821 fjx2 = _mm_setzero_ps();
822 fjy2 = _mm_setzero_ps();
823 fjz2 = _mm_setzero_ps();
824 fjx3 = _mm_setzero_ps();
825 fjy3 = _mm_setzero_ps();
826 fjz3 = _mm_setzero_ps();
828 /**************************
829 * CALCULATE INTERACTIONS *
830 **************************/
832 if (gmx_mm_any_lt(rsq00,rcutoff2))
835 r00 = _mm_mul_ps(rsq00,rinv00);
836 r00 = _mm_andnot_ps(dummy_mask,r00);
838 /* LENNARD-JONES DISPERSION/REPULSION */
840 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
841 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
842 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
843 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
844 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
846 d = _mm_sub_ps(r00,rswitch);
847 d = _mm_max_ps(d,_mm_setzero_ps());
848 d2 = _mm_mul_ps(d,d);
849 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
851 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
853 /* Evaluate switch function */
854 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
855 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
856 vvdw = _mm_mul_ps(vvdw,sw);
857 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
859 /* Update potential sum for this i atom from the interaction with this j atom. */
860 vvdw = _mm_and_ps(vvdw,cutoff_mask);
861 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
862 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
866 fscal = _mm_and_ps(fscal,cutoff_mask);
868 fscal = _mm_andnot_ps(dummy_mask,fscal);
870 /* Calculate temporary vectorial force */
871 tx = _mm_mul_ps(fscal,dx00);
872 ty = _mm_mul_ps(fscal,dy00);
873 tz = _mm_mul_ps(fscal,dz00);
875 /* Update vectorial force */
876 fix0 = _mm_add_ps(fix0,tx);
877 fiy0 = _mm_add_ps(fiy0,ty);
878 fiz0 = _mm_add_ps(fiz0,tz);
880 fjx0 = _mm_add_ps(fjx0,tx);
881 fjy0 = _mm_add_ps(fjy0,ty);
882 fjz0 = _mm_add_ps(fjz0,tz);
886 /**************************
887 * CALCULATE INTERACTIONS *
888 **************************/
890 if (gmx_mm_any_lt(rsq11,rcutoff2))
893 /* REACTION-FIELD ELECTROSTATICS */
894 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
895 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
897 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
899 /* Update potential sum for this i atom from the interaction with this j atom. */
900 velec = _mm_and_ps(velec,cutoff_mask);
901 velec = _mm_andnot_ps(dummy_mask,velec);
902 velecsum = _mm_add_ps(velecsum,velec);
906 fscal = _mm_and_ps(fscal,cutoff_mask);
908 fscal = _mm_andnot_ps(dummy_mask,fscal);
910 /* Calculate temporary vectorial force */
911 tx = _mm_mul_ps(fscal,dx11);
912 ty = _mm_mul_ps(fscal,dy11);
913 tz = _mm_mul_ps(fscal,dz11);
915 /* Update vectorial force */
916 fix1 = _mm_add_ps(fix1,tx);
917 fiy1 = _mm_add_ps(fiy1,ty);
918 fiz1 = _mm_add_ps(fiz1,tz);
920 fjx1 = _mm_add_ps(fjx1,tx);
921 fjy1 = _mm_add_ps(fjy1,ty);
922 fjz1 = _mm_add_ps(fjz1,tz);
926 /**************************
927 * CALCULATE INTERACTIONS *
928 **************************/
930 if (gmx_mm_any_lt(rsq12,rcutoff2))
933 /* REACTION-FIELD ELECTROSTATICS */
934 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
935 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
937 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
939 /* Update potential sum for this i atom from the interaction with this j atom. */
940 velec = _mm_and_ps(velec,cutoff_mask);
941 velec = _mm_andnot_ps(dummy_mask,velec);
942 velecsum = _mm_add_ps(velecsum,velec);
946 fscal = _mm_and_ps(fscal,cutoff_mask);
948 fscal = _mm_andnot_ps(dummy_mask,fscal);
950 /* Calculate temporary vectorial force */
951 tx = _mm_mul_ps(fscal,dx12);
952 ty = _mm_mul_ps(fscal,dy12);
953 tz = _mm_mul_ps(fscal,dz12);
955 /* Update vectorial force */
956 fix1 = _mm_add_ps(fix1,tx);
957 fiy1 = _mm_add_ps(fiy1,ty);
958 fiz1 = _mm_add_ps(fiz1,tz);
960 fjx2 = _mm_add_ps(fjx2,tx);
961 fjy2 = _mm_add_ps(fjy2,ty);
962 fjz2 = _mm_add_ps(fjz2,tz);
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 if (gmx_mm_any_lt(rsq13,rcutoff2))
973 /* REACTION-FIELD ELECTROSTATICS */
974 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
975 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
977 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 velec = _mm_and_ps(velec,cutoff_mask);
981 velec = _mm_andnot_ps(dummy_mask,velec);
982 velecsum = _mm_add_ps(velecsum,velec);
986 fscal = _mm_and_ps(fscal,cutoff_mask);
988 fscal = _mm_andnot_ps(dummy_mask,fscal);
990 /* Calculate temporary vectorial force */
991 tx = _mm_mul_ps(fscal,dx13);
992 ty = _mm_mul_ps(fscal,dy13);
993 tz = _mm_mul_ps(fscal,dz13);
995 /* Update vectorial force */
996 fix1 = _mm_add_ps(fix1,tx);
997 fiy1 = _mm_add_ps(fiy1,ty);
998 fiz1 = _mm_add_ps(fiz1,tz);
1000 fjx3 = _mm_add_ps(fjx3,tx);
1001 fjy3 = _mm_add_ps(fjy3,ty);
1002 fjz3 = _mm_add_ps(fjz3,tz);
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 if (gmx_mm_any_lt(rsq21,rcutoff2))
1013 /* REACTION-FIELD ELECTROSTATICS */
1014 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1015 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1017 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1019 /* Update potential sum for this i atom from the interaction with this j atom. */
1020 velec = _mm_and_ps(velec,cutoff_mask);
1021 velec = _mm_andnot_ps(dummy_mask,velec);
1022 velecsum = _mm_add_ps(velecsum,velec);
1026 fscal = _mm_and_ps(fscal,cutoff_mask);
1028 fscal = _mm_andnot_ps(dummy_mask,fscal);
1030 /* Calculate temporary vectorial force */
1031 tx = _mm_mul_ps(fscal,dx21);
1032 ty = _mm_mul_ps(fscal,dy21);
1033 tz = _mm_mul_ps(fscal,dz21);
1035 /* Update vectorial force */
1036 fix2 = _mm_add_ps(fix2,tx);
1037 fiy2 = _mm_add_ps(fiy2,ty);
1038 fiz2 = _mm_add_ps(fiz2,tz);
1040 fjx1 = _mm_add_ps(fjx1,tx);
1041 fjy1 = _mm_add_ps(fjy1,ty);
1042 fjz1 = _mm_add_ps(fjz1,tz);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 if (gmx_mm_any_lt(rsq22,rcutoff2))
1053 /* REACTION-FIELD ELECTROSTATICS */
1054 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1055 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1057 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1059 /* Update potential sum for this i atom from the interaction with this j atom. */
1060 velec = _mm_and_ps(velec,cutoff_mask);
1061 velec = _mm_andnot_ps(dummy_mask,velec);
1062 velecsum = _mm_add_ps(velecsum,velec);
1066 fscal = _mm_and_ps(fscal,cutoff_mask);
1068 fscal = _mm_andnot_ps(dummy_mask,fscal);
1070 /* Calculate temporary vectorial force */
1071 tx = _mm_mul_ps(fscal,dx22);
1072 ty = _mm_mul_ps(fscal,dy22);
1073 tz = _mm_mul_ps(fscal,dz22);
1075 /* Update vectorial force */
1076 fix2 = _mm_add_ps(fix2,tx);
1077 fiy2 = _mm_add_ps(fiy2,ty);
1078 fiz2 = _mm_add_ps(fiz2,tz);
1080 fjx2 = _mm_add_ps(fjx2,tx);
1081 fjy2 = _mm_add_ps(fjy2,ty);
1082 fjz2 = _mm_add_ps(fjz2,tz);
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 if (gmx_mm_any_lt(rsq23,rcutoff2))
1093 /* REACTION-FIELD ELECTROSTATICS */
1094 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
1095 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1097 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1099 /* Update potential sum for this i atom from the interaction with this j atom. */
1100 velec = _mm_and_ps(velec,cutoff_mask);
1101 velec = _mm_andnot_ps(dummy_mask,velec);
1102 velecsum = _mm_add_ps(velecsum,velec);
1106 fscal = _mm_and_ps(fscal,cutoff_mask);
1108 fscal = _mm_andnot_ps(dummy_mask,fscal);
1110 /* Calculate temporary vectorial force */
1111 tx = _mm_mul_ps(fscal,dx23);
1112 ty = _mm_mul_ps(fscal,dy23);
1113 tz = _mm_mul_ps(fscal,dz23);
1115 /* Update vectorial force */
1116 fix2 = _mm_add_ps(fix2,tx);
1117 fiy2 = _mm_add_ps(fiy2,ty);
1118 fiz2 = _mm_add_ps(fiz2,tz);
1120 fjx3 = _mm_add_ps(fjx3,tx);
1121 fjy3 = _mm_add_ps(fjy3,ty);
1122 fjz3 = _mm_add_ps(fjz3,tz);
1126 /**************************
1127 * CALCULATE INTERACTIONS *
1128 **************************/
1130 if (gmx_mm_any_lt(rsq31,rcutoff2))
1133 /* REACTION-FIELD ELECTROSTATICS */
1134 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
1135 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1137 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1139 /* Update potential sum for this i atom from the interaction with this j atom. */
1140 velec = _mm_and_ps(velec,cutoff_mask);
1141 velec = _mm_andnot_ps(dummy_mask,velec);
1142 velecsum = _mm_add_ps(velecsum,velec);
1146 fscal = _mm_and_ps(fscal,cutoff_mask);
1148 fscal = _mm_andnot_ps(dummy_mask,fscal);
1150 /* Calculate temporary vectorial force */
1151 tx = _mm_mul_ps(fscal,dx31);
1152 ty = _mm_mul_ps(fscal,dy31);
1153 tz = _mm_mul_ps(fscal,dz31);
1155 /* Update vectorial force */
1156 fix3 = _mm_add_ps(fix3,tx);
1157 fiy3 = _mm_add_ps(fiy3,ty);
1158 fiz3 = _mm_add_ps(fiz3,tz);
1160 fjx1 = _mm_add_ps(fjx1,tx);
1161 fjy1 = _mm_add_ps(fjy1,ty);
1162 fjz1 = _mm_add_ps(fjz1,tz);
1166 /**************************
1167 * CALCULATE INTERACTIONS *
1168 **************************/
1170 if (gmx_mm_any_lt(rsq32,rcutoff2))
1173 /* REACTION-FIELD ELECTROSTATICS */
1174 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1175 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1177 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1179 /* Update potential sum for this i atom from the interaction with this j atom. */
1180 velec = _mm_and_ps(velec,cutoff_mask);
1181 velec = _mm_andnot_ps(dummy_mask,velec);
1182 velecsum = _mm_add_ps(velecsum,velec);
1186 fscal = _mm_and_ps(fscal,cutoff_mask);
1188 fscal = _mm_andnot_ps(dummy_mask,fscal);
1190 /* Calculate temporary vectorial force */
1191 tx = _mm_mul_ps(fscal,dx32);
1192 ty = _mm_mul_ps(fscal,dy32);
1193 tz = _mm_mul_ps(fscal,dz32);
1195 /* Update vectorial force */
1196 fix3 = _mm_add_ps(fix3,tx);
1197 fiy3 = _mm_add_ps(fiy3,ty);
1198 fiz3 = _mm_add_ps(fiz3,tz);
1200 fjx2 = _mm_add_ps(fjx2,tx);
1201 fjy2 = _mm_add_ps(fjy2,ty);
1202 fjz2 = _mm_add_ps(fjz2,tz);
1206 /**************************
1207 * CALCULATE INTERACTIONS *
1208 **************************/
1210 if (gmx_mm_any_lt(rsq33,rcutoff2))
1213 /* REACTION-FIELD ELECTROSTATICS */
1214 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1215 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1217 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1219 /* Update potential sum for this i atom from the interaction with this j atom. */
1220 velec = _mm_and_ps(velec,cutoff_mask);
1221 velec = _mm_andnot_ps(dummy_mask,velec);
1222 velecsum = _mm_add_ps(velecsum,velec);
1226 fscal = _mm_and_ps(fscal,cutoff_mask);
1228 fscal = _mm_andnot_ps(dummy_mask,fscal);
1230 /* Calculate temporary vectorial force */
1231 tx = _mm_mul_ps(fscal,dx33);
1232 ty = _mm_mul_ps(fscal,dy33);
1233 tz = _mm_mul_ps(fscal,dz33);
1235 /* Update vectorial force */
1236 fix3 = _mm_add_ps(fix3,tx);
1237 fiy3 = _mm_add_ps(fiy3,ty);
1238 fiz3 = _mm_add_ps(fiz3,tz);
1240 fjx3 = _mm_add_ps(fjx3,tx);
1241 fjy3 = _mm_add_ps(fjy3,ty);
1242 fjz3 = _mm_add_ps(fjz3,tz);
1246 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1247 f+j_coord_offsetC,f+j_coord_offsetD,
1248 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1249 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1251 /* Inner loop uses 387 flops */
1254 /* End of innermost loop */
1256 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1257 f+i_coord_offset,fshift+i_shift_offset);
1260 /* Update potential energies */
1261 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1262 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1264 /* Increment number of inner iterations */
1265 inneriter += j_index_end - j_index_start;
1267 /* Outer loop uses 38 flops */
1270 /* Increment number of outer iterations */
1273 /* Update outer/inner flops */
1275 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*387);
1278 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
1279 * Electrostatics interaction: ReactionField
1280 * VdW interaction: LennardJones
1281 * Geometry: Water4-Water4
1282 * Calculate force/pot: Force
1285 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
1286 (t_nblist * gmx_restrict nlist,
1287 rvec * gmx_restrict xx,
1288 rvec * gmx_restrict ff,
1289 t_forcerec * gmx_restrict fr,
1290 t_mdatoms * gmx_restrict mdatoms,
1291 nb_kernel_data_t * gmx_restrict kernel_data,
1292 t_nrnb * gmx_restrict nrnb)
1294 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1295 * just 0 for non-waters.
1296 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1297 * jnr indices corresponding to data put in the four positions in the SIMD register.
1299 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1300 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1301 int jnrA,jnrB,jnrC,jnrD;
1302 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1303 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1304 real shX,shY,shZ,rcutoff_scalar;
1305 real *shiftvec,*fshift,*x,*f;
1306 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1308 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1310 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1312 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1314 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1315 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1316 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1317 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1318 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1319 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1320 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1321 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1322 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1323 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1324 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1325 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1326 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1327 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1328 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1329 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1330 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1331 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1332 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1333 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1336 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1339 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1340 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1341 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1342 real rswitch_scalar,d_scalar;
1343 __m128 dummy_mask,cutoff_mask;
1344 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1345 __m128 one = _mm_set1_ps(1.0);
1346 __m128 two = _mm_set1_ps(2.0);
1352 jindex = nlist->jindex;
1354 shiftidx = nlist->shift;
1356 shiftvec = fr->shift_vec[0];
1357 fshift = fr->fshift[0];
1358 facel = _mm_set1_ps(fr->epsfac);
1359 charge = mdatoms->chargeA;
1360 krf = _mm_set1_ps(fr->ic->k_rf);
1361 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1362 crf = _mm_set1_ps(fr->ic->c_rf);
1363 nvdwtype = fr->ntype;
1364 vdwparam = fr->nbfp;
1365 vdwtype = mdatoms->typeA;
1367 /* Setup water-specific parameters */
1368 inr = nlist->iinr[0];
1369 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1370 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1371 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1372 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1374 jq1 = _mm_set1_ps(charge[inr+1]);
1375 jq2 = _mm_set1_ps(charge[inr+2]);
1376 jq3 = _mm_set1_ps(charge[inr+3]);
1377 vdwjidx0A = 2*vdwtype[inr+0];
1378 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1379 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1380 qq11 = _mm_mul_ps(iq1,jq1);
1381 qq12 = _mm_mul_ps(iq1,jq2);
1382 qq13 = _mm_mul_ps(iq1,jq3);
1383 qq21 = _mm_mul_ps(iq2,jq1);
1384 qq22 = _mm_mul_ps(iq2,jq2);
1385 qq23 = _mm_mul_ps(iq2,jq3);
1386 qq31 = _mm_mul_ps(iq3,jq1);
1387 qq32 = _mm_mul_ps(iq3,jq2);
1388 qq33 = _mm_mul_ps(iq3,jq3);
1390 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1391 rcutoff_scalar = fr->rcoulomb;
1392 rcutoff = _mm_set1_ps(rcutoff_scalar);
1393 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1395 rswitch_scalar = fr->rvdw_switch;
1396 rswitch = _mm_set1_ps(rswitch_scalar);
1397 /* Setup switch parameters */
1398 d_scalar = rcutoff_scalar-rswitch_scalar;
1399 d = _mm_set1_ps(d_scalar);
1400 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1401 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1402 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1403 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1404 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1405 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1407 /* Avoid stupid compiler warnings */
1408 jnrA = jnrB = jnrC = jnrD = 0;
1409 j_coord_offsetA = 0;
1410 j_coord_offsetB = 0;
1411 j_coord_offsetC = 0;
1412 j_coord_offsetD = 0;
1417 /* Start outer loop over neighborlists */
1418 for(iidx=0; iidx<nri; iidx++)
1420 /* Load shift vector for this list */
1421 i_shift_offset = DIM*shiftidx[iidx];
1422 shX = shiftvec[i_shift_offset+XX];
1423 shY = shiftvec[i_shift_offset+YY];
1424 shZ = shiftvec[i_shift_offset+ZZ];
1426 /* Load limits for loop over neighbors */
1427 j_index_start = jindex[iidx];
1428 j_index_end = jindex[iidx+1];
1430 /* Get outer coordinate index */
1432 i_coord_offset = DIM*inr;
1434 /* Load i particle coords and add shift vector */
1435 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1436 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1437 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1438 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1439 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1440 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1441 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1442 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1443 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1444 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1445 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1446 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1448 fix0 = _mm_setzero_ps();
1449 fiy0 = _mm_setzero_ps();
1450 fiz0 = _mm_setzero_ps();
1451 fix1 = _mm_setzero_ps();
1452 fiy1 = _mm_setzero_ps();
1453 fiz1 = _mm_setzero_ps();
1454 fix2 = _mm_setzero_ps();
1455 fiy2 = _mm_setzero_ps();
1456 fiz2 = _mm_setzero_ps();
1457 fix3 = _mm_setzero_ps();
1458 fiy3 = _mm_setzero_ps();
1459 fiz3 = _mm_setzero_ps();
1461 /* Start inner kernel loop */
1462 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1465 /* Get j neighbor index, and coordinate index */
1467 jnrB = jjnr[jidx+1];
1468 jnrC = jjnr[jidx+2];
1469 jnrD = jjnr[jidx+3];
1471 j_coord_offsetA = DIM*jnrA;
1472 j_coord_offsetB = DIM*jnrB;
1473 j_coord_offsetC = DIM*jnrC;
1474 j_coord_offsetD = DIM*jnrD;
1476 /* load j atom coordinates */
1477 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1478 x+j_coord_offsetC,x+j_coord_offsetD,
1479 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1480 &jy2,&jz2,&jx3,&jy3,&jz3);
1482 /* Calculate displacement vector */
1483 dx00 = _mm_sub_ps(ix0,jx0);
1484 dy00 = _mm_sub_ps(iy0,jy0);
1485 dz00 = _mm_sub_ps(iz0,jz0);
1486 dx11 = _mm_sub_ps(ix1,jx1);
1487 dy11 = _mm_sub_ps(iy1,jy1);
1488 dz11 = _mm_sub_ps(iz1,jz1);
1489 dx12 = _mm_sub_ps(ix1,jx2);
1490 dy12 = _mm_sub_ps(iy1,jy2);
1491 dz12 = _mm_sub_ps(iz1,jz2);
1492 dx13 = _mm_sub_ps(ix1,jx3);
1493 dy13 = _mm_sub_ps(iy1,jy3);
1494 dz13 = _mm_sub_ps(iz1,jz3);
1495 dx21 = _mm_sub_ps(ix2,jx1);
1496 dy21 = _mm_sub_ps(iy2,jy1);
1497 dz21 = _mm_sub_ps(iz2,jz1);
1498 dx22 = _mm_sub_ps(ix2,jx2);
1499 dy22 = _mm_sub_ps(iy2,jy2);
1500 dz22 = _mm_sub_ps(iz2,jz2);
1501 dx23 = _mm_sub_ps(ix2,jx3);
1502 dy23 = _mm_sub_ps(iy2,jy3);
1503 dz23 = _mm_sub_ps(iz2,jz3);
1504 dx31 = _mm_sub_ps(ix3,jx1);
1505 dy31 = _mm_sub_ps(iy3,jy1);
1506 dz31 = _mm_sub_ps(iz3,jz1);
1507 dx32 = _mm_sub_ps(ix3,jx2);
1508 dy32 = _mm_sub_ps(iy3,jy2);
1509 dz32 = _mm_sub_ps(iz3,jz2);
1510 dx33 = _mm_sub_ps(ix3,jx3);
1511 dy33 = _mm_sub_ps(iy3,jy3);
1512 dz33 = _mm_sub_ps(iz3,jz3);
1514 /* Calculate squared distance and things based on it */
1515 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1516 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1517 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1518 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1519 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1520 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1521 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1522 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1523 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1524 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1526 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1527 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1528 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1529 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1530 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1531 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1532 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1533 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1534 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1535 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1537 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1538 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1539 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1540 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1541 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1542 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1543 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1544 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1545 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1546 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1548 fjx0 = _mm_setzero_ps();
1549 fjy0 = _mm_setzero_ps();
1550 fjz0 = _mm_setzero_ps();
1551 fjx1 = _mm_setzero_ps();
1552 fjy1 = _mm_setzero_ps();
1553 fjz1 = _mm_setzero_ps();
1554 fjx2 = _mm_setzero_ps();
1555 fjy2 = _mm_setzero_ps();
1556 fjz2 = _mm_setzero_ps();
1557 fjx3 = _mm_setzero_ps();
1558 fjy3 = _mm_setzero_ps();
1559 fjz3 = _mm_setzero_ps();
1561 /**************************
1562 * CALCULATE INTERACTIONS *
1563 **************************/
1565 if (gmx_mm_any_lt(rsq00,rcutoff2))
1568 r00 = _mm_mul_ps(rsq00,rinv00);
1570 /* LENNARD-JONES DISPERSION/REPULSION */
1572 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1573 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1574 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1575 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1576 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1578 d = _mm_sub_ps(r00,rswitch);
1579 d = _mm_max_ps(d,_mm_setzero_ps());
1580 d2 = _mm_mul_ps(d,d);
1581 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1583 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1585 /* Evaluate switch function */
1586 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1587 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1588 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1592 fscal = _mm_and_ps(fscal,cutoff_mask);
1594 /* Calculate temporary vectorial force */
1595 tx = _mm_mul_ps(fscal,dx00);
1596 ty = _mm_mul_ps(fscal,dy00);
1597 tz = _mm_mul_ps(fscal,dz00);
1599 /* Update vectorial force */
1600 fix0 = _mm_add_ps(fix0,tx);
1601 fiy0 = _mm_add_ps(fiy0,ty);
1602 fiz0 = _mm_add_ps(fiz0,tz);
1604 fjx0 = _mm_add_ps(fjx0,tx);
1605 fjy0 = _mm_add_ps(fjy0,ty);
1606 fjz0 = _mm_add_ps(fjz0,tz);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 if (gmx_mm_any_lt(rsq11,rcutoff2))
1617 /* REACTION-FIELD ELECTROSTATICS */
1618 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1620 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1624 fscal = _mm_and_ps(fscal,cutoff_mask);
1626 /* Calculate temporary vectorial force */
1627 tx = _mm_mul_ps(fscal,dx11);
1628 ty = _mm_mul_ps(fscal,dy11);
1629 tz = _mm_mul_ps(fscal,dz11);
1631 /* Update vectorial force */
1632 fix1 = _mm_add_ps(fix1,tx);
1633 fiy1 = _mm_add_ps(fiy1,ty);
1634 fiz1 = _mm_add_ps(fiz1,tz);
1636 fjx1 = _mm_add_ps(fjx1,tx);
1637 fjy1 = _mm_add_ps(fjy1,ty);
1638 fjz1 = _mm_add_ps(fjz1,tz);
1642 /**************************
1643 * CALCULATE INTERACTIONS *
1644 **************************/
1646 if (gmx_mm_any_lt(rsq12,rcutoff2))
1649 /* REACTION-FIELD ELECTROSTATICS */
1650 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1652 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1656 fscal = _mm_and_ps(fscal,cutoff_mask);
1658 /* Calculate temporary vectorial force */
1659 tx = _mm_mul_ps(fscal,dx12);
1660 ty = _mm_mul_ps(fscal,dy12);
1661 tz = _mm_mul_ps(fscal,dz12);
1663 /* Update vectorial force */
1664 fix1 = _mm_add_ps(fix1,tx);
1665 fiy1 = _mm_add_ps(fiy1,ty);
1666 fiz1 = _mm_add_ps(fiz1,tz);
1668 fjx2 = _mm_add_ps(fjx2,tx);
1669 fjy2 = _mm_add_ps(fjy2,ty);
1670 fjz2 = _mm_add_ps(fjz2,tz);
1674 /**************************
1675 * CALCULATE INTERACTIONS *
1676 **************************/
1678 if (gmx_mm_any_lt(rsq13,rcutoff2))
1681 /* REACTION-FIELD ELECTROSTATICS */
1682 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1684 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1688 fscal = _mm_and_ps(fscal,cutoff_mask);
1690 /* Calculate temporary vectorial force */
1691 tx = _mm_mul_ps(fscal,dx13);
1692 ty = _mm_mul_ps(fscal,dy13);
1693 tz = _mm_mul_ps(fscal,dz13);
1695 /* Update vectorial force */
1696 fix1 = _mm_add_ps(fix1,tx);
1697 fiy1 = _mm_add_ps(fiy1,ty);
1698 fiz1 = _mm_add_ps(fiz1,tz);
1700 fjx3 = _mm_add_ps(fjx3,tx);
1701 fjy3 = _mm_add_ps(fjy3,ty);
1702 fjz3 = _mm_add_ps(fjz3,tz);
1706 /**************************
1707 * CALCULATE INTERACTIONS *
1708 **************************/
1710 if (gmx_mm_any_lt(rsq21,rcutoff2))
1713 /* REACTION-FIELD ELECTROSTATICS */
1714 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1716 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1720 fscal = _mm_and_ps(fscal,cutoff_mask);
1722 /* Calculate temporary vectorial force */
1723 tx = _mm_mul_ps(fscal,dx21);
1724 ty = _mm_mul_ps(fscal,dy21);
1725 tz = _mm_mul_ps(fscal,dz21);
1727 /* Update vectorial force */
1728 fix2 = _mm_add_ps(fix2,tx);
1729 fiy2 = _mm_add_ps(fiy2,ty);
1730 fiz2 = _mm_add_ps(fiz2,tz);
1732 fjx1 = _mm_add_ps(fjx1,tx);
1733 fjy1 = _mm_add_ps(fjy1,ty);
1734 fjz1 = _mm_add_ps(fjz1,tz);
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 if (gmx_mm_any_lt(rsq22,rcutoff2))
1745 /* REACTION-FIELD ELECTROSTATICS */
1746 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1748 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1752 fscal = _mm_and_ps(fscal,cutoff_mask);
1754 /* Calculate temporary vectorial force */
1755 tx = _mm_mul_ps(fscal,dx22);
1756 ty = _mm_mul_ps(fscal,dy22);
1757 tz = _mm_mul_ps(fscal,dz22);
1759 /* Update vectorial force */
1760 fix2 = _mm_add_ps(fix2,tx);
1761 fiy2 = _mm_add_ps(fiy2,ty);
1762 fiz2 = _mm_add_ps(fiz2,tz);
1764 fjx2 = _mm_add_ps(fjx2,tx);
1765 fjy2 = _mm_add_ps(fjy2,ty);
1766 fjz2 = _mm_add_ps(fjz2,tz);
1770 /**************************
1771 * CALCULATE INTERACTIONS *
1772 **************************/
1774 if (gmx_mm_any_lt(rsq23,rcutoff2))
1777 /* REACTION-FIELD ELECTROSTATICS */
1778 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1780 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1784 fscal = _mm_and_ps(fscal,cutoff_mask);
1786 /* Calculate temporary vectorial force */
1787 tx = _mm_mul_ps(fscal,dx23);
1788 ty = _mm_mul_ps(fscal,dy23);
1789 tz = _mm_mul_ps(fscal,dz23);
1791 /* Update vectorial force */
1792 fix2 = _mm_add_ps(fix2,tx);
1793 fiy2 = _mm_add_ps(fiy2,ty);
1794 fiz2 = _mm_add_ps(fiz2,tz);
1796 fjx3 = _mm_add_ps(fjx3,tx);
1797 fjy3 = _mm_add_ps(fjy3,ty);
1798 fjz3 = _mm_add_ps(fjz3,tz);
1802 /**************************
1803 * CALCULATE INTERACTIONS *
1804 **************************/
1806 if (gmx_mm_any_lt(rsq31,rcutoff2))
1809 /* REACTION-FIELD ELECTROSTATICS */
1810 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1812 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1816 fscal = _mm_and_ps(fscal,cutoff_mask);
1818 /* Calculate temporary vectorial force */
1819 tx = _mm_mul_ps(fscal,dx31);
1820 ty = _mm_mul_ps(fscal,dy31);
1821 tz = _mm_mul_ps(fscal,dz31);
1823 /* Update vectorial force */
1824 fix3 = _mm_add_ps(fix3,tx);
1825 fiy3 = _mm_add_ps(fiy3,ty);
1826 fiz3 = _mm_add_ps(fiz3,tz);
1828 fjx1 = _mm_add_ps(fjx1,tx);
1829 fjy1 = _mm_add_ps(fjy1,ty);
1830 fjz1 = _mm_add_ps(fjz1,tz);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 if (gmx_mm_any_lt(rsq32,rcutoff2))
1841 /* REACTION-FIELD ELECTROSTATICS */
1842 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1844 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1848 fscal = _mm_and_ps(fscal,cutoff_mask);
1850 /* Calculate temporary vectorial force */
1851 tx = _mm_mul_ps(fscal,dx32);
1852 ty = _mm_mul_ps(fscal,dy32);
1853 tz = _mm_mul_ps(fscal,dz32);
1855 /* Update vectorial force */
1856 fix3 = _mm_add_ps(fix3,tx);
1857 fiy3 = _mm_add_ps(fiy3,ty);
1858 fiz3 = _mm_add_ps(fiz3,tz);
1860 fjx2 = _mm_add_ps(fjx2,tx);
1861 fjy2 = _mm_add_ps(fjy2,ty);
1862 fjz2 = _mm_add_ps(fjz2,tz);
1866 /**************************
1867 * CALCULATE INTERACTIONS *
1868 **************************/
1870 if (gmx_mm_any_lt(rsq33,rcutoff2))
1873 /* REACTION-FIELD ELECTROSTATICS */
1874 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1876 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1880 fscal = _mm_and_ps(fscal,cutoff_mask);
1882 /* Calculate temporary vectorial force */
1883 tx = _mm_mul_ps(fscal,dx33);
1884 ty = _mm_mul_ps(fscal,dy33);
1885 tz = _mm_mul_ps(fscal,dz33);
1887 /* Update vectorial force */
1888 fix3 = _mm_add_ps(fix3,tx);
1889 fiy3 = _mm_add_ps(fiy3,ty);
1890 fiz3 = _mm_add_ps(fiz3,tz);
1892 fjx3 = _mm_add_ps(fjx3,tx);
1893 fjy3 = _mm_add_ps(fjy3,ty);
1894 fjz3 = _mm_add_ps(fjz3,tz);
1898 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1899 f+j_coord_offsetC,f+j_coord_offsetD,
1900 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1901 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1903 /* Inner loop uses 329 flops */
1906 if(jidx<j_index_end)
1909 /* Get j neighbor index, and coordinate index */
1911 jnrB = jjnr[jidx+1];
1912 jnrC = jjnr[jidx+2];
1913 jnrD = jjnr[jidx+3];
1915 /* Sign of each element will be negative for non-real atoms.
1916 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1917 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1919 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1920 jnrA = (jnrA>=0) ? jnrA : 0;
1921 jnrB = (jnrB>=0) ? jnrB : 0;
1922 jnrC = (jnrC>=0) ? jnrC : 0;
1923 jnrD = (jnrD>=0) ? jnrD : 0;
1925 j_coord_offsetA = DIM*jnrA;
1926 j_coord_offsetB = DIM*jnrB;
1927 j_coord_offsetC = DIM*jnrC;
1928 j_coord_offsetD = DIM*jnrD;
1930 /* load j atom coordinates */
1931 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1932 x+j_coord_offsetC,x+j_coord_offsetD,
1933 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1934 &jy2,&jz2,&jx3,&jy3,&jz3);
1936 /* Calculate displacement vector */
1937 dx00 = _mm_sub_ps(ix0,jx0);
1938 dy00 = _mm_sub_ps(iy0,jy0);
1939 dz00 = _mm_sub_ps(iz0,jz0);
1940 dx11 = _mm_sub_ps(ix1,jx1);
1941 dy11 = _mm_sub_ps(iy1,jy1);
1942 dz11 = _mm_sub_ps(iz1,jz1);
1943 dx12 = _mm_sub_ps(ix1,jx2);
1944 dy12 = _mm_sub_ps(iy1,jy2);
1945 dz12 = _mm_sub_ps(iz1,jz2);
1946 dx13 = _mm_sub_ps(ix1,jx3);
1947 dy13 = _mm_sub_ps(iy1,jy3);
1948 dz13 = _mm_sub_ps(iz1,jz3);
1949 dx21 = _mm_sub_ps(ix2,jx1);
1950 dy21 = _mm_sub_ps(iy2,jy1);
1951 dz21 = _mm_sub_ps(iz2,jz1);
1952 dx22 = _mm_sub_ps(ix2,jx2);
1953 dy22 = _mm_sub_ps(iy2,jy2);
1954 dz22 = _mm_sub_ps(iz2,jz2);
1955 dx23 = _mm_sub_ps(ix2,jx3);
1956 dy23 = _mm_sub_ps(iy2,jy3);
1957 dz23 = _mm_sub_ps(iz2,jz3);
1958 dx31 = _mm_sub_ps(ix3,jx1);
1959 dy31 = _mm_sub_ps(iy3,jy1);
1960 dz31 = _mm_sub_ps(iz3,jz1);
1961 dx32 = _mm_sub_ps(ix3,jx2);
1962 dy32 = _mm_sub_ps(iy3,jy2);
1963 dz32 = _mm_sub_ps(iz3,jz2);
1964 dx33 = _mm_sub_ps(ix3,jx3);
1965 dy33 = _mm_sub_ps(iy3,jy3);
1966 dz33 = _mm_sub_ps(iz3,jz3);
1968 /* Calculate squared distance and things based on it */
1969 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1970 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1971 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1972 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1973 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1974 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1975 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1976 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1977 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1978 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1980 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1981 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1982 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1983 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1984 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1985 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1986 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1987 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1988 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1989 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1991 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1992 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1993 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1994 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1995 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1996 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1997 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1998 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1999 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2000 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2002 fjx0 = _mm_setzero_ps();
2003 fjy0 = _mm_setzero_ps();
2004 fjz0 = _mm_setzero_ps();
2005 fjx1 = _mm_setzero_ps();
2006 fjy1 = _mm_setzero_ps();
2007 fjz1 = _mm_setzero_ps();
2008 fjx2 = _mm_setzero_ps();
2009 fjy2 = _mm_setzero_ps();
2010 fjz2 = _mm_setzero_ps();
2011 fjx3 = _mm_setzero_ps();
2012 fjy3 = _mm_setzero_ps();
2013 fjz3 = _mm_setzero_ps();
2015 /**************************
2016 * CALCULATE INTERACTIONS *
2017 **************************/
2019 if (gmx_mm_any_lt(rsq00,rcutoff2))
2022 r00 = _mm_mul_ps(rsq00,rinv00);
2023 r00 = _mm_andnot_ps(dummy_mask,r00);
2025 /* LENNARD-JONES DISPERSION/REPULSION */
2027 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2028 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2029 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2030 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2031 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2033 d = _mm_sub_ps(r00,rswitch);
2034 d = _mm_max_ps(d,_mm_setzero_ps());
2035 d2 = _mm_mul_ps(d,d);
2036 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2038 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2040 /* Evaluate switch function */
2041 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2042 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2043 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2047 fscal = _mm_and_ps(fscal,cutoff_mask);
2049 fscal = _mm_andnot_ps(dummy_mask,fscal);
2051 /* Calculate temporary vectorial force */
2052 tx = _mm_mul_ps(fscal,dx00);
2053 ty = _mm_mul_ps(fscal,dy00);
2054 tz = _mm_mul_ps(fscal,dz00);
2056 /* Update vectorial force */
2057 fix0 = _mm_add_ps(fix0,tx);
2058 fiy0 = _mm_add_ps(fiy0,ty);
2059 fiz0 = _mm_add_ps(fiz0,tz);
2061 fjx0 = _mm_add_ps(fjx0,tx);
2062 fjy0 = _mm_add_ps(fjy0,ty);
2063 fjz0 = _mm_add_ps(fjz0,tz);
2067 /**************************
2068 * CALCULATE INTERACTIONS *
2069 **************************/
2071 if (gmx_mm_any_lt(rsq11,rcutoff2))
2074 /* REACTION-FIELD ELECTROSTATICS */
2075 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2077 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2081 fscal = _mm_and_ps(fscal,cutoff_mask);
2083 fscal = _mm_andnot_ps(dummy_mask,fscal);
2085 /* Calculate temporary vectorial force */
2086 tx = _mm_mul_ps(fscal,dx11);
2087 ty = _mm_mul_ps(fscal,dy11);
2088 tz = _mm_mul_ps(fscal,dz11);
2090 /* Update vectorial force */
2091 fix1 = _mm_add_ps(fix1,tx);
2092 fiy1 = _mm_add_ps(fiy1,ty);
2093 fiz1 = _mm_add_ps(fiz1,tz);
2095 fjx1 = _mm_add_ps(fjx1,tx);
2096 fjy1 = _mm_add_ps(fjy1,ty);
2097 fjz1 = _mm_add_ps(fjz1,tz);
2101 /**************************
2102 * CALCULATE INTERACTIONS *
2103 **************************/
2105 if (gmx_mm_any_lt(rsq12,rcutoff2))
2108 /* REACTION-FIELD ELECTROSTATICS */
2109 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2111 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2115 fscal = _mm_and_ps(fscal,cutoff_mask);
2117 fscal = _mm_andnot_ps(dummy_mask,fscal);
2119 /* Calculate temporary vectorial force */
2120 tx = _mm_mul_ps(fscal,dx12);
2121 ty = _mm_mul_ps(fscal,dy12);
2122 tz = _mm_mul_ps(fscal,dz12);
2124 /* Update vectorial force */
2125 fix1 = _mm_add_ps(fix1,tx);
2126 fiy1 = _mm_add_ps(fiy1,ty);
2127 fiz1 = _mm_add_ps(fiz1,tz);
2129 fjx2 = _mm_add_ps(fjx2,tx);
2130 fjy2 = _mm_add_ps(fjy2,ty);
2131 fjz2 = _mm_add_ps(fjz2,tz);
2135 /**************************
2136 * CALCULATE INTERACTIONS *
2137 **************************/
2139 if (gmx_mm_any_lt(rsq13,rcutoff2))
2142 /* REACTION-FIELD ELECTROSTATICS */
2143 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
2145 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2149 fscal = _mm_and_ps(fscal,cutoff_mask);
2151 fscal = _mm_andnot_ps(dummy_mask,fscal);
2153 /* Calculate temporary vectorial force */
2154 tx = _mm_mul_ps(fscal,dx13);
2155 ty = _mm_mul_ps(fscal,dy13);
2156 tz = _mm_mul_ps(fscal,dz13);
2158 /* Update vectorial force */
2159 fix1 = _mm_add_ps(fix1,tx);
2160 fiy1 = _mm_add_ps(fiy1,ty);
2161 fiz1 = _mm_add_ps(fiz1,tz);
2163 fjx3 = _mm_add_ps(fjx3,tx);
2164 fjy3 = _mm_add_ps(fjy3,ty);
2165 fjz3 = _mm_add_ps(fjz3,tz);
2169 /**************************
2170 * CALCULATE INTERACTIONS *
2171 **************************/
2173 if (gmx_mm_any_lt(rsq21,rcutoff2))
2176 /* REACTION-FIELD ELECTROSTATICS */
2177 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2179 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2183 fscal = _mm_and_ps(fscal,cutoff_mask);
2185 fscal = _mm_andnot_ps(dummy_mask,fscal);
2187 /* Calculate temporary vectorial force */
2188 tx = _mm_mul_ps(fscal,dx21);
2189 ty = _mm_mul_ps(fscal,dy21);
2190 tz = _mm_mul_ps(fscal,dz21);
2192 /* Update vectorial force */
2193 fix2 = _mm_add_ps(fix2,tx);
2194 fiy2 = _mm_add_ps(fiy2,ty);
2195 fiz2 = _mm_add_ps(fiz2,tz);
2197 fjx1 = _mm_add_ps(fjx1,tx);
2198 fjy1 = _mm_add_ps(fjy1,ty);
2199 fjz1 = _mm_add_ps(fjz1,tz);
2203 /**************************
2204 * CALCULATE INTERACTIONS *
2205 **************************/
2207 if (gmx_mm_any_lt(rsq22,rcutoff2))
2210 /* REACTION-FIELD ELECTROSTATICS */
2211 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2213 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2217 fscal = _mm_and_ps(fscal,cutoff_mask);
2219 fscal = _mm_andnot_ps(dummy_mask,fscal);
2221 /* Calculate temporary vectorial force */
2222 tx = _mm_mul_ps(fscal,dx22);
2223 ty = _mm_mul_ps(fscal,dy22);
2224 tz = _mm_mul_ps(fscal,dz22);
2226 /* Update vectorial force */
2227 fix2 = _mm_add_ps(fix2,tx);
2228 fiy2 = _mm_add_ps(fiy2,ty);
2229 fiz2 = _mm_add_ps(fiz2,tz);
2231 fjx2 = _mm_add_ps(fjx2,tx);
2232 fjy2 = _mm_add_ps(fjy2,ty);
2233 fjz2 = _mm_add_ps(fjz2,tz);
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2241 if (gmx_mm_any_lt(rsq23,rcutoff2))
2244 /* REACTION-FIELD ELECTROSTATICS */
2245 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
2247 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2251 fscal = _mm_and_ps(fscal,cutoff_mask);
2253 fscal = _mm_andnot_ps(dummy_mask,fscal);
2255 /* Calculate temporary vectorial force */
2256 tx = _mm_mul_ps(fscal,dx23);
2257 ty = _mm_mul_ps(fscal,dy23);
2258 tz = _mm_mul_ps(fscal,dz23);
2260 /* Update vectorial force */
2261 fix2 = _mm_add_ps(fix2,tx);
2262 fiy2 = _mm_add_ps(fiy2,ty);
2263 fiz2 = _mm_add_ps(fiz2,tz);
2265 fjx3 = _mm_add_ps(fjx3,tx);
2266 fjy3 = _mm_add_ps(fjy3,ty);
2267 fjz3 = _mm_add_ps(fjz3,tz);
2271 /**************************
2272 * CALCULATE INTERACTIONS *
2273 **************************/
2275 if (gmx_mm_any_lt(rsq31,rcutoff2))
2278 /* REACTION-FIELD ELECTROSTATICS */
2279 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
2281 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2285 fscal = _mm_and_ps(fscal,cutoff_mask);
2287 fscal = _mm_andnot_ps(dummy_mask,fscal);
2289 /* Calculate temporary vectorial force */
2290 tx = _mm_mul_ps(fscal,dx31);
2291 ty = _mm_mul_ps(fscal,dy31);
2292 tz = _mm_mul_ps(fscal,dz31);
2294 /* Update vectorial force */
2295 fix3 = _mm_add_ps(fix3,tx);
2296 fiy3 = _mm_add_ps(fiy3,ty);
2297 fiz3 = _mm_add_ps(fiz3,tz);
2299 fjx1 = _mm_add_ps(fjx1,tx);
2300 fjy1 = _mm_add_ps(fjy1,ty);
2301 fjz1 = _mm_add_ps(fjz1,tz);
2305 /**************************
2306 * CALCULATE INTERACTIONS *
2307 **************************/
2309 if (gmx_mm_any_lt(rsq32,rcutoff2))
2312 /* REACTION-FIELD ELECTROSTATICS */
2313 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
2315 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2319 fscal = _mm_and_ps(fscal,cutoff_mask);
2321 fscal = _mm_andnot_ps(dummy_mask,fscal);
2323 /* Calculate temporary vectorial force */
2324 tx = _mm_mul_ps(fscal,dx32);
2325 ty = _mm_mul_ps(fscal,dy32);
2326 tz = _mm_mul_ps(fscal,dz32);
2328 /* Update vectorial force */
2329 fix3 = _mm_add_ps(fix3,tx);
2330 fiy3 = _mm_add_ps(fiy3,ty);
2331 fiz3 = _mm_add_ps(fiz3,tz);
2333 fjx2 = _mm_add_ps(fjx2,tx);
2334 fjy2 = _mm_add_ps(fjy2,ty);
2335 fjz2 = _mm_add_ps(fjz2,tz);
2339 /**************************
2340 * CALCULATE INTERACTIONS *
2341 **************************/
2343 if (gmx_mm_any_lt(rsq33,rcutoff2))
2346 /* REACTION-FIELD ELECTROSTATICS */
2347 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2349 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2353 fscal = _mm_and_ps(fscal,cutoff_mask);
2355 fscal = _mm_andnot_ps(dummy_mask,fscal);
2357 /* Calculate temporary vectorial force */
2358 tx = _mm_mul_ps(fscal,dx33);
2359 ty = _mm_mul_ps(fscal,dy33);
2360 tz = _mm_mul_ps(fscal,dz33);
2362 /* Update vectorial force */
2363 fix3 = _mm_add_ps(fix3,tx);
2364 fiy3 = _mm_add_ps(fiy3,ty);
2365 fiz3 = _mm_add_ps(fiz3,tz);
2367 fjx3 = _mm_add_ps(fjx3,tx);
2368 fjy3 = _mm_add_ps(fjy3,ty);
2369 fjz3 = _mm_add_ps(fjz3,tz);
2373 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2374 f+j_coord_offsetC,f+j_coord_offsetD,
2375 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2376 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2378 /* Inner loop uses 330 flops */
2381 /* End of innermost loop */
2383 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2384 f+i_coord_offset,fshift+i_shift_offset);
2386 /* Increment number of inner iterations */
2387 inneriter += j_index_end - j_index_start;
2389 /* Outer loop uses 36 flops */
2392 /* Increment number of outer iterations */
2395 /* Update outer/inner flops */
2397 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*330);