2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
94 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
95 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
96 real rswitch_scalar,d_scalar;
97 __m128 dummy_mask,cutoff_mask;
98 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
99 __m128 one = _mm_set1_ps(1.0);
100 __m128 two = _mm_set1_ps(2.0);
106 jindex = nlist->jindex;
108 shiftidx = nlist->shift;
110 shiftvec = fr->shift_vec[0];
111 fshift = fr->fshift[0];
112 facel = _mm_set1_ps(fr->epsfac);
113 charge = mdatoms->chargeA;
114 krf = _mm_set1_ps(fr->ic->k_rf);
115 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
116 crf = _mm_set1_ps(fr->ic->c_rf);
117 nvdwtype = fr->ntype;
119 vdwtype = mdatoms->typeA;
121 /* Setup water-specific parameters */
122 inr = nlist->iinr[0];
123 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
124 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
125 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
126 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
128 jq0 = _mm_set1_ps(charge[inr+0]);
129 jq1 = _mm_set1_ps(charge[inr+1]);
130 jq2 = _mm_set1_ps(charge[inr+2]);
131 vdwjidx0A = 2*vdwtype[inr+0];
132 qq00 = _mm_mul_ps(iq0,jq0);
133 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
134 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
135 qq01 = _mm_mul_ps(iq0,jq1);
136 qq02 = _mm_mul_ps(iq0,jq2);
137 qq10 = _mm_mul_ps(iq1,jq0);
138 qq11 = _mm_mul_ps(iq1,jq1);
139 qq12 = _mm_mul_ps(iq1,jq2);
140 qq20 = _mm_mul_ps(iq2,jq0);
141 qq21 = _mm_mul_ps(iq2,jq1);
142 qq22 = _mm_mul_ps(iq2,jq2);
144 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
145 rcutoff_scalar = fr->rcoulomb;
146 rcutoff = _mm_set1_ps(rcutoff_scalar);
147 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
149 rswitch_scalar = fr->rvdw_switch;
150 rswitch = _mm_set1_ps(rswitch_scalar);
151 /* Setup switch parameters */
152 d_scalar = rcutoff_scalar-rswitch_scalar;
153 d = _mm_set1_ps(d_scalar);
154 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
155 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
156 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
157 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
158 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
159 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
161 /* Avoid stupid compiler warnings */
162 jnrA = jnrB = jnrC = jnrD = 0;
171 /* Start outer loop over neighborlists */
172 for(iidx=0; iidx<nri; iidx++)
174 /* Load shift vector for this list */
175 i_shift_offset = DIM*shiftidx[iidx];
176 shX = shiftvec[i_shift_offset+XX];
177 shY = shiftvec[i_shift_offset+YY];
178 shZ = shiftvec[i_shift_offset+ZZ];
180 /* Load limits for loop over neighbors */
181 j_index_start = jindex[iidx];
182 j_index_end = jindex[iidx+1];
184 /* Get outer coordinate index */
186 i_coord_offset = DIM*inr;
188 /* Load i particle coords and add shift vector */
189 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
190 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
191 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
192 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
193 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
194 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
195 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
196 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
197 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
199 fix0 = _mm_setzero_ps();
200 fiy0 = _mm_setzero_ps();
201 fiz0 = _mm_setzero_ps();
202 fix1 = _mm_setzero_ps();
203 fiy1 = _mm_setzero_ps();
204 fiz1 = _mm_setzero_ps();
205 fix2 = _mm_setzero_ps();
206 fiy2 = _mm_setzero_ps();
207 fiz2 = _mm_setzero_ps();
209 /* Reset potential sums */
210 velecsum = _mm_setzero_ps();
211 vvdwsum = _mm_setzero_ps();
213 /* Start inner kernel loop */
214 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
217 /* Get j neighbor index, and coordinate index */
223 j_coord_offsetA = DIM*jnrA;
224 j_coord_offsetB = DIM*jnrB;
225 j_coord_offsetC = DIM*jnrC;
226 j_coord_offsetD = DIM*jnrD;
228 /* load j atom coordinates */
229 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
230 x+j_coord_offsetC,x+j_coord_offsetD,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
233 /* Calculate displacement vector */
234 dx00 = _mm_sub_ps(ix0,jx0);
235 dy00 = _mm_sub_ps(iy0,jy0);
236 dz00 = _mm_sub_ps(iz0,jz0);
237 dx01 = _mm_sub_ps(ix0,jx1);
238 dy01 = _mm_sub_ps(iy0,jy1);
239 dz01 = _mm_sub_ps(iz0,jz1);
240 dx02 = _mm_sub_ps(ix0,jx2);
241 dy02 = _mm_sub_ps(iy0,jy2);
242 dz02 = _mm_sub_ps(iz0,jz2);
243 dx10 = _mm_sub_ps(ix1,jx0);
244 dy10 = _mm_sub_ps(iy1,jy0);
245 dz10 = _mm_sub_ps(iz1,jz0);
246 dx11 = _mm_sub_ps(ix1,jx1);
247 dy11 = _mm_sub_ps(iy1,jy1);
248 dz11 = _mm_sub_ps(iz1,jz1);
249 dx12 = _mm_sub_ps(ix1,jx2);
250 dy12 = _mm_sub_ps(iy1,jy2);
251 dz12 = _mm_sub_ps(iz1,jz2);
252 dx20 = _mm_sub_ps(ix2,jx0);
253 dy20 = _mm_sub_ps(iy2,jy0);
254 dz20 = _mm_sub_ps(iz2,jz0);
255 dx21 = _mm_sub_ps(ix2,jx1);
256 dy21 = _mm_sub_ps(iy2,jy1);
257 dz21 = _mm_sub_ps(iz2,jz1);
258 dx22 = _mm_sub_ps(ix2,jx2);
259 dy22 = _mm_sub_ps(iy2,jy2);
260 dz22 = _mm_sub_ps(iz2,jz2);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
264 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
265 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
266 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
267 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
270 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
273 rinv00 = gmx_mm_invsqrt_ps(rsq00);
274 rinv01 = gmx_mm_invsqrt_ps(rsq01);
275 rinv02 = gmx_mm_invsqrt_ps(rsq02);
276 rinv10 = gmx_mm_invsqrt_ps(rsq10);
277 rinv11 = gmx_mm_invsqrt_ps(rsq11);
278 rinv12 = gmx_mm_invsqrt_ps(rsq12);
279 rinv20 = gmx_mm_invsqrt_ps(rsq20);
280 rinv21 = gmx_mm_invsqrt_ps(rsq21);
281 rinv22 = gmx_mm_invsqrt_ps(rsq22);
283 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
284 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
285 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
286 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
287 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
288 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
289 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
290 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
291 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
293 fjx0 = _mm_setzero_ps();
294 fjy0 = _mm_setzero_ps();
295 fjz0 = _mm_setzero_ps();
296 fjx1 = _mm_setzero_ps();
297 fjy1 = _mm_setzero_ps();
298 fjz1 = _mm_setzero_ps();
299 fjx2 = _mm_setzero_ps();
300 fjy2 = _mm_setzero_ps();
301 fjz2 = _mm_setzero_ps();
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 if (gmx_mm_any_lt(rsq00,rcutoff2))
310 r00 = _mm_mul_ps(rsq00,rinv00);
312 /* REACTION-FIELD ELECTROSTATICS */
313 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
314 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
316 /* LENNARD-JONES DISPERSION/REPULSION */
318 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
319 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
320 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
321 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
322 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
324 d = _mm_sub_ps(r00,rswitch);
325 d = _mm_max_ps(d,_mm_setzero_ps());
326 d2 = _mm_mul_ps(d,d);
327 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
329 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
331 /* Evaluate switch function */
332 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
333 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
334 vvdw = _mm_mul_ps(vvdw,sw);
335 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
337 /* Update potential sum for this i atom from the interaction with this j atom. */
338 velec = _mm_and_ps(velec,cutoff_mask);
339 velecsum = _mm_add_ps(velecsum,velec);
340 vvdw = _mm_and_ps(vvdw,cutoff_mask);
341 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
343 fscal = _mm_add_ps(felec,fvdw);
345 fscal = _mm_and_ps(fscal,cutoff_mask);
347 /* Calculate temporary vectorial force */
348 tx = _mm_mul_ps(fscal,dx00);
349 ty = _mm_mul_ps(fscal,dy00);
350 tz = _mm_mul_ps(fscal,dz00);
352 /* Update vectorial force */
353 fix0 = _mm_add_ps(fix0,tx);
354 fiy0 = _mm_add_ps(fiy0,ty);
355 fiz0 = _mm_add_ps(fiz0,tz);
357 fjx0 = _mm_add_ps(fjx0,tx);
358 fjy0 = _mm_add_ps(fjy0,ty);
359 fjz0 = _mm_add_ps(fjz0,tz);
363 /**************************
364 * CALCULATE INTERACTIONS *
365 **************************/
367 if (gmx_mm_any_lt(rsq01,rcutoff2))
370 /* REACTION-FIELD ELECTROSTATICS */
371 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
372 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
374 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velec = _mm_and_ps(velec,cutoff_mask);
378 velecsum = _mm_add_ps(velecsum,velec);
382 fscal = _mm_and_ps(fscal,cutoff_mask);
384 /* Calculate temporary vectorial force */
385 tx = _mm_mul_ps(fscal,dx01);
386 ty = _mm_mul_ps(fscal,dy01);
387 tz = _mm_mul_ps(fscal,dz01);
389 /* Update vectorial force */
390 fix0 = _mm_add_ps(fix0,tx);
391 fiy0 = _mm_add_ps(fiy0,ty);
392 fiz0 = _mm_add_ps(fiz0,tz);
394 fjx1 = _mm_add_ps(fjx1,tx);
395 fjy1 = _mm_add_ps(fjy1,ty);
396 fjz1 = _mm_add_ps(fjz1,tz);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 if (gmx_mm_any_lt(rsq02,rcutoff2))
407 /* REACTION-FIELD ELECTROSTATICS */
408 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
409 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
411 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velec = _mm_and_ps(velec,cutoff_mask);
415 velecsum = _mm_add_ps(velecsum,velec);
419 fscal = _mm_and_ps(fscal,cutoff_mask);
421 /* Calculate temporary vectorial force */
422 tx = _mm_mul_ps(fscal,dx02);
423 ty = _mm_mul_ps(fscal,dy02);
424 tz = _mm_mul_ps(fscal,dz02);
426 /* Update vectorial force */
427 fix0 = _mm_add_ps(fix0,tx);
428 fiy0 = _mm_add_ps(fiy0,ty);
429 fiz0 = _mm_add_ps(fiz0,tz);
431 fjx2 = _mm_add_ps(fjx2,tx);
432 fjy2 = _mm_add_ps(fjy2,ty);
433 fjz2 = _mm_add_ps(fjz2,tz);
437 /**************************
438 * CALCULATE INTERACTIONS *
439 **************************/
441 if (gmx_mm_any_lt(rsq10,rcutoff2))
444 /* REACTION-FIELD ELECTROSTATICS */
445 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
446 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
448 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velec = _mm_and_ps(velec,cutoff_mask);
452 velecsum = _mm_add_ps(velecsum,velec);
456 fscal = _mm_and_ps(fscal,cutoff_mask);
458 /* Calculate temporary vectorial force */
459 tx = _mm_mul_ps(fscal,dx10);
460 ty = _mm_mul_ps(fscal,dy10);
461 tz = _mm_mul_ps(fscal,dz10);
463 /* Update vectorial force */
464 fix1 = _mm_add_ps(fix1,tx);
465 fiy1 = _mm_add_ps(fiy1,ty);
466 fiz1 = _mm_add_ps(fiz1,tz);
468 fjx0 = _mm_add_ps(fjx0,tx);
469 fjy0 = _mm_add_ps(fjy0,ty);
470 fjz0 = _mm_add_ps(fjz0,tz);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 if (gmx_mm_any_lt(rsq11,rcutoff2))
481 /* REACTION-FIELD ELECTROSTATICS */
482 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
483 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
485 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velec = _mm_and_ps(velec,cutoff_mask);
489 velecsum = _mm_add_ps(velecsum,velec);
493 fscal = _mm_and_ps(fscal,cutoff_mask);
495 /* Calculate temporary vectorial force */
496 tx = _mm_mul_ps(fscal,dx11);
497 ty = _mm_mul_ps(fscal,dy11);
498 tz = _mm_mul_ps(fscal,dz11);
500 /* Update vectorial force */
501 fix1 = _mm_add_ps(fix1,tx);
502 fiy1 = _mm_add_ps(fiy1,ty);
503 fiz1 = _mm_add_ps(fiz1,tz);
505 fjx1 = _mm_add_ps(fjx1,tx);
506 fjy1 = _mm_add_ps(fjy1,ty);
507 fjz1 = _mm_add_ps(fjz1,tz);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 if (gmx_mm_any_lt(rsq12,rcutoff2))
518 /* REACTION-FIELD ELECTROSTATICS */
519 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
520 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
522 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
524 /* Update potential sum for this i atom from the interaction with this j atom. */
525 velec = _mm_and_ps(velec,cutoff_mask);
526 velecsum = _mm_add_ps(velecsum,velec);
530 fscal = _mm_and_ps(fscal,cutoff_mask);
532 /* Calculate temporary vectorial force */
533 tx = _mm_mul_ps(fscal,dx12);
534 ty = _mm_mul_ps(fscal,dy12);
535 tz = _mm_mul_ps(fscal,dz12);
537 /* Update vectorial force */
538 fix1 = _mm_add_ps(fix1,tx);
539 fiy1 = _mm_add_ps(fiy1,ty);
540 fiz1 = _mm_add_ps(fiz1,tz);
542 fjx2 = _mm_add_ps(fjx2,tx);
543 fjy2 = _mm_add_ps(fjy2,ty);
544 fjz2 = _mm_add_ps(fjz2,tz);
548 /**************************
549 * CALCULATE INTERACTIONS *
550 **************************/
552 if (gmx_mm_any_lt(rsq20,rcutoff2))
555 /* REACTION-FIELD ELECTROSTATICS */
556 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
557 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
559 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
561 /* Update potential sum for this i atom from the interaction with this j atom. */
562 velec = _mm_and_ps(velec,cutoff_mask);
563 velecsum = _mm_add_ps(velecsum,velec);
567 fscal = _mm_and_ps(fscal,cutoff_mask);
569 /* Calculate temporary vectorial force */
570 tx = _mm_mul_ps(fscal,dx20);
571 ty = _mm_mul_ps(fscal,dy20);
572 tz = _mm_mul_ps(fscal,dz20);
574 /* Update vectorial force */
575 fix2 = _mm_add_ps(fix2,tx);
576 fiy2 = _mm_add_ps(fiy2,ty);
577 fiz2 = _mm_add_ps(fiz2,tz);
579 fjx0 = _mm_add_ps(fjx0,tx);
580 fjy0 = _mm_add_ps(fjy0,ty);
581 fjz0 = _mm_add_ps(fjz0,tz);
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
589 if (gmx_mm_any_lt(rsq21,rcutoff2))
592 /* REACTION-FIELD ELECTROSTATICS */
593 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
594 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
596 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velec = _mm_and_ps(velec,cutoff_mask);
600 velecsum = _mm_add_ps(velecsum,velec);
604 fscal = _mm_and_ps(fscal,cutoff_mask);
606 /* Calculate temporary vectorial force */
607 tx = _mm_mul_ps(fscal,dx21);
608 ty = _mm_mul_ps(fscal,dy21);
609 tz = _mm_mul_ps(fscal,dz21);
611 /* Update vectorial force */
612 fix2 = _mm_add_ps(fix2,tx);
613 fiy2 = _mm_add_ps(fiy2,ty);
614 fiz2 = _mm_add_ps(fiz2,tz);
616 fjx1 = _mm_add_ps(fjx1,tx);
617 fjy1 = _mm_add_ps(fjy1,ty);
618 fjz1 = _mm_add_ps(fjz1,tz);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 if (gmx_mm_any_lt(rsq22,rcutoff2))
629 /* REACTION-FIELD ELECTROSTATICS */
630 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
631 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
633 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
635 /* Update potential sum for this i atom from the interaction with this j atom. */
636 velec = _mm_and_ps(velec,cutoff_mask);
637 velecsum = _mm_add_ps(velecsum,velec);
641 fscal = _mm_and_ps(fscal,cutoff_mask);
643 /* Calculate temporary vectorial force */
644 tx = _mm_mul_ps(fscal,dx22);
645 ty = _mm_mul_ps(fscal,dy22);
646 tz = _mm_mul_ps(fscal,dz22);
648 /* Update vectorial force */
649 fix2 = _mm_add_ps(fix2,tx);
650 fiy2 = _mm_add_ps(fiy2,ty);
651 fiz2 = _mm_add_ps(fiz2,tz);
653 fjx2 = _mm_add_ps(fjx2,tx);
654 fjy2 = _mm_add_ps(fjy2,ty);
655 fjz2 = _mm_add_ps(fjz2,tz);
659 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
660 f+j_coord_offsetC,f+j_coord_offsetD,
661 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
663 /* Inner loop uses 358 flops */
669 /* Get j neighbor index, and coordinate index */
675 /* Sign of each element will be negative for non-real atoms.
676 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
677 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
679 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
680 jnrA = (jnrA>=0) ? jnrA : 0;
681 jnrB = (jnrB>=0) ? jnrB : 0;
682 jnrC = (jnrC>=0) ? jnrC : 0;
683 jnrD = (jnrD>=0) ? jnrD : 0;
685 j_coord_offsetA = DIM*jnrA;
686 j_coord_offsetB = DIM*jnrB;
687 j_coord_offsetC = DIM*jnrC;
688 j_coord_offsetD = DIM*jnrD;
690 /* load j atom coordinates */
691 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
692 x+j_coord_offsetC,x+j_coord_offsetD,
693 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
695 /* Calculate displacement vector */
696 dx00 = _mm_sub_ps(ix0,jx0);
697 dy00 = _mm_sub_ps(iy0,jy0);
698 dz00 = _mm_sub_ps(iz0,jz0);
699 dx01 = _mm_sub_ps(ix0,jx1);
700 dy01 = _mm_sub_ps(iy0,jy1);
701 dz01 = _mm_sub_ps(iz0,jz1);
702 dx02 = _mm_sub_ps(ix0,jx2);
703 dy02 = _mm_sub_ps(iy0,jy2);
704 dz02 = _mm_sub_ps(iz0,jz2);
705 dx10 = _mm_sub_ps(ix1,jx0);
706 dy10 = _mm_sub_ps(iy1,jy0);
707 dz10 = _mm_sub_ps(iz1,jz0);
708 dx11 = _mm_sub_ps(ix1,jx1);
709 dy11 = _mm_sub_ps(iy1,jy1);
710 dz11 = _mm_sub_ps(iz1,jz1);
711 dx12 = _mm_sub_ps(ix1,jx2);
712 dy12 = _mm_sub_ps(iy1,jy2);
713 dz12 = _mm_sub_ps(iz1,jz2);
714 dx20 = _mm_sub_ps(ix2,jx0);
715 dy20 = _mm_sub_ps(iy2,jy0);
716 dz20 = _mm_sub_ps(iz2,jz0);
717 dx21 = _mm_sub_ps(ix2,jx1);
718 dy21 = _mm_sub_ps(iy2,jy1);
719 dz21 = _mm_sub_ps(iz2,jz1);
720 dx22 = _mm_sub_ps(ix2,jx2);
721 dy22 = _mm_sub_ps(iy2,jy2);
722 dz22 = _mm_sub_ps(iz2,jz2);
724 /* Calculate squared distance and things based on it */
725 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
726 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
727 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
728 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
729 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
730 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
731 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
732 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
733 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
735 rinv00 = gmx_mm_invsqrt_ps(rsq00);
736 rinv01 = gmx_mm_invsqrt_ps(rsq01);
737 rinv02 = gmx_mm_invsqrt_ps(rsq02);
738 rinv10 = gmx_mm_invsqrt_ps(rsq10);
739 rinv11 = gmx_mm_invsqrt_ps(rsq11);
740 rinv12 = gmx_mm_invsqrt_ps(rsq12);
741 rinv20 = gmx_mm_invsqrt_ps(rsq20);
742 rinv21 = gmx_mm_invsqrt_ps(rsq21);
743 rinv22 = gmx_mm_invsqrt_ps(rsq22);
745 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
746 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
747 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
748 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
749 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
750 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
751 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
752 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
753 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
755 fjx0 = _mm_setzero_ps();
756 fjy0 = _mm_setzero_ps();
757 fjz0 = _mm_setzero_ps();
758 fjx1 = _mm_setzero_ps();
759 fjy1 = _mm_setzero_ps();
760 fjz1 = _mm_setzero_ps();
761 fjx2 = _mm_setzero_ps();
762 fjy2 = _mm_setzero_ps();
763 fjz2 = _mm_setzero_ps();
765 /**************************
766 * CALCULATE INTERACTIONS *
767 **************************/
769 if (gmx_mm_any_lt(rsq00,rcutoff2))
772 r00 = _mm_mul_ps(rsq00,rinv00);
773 r00 = _mm_andnot_ps(dummy_mask,r00);
775 /* REACTION-FIELD ELECTROSTATICS */
776 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
777 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
779 /* LENNARD-JONES DISPERSION/REPULSION */
781 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
782 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
783 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
784 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
785 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
787 d = _mm_sub_ps(r00,rswitch);
788 d = _mm_max_ps(d,_mm_setzero_ps());
789 d2 = _mm_mul_ps(d,d);
790 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
792 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
794 /* Evaluate switch function */
795 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
796 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
797 vvdw = _mm_mul_ps(vvdw,sw);
798 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
800 /* Update potential sum for this i atom from the interaction with this j atom. */
801 velec = _mm_and_ps(velec,cutoff_mask);
802 velec = _mm_andnot_ps(dummy_mask,velec);
803 velecsum = _mm_add_ps(velecsum,velec);
804 vvdw = _mm_and_ps(vvdw,cutoff_mask);
805 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
806 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
808 fscal = _mm_add_ps(felec,fvdw);
810 fscal = _mm_and_ps(fscal,cutoff_mask);
812 fscal = _mm_andnot_ps(dummy_mask,fscal);
814 /* Calculate temporary vectorial force */
815 tx = _mm_mul_ps(fscal,dx00);
816 ty = _mm_mul_ps(fscal,dy00);
817 tz = _mm_mul_ps(fscal,dz00);
819 /* Update vectorial force */
820 fix0 = _mm_add_ps(fix0,tx);
821 fiy0 = _mm_add_ps(fiy0,ty);
822 fiz0 = _mm_add_ps(fiz0,tz);
824 fjx0 = _mm_add_ps(fjx0,tx);
825 fjy0 = _mm_add_ps(fjy0,ty);
826 fjz0 = _mm_add_ps(fjz0,tz);
830 /**************************
831 * CALCULATE INTERACTIONS *
832 **************************/
834 if (gmx_mm_any_lt(rsq01,rcutoff2))
837 /* REACTION-FIELD ELECTROSTATICS */
838 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
839 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
841 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
843 /* Update potential sum for this i atom from the interaction with this j atom. */
844 velec = _mm_and_ps(velec,cutoff_mask);
845 velec = _mm_andnot_ps(dummy_mask,velec);
846 velecsum = _mm_add_ps(velecsum,velec);
850 fscal = _mm_and_ps(fscal,cutoff_mask);
852 fscal = _mm_andnot_ps(dummy_mask,fscal);
854 /* Calculate temporary vectorial force */
855 tx = _mm_mul_ps(fscal,dx01);
856 ty = _mm_mul_ps(fscal,dy01);
857 tz = _mm_mul_ps(fscal,dz01);
859 /* Update vectorial force */
860 fix0 = _mm_add_ps(fix0,tx);
861 fiy0 = _mm_add_ps(fiy0,ty);
862 fiz0 = _mm_add_ps(fiz0,tz);
864 fjx1 = _mm_add_ps(fjx1,tx);
865 fjy1 = _mm_add_ps(fjy1,ty);
866 fjz1 = _mm_add_ps(fjz1,tz);
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 if (gmx_mm_any_lt(rsq02,rcutoff2))
877 /* REACTION-FIELD ELECTROSTATICS */
878 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
879 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
881 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
883 /* Update potential sum for this i atom from the interaction with this j atom. */
884 velec = _mm_and_ps(velec,cutoff_mask);
885 velec = _mm_andnot_ps(dummy_mask,velec);
886 velecsum = _mm_add_ps(velecsum,velec);
890 fscal = _mm_and_ps(fscal,cutoff_mask);
892 fscal = _mm_andnot_ps(dummy_mask,fscal);
894 /* Calculate temporary vectorial force */
895 tx = _mm_mul_ps(fscal,dx02);
896 ty = _mm_mul_ps(fscal,dy02);
897 tz = _mm_mul_ps(fscal,dz02);
899 /* Update vectorial force */
900 fix0 = _mm_add_ps(fix0,tx);
901 fiy0 = _mm_add_ps(fiy0,ty);
902 fiz0 = _mm_add_ps(fiz0,tz);
904 fjx2 = _mm_add_ps(fjx2,tx);
905 fjy2 = _mm_add_ps(fjy2,ty);
906 fjz2 = _mm_add_ps(fjz2,tz);
910 /**************************
911 * CALCULATE INTERACTIONS *
912 **************************/
914 if (gmx_mm_any_lt(rsq10,rcutoff2))
917 /* REACTION-FIELD ELECTROSTATICS */
918 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
919 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
921 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
923 /* Update potential sum for this i atom from the interaction with this j atom. */
924 velec = _mm_and_ps(velec,cutoff_mask);
925 velec = _mm_andnot_ps(dummy_mask,velec);
926 velecsum = _mm_add_ps(velecsum,velec);
930 fscal = _mm_and_ps(fscal,cutoff_mask);
932 fscal = _mm_andnot_ps(dummy_mask,fscal);
934 /* Calculate temporary vectorial force */
935 tx = _mm_mul_ps(fscal,dx10);
936 ty = _mm_mul_ps(fscal,dy10);
937 tz = _mm_mul_ps(fscal,dz10);
939 /* Update vectorial force */
940 fix1 = _mm_add_ps(fix1,tx);
941 fiy1 = _mm_add_ps(fiy1,ty);
942 fiz1 = _mm_add_ps(fiz1,tz);
944 fjx0 = _mm_add_ps(fjx0,tx);
945 fjy0 = _mm_add_ps(fjy0,ty);
946 fjz0 = _mm_add_ps(fjz0,tz);
950 /**************************
951 * CALCULATE INTERACTIONS *
952 **************************/
954 if (gmx_mm_any_lt(rsq11,rcutoff2))
957 /* REACTION-FIELD ELECTROSTATICS */
958 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
959 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
961 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
963 /* Update potential sum for this i atom from the interaction with this j atom. */
964 velec = _mm_and_ps(velec,cutoff_mask);
965 velec = _mm_andnot_ps(dummy_mask,velec);
966 velecsum = _mm_add_ps(velecsum,velec);
970 fscal = _mm_and_ps(fscal,cutoff_mask);
972 fscal = _mm_andnot_ps(dummy_mask,fscal);
974 /* Calculate temporary vectorial force */
975 tx = _mm_mul_ps(fscal,dx11);
976 ty = _mm_mul_ps(fscal,dy11);
977 tz = _mm_mul_ps(fscal,dz11);
979 /* Update vectorial force */
980 fix1 = _mm_add_ps(fix1,tx);
981 fiy1 = _mm_add_ps(fiy1,ty);
982 fiz1 = _mm_add_ps(fiz1,tz);
984 fjx1 = _mm_add_ps(fjx1,tx);
985 fjy1 = _mm_add_ps(fjy1,ty);
986 fjz1 = _mm_add_ps(fjz1,tz);
990 /**************************
991 * CALCULATE INTERACTIONS *
992 **************************/
994 if (gmx_mm_any_lt(rsq12,rcutoff2))
997 /* REACTION-FIELD ELECTROSTATICS */
998 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
999 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1001 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1003 /* Update potential sum for this i atom from the interaction with this j atom. */
1004 velec = _mm_and_ps(velec,cutoff_mask);
1005 velec = _mm_andnot_ps(dummy_mask,velec);
1006 velecsum = _mm_add_ps(velecsum,velec);
1010 fscal = _mm_and_ps(fscal,cutoff_mask);
1012 fscal = _mm_andnot_ps(dummy_mask,fscal);
1014 /* Calculate temporary vectorial force */
1015 tx = _mm_mul_ps(fscal,dx12);
1016 ty = _mm_mul_ps(fscal,dy12);
1017 tz = _mm_mul_ps(fscal,dz12);
1019 /* Update vectorial force */
1020 fix1 = _mm_add_ps(fix1,tx);
1021 fiy1 = _mm_add_ps(fiy1,ty);
1022 fiz1 = _mm_add_ps(fiz1,tz);
1024 fjx2 = _mm_add_ps(fjx2,tx);
1025 fjy2 = _mm_add_ps(fjy2,ty);
1026 fjz2 = _mm_add_ps(fjz2,tz);
1030 /**************************
1031 * CALCULATE INTERACTIONS *
1032 **************************/
1034 if (gmx_mm_any_lt(rsq20,rcutoff2))
1037 /* REACTION-FIELD ELECTROSTATICS */
1038 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
1039 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1041 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1043 /* Update potential sum for this i atom from the interaction with this j atom. */
1044 velec = _mm_and_ps(velec,cutoff_mask);
1045 velec = _mm_andnot_ps(dummy_mask,velec);
1046 velecsum = _mm_add_ps(velecsum,velec);
1050 fscal = _mm_and_ps(fscal,cutoff_mask);
1052 fscal = _mm_andnot_ps(dummy_mask,fscal);
1054 /* Calculate temporary vectorial force */
1055 tx = _mm_mul_ps(fscal,dx20);
1056 ty = _mm_mul_ps(fscal,dy20);
1057 tz = _mm_mul_ps(fscal,dz20);
1059 /* Update vectorial force */
1060 fix2 = _mm_add_ps(fix2,tx);
1061 fiy2 = _mm_add_ps(fiy2,ty);
1062 fiz2 = _mm_add_ps(fiz2,tz);
1064 fjx0 = _mm_add_ps(fjx0,tx);
1065 fjy0 = _mm_add_ps(fjy0,ty);
1066 fjz0 = _mm_add_ps(fjz0,tz);
1070 /**************************
1071 * CALCULATE INTERACTIONS *
1072 **************************/
1074 if (gmx_mm_any_lt(rsq21,rcutoff2))
1077 /* REACTION-FIELD ELECTROSTATICS */
1078 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1079 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1081 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1083 /* Update potential sum for this i atom from the interaction with this j atom. */
1084 velec = _mm_and_ps(velec,cutoff_mask);
1085 velec = _mm_andnot_ps(dummy_mask,velec);
1086 velecsum = _mm_add_ps(velecsum,velec);
1090 fscal = _mm_and_ps(fscal,cutoff_mask);
1092 fscal = _mm_andnot_ps(dummy_mask,fscal);
1094 /* Calculate temporary vectorial force */
1095 tx = _mm_mul_ps(fscal,dx21);
1096 ty = _mm_mul_ps(fscal,dy21);
1097 tz = _mm_mul_ps(fscal,dz21);
1099 /* Update vectorial force */
1100 fix2 = _mm_add_ps(fix2,tx);
1101 fiy2 = _mm_add_ps(fiy2,ty);
1102 fiz2 = _mm_add_ps(fiz2,tz);
1104 fjx1 = _mm_add_ps(fjx1,tx);
1105 fjy1 = _mm_add_ps(fjy1,ty);
1106 fjz1 = _mm_add_ps(fjz1,tz);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 if (gmx_mm_any_lt(rsq22,rcutoff2))
1117 /* REACTION-FIELD ELECTROSTATICS */
1118 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1119 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1121 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1123 /* Update potential sum for this i atom from the interaction with this j atom. */
1124 velec = _mm_and_ps(velec,cutoff_mask);
1125 velec = _mm_andnot_ps(dummy_mask,velec);
1126 velecsum = _mm_add_ps(velecsum,velec);
1130 fscal = _mm_and_ps(fscal,cutoff_mask);
1132 fscal = _mm_andnot_ps(dummy_mask,fscal);
1134 /* Calculate temporary vectorial force */
1135 tx = _mm_mul_ps(fscal,dx22);
1136 ty = _mm_mul_ps(fscal,dy22);
1137 tz = _mm_mul_ps(fscal,dz22);
1139 /* Update vectorial force */
1140 fix2 = _mm_add_ps(fix2,tx);
1141 fiy2 = _mm_add_ps(fiy2,ty);
1142 fiz2 = _mm_add_ps(fiz2,tz);
1144 fjx2 = _mm_add_ps(fjx2,tx);
1145 fjy2 = _mm_add_ps(fjy2,ty);
1146 fjz2 = _mm_add_ps(fjz2,tz);
1150 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1151 f+j_coord_offsetC,f+j_coord_offsetD,
1152 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1154 /* Inner loop uses 359 flops */
1157 /* End of innermost loop */
1159 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1160 f+i_coord_offset,fshift+i_shift_offset);
1163 /* Update potential energies */
1164 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1165 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1167 /* Increment number of inner iterations */
1168 inneriter += j_index_end - j_index_start;
1170 /* Outer loop uses 29 flops */
1173 /* Increment number of outer iterations */
1176 /* Update outer/inner flops */
1178 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*359);
1181 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
1182 * Electrostatics interaction: ReactionField
1183 * VdW interaction: LennardJones
1184 * Geometry: Water3-Water3
1185 * Calculate force/pot: Force
1188 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
1189 (t_nblist * gmx_restrict nlist,
1190 rvec * gmx_restrict xx,
1191 rvec * gmx_restrict ff,
1192 t_forcerec * gmx_restrict fr,
1193 t_mdatoms * gmx_restrict mdatoms,
1194 nb_kernel_data_t * gmx_restrict kernel_data,
1195 t_nrnb * gmx_restrict nrnb)
1197 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1198 * just 0 for non-waters.
1199 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1200 * jnr indices corresponding to data put in the four positions in the SIMD register.
1202 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1203 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1204 int jnrA,jnrB,jnrC,jnrD;
1205 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1206 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1207 real shX,shY,shZ,rcutoff_scalar;
1208 real *shiftvec,*fshift,*x,*f;
1209 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1211 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1213 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1215 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1216 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1217 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1218 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1219 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1220 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1221 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1222 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1223 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1224 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1225 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1226 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1227 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1228 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1229 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1230 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1231 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1234 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1237 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1238 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1239 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1240 real rswitch_scalar,d_scalar;
1241 __m128 dummy_mask,cutoff_mask;
1242 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1243 __m128 one = _mm_set1_ps(1.0);
1244 __m128 two = _mm_set1_ps(2.0);
1250 jindex = nlist->jindex;
1252 shiftidx = nlist->shift;
1254 shiftvec = fr->shift_vec[0];
1255 fshift = fr->fshift[0];
1256 facel = _mm_set1_ps(fr->epsfac);
1257 charge = mdatoms->chargeA;
1258 krf = _mm_set1_ps(fr->ic->k_rf);
1259 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1260 crf = _mm_set1_ps(fr->ic->c_rf);
1261 nvdwtype = fr->ntype;
1262 vdwparam = fr->nbfp;
1263 vdwtype = mdatoms->typeA;
1265 /* Setup water-specific parameters */
1266 inr = nlist->iinr[0];
1267 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1268 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1269 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1270 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1272 jq0 = _mm_set1_ps(charge[inr+0]);
1273 jq1 = _mm_set1_ps(charge[inr+1]);
1274 jq2 = _mm_set1_ps(charge[inr+2]);
1275 vdwjidx0A = 2*vdwtype[inr+0];
1276 qq00 = _mm_mul_ps(iq0,jq0);
1277 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1278 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1279 qq01 = _mm_mul_ps(iq0,jq1);
1280 qq02 = _mm_mul_ps(iq0,jq2);
1281 qq10 = _mm_mul_ps(iq1,jq0);
1282 qq11 = _mm_mul_ps(iq1,jq1);
1283 qq12 = _mm_mul_ps(iq1,jq2);
1284 qq20 = _mm_mul_ps(iq2,jq0);
1285 qq21 = _mm_mul_ps(iq2,jq1);
1286 qq22 = _mm_mul_ps(iq2,jq2);
1288 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1289 rcutoff_scalar = fr->rcoulomb;
1290 rcutoff = _mm_set1_ps(rcutoff_scalar);
1291 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1293 rswitch_scalar = fr->rvdw_switch;
1294 rswitch = _mm_set1_ps(rswitch_scalar);
1295 /* Setup switch parameters */
1296 d_scalar = rcutoff_scalar-rswitch_scalar;
1297 d = _mm_set1_ps(d_scalar);
1298 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1299 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1300 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1301 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1302 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1303 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1305 /* Avoid stupid compiler warnings */
1306 jnrA = jnrB = jnrC = jnrD = 0;
1307 j_coord_offsetA = 0;
1308 j_coord_offsetB = 0;
1309 j_coord_offsetC = 0;
1310 j_coord_offsetD = 0;
1315 /* Start outer loop over neighborlists */
1316 for(iidx=0; iidx<nri; iidx++)
1318 /* Load shift vector for this list */
1319 i_shift_offset = DIM*shiftidx[iidx];
1320 shX = shiftvec[i_shift_offset+XX];
1321 shY = shiftvec[i_shift_offset+YY];
1322 shZ = shiftvec[i_shift_offset+ZZ];
1324 /* Load limits for loop over neighbors */
1325 j_index_start = jindex[iidx];
1326 j_index_end = jindex[iidx+1];
1328 /* Get outer coordinate index */
1330 i_coord_offset = DIM*inr;
1332 /* Load i particle coords and add shift vector */
1333 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1334 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1335 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1336 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1337 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1338 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1339 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1340 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1341 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1343 fix0 = _mm_setzero_ps();
1344 fiy0 = _mm_setzero_ps();
1345 fiz0 = _mm_setzero_ps();
1346 fix1 = _mm_setzero_ps();
1347 fiy1 = _mm_setzero_ps();
1348 fiz1 = _mm_setzero_ps();
1349 fix2 = _mm_setzero_ps();
1350 fiy2 = _mm_setzero_ps();
1351 fiz2 = _mm_setzero_ps();
1353 /* Start inner kernel loop */
1354 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1357 /* Get j neighbor index, and coordinate index */
1359 jnrB = jjnr[jidx+1];
1360 jnrC = jjnr[jidx+2];
1361 jnrD = jjnr[jidx+3];
1363 j_coord_offsetA = DIM*jnrA;
1364 j_coord_offsetB = DIM*jnrB;
1365 j_coord_offsetC = DIM*jnrC;
1366 j_coord_offsetD = DIM*jnrD;
1368 /* load j atom coordinates */
1369 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1370 x+j_coord_offsetC,x+j_coord_offsetD,
1371 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1373 /* Calculate displacement vector */
1374 dx00 = _mm_sub_ps(ix0,jx0);
1375 dy00 = _mm_sub_ps(iy0,jy0);
1376 dz00 = _mm_sub_ps(iz0,jz0);
1377 dx01 = _mm_sub_ps(ix0,jx1);
1378 dy01 = _mm_sub_ps(iy0,jy1);
1379 dz01 = _mm_sub_ps(iz0,jz1);
1380 dx02 = _mm_sub_ps(ix0,jx2);
1381 dy02 = _mm_sub_ps(iy0,jy2);
1382 dz02 = _mm_sub_ps(iz0,jz2);
1383 dx10 = _mm_sub_ps(ix1,jx0);
1384 dy10 = _mm_sub_ps(iy1,jy0);
1385 dz10 = _mm_sub_ps(iz1,jz0);
1386 dx11 = _mm_sub_ps(ix1,jx1);
1387 dy11 = _mm_sub_ps(iy1,jy1);
1388 dz11 = _mm_sub_ps(iz1,jz1);
1389 dx12 = _mm_sub_ps(ix1,jx2);
1390 dy12 = _mm_sub_ps(iy1,jy2);
1391 dz12 = _mm_sub_ps(iz1,jz2);
1392 dx20 = _mm_sub_ps(ix2,jx0);
1393 dy20 = _mm_sub_ps(iy2,jy0);
1394 dz20 = _mm_sub_ps(iz2,jz0);
1395 dx21 = _mm_sub_ps(ix2,jx1);
1396 dy21 = _mm_sub_ps(iy2,jy1);
1397 dz21 = _mm_sub_ps(iz2,jz1);
1398 dx22 = _mm_sub_ps(ix2,jx2);
1399 dy22 = _mm_sub_ps(iy2,jy2);
1400 dz22 = _mm_sub_ps(iz2,jz2);
1402 /* Calculate squared distance and things based on it */
1403 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1404 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1405 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1406 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1407 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1408 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1409 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1410 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1411 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1413 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1414 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1415 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1416 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1417 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1418 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1419 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1420 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1421 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1423 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1424 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1425 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1426 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1427 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1428 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1429 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1430 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1431 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1433 fjx0 = _mm_setzero_ps();
1434 fjy0 = _mm_setzero_ps();
1435 fjz0 = _mm_setzero_ps();
1436 fjx1 = _mm_setzero_ps();
1437 fjy1 = _mm_setzero_ps();
1438 fjz1 = _mm_setzero_ps();
1439 fjx2 = _mm_setzero_ps();
1440 fjy2 = _mm_setzero_ps();
1441 fjz2 = _mm_setzero_ps();
1443 /**************************
1444 * CALCULATE INTERACTIONS *
1445 **************************/
1447 if (gmx_mm_any_lt(rsq00,rcutoff2))
1450 r00 = _mm_mul_ps(rsq00,rinv00);
1452 /* REACTION-FIELD ELECTROSTATICS */
1453 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1455 /* LENNARD-JONES DISPERSION/REPULSION */
1457 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1458 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1459 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1460 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1461 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1463 d = _mm_sub_ps(r00,rswitch);
1464 d = _mm_max_ps(d,_mm_setzero_ps());
1465 d2 = _mm_mul_ps(d,d);
1466 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1468 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1470 /* Evaluate switch function */
1471 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1472 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1473 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1475 fscal = _mm_add_ps(felec,fvdw);
1477 fscal = _mm_and_ps(fscal,cutoff_mask);
1479 /* Calculate temporary vectorial force */
1480 tx = _mm_mul_ps(fscal,dx00);
1481 ty = _mm_mul_ps(fscal,dy00);
1482 tz = _mm_mul_ps(fscal,dz00);
1484 /* Update vectorial force */
1485 fix0 = _mm_add_ps(fix0,tx);
1486 fiy0 = _mm_add_ps(fiy0,ty);
1487 fiz0 = _mm_add_ps(fiz0,tz);
1489 fjx0 = _mm_add_ps(fjx0,tx);
1490 fjy0 = _mm_add_ps(fjy0,ty);
1491 fjz0 = _mm_add_ps(fjz0,tz);
1495 /**************************
1496 * CALCULATE INTERACTIONS *
1497 **************************/
1499 if (gmx_mm_any_lt(rsq01,rcutoff2))
1502 /* REACTION-FIELD ELECTROSTATICS */
1503 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1505 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1509 fscal = _mm_and_ps(fscal,cutoff_mask);
1511 /* Calculate temporary vectorial force */
1512 tx = _mm_mul_ps(fscal,dx01);
1513 ty = _mm_mul_ps(fscal,dy01);
1514 tz = _mm_mul_ps(fscal,dz01);
1516 /* Update vectorial force */
1517 fix0 = _mm_add_ps(fix0,tx);
1518 fiy0 = _mm_add_ps(fiy0,ty);
1519 fiz0 = _mm_add_ps(fiz0,tz);
1521 fjx1 = _mm_add_ps(fjx1,tx);
1522 fjy1 = _mm_add_ps(fjy1,ty);
1523 fjz1 = _mm_add_ps(fjz1,tz);
1527 /**************************
1528 * CALCULATE INTERACTIONS *
1529 **************************/
1531 if (gmx_mm_any_lt(rsq02,rcutoff2))
1534 /* REACTION-FIELD ELECTROSTATICS */
1535 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1537 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1541 fscal = _mm_and_ps(fscal,cutoff_mask);
1543 /* Calculate temporary vectorial force */
1544 tx = _mm_mul_ps(fscal,dx02);
1545 ty = _mm_mul_ps(fscal,dy02);
1546 tz = _mm_mul_ps(fscal,dz02);
1548 /* Update vectorial force */
1549 fix0 = _mm_add_ps(fix0,tx);
1550 fiy0 = _mm_add_ps(fiy0,ty);
1551 fiz0 = _mm_add_ps(fiz0,tz);
1553 fjx2 = _mm_add_ps(fjx2,tx);
1554 fjy2 = _mm_add_ps(fjy2,ty);
1555 fjz2 = _mm_add_ps(fjz2,tz);
1559 /**************************
1560 * CALCULATE INTERACTIONS *
1561 **************************/
1563 if (gmx_mm_any_lt(rsq10,rcutoff2))
1566 /* REACTION-FIELD ELECTROSTATICS */
1567 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1569 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1573 fscal = _mm_and_ps(fscal,cutoff_mask);
1575 /* Calculate temporary vectorial force */
1576 tx = _mm_mul_ps(fscal,dx10);
1577 ty = _mm_mul_ps(fscal,dy10);
1578 tz = _mm_mul_ps(fscal,dz10);
1580 /* Update vectorial force */
1581 fix1 = _mm_add_ps(fix1,tx);
1582 fiy1 = _mm_add_ps(fiy1,ty);
1583 fiz1 = _mm_add_ps(fiz1,tz);
1585 fjx0 = _mm_add_ps(fjx0,tx);
1586 fjy0 = _mm_add_ps(fjy0,ty);
1587 fjz0 = _mm_add_ps(fjz0,tz);
1591 /**************************
1592 * CALCULATE INTERACTIONS *
1593 **************************/
1595 if (gmx_mm_any_lt(rsq11,rcutoff2))
1598 /* REACTION-FIELD ELECTROSTATICS */
1599 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1601 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1605 fscal = _mm_and_ps(fscal,cutoff_mask);
1607 /* Calculate temporary vectorial force */
1608 tx = _mm_mul_ps(fscal,dx11);
1609 ty = _mm_mul_ps(fscal,dy11);
1610 tz = _mm_mul_ps(fscal,dz11);
1612 /* Update vectorial force */
1613 fix1 = _mm_add_ps(fix1,tx);
1614 fiy1 = _mm_add_ps(fiy1,ty);
1615 fiz1 = _mm_add_ps(fiz1,tz);
1617 fjx1 = _mm_add_ps(fjx1,tx);
1618 fjy1 = _mm_add_ps(fjy1,ty);
1619 fjz1 = _mm_add_ps(fjz1,tz);
1623 /**************************
1624 * CALCULATE INTERACTIONS *
1625 **************************/
1627 if (gmx_mm_any_lt(rsq12,rcutoff2))
1630 /* REACTION-FIELD ELECTROSTATICS */
1631 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1633 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1637 fscal = _mm_and_ps(fscal,cutoff_mask);
1639 /* Calculate temporary vectorial force */
1640 tx = _mm_mul_ps(fscal,dx12);
1641 ty = _mm_mul_ps(fscal,dy12);
1642 tz = _mm_mul_ps(fscal,dz12);
1644 /* Update vectorial force */
1645 fix1 = _mm_add_ps(fix1,tx);
1646 fiy1 = _mm_add_ps(fiy1,ty);
1647 fiz1 = _mm_add_ps(fiz1,tz);
1649 fjx2 = _mm_add_ps(fjx2,tx);
1650 fjy2 = _mm_add_ps(fjy2,ty);
1651 fjz2 = _mm_add_ps(fjz2,tz);
1655 /**************************
1656 * CALCULATE INTERACTIONS *
1657 **************************/
1659 if (gmx_mm_any_lt(rsq20,rcutoff2))
1662 /* REACTION-FIELD ELECTROSTATICS */
1663 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1665 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1669 fscal = _mm_and_ps(fscal,cutoff_mask);
1671 /* Calculate temporary vectorial force */
1672 tx = _mm_mul_ps(fscal,dx20);
1673 ty = _mm_mul_ps(fscal,dy20);
1674 tz = _mm_mul_ps(fscal,dz20);
1676 /* Update vectorial force */
1677 fix2 = _mm_add_ps(fix2,tx);
1678 fiy2 = _mm_add_ps(fiy2,ty);
1679 fiz2 = _mm_add_ps(fiz2,tz);
1681 fjx0 = _mm_add_ps(fjx0,tx);
1682 fjy0 = _mm_add_ps(fjy0,ty);
1683 fjz0 = _mm_add_ps(fjz0,tz);
1687 /**************************
1688 * CALCULATE INTERACTIONS *
1689 **************************/
1691 if (gmx_mm_any_lt(rsq21,rcutoff2))
1694 /* REACTION-FIELD ELECTROSTATICS */
1695 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1697 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1701 fscal = _mm_and_ps(fscal,cutoff_mask);
1703 /* Calculate temporary vectorial force */
1704 tx = _mm_mul_ps(fscal,dx21);
1705 ty = _mm_mul_ps(fscal,dy21);
1706 tz = _mm_mul_ps(fscal,dz21);
1708 /* Update vectorial force */
1709 fix2 = _mm_add_ps(fix2,tx);
1710 fiy2 = _mm_add_ps(fiy2,ty);
1711 fiz2 = _mm_add_ps(fiz2,tz);
1713 fjx1 = _mm_add_ps(fjx1,tx);
1714 fjy1 = _mm_add_ps(fjy1,ty);
1715 fjz1 = _mm_add_ps(fjz1,tz);
1719 /**************************
1720 * CALCULATE INTERACTIONS *
1721 **************************/
1723 if (gmx_mm_any_lt(rsq22,rcutoff2))
1726 /* REACTION-FIELD ELECTROSTATICS */
1727 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1729 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1733 fscal = _mm_and_ps(fscal,cutoff_mask);
1735 /* Calculate temporary vectorial force */
1736 tx = _mm_mul_ps(fscal,dx22);
1737 ty = _mm_mul_ps(fscal,dy22);
1738 tz = _mm_mul_ps(fscal,dz22);
1740 /* Update vectorial force */
1741 fix2 = _mm_add_ps(fix2,tx);
1742 fiy2 = _mm_add_ps(fiy2,ty);
1743 fiz2 = _mm_add_ps(fiz2,tz);
1745 fjx2 = _mm_add_ps(fjx2,tx);
1746 fjy2 = _mm_add_ps(fjy2,ty);
1747 fjz2 = _mm_add_ps(fjz2,tz);
1751 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1752 f+j_coord_offsetC,f+j_coord_offsetD,
1753 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1755 /* Inner loop uses 301 flops */
1758 if(jidx<j_index_end)
1761 /* Get j neighbor index, and coordinate index */
1763 jnrB = jjnr[jidx+1];
1764 jnrC = jjnr[jidx+2];
1765 jnrD = jjnr[jidx+3];
1767 /* Sign of each element will be negative for non-real atoms.
1768 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1769 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1771 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1772 jnrA = (jnrA>=0) ? jnrA : 0;
1773 jnrB = (jnrB>=0) ? jnrB : 0;
1774 jnrC = (jnrC>=0) ? jnrC : 0;
1775 jnrD = (jnrD>=0) ? jnrD : 0;
1777 j_coord_offsetA = DIM*jnrA;
1778 j_coord_offsetB = DIM*jnrB;
1779 j_coord_offsetC = DIM*jnrC;
1780 j_coord_offsetD = DIM*jnrD;
1782 /* load j atom coordinates */
1783 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1784 x+j_coord_offsetC,x+j_coord_offsetD,
1785 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1787 /* Calculate displacement vector */
1788 dx00 = _mm_sub_ps(ix0,jx0);
1789 dy00 = _mm_sub_ps(iy0,jy0);
1790 dz00 = _mm_sub_ps(iz0,jz0);
1791 dx01 = _mm_sub_ps(ix0,jx1);
1792 dy01 = _mm_sub_ps(iy0,jy1);
1793 dz01 = _mm_sub_ps(iz0,jz1);
1794 dx02 = _mm_sub_ps(ix0,jx2);
1795 dy02 = _mm_sub_ps(iy0,jy2);
1796 dz02 = _mm_sub_ps(iz0,jz2);
1797 dx10 = _mm_sub_ps(ix1,jx0);
1798 dy10 = _mm_sub_ps(iy1,jy0);
1799 dz10 = _mm_sub_ps(iz1,jz0);
1800 dx11 = _mm_sub_ps(ix1,jx1);
1801 dy11 = _mm_sub_ps(iy1,jy1);
1802 dz11 = _mm_sub_ps(iz1,jz1);
1803 dx12 = _mm_sub_ps(ix1,jx2);
1804 dy12 = _mm_sub_ps(iy1,jy2);
1805 dz12 = _mm_sub_ps(iz1,jz2);
1806 dx20 = _mm_sub_ps(ix2,jx0);
1807 dy20 = _mm_sub_ps(iy2,jy0);
1808 dz20 = _mm_sub_ps(iz2,jz0);
1809 dx21 = _mm_sub_ps(ix2,jx1);
1810 dy21 = _mm_sub_ps(iy2,jy1);
1811 dz21 = _mm_sub_ps(iz2,jz1);
1812 dx22 = _mm_sub_ps(ix2,jx2);
1813 dy22 = _mm_sub_ps(iy2,jy2);
1814 dz22 = _mm_sub_ps(iz2,jz2);
1816 /* Calculate squared distance and things based on it */
1817 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1818 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1819 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1820 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1821 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1822 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1823 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1824 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1825 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1827 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1828 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1829 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1830 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1831 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1832 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1833 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1834 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1835 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1837 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1838 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1839 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1840 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1841 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1842 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1843 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1844 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1845 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1847 fjx0 = _mm_setzero_ps();
1848 fjy0 = _mm_setzero_ps();
1849 fjz0 = _mm_setzero_ps();
1850 fjx1 = _mm_setzero_ps();
1851 fjy1 = _mm_setzero_ps();
1852 fjz1 = _mm_setzero_ps();
1853 fjx2 = _mm_setzero_ps();
1854 fjy2 = _mm_setzero_ps();
1855 fjz2 = _mm_setzero_ps();
1857 /**************************
1858 * CALCULATE INTERACTIONS *
1859 **************************/
1861 if (gmx_mm_any_lt(rsq00,rcutoff2))
1864 r00 = _mm_mul_ps(rsq00,rinv00);
1865 r00 = _mm_andnot_ps(dummy_mask,r00);
1867 /* REACTION-FIELD ELECTROSTATICS */
1868 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1870 /* LENNARD-JONES DISPERSION/REPULSION */
1872 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1873 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1874 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1875 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1876 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1878 d = _mm_sub_ps(r00,rswitch);
1879 d = _mm_max_ps(d,_mm_setzero_ps());
1880 d2 = _mm_mul_ps(d,d);
1881 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1883 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1885 /* Evaluate switch function */
1886 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1887 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1888 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1890 fscal = _mm_add_ps(felec,fvdw);
1892 fscal = _mm_and_ps(fscal,cutoff_mask);
1894 fscal = _mm_andnot_ps(dummy_mask,fscal);
1896 /* Calculate temporary vectorial force */
1897 tx = _mm_mul_ps(fscal,dx00);
1898 ty = _mm_mul_ps(fscal,dy00);
1899 tz = _mm_mul_ps(fscal,dz00);
1901 /* Update vectorial force */
1902 fix0 = _mm_add_ps(fix0,tx);
1903 fiy0 = _mm_add_ps(fiy0,ty);
1904 fiz0 = _mm_add_ps(fiz0,tz);
1906 fjx0 = _mm_add_ps(fjx0,tx);
1907 fjy0 = _mm_add_ps(fjy0,ty);
1908 fjz0 = _mm_add_ps(fjz0,tz);
1912 /**************************
1913 * CALCULATE INTERACTIONS *
1914 **************************/
1916 if (gmx_mm_any_lt(rsq01,rcutoff2))
1919 /* REACTION-FIELD ELECTROSTATICS */
1920 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1922 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1926 fscal = _mm_and_ps(fscal,cutoff_mask);
1928 fscal = _mm_andnot_ps(dummy_mask,fscal);
1930 /* Calculate temporary vectorial force */
1931 tx = _mm_mul_ps(fscal,dx01);
1932 ty = _mm_mul_ps(fscal,dy01);
1933 tz = _mm_mul_ps(fscal,dz01);
1935 /* Update vectorial force */
1936 fix0 = _mm_add_ps(fix0,tx);
1937 fiy0 = _mm_add_ps(fiy0,ty);
1938 fiz0 = _mm_add_ps(fiz0,tz);
1940 fjx1 = _mm_add_ps(fjx1,tx);
1941 fjy1 = _mm_add_ps(fjy1,ty);
1942 fjz1 = _mm_add_ps(fjz1,tz);
1946 /**************************
1947 * CALCULATE INTERACTIONS *
1948 **************************/
1950 if (gmx_mm_any_lt(rsq02,rcutoff2))
1953 /* REACTION-FIELD ELECTROSTATICS */
1954 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1956 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1960 fscal = _mm_and_ps(fscal,cutoff_mask);
1962 fscal = _mm_andnot_ps(dummy_mask,fscal);
1964 /* Calculate temporary vectorial force */
1965 tx = _mm_mul_ps(fscal,dx02);
1966 ty = _mm_mul_ps(fscal,dy02);
1967 tz = _mm_mul_ps(fscal,dz02);
1969 /* Update vectorial force */
1970 fix0 = _mm_add_ps(fix0,tx);
1971 fiy0 = _mm_add_ps(fiy0,ty);
1972 fiz0 = _mm_add_ps(fiz0,tz);
1974 fjx2 = _mm_add_ps(fjx2,tx);
1975 fjy2 = _mm_add_ps(fjy2,ty);
1976 fjz2 = _mm_add_ps(fjz2,tz);
1980 /**************************
1981 * CALCULATE INTERACTIONS *
1982 **************************/
1984 if (gmx_mm_any_lt(rsq10,rcutoff2))
1987 /* REACTION-FIELD ELECTROSTATICS */
1988 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1990 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1994 fscal = _mm_and_ps(fscal,cutoff_mask);
1996 fscal = _mm_andnot_ps(dummy_mask,fscal);
1998 /* Calculate temporary vectorial force */
1999 tx = _mm_mul_ps(fscal,dx10);
2000 ty = _mm_mul_ps(fscal,dy10);
2001 tz = _mm_mul_ps(fscal,dz10);
2003 /* Update vectorial force */
2004 fix1 = _mm_add_ps(fix1,tx);
2005 fiy1 = _mm_add_ps(fiy1,ty);
2006 fiz1 = _mm_add_ps(fiz1,tz);
2008 fjx0 = _mm_add_ps(fjx0,tx);
2009 fjy0 = _mm_add_ps(fjy0,ty);
2010 fjz0 = _mm_add_ps(fjz0,tz);
2014 /**************************
2015 * CALCULATE INTERACTIONS *
2016 **************************/
2018 if (gmx_mm_any_lt(rsq11,rcutoff2))
2021 /* REACTION-FIELD ELECTROSTATICS */
2022 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2024 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2028 fscal = _mm_and_ps(fscal,cutoff_mask);
2030 fscal = _mm_andnot_ps(dummy_mask,fscal);
2032 /* Calculate temporary vectorial force */
2033 tx = _mm_mul_ps(fscal,dx11);
2034 ty = _mm_mul_ps(fscal,dy11);
2035 tz = _mm_mul_ps(fscal,dz11);
2037 /* Update vectorial force */
2038 fix1 = _mm_add_ps(fix1,tx);
2039 fiy1 = _mm_add_ps(fiy1,ty);
2040 fiz1 = _mm_add_ps(fiz1,tz);
2042 fjx1 = _mm_add_ps(fjx1,tx);
2043 fjy1 = _mm_add_ps(fjy1,ty);
2044 fjz1 = _mm_add_ps(fjz1,tz);
2048 /**************************
2049 * CALCULATE INTERACTIONS *
2050 **************************/
2052 if (gmx_mm_any_lt(rsq12,rcutoff2))
2055 /* REACTION-FIELD ELECTROSTATICS */
2056 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2058 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2062 fscal = _mm_and_ps(fscal,cutoff_mask);
2064 fscal = _mm_andnot_ps(dummy_mask,fscal);
2066 /* Calculate temporary vectorial force */
2067 tx = _mm_mul_ps(fscal,dx12);
2068 ty = _mm_mul_ps(fscal,dy12);
2069 tz = _mm_mul_ps(fscal,dz12);
2071 /* Update vectorial force */
2072 fix1 = _mm_add_ps(fix1,tx);
2073 fiy1 = _mm_add_ps(fiy1,ty);
2074 fiz1 = _mm_add_ps(fiz1,tz);
2076 fjx2 = _mm_add_ps(fjx2,tx);
2077 fjy2 = _mm_add_ps(fjy2,ty);
2078 fjz2 = _mm_add_ps(fjz2,tz);
2082 /**************************
2083 * CALCULATE INTERACTIONS *
2084 **************************/
2086 if (gmx_mm_any_lt(rsq20,rcutoff2))
2089 /* REACTION-FIELD ELECTROSTATICS */
2090 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
2092 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2096 fscal = _mm_and_ps(fscal,cutoff_mask);
2098 fscal = _mm_andnot_ps(dummy_mask,fscal);
2100 /* Calculate temporary vectorial force */
2101 tx = _mm_mul_ps(fscal,dx20);
2102 ty = _mm_mul_ps(fscal,dy20);
2103 tz = _mm_mul_ps(fscal,dz20);
2105 /* Update vectorial force */
2106 fix2 = _mm_add_ps(fix2,tx);
2107 fiy2 = _mm_add_ps(fiy2,ty);
2108 fiz2 = _mm_add_ps(fiz2,tz);
2110 fjx0 = _mm_add_ps(fjx0,tx);
2111 fjy0 = _mm_add_ps(fjy0,ty);
2112 fjz0 = _mm_add_ps(fjz0,tz);
2116 /**************************
2117 * CALCULATE INTERACTIONS *
2118 **************************/
2120 if (gmx_mm_any_lt(rsq21,rcutoff2))
2123 /* REACTION-FIELD ELECTROSTATICS */
2124 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2126 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2130 fscal = _mm_and_ps(fscal,cutoff_mask);
2132 fscal = _mm_andnot_ps(dummy_mask,fscal);
2134 /* Calculate temporary vectorial force */
2135 tx = _mm_mul_ps(fscal,dx21);
2136 ty = _mm_mul_ps(fscal,dy21);
2137 tz = _mm_mul_ps(fscal,dz21);
2139 /* Update vectorial force */
2140 fix2 = _mm_add_ps(fix2,tx);
2141 fiy2 = _mm_add_ps(fiy2,ty);
2142 fiz2 = _mm_add_ps(fiz2,tz);
2144 fjx1 = _mm_add_ps(fjx1,tx);
2145 fjy1 = _mm_add_ps(fjy1,ty);
2146 fjz1 = _mm_add_ps(fjz1,tz);
2150 /**************************
2151 * CALCULATE INTERACTIONS *
2152 **************************/
2154 if (gmx_mm_any_lt(rsq22,rcutoff2))
2157 /* REACTION-FIELD ELECTROSTATICS */
2158 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2160 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2164 fscal = _mm_and_ps(fscal,cutoff_mask);
2166 fscal = _mm_andnot_ps(dummy_mask,fscal);
2168 /* Calculate temporary vectorial force */
2169 tx = _mm_mul_ps(fscal,dx22);
2170 ty = _mm_mul_ps(fscal,dy22);
2171 tz = _mm_mul_ps(fscal,dz22);
2173 /* Update vectorial force */
2174 fix2 = _mm_add_ps(fix2,tx);
2175 fiy2 = _mm_add_ps(fiy2,ty);
2176 fiz2 = _mm_add_ps(fiz2,tz);
2178 fjx2 = _mm_add_ps(fjx2,tx);
2179 fjy2 = _mm_add_ps(fjy2,ty);
2180 fjz2 = _mm_add_ps(fjz2,tz);
2184 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2185 f+j_coord_offsetC,f+j_coord_offsetD,
2186 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2188 /* Inner loop uses 302 flops */
2191 /* End of innermost loop */
2193 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2194 f+i_coord_offset,fshift+i_shift_offset);
2196 /* Increment number of inner iterations */
2197 inneriter += j_index_end - j_index_start;
2199 /* Outer loop uses 27 flops */
2202 /* Increment number of outer iterations */
2205 /* Update outer/inner flops */
2207 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*302);