2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
98 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
102 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
104 __m128i ifour = _mm_set1_epi32(4);
105 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
107 __m128 dummy_mask,cutoff_mask;
108 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
109 __m128 one = _mm_set1_ps(1.0);
110 __m128 two = _mm_set1_ps(2.0);
116 jindex = nlist->jindex;
118 shiftidx = nlist->shift;
120 shiftvec = fr->shift_vec[0];
121 fshift = fr->fshift[0];
122 facel = _mm_set1_ps(fr->epsfac);
123 charge = mdatoms->chargeA;
124 krf = _mm_set1_ps(fr->ic->k_rf);
125 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
126 crf = _mm_set1_ps(fr->ic->c_rf);
127 nvdwtype = fr->ntype;
129 vdwtype = mdatoms->typeA;
131 vftab = kernel_data->table_vdw->data;
132 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
134 /* Setup water-specific parameters */
135 inr = nlist->iinr[0];
136 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
139 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
141 jq1 = _mm_set1_ps(charge[inr+1]);
142 jq2 = _mm_set1_ps(charge[inr+2]);
143 jq3 = _mm_set1_ps(charge[inr+3]);
144 vdwjidx0A = 2*vdwtype[inr+0];
145 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
146 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
147 qq11 = _mm_mul_ps(iq1,jq1);
148 qq12 = _mm_mul_ps(iq1,jq2);
149 qq13 = _mm_mul_ps(iq1,jq3);
150 qq21 = _mm_mul_ps(iq2,jq1);
151 qq22 = _mm_mul_ps(iq2,jq2);
152 qq23 = _mm_mul_ps(iq2,jq3);
153 qq31 = _mm_mul_ps(iq3,jq1);
154 qq32 = _mm_mul_ps(iq3,jq2);
155 qq33 = _mm_mul_ps(iq3,jq3);
157 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
158 rcutoff_scalar = fr->rcoulomb;
159 rcutoff = _mm_set1_ps(rcutoff_scalar);
160 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
162 /* Avoid stupid compiler warnings */
163 jnrA = jnrB = jnrC = jnrD = 0;
172 for(iidx=0;iidx<4*DIM;iidx++)
177 /* Start outer loop over neighborlists */
178 for(iidx=0; iidx<nri; iidx++)
180 /* Load shift vector for this list */
181 i_shift_offset = DIM*shiftidx[iidx];
183 /* Load limits for loop over neighbors */
184 j_index_start = jindex[iidx];
185 j_index_end = jindex[iidx+1];
187 /* Get outer coordinate index */
189 i_coord_offset = DIM*inr;
191 /* Load i particle coords and add shift vector */
192 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
195 fix0 = _mm_setzero_ps();
196 fiy0 = _mm_setzero_ps();
197 fiz0 = _mm_setzero_ps();
198 fix1 = _mm_setzero_ps();
199 fiy1 = _mm_setzero_ps();
200 fiz1 = _mm_setzero_ps();
201 fix2 = _mm_setzero_ps();
202 fiy2 = _mm_setzero_ps();
203 fiz2 = _mm_setzero_ps();
204 fix3 = _mm_setzero_ps();
205 fiy3 = _mm_setzero_ps();
206 fiz3 = _mm_setzero_ps();
208 /* Reset potential sums */
209 velecsum = _mm_setzero_ps();
210 vvdwsum = _mm_setzero_ps();
212 /* Start inner kernel loop */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
216 /* Get j neighbor index, and coordinate index */
221 j_coord_offsetA = DIM*jnrA;
222 j_coord_offsetB = DIM*jnrB;
223 j_coord_offsetC = DIM*jnrC;
224 j_coord_offsetD = DIM*jnrD;
226 /* load j atom coordinates */
227 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228 x+j_coord_offsetC,x+j_coord_offsetD,
229 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
230 &jy2,&jz2,&jx3,&jy3,&jz3);
232 /* Calculate displacement vector */
233 dx00 = _mm_sub_ps(ix0,jx0);
234 dy00 = _mm_sub_ps(iy0,jy0);
235 dz00 = _mm_sub_ps(iz0,jz0);
236 dx11 = _mm_sub_ps(ix1,jx1);
237 dy11 = _mm_sub_ps(iy1,jy1);
238 dz11 = _mm_sub_ps(iz1,jz1);
239 dx12 = _mm_sub_ps(ix1,jx2);
240 dy12 = _mm_sub_ps(iy1,jy2);
241 dz12 = _mm_sub_ps(iz1,jz2);
242 dx13 = _mm_sub_ps(ix1,jx3);
243 dy13 = _mm_sub_ps(iy1,jy3);
244 dz13 = _mm_sub_ps(iz1,jz3);
245 dx21 = _mm_sub_ps(ix2,jx1);
246 dy21 = _mm_sub_ps(iy2,jy1);
247 dz21 = _mm_sub_ps(iz2,jz1);
248 dx22 = _mm_sub_ps(ix2,jx2);
249 dy22 = _mm_sub_ps(iy2,jy2);
250 dz22 = _mm_sub_ps(iz2,jz2);
251 dx23 = _mm_sub_ps(ix2,jx3);
252 dy23 = _mm_sub_ps(iy2,jy3);
253 dz23 = _mm_sub_ps(iz2,jz3);
254 dx31 = _mm_sub_ps(ix3,jx1);
255 dy31 = _mm_sub_ps(iy3,jy1);
256 dz31 = _mm_sub_ps(iz3,jz1);
257 dx32 = _mm_sub_ps(ix3,jx2);
258 dy32 = _mm_sub_ps(iy3,jy2);
259 dz32 = _mm_sub_ps(iz3,jz2);
260 dx33 = _mm_sub_ps(ix3,jx3);
261 dy33 = _mm_sub_ps(iy3,jy3);
262 dz33 = _mm_sub_ps(iz3,jz3);
264 /* Calculate squared distance and things based on it */
265 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
266 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
267 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
268 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
269 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
270 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
271 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
272 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
273 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
274 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
276 rinv00 = gmx_mm_invsqrt_ps(rsq00);
277 rinv11 = gmx_mm_invsqrt_ps(rsq11);
278 rinv12 = gmx_mm_invsqrt_ps(rsq12);
279 rinv13 = gmx_mm_invsqrt_ps(rsq13);
280 rinv21 = gmx_mm_invsqrt_ps(rsq21);
281 rinv22 = gmx_mm_invsqrt_ps(rsq22);
282 rinv23 = gmx_mm_invsqrt_ps(rsq23);
283 rinv31 = gmx_mm_invsqrt_ps(rsq31);
284 rinv32 = gmx_mm_invsqrt_ps(rsq32);
285 rinv33 = gmx_mm_invsqrt_ps(rsq33);
287 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
288 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
289 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
290 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
291 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
292 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
293 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
294 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
295 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
297 fjx0 = _mm_setzero_ps();
298 fjy0 = _mm_setzero_ps();
299 fjz0 = _mm_setzero_ps();
300 fjx1 = _mm_setzero_ps();
301 fjy1 = _mm_setzero_ps();
302 fjz1 = _mm_setzero_ps();
303 fjx2 = _mm_setzero_ps();
304 fjy2 = _mm_setzero_ps();
305 fjz2 = _mm_setzero_ps();
306 fjx3 = _mm_setzero_ps();
307 fjy3 = _mm_setzero_ps();
308 fjz3 = _mm_setzero_ps();
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 if (gmx_mm_any_lt(rsq00,rcutoff2))
317 r00 = _mm_mul_ps(rsq00,rinv00);
319 /* Calculate table index by multiplying r with table scale and truncate to integer */
320 rt = _mm_mul_ps(r00,vftabscale);
321 vfitab = _mm_cvttps_epi32(rt);
322 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
323 vfitab = _mm_slli_epi32(vfitab,3);
325 /* CUBIC SPLINE TABLE DISPERSION */
326 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
327 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
328 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
329 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
330 _MM_TRANSPOSE4_PS(Y,F,G,H);
331 Heps = _mm_mul_ps(vfeps,H);
332 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
333 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
334 vvdw6 = _mm_mul_ps(c6_00,VV);
335 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
336 fvdw6 = _mm_mul_ps(c6_00,FF);
338 /* CUBIC SPLINE TABLE REPULSION */
339 vfitab = _mm_add_epi32(vfitab,ifour);
340 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
341 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
342 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
343 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
344 _MM_TRANSPOSE4_PS(Y,F,G,H);
345 Heps = _mm_mul_ps(vfeps,H);
346 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
347 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
348 vvdw12 = _mm_mul_ps(c12_00,VV);
349 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
350 fvdw12 = _mm_mul_ps(c12_00,FF);
351 vvdw = _mm_add_ps(vvdw12,vvdw6);
352 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
354 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 vvdw = _mm_and_ps(vvdw,cutoff_mask);
358 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
362 fscal = _mm_and_ps(fscal,cutoff_mask);
364 /* Calculate temporary vectorial force */
365 tx = _mm_mul_ps(fscal,dx00);
366 ty = _mm_mul_ps(fscal,dy00);
367 tz = _mm_mul_ps(fscal,dz00);
369 /* Update vectorial force */
370 fix0 = _mm_add_ps(fix0,tx);
371 fiy0 = _mm_add_ps(fiy0,ty);
372 fiz0 = _mm_add_ps(fiz0,tz);
374 fjx0 = _mm_add_ps(fjx0,tx);
375 fjy0 = _mm_add_ps(fjy0,ty);
376 fjz0 = _mm_add_ps(fjz0,tz);
380 /**************************
381 * CALCULATE INTERACTIONS *
382 **************************/
384 if (gmx_mm_any_lt(rsq11,rcutoff2))
387 /* REACTION-FIELD ELECTROSTATICS */
388 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
389 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
391 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velec = _mm_and_ps(velec,cutoff_mask);
395 velecsum = _mm_add_ps(velecsum,velec);
399 fscal = _mm_and_ps(fscal,cutoff_mask);
401 /* Calculate temporary vectorial force */
402 tx = _mm_mul_ps(fscal,dx11);
403 ty = _mm_mul_ps(fscal,dy11);
404 tz = _mm_mul_ps(fscal,dz11);
406 /* Update vectorial force */
407 fix1 = _mm_add_ps(fix1,tx);
408 fiy1 = _mm_add_ps(fiy1,ty);
409 fiz1 = _mm_add_ps(fiz1,tz);
411 fjx1 = _mm_add_ps(fjx1,tx);
412 fjy1 = _mm_add_ps(fjy1,ty);
413 fjz1 = _mm_add_ps(fjz1,tz);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 if (gmx_mm_any_lt(rsq12,rcutoff2))
424 /* REACTION-FIELD ELECTROSTATICS */
425 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
426 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
428 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velec = _mm_and_ps(velec,cutoff_mask);
432 velecsum = _mm_add_ps(velecsum,velec);
436 fscal = _mm_and_ps(fscal,cutoff_mask);
438 /* Calculate temporary vectorial force */
439 tx = _mm_mul_ps(fscal,dx12);
440 ty = _mm_mul_ps(fscal,dy12);
441 tz = _mm_mul_ps(fscal,dz12);
443 /* Update vectorial force */
444 fix1 = _mm_add_ps(fix1,tx);
445 fiy1 = _mm_add_ps(fiy1,ty);
446 fiz1 = _mm_add_ps(fiz1,tz);
448 fjx2 = _mm_add_ps(fjx2,tx);
449 fjy2 = _mm_add_ps(fjy2,ty);
450 fjz2 = _mm_add_ps(fjz2,tz);
454 /**************************
455 * CALCULATE INTERACTIONS *
456 **************************/
458 if (gmx_mm_any_lt(rsq13,rcutoff2))
461 /* REACTION-FIELD ELECTROSTATICS */
462 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
463 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
465 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velec = _mm_and_ps(velec,cutoff_mask);
469 velecsum = _mm_add_ps(velecsum,velec);
473 fscal = _mm_and_ps(fscal,cutoff_mask);
475 /* Calculate temporary vectorial force */
476 tx = _mm_mul_ps(fscal,dx13);
477 ty = _mm_mul_ps(fscal,dy13);
478 tz = _mm_mul_ps(fscal,dz13);
480 /* Update vectorial force */
481 fix1 = _mm_add_ps(fix1,tx);
482 fiy1 = _mm_add_ps(fiy1,ty);
483 fiz1 = _mm_add_ps(fiz1,tz);
485 fjx3 = _mm_add_ps(fjx3,tx);
486 fjy3 = _mm_add_ps(fjy3,ty);
487 fjz3 = _mm_add_ps(fjz3,tz);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 if (gmx_mm_any_lt(rsq21,rcutoff2))
498 /* REACTION-FIELD ELECTROSTATICS */
499 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
500 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
502 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
504 /* Update potential sum for this i atom from the interaction with this j atom. */
505 velec = _mm_and_ps(velec,cutoff_mask);
506 velecsum = _mm_add_ps(velecsum,velec);
510 fscal = _mm_and_ps(fscal,cutoff_mask);
512 /* Calculate temporary vectorial force */
513 tx = _mm_mul_ps(fscal,dx21);
514 ty = _mm_mul_ps(fscal,dy21);
515 tz = _mm_mul_ps(fscal,dz21);
517 /* Update vectorial force */
518 fix2 = _mm_add_ps(fix2,tx);
519 fiy2 = _mm_add_ps(fiy2,ty);
520 fiz2 = _mm_add_ps(fiz2,tz);
522 fjx1 = _mm_add_ps(fjx1,tx);
523 fjy1 = _mm_add_ps(fjy1,ty);
524 fjz1 = _mm_add_ps(fjz1,tz);
528 /**************************
529 * CALCULATE INTERACTIONS *
530 **************************/
532 if (gmx_mm_any_lt(rsq22,rcutoff2))
535 /* REACTION-FIELD ELECTROSTATICS */
536 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
537 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
539 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
541 /* Update potential sum for this i atom from the interaction with this j atom. */
542 velec = _mm_and_ps(velec,cutoff_mask);
543 velecsum = _mm_add_ps(velecsum,velec);
547 fscal = _mm_and_ps(fscal,cutoff_mask);
549 /* Calculate temporary vectorial force */
550 tx = _mm_mul_ps(fscal,dx22);
551 ty = _mm_mul_ps(fscal,dy22);
552 tz = _mm_mul_ps(fscal,dz22);
554 /* Update vectorial force */
555 fix2 = _mm_add_ps(fix2,tx);
556 fiy2 = _mm_add_ps(fiy2,ty);
557 fiz2 = _mm_add_ps(fiz2,tz);
559 fjx2 = _mm_add_ps(fjx2,tx);
560 fjy2 = _mm_add_ps(fjy2,ty);
561 fjz2 = _mm_add_ps(fjz2,tz);
565 /**************************
566 * CALCULATE INTERACTIONS *
567 **************************/
569 if (gmx_mm_any_lt(rsq23,rcutoff2))
572 /* REACTION-FIELD ELECTROSTATICS */
573 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
574 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
576 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
578 /* Update potential sum for this i atom from the interaction with this j atom. */
579 velec = _mm_and_ps(velec,cutoff_mask);
580 velecsum = _mm_add_ps(velecsum,velec);
584 fscal = _mm_and_ps(fscal,cutoff_mask);
586 /* Calculate temporary vectorial force */
587 tx = _mm_mul_ps(fscal,dx23);
588 ty = _mm_mul_ps(fscal,dy23);
589 tz = _mm_mul_ps(fscal,dz23);
591 /* Update vectorial force */
592 fix2 = _mm_add_ps(fix2,tx);
593 fiy2 = _mm_add_ps(fiy2,ty);
594 fiz2 = _mm_add_ps(fiz2,tz);
596 fjx3 = _mm_add_ps(fjx3,tx);
597 fjy3 = _mm_add_ps(fjy3,ty);
598 fjz3 = _mm_add_ps(fjz3,tz);
602 /**************************
603 * CALCULATE INTERACTIONS *
604 **************************/
606 if (gmx_mm_any_lt(rsq31,rcutoff2))
609 /* REACTION-FIELD ELECTROSTATICS */
610 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
611 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
613 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
615 /* Update potential sum for this i atom from the interaction with this j atom. */
616 velec = _mm_and_ps(velec,cutoff_mask);
617 velecsum = _mm_add_ps(velecsum,velec);
621 fscal = _mm_and_ps(fscal,cutoff_mask);
623 /* Calculate temporary vectorial force */
624 tx = _mm_mul_ps(fscal,dx31);
625 ty = _mm_mul_ps(fscal,dy31);
626 tz = _mm_mul_ps(fscal,dz31);
628 /* Update vectorial force */
629 fix3 = _mm_add_ps(fix3,tx);
630 fiy3 = _mm_add_ps(fiy3,ty);
631 fiz3 = _mm_add_ps(fiz3,tz);
633 fjx1 = _mm_add_ps(fjx1,tx);
634 fjy1 = _mm_add_ps(fjy1,ty);
635 fjz1 = _mm_add_ps(fjz1,tz);
639 /**************************
640 * CALCULATE INTERACTIONS *
641 **************************/
643 if (gmx_mm_any_lt(rsq32,rcutoff2))
646 /* REACTION-FIELD ELECTROSTATICS */
647 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
648 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
650 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velec = _mm_and_ps(velec,cutoff_mask);
654 velecsum = _mm_add_ps(velecsum,velec);
658 fscal = _mm_and_ps(fscal,cutoff_mask);
660 /* Calculate temporary vectorial force */
661 tx = _mm_mul_ps(fscal,dx32);
662 ty = _mm_mul_ps(fscal,dy32);
663 tz = _mm_mul_ps(fscal,dz32);
665 /* Update vectorial force */
666 fix3 = _mm_add_ps(fix3,tx);
667 fiy3 = _mm_add_ps(fiy3,ty);
668 fiz3 = _mm_add_ps(fiz3,tz);
670 fjx2 = _mm_add_ps(fjx2,tx);
671 fjy2 = _mm_add_ps(fjy2,ty);
672 fjz2 = _mm_add_ps(fjz2,tz);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 if (gmx_mm_any_lt(rsq33,rcutoff2))
683 /* REACTION-FIELD ELECTROSTATICS */
684 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
685 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
687 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
689 /* Update potential sum for this i atom from the interaction with this j atom. */
690 velec = _mm_and_ps(velec,cutoff_mask);
691 velecsum = _mm_add_ps(velecsum,velec);
695 fscal = _mm_and_ps(fscal,cutoff_mask);
697 /* Calculate temporary vectorial force */
698 tx = _mm_mul_ps(fscal,dx33);
699 ty = _mm_mul_ps(fscal,dy33);
700 tz = _mm_mul_ps(fscal,dz33);
702 /* Update vectorial force */
703 fix3 = _mm_add_ps(fix3,tx);
704 fiy3 = _mm_add_ps(fiy3,ty);
705 fiz3 = _mm_add_ps(fiz3,tz);
707 fjx3 = _mm_add_ps(fjx3,tx);
708 fjy3 = _mm_add_ps(fjy3,ty);
709 fjz3 = _mm_add_ps(fjz3,tz);
713 fjptrA = f+j_coord_offsetA;
714 fjptrB = f+j_coord_offsetB;
715 fjptrC = f+j_coord_offsetC;
716 fjptrD = f+j_coord_offsetD;
718 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
719 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
720 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
722 /* Inner loop uses 387 flops */
728 /* Get j neighbor index, and coordinate index */
729 jnrlistA = jjnr[jidx];
730 jnrlistB = jjnr[jidx+1];
731 jnrlistC = jjnr[jidx+2];
732 jnrlistD = jjnr[jidx+3];
733 /* Sign of each element will be negative for non-real atoms.
734 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
735 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
737 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
738 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
739 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
740 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
741 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
742 j_coord_offsetA = DIM*jnrA;
743 j_coord_offsetB = DIM*jnrB;
744 j_coord_offsetC = DIM*jnrC;
745 j_coord_offsetD = DIM*jnrD;
747 /* load j atom coordinates */
748 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
749 x+j_coord_offsetC,x+j_coord_offsetD,
750 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
751 &jy2,&jz2,&jx3,&jy3,&jz3);
753 /* Calculate displacement vector */
754 dx00 = _mm_sub_ps(ix0,jx0);
755 dy00 = _mm_sub_ps(iy0,jy0);
756 dz00 = _mm_sub_ps(iz0,jz0);
757 dx11 = _mm_sub_ps(ix1,jx1);
758 dy11 = _mm_sub_ps(iy1,jy1);
759 dz11 = _mm_sub_ps(iz1,jz1);
760 dx12 = _mm_sub_ps(ix1,jx2);
761 dy12 = _mm_sub_ps(iy1,jy2);
762 dz12 = _mm_sub_ps(iz1,jz2);
763 dx13 = _mm_sub_ps(ix1,jx3);
764 dy13 = _mm_sub_ps(iy1,jy3);
765 dz13 = _mm_sub_ps(iz1,jz3);
766 dx21 = _mm_sub_ps(ix2,jx1);
767 dy21 = _mm_sub_ps(iy2,jy1);
768 dz21 = _mm_sub_ps(iz2,jz1);
769 dx22 = _mm_sub_ps(ix2,jx2);
770 dy22 = _mm_sub_ps(iy2,jy2);
771 dz22 = _mm_sub_ps(iz2,jz2);
772 dx23 = _mm_sub_ps(ix2,jx3);
773 dy23 = _mm_sub_ps(iy2,jy3);
774 dz23 = _mm_sub_ps(iz2,jz3);
775 dx31 = _mm_sub_ps(ix3,jx1);
776 dy31 = _mm_sub_ps(iy3,jy1);
777 dz31 = _mm_sub_ps(iz3,jz1);
778 dx32 = _mm_sub_ps(ix3,jx2);
779 dy32 = _mm_sub_ps(iy3,jy2);
780 dz32 = _mm_sub_ps(iz3,jz2);
781 dx33 = _mm_sub_ps(ix3,jx3);
782 dy33 = _mm_sub_ps(iy3,jy3);
783 dz33 = _mm_sub_ps(iz3,jz3);
785 /* Calculate squared distance and things based on it */
786 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
787 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
788 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
789 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
790 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
791 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
792 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
793 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
794 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
795 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
797 rinv00 = gmx_mm_invsqrt_ps(rsq00);
798 rinv11 = gmx_mm_invsqrt_ps(rsq11);
799 rinv12 = gmx_mm_invsqrt_ps(rsq12);
800 rinv13 = gmx_mm_invsqrt_ps(rsq13);
801 rinv21 = gmx_mm_invsqrt_ps(rsq21);
802 rinv22 = gmx_mm_invsqrt_ps(rsq22);
803 rinv23 = gmx_mm_invsqrt_ps(rsq23);
804 rinv31 = gmx_mm_invsqrt_ps(rsq31);
805 rinv32 = gmx_mm_invsqrt_ps(rsq32);
806 rinv33 = gmx_mm_invsqrt_ps(rsq33);
808 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
809 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
810 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
811 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
812 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
813 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
814 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
815 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
816 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
818 fjx0 = _mm_setzero_ps();
819 fjy0 = _mm_setzero_ps();
820 fjz0 = _mm_setzero_ps();
821 fjx1 = _mm_setzero_ps();
822 fjy1 = _mm_setzero_ps();
823 fjz1 = _mm_setzero_ps();
824 fjx2 = _mm_setzero_ps();
825 fjy2 = _mm_setzero_ps();
826 fjz2 = _mm_setzero_ps();
827 fjx3 = _mm_setzero_ps();
828 fjy3 = _mm_setzero_ps();
829 fjz3 = _mm_setzero_ps();
831 /**************************
832 * CALCULATE INTERACTIONS *
833 **************************/
835 if (gmx_mm_any_lt(rsq00,rcutoff2))
838 r00 = _mm_mul_ps(rsq00,rinv00);
839 r00 = _mm_andnot_ps(dummy_mask,r00);
841 /* Calculate table index by multiplying r with table scale and truncate to integer */
842 rt = _mm_mul_ps(r00,vftabscale);
843 vfitab = _mm_cvttps_epi32(rt);
844 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
845 vfitab = _mm_slli_epi32(vfitab,3);
847 /* CUBIC SPLINE TABLE DISPERSION */
848 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
849 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
850 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
851 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
852 _MM_TRANSPOSE4_PS(Y,F,G,H);
853 Heps = _mm_mul_ps(vfeps,H);
854 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
855 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
856 vvdw6 = _mm_mul_ps(c6_00,VV);
857 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
858 fvdw6 = _mm_mul_ps(c6_00,FF);
860 /* CUBIC SPLINE TABLE REPULSION */
861 vfitab = _mm_add_epi32(vfitab,ifour);
862 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
863 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
864 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
865 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
866 _MM_TRANSPOSE4_PS(Y,F,G,H);
867 Heps = _mm_mul_ps(vfeps,H);
868 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
869 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
870 vvdw12 = _mm_mul_ps(c12_00,VV);
871 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
872 fvdw12 = _mm_mul_ps(c12_00,FF);
873 vvdw = _mm_add_ps(vvdw12,vvdw6);
874 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
876 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
878 /* Update potential sum for this i atom from the interaction with this j atom. */
879 vvdw = _mm_and_ps(vvdw,cutoff_mask);
880 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
881 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
885 fscal = _mm_and_ps(fscal,cutoff_mask);
887 fscal = _mm_andnot_ps(dummy_mask,fscal);
889 /* Calculate temporary vectorial force */
890 tx = _mm_mul_ps(fscal,dx00);
891 ty = _mm_mul_ps(fscal,dy00);
892 tz = _mm_mul_ps(fscal,dz00);
894 /* Update vectorial force */
895 fix0 = _mm_add_ps(fix0,tx);
896 fiy0 = _mm_add_ps(fiy0,ty);
897 fiz0 = _mm_add_ps(fiz0,tz);
899 fjx0 = _mm_add_ps(fjx0,tx);
900 fjy0 = _mm_add_ps(fjy0,ty);
901 fjz0 = _mm_add_ps(fjz0,tz);
905 /**************************
906 * CALCULATE INTERACTIONS *
907 **************************/
909 if (gmx_mm_any_lt(rsq11,rcutoff2))
912 /* REACTION-FIELD ELECTROSTATICS */
913 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
914 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
916 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
918 /* Update potential sum for this i atom from the interaction with this j atom. */
919 velec = _mm_and_ps(velec,cutoff_mask);
920 velec = _mm_andnot_ps(dummy_mask,velec);
921 velecsum = _mm_add_ps(velecsum,velec);
925 fscal = _mm_and_ps(fscal,cutoff_mask);
927 fscal = _mm_andnot_ps(dummy_mask,fscal);
929 /* Calculate temporary vectorial force */
930 tx = _mm_mul_ps(fscal,dx11);
931 ty = _mm_mul_ps(fscal,dy11);
932 tz = _mm_mul_ps(fscal,dz11);
934 /* Update vectorial force */
935 fix1 = _mm_add_ps(fix1,tx);
936 fiy1 = _mm_add_ps(fiy1,ty);
937 fiz1 = _mm_add_ps(fiz1,tz);
939 fjx1 = _mm_add_ps(fjx1,tx);
940 fjy1 = _mm_add_ps(fjy1,ty);
941 fjz1 = _mm_add_ps(fjz1,tz);
945 /**************************
946 * CALCULATE INTERACTIONS *
947 **************************/
949 if (gmx_mm_any_lt(rsq12,rcutoff2))
952 /* REACTION-FIELD ELECTROSTATICS */
953 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
954 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
956 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
958 /* Update potential sum for this i atom from the interaction with this j atom. */
959 velec = _mm_and_ps(velec,cutoff_mask);
960 velec = _mm_andnot_ps(dummy_mask,velec);
961 velecsum = _mm_add_ps(velecsum,velec);
965 fscal = _mm_and_ps(fscal,cutoff_mask);
967 fscal = _mm_andnot_ps(dummy_mask,fscal);
969 /* Calculate temporary vectorial force */
970 tx = _mm_mul_ps(fscal,dx12);
971 ty = _mm_mul_ps(fscal,dy12);
972 tz = _mm_mul_ps(fscal,dz12);
974 /* Update vectorial force */
975 fix1 = _mm_add_ps(fix1,tx);
976 fiy1 = _mm_add_ps(fiy1,ty);
977 fiz1 = _mm_add_ps(fiz1,tz);
979 fjx2 = _mm_add_ps(fjx2,tx);
980 fjy2 = _mm_add_ps(fjy2,ty);
981 fjz2 = _mm_add_ps(fjz2,tz);
985 /**************************
986 * CALCULATE INTERACTIONS *
987 **************************/
989 if (gmx_mm_any_lt(rsq13,rcutoff2))
992 /* REACTION-FIELD ELECTROSTATICS */
993 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
994 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
996 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
998 /* Update potential sum for this i atom from the interaction with this j atom. */
999 velec = _mm_and_ps(velec,cutoff_mask);
1000 velec = _mm_andnot_ps(dummy_mask,velec);
1001 velecsum = _mm_add_ps(velecsum,velec);
1005 fscal = _mm_and_ps(fscal,cutoff_mask);
1007 fscal = _mm_andnot_ps(dummy_mask,fscal);
1009 /* Calculate temporary vectorial force */
1010 tx = _mm_mul_ps(fscal,dx13);
1011 ty = _mm_mul_ps(fscal,dy13);
1012 tz = _mm_mul_ps(fscal,dz13);
1014 /* Update vectorial force */
1015 fix1 = _mm_add_ps(fix1,tx);
1016 fiy1 = _mm_add_ps(fiy1,ty);
1017 fiz1 = _mm_add_ps(fiz1,tz);
1019 fjx3 = _mm_add_ps(fjx3,tx);
1020 fjy3 = _mm_add_ps(fjy3,ty);
1021 fjz3 = _mm_add_ps(fjz3,tz);
1025 /**************************
1026 * CALCULATE INTERACTIONS *
1027 **************************/
1029 if (gmx_mm_any_lt(rsq21,rcutoff2))
1032 /* REACTION-FIELD ELECTROSTATICS */
1033 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1034 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1036 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1038 /* Update potential sum for this i atom from the interaction with this j atom. */
1039 velec = _mm_and_ps(velec,cutoff_mask);
1040 velec = _mm_andnot_ps(dummy_mask,velec);
1041 velecsum = _mm_add_ps(velecsum,velec);
1045 fscal = _mm_and_ps(fscal,cutoff_mask);
1047 fscal = _mm_andnot_ps(dummy_mask,fscal);
1049 /* Calculate temporary vectorial force */
1050 tx = _mm_mul_ps(fscal,dx21);
1051 ty = _mm_mul_ps(fscal,dy21);
1052 tz = _mm_mul_ps(fscal,dz21);
1054 /* Update vectorial force */
1055 fix2 = _mm_add_ps(fix2,tx);
1056 fiy2 = _mm_add_ps(fiy2,ty);
1057 fiz2 = _mm_add_ps(fiz2,tz);
1059 fjx1 = _mm_add_ps(fjx1,tx);
1060 fjy1 = _mm_add_ps(fjy1,ty);
1061 fjz1 = _mm_add_ps(fjz1,tz);
1065 /**************************
1066 * CALCULATE INTERACTIONS *
1067 **************************/
1069 if (gmx_mm_any_lt(rsq22,rcutoff2))
1072 /* REACTION-FIELD ELECTROSTATICS */
1073 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1074 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1076 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1078 /* Update potential sum for this i atom from the interaction with this j atom. */
1079 velec = _mm_and_ps(velec,cutoff_mask);
1080 velec = _mm_andnot_ps(dummy_mask,velec);
1081 velecsum = _mm_add_ps(velecsum,velec);
1085 fscal = _mm_and_ps(fscal,cutoff_mask);
1087 fscal = _mm_andnot_ps(dummy_mask,fscal);
1089 /* Calculate temporary vectorial force */
1090 tx = _mm_mul_ps(fscal,dx22);
1091 ty = _mm_mul_ps(fscal,dy22);
1092 tz = _mm_mul_ps(fscal,dz22);
1094 /* Update vectorial force */
1095 fix2 = _mm_add_ps(fix2,tx);
1096 fiy2 = _mm_add_ps(fiy2,ty);
1097 fiz2 = _mm_add_ps(fiz2,tz);
1099 fjx2 = _mm_add_ps(fjx2,tx);
1100 fjy2 = _mm_add_ps(fjy2,ty);
1101 fjz2 = _mm_add_ps(fjz2,tz);
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 if (gmx_mm_any_lt(rsq23,rcutoff2))
1112 /* REACTION-FIELD ELECTROSTATICS */
1113 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
1114 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1116 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1118 /* Update potential sum for this i atom from the interaction with this j atom. */
1119 velec = _mm_and_ps(velec,cutoff_mask);
1120 velec = _mm_andnot_ps(dummy_mask,velec);
1121 velecsum = _mm_add_ps(velecsum,velec);
1125 fscal = _mm_and_ps(fscal,cutoff_mask);
1127 fscal = _mm_andnot_ps(dummy_mask,fscal);
1129 /* Calculate temporary vectorial force */
1130 tx = _mm_mul_ps(fscal,dx23);
1131 ty = _mm_mul_ps(fscal,dy23);
1132 tz = _mm_mul_ps(fscal,dz23);
1134 /* Update vectorial force */
1135 fix2 = _mm_add_ps(fix2,tx);
1136 fiy2 = _mm_add_ps(fiy2,ty);
1137 fiz2 = _mm_add_ps(fiz2,tz);
1139 fjx3 = _mm_add_ps(fjx3,tx);
1140 fjy3 = _mm_add_ps(fjy3,ty);
1141 fjz3 = _mm_add_ps(fjz3,tz);
1145 /**************************
1146 * CALCULATE INTERACTIONS *
1147 **************************/
1149 if (gmx_mm_any_lt(rsq31,rcutoff2))
1152 /* REACTION-FIELD ELECTROSTATICS */
1153 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
1154 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1156 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1158 /* Update potential sum for this i atom from the interaction with this j atom. */
1159 velec = _mm_and_ps(velec,cutoff_mask);
1160 velec = _mm_andnot_ps(dummy_mask,velec);
1161 velecsum = _mm_add_ps(velecsum,velec);
1165 fscal = _mm_and_ps(fscal,cutoff_mask);
1167 fscal = _mm_andnot_ps(dummy_mask,fscal);
1169 /* Calculate temporary vectorial force */
1170 tx = _mm_mul_ps(fscal,dx31);
1171 ty = _mm_mul_ps(fscal,dy31);
1172 tz = _mm_mul_ps(fscal,dz31);
1174 /* Update vectorial force */
1175 fix3 = _mm_add_ps(fix3,tx);
1176 fiy3 = _mm_add_ps(fiy3,ty);
1177 fiz3 = _mm_add_ps(fiz3,tz);
1179 fjx1 = _mm_add_ps(fjx1,tx);
1180 fjy1 = _mm_add_ps(fjy1,ty);
1181 fjz1 = _mm_add_ps(fjz1,tz);
1185 /**************************
1186 * CALCULATE INTERACTIONS *
1187 **************************/
1189 if (gmx_mm_any_lt(rsq32,rcutoff2))
1192 /* REACTION-FIELD ELECTROSTATICS */
1193 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1194 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1196 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1198 /* Update potential sum for this i atom from the interaction with this j atom. */
1199 velec = _mm_and_ps(velec,cutoff_mask);
1200 velec = _mm_andnot_ps(dummy_mask,velec);
1201 velecsum = _mm_add_ps(velecsum,velec);
1205 fscal = _mm_and_ps(fscal,cutoff_mask);
1207 fscal = _mm_andnot_ps(dummy_mask,fscal);
1209 /* Calculate temporary vectorial force */
1210 tx = _mm_mul_ps(fscal,dx32);
1211 ty = _mm_mul_ps(fscal,dy32);
1212 tz = _mm_mul_ps(fscal,dz32);
1214 /* Update vectorial force */
1215 fix3 = _mm_add_ps(fix3,tx);
1216 fiy3 = _mm_add_ps(fiy3,ty);
1217 fiz3 = _mm_add_ps(fiz3,tz);
1219 fjx2 = _mm_add_ps(fjx2,tx);
1220 fjy2 = _mm_add_ps(fjy2,ty);
1221 fjz2 = _mm_add_ps(fjz2,tz);
1225 /**************************
1226 * CALCULATE INTERACTIONS *
1227 **************************/
1229 if (gmx_mm_any_lt(rsq33,rcutoff2))
1232 /* REACTION-FIELD ELECTROSTATICS */
1233 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1234 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1236 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1238 /* Update potential sum for this i atom from the interaction with this j atom. */
1239 velec = _mm_and_ps(velec,cutoff_mask);
1240 velec = _mm_andnot_ps(dummy_mask,velec);
1241 velecsum = _mm_add_ps(velecsum,velec);
1245 fscal = _mm_and_ps(fscal,cutoff_mask);
1247 fscal = _mm_andnot_ps(dummy_mask,fscal);
1249 /* Calculate temporary vectorial force */
1250 tx = _mm_mul_ps(fscal,dx33);
1251 ty = _mm_mul_ps(fscal,dy33);
1252 tz = _mm_mul_ps(fscal,dz33);
1254 /* Update vectorial force */
1255 fix3 = _mm_add_ps(fix3,tx);
1256 fiy3 = _mm_add_ps(fiy3,ty);
1257 fiz3 = _mm_add_ps(fiz3,tz);
1259 fjx3 = _mm_add_ps(fjx3,tx);
1260 fjy3 = _mm_add_ps(fjy3,ty);
1261 fjz3 = _mm_add_ps(fjz3,tz);
1265 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1266 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1267 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1268 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1270 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1271 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1272 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1274 /* Inner loop uses 388 flops */
1277 /* End of innermost loop */
1279 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1280 f+i_coord_offset,fshift+i_shift_offset);
1283 /* Update potential energies */
1284 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1285 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1287 /* Increment number of inner iterations */
1288 inneriter += j_index_end - j_index_start;
1290 /* Outer loop uses 26 flops */
1293 /* Increment number of outer iterations */
1296 /* Update outer/inner flops */
1298 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*388);
1301 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
1302 * Electrostatics interaction: ReactionField
1303 * VdW interaction: CubicSplineTable
1304 * Geometry: Water4-Water4
1305 * Calculate force/pot: Force
1308 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
1309 (t_nblist * gmx_restrict nlist,
1310 rvec * gmx_restrict xx,
1311 rvec * gmx_restrict ff,
1312 t_forcerec * gmx_restrict fr,
1313 t_mdatoms * gmx_restrict mdatoms,
1314 nb_kernel_data_t * gmx_restrict kernel_data,
1315 t_nrnb * gmx_restrict nrnb)
1317 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1318 * just 0 for non-waters.
1319 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1320 * jnr indices corresponding to data put in the four positions in the SIMD register.
1322 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1323 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1324 int jnrA,jnrB,jnrC,jnrD;
1325 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1326 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1327 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1328 real rcutoff_scalar;
1329 real *shiftvec,*fshift,*x,*f;
1330 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1331 real scratch[4*DIM];
1332 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1334 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1336 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1338 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1340 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1341 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1342 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1343 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1344 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1345 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1346 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1347 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1348 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1349 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1350 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1351 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1352 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1353 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1354 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1355 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1356 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1357 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1358 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1359 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1362 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1365 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1366 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1368 __m128i ifour = _mm_set1_epi32(4);
1369 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1371 __m128 dummy_mask,cutoff_mask;
1372 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1373 __m128 one = _mm_set1_ps(1.0);
1374 __m128 two = _mm_set1_ps(2.0);
1380 jindex = nlist->jindex;
1382 shiftidx = nlist->shift;
1384 shiftvec = fr->shift_vec[0];
1385 fshift = fr->fshift[0];
1386 facel = _mm_set1_ps(fr->epsfac);
1387 charge = mdatoms->chargeA;
1388 krf = _mm_set1_ps(fr->ic->k_rf);
1389 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1390 crf = _mm_set1_ps(fr->ic->c_rf);
1391 nvdwtype = fr->ntype;
1392 vdwparam = fr->nbfp;
1393 vdwtype = mdatoms->typeA;
1395 vftab = kernel_data->table_vdw->data;
1396 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
1398 /* Setup water-specific parameters */
1399 inr = nlist->iinr[0];
1400 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1401 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1402 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1403 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1405 jq1 = _mm_set1_ps(charge[inr+1]);
1406 jq2 = _mm_set1_ps(charge[inr+2]);
1407 jq3 = _mm_set1_ps(charge[inr+3]);
1408 vdwjidx0A = 2*vdwtype[inr+0];
1409 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1410 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1411 qq11 = _mm_mul_ps(iq1,jq1);
1412 qq12 = _mm_mul_ps(iq1,jq2);
1413 qq13 = _mm_mul_ps(iq1,jq3);
1414 qq21 = _mm_mul_ps(iq2,jq1);
1415 qq22 = _mm_mul_ps(iq2,jq2);
1416 qq23 = _mm_mul_ps(iq2,jq3);
1417 qq31 = _mm_mul_ps(iq3,jq1);
1418 qq32 = _mm_mul_ps(iq3,jq2);
1419 qq33 = _mm_mul_ps(iq3,jq3);
1421 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1422 rcutoff_scalar = fr->rcoulomb;
1423 rcutoff = _mm_set1_ps(rcutoff_scalar);
1424 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1426 /* Avoid stupid compiler warnings */
1427 jnrA = jnrB = jnrC = jnrD = 0;
1428 j_coord_offsetA = 0;
1429 j_coord_offsetB = 0;
1430 j_coord_offsetC = 0;
1431 j_coord_offsetD = 0;
1436 for(iidx=0;iidx<4*DIM;iidx++)
1438 scratch[iidx] = 0.0;
1441 /* Start outer loop over neighborlists */
1442 for(iidx=0; iidx<nri; iidx++)
1444 /* Load shift vector for this list */
1445 i_shift_offset = DIM*shiftidx[iidx];
1447 /* Load limits for loop over neighbors */
1448 j_index_start = jindex[iidx];
1449 j_index_end = jindex[iidx+1];
1451 /* Get outer coordinate index */
1453 i_coord_offset = DIM*inr;
1455 /* Load i particle coords and add shift vector */
1456 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1457 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1459 fix0 = _mm_setzero_ps();
1460 fiy0 = _mm_setzero_ps();
1461 fiz0 = _mm_setzero_ps();
1462 fix1 = _mm_setzero_ps();
1463 fiy1 = _mm_setzero_ps();
1464 fiz1 = _mm_setzero_ps();
1465 fix2 = _mm_setzero_ps();
1466 fiy2 = _mm_setzero_ps();
1467 fiz2 = _mm_setzero_ps();
1468 fix3 = _mm_setzero_ps();
1469 fiy3 = _mm_setzero_ps();
1470 fiz3 = _mm_setzero_ps();
1472 /* Start inner kernel loop */
1473 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1476 /* Get j neighbor index, and coordinate index */
1478 jnrB = jjnr[jidx+1];
1479 jnrC = jjnr[jidx+2];
1480 jnrD = jjnr[jidx+3];
1481 j_coord_offsetA = DIM*jnrA;
1482 j_coord_offsetB = DIM*jnrB;
1483 j_coord_offsetC = DIM*jnrC;
1484 j_coord_offsetD = DIM*jnrD;
1486 /* load j atom coordinates */
1487 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1488 x+j_coord_offsetC,x+j_coord_offsetD,
1489 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1490 &jy2,&jz2,&jx3,&jy3,&jz3);
1492 /* Calculate displacement vector */
1493 dx00 = _mm_sub_ps(ix0,jx0);
1494 dy00 = _mm_sub_ps(iy0,jy0);
1495 dz00 = _mm_sub_ps(iz0,jz0);
1496 dx11 = _mm_sub_ps(ix1,jx1);
1497 dy11 = _mm_sub_ps(iy1,jy1);
1498 dz11 = _mm_sub_ps(iz1,jz1);
1499 dx12 = _mm_sub_ps(ix1,jx2);
1500 dy12 = _mm_sub_ps(iy1,jy2);
1501 dz12 = _mm_sub_ps(iz1,jz2);
1502 dx13 = _mm_sub_ps(ix1,jx3);
1503 dy13 = _mm_sub_ps(iy1,jy3);
1504 dz13 = _mm_sub_ps(iz1,jz3);
1505 dx21 = _mm_sub_ps(ix2,jx1);
1506 dy21 = _mm_sub_ps(iy2,jy1);
1507 dz21 = _mm_sub_ps(iz2,jz1);
1508 dx22 = _mm_sub_ps(ix2,jx2);
1509 dy22 = _mm_sub_ps(iy2,jy2);
1510 dz22 = _mm_sub_ps(iz2,jz2);
1511 dx23 = _mm_sub_ps(ix2,jx3);
1512 dy23 = _mm_sub_ps(iy2,jy3);
1513 dz23 = _mm_sub_ps(iz2,jz3);
1514 dx31 = _mm_sub_ps(ix3,jx1);
1515 dy31 = _mm_sub_ps(iy3,jy1);
1516 dz31 = _mm_sub_ps(iz3,jz1);
1517 dx32 = _mm_sub_ps(ix3,jx2);
1518 dy32 = _mm_sub_ps(iy3,jy2);
1519 dz32 = _mm_sub_ps(iz3,jz2);
1520 dx33 = _mm_sub_ps(ix3,jx3);
1521 dy33 = _mm_sub_ps(iy3,jy3);
1522 dz33 = _mm_sub_ps(iz3,jz3);
1524 /* Calculate squared distance and things based on it */
1525 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1526 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1527 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1528 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1529 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1530 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1531 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1532 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1533 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1534 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1536 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1537 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1538 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1539 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1540 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1541 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1542 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1543 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1544 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1545 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1547 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1548 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1549 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1550 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1551 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1552 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1553 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1554 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1555 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1557 fjx0 = _mm_setzero_ps();
1558 fjy0 = _mm_setzero_ps();
1559 fjz0 = _mm_setzero_ps();
1560 fjx1 = _mm_setzero_ps();
1561 fjy1 = _mm_setzero_ps();
1562 fjz1 = _mm_setzero_ps();
1563 fjx2 = _mm_setzero_ps();
1564 fjy2 = _mm_setzero_ps();
1565 fjz2 = _mm_setzero_ps();
1566 fjx3 = _mm_setzero_ps();
1567 fjy3 = _mm_setzero_ps();
1568 fjz3 = _mm_setzero_ps();
1570 /**************************
1571 * CALCULATE INTERACTIONS *
1572 **************************/
1574 if (gmx_mm_any_lt(rsq00,rcutoff2))
1577 r00 = _mm_mul_ps(rsq00,rinv00);
1579 /* Calculate table index by multiplying r with table scale and truncate to integer */
1580 rt = _mm_mul_ps(r00,vftabscale);
1581 vfitab = _mm_cvttps_epi32(rt);
1582 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1583 vfitab = _mm_slli_epi32(vfitab,3);
1585 /* CUBIC SPLINE TABLE DISPERSION */
1586 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1587 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1588 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1589 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1590 _MM_TRANSPOSE4_PS(Y,F,G,H);
1591 Heps = _mm_mul_ps(vfeps,H);
1592 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1593 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1594 fvdw6 = _mm_mul_ps(c6_00,FF);
1596 /* CUBIC SPLINE TABLE REPULSION */
1597 vfitab = _mm_add_epi32(vfitab,ifour);
1598 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1599 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1600 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1601 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1602 _MM_TRANSPOSE4_PS(Y,F,G,H);
1603 Heps = _mm_mul_ps(vfeps,H);
1604 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1605 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1606 fvdw12 = _mm_mul_ps(c12_00,FF);
1607 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1609 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1613 fscal = _mm_and_ps(fscal,cutoff_mask);
1615 /* Calculate temporary vectorial force */
1616 tx = _mm_mul_ps(fscal,dx00);
1617 ty = _mm_mul_ps(fscal,dy00);
1618 tz = _mm_mul_ps(fscal,dz00);
1620 /* Update vectorial force */
1621 fix0 = _mm_add_ps(fix0,tx);
1622 fiy0 = _mm_add_ps(fiy0,ty);
1623 fiz0 = _mm_add_ps(fiz0,tz);
1625 fjx0 = _mm_add_ps(fjx0,tx);
1626 fjy0 = _mm_add_ps(fjy0,ty);
1627 fjz0 = _mm_add_ps(fjz0,tz);
1631 /**************************
1632 * CALCULATE INTERACTIONS *
1633 **************************/
1635 if (gmx_mm_any_lt(rsq11,rcutoff2))
1638 /* REACTION-FIELD ELECTROSTATICS */
1639 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1641 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1645 fscal = _mm_and_ps(fscal,cutoff_mask);
1647 /* Calculate temporary vectorial force */
1648 tx = _mm_mul_ps(fscal,dx11);
1649 ty = _mm_mul_ps(fscal,dy11);
1650 tz = _mm_mul_ps(fscal,dz11);
1652 /* Update vectorial force */
1653 fix1 = _mm_add_ps(fix1,tx);
1654 fiy1 = _mm_add_ps(fiy1,ty);
1655 fiz1 = _mm_add_ps(fiz1,tz);
1657 fjx1 = _mm_add_ps(fjx1,tx);
1658 fjy1 = _mm_add_ps(fjy1,ty);
1659 fjz1 = _mm_add_ps(fjz1,tz);
1663 /**************************
1664 * CALCULATE INTERACTIONS *
1665 **************************/
1667 if (gmx_mm_any_lt(rsq12,rcutoff2))
1670 /* REACTION-FIELD ELECTROSTATICS */
1671 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1673 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1677 fscal = _mm_and_ps(fscal,cutoff_mask);
1679 /* Calculate temporary vectorial force */
1680 tx = _mm_mul_ps(fscal,dx12);
1681 ty = _mm_mul_ps(fscal,dy12);
1682 tz = _mm_mul_ps(fscal,dz12);
1684 /* Update vectorial force */
1685 fix1 = _mm_add_ps(fix1,tx);
1686 fiy1 = _mm_add_ps(fiy1,ty);
1687 fiz1 = _mm_add_ps(fiz1,tz);
1689 fjx2 = _mm_add_ps(fjx2,tx);
1690 fjy2 = _mm_add_ps(fjy2,ty);
1691 fjz2 = _mm_add_ps(fjz2,tz);
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 if (gmx_mm_any_lt(rsq13,rcutoff2))
1702 /* REACTION-FIELD ELECTROSTATICS */
1703 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1705 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1709 fscal = _mm_and_ps(fscal,cutoff_mask);
1711 /* Calculate temporary vectorial force */
1712 tx = _mm_mul_ps(fscal,dx13);
1713 ty = _mm_mul_ps(fscal,dy13);
1714 tz = _mm_mul_ps(fscal,dz13);
1716 /* Update vectorial force */
1717 fix1 = _mm_add_ps(fix1,tx);
1718 fiy1 = _mm_add_ps(fiy1,ty);
1719 fiz1 = _mm_add_ps(fiz1,tz);
1721 fjx3 = _mm_add_ps(fjx3,tx);
1722 fjy3 = _mm_add_ps(fjy3,ty);
1723 fjz3 = _mm_add_ps(fjz3,tz);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 if (gmx_mm_any_lt(rsq21,rcutoff2))
1734 /* REACTION-FIELD ELECTROSTATICS */
1735 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1737 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1741 fscal = _mm_and_ps(fscal,cutoff_mask);
1743 /* Calculate temporary vectorial force */
1744 tx = _mm_mul_ps(fscal,dx21);
1745 ty = _mm_mul_ps(fscal,dy21);
1746 tz = _mm_mul_ps(fscal,dz21);
1748 /* Update vectorial force */
1749 fix2 = _mm_add_ps(fix2,tx);
1750 fiy2 = _mm_add_ps(fiy2,ty);
1751 fiz2 = _mm_add_ps(fiz2,tz);
1753 fjx1 = _mm_add_ps(fjx1,tx);
1754 fjy1 = _mm_add_ps(fjy1,ty);
1755 fjz1 = _mm_add_ps(fjz1,tz);
1759 /**************************
1760 * CALCULATE INTERACTIONS *
1761 **************************/
1763 if (gmx_mm_any_lt(rsq22,rcutoff2))
1766 /* REACTION-FIELD ELECTROSTATICS */
1767 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1769 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1773 fscal = _mm_and_ps(fscal,cutoff_mask);
1775 /* Calculate temporary vectorial force */
1776 tx = _mm_mul_ps(fscal,dx22);
1777 ty = _mm_mul_ps(fscal,dy22);
1778 tz = _mm_mul_ps(fscal,dz22);
1780 /* Update vectorial force */
1781 fix2 = _mm_add_ps(fix2,tx);
1782 fiy2 = _mm_add_ps(fiy2,ty);
1783 fiz2 = _mm_add_ps(fiz2,tz);
1785 fjx2 = _mm_add_ps(fjx2,tx);
1786 fjy2 = _mm_add_ps(fjy2,ty);
1787 fjz2 = _mm_add_ps(fjz2,tz);
1791 /**************************
1792 * CALCULATE INTERACTIONS *
1793 **************************/
1795 if (gmx_mm_any_lt(rsq23,rcutoff2))
1798 /* REACTION-FIELD ELECTROSTATICS */
1799 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1801 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1805 fscal = _mm_and_ps(fscal,cutoff_mask);
1807 /* Calculate temporary vectorial force */
1808 tx = _mm_mul_ps(fscal,dx23);
1809 ty = _mm_mul_ps(fscal,dy23);
1810 tz = _mm_mul_ps(fscal,dz23);
1812 /* Update vectorial force */
1813 fix2 = _mm_add_ps(fix2,tx);
1814 fiy2 = _mm_add_ps(fiy2,ty);
1815 fiz2 = _mm_add_ps(fiz2,tz);
1817 fjx3 = _mm_add_ps(fjx3,tx);
1818 fjy3 = _mm_add_ps(fjy3,ty);
1819 fjz3 = _mm_add_ps(fjz3,tz);
1823 /**************************
1824 * CALCULATE INTERACTIONS *
1825 **************************/
1827 if (gmx_mm_any_lt(rsq31,rcutoff2))
1830 /* REACTION-FIELD ELECTROSTATICS */
1831 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1833 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1837 fscal = _mm_and_ps(fscal,cutoff_mask);
1839 /* Calculate temporary vectorial force */
1840 tx = _mm_mul_ps(fscal,dx31);
1841 ty = _mm_mul_ps(fscal,dy31);
1842 tz = _mm_mul_ps(fscal,dz31);
1844 /* Update vectorial force */
1845 fix3 = _mm_add_ps(fix3,tx);
1846 fiy3 = _mm_add_ps(fiy3,ty);
1847 fiz3 = _mm_add_ps(fiz3,tz);
1849 fjx1 = _mm_add_ps(fjx1,tx);
1850 fjy1 = _mm_add_ps(fjy1,ty);
1851 fjz1 = _mm_add_ps(fjz1,tz);
1855 /**************************
1856 * CALCULATE INTERACTIONS *
1857 **************************/
1859 if (gmx_mm_any_lt(rsq32,rcutoff2))
1862 /* REACTION-FIELD ELECTROSTATICS */
1863 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1865 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1869 fscal = _mm_and_ps(fscal,cutoff_mask);
1871 /* Calculate temporary vectorial force */
1872 tx = _mm_mul_ps(fscal,dx32);
1873 ty = _mm_mul_ps(fscal,dy32);
1874 tz = _mm_mul_ps(fscal,dz32);
1876 /* Update vectorial force */
1877 fix3 = _mm_add_ps(fix3,tx);
1878 fiy3 = _mm_add_ps(fiy3,ty);
1879 fiz3 = _mm_add_ps(fiz3,tz);
1881 fjx2 = _mm_add_ps(fjx2,tx);
1882 fjy2 = _mm_add_ps(fjy2,ty);
1883 fjz2 = _mm_add_ps(fjz2,tz);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 if (gmx_mm_any_lt(rsq33,rcutoff2))
1894 /* REACTION-FIELD ELECTROSTATICS */
1895 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1897 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1901 fscal = _mm_and_ps(fscal,cutoff_mask);
1903 /* Calculate temporary vectorial force */
1904 tx = _mm_mul_ps(fscal,dx33);
1905 ty = _mm_mul_ps(fscal,dy33);
1906 tz = _mm_mul_ps(fscal,dz33);
1908 /* Update vectorial force */
1909 fix3 = _mm_add_ps(fix3,tx);
1910 fiy3 = _mm_add_ps(fiy3,ty);
1911 fiz3 = _mm_add_ps(fiz3,tz);
1913 fjx3 = _mm_add_ps(fjx3,tx);
1914 fjy3 = _mm_add_ps(fjy3,ty);
1915 fjz3 = _mm_add_ps(fjz3,tz);
1919 fjptrA = f+j_coord_offsetA;
1920 fjptrB = f+j_coord_offsetB;
1921 fjptrC = f+j_coord_offsetC;
1922 fjptrD = f+j_coord_offsetD;
1924 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1925 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1926 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1928 /* Inner loop uses 324 flops */
1931 if(jidx<j_index_end)
1934 /* Get j neighbor index, and coordinate index */
1935 jnrlistA = jjnr[jidx];
1936 jnrlistB = jjnr[jidx+1];
1937 jnrlistC = jjnr[jidx+2];
1938 jnrlistD = jjnr[jidx+3];
1939 /* Sign of each element will be negative for non-real atoms.
1940 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1941 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1943 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1944 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1945 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1946 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1947 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1948 j_coord_offsetA = DIM*jnrA;
1949 j_coord_offsetB = DIM*jnrB;
1950 j_coord_offsetC = DIM*jnrC;
1951 j_coord_offsetD = DIM*jnrD;
1953 /* load j atom coordinates */
1954 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1955 x+j_coord_offsetC,x+j_coord_offsetD,
1956 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1957 &jy2,&jz2,&jx3,&jy3,&jz3);
1959 /* Calculate displacement vector */
1960 dx00 = _mm_sub_ps(ix0,jx0);
1961 dy00 = _mm_sub_ps(iy0,jy0);
1962 dz00 = _mm_sub_ps(iz0,jz0);
1963 dx11 = _mm_sub_ps(ix1,jx1);
1964 dy11 = _mm_sub_ps(iy1,jy1);
1965 dz11 = _mm_sub_ps(iz1,jz1);
1966 dx12 = _mm_sub_ps(ix1,jx2);
1967 dy12 = _mm_sub_ps(iy1,jy2);
1968 dz12 = _mm_sub_ps(iz1,jz2);
1969 dx13 = _mm_sub_ps(ix1,jx3);
1970 dy13 = _mm_sub_ps(iy1,jy3);
1971 dz13 = _mm_sub_ps(iz1,jz3);
1972 dx21 = _mm_sub_ps(ix2,jx1);
1973 dy21 = _mm_sub_ps(iy2,jy1);
1974 dz21 = _mm_sub_ps(iz2,jz1);
1975 dx22 = _mm_sub_ps(ix2,jx2);
1976 dy22 = _mm_sub_ps(iy2,jy2);
1977 dz22 = _mm_sub_ps(iz2,jz2);
1978 dx23 = _mm_sub_ps(ix2,jx3);
1979 dy23 = _mm_sub_ps(iy2,jy3);
1980 dz23 = _mm_sub_ps(iz2,jz3);
1981 dx31 = _mm_sub_ps(ix3,jx1);
1982 dy31 = _mm_sub_ps(iy3,jy1);
1983 dz31 = _mm_sub_ps(iz3,jz1);
1984 dx32 = _mm_sub_ps(ix3,jx2);
1985 dy32 = _mm_sub_ps(iy3,jy2);
1986 dz32 = _mm_sub_ps(iz3,jz2);
1987 dx33 = _mm_sub_ps(ix3,jx3);
1988 dy33 = _mm_sub_ps(iy3,jy3);
1989 dz33 = _mm_sub_ps(iz3,jz3);
1991 /* Calculate squared distance and things based on it */
1992 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1993 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1994 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1995 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1996 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1997 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1998 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1999 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2000 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2001 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2003 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2004 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2005 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2006 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2007 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2008 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2009 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2010 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2011 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2012 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2014 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2015 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2016 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2017 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2018 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2019 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2020 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2021 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2022 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2024 fjx0 = _mm_setzero_ps();
2025 fjy0 = _mm_setzero_ps();
2026 fjz0 = _mm_setzero_ps();
2027 fjx1 = _mm_setzero_ps();
2028 fjy1 = _mm_setzero_ps();
2029 fjz1 = _mm_setzero_ps();
2030 fjx2 = _mm_setzero_ps();
2031 fjy2 = _mm_setzero_ps();
2032 fjz2 = _mm_setzero_ps();
2033 fjx3 = _mm_setzero_ps();
2034 fjy3 = _mm_setzero_ps();
2035 fjz3 = _mm_setzero_ps();
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 if (gmx_mm_any_lt(rsq00,rcutoff2))
2044 r00 = _mm_mul_ps(rsq00,rinv00);
2045 r00 = _mm_andnot_ps(dummy_mask,r00);
2047 /* Calculate table index by multiplying r with table scale and truncate to integer */
2048 rt = _mm_mul_ps(r00,vftabscale);
2049 vfitab = _mm_cvttps_epi32(rt);
2050 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2051 vfitab = _mm_slli_epi32(vfitab,3);
2053 /* CUBIC SPLINE TABLE DISPERSION */
2054 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2055 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2056 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2057 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2058 _MM_TRANSPOSE4_PS(Y,F,G,H);
2059 Heps = _mm_mul_ps(vfeps,H);
2060 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2061 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2062 fvdw6 = _mm_mul_ps(c6_00,FF);
2064 /* CUBIC SPLINE TABLE REPULSION */
2065 vfitab = _mm_add_epi32(vfitab,ifour);
2066 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2067 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2068 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2069 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2070 _MM_TRANSPOSE4_PS(Y,F,G,H);
2071 Heps = _mm_mul_ps(vfeps,H);
2072 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2073 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2074 fvdw12 = _mm_mul_ps(c12_00,FF);
2075 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2077 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2081 fscal = _mm_and_ps(fscal,cutoff_mask);
2083 fscal = _mm_andnot_ps(dummy_mask,fscal);
2085 /* Calculate temporary vectorial force */
2086 tx = _mm_mul_ps(fscal,dx00);
2087 ty = _mm_mul_ps(fscal,dy00);
2088 tz = _mm_mul_ps(fscal,dz00);
2090 /* Update vectorial force */
2091 fix0 = _mm_add_ps(fix0,tx);
2092 fiy0 = _mm_add_ps(fiy0,ty);
2093 fiz0 = _mm_add_ps(fiz0,tz);
2095 fjx0 = _mm_add_ps(fjx0,tx);
2096 fjy0 = _mm_add_ps(fjy0,ty);
2097 fjz0 = _mm_add_ps(fjz0,tz);
2101 /**************************
2102 * CALCULATE INTERACTIONS *
2103 **************************/
2105 if (gmx_mm_any_lt(rsq11,rcutoff2))
2108 /* REACTION-FIELD ELECTROSTATICS */
2109 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2111 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2115 fscal = _mm_and_ps(fscal,cutoff_mask);
2117 fscal = _mm_andnot_ps(dummy_mask,fscal);
2119 /* Calculate temporary vectorial force */
2120 tx = _mm_mul_ps(fscal,dx11);
2121 ty = _mm_mul_ps(fscal,dy11);
2122 tz = _mm_mul_ps(fscal,dz11);
2124 /* Update vectorial force */
2125 fix1 = _mm_add_ps(fix1,tx);
2126 fiy1 = _mm_add_ps(fiy1,ty);
2127 fiz1 = _mm_add_ps(fiz1,tz);
2129 fjx1 = _mm_add_ps(fjx1,tx);
2130 fjy1 = _mm_add_ps(fjy1,ty);
2131 fjz1 = _mm_add_ps(fjz1,tz);
2135 /**************************
2136 * CALCULATE INTERACTIONS *
2137 **************************/
2139 if (gmx_mm_any_lt(rsq12,rcutoff2))
2142 /* REACTION-FIELD ELECTROSTATICS */
2143 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2145 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2149 fscal = _mm_and_ps(fscal,cutoff_mask);
2151 fscal = _mm_andnot_ps(dummy_mask,fscal);
2153 /* Calculate temporary vectorial force */
2154 tx = _mm_mul_ps(fscal,dx12);
2155 ty = _mm_mul_ps(fscal,dy12);
2156 tz = _mm_mul_ps(fscal,dz12);
2158 /* Update vectorial force */
2159 fix1 = _mm_add_ps(fix1,tx);
2160 fiy1 = _mm_add_ps(fiy1,ty);
2161 fiz1 = _mm_add_ps(fiz1,tz);
2163 fjx2 = _mm_add_ps(fjx2,tx);
2164 fjy2 = _mm_add_ps(fjy2,ty);
2165 fjz2 = _mm_add_ps(fjz2,tz);
2169 /**************************
2170 * CALCULATE INTERACTIONS *
2171 **************************/
2173 if (gmx_mm_any_lt(rsq13,rcutoff2))
2176 /* REACTION-FIELD ELECTROSTATICS */
2177 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
2179 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2183 fscal = _mm_and_ps(fscal,cutoff_mask);
2185 fscal = _mm_andnot_ps(dummy_mask,fscal);
2187 /* Calculate temporary vectorial force */
2188 tx = _mm_mul_ps(fscal,dx13);
2189 ty = _mm_mul_ps(fscal,dy13);
2190 tz = _mm_mul_ps(fscal,dz13);
2192 /* Update vectorial force */
2193 fix1 = _mm_add_ps(fix1,tx);
2194 fiy1 = _mm_add_ps(fiy1,ty);
2195 fiz1 = _mm_add_ps(fiz1,tz);
2197 fjx3 = _mm_add_ps(fjx3,tx);
2198 fjy3 = _mm_add_ps(fjy3,ty);
2199 fjz3 = _mm_add_ps(fjz3,tz);
2203 /**************************
2204 * CALCULATE INTERACTIONS *
2205 **************************/
2207 if (gmx_mm_any_lt(rsq21,rcutoff2))
2210 /* REACTION-FIELD ELECTROSTATICS */
2211 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2213 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2217 fscal = _mm_and_ps(fscal,cutoff_mask);
2219 fscal = _mm_andnot_ps(dummy_mask,fscal);
2221 /* Calculate temporary vectorial force */
2222 tx = _mm_mul_ps(fscal,dx21);
2223 ty = _mm_mul_ps(fscal,dy21);
2224 tz = _mm_mul_ps(fscal,dz21);
2226 /* Update vectorial force */
2227 fix2 = _mm_add_ps(fix2,tx);
2228 fiy2 = _mm_add_ps(fiy2,ty);
2229 fiz2 = _mm_add_ps(fiz2,tz);
2231 fjx1 = _mm_add_ps(fjx1,tx);
2232 fjy1 = _mm_add_ps(fjy1,ty);
2233 fjz1 = _mm_add_ps(fjz1,tz);
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2241 if (gmx_mm_any_lt(rsq22,rcutoff2))
2244 /* REACTION-FIELD ELECTROSTATICS */
2245 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2247 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2251 fscal = _mm_and_ps(fscal,cutoff_mask);
2253 fscal = _mm_andnot_ps(dummy_mask,fscal);
2255 /* Calculate temporary vectorial force */
2256 tx = _mm_mul_ps(fscal,dx22);
2257 ty = _mm_mul_ps(fscal,dy22);
2258 tz = _mm_mul_ps(fscal,dz22);
2260 /* Update vectorial force */
2261 fix2 = _mm_add_ps(fix2,tx);
2262 fiy2 = _mm_add_ps(fiy2,ty);
2263 fiz2 = _mm_add_ps(fiz2,tz);
2265 fjx2 = _mm_add_ps(fjx2,tx);
2266 fjy2 = _mm_add_ps(fjy2,ty);
2267 fjz2 = _mm_add_ps(fjz2,tz);
2271 /**************************
2272 * CALCULATE INTERACTIONS *
2273 **************************/
2275 if (gmx_mm_any_lt(rsq23,rcutoff2))
2278 /* REACTION-FIELD ELECTROSTATICS */
2279 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
2281 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2285 fscal = _mm_and_ps(fscal,cutoff_mask);
2287 fscal = _mm_andnot_ps(dummy_mask,fscal);
2289 /* Calculate temporary vectorial force */
2290 tx = _mm_mul_ps(fscal,dx23);
2291 ty = _mm_mul_ps(fscal,dy23);
2292 tz = _mm_mul_ps(fscal,dz23);
2294 /* Update vectorial force */
2295 fix2 = _mm_add_ps(fix2,tx);
2296 fiy2 = _mm_add_ps(fiy2,ty);
2297 fiz2 = _mm_add_ps(fiz2,tz);
2299 fjx3 = _mm_add_ps(fjx3,tx);
2300 fjy3 = _mm_add_ps(fjy3,ty);
2301 fjz3 = _mm_add_ps(fjz3,tz);
2305 /**************************
2306 * CALCULATE INTERACTIONS *
2307 **************************/
2309 if (gmx_mm_any_lt(rsq31,rcutoff2))
2312 /* REACTION-FIELD ELECTROSTATICS */
2313 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
2315 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2319 fscal = _mm_and_ps(fscal,cutoff_mask);
2321 fscal = _mm_andnot_ps(dummy_mask,fscal);
2323 /* Calculate temporary vectorial force */
2324 tx = _mm_mul_ps(fscal,dx31);
2325 ty = _mm_mul_ps(fscal,dy31);
2326 tz = _mm_mul_ps(fscal,dz31);
2328 /* Update vectorial force */
2329 fix3 = _mm_add_ps(fix3,tx);
2330 fiy3 = _mm_add_ps(fiy3,ty);
2331 fiz3 = _mm_add_ps(fiz3,tz);
2333 fjx1 = _mm_add_ps(fjx1,tx);
2334 fjy1 = _mm_add_ps(fjy1,ty);
2335 fjz1 = _mm_add_ps(fjz1,tz);
2339 /**************************
2340 * CALCULATE INTERACTIONS *
2341 **************************/
2343 if (gmx_mm_any_lt(rsq32,rcutoff2))
2346 /* REACTION-FIELD ELECTROSTATICS */
2347 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
2349 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2353 fscal = _mm_and_ps(fscal,cutoff_mask);
2355 fscal = _mm_andnot_ps(dummy_mask,fscal);
2357 /* Calculate temporary vectorial force */
2358 tx = _mm_mul_ps(fscal,dx32);
2359 ty = _mm_mul_ps(fscal,dy32);
2360 tz = _mm_mul_ps(fscal,dz32);
2362 /* Update vectorial force */
2363 fix3 = _mm_add_ps(fix3,tx);
2364 fiy3 = _mm_add_ps(fiy3,ty);
2365 fiz3 = _mm_add_ps(fiz3,tz);
2367 fjx2 = _mm_add_ps(fjx2,tx);
2368 fjy2 = _mm_add_ps(fjy2,ty);
2369 fjz2 = _mm_add_ps(fjz2,tz);
2373 /**************************
2374 * CALCULATE INTERACTIONS *
2375 **************************/
2377 if (gmx_mm_any_lt(rsq33,rcutoff2))
2380 /* REACTION-FIELD ELECTROSTATICS */
2381 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2383 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2387 fscal = _mm_and_ps(fscal,cutoff_mask);
2389 fscal = _mm_andnot_ps(dummy_mask,fscal);
2391 /* Calculate temporary vectorial force */
2392 tx = _mm_mul_ps(fscal,dx33);
2393 ty = _mm_mul_ps(fscal,dy33);
2394 tz = _mm_mul_ps(fscal,dz33);
2396 /* Update vectorial force */
2397 fix3 = _mm_add_ps(fix3,tx);
2398 fiy3 = _mm_add_ps(fiy3,ty);
2399 fiz3 = _mm_add_ps(fiz3,tz);
2401 fjx3 = _mm_add_ps(fjx3,tx);
2402 fjy3 = _mm_add_ps(fjy3,ty);
2403 fjz3 = _mm_add_ps(fjz3,tz);
2407 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2408 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2409 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2410 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2412 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2413 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2414 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2416 /* Inner loop uses 325 flops */
2419 /* End of innermost loop */
2421 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2422 f+i_coord_offset,fshift+i_shift_offset);
2424 /* Increment number of inner iterations */
2425 inneriter += j_index_end - j_index_start;
2427 /* Outer loop uses 24 flops */
2430 /* Increment number of outer iterations */
2433 /* Update outer/inner flops */
2435 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*325);