2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
99 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
104 __m128 dummy_mask,cutoff_mask;
105 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
106 __m128 one = _mm_set1_ps(1.0);
107 __m128 two = _mm_set1_ps(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_ps(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 vftab = kernel_data->table_elec->data;
126 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
128 /* Setup water-specific parameters */
129 inr = nlist->iinr[0];
130 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
131 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
132 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
133 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
135 jq1 = _mm_set1_ps(charge[inr+1]);
136 jq2 = _mm_set1_ps(charge[inr+2]);
137 jq3 = _mm_set1_ps(charge[inr+3]);
138 vdwjidx0A = 2*vdwtype[inr+0];
139 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
140 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
141 qq11 = _mm_mul_ps(iq1,jq1);
142 qq12 = _mm_mul_ps(iq1,jq2);
143 qq13 = _mm_mul_ps(iq1,jq3);
144 qq21 = _mm_mul_ps(iq2,jq1);
145 qq22 = _mm_mul_ps(iq2,jq2);
146 qq23 = _mm_mul_ps(iq2,jq3);
147 qq31 = _mm_mul_ps(iq3,jq1);
148 qq32 = _mm_mul_ps(iq3,jq2);
149 qq33 = _mm_mul_ps(iq3,jq3);
151 /* Avoid stupid compiler warnings */
152 jnrA = jnrB = jnrC = jnrD = 0;
161 /* Start outer loop over neighborlists */
162 for(iidx=0; iidx<nri; iidx++)
164 /* Load shift vector for this list */
165 i_shift_offset = DIM*shiftidx[iidx];
166 shX = shiftvec[i_shift_offset+XX];
167 shY = shiftvec[i_shift_offset+YY];
168 shZ = shiftvec[i_shift_offset+ZZ];
170 /* Load limits for loop over neighbors */
171 j_index_start = jindex[iidx];
172 j_index_end = jindex[iidx+1];
174 /* Get outer coordinate index */
176 i_coord_offset = DIM*inr;
178 /* Load i particle coords and add shift vector */
179 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
180 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
181 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
182 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
183 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
184 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
185 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
186 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
187 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
188 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
189 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
190 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
192 fix0 = _mm_setzero_ps();
193 fiy0 = _mm_setzero_ps();
194 fiz0 = _mm_setzero_ps();
195 fix1 = _mm_setzero_ps();
196 fiy1 = _mm_setzero_ps();
197 fiz1 = _mm_setzero_ps();
198 fix2 = _mm_setzero_ps();
199 fiy2 = _mm_setzero_ps();
200 fiz2 = _mm_setzero_ps();
201 fix3 = _mm_setzero_ps();
202 fiy3 = _mm_setzero_ps();
203 fiz3 = _mm_setzero_ps();
205 /* Reset potential sums */
206 velecsum = _mm_setzero_ps();
207 vvdwsum = _mm_setzero_ps();
209 /* Start inner kernel loop */
210 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
213 /* Get j neighbor index, and coordinate index */
219 j_coord_offsetA = DIM*jnrA;
220 j_coord_offsetB = DIM*jnrB;
221 j_coord_offsetC = DIM*jnrC;
222 j_coord_offsetD = DIM*jnrD;
224 /* load j atom coordinates */
225 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
226 x+j_coord_offsetC,x+j_coord_offsetD,
227 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
228 &jy2,&jz2,&jx3,&jy3,&jz3);
230 /* Calculate displacement vector */
231 dx00 = _mm_sub_ps(ix0,jx0);
232 dy00 = _mm_sub_ps(iy0,jy0);
233 dz00 = _mm_sub_ps(iz0,jz0);
234 dx11 = _mm_sub_ps(ix1,jx1);
235 dy11 = _mm_sub_ps(iy1,jy1);
236 dz11 = _mm_sub_ps(iz1,jz1);
237 dx12 = _mm_sub_ps(ix1,jx2);
238 dy12 = _mm_sub_ps(iy1,jy2);
239 dz12 = _mm_sub_ps(iz1,jz2);
240 dx13 = _mm_sub_ps(ix1,jx3);
241 dy13 = _mm_sub_ps(iy1,jy3);
242 dz13 = _mm_sub_ps(iz1,jz3);
243 dx21 = _mm_sub_ps(ix2,jx1);
244 dy21 = _mm_sub_ps(iy2,jy1);
245 dz21 = _mm_sub_ps(iz2,jz1);
246 dx22 = _mm_sub_ps(ix2,jx2);
247 dy22 = _mm_sub_ps(iy2,jy2);
248 dz22 = _mm_sub_ps(iz2,jz2);
249 dx23 = _mm_sub_ps(ix2,jx3);
250 dy23 = _mm_sub_ps(iy2,jy3);
251 dz23 = _mm_sub_ps(iz2,jz3);
252 dx31 = _mm_sub_ps(ix3,jx1);
253 dy31 = _mm_sub_ps(iy3,jy1);
254 dz31 = _mm_sub_ps(iz3,jz1);
255 dx32 = _mm_sub_ps(ix3,jx2);
256 dy32 = _mm_sub_ps(iy3,jy2);
257 dz32 = _mm_sub_ps(iz3,jz2);
258 dx33 = _mm_sub_ps(ix3,jx3);
259 dy33 = _mm_sub_ps(iy3,jy3);
260 dz33 = _mm_sub_ps(iz3,jz3);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
264 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
265 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
266 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
267 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
268 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
269 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
270 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
271 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
272 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
274 rinv11 = gmx_mm_invsqrt_ps(rsq11);
275 rinv12 = gmx_mm_invsqrt_ps(rsq12);
276 rinv13 = gmx_mm_invsqrt_ps(rsq13);
277 rinv21 = gmx_mm_invsqrt_ps(rsq21);
278 rinv22 = gmx_mm_invsqrt_ps(rsq22);
279 rinv23 = gmx_mm_invsqrt_ps(rsq23);
280 rinv31 = gmx_mm_invsqrt_ps(rsq31);
281 rinv32 = gmx_mm_invsqrt_ps(rsq32);
282 rinv33 = gmx_mm_invsqrt_ps(rsq33);
284 rinvsq00 = gmx_mm_inv_ps(rsq00);
286 fjx0 = _mm_setzero_ps();
287 fjy0 = _mm_setzero_ps();
288 fjz0 = _mm_setzero_ps();
289 fjx1 = _mm_setzero_ps();
290 fjy1 = _mm_setzero_ps();
291 fjz1 = _mm_setzero_ps();
292 fjx2 = _mm_setzero_ps();
293 fjy2 = _mm_setzero_ps();
294 fjz2 = _mm_setzero_ps();
295 fjx3 = _mm_setzero_ps();
296 fjy3 = _mm_setzero_ps();
297 fjz3 = _mm_setzero_ps();
299 /**************************
300 * CALCULATE INTERACTIONS *
301 **************************/
303 /* LENNARD-JONES DISPERSION/REPULSION */
305 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
306 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
307 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
308 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
309 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
311 /* Update potential sum for this i atom from the interaction with this j atom. */
312 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
316 /* Calculate temporary vectorial force */
317 tx = _mm_mul_ps(fscal,dx00);
318 ty = _mm_mul_ps(fscal,dy00);
319 tz = _mm_mul_ps(fscal,dz00);
321 /* Update vectorial force */
322 fix0 = _mm_add_ps(fix0,tx);
323 fiy0 = _mm_add_ps(fiy0,ty);
324 fiz0 = _mm_add_ps(fiz0,tz);
326 fjx0 = _mm_add_ps(fjx0,tx);
327 fjy0 = _mm_add_ps(fjy0,ty);
328 fjz0 = _mm_add_ps(fjz0,tz);
330 /**************************
331 * CALCULATE INTERACTIONS *
332 **************************/
334 r11 = _mm_mul_ps(rsq11,rinv11);
336 /* Calculate table index by multiplying r with table scale and truncate to integer */
337 rt = _mm_mul_ps(r11,vftabscale);
338 vfitab = _mm_cvttps_epi32(rt);
339 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
340 vfitab = _mm_slli_epi32(vfitab,2);
342 /* CUBIC SPLINE TABLE ELECTROSTATICS */
343 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
344 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
345 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
346 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
347 _MM_TRANSPOSE4_PS(Y,F,G,H);
348 Heps = _mm_mul_ps(vfeps,H);
349 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
350 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
351 velec = _mm_mul_ps(qq11,VV);
352 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
353 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
355 /* Update potential sum for this i atom from the interaction with this j atom. */
356 velecsum = _mm_add_ps(velecsum,velec);
360 /* Calculate temporary vectorial force */
361 tx = _mm_mul_ps(fscal,dx11);
362 ty = _mm_mul_ps(fscal,dy11);
363 tz = _mm_mul_ps(fscal,dz11);
365 /* Update vectorial force */
366 fix1 = _mm_add_ps(fix1,tx);
367 fiy1 = _mm_add_ps(fiy1,ty);
368 fiz1 = _mm_add_ps(fiz1,tz);
370 fjx1 = _mm_add_ps(fjx1,tx);
371 fjy1 = _mm_add_ps(fjy1,ty);
372 fjz1 = _mm_add_ps(fjz1,tz);
374 /**************************
375 * CALCULATE INTERACTIONS *
376 **************************/
378 r12 = _mm_mul_ps(rsq12,rinv12);
380 /* Calculate table index by multiplying r with table scale and truncate to integer */
381 rt = _mm_mul_ps(r12,vftabscale);
382 vfitab = _mm_cvttps_epi32(rt);
383 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
384 vfitab = _mm_slli_epi32(vfitab,2);
386 /* CUBIC SPLINE TABLE ELECTROSTATICS */
387 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
388 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
389 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
390 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
391 _MM_TRANSPOSE4_PS(Y,F,G,H);
392 Heps = _mm_mul_ps(vfeps,H);
393 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
394 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
395 velec = _mm_mul_ps(qq12,VV);
396 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
397 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm_add_ps(velecsum,velec);
404 /* Calculate temporary vectorial force */
405 tx = _mm_mul_ps(fscal,dx12);
406 ty = _mm_mul_ps(fscal,dy12);
407 tz = _mm_mul_ps(fscal,dz12);
409 /* Update vectorial force */
410 fix1 = _mm_add_ps(fix1,tx);
411 fiy1 = _mm_add_ps(fiy1,ty);
412 fiz1 = _mm_add_ps(fiz1,tz);
414 fjx2 = _mm_add_ps(fjx2,tx);
415 fjy2 = _mm_add_ps(fjy2,ty);
416 fjz2 = _mm_add_ps(fjz2,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r13 = _mm_mul_ps(rsq13,rinv13);
424 /* Calculate table index by multiplying r with table scale and truncate to integer */
425 rt = _mm_mul_ps(r13,vftabscale);
426 vfitab = _mm_cvttps_epi32(rt);
427 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
428 vfitab = _mm_slli_epi32(vfitab,2);
430 /* CUBIC SPLINE TABLE ELECTROSTATICS */
431 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
432 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
433 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
434 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
435 _MM_TRANSPOSE4_PS(Y,F,G,H);
436 Heps = _mm_mul_ps(vfeps,H);
437 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
438 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
439 velec = _mm_mul_ps(qq13,VV);
440 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
441 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velecsum = _mm_add_ps(velecsum,velec);
448 /* Calculate temporary vectorial force */
449 tx = _mm_mul_ps(fscal,dx13);
450 ty = _mm_mul_ps(fscal,dy13);
451 tz = _mm_mul_ps(fscal,dz13);
453 /* Update vectorial force */
454 fix1 = _mm_add_ps(fix1,tx);
455 fiy1 = _mm_add_ps(fiy1,ty);
456 fiz1 = _mm_add_ps(fiz1,tz);
458 fjx3 = _mm_add_ps(fjx3,tx);
459 fjy3 = _mm_add_ps(fjy3,ty);
460 fjz3 = _mm_add_ps(fjz3,tz);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 r21 = _mm_mul_ps(rsq21,rinv21);
468 /* Calculate table index by multiplying r with table scale and truncate to integer */
469 rt = _mm_mul_ps(r21,vftabscale);
470 vfitab = _mm_cvttps_epi32(rt);
471 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
472 vfitab = _mm_slli_epi32(vfitab,2);
474 /* CUBIC SPLINE TABLE ELECTROSTATICS */
475 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
476 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
477 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
478 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
479 _MM_TRANSPOSE4_PS(Y,F,G,H);
480 Heps = _mm_mul_ps(vfeps,H);
481 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
482 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
483 velec = _mm_mul_ps(qq21,VV);
484 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
485 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velecsum = _mm_add_ps(velecsum,velec);
492 /* Calculate temporary vectorial force */
493 tx = _mm_mul_ps(fscal,dx21);
494 ty = _mm_mul_ps(fscal,dy21);
495 tz = _mm_mul_ps(fscal,dz21);
497 /* Update vectorial force */
498 fix2 = _mm_add_ps(fix2,tx);
499 fiy2 = _mm_add_ps(fiy2,ty);
500 fiz2 = _mm_add_ps(fiz2,tz);
502 fjx1 = _mm_add_ps(fjx1,tx);
503 fjy1 = _mm_add_ps(fjy1,ty);
504 fjz1 = _mm_add_ps(fjz1,tz);
506 /**************************
507 * CALCULATE INTERACTIONS *
508 **************************/
510 r22 = _mm_mul_ps(rsq22,rinv22);
512 /* Calculate table index by multiplying r with table scale and truncate to integer */
513 rt = _mm_mul_ps(r22,vftabscale);
514 vfitab = _mm_cvttps_epi32(rt);
515 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
516 vfitab = _mm_slli_epi32(vfitab,2);
518 /* CUBIC SPLINE TABLE ELECTROSTATICS */
519 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
520 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
521 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
522 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
523 _MM_TRANSPOSE4_PS(Y,F,G,H);
524 Heps = _mm_mul_ps(vfeps,H);
525 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
526 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
527 velec = _mm_mul_ps(qq22,VV);
528 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
529 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
531 /* Update potential sum for this i atom from the interaction with this j atom. */
532 velecsum = _mm_add_ps(velecsum,velec);
536 /* Calculate temporary vectorial force */
537 tx = _mm_mul_ps(fscal,dx22);
538 ty = _mm_mul_ps(fscal,dy22);
539 tz = _mm_mul_ps(fscal,dz22);
541 /* Update vectorial force */
542 fix2 = _mm_add_ps(fix2,tx);
543 fiy2 = _mm_add_ps(fiy2,ty);
544 fiz2 = _mm_add_ps(fiz2,tz);
546 fjx2 = _mm_add_ps(fjx2,tx);
547 fjy2 = _mm_add_ps(fjy2,ty);
548 fjz2 = _mm_add_ps(fjz2,tz);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 r23 = _mm_mul_ps(rsq23,rinv23);
556 /* Calculate table index by multiplying r with table scale and truncate to integer */
557 rt = _mm_mul_ps(r23,vftabscale);
558 vfitab = _mm_cvttps_epi32(rt);
559 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
560 vfitab = _mm_slli_epi32(vfitab,2);
562 /* CUBIC SPLINE TABLE ELECTROSTATICS */
563 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
564 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
565 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
566 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
567 _MM_TRANSPOSE4_PS(Y,F,G,H);
568 Heps = _mm_mul_ps(vfeps,H);
569 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
570 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
571 velec = _mm_mul_ps(qq23,VV);
572 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
573 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velecsum = _mm_add_ps(velecsum,velec);
580 /* Calculate temporary vectorial force */
581 tx = _mm_mul_ps(fscal,dx23);
582 ty = _mm_mul_ps(fscal,dy23);
583 tz = _mm_mul_ps(fscal,dz23);
585 /* Update vectorial force */
586 fix2 = _mm_add_ps(fix2,tx);
587 fiy2 = _mm_add_ps(fiy2,ty);
588 fiz2 = _mm_add_ps(fiz2,tz);
590 fjx3 = _mm_add_ps(fjx3,tx);
591 fjy3 = _mm_add_ps(fjy3,ty);
592 fjz3 = _mm_add_ps(fjz3,tz);
594 /**************************
595 * CALCULATE INTERACTIONS *
596 **************************/
598 r31 = _mm_mul_ps(rsq31,rinv31);
600 /* Calculate table index by multiplying r with table scale and truncate to integer */
601 rt = _mm_mul_ps(r31,vftabscale);
602 vfitab = _mm_cvttps_epi32(rt);
603 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
604 vfitab = _mm_slli_epi32(vfitab,2);
606 /* CUBIC SPLINE TABLE ELECTROSTATICS */
607 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
608 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
609 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
610 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
611 _MM_TRANSPOSE4_PS(Y,F,G,H);
612 Heps = _mm_mul_ps(vfeps,H);
613 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
614 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
615 velec = _mm_mul_ps(qq31,VV);
616 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
617 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
619 /* Update potential sum for this i atom from the interaction with this j atom. */
620 velecsum = _mm_add_ps(velecsum,velec);
624 /* Calculate temporary vectorial force */
625 tx = _mm_mul_ps(fscal,dx31);
626 ty = _mm_mul_ps(fscal,dy31);
627 tz = _mm_mul_ps(fscal,dz31);
629 /* Update vectorial force */
630 fix3 = _mm_add_ps(fix3,tx);
631 fiy3 = _mm_add_ps(fiy3,ty);
632 fiz3 = _mm_add_ps(fiz3,tz);
634 fjx1 = _mm_add_ps(fjx1,tx);
635 fjy1 = _mm_add_ps(fjy1,ty);
636 fjz1 = _mm_add_ps(fjz1,tz);
638 /**************************
639 * CALCULATE INTERACTIONS *
640 **************************/
642 r32 = _mm_mul_ps(rsq32,rinv32);
644 /* Calculate table index by multiplying r with table scale and truncate to integer */
645 rt = _mm_mul_ps(r32,vftabscale);
646 vfitab = _mm_cvttps_epi32(rt);
647 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
648 vfitab = _mm_slli_epi32(vfitab,2);
650 /* CUBIC SPLINE TABLE ELECTROSTATICS */
651 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
652 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
653 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
654 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
655 _MM_TRANSPOSE4_PS(Y,F,G,H);
656 Heps = _mm_mul_ps(vfeps,H);
657 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
658 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
659 velec = _mm_mul_ps(qq32,VV);
660 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
661 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
663 /* Update potential sum for this i atom from the interaction with this j atom. */
664 velecsum = _mm_add_ps(velecsum,velec);
668 /* Calculate temporary vectorial force */
669 tx = _mm_mul_ps(fscal,dx32);
670 ty = _mm_mul_ps(fscal,dy32);
671 tz = _mm_mul_ps(fscal,dz32);
673 /* Update vectorial force */
674 fix3 = _mm_add_ps(fix3,tx);
675 fiy3 = _mm_add_ps(fiy3,ty);
676 fiz3 = _mm_add_ps(fiz3,tz);
678 fjx2 = _mm_add_ps(fjx2,tx);
679 fjy2 = _mm_add_ps(fjy2,ty);
680 fjz2 = _mm_add_ps(fjz2,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 r33 = _mm_mul_ps(rsq33,rinv33);
688 /* Calculate table index by multiplying r with table scale and truncate to integer */
689 rt = _mm_mul_ps(r33,vftabscale);
690 vfitab = _mm_cvttps_epi32(rt);
691 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
692 vfitab = _mm_slli_epi32(vfitab,2);
694 /* CUBIC SPLINE TABLE ELECTROSTATICS */
695 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
696 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
697 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
698 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
699 _MM_TRANSPOSE4_PS(Y,F,G,H);
700 Heps = _mm_mul_ps(vfeps,H);
701 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
702 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
703 velec = _mm_mul_ps(qq33,VV);
704 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
705 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
707 /* Update potential sum for this i atom from the interaction with this j atom. */
708 velecsum = _mm_add_ps(velecsum,velec);
712 /* Calculate temporary vectorial force */
713 tx = _mm_mul_ps(fscal,dx33);
714 ty = _mm_mul_ps(fscal,dy33);
715 tz = _mm_mul_ps(fscal,dz33);
717 /* Update vectorial force */
718 fix3 = _mm_add_ps(fix3,tx);
719 fiy3 = _mm_add_ps(fiy3,ty);
720 fiz3 = _mm_add_ps(fiz3,tz);
722 fjx3 = _mm_add_ps(fjx3,tx);
723 fjy3 = _mm_add_ps(fjy3,ty);
724 fjz3 = _mm_add_ps(fjz3,tz);
726 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
727 f+j_coord_offsetC,f+j_coord_offsetD,
728 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
729 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
731 /* Inner loop uses 422 flops */
737 /* Get j neighbor index, and coordinate index */
743 /* Sign of each element will be negative for non-real atoms.
744 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
745 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
747 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
748 jnrA = (jnrA>=0) ? jnrA : 0;
749 jnrB = (jnrB>=0) ? jnrB : 0;
750 jnrC = (jnrC>=0) ? jnrC : 0;
751 jnrD = (jnrD>=0) ? jnrD : 0;
753 j_coord_offsetA = DIM*jnrA;
754 j_coord_offsetB = DIM*jnrB;
755 j_coord_offsetC = DIM*jnrC;
756 j_coord_offsetD = DIM*jnrD;
758 /* load j atom coordinates */
759 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
760 x+j_coord_offsetC,x+j_coord_offsetD,
761 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
762 &jy2,&jz2,&jx3,&jy3,&jz3);
764 /* Calculate displacement vector */
765 dx00 = _mm_sub_ps(ix0,jx0);
766 dy00 = _mm_sub_ps(iy0,jy0);
767 dz00 = _mm_sub_ps(iz0,jz0);
768 dx11 = _mm_sub_ps(ix1,jx1);
769 dy11 = _mm_sub_ps(iy1,jy1);
770 dz11 = _mm_sub_ps(iz1,jz1);
771 dx12 = _mm_sub_ps(ix1,jx2);
772 dy12 = _mm_sub_ps(iy1,jy2);
773 dz12 = _mm_sub_ps(iz1,jz2);
774 dx13 = _mm_sub_ps(ix1,jx3);
775 dy13 = _mm_sub_ps(iy1,jy3);
776 dz13 = _mm_sub_ps(iz1,jz3);
777 dx21 = _mm_sub_ps(ix2,jx1);
778 dy21 = _mm_sub_ps(iy2,jy1);
779 dz21 = _mm_sub_ps(iz2,jz1);
780 dx22 = _mm_sub_ps(ix2,jx2);
781 dy22 = _mm_sub_ps(iy2,jy2);
782 dz22 = _mm_sub_ps(iz2,jz2);
783 dx23 = _mm_sub_ps(ix2,jx3);
784 dy23 = _mm_sub_ps(iy2,jy3);
785 dz23 = _mm_sub_ps(iz2,jz3);
786 dx31 = _mm_sub_ps(ix3,jx1);
787 dy31 = _mm_sub_ps(iy3,jy1);
788 dz31 = _mm_sub_ps(iz3,jz1);
789 dx32 = _mm_sub_ps(ix3,jx2);
790 dy32 = _mm_sub_ps(iy3,jy2);
791 dz32 = _mm_sub_ps(iz3,jz2);
792 dx33 = _mm_sub_ps(ix3,jx3);
793 dy33 = _mm_sub_ps(iy3,jy3);
794 dz33 = _mm_sub_ps(iz3,jz3);
796 /* Calculate squared distance and things based on it */
797 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
798 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
799 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
800 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
801 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
802 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
803 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
804 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
805 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
806 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
808 rinv11 = gmx_mm_invsqrt_ps(rsq11);
809 rinv12 = gmx_mm_invsqrt_ps(rsq12);
810 rinv13 = gmx_mm_invsqrt_ps(rsq13);
811 rinv21 = gmx_mm_invsqrt_ps(rsq21);
812 rinv22 = gmx_mm_invsqrt_ps(rsq22);
813 rinv23 = gmx_mm_invsqrt_ps(rsq23);
814 rinv31 = gmx_mm_invsqrt_ps(rsq31);
815 rinv32 = gmx_mm_invsqrt_ps(rsq32);
816 rinv33 = gmx_mm_invsqrt_ps(rsq33);
818 rinvsq00 = gmx_mm_inv_ps(rsq00);
820 fjx0 = _mm_setzero_ps();
821 fjy0 = _mm_setzero_ps();
822 fjz0 = _mm_setzero_ps();
823 fjx1 = _mm_setzero_ps();
824 fjy1 = _mm_setzero_ps();
825 fjz1 = _mm_setzero_ps();
826 fjx2 = _mm_setzero_ps();
827 fjy2 = _mm_setzero_ps();
828 fjz2 = _mm_setzero_ps();
829 fjx3 = _mm_setzero_ps();
830 fjy3 = _mm_setzero_ps();
831 fjz3 = _mm_setzero_ps();
833 /**************************
834 * CALCULATE INTERACTIONS *
835 **************************/
837 /* LENNARD-JONES DISPERSION/REPULSION */
839 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
840 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
841 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
842 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
843 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
845 /* Update potential sum for this i atom from the interaction with this j atom. */
846 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
847 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
851 fscal = _mm_andnot_ps(dummy_mask,fscal);
853 /* Calculate temporary vectorial force */
854 tx = _mm_mul_ps(fscal,dx00);
855 ty = _mm_mul_ps(fscal,dy00);
856 tz = _mm_mul_ps(fscal,dz00);
858 /* Update vectorial force */
859 fix0 = _mm_add_ps(fix0,tx);
860 fiy0 = _mm_add_ps(fiy0,ty);
861 fiz0 = _mm_add_ps(fiz0,tz);
863 fjx0 = _mm_add_ps(fjx0,tx);
864 fjy0 = _mm_add_ps(fjy0,ty);
865 fjz0 = _mm_add_ps(fjz0,tz);
867 /**************************
868 * CALCULATE INTERACTIONS *
869 **************************/
871 r11 = _mm_mul_ps(rsq11,rinv11);
872 r11 = _mm_andnot_ps(dummy_mask,r11);
874 /* Calculate table index by multiplying r with table scale and truncate to integer */
875 rt = _mm_mul_ps(r11,vftabscale);
876 vfitab = _mm_cvttps_epi32(rt);
877 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
878 vfitab = _mm_slli_epi32(vfitab,2);
880 /* CUBIC SPLINE TABLE ELECTROSTATICS */
881 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
882 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
883 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
884 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
885 _MM_TRANSPOSE4_PS(Y,F,G,H);
886 Heps = _mm_mul_ps(vfeps,H);
887 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
888 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
889 velec = _mm_mul_ps(qq11,VV);
890 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
891 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
893 /* Update potential sum for this i atom from the interaction with this j atom. */
894 velec = _mm_andnot_ps(dummy_mask,velec);
895 velecsum = _mm_add_ps(velecsum,velec);
899 fscal = _mm_andnot_ps(dummy_mask,fscal);
901 /* Calculate temporary vectorial force */
902 tx = _mm_mul_ps(fscal,dx11);
903 ty = _mm_mul_ps(fscal,dy11);
904 tz = _mm_mul_ps(fscal,dz11);
906 /* Update vectorial force */
907 fix1 = _mm_add_ps(fix1,tx);
908 fiy1 = _mm_add_ps(fiy1,ty);
909 fiz1 = _mm_add_ps(fiz1,tz);
911 fjx1 = _mm_add_ps(fjx1,tx);
912 fjy1 = _mm_add_ps(fjy1,ty);
913 fjz1 = _mm_add_ps(fjz1,tz);
915 /**************************
916 * CALCULATE INTERACTIONS *
917 **************************/
919 r12 = _mm_mul_ps(rsq12,rinv12);
920 r12 = _mm_andnot_ps(dummy_mask,r12);
922 /* Calculate table index by multiplying r with table scale and truncate to integer */
923 rt = _mm_mul_ps(r12,vftabscale);
924 vfitab = _mm_cvttps_epi32(rt);
925 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
926 vfitab = _mm_slli_epi32(vfitab,2);
928 /* CUBIC SPLINE TABLE ELECTROSTATICS */
929 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
930 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
931 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
932 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
933 _MM_TRANSPOSE4_PS(Y,F,G,H);
934 Heps = _mm_mul_ps(vfeps,H);
935 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
936 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
937 velec = _mm_mul_ps(qq12,VV);
938 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
939 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
941 /* Update potential sum for this i atom from the interaction with this j atom. */
942 velec = _mm_andnot_ps(dummy_mask,velec);
943 velecsum = _mm_add_ps(velecsum,velec);
947 fscal = _mm_andnot_ps(dummy_mask,fscal);
949 /* Calculate temporary vectorial force */
950 tx = _mm_mul_ps(fscal,dx12);
951 ty = _mm_mul_ps(fscal,dy12);
952 tz = _mm_mul_ps(fscal,dz12);
954 /* Update vectorial force */
955 fix1 = _mm_add_ps(fix1,tx);
956 fiy1 = _mm_add_ps(fiy1,ty);
957 fiz1 = _mm_add_ps(fiz1,tz);
959 fjx2 = _mm_add_ps(fjx2,tx);
960 fjy2 = _mm_add_ps(fjy2,ty);
961 fjz2 = _mm_add_ps(fjz2,tz);
963 /**************************
964 * CALCULATE INTERACTIONS *
965 **************************/
967 r13 = _mm_mul_ps(rsq13,rinv13);
968 r13 = _mm_andnot_ps(dummy_mask,r13);
970 /* Calculate table index by multiplying r with table scale and truncate to integer */
971 rt = _mm_mul_ps(r13,vftabscale);
972 vfitab = _mm_cvttps_epi32(rt);
973 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
974 vfitab = _mm_slli_epi32(vfitab,2);
976 /* CUBIC SPLINE TABLE ELECTROSTATICS */
977 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
978 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
979 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
980 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
981 _MM_TRANSPOSE4_PS(Y,F,G,H);
982 Heps = _mm_mul_ps(vfeps,H);
983 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
984 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
985 velec = _mm_mul_ps(qq13,VV);
986 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
987 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
989 /* Update potential sum for this i atom from the interaction with this j atom. */
990 velec = _mm_andnot_ps(dummy_mask,velec);
991 velecsum = _mm_add_ps(velecsum,velec);
995 fscal = _mm_andnot_ps(dummy_mask,fscal);
997 /* Calculate temporary vectorial force */
998 tx = _mm_mul_ps(fscal,dx13);
999 ty = _mm_mul_ps(fscal,dy13);
1000 tz = _mm_mul_ps(fscal,dz13);
1002 /* Update vectorial force */
1003 fix1 = _mm_add_ps(fix1,tx);
1004 fiy1 = _mm_add_ps(fiy1,ty);
1005 fiz1 = _mm_add_ps(fiz1,tz);
1007 fjx3 = _mm_add_ps(fjx3,tx);
1008 fjy3 = _mm_add_ps(fjy3,ty);
1009 fjz3 = _mm_add_ps(fjz3,tz);
1011 /**************************
1012 * CALCULATE INTERACTIONS *
1013 **************************/
1015 r21 = _mm_mul_ps(rsq21,rinv21);
1016 r21 = _mm_andnot_ps(dummy_mask,r21);
1018 /* Calculate table index by multiplying r with table scale and truncate to integer */
1019 rt = _mm_mul_ps(r21,vftabscale);
1020 vfitab = _mm_cvttps_epi32(rt);
1021 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1022 vfitab = _mm_slli_epi32(vfitab,2);
1024 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1025 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1026 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1027 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1028 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1029 _MM_TRANSPOSE4_PS(Y,F,G,H);
1030 Heps = _mm_mul_ps(vfeps,H);
1031 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1032 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1033 velec = _mm_mul_ps(qq21,VV);
1034 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1035 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1037 /* Update potential sum for this i atom from the interaction with this j atom. */
1038 velec = _mm_andnot_ps(dummy_mask,velec);
1039 velecsum = _mm_add_ps(velecsum,velec);
1043 fscal = _mm_andnot_ps(dummy_mask,fscal);
1045 /* Calculate temporary vectorial force */
1046 tx = _mm_mul_ps(fscal,dx21);
1047 ty = _mm_mul_ps(fscal,dy21);
1048 tz = _mm_mul_ps(fscal,dz21);
1050 /* Update vectorial force */
1051 fix2 = _mm_add_ps(fix2,tx);
1052 fiy2 = _mm_add_ps(fiy2,ty);
1053 fiz2 = _mm_add_ps(fiz2,tz);
1055 fjx1 = _mm_add_ps(fjx1,tx);
1056 fjy1 = _mm_add_ps(fjy1,ty);
1057 fjz1 = _mm_add_ps(fjz1,tz);
1059 /**************************
1060 * CALCULATE INTERACTIONS *
1061 **************************/
1063 r22 = _mm_mul_ps(rsq22,rinv22);
1064 r22 = _mm_andnot_ps(dummy_mask,r22);
1066 /* Calculate table index by multiplying r with table scale and truncate to integer */
1067 rt = _mm_mul_ps(r22,vftabscale);
1068 vfitab = _mm_cvttps_epi32(rt);
1069 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1070 vfitab = _mm_slli_epi32(vfitab,2);
1072 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1073 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1074 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1075 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1076 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1077 _MM_TRANSPOSE4_PS(Y,F,G,H);
1078 Heps = _mm_mul_ps(vfeps,H);
1079 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1080 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1081 velec = _mm_mul_ps(qq22,VV);
1082 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1083 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1085 /* Update potential sum for this i atom from the interaction with this j atom. */
1086 velec = _mm_andnot_ps(dummy_mask,velec);
1087 velecsum = _mm_add_ps(velecsum,velec);
1091 fscal = _mm_andnot_ps(dummy_mask,fscal);
1093 /* Calculate temporary vectorial force */
1094 tx = _mm_mul_ps(fscal,dx22);
1095 ty = _mm_mul_ps(fscal,dy22);
1096 tz = _mm_mul_ps(fscal,dz22);
1098 /* Update vectorial force */
1099 fix2 = _mm_add_ps(fix2,tx);
1100 fiy2 = _mm_add_ps(fiy2,ty);
1101 fiz2 = _mm_add_ps(fiz2,tz);
1103 fjx2 = _mm_add_ps(fjx2,tx);
1104 fjy2 = _mm_add_ps(fjy2,ty);
1105 fjz2 = _mm_add_ps(fjz2,tz);
1107 /**************************
1108 * CALCULATE INTERACTIONS *
1109 **************************/
1111 r23 = _mm_mul_ps(rsq23,rinv23);
1112 r23 = _mm_andnot_ps(dummy_mask,r23);
1114 /* Calculate table index by multiplying r with table scale and truncate to integer */
1115 rt = _mm_mul_ps(r23,vftabscale);
1116 vfitab = _mm_cvttps_epi32(rt);
1117 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1118 vfitab = _mm_slli_epi32(vfitab,2);
1120 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1121 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1122 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1123 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1124 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1125 _MM_TRANSPOSE4_PS(Y,F,G,H);
1126 Heps = _mm_mul_ps(vfeps,H);
1127 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1128 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1129 velec = _mm_mul_ps(qq23,VV);
1130 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1131 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1133 /* Update potential sum for this i atom from the interaction with this j atom. */
1134 velec = _mm_andnot_ps(dummy_mask,velec);
1135 velecsum = _mm_add_ps(velecsum,velec);
1139 fscal = _mm_andnot_ps(dummy_mask,fscal);
1141 /* Calculate temporary vectorial force */
1142 tx = _mm_mul_ps(fscal,dx23);
1143 ty = _mm_mul_ps(fscal,dy23);
1144 tz = _mm_mul_ps(fscal,dz23);
1146 /* Update vectorial force */
1147 fix2 = _mm_add_ps(fix2,tx);
1148 fiy2 = _mm_add_ps(fiy2,ty);
1149 fiz2 = _mm_add_ps(fiz2,tz);
1151 fjx3 = _mm_add_ps(fjx3,tx);
1152 fjy3 = _mm_add_ps(fjy3,ty);
1153 fjz3 = _mm_add_ps(fjz3,tz);
1155 /**************************
1156 * CALCULATE INTERACTIONS *
1157 **************************/
1159 r31 = _mm_mul_ps(rsq31,rinv31);
1160 r31 = _mm_andnot_ps(dummy_mask,r31);
1162 /* Calculate table index by multiplying r with table scale and truncate to integer */
1163 rt = _mm_mul_ps(r31,vftabscale);
1164 vfitab = _mm_cvttps_epi32(rt);
1165 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1166 vfitab = _mm_slli_epi32(vfitab,2);
1168 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1169 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1170 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1171 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1172 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1173 _MM_TRANSPOSE4_PS(Y,F,G,H);
1174 Heps = _mm_mul_ps(vfeps,H);
1175 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1176 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1177 velec = _mm_mul_ps(qq31,VV);
1178 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1179 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1181 /* Update potential sum for this i atom from the interaction with this j atom. */
1182 velec = _mm_andnot_ps(dummy_mask,velec);
1183 velecsum = _mm_add_ps(velecsum,velec);
1187 fscal = _mm_andnot_ps(dummy_mask,fscal);
1189 /* Calculate temporary vectorial force */
1190 tx = _mm_mul_ps(fscal,dx31);
1191 ty = _mm_mul_ps(fscal,dy31);
1192 tz = _mm_mul_ps(fscal,dz31);
1194 /* Update vectorial force */
1195 fix3 = _mm_add_ps(fix3,tx);
1196 fiy3 = _mm_add_ps(fiy3,ty);
1197 fiz3 = _mm_add_ps(fiz3,tz);
1199 fjx1 = _mm_add_ps(fjx1,tx);
1200 fjy1 = _mm_add_ps(fjy1,ty);
1201 fjz1 = _mm_add_ps(fjz1,tz);
1203 /**************************
1204 * CALCULATE INTERACTIONS *
1205 **************************/
1207 r32 = _mm_mul_ps(rsq32,rinv32);
1208 r32 = _mm_andnot_ps(dummy_mask,r32);
1210 /* Calculate table index by multiplying r with table scale and truncate to integer */
1211 rt = _mm_mul_ps(r32,vftabscale);
1212 vfitab = _mm_cvttps_epi32(rt);
1213 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1214 vfitab = _mm_slli_epi32(vfitab,2);
1216 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1217 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1218 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1219 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1220 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1221 _MM_TRANSPOSE4_PS(Y,F,G,H);
1222 Heps = _mm_mul_ps(vfeps,H);
1223 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1224 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1225 velec = _mm_mul_ps(qq32,VV);
1226 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1227 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1229 /* Update potential sum for this i atom from the interaction with this j atom. */
1230 velec = _mm_andnot_ps(dummy_mask,velec);
1231 velecsum = _mm_add_ps(velecsum,velec);
1235 fscal = _mm_andnot_ps(dummy_mask,fscal);
1237 /* Calculate temporary vectorial force */
1238 tx = _mm_mul_ps(fscal,dx32);
1239 ty = _mm_mul_ps(fscal,dy32);
1240 tz = _mm_mul_ps(fscal,dz32);
1242 /* Update vectorial force */
1243 fix3 = _mm_add_ps(fix3,tx);
1244 fiy3 = _mm_add_ps(fiy3,ty);
1245 fiz3 = _mm_add_ps(fiz3,tz);
1247 fjx2 = _mm_add_ps(fjx2,tx);
1248 fjy2 = _mm_add_ps(fjy2,ty);
1249 fjz2 = _mm_add_ps(fjz2,tz);
1251 /**************************
1252 * CALCULATE INTERACTIONS *
1253 **************************/
1255 r33 = _mm_mul_ps(rsq33,rinv33);
1256 r33 = _mm_andnot_ps(dummy_mask,r33);
1258 /* Calculate table index by multiplying r with table scale and truncate to integer */
1259 rt = _mm_mul_ps(r33,vftabscale);
1260 vfitab = _mm_cvttps_epi32(rt);
1261 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1262 vfitab = _mm_slli_epi32(vfitab,2);
1264 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1265 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1266 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1267 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1268 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1269 _MM_TRANSPOSE4_PS(Y,F,G,H);
1270 Heps = _mm_mul_ps(vfeps,H);
1271 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1272 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1273 velec = _mm_mul_ps(qq33,VV);
1274 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1275 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1277 /* Update potential sum for this i atom from the interaction with this j atom. */
1278 velec = _mm_andnot_ps(dummy_mask,velec);
1279 velecsum = _mm_add_ps(velecsum,velec);
1283 fscal = _mm_andnot_ps(dummy_mask,fscal);
1285 /* Calculate temporary vectorial force */
1286 tx = _mm_mul_ps(fscal,dx33);
1287 ty = _mm_mul_ps(fscal,dy33);
1288 tz = _mm_mul_ps(fscal,dz33);
1290 /* Update vectorial force */
1291 fix3 = _mm_add_ps(fix3,tx);
1292 fiy3 = _mm_add_ps(fiy3,ty);
1293 fiz3 = _mm_add_ps(fiz3,tz);
1295 fjx3 = _mm_add_ps(fjx3,tx);
1296 fjy3 = _mm_add_ps(fjy3,ty);
1297 fjz3 = _mm_add_ps(fjz3,tz);
1299 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1300 f+j_coord_offsetC,f+j_coord_offsetD,
1301 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1302 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1304 /* Inner loop uses 431 flops */
1307 /* End of innermost loop */
1309 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1310 f+i_coord_offset,fshift+i_shift_offset);
1313 /* Update potential energies */
1314 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1315 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1317 /* Increment number of inner iterations */
1318 inneriter += j_index_end - j_index_start;
1320 /* Outer loop uses 38 flops */
1323 /* Increment number of outer iterations */
1326 /* Update outer/inner flops */
1328 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*431);
1331 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single
1332 * Electrostatics interaction: CubicSplineTable
1333 * VdW interaction: LennardJones
1334 * Geometry: Water4-Water4
1335 * Calculate force/pot: Force
1338 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single
1339 (t_nblist * gmx_restrict nlist,
1340 rvec * gmx_restrict xx,
1341 rvec * gmx_restrict ff,
1342 t_forcerec * gmx_restrict fr,
1343 t_mdatoms * gmx_restrict mdatoms,
1344 nb_kernel_data_t * gmx_restrict kernel_data,
1345 t_nrnb * gmx_restrict nrnb)
1347 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1348 * just 0 for non-waters.
1349 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1350 * jnr indices corresponding to data put in the four positions in the SIMD register.
1352 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1353 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1354 int jnrA,jnrB,jnrC,jnrD;
1355 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1356 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1357 real shX,shY,shZ,rcutoff_scalar;
1358 real *shiftvec,*fshift,*x,*f;
1359 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1361 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1363 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1365 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1367 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1368 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1369 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1370 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1371 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1372 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1373 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1374 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1375 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1376 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1377 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1378 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1379 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1380 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1381 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1382 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1383 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1384 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1385 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1386 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1389 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1392 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1393 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1395 __m128i ifour = _mm_set1_epi32(4);
1396 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1398 __m128 dummy_mask,cutoff_mask;
1399 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1400 __m128 one = _mm_set1_ps(1.0);
1401 __m128 two = _mm_set1_ps(2.0);
1407 jindex = nlist->jindex;
1409 shiftidx = nlist->shift;
1411 shiftvec = fr->shift_vec[0];
1412 fshift = fr->fshift[0];
1413 facel = _mm_set1_ps(fr->epsfac);
1414 charge = mdatoms->chargeA;
1415 nvdwtype = fr->ntype;
1416 vdwparam = fr->nbfp;
1417 vdwtype = mdatoms->typeA;
1419 vftab = kernel_data->table_elec->data;
1420 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1422 /* Setup water-specific parameters */
1423 inr = nlist->iinr[0];
1424 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1425 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1426 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1427 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1429 jq1 = _mm_set1_ps(charge[inr+1]);
1430 jq2 = _mm_set1_ps(charge[inr+2]);
1431 jq3 = _mm_set1_ps(charge[inr+3]);
1432 vdwjidx0A = 2*vdwtype[inr+0];
1433 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1434 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1435 qq11 = _mm_mul_ps(iq1,jq1);
1436 qq12 = _mm_mul_ps(iq1,jq2);
1437 qq13 = _mm_mul_ps(iq1,jq3);
1438 qq21 = _mm_mul_ps(iq2,jq1);
1439 qq22 = _mm_mul_ps(iq2,jq2);
1440 qq23 = _mm_mul_ps(iq2,jq3);
1441 qq31 = _mm_mul_ps(iq3,jq1);
1442 qq32 = _mm_mul_ps(iq3,jq2);
1443 qq33 = _mm_mul_ps(iq3,jq3);
1445 /* Avoid stupid compiler warnings */
1446 jnrA = jnrB = jnrC = jnrD = 0;
1447 j_coord_offsetA = 0;
1448 j_coord_offsetB = 0;
1449 j_coord_offsetC = 0;
1450 j_coord_offsetD = 0;
1455 /* Start outer loop over neighborlists */
1456 for(iidx=0; iidx<nri; iidx++)
1458 /* Load shift vector for this list */
1459 i_shift_offset = DIM*shiftidx[iidx];
1460 shX = shiftvec[i_shift_offset+XX];
1461 shY = shiftvec[i_shift_offset+YY];
1462 shZ = shiftvec[i_shift_offset+ZZ];
1464 /* Load limits for loop over neighbors */
1465 j_index_start = jindex[iidx];
1466 j_index_end = jindex[iidx+1];
1468 /* Get outer coordinate index */
1470 i_coord_offset = DIM*inr;
1472 /* Load i particle coords and add shift vector */
1473 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1474 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1475 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1476 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1477 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1478 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1479 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1480 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1481 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1482 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1483 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1484 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1486 fix0 = _mm_setzero_ps();
1487 fiy0 = _mm_setzero_ps();
1488 fiz0 = _mm_setzero_ps();
1489 fix1 = _mm_setzero_ps();
1490 fiy1 = _mm_setzero_ps();
1491 fiz1 = _mm_setzero_ps();
1492 fix2 = _mm_setzero_ps();
1493 fiy2 = _mm_setzero_ps();
1494 fiz2 = _mm_setzero_ps();
1495 fix3 = _mm_setzero_ps();
1496 fiy3 = _mm_setzero_ps();
1497 fiz3 = _mm_setzero_ps();
1499 /* Start inner kernel loop */
1500 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1503 /* Get j neighbor index, and coordinate index */
1505 jnrB = jjnr[jidx+1];
1506 jnrC = jjnr[jidx+2];
1507 jnrD = jjnr[jidx+3];
1509 j_coord_offsetA = DIM*jnrA;
1510 j_coord_offsetB = DIM*jnrB;
1511 j_coord_offsetC = DIM*jnrC;
1512 j_coord_offsetD = DIM*jnrD;
1514 /* load j atom coordinates */
1515 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1516 x+j_coord_offsetC,x+j_coord_offsetD,
1517 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1518 &jy2,&jz2,&jx3,&jy3,&jz3);
1520 /* Calculate displacement vector */
1521 dx00 = _mm_sub_ps(ix0,jx0);
1522 dy00 = _mm_sub_ps(iy0,jy0);
1523 dz00 = _mm_sub_ps(iz0,jz0);
1524 dx11 = _mm_sub_ps(ix1,jx1);
1525 dy11 = _mm_sub_ps(iy1,jy1);
1526 dz11 = _mm_sub_ps(iz1,jz1);
1527 dx12 = _mm_sub_ps(ix1,jx2);
1528 dy12 = _mm_sub_ps(iy1,jy2);
1529 dz12 = _mm_sub_ps(iz1,jz2);
1530 dx13 = _mm_sub_ps(ix1,jx3);
1531 dy13 = _mm_sub_ps(iy1,jy3);
1532 dz13 = _mm_sub_ps(iz1,jz3);
1533 dx21 = _mm_sub_ps(ix2,jx1);
1534 dy21 = _mm_sub_ps(iy2,jy1);
1535 dz21 = _mm_sub_ps(iz2,jz1);
1536 dx22 = _mm_sub_ps(ix2,jx2);
1537 dy22 = _mm_sub_ps(iy2,jy2);
1538 dz22 = _mm_sub_ps(iz2,jz2);
1539 dx23 = _mm_sub_ps(ix2,jx3);
1540 dy23 = _mm_sub_ps(iy2,jy3);
1541 dz23 = _mm_sub_ps(iz2,jz3);
1542 dx31 = _mm_sub_ps(ix3,jx1);
1543 dy31 = _mm_sub_ps(iy3,jy1);
1544 dz31 = _mm_sub_ps(iz3,jz1);
1545 dx32 = _mm_sub_ps(ix3,jx2);
1546 dy32 = _mm_sub_ps(iy3,jy2);
1547 dz32 = _mm_sub_ps(iz3,jz2);
1548 dx33 = _mm_sub_ps(ix3,jx3);
1549 dy33 = _mm_sub_ps(iy3,jy3);
1550 dz33 = _mm_sub_ps(iz3,jz3);
1552 /* Calculate squared distance and things based on it */
1553 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1554 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1555 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1556 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1557 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1558 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1559 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1560 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1561 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1562 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1564 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1565 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1566 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1567 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1568 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1569 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1570 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1571 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1572 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1574 rinvsq00 = gmx_mm_inv_ps(rsq00);
1576 fjx0 = _mm_setzero_ps();
1577 fjy0 = _mm_setzero_ps();
1578 fjz0 = _mm_setzero_ps();
1579 fjx1 = _mm_setzero_ps();
1580 fjy1 = _mm_setzero_ps();
1581 fjz1 = _mm_setzero_ps();
1582 fjx2 = _mm_setzero_ps();
1583 fjy2 = _mm_setzero_ps();
1584 fjz2 = _mm_setzero_ps();
1585 fjx3 = _mm_setzero_ps();
1586 fjy3 = _mm_setzero_ps();
1587 fjz3 = _mm_setzero_ps();
1589 /**************************
1590 * CALCULATE INTERACTIONS *
1591 **************************/
1593 /* LENNARD-JONES DISPERSION/REPULSION */
1595 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1596 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1600 /* Calculate temporary vectorial force */
1601 tx = _mm_mul_ps(fscal,dx00);
1602 ty = _mm_mul_ps(fscal,dy00);
1603 tz = _mm_mul_ps(fscal,dz00);
1605 /* Update vectorial force */
1606 fix0 = _mm_add_ps(fix0,tx);
1607 fiy0 = _mm_add_ps(fiy0,ty);
1608 fiz0 = _mm_add_ps(fiz0,tz);
1610 fjx0 = _mm_add_ps(fjx0,tx);
1611 fjy0 = _mm_add_ps(fjy0,ty);
1612 fjz0 = _mm_add_ps(fjz0,tz);
1614 /**************************
1615 * CALCULATE INTERACTIONS *
1616 **************************/
1618 r11 = _mm_mul_ps(rsq11,rinv11);
1620 /* Calculate table index by multiplying r with table scale and truncate to integer */
1621 rt = _mm_mul_ps(r11,vftabscale);
1622 vfitab = _mm_cvttps_epi32(rt);
1623 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1624 vfitab = _mm_slli_epi32(vfitab,2);
1626 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1627 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1628 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1629 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1630 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1631 _MM_TRANSPOSE4_PS(Y,F,G,H);
1632 Heps = _mm_mul_ps(vfeps,H);
1633 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1634 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1635 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1639 /* Calculate temporary vectorial force */
1640 tx = _mm_mul_ps(fscal,dx11);
1641 ty = _mm_mul_ps(fscal,dy11);
1642 tz = _mm_mul_ps(fscal,dz11);
1644 /* Update vectorial force */
1645 fix1 = _mm_add_ps(fix1,tx);
1646 fiy1 = _mm_add_ps(fiy1,ty);
1647 fiz1 = _mm_add_ps(fiz1,tz);
1649 fjx1 = _mm_add_ps(fjx1,tx);
1650 fjy1 = _mm_add_ps(fjy1,ty);
1651 fjz1 = _mm_add_ps(fjz1,tz);
1653 /**************************
1654 * CALCULATE INTERACTIONS *
1655 **************************/
1657 r12 = _mm_mul_ps(rsq12,rinv12);
1659 /* Calculate table index by multiplying r with table scale and truncate to integer */
1660 rt = _mm_mul_ps(r12,vftabscale);
1661 vfitab = _mm_cvttps_epi32(rt);
1662 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1663 vfitab = _mm_slli_epi32(vfitab,2);
1665 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1666 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1667 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1668 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1669 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1670 _MM_TRANSPOSE4_PS(Y,F,G,H);
1671 Heps = _mm_mul_ps(vfeps,H);
1672 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1673 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1674 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1678 /* Calculate temporary vectorial force */
1679 tx = _mm_mul_ps(fscal,dx12);
1680 ty = _mm_mul_ps(fscal,dy12);
1681 tz = _mm_mul_ps(fscal,dz12);
1683 /* Update vectorial force */
1684 fix1 = _mm_add_ps(fix1,tx);
1685 fiy1 = _mm_add_ps(fiy1,ty);
1686 fiz1 = _mm_add_ps(fiz1,tz);
1688 fjx2 = _mm_add_ps(fjx2,tx);
1689 fjy2 = _mm_add_ps(fjy2,ty);
1690 fjz2 = _mm_add_ps(fjz2,tz);
1692 /**************************
1693 * CALCULATE INTERACTIONS *
1694 **************************/
1696 r13 = _mm_mul_ps(rsq13,rinv13);
1698 /* Calculate table index by multiplying r with table scale and truncate to integer */
1699 rt = _mm_mul_ps(r13,vftabscale);
1700 vfitab = _mm_cvttps_epi32(rt);
1701 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1702 vfitab = _mm_slli_epi32(vfitab,2);
1704 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1705 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1706 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1707 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1708 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1709 _MM_TRANSPOSE4_PS(Y,F,G,H);
1710 Heps = _mm_mul_ps(vfeps,H);
1711 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1712 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1713 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1717 /* Calculate temporary vectorial force */
1718 tx = _mm_mul_ps(fscal,dx13);
1719 ty = _mm_mul_ps(fscal,dy13);
1720 tz = _mm_mul_ps(fscal,dz13);
1722 /* Update vectorial force */
1723 fix1 = _mm_add_ps(fix1,tx);
1724 fiy1 = _mm_add_ps(fiy1,ty);
1725 fiz1 = _mm_add_ps(fiz1,tz);
1727 fjx3 = _mm_add_ps(fjx3,tx);
1728 fjy3 = _mm_add_ps(fjy3,ty);
1729 fjz3 = _mm_add_ps(fjz3,tz);
1731 /**************************
1732 * CALCULATE INTERACTIONS *
1733 **************************/
1735 r21 = _mm_mul_ps(rsq21,rinv21);
1737 /* Calculate table index by multiplying r with table scale and truncate to integer */
1738 rt = _mm_mul_ps(r21,vftabscale);
1739 vfitab = _mm_cvttps_epi32(rt);
1740 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1741 vfitab = _mm_slli_epi32(vfitab,2);
1743 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1744 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1745 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1746 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1747 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1748 _MM_TRANSPOSE4_PS(Y,F,G,H);
1749 Heps = _mm_mul_ps(vfeps,H);
1750 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1751 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1752 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1756 /* Calculate temporary vectorial force */
1757 tx = _mm_mul_ps(fscal,dx21);
1758 ty = _mm_mul_ps(fscal,dy21);
1759 tz = _mm_mul_ps(fscal,dz21);
1761 /* Update vectorial force */
1762 fix2 = _mm_add_ps(fix2,tx);
1763 fiy2 = _mm_add_ps(fiy2,ty);
1764 fiz2 = _mm_add_ps(fiz2,tz);
1766 fjx1 = _mm_add_ps(fjx1,tx);
1767 fjy1 = _mm_add_ps(fjy1,ty);
1768 fjz1 = _mm_add_ps(fjz1,tz);
1770 /**************************
1771 * CALCULATE INTERACTIONS *
1772 **************************/
1774 r22 = _mm_mul_ps(rsq22,rinv22);
1776 /* Calculate table index by multiplying r with table scale and truncate to integer */
1777 rt = _mm_mul_ps(r22,vftabscale);
1778 vfitab = _mm_cvttps_epi32(rt);
1779 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1780 vfitab = _mm_slli_epi32(vfitab,2);
1782 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1783 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1784 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1785 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1786 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1787 _MM_TRANSPOSE4_PS(Y,F,G,H);
1788 Heps = _mm_mul_ps(vfeps,H);
1789 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1790 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1791 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1795 /* Calculate temporary vectorial force */
1796 tx = _mm_mul_ps(fscal,dx22);
1797 ty = _mm_mul_ps(fscal,dy22);
1798 tz = _mm_mul_ps(fscal,dz22);
1800 /* Update vectorial force */
1801 fix2 = _mm_add_ps(fix2,tx);
1802 fiy2 = _mm_add_ps(fiy2,ty);
1803 fiz2 = _mm_add_ps(fiz2,tz);
1805 fjx2 = _mm_add_ps(fjx2,tx);
1806 fjy2 = _mm_add_ps(fjy2,ty);
1807 fjz2 = _mm_add_ps(fjz2,tz);
1809 /**************************
1810 * CALCULATE INTERACTIONS *
1811 **************************/
1813 r23 = _mm_mul_ps(rsq23,rinv23);
1815 /* Calculate table index by multiplying r with table scale and truncate to integer */
1816 rt = _mm_mul_ps(r23,vftabscale);
1817 vfitab = _mm_cvttps_epi32(rt);
1818 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1819 vfitab = _mm_slli_epi32(vfitab,2);
1821 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1822 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1823 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1824 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1825 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1826 _MM_TRANSPOSE4_PS(Y,F,G,H);
1827 Heps = _mm_mul_ps(vfeps,H);
1828 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1829 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1830 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1834 /* Calculate temporary vectorial force */
1835 tx = _mm_mul_ps(fscal,dx23);
1836 ty = _mm_mul_ps(fscal,dy23);
1837 tz = _mm_mul_ps(fscal,dz23);
1839 /* Update vectorial force */
1840 fix2 = _mm_add_ps(fix2,tx);
1841 fiy2 = _mm_add_ps(fiy2,ty);
1842 fiz2 = _mm_add_ps(fiz2,tz);
1844 fjx3 = _mm_add_ps(fjx3,tx);
1845 fjy3 = _mm_add_ps(fjy3,ty);
1846 fjz3 = _mm_add_ps(fjz3,tz);
1848 /**************************
1849 * CALCULATE INTERACTIONS *
1850 **************************/
1852 r31 = _mm_mul_ps(rsq31,rinv31);
1854 /* Calculate table index by multiplying r with table scale and truncate to integer */
1855 rt = _mm_mul_ps(r31,vftabscale);
1856 vfitab = _mm_cvttps_epi32(rt);
1857 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1858 vfitab = _mm_slli_epi32(vfitab,2);
1860 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1861 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1862 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1863 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1864 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1865 _MM_TRANSPOSE4_PS(Y,F,G,H);
1866 Heps = _mm_mul_ps(vfeps,H);
1867 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1868 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1869 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1873 /* Calculate temporary vectorial force */
1874 tx = _mm_mul_ps(fscal,dx31);
1875 ty = _mm_mul_ps(fscal,dy31);
1876 tz = _mm_mul_ps(fscal,dz31);
1878 /* Update vectorial force */
1879 fix3 = _mm_add_ps(fix3,tx);
1880 fiy3 = _mm_add_ps(fiy3,ty);
1881 fiz3 = _mm_add_ps(fiz3,tz);
1883 fjx1 = _mm_add_ps(fjx1,tx);
1884 fjy1 = _mm_add_ps(fjy1,ty);
1885 fjz1 = _mm_add_ps(fjz1,tz);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 r32 = _mm_mul_ps(rsq32,rinv32);
1893 /* Calculate table index by multiplying r with table scale and truncate to integer */
1894 rt = _mm_mul_ps(r32,vftabscale);
1895 vfitab = _mm_cvttps_epi32(rt);
1896 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1897 vfitab = _mm_slli_epi32(vfitab,2);
1899 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1900 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1901 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1902 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1903 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1904 _MM_TRANSPOSE4_PS(Y,F,G,H);
1905 Heps = _mm_mul_ps(vfeps,H);
1906 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1907 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1908 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1912 /* Calculate temporary vectorial force */
1913 tx = _mm_mul_ps(fscal,dx32);
1914 ty = _mm_mul_ps(fscal,dy32);
1915 tz = _mm_mul_ps(fscal,dz32);
1917 /* Update vectorial force */
1918 fix3 = _mm_add_ps(fix3,tx);
1919 fiy3 = _mm_add_ps(fiy3,ty);
1920 fiz3 = _mm_add_ps(fiz3,tz);
1922 fjx2 = _mm_add_ps(fjx2,tx);
1923 fjy2 = _mm_add_ps(fjy2,ty);
1924 fjz2 = _mm_add_ps(fjz2,tz);
1926 /**************************
1927 * CALCULATE INTERACTIONS *
1928 **************************/
1930 r33 = _mm_mul_ps(rsq33,rinv33);
1932 /* Calculate table index by multiplying r with table scale and truncate to integer */
1933 rt = _mm_mul_ps(r33,vftabscale);
1934 vfitab = _mm_cvttps_epi32(rt);
1935 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1936 vfitab = _mm_slli_epi32(vfitab,2);
1938 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1939 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1940 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1941 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1942 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1943 _MM_TRANSPOSE4_PS(Y,F,G,H);
1944 Heps = _mm_mul_ps(vfeps,H);
1945 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1946 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1947 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1951 /* Calculate temporary vectorial force */
1952 tx = _mm_mul_ps(fscal,dx33);
1953 ty = _mm_mul_ps(fscal,dy33);
1954 tz = _mm_mul_ps(fscal,dz33);
1956 /* Update vectorial force */
1957 fix3 = _mm_add_ps(fix3,tx);
1958 fiy3 = _mm_add_ps(fiy3,ty);
1959 fiz3 = _mm_add_ps(fiz3,tz);
1961 fjx3 = _mm_add_ps(fjx3,tx);
1962 fjy3 = _mm_add_ps(fjy3,ty);
1963 fjz3 = _mm_add_ps(fjz3,tz);
1965 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1966 f+j_coord_offsetC,f+j_coord_offsetD,
1967 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1968 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1970 /* Inner loop uses 381 flops */
1973 if(jidx<j_index_end)
1976 /* Get j neighbor index, and coordinate index */
1978 jnrB = jjnr[jidx+1];
1979 jnrC = jjnr[jidx+2];
1980 jnrD = jjnr[jidx+3];
1982 /* Sign of each element will be negative for non-real atoms.
1983 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1984 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1986 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1987 jnrA = (jnrA>=0) ? jnrA : 0;
1988 jnrB = (jnrB>=0) ? jnrB : 0;
1989 jnrC = (jnrC>=0) ? jnrC : 0;
1990 jnrD = (jnrD>=0) ? jnrD : 0;
1992 j_coord_offsetA = DIM*jnrA;
1993 j_coord_offsetB = DIM*jnrB;
1994 j_coord_offsetC = DIM*jnrC;
1995 j_coord_offsetD = DIM*jnrD;
1997 /* load j atom coordinates */
1998 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1999 x+j_coord_offsetC,x+j_coord_offsetD,
2000 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2001 &jy2,&jz2,&jx3,&jy3,&jz3);
2003 /* Calculate displacement vector */
2004 dx00 = _mm_sub_ps(ix0,jx0);
2005 dy00 = _mm_sub_ps(iy0,jy0);
2006 dz00 = _mm_sub_ps(iz0,jz0);
2007 dx11 = _mm_sub_ps(ix1,jx1);
2008 dy11 = _mm_sub_ps(iy1,jy1);
2009 dz11 = _mm_sub_ps(iz1,jz1);
2010 dx12 = _mm_sub_ps(ix1,jx2);
2011 dy12 = _mm_sub_ps(iy1,jy2);
2012 dz12 = _mm_sub_ps(iz1,jz2);
2013 dx13 = _mm_sub_ps(ix1,jx3);
2014 dy13 = _mm_sub_ps(iy1,jy3);
2015 dz13 = _mm_sub_ps(iz1,jz3);
2016 dx21 = _mm_sub_ps(ix2,jx1);
2017 dy21 = _mm_sub_ps(iy2,jy1);
2018 dz21 = _mm_sub_ps(iz2,jz1);
2019 dx22 = _mm_sub_ps(ix2,jx2);
2020 dy22 = _mm_sub_ps(iy2,jy2);
2021 dz22 = _mm_sub_ps(iz2,jz2);
2022 dx23 = _mm_sub_ps(ix2,jx3);
2023 dy23 = _mm_sub_ps(iy2,jy3);
2024 dz23 = _mm_sub_ps(iz2,jz3);
2025 dx31 = _mm_sub_ps(ix3,jx1);
2026 dy31 = _mm_sub_ps(iy3,jy1);
2027 dz31 = _mm_sub_ps(iz3,jz1);
2028 dx32 = _mm_sub_ps(ix3,jx2);
2029 dy32 = _mm_sub_ps(iy3,jy2);
2030 dz32 = _mm_sub_ps(iz3,jz2);
2031 dx33 = _mm_sub_ps(ix3,jx3);
2032 dy33 = _mm_sub_ps(iy3,jy3);
2033 dz33 = _mm_sub_ps(iz3,jz3);
2035 /* Calculate squared distance and things based on it */
2036 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2037 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2038 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2039 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2040 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2041 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2042 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2043 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2044 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2045 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2047 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2048 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2049 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2050 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2051 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2052 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2053 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2054 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2055 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2057 rinvsq00 = gmx_mm_inv_ps(rsq00);
2059 fjx0 = _mm_setzero_ps();
2060 fjy0 = _mm_setzero_ps();
2061 fjz0 = _mm_setzero_ps();
2062 fjx1 = _mm_setzero_ps();
2063 fjy1 = _mm_setzero_ps();
2064 fjz1 = _mm_setzero_ps();
2065 fjx2 = _mm_setzero_ps();
2066 fjy2 = _mm_setzero_ps();
2067 fjz2 = _mm_setzero_ps();
2068 fjx3 = _mm_setzero_ps();
2069 fjy3 = _mm_setzero_ps();
2070 fjz3 = _mm_setzero_ps();
2072 /**************************
2073 * CALCULATE INTERACTIONS *
2074 **************************/
2076 /* LENNARD-JONES DISPERSION/REPULSION */
2078 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2079 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2083 fscal = _mm_andnot_ps(dummy_mask,fscal);
2085 /* Calculate temporary vectorial force */
2086 tx = _mm_mul_ps(fscal,dx00);
2087 ty = _mm_mul_ps(fscal,dy00);
2088 tz = _mm_mul_ps(fscal,dz00);
2090 /* Update vectorial force */
2091 fix0 = _mm_add_ps(fix0,tx);
2092 fiy0 = _mm_add_ps(fiy0,ty);
2093 fiz0 = _mm_add_ps(fiz0,tz);
2095 fjx0 = _mm_add_ps(fjx0,tx);
2096 fjy0 = _mm_add_ps(fjy0,ty);
2097 fjz0 = _mm_add_ps(fjz0,tz);
2099 /**************************
2100 * CALCULATE INTERACTIONS *
2101 **************************/
2103 r11 = _mm_mul_ps(rsq11,rinv11);
2104 r11 = _mm_andnot_ps(dummy_mask,r11);
2106 /* Calculate table index by multiplying r with table scale and truncate to integer */
2107 rt = _mm_mul_ps(r11,vftabscale);
2108 vfitab = _mm_cvttps_epi32(rt);
2109 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2110 vfitab = _mm_slli_epi32(vfitab,2);
2112 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2113 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2114 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2115 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2116 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2117 _MM_TRANSPOSE4_PS(Y,F,G,H);
2118 Heps = _mm_mul_ps(vfeps,H);
2119 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2120 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2121 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2125 fscal = _mm_andnot_ps(dummy_mask,fscal);
2127 /* Calculate temporary vectorial force */
2128 tx = _mm_mul_ps(fscal,dx11);
2129 ty = _mm_mul_ps(fscal,dy11);
2130 tz = _mm_mul_ps(fscal,dz11);
2132 /* Update vectorial force */
2133 fix1 = _mm_add_ps(fix1,tx);
2134 fiy1 = _mm_add_ps(fiy1,ty);
2135 fiz1 = _mm_add_ps(fiz1,tz);
2137 fjx1 = _mm_add_ps(fjx1,tx);
2138 fjy1 = _mm_add_ps(fjy1,ty);
2139 fjz1 = _mm_add_ps(fjz1,tz);
2141 /**************************
2142 * CALCULATE INTERACTIONS *
2143 **************************/
2145 r12 = _mm_mul_ps(rsq12,rinv12);
2146 r12 = _mm_andnot_ps(dummy_mask,r12);
2148 /* Calculate table index by multiplying r with table scale and truncate to integer */
2149 rt = _mm_mul_ps(r12,vftabscale);
2150 vfitab = _mm_cvttps_epi32(rt);
2151 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2152 vfitab = _mm_slli_epi32(vfitab,2);
2154 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2155 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2156 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2157 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2158 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2159 _MM_TRANSPOSE4_PS(Y,F,G,H);
2160 Heps = _mm_mul_ps(vfeps,H);
2161 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2162 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2163 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2167 fscal = _mm_andnot_ps(dummy_mask,fscal);
2169 /* Calculate temporary vectorial force */
2170 tx = _mm_mul_ps(fscal,dx12);
2171 ty = _mm_mul_ps(fscal,dy12);
2172 tz = _mm_mul_ps(fscal,dz12);
2174 /* Update vectorial force */
2175 fix1 = _mm_add_ps(fix1,tx);
2176 fiy1 = _mm_add_ps(fiy1,ty);
2177 fiz1 = _mm_add_ps(fiz1,tz);
2179 fjx2 = _mm_add_ps(fjx2,tx);
2180 fjy2 = _mm_add_ps(fjy2,ty);
2181 fjz2 = _mm_add_ps(fjz2,tz);
2183 /**************************
2184 * CALCULATE INTERACTIONS *
2185 **************************/
2187 r13 = _mm_mul_ps(rsq13,rinv13);
2188 r13 = _mm_andnot_ps(dummy_mask,r13);
2190 /* Calculate table index by multiplying r with table scale and truncate to integer */
2191 rt = _mm_mul_ps(r13,vftabscale);
2192 vfitab = _mm_cvttps_epi32(rt);
2193 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2194 vfitab = _mm_slli_epi32(vfitab,2);
2196 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2197 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2198 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2199 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2200 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2201 _MM_TRANSPOSE4_PS(Y,F,G,H);
2202 Heps = _mm_mul_ps(vfeps,H);
2203 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2204 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2205 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2209 fscal = _mm_andnot_ps(dummy_mask,fscal);
2211 /* Calculate temporary vectorial force */
2212 tx = _mm_mul_ps(fscal,dx13);
2213 ty = _mm_mul_ps(fscal,dy13);
2214 tz = _mm_mul_ps(fscal,dz13);
2216 /* Update vectorial force */
2217 fix1 = _mm_add_ps(fix1,tx);
2218 fiy1 = _mm_add_ps(fiy1,ty);
2219 fiz1 = _mm_add_ps(fiz1,tz);
2221 fjx3 = _mm_add_ps(fjx3,tx);
2222 fjy3 = _mm_add_ps(fjy3,ty);
2223 fjz3 = _mm_add_ps(fjz3,tz);
2225 /**************************
2226 * CALCULATE INTERACTIONS *
2227 **************************/
2229 r21 = _mm_mul_ps(rsq21,rinv21);
2230 r21 = _mm_andnot_ps(dummy_mask,r21);
2232 /* Calculate table index by multiplying r with table scale and truncate to integer */
2233 rt = _mm_mul_ps(r21,vftabscale);
2234 vfitab = _mm_cvttps_epi32(rt);
2235 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2236 vfitab = _mm_slli_epi32(vfitab,2);
2238 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2239 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2240 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2241 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2242 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2243 _MM_TRANSPOSE4_PS(Y,F,G,H);
2244 Heps = _mm_mul_ps(vfeps,H);
2245 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2246 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2247 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2251 fscal = _mm_andnot_ps(dummy_mask,fscal);
2253 /* Calculate temporary vectorial force */
2254 tx = _mm_mul_ps(fscal,dx21);
2255 ty = _mm_mul_ps(fscal,dy21);
2256 tz = _mm_mul_ps(fscal,dz21);
2258 /* Update vectorial force */
2259 fix2 = _mm_add_ps(fix2,tx);
2260 fiy2 = _mm_add_ps(fiy2,ty);
2261 fiz2 = _mm_add_ps(fiz2,tz);
2263 fjx1 = _mm_add_ps(fjx1,tx);
2264 fjy1 = _mm_add_ps(fjy1,ty);
2265 fjz1 = _mm_add_ps(fjz1,tz);
2267 /**************************
2268 * CALCULATE INTERACTIONS *
2269 **************************/
2271 r22 = _mm_mul_ps(rsq22,rinv22);
2272 r22 = _mm_andnot_ps(dummy_mask,r22);
2274 /* Calculate table index by multiplying r with table scale and truncate to integer */
2275 rt = _mm_mul_ps(r22,vftabscale);
2276 vfitab = _mm_cvttps_epi32(rt);
2277 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2278 vfitab = _mm_slli_epi32(vfitab,2);
2280 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2281 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2282 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2283 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2284 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2285 _MM_TRANSPOSE4_PS(Y,F,G,H);
2286 Heps = _mm_mul_ps(vfeps,H);
2287 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2288 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2289 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2293 fscal = _mm_andnot_ps(dummy_mask,fscal);
2295 /* Calculate temporary vectorial force */
2296 tx = _mm_mul_ps(fscal,dx22);
2297 ty = _mm_mul_ps(fscal,dy22);
2298 tz = _mm_mul_ps(fscal,dz22);
2300 /* Update vectorial force */
2301 fix2 = _mm_add_ps(fix2,tx);
2302 fiy2 = _mm_add_ps(fiy2,ty);
2303 fiz2 = _mm_add_ps(fiz2,tz);
2305 fjx2 = _mm_add_ps(fjx2,tx);
2306 fjy2 = _mm_add_ps(fjy2,ty);
2307 fjz2 = _mm_add_ps(fjz2,tz);
2309 /**************************
2310 * CALCULATE INTERACTIONS *
2311 **************************/
2313 r23 = _mm_mul_ps(rsq23,rinv23);
2314 r23 = _mm_andnot_ps(dummy_mask,r23);
2316 /* Calculate table index by multiplying r with table scale and truncate to integer */
2317 rt = _mm_mul_ps(r23,vftabscale);
2318 vfitab = _mm_cvttps_epi32(rt);
2319 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2320 vfitab = _mm_slli_epi32(vfitab,2);
2322 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2323 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2324 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2325 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2326 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2327 _MM_TRANSPOSE4_PS(Y,F,G,H);
2328 Heps = _mm_mul_ps(vfeps,H);
2329 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2330 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2331 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2335 fscal = _mm_andnot_ps(dummy_mask,fscal);
2337 /* Calculate temporary vectorial force */
2338 tx = _mm_mul_ps(fscal,dx23);
2339 ty = _mm_mul_ps(fscal,dy23);
2340 tz = _mm_mul_ps(fscal,dz23);
2342 /* Update vectorial force */
2343 fix2 = _mm_add_ps(fix2,tx);
2344 fiy2 = _mm_add_ps(fiy2,ty);
2345 fiz2 = _mm_add_ps(fiz2,tz);
2347 fjx3 = _mm_add_ps(fjx3,tx);
2348 fjy3 = _mm_add_ps(fjy3,ty);
2349 fjz3 = _mm_add_ps(fjz3,tz);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 r31 = _mm_mul_ps(rsq31,rinv31);
2356 r31 = _mm_andnot_ps(dummy_mask,r31);
2358 /* Calculate table index by multiplying r with table scale and truncate to integer */
2359 rt = _mm_mul_ps(r31,vftabscale);
2360 vfitab = _mm_cvttps_epi32(rt);
2361 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2362 vfitab = _mm_slli_epi32(vfitab,2);
2364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2365 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2366 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2367 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2368 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2369 _MM_TRANSPOSE4_PS(Y,F,G,H);
2370 Heps = _mm_mul_ps(vfeps,H);
2371 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2372 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2373 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2377 fscal = _mm_andnot_ps(dummy_mask,fscal);
2379 /* Calculate temporary vectorial force */
2380 tx = _mm_mul_ps(fscal,dx31);
2381 ty = _mm_mul_ps(fscal,dy31);
2382 tz = _mm_mul_ps(fscal,dz31);
2384 /* Update vectorial force */
2385 fix3 = _mm_add_ps(fix3,tx);
2386 fiy3 = _mm_add_ps(fiy3,ty);
2387 fiz3 = _mm_add_ps(fiz3,tz);
2389 fjx1 = _mm_add_ps(fjx1,tx);
2390 fjy1 = _mm_add_ps(fjy1,ty);
2391 fjz1 = _mm_add_ps(fjz1,tz);
2393 /**************************
2394 * CALCULATE INTERACTIONS *
2395 **************************/
2397 r32 = _mm_mul_ps(rsq32,rinv32);
2398 r32 = _mm_andnot_ps(dummy_mask,r32);
2400 /* Calculate table index by multiplying r with table scale and truncate to integer */
2401 rt = _mm_mul_ps(r32,vftabscale);
2402 vfitab = _mm_cvttps_epi32(rt);
2403 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2404 vfitab = _mm_slli_epi32(vfitab,2);
2406 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2407 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2408 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2409 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2410 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2411 _MM_TRANSPOSE4_PS(Y,F,G,H);
2412 Heps = _mm_mul_ps(vfeps,H);
2413 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2414 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2415 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2419 fscal = _mm_andnot_ps(dummy_mask,fscal);
2421 /* Calculate temporary vectorial force */
2422 tx = _mm_mul_ps(fscal,dx32);
2423 ty = _mm_mul_ps(fscal,dy32);
2424 tz = _mm_mul_ps(fscal,dz32);
2426 /* Update vectorial force */
2427 fix3 = _mm_add_ps(fix3,tx);
2428 fiy3 = _mm_add_ps(fiy3,ty);
2429 fiz3 = _mm_add_ps(fiz3,tz);
2431 fjx2 = _mm_add_ps(fjx2,tx);
2432 fjy2 = _mm_add_ps(fjy2,ty);
2433 fjz2 = _mm_add_ps(fjz2,tz);
2435 /**************************
2436 * CALCULATE INTERACTIONS *
2437 **************************/
2439 r33 = _mm_mul_ps(rsq33,rinv33);
2440 r33 = _mm_andnot_ps(dummy_mask,r33);
2442 /* Calculate table index by multiplying r with table scale and truncate to integer */
2443 rt = _mm_mul_ps(r33,vftabscale);
2444 vfitab = _mm_cvttps_epi32(rt);
2445 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2446 vfitab = _mm_slli_epi32(vfitab,2);
2448 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2449 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2450 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2451 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2452 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2453 _MM_TRANSPOSE4_PS(Y,F,G,H);
2454 Heps = _mm_mul_ps(vfeps,H);
2455 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2456 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2457 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2461 fscal = _mm_andnot_ps(dummy_mask,fscal);
2463 /* Calculate temporary vectorial force */
2464 tx = _mm_mul_ps(fscal,dx33);
2465 ty = _mm_mul_ps(fscal,dy33);
2466 tz = _mm_mul_ps(fscal,dz33);
2468 /* Update vectorial force */
2469 fix3 = _mm_add_ps(fix3,tx);
2470 fiy3 = _mm_add_ps(fiy3,ty);
2471 fiz3 = _mm_add_ps(fiz3,tz);
2473 fjx3 = _mm_add_ps(fjx3,tx);
2474 fjy3 = _mm_add_ps(fjy3,ty);
2475 fjz3 = _mm_add_ps(fjz3,tz);
2477 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2478 f+j_coord_offsetC,f+j_coord_offsetD,
2479 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2480 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2482 /* Inner loop uses 390 flops */
2485 /* End of innermost loop */
2487 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2488 f+i_coord_offset,fshift+i_shift_offset);
2490 /* Increment number of inner iterations */
2491 inneriter += j_index_end - j_index_start;
2493 /* Outer loop uses 36 flops */
2496 /* Increment number of outer iterations */
2499 /* Update outer/inner flops */
2501 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*390);