2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
94 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
99 __m128 dummy_mask,cutoff_mask;
100 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
101 __m128 one = _mm_set1_ps(1.0);
102 __m128 two = _mm_set1_ps(2.0);
108 jindex = nlist->jindex;
110 shiftidx = nlist->shift;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_ps(fr->epsfac);
115 charge = mdatoms->chargeA;
116 nvdwtype = fr->ntype;
118 vdwtype = mdatoms->typeA;
120 vftab = kernel_data->table_vdw->data;
121 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
123 /* Setup water-specific parameters */
124 inr = nlist->iinr[0];
125 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
126 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
127 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
128 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
130 jq0 = _mm_set1_ps(charge[inr+0]);
131 jq1 = _mm_set1_ps(charge[inr+1]);
132 jq2 = _mm_set1_ps(charge[inr+2]);
133 vdwjidx0A = 2*vdwtype[inr+0];
134 qq00 = _mm_mul_ps(iq0,jq0);
135 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
137 qq01 = _mm_mul_ps(iq0,jq1);
138 qq02 = _mm_mul_ps(iq0,jq2);
139 qq10 = _mm_mul_ps(iq1,jq0);
140 qq11 = _mm_mul_ps(iq1,jq1);
141 qq12 = _mm_mul_ps(iq1,jq2);
142 qq20 = _mm_mul_ps(iq2,jq0);
143 qq21 = _mm_mul_ps(iq2,jq1);
144 qq22 = _mm_mul_ps(iq2,jq2);
146 /* Avoid stupid compiler warnings */
147 jnrA = jnrB = jnrC = jnrD = 0;
156 /* Start outer loop over neighborlists */
157 for(iidx=0; iidx<nri; iidx++)
159 /* Load shift vector for this list */
160 i_shift_offset = DIM*shiftidx[iidx];
161 shX = shiftvec[i_shift_offset+XX];
162 shY = shiftvec[i_shift_offset+YY];
163 shZ = shiftvec[i_shift_offset+ZZ];
165 /* Load limits for loop over neighbors */
166 j_index_start = jindex[iidx];
167 j_index_end = jindex[iidx+1];
169 /* Get outer coordinate index */
171 i_coord_offset = DIM*inr;
173 /* Load i particle coords and add shift vector */
174 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
175 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
176 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
177 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
178 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
179 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
180 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
181 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
182 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
184 fix0 = _mm_setzero_ps();
185 fiy0 = _mm_setzero_ps();
186 fiz0 = _mm_setzero_ps();
187 fix1 = _mm_setzero_ps();
188 fiy1 = _mm_setzero_ps();
189 fiz1 = _mm_setzero_ps();
190 fix2 = _mm_setzero_ps();
191 fiy2 = _mm_setzero_ps();
192 fiz2 = _mm_setzero_ps();
194 /* Reset potential sums */
195 velecsum = _mm_setzero_ps();
196 vvdwsum = _mm_setzero_ps();
198 /* Start inner kernel loop */
199 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
202 /* Get j neighbor index, and coordinate index */
208 j_coord_offsetA = DIM*jnrA;
209 j_coord_offsetB = DIM*jnrB;
210 j_coord_offsetC = DIM*jnrC;
211 j_coord_offsetD = DIM*jnrD;
213 /* load j atom coordinates */
214 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
215 x+j_coord_offsetC,x+j_coord_offsetD,
216 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
218 /* Calculate displacement vector */
219 dx00 = _mm_sub_ps(ix0,jx0);
220 dy00 = _mm_sub_ps(iy0,jy0);
221 dz00 = _mm_sub_ps(iz0,jz0);
222 dx01 = _mm_sub_ps(ix0,jx1);
223 dy01 = _mm_sub_ps(iy0,jy1);
224 dz01 = _mm_sub_ps(iz0,jz1);
225 dx02 = _mm_sub_ps(ix0,jx2);
226 dy02 = _mm_sub_ps(iy0,jy2);
227 dz02 = _mm_sub_ps(iz0,jz2);
228 dx10 = _mm_sub_ps(ix1,jx0);
229 dy10 = _mm_sub_ps(iy1,jy0);
230 dz10 = _mm_sub_ps(iz1,jz0);
231 dx11 = _mm_sub_ps(ix1,jx1);
232 dy11 = _mm_sub_ps(iy1,jy1);
233 dz11 = _mm_sub_ps(iz1,jz1);
234 dx12 = _mm_sub_ps(ix1,jx2);
235 dy12 = _mm_sub_ps(iy1,jy2);
236 dz12 = _mm_sub_ps(iz1,jz2);
237 dx20 = _mm_sub_ps(ix2,jx0);
238 dy20 = _mm_sub_ps(iy2,jy0);
239 dz20 = _mm_sub_ps(iz2,jz0);
240 dx21 = _mm_sub_ps(ix2,jx1);
241 dy21 = _mm_sub_ps(iy2,jy1);
242 dz21 = _mm_sub_ps(iz2,jz1);
243 dx22 = _mm_sub_ps(ix2,jx2);
244 dy22 = _mm_sub_ps(iy2,jy2);
245 dz22 = _mm_sub_ps(iz2,jz2);
247 /* Calculate squared distance and things based on it */
248 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
249 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
250 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
251 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
252 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
253 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
254 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
255 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
256 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
258 rinv00 = gmx_mm_invsqrt_ps(rsq00);
259 rinv01 = gmx_mm_invsqrt_ps(rsq01);
260 rinv02 = gmx_mm_invsqrt_ps(rsq02);
261 rinv10 = gmx_mm_invsqrt_ps(rsq10);
262 rinv11 = gmx_mm_invsqrt_ps(rsq11);
263 rinv12 = gmx_mm_invsqrt_ps(rsq12);
264 rinv20 = gmx_mm_invsqrt_ps(rsq20);
265 rinv21 = gmx_mm_invsqrt_ps(rsq21);
266 rinv22 = gmx_mm_invsqrt_ps(rsq22);
268 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
269 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
270 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
271 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
272 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
273 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
274 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
275 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
276 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
278 fjx0 = _mm_setzero_ps();
279 fjy0 = _mm_setzero_ps();
280 fjz0 = _mm_setzero_ps();
281 fjx1 = _mm_setzero_ps();
282 fjy1 = _mm_setzero_ps();
283 fjz1 = _mm_setzero_ps();
284 fjx2 = _mm_setzero_ps();
285 fjy2 = _mm_setzero_ps();
286 fjz2 = _mm_setzero_ps();
288 /**************************
289 * CALCULATE INTERACTIONS *
290 **************************/
292 r00 = _mm_mul_ps(rsq00,rinv00);
294 /* Calculate table index by multiplying r with table scale and truncate to integer */
295 rt = _mm_mul_ps(r00,vftabscale);
296 vfitab = _mm_cvttps_epi32(rt);
297 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
298 vfitab = _mm_slli_epi32(vfitab,3);
300 /* COULOMB ELECTROSTATICS */
301 velec = _mm_mul_ps(qq00,rinv00);
302 felec = _mm_mul_ps(velec,rinvsq00);
304 /* CUBIC SPLINE TABLE DISPERSION */
305 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
306 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
307 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
308 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
309 _MM_TRANSPOSE4_PS(Y,F,G,H);
310 Heps = _mm_mul_ps(vfeps,H);
311 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
312 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
313 vvdw6 = _mm_mul_ps(c6_00,VV);
314 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
315 fvdw6 = _mm_mul_ps(c6_00,FF);
317 /* CUBIC SPLINE TABLE REPULSION */
318 vfitab = _mm_add_epi32(vfitab,ifour);
319 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
320 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
321 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
322 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
323 _MM_TRANSPOSE4_PS(Y,F,G,H);
324 Heps = _mm_mul_ps(vfeps,H);
325 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
326 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
327 vvdw12 = _mm_mul_ps(c12_00,VV);
328 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
329 fvdw12 = _mm_mul_ps(c12_00,FF);
330 vvdw = _mm_add_ps(vvdw12,vvdw6);
331 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velecsum = _mm_add_ps(velecsum,velec);
335 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
337 fscal = _mm_add_ps(felec,fvdw);
339 /* Calculate temporary vectorial force */
340 tx = _mm_mul_ps(fscal,dx00);
341 ty = _mm_mul_ps(fscal,dy00);
342 tz = _mm_mul_ps(fscal,dz00);
344 /* Update vectorial force */
345 fix0 = _mm_add_ps(fix0,tx);
346 fiy0 = _mm_add_ps(fiy0,ty);
347 fiz0 = _mm_add_ps(fiz0,tz);
349 fjx0 = _mm_add_ps(fjx0,tx);
350 fjy0 = _mm_add_ps(fjy0,ty);
351 fjz0 = _mm_add_ps(fjz0,tz);
353 /**************************
354 * CALCULATE INTERACTIONS *
355 **************************/
357 /* COULOMB ELECTROSTATICS */
358 velec = _mm_mul_ps(qq01,rinv01);
359 felec = _mm_mul_ps(velec,rinvsq01);
361 /* Update potential sum for this i atom from the interaction with this j atom. */
362 velecsum = _mm_add_ps(velecsum,velec);
366 /* Calculate temporary vectorial force */
367 tx = _mm_mul_ps(fscal,dx01);
368 ty = _mm_mul_ps(fscal,dy01);
369 tz = _mm_mul_ps(fscal,dz01);
371 /* Update vectorial force */
372 fix0 = _mm_add_ps(fix0,tx);
373 fiy0 = _mm_add_ps(fiy0,ty);
374 fiz0 = _mm_add_ps(fiz0,tz);
376 fjx1 = _mm_add_ps(fjx1,tx);
377 fjy1 = _mm_add_ps(fjy1,ty);
378 fjz1 = _mm_add_ps(fjz1,tz);
380 /**************************
381 * CALCULATE INTERACTIONS *
382 **************************/
384 /* COULOMB ELECTROSTATICS */
385 velec = _mm_mul_ps(qq02,rinv02);
386 felec = _mm_mul_ps(velec,rinvsq02);
388 /* Update potential sum for this i atom from the interaction with this j atom. */
389 velecsum = _mm_add_ps(velecsum,velec);
393 /* Calculate temporary vectorial force */
394 tx = _mm_mul_ps(fscal,dx02);
395 ty = _mm_mul_ps(fscal,dy02);
396 tz = _mm_mul_ps(fscal,dz02);
398 /* Update vectorial force */
399 fix0 = _mm_add_ps(fix0,tx);
400 fiy0 = _mm_add_ps(fiy0,ty);
401 fiz0 = _mm_add_ps(fiz0,tz);
403 fjx2 = _mm_add_ps(fjx2,tx);
404 fjy2 = _mm_add_ps(fjy2,ty);
405 fjz2 = _mm_add_ps(fjz2,tz);
407 /**************************
408 * CALCULATE INTERACTIONS *
409 **************************/
411 /* COULOMB ELECTROSTATICS */
412 velec = _mm_mul_ps(qq10,rinv10);
413 felec = _mm_mul_ps(velec,rinvsq10);
415 /* Update potential sum for this i atom from the interaction with this j atom. */
416 velecsum = _mm_add_ps(velecsum,velec);
420 /* Calculate temporary vectorial force */
421 tx = _mm_mul_ps(fscal,dx10);
422 ty = _mm_mul_ps(fscal,dy10);
423 tz = _mm_mul_ps(fscal,dz10);
425 /* Update vectorial force */
426 fix1 = _mm_add_ps(fix1,tx);
427 fiy1 = _mm_add_ps(fiy1,ty);
428 fiz1 = _mm_add_ps(fiz1,tz);
430 fjx0 = _mm_add_ps(fjx0,tx);
431 fjy0 = _mm_add_ps(fjy0,ty);
432 fjz0 = _mm_add_ps(fjz0,tz);
434 /**************************
435 * CALCULATE INTERACTIONS *
436 **************************/
438 /* COULOMB ELECTROSTATICS */
439 velec = _mm_mul_ps(qq11,rinv11);
440 felec = _mm_mul_ps(velec,rinvsq11);
442 /* Update potential sum for this i atom from the interaction with this j atom. */
443 velecsum = _mm_add_ps(velecsum,velec);
447 /* Calculate temporary vectorial force */
448 tx = _mm_mul_ps(fscal,dx11);
449 ty = _mm_mul_ps(fscal,dy11);
450 tz = _mm_mul_ps(fscal,dz11);
452 /* Update vectorial force */
453 fix1 = _mm_add_ps(fix1,tx);
454 fiy1 = _mm_add_ps(fiy1,ty);
455 fiz1 = _mm_add_ps(fiz1,tz);
457 fjx1 = _mm_add_ps(fjx1,tx);
458 fjy1 = _mm_add_ps(fjy1,ty);
459 fjz1 = _mm_add_ps(fjz1,tz);
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 /* COULOMB ELECTROSTATICS */
466 velec = _mm_mul_ps(qq12,rinv12);
467 felec = _mm_mul_ps(velec,rinvsq12);
469 /* Update potential sum for this i atom from the interaction with this j atom. */
470 velecsum = _mm_add_ps(velecsum,velec);
474 /* Calculate temporary vectorial force */
475 tx = _mm_mul_ps(fscal,dx12);
476 ty = _mm_mul_ps(fscal,dy12);
477 tz = _mm_mul_ps(fscal,dz12);
479 /* Update vectorial force */
480 fix1 = _mm_add_ps(fix1,tx);
481 fiy1 = _mm_add_ps(fiy1,ty);
482 fiz1 = _mm_add_ps(fiz1,tz);
484 fjx2 = _mm_add_ps(fjx2,tx);
485 fjy2 = _mm_add_ps(fjy2,ty);
486 fjz2 = _mm_add_ps(fjz2,tz);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 /* COULOMB ELECTROSTATICS */
493 velec = _mm_mul_ps(qq20,rinv20);
494 felec = _mm_mul_ps(velec,rinvsq20);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velecsum = _mm_add_ps(velecsum,velec);
501 /* Calculate temporary vectorial force */
502 tx = _mm_mul_ps(fscal,dx20);
503 ty = _mm_mul_ps(fscal,dy20);
504 tz = _mm_mul_ps(fscal,dz20);
506 /* Update vectorial force */
507 fix2 = _mm_add_ps(fix2,tx);
508 fiy2 = _mm_add_ps(fiy2,ty);
509 fiz2 = _mm_add_ps(fiz2,tz);
511 fjx0 = _mm_add_ps(fjx0,tx);
512 fjy0 = _mm_add_ps(fjy0,ty);
513 fjz0 = _mm_add_ps(fjz0,tz);
515 /**************************
516 * CALCULATE INTERACTIONS *
517 **************************/
519 /* COULOMB ELECTROSTATICS */
520 velec = _mm_mul_ps(qq21,rinv21);
521 felec = _mm_mul_ps(velec,rinvsq21);
523 /* Update potential sum for this i atom from the interaction with this j atom. */
524 velecsum = _mm_add_ps(velecsum,velec);
528 /* Calculate temporary vectorial force */
529 tx = _mm_mul_ps(fscal,dx21);
530 ty = _mm_mul_ps(fscal,dy21);
531 tz = _mm_mul_ps(fscal,dz21);
533 /* Update vectorial force */
534 fix2 = _mm_add_ps(fix2,tx);
535 fiy2 = _mm_add_ps(fiy2,ty);
536 fiz2 = _mm_add_ps(fiz2,tz);
538 fjx1 = _mm_add_ps(fjx1,tx);
539 fjy1 = _mm_add_ps(fjy1,ty);
540 fjz1 = _mm_add_ps(fjz1,tz);
542 /**************************
543 * CALCULATE INTERACTIONS *
544 **************************/
546 /* COULOMB ELECTROSTATICS */
547 velec = _mm_mul_ps(qq22,rinv22);
548 felec = _mm_mul_ps(velec,rinvsq22);
550 /* Update potential sum for this i atom from the interaction with this j atom. */
551 velecsum = _mm_add_ps(velecsum,velec);
555 /* Calculate temporary vectorial force */
556 tx = _mm_mul_ps(fscal,dx22);
557 ty = _mm_mul_ps(fscal,dy22);
558 tz = _mm_mul_ps(fscal,dz22);
560 /* Update vectorial force */
561 fix2 = _mm_add_ps(fix2,tx);
562 fiy2 = _mm_add_ps(fiy2,ty);
563 fiz2 = _mm_add_ps(fiz2,tz);
565 fjx2 = _mm_add_ps(fjx2,tx);
566 fjy2 = _mm_add_ps(fjy2,ty);
567 fjz2 = _mm_add_ps(fjz2,tz);
569 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
570 f+j_coord_offsetC,f+j_coord_offsetD,
571 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
573 /* Inner loop uses 287 flops */
579 /* Get j neighbor index, and coordinate index */
585 /* Sign of each element will be negative for non-real atoms.
586 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
587 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
589 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
590 jnrA = (jnrA>=0) ? jnrA : 0;
591 jnrB = (jnrB>=0) ? jnrB : 0;
592 jnrC = (jnrC>=0) ? jnrC : 0;
593 jnrD = (jnrD>=0) ? jnrD : 0;
595 j_coord_offsetA = DIM*jnrA;
596 j_coord_offsetB = DIM*jnrB;
597 j_coord_offsetC = DIM*jnrC;
598 j_coord_offsetD = DIM*jnrD;
600 /* load j atom coordinates */
601 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
602 x+j_coord_offsetC,x+j_coord_offsetD,
603 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
605 /* Calculate displacement vector */
606 dx00 = _mm_sub_ps(ix0,jx0);
607 dy00 = _mm_sub_ps(iy0,jy0);
608 dz00 = _mm_sub_ps(iz0,jz0);
609 dx01 = _mm_sub_ps(ix0,jx1);
610 dy01 = _mm_sub_ps(iy0,jy1);
611 dz01 = _mm_sub_ps(iz0,jz1);
612 dx02 = _mm_sub_ps(ix0,jx2);
613 dy02 = _mm_sub_ps(iy0,jy2);
614 dz02 = _mm_sub_ps(iz0,jz2);
615 dx10 = _mm_sub_ps(ix1,jx0);
616 dy10 = _mm_sub_ps(iy1,jy0);
617 dz10 = _mm_sub_ps(iz1,jz0);
618 dx11 = _mm_sub_ps(ix1,jx1);
619 dy11 = _mm_sub_ps(iy1,jy1);
620 dz11 = _mm_sub_ps(iz1,jz1);
621 dx12 = _mm_sub_ps(ix1,jx2);
622 dy12 = _mm_sub_ps(iy1,jy2);
623 dz12 = _mm_sub_ps(iz1,jz2);
624 dx20 = _mm_sub_ps(ix2,jx0);
625 dy20 = _mm_sub_ps(iy2,jy0);
626 dz20 = _mm_sub_ps(iz2,jz0);
627 dx21 = _mm_sub_ps(ix2,jx1);
628 dy21 = _mm_sub_ps(iy2,jy1);
629 dz21 = _mm_sub_ps(iz2,jz1);
630 dx22 = _mm_sub_ps(ix2,jx2);
631 dy22 = _mm_sub_ps(iy2,jy2);
632 dz22 = _mm_sub_ps(iz2,jz2);
634 /* Calculate squared distance and things based on it */
635 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
636 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
637 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
638 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
639 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
640 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
641 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
642 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
643 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
645 rinv00 = gmx_mm_invsqrt_ps(rsq00);
646 rinv01 = gmx_mm_invsqrt_ps(rsq01);
647 rinv02 = gmx_mm_invsqrt_ps(rsq02);
648 rinv10 = gmx_mm_invsqrt_ps(rsq10);
649 rinv11 = gmx_mm_invsqrt_ps(rsq11);
650 rinv12 = gmx_mm_invsqrt_ps(rsq12);
651 rinv20 = gmx_mm_invsqrt_ps(rsq20);
652 rinv21 = gmx_mm_invsqrt_ps(rsq21);
653 rinv22 = gmx_mm_invsqrt_ps(rsq22);
655 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
656 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
657 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
658 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
659 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
660 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
661 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
662 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
663 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
665 fjx0 = _mm_setzero_ps();
666 fjy0 = _mm_setzero_ps();
667 fjz0 = _mm_setzero_ps();
668 fjx1 = _mm_setzero_ps();
669 fjy1 = _mm_setzero_ps();
670 fjz1 = _mm_setzero_ps();
671 fjx2 = _mm_setzero_ps();
672 fjy2 = _mm_setzero_ps();
673 fjz2 = _mm_setzero_ps();
675 /**************************
676 * CALCULATE INTERACTIONS *
677 **************************/
679 r00 = _mm_mul_ps(rsq00,rinv00);
680 r00 = _mm_andnot_ps(dummy_mask,r00);
682 /* Calculate table index by multiplying r with table scale and truncate to integer */
683 rt = _mm_mul_ps(r00,vftabscale);
684 vfitab = _mm_cvttps_epi32(rt);
685 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
686 vfitab = _mm_slli_epi32(vfitab,3);
688 /* COULOMB ELECTROSTATICS */
689 velec = _mm_mul_ps(qq00,rinv00);
690 felec = _mm_mul_ps(velec,rinvsq00);
692 /* CUBIC SPLINE TABLE DISPERSION */
693 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
694 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
695 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
696 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
697 _MM_TRANSPOSE4_PS(Y,F,G,H);
698 Heps = _mm_mul_ps(vfeps,H);
699 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
700 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
701 vvdw6 = _mm_mul_ps(c6_00,VV);
702 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
703 fvdw6 = _mm_mul_ps(c6_00,FF);
705 /* CUBIC SPLINE TABLE REPULSION */
706 vfitab = _mm_add_epi32(vfitab,ifour);
707 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
708 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
709 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
710 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
711 _MM_TRANSPOSE4_PS(Y,F,G,H);
712 Heps = _mm_mul_ps(vfeps,H);
713 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
714 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
715 vvdw12 = _mm_mul_ps(c12_00,VV);
716 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
717 fvdw12 = _mm_mul_ps(c12_00,FF);
718 vvdw = _mm_add_ps(vvdw12,vvdw6);
719 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
721 /* Update potential sum for this i atom from the interaction with this j atom. */
722 velec = _mm_andnot_ps(dummy_mask,velec);
723 velecsum = _mm_add_ps(velecsum,velec);
724 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
725 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
727 fscal = _mm_add_ps(felec,fvdw);
729 fscal = _mm_andnot_ps(dummy_mask,fscal);
731 /* Calculate temporary vectorial force */
732 tx = _mm_mul_ps(fscal,dx00);
733 ty = _mm_mul_ps(fscal,dy00);
734 tz = _mm_mul_ps(fscal,dz00);
736 /* Update vectorial force */
737 fix0 = _mm_add_ps(fix0,tx);
738 fiy0 = _mm_add_ps(fiy0,ty);
739 fiz0 = _mm_add_ps(fiz0,tz);
741 fjx0 = _mm_add_ps(fjx0,tx);
742 fjy0 = _mm_add_ps(fjy0,ty);
743 fjz0 = _mm_add_ps(fjz0,tz);
745 /**************************
746 * CALCULATE INTERACTIONS *
747 **************************/
749 /* COULOMB ELECTROSTATICS */
750 velec = _mm_mul_ps(qq01,rinv01);
751 felec = _mm_mul_ps(velec,rinvsq01);
753 /* Update potential sum for this i atom from the interaction with this j atom. */
754 velec = _mm_andnot_ps(dummy_mask,velec);
755 velecsum = _mm_add_ps(velecsum,velec);
759 fscal = _mm_andnot_ps(dummy_mask,fscal);
761 /* Calculate temporary vectorial force */
762 tx = _mm_mul_ps(fscal,dx01);
763 ty = _mm_mul_ps(fscal,dy01);
764 tz = _mm_mul_ps(fscal,dz01);
766 /* Update vectorial force */
767 fix0 = _mm_add_ps(fix0,tx);
768 fiy0 = _mm_add_ps(fiy0,ty);
769 fiz0 = _mm_add_ps(fiz0,tz);
771 fjx1 = _mm_add_ps(fjx1,tx);
772 fjy1 = _mm_add_ps(fjy1,ty);
773 fjz1 = _mm_add_ps(fjz1,tz);
775 /**************************
776 * CALCULATE INTERACTIONS *
777 **************************/
779 /* COULOMB ELECTROSTATICS */
780 velec = _mm_mul_ps(qq02,rinv02);
781 felec = _mm_mul_ps(velec,rinvsq02);
783 /* Update potential sum for this i atom from the interaction with this j atom. */
784 velec = _mm_andnot_ps(dummy_mask,velec);
785 velecsum = _mm_add_ps(velecsum,velec);
789 fscal = _mm_andnot_ps(dummy_mask,fscal);
791 /* Calculate temporary vectorial force */
792 tx = _mm_mul_ps(fscal,dx02);
793 ty = _mm_mul_ps(fscal,dy02);
794 tz = _mm_mul_ps(fscal,dz02);
796 /* Update vectorial force */
797 fix0 = _mm_add_ps(fix0,tx);
798 fiy0 = _mm_add_ps(fiy0,ty);
799 fiz0 = _mm_add_ps(fiz0,tz);
801 fjx2 = _mm_add_ps(fjx2,tx);
802 fjy2 = _mm_add_ps(fjy2,ty);
803 fjz2 = _mm_add_ps(fjz2,tz);
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 /* COULOMB ELECTROSTATICS */
810 velec = _mm_mul_ps(qq10,rinv10);
811 felec = _mm_mul_ps(velec,rinvsq10);
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm_andnot_ps(dummy_mask,velec);
815 velecsum = _mm_add_ps(velecsum,velec);
819 fscal = _mm_andnot_ps(dummy_mask,fscal);
821 /* Calculate temporary vectorial force */
822 tx = _mm_mul_ps(fscal,dx10);
823 ty = _mm_mul_ps(fscal,dy10);
824 tz = _mm_mul_ps(fscal,dz10);
826 /* Update vectorial force */
827 fix1 = _mm_add_ps(fix1,tx);
828 fiy1 = _mm_add_ps(fiy1,ty);
829 fiz1 = _mm_add_ps(fiz1,tz);
831 fjx0 = _mm_add_ps(fjx0,tx);
832 fjy0 = _mm_add_ps(fjy0,ty);
833 fjz0 = _mm_add_ps(fjz0,tz);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 /* COULOMB ELECTROSTATICS */
840 velec = _mm_mul_ps(qq11,rinv11);
841 felec = _mm_mul_ps(velec,rinvsq11);
843 /* Update potential sum for this i atom from the interaction with this j atom. */
844 velec = _mm_andnot_ps(dummy_mask,velec);
845 velecsum = _mm_add_ps(velecsum,velec);
849 fscal = _mm_andnot_ps(dummy_mask,fscal);
851 /* Calculate temporary vectorial force */
852 tx = _mm_mul_ps(fscal,dx11);
853 ty = _mm_mul_ps(fscal,dy11);
854 tz = _mm_mul_ps(fscal,dz11);
856 /* Update vectorial force */
857 fix1 = _mm_add_ps(fix1,tx);
858 fiy1 = _mm_add_ps(fiy1,ty);
859 fiz1 = _mm_add_ps(fiz1,tz);
861 fjx1 = _mm_add_ps(fjx1,tx);
862 fjy1 = _mm_add_ps(fjy1,ty);
863 fjz1 = _mm_add_ps(fjz1,tz);
865 /**************************
866 * CALCULATE INTERACTIONS *
867 **************************/
869 /* COULOMB ELECTROSTATICS */
870 velec = _mm_mul_ps(qq12,rinv12);
871 felec = _mm_mul_ps(velec,rinvsq12);
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm_andnot_ps(dummy_mask,velec);
875 velecsum = _mm_add_ps(velecsum,velec);
879 fscal = _mm_andnot_ps(dummy_mask,fscal);
881 /* Calculate temporary vectorial force */
882 tx = _mm_mul_ps(fscal,dx12);
883 ty = _mm_mul_ps(fscal,dy12);
884 tz = _mm_mul_ps(fscal,dz12);
886 /* Update vectorial force */
887 fix1 = _mm_add_ps(fix1,tx);
888 fiy1 = _mm_add_ps(fiy1,ty);
889 fiz1 = _mm_add_ps(fiz1,tz);
891 fjx2 = _mm_add_ps(fjx2,tx);
892 fjy2 = _mm_add_ps(fjy2,ty);
893 fjz2 = _mm_add_ps(fjz2,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 /* COULOMB ELECTROSTATICS */
900 velec = _mm_mul_ps(qq20,rinv20);
901 felec = _mm_mul_ps(velec,rinvsq20);
903 /* Update potential sum for this i atom from the interaction with this j atom. */
904 velec = _mm_andnot_ps(dummy_mask,velec);
905 velecsum = _mm_add_ps(velecsum,velec);
909 fscal = _mm_andnot_ps(dummy_mask,fscal);
911 /* Calculate temporary vectorial force */
912 tx = _mm_mul_ps(fscal,dx20);
913 ty = _mm_mul_ps(fscal,dy20);
914 tz = _mm_mul_ps(fscal,dz20);
916 /* Update vectorial force */
917 fix2 = _mm_add_ps(fix2,tx);
918 fiy2 = _mm_add_ps(fiy2,ty);
919 fiz2 = _mm_add_ps(fiz2,tz);
921 fjx0 = _mm_add_ps(fjx0,tx);
922 fjy0 = _mm_add_ps(fjy0,ty);
923 fjz0 = _mm_add_ps(fjz0,tz);
925 /**************************
926 * CALCULATE INTERACTIONS *
927 **************************/
929 /* COULOMB ELECTROSTATICS */
930 velec = _mm_mul_ps(qq21,rinv21);
931 felec = _mm_mul_ps(velec,rinvsq21);
933 /* Update potential sum for this i atom from the interaction with this j atom. */
934 velec = _mm_andnot_ps(dummy_mask,velec);
935 velecsum = _mm_add_ps(velecsum,velec);
939 fscal = _mm_andnot_ps(dummy_mask,fscal);
941 /* Calculate temporary vectorial force */
942 tx = _mm_mul_ps(fscal,dx21);
943 ty = _mm_mul_ps(fscal,dy21);
944 tz = _mm_mul_ps(fscal,dz21);
946 /* Update vectorial force */
947 fix2 = _mm_add_ps(fix2,tx);
948 fiy2 = _mm_add_ps(fiy2,ty);
949 fiz2 = _mm_add_ps(fiz2,tz);
951 fjx1 = _mm_add_ps(fjx1,tx);
952 fjy1 = _mm_add_ps(fjy1,ty);
953 fjz1 = _mm_add_ps(fjz1,tz);
955 /**************************
956 * CALCULATE INTERACTIONS *
957 **************************/
959 /* COULOMB ELECTROSTATICS */
960 velec = _mm_mul_ps(qq22,rinv22);
961 felec = _mm_mul_ps(velec,rinvsq22);
963 /* Update potential sum for this i atom from the interaction with this j atom. */
964 velec = _mm_andnot_ps(dummy_mask,velec);
965 velecsum = _mm_add_ps(velecsum,velec);
969 fscal = _mm_andnot_ps(dummy_mask,fscal);
971 /* Calculate temporary vectorial force */
972 tx = _mm_mul_ps(fscal,dx22);
973 ty = _mm_mul_ps(fscal,dy22);
974 tz = _mm_mul_ps(fscal,dz22);
976 /* Update vectorial force */
977 fix2 = _mm_add_ps(fix2,tx);
978 fiy2 = _mm_add_ps(fiy2,ty);
979 fiz2 = _mm_add_ps(fiz2,tz);
981 fjx2 = _mm_add_ps(fjx2,tx);
982 fjy2 = _mm_add_ps(fjy2,ty);
983 fjz2 = _mm_add_ps(fjz2,tz);
985 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
986 f+j_coord_offsetC,f+j_coord_offsetD,
987 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
989 /* Inner loop uses 288 flops */
992 /* End of innermost loop */
994 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
995 f+i_coord_offset,fshift+i_shift_offset);
998 /* Update potential energies */
999 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1000 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1002 /* Increment number of inner iterations */
1003 inneriter += j_index_end - j_index_start;
1005 /* Outer loop uses 29 flops */
1008 /* Increment number of outer iterations */
1011 /* Update outer/inner flops */
1013 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*288);
1016 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single
1017 * Electrostatics interaction: Coulomb
1018 * VdW interaction: CubicSplineTable
1019 * Geometry: Water3-Water3
1020 * Calculate force/pot: Force
1023 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single
1024 (t_nblist * gmx_restrict nlist,
1025 rvec * gmx_restrict xx,
1026 rvec * gmx_restrict ff,
1027 t_forcerec * gmx_restrict fr,
1028 t_mdatoms * gmx_restrict mdatoms,
1029 nb_kernel_data_t * gmx_restrict kernel_data,
1030 t_nrnb * gmx_restrict nrnb)
1032 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1033 * just 0 for non-waters.
1034 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1035 * jnr indices corresponding to data put in the four positions in the SIMD register.
1037 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1038 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1039 int jnrA,jnrB,jnrC,jnrD;
1040 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1041 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1042 real shX,shY,shZ,rcutoff_scalar;
1043 real *shiftvec,*fshift,*x,*f;
1044 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1046 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1048 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1050 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1051 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1052 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1053 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1054 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1055 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1056 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1057 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1058 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1059 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1060 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1061 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1062 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1063 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1064 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1065 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1066 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1069 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1072 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1073 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1075 __m128i ifour = _mm_set1_epi32(4);
1076 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1078 __m128 dummy_mask,cutoff_mask;
1079 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1080 __m128 one = _mm_set1_ps(1.0);
1081 __m128 two = _mm_set1_ps(2.0);
1087 jindex = nlist->jindex;
1089 shiftidx = nlist->shift;
1091 shiftvec = fr->shift_vec[0];
1092 fshift = fr->fshift[0];
1093 facel = _mm_set1_ps(fr->epsfac);
1094 charge = mdatoms->chargeA;
1095 nvdwtype = fr->ntype;
1096 vdwparam = fr->nbfp;
1097 vdwtype = mdatoms->typeA;
1099 vftab = kernel_data->table_vdw->data;
1100 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
1102 /* Setup water-specific parameters */
1103 inr = nlist->iinr[0];
1104 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1105 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1106 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1107 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1109 jq0 = _mm_set1_ps(charge[inr+0]);
1110 jq1 = _mm_set1_ps(charge[inr+1]);
1111 jq2 = _mm_set1_ps(charge[inr+2]);
1112 vdwjidx0A = 2*vdwtype[inr+0];
1113 qq00 = _mm_mul_ps(iq0,jq0);
1114 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1115 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1116 qq01 = _mm_mul_ps(iq0,jq1);
1117 qq02 = _mm_mul_ps(iq0,jq2);
1118 qq10 = _mm_mul_ps(iq1,jq0);
1119 qq11 = _mm_mul_ps(iq1,jq1);
1120 qq12 = _mm_mul_ps(iq1,jq2);
1121 qq20 = _mm_mul_ps(iq2,jq0);
1122 qq21 = _mm_mul_ps(iq2,jq1);
1123 qq22 = _mm_mul_ps(iq2,jq2);
1125 /* Avoid stupid compiler warnings */
1126 jnrA = jnrB = jnrC = jnrD = 0;
1127 j_coord_offsetA = 0;
1128 j_coord_offsetB = 0;
1129 j_coord_offsetC = 0;
1130 j_coord_offsetD = 0;
1135 /* Start outer loop over neighborlists */
1136 for(iidx=0; iidx<nri; iidx++)
1138 /* Load shift vector for this list */
1139 i_shift_offset = DIM*shiftidx[iidx];
1140 shX = shiftvec[i_shift_offset+XX];
1141 shY = shiftvec[i_shift_offset+YY];
1142 shZ = shiftvec[i_shift_offset+ZZ];
1144 /* Load limits for loop over neighbors */
1145 j_index_start = jindex[iidx];
1146 j_index_end = jindex[iidx+1];
1148 /* Get outer coordinate index */
1150 i_coord_offset = DIM*inr;
1152 /* Load i particle coords and add shift vector */
1153 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1154 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1155 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1156 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1157 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1158 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1159 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1160 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1161 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1163 fix0 = _mm_setzero_ps();
1164 fiy0 = _mm_setzero_ps();
1165 fiz0 = _mm_setzero_ps();
1166 fix1 = _mm_setzero_ps();
1167 fiy1 = _mm_setzero_ps();
1168 fiz1 = _mm_setzero_ps();
1169 fix2 = _mm_setzero_ps();
1170 fiy2 = _mm_setzero_ps();
1171 fiz2 = _mm_setzero_ps();
1173 /* Start inner kernel loop */
1174 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1177 /* Get j neighbor index, and coordinate index */
1179 jnrB = jjnr[jidx+1];
1180 jnrC = jjnr[jidx+2];
1181 jnrD = jjnr[jidx+3];
1183 j_coord_offsetA = DIM*jnrA;
1184 j_coord_offsetB = DIM*jnrB;
1185 j_coord_offsetC = DIM*jnrC;
1186 j_coord_offsetD = DIM*jnrD;
1188 /* load j atom coordinates */
1189 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1190 x+j_coord_offsetC,x+j_coord_offsetD,
1191 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1193 /* Calculate displacement vector */
1194 dx00 = _mm_sub_ps(ix0,jx0);
1195 dy00 = _mm_sub_ps(iy0,jy0);
1196 dz00 = _mm_sub_ps(iz0,jz0);
1197 dx01 = _mm_sub_ps(ix0,jx1);
1198 dy01 = _mm_sub_ps(iy0,jy1);
1199 dz01 = _mm_sub_ps(iz0,jz1);
1200 dx02 = _mm_sub_ps(ix0,jx2);
1201 dy02 = _mm_sub_ps(iy0,jy2);
1202 dz02 = _mm_sub_ps(iz0,jz2);
1203 dx10 = _mm_sub_ps(ix1,jx0);
1204 dy10 = _mm_sub_ps(iy1,jy0);
1205 dz10 = _mm_sub_ps(iz1,jz0);
1206 dx11 = _mm_sub_ps(ix1,jx1);
1207 dy11 = _mm_sub_ps(iy1,jy1);
1208 dz11 = _mm_sub_ps(iz1,jz1);
1209 dx12 = _mm_sub_ps(ix1,jx2);
1210 dy12 = _mm_sub_ps(iy1,jy2);
1211 dz12 = _mm_sub_ps(iz1,jz2);
1212 dx20 = _mm_sub_ps(ix2,jx0);
1213 dy20 = _mm_sub_ps(iy2,jy0);
1214 dz20 = _mm_sub_ps(iz2,jz0);
1215 dx21 = _mm_sub_ps(ix2,jx1);
1216 dy21 = _mm_sub_ps(iy2,jy1);
1217 dz21 = _mm_sub_ps(iz2,jz1);
1218 dx22 = _mm_sub_ps(ix2,jx2);
1219 dy22 = _mm_sub_ps(iy2,jy2);
1220 dz22 = _mm_sub_ps(iz2,jz2);
1222 /* Calculate squared distance and things based on it */
1223 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1224 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1225 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1226 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1227 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1228 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1229 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1230 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1231 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1233 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1234 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1235 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1236 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1237 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1238 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1239 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1240 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1241 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1243 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1244 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1245 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1246 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1247 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1248 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1249 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1250 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1251 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1253 fjx0 = _mm_setzero_ps();
1254 fjy0 = _mm_setzero_ps();
1255 fjz0 = _mm_setzero_ps();
1256 fjx1 = _mm_setzero_ps();
1257 fjy1 = _mm_setzero_ps();
1258 fjz1 = _mm_setzero_ps();
1259 fjx2 = _mm_setzero_ps();
1260 fjy2 = _mm_setzero_ps();
1261 fjz2 = _mm_setzero_ps();
1263 /**************************
1264 * CALCULATE INTERACTIONS *
1265 **************************/
1267 r00 = _mm_mul_ps(rsq00,rinv00);
1269 /* Calculate table index by multiplying r with table scale and truncate to integer */
1270 rt = _mm_mul_ps(r00,vftabscale);
1271 vfitab = _mm_cvttps_epi32(rt);
1272 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1273 vfitab = _mm_slli_epi32(vfitab,3);
1275 /* COULOMB ELECTROSTATICS */
1276 velec = _mm_mul_ps(qq00,rinv00);
1277 felec = _mm_mul_ps(velec,rinvsq00);
1279 /* CUBIC SPLINE TABLE DISPERSION */
1280 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1281 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1282 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1283 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1284 _MM_TRANSPOSE4_PS(Y,F,G,H);
1285 Heps = _mm_mul_ps(vfeps,H);
1286 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1287 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1288 fvdw6 = _mm_mul_ps(c6_00,FF);
1290 /* CUBIC SPLINE TABLE REPULSION */
1291 vfitab = _mm_add_epi32(vfitab,ifour);
1292 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1293 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1294 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1295 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1296 _MM_TRANSPOSE4_PS(Y,F,G,H);
1297 Heps = _mm_mul_ps(vfeps,H);
1298 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1299 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1300 fvdw12 = _mm_mul_ps(c12_00,FF);
1301 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1303 fscal = _mm_add_ps(felec,fvdw);
1305 /* Calculate temporary vectorial force */
1306 tx = _mm_mul_ps(fscal,dx00);
1307 ty = _mm_mul_ps(fscal,dy00);
1308 tz = _mm_mul_ps(fscal,dz00);
1310 /* Update vectorial force */
1311 fix0 = _mm_add_ps(fix0,tx);
1312 fiy0 = _mm_add_ps(fiy0,ty);
1313 fiz0 = _mm_add_ps(fiz0,tz);
1315 fjx0 = _mm_add_ps(fjx0,tx);
1316 fjy0 = _mm_add_ps(fjy0,ty);
1317 fjz0 = _mm_add_ps(fjz0,tz);
1319 /**************************
1320 * CALCULATE INTERACTIONS *
1321 **************************/
1323 /* COULOMB ELECTROSTATICS */
1324 velec = _mm_mul_ps(qq01,rinv01);
1325 felec = _mm_mul_ps(velec,rinvsq01);
1329 /* Calculate temporary vectorial force */
1330 tx = _mm_mul_ps(fscal,dx01);
1331 ty = _mm_mul_ps(fscal,dy01);
1332 tz = _mm_mul_ps(fscal,dz01);
1334 /* Update vectorial force */
1335 fix0 = _mm_add_ps(fix0,tx);
1336 fiy0 = _mm_add_ps(fiy0,ty);
1337 fiz0 = _mm_add_ps(fiz0,tz);
1339 fjx1 = _mm_add_ps(fjx1,tx);
1340 fjy1 = _mm_add_ps(fjy1,ty);
1341 fjz1 = _mm_add_ps(fjz1,tz);
1343 /**************************
1344 * CALCULATE INTERACTIONS *
1345 **************************/
1347 /* COULOMB ELECTROSTATICS */
1348 velec = _mm_mul_ps(qq02,rinv02);
1349 felec = _mm_mul_ps(velec,rinvsq02);
1353 /* Calculate temporary vectorial force */
1354 tx = _mm_mul_ps(fscal,dx02);
1355 ty = _mm_mul_ps(fscal,dy02);
1356 tz = _mm_mul_ps(fscal,dz02);
1358 /* Update vectorial force */
1359 fix0 = _mm_add_ps(fix0,tx);
1360 fiy0 = _mm_add_ps(fiy0,ty);
1361 fiz0 = _mm_add_ps(fiz0,tz);
1363 fjx2 = _mm_add_ps(fjx2,tx);
1364 fjy2 = _mm_add_ps(fjy2,ty);
1365 fjz2 = _mm_add_ps(fjz2,tz);
1367 /**************************
1368 * CALCULATE INTERACTIONS *
1369 **************************/
1371 /* COULOMB ELECTROSTATICS */
1372 velec = _mm_mul_ps(qq10,rinv10);
1373 felec = _mm_mul_ps(velec,rinvsq10);
1377 /* Calculate temporary vectorial force */
1378 tx = _mm_mul_ps(fscal,dx10);
1379 ty = _mm_mul_ps(fscal,dy10);
1380 tz = _mm_mul_ps(fscal,dz10);
1382 /* Update vectorial force */
1383 fix1 = _mm_add_ps(fix1,tx);
1384 fiy1 = _mm_add_ps(fiy1,ty);
1385 fiz1 = _mm_add_ps(fiz1,tz);
1387 fjx0 = _mm_add_ps(fjx0,tx);
1388 fjy0 = _mm_add_ps(fjy0,ty);
1389 fjz0 = _mm_add_ps(fjz0,tz);
1391 /**************************
1392 * CALCULATE INTERACTIONS *
1393 **************************/
1395 /* COULOMB ELECTROSTATICS */
1396 velec = _mm_mul_ps(qq11,rinv11);
1397 felec = _mm_mul_ps(velec,rinvsq11);
1401 /* Calculate temporary vectorial force */
1402 tx = _mm_mul_ps(fscal,dx11);
1403 ty = _mm_mul_ps(fscal,dy11);
1404 tz = _mm_mul_ps(fscal,dz11);
1406 /* Update vectorial force */
1407 fix1 = _mm_add_ps(fix1,tx);
1408 fiy1 = _mm_add_ps(fiy1,ty);
1409 fiz1 = _mm_add_ps(fiz1,tz);
1411 fjx1 = _mm_add_ps(fjx1,tx);
1412 fjy1 = _mm_add_ps(fjy1,ty);
1413 fjz1 = _mm_add_ps(fjz1,tz);
1415 /**************************
1416 * CALCULATE INTERACTIONS *
1417 **************************/
1419 /* COULOMB ELECTROSTATICS */
1420 velec = _mm_mul_ps(qq12,rinv12);
1421 felec = _mm_mul_ps(velec,rinvsq12);
1425 /* Calculate temporary vectorial force */
1426 tx = _mm_mul_ps(fscal,dx12);
1427 ty = _mm_mul_ps(fscal,dy12);
1428 tz = _mm_mul_ps(fscal,dz12);
1430 /* Update vectorial force */
1431 fix1 = _mm_add_ps(fix1,tx);
1432 fiy1 = _mm_add_ps(fiy1,ty);
1433 fiz1 = _mm_add_ps(fiz1,tz);
1435 fjx2 = _mm_add_ps(fjx2,tx);
1436 fjy2 = _mm_add_ps(fjy2,ty);
1437 fjz2 = _mm_add_ps(fjz2,tz);
1439 /**************************
1440 * CALCULATE INTERACTIONS *
1441 **************************/
1443 /* COULOMB ELECTROSTATICS */
1444 velec = _mm_mul_ps(qq20,rinv20);
1445 felec = _mm_mul_ps(velec,rinvsq20);
1449 /* Calculate temporary vectorial force */
1450 tx = _mm_mul_ps(fscal,dx20);
1451 ty = _mm_mul_ps(fscal,dy20);
1452 tz = _mm_mul_ps(fscal,dz20);
1454 /* Update vectorial force */
1455 fix2 = _mm_add_ps(fix2,tx);
1456 fiy2 = _mm_add_ps(fiy2,ty);
1457 fiz2 = _mm_add_ps(fiz2,tz);
1459 fjx0 = _mm_add_ps(fjx0,tx);
1460 fjy0 = _mm_add_ps(fjy0,ty);
1461 fjz0 = _mm_add_ps(fjz0,tz);
1463 /**************************
1464 * CALCULATE INTERACTIONS *
1465 **************************/
1467 /* COULOMB ELECTROSTATICS */
1468 velec = _mm_mul_ps(qq21,rinv21);
1469 felec = _mm_mul_ps(velec,rinvsq21);
1473 /* Calculate temporary vectorial force */
1474 tx = _mm_mul_ps(fscal,dx21);
1475 ty = _mm_mul_ps(fscal,dy21);
1476 tz = _mm_mul_ps(fscal,dz21);
1478 /* Update vectorial force */
1479 fix2 = _mm_add_ps(fix2,tx);
1480 fiy2 = _mm_add_ps(fiy2,ty);
1481 fiz2 = _mm_add_ps(fiz2,tz);
1483 fjx1 = _mm_add_ps(fjx1,tx);
1484 fjy1 = _mm_add_ps(fjy1,ty);
1485 fjz1 = _mm_add_ps(fjz1,tz);
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 /* COULOMB ELECTROSTATICS */
1492 velec = _mm_mul_ps(qq22,rinv22);
1493 felec = _mm_mul_ps(velec,rinvsq22);
1497 /* Calculate temporary vectorial force */
1498 tx = _mm_mul_ps(fscal,dx22);
1499 ty = _mm_mul_ps(fscal,dy22);
1500 tz = _mm_mul_ps(fscal,dz22);
1502 /* Update vectorial force */
1503 fix2 = _mm_add_ps(fix2,tx);
1504 fiy2 = _mm_add_ps(fiy2,ty);
1505 fiz2 = _mm_add_ps(fiz2,tz);
1507 fjx2 = _mm_add_ps(fjx2,tx);
1508 fjy2 = _mm_add_ps(fjy2,ty);
1509 fjz2 = _mm_add_ps(fjz2,tz);
1511 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1512 f+j_coord_offsetC,f+j_coord_offsetD,
1513 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1515 /* Inner loop uses 270 flops */
1518 if(jidx<j_index_end)
1521 /* Get j neighbor index, and coordinate index */
1523 jnrB = jjnr[jidx+1];
1524 jnrC = jjnr[jidx+2];
1525 jnrD = jjnr[jidx+3];
1527 /* Sign of each element will be negative for non-real atoms.
1528 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1529 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1531 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1532 jnrA = (jnrA>=0) ? jnrA : 0;
1533 jnrB = (jnrB>=0) ? jnrB : 0;
1534 jnrC = (jnrC>=0) ? jnrC : 0;
1535 jnrD = (jnrD>=0) ? jnrD : 0;
1537 j_coord_offsetA = DIM*jnrA;
1538 j_coord_offsetB = DIM*jnrB;
1539 j_coord_offsetC = DIM*jnrC;
1540 j_coord_offsetD = DIM*jnrD;
1542 /* load j atom coordinates */
1543 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1544 x+j_coord_offsetC,x+j_coord_offsetD,
1545 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1547 /* Calculate displacement vector */
1548 dx00 = _mm_sub_ps(ix0,jx0);
1549 dy00 = _mm_sub_ps(iy0,jy0);
1550 dz00 = _mm_sub_ps(iz0,jz0);
1551 dx01 = _mm_sub_ps(ix0,jx1);
1552 dy01 = _mm_sub_ps(iy0,jy1);
1553 dz01 = _mm_sub_ps(iz0,jz1);
1554 dx02 = _mm_sub_ps(ix0,jx2);
1555 dy02 = _mm_sub_ps(iy0,jy2);
1556 dz02 = _mm_sub_ps(iz0,jz2);
1557 dx10 = _mm_sub_ps(ix1,jx0);
1558 dy10 = _mm_sub_ps(iy1,jy0);
1559 dz10 = _mm_sub_ps(iz1,jz0);
1560 dx11 = _mm_sub_ps(ix1,jx1);
1561 dy11 = _mm_sub_ps(iy1,jy1);
1562 dz11 = _mm_sub_ps(iz1,jz1);
1563 dx12 = _mm_sub_ps(ix1,jx2);
1564 dy12 = _mm_sub_ps(iy1,jy2);
1565 dz12 = _mm_sub_ps(iz1,jz2);
1566 dx20 = _mm_sub_ps(ix2,jx0);
1567 dy20 = _mm_sub_ps(iy2,jy0);
1568 dz20 = _mm_sub_ps(iz2,jz0);
1569 dx21 = _mm_sub_ps(ix2,jx1);
1570 dy21 = _mm_sub_ps(iy2,jy1);
1571 dz21 = _mm_sub_ps(iz2,jz1);
1572 dx22 = _mm_sub_ps(ix2,jx2);
1573 dy22 = _mm_sub_ps(iy2,jy2);
1574 dz22 = _mm_sub_ps(iz2,jz2);
1576 /* Calculate squared distance and things based on it */
1577 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1578 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1579 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1580 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1581 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1582 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1583 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1584 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1585 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1587 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1588 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1589 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1590 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1591 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1592 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1593 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1594 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1595 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1597 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1598 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1599 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1600 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1601 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1602 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1603 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1604 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1605 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1607 fjx0 = _mm_setzero_ps();
1608 fjy0 = _mm_setzero_ps();
1609 fjz0 = _mm_setzero_ps();
1610 fjx1 = _mm_setzero_ps();
1611 fjy1 = _mm_setzero_ps();
1612 fjz1 = _mm_setzero_ps();
1613 fjx2 = _mm_setzero_ps();
1614 fjy2 = _mm_setzero_ps();
1615 fjz2 = _mm_setzero_ps();
1617 /**************************
1618 * CALCULATE INTERACTIONS *
1619 **************************/
1621 r00 = _mm_mul_ps(rsq00,rinv00);
1622 r00 = _mm_andnot_ps(dummy_mask,r00);
1624 /* Calculate table index by multiplying r with table scale and truncate to integer */
1625 rt = _mm_mul_ps(r00,vftabscale);
1626 vfitab = _mm_cvttps_epi32(rt);
1627 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1628 vfitab = _mm_slli_epi32(vfitab,3);
1630 /* COULOMB ELECTROSTATICS */
1631 velec = _mm_mul_ps(qq00,rinv00);
1632 felec = _mm_mul_ps(velec,rinvsq00);
1634 /* CUBIC SPLINE TABLE DISPERSION */
1635 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1636 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1637 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1638 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1639 _MM_TRANSPOSE4_PS(Y,F,G,H);
1640 Heps = _mm_mul_ps(vfeps,H);
1641 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1642 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1643 fvdw6 = _mm_mul_ps(c6_00,FF);
1645 /* CUBIC SPLINE TABLE REPULSION */
1646 vfitab = _mm_add_epi32(vfitab,ifour);
1647 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1648 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1649 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1650 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1651 _MM_TRANSPOSE4_PS(Y,F,G,H);
1652 Heps = _mm_mul_ps(vfeps,H);
1653 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1654 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1655 fvdw12 = _mm_mul_ps(c12_00,FF);
1656 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1658 fscal = _mm_add_ps(felec,fvdw);
1660 fscal = _mm_andnot_ps(dummy_mask,fscal);
1662 /* Calculate temporary vectorial force */
1663 tx = _mm_mul_ps(fscal,dx00);
1664 ty = _mm_mul_ps(fscal,dy00);
1665 tz = _mm_mul_ps(fscal,dz00);
1667 /* Update vectorial force */
1668 fix0 = _mm_add_ps(fix0,tx);
1669 fiy0 = _mm_add_ps(fiy0,ty);
1670 fiz0 = _mm_add_ps(fiz0,tz);
1672 fjx0 = _mm_add_ps(fjx0,tx);
1673 fjy0 = _mm_add_ps(fjy0,ty);
1674 fjz0 = _mm_add_ps(fjz0,tz);
1676 /**************************
1677 * CALCULATE INTERACTIONS *
1678 **************************/
1680 /* COULOMB ELECTROSTATICS */
1681 velec = _mm_mul_ps(qq01,rinv01);
1682 felec = _mm_mul_ps(velec,rinvsq01);
1686 fscal = _mm_andnot_ps(dummy_mask,fscal);
1688 /* Calculate temporary vectorial force */
1689 tx = _mm_mul_ps(fscal,dx01);
1690 ty = _mm_mul_ps(fscal,dy01);
1691 tz = _mm_mul_ps(fscal,dz01);
1693 /* Update vectorial force */
1694 fix0 = _mm_add_ps(fix0,tx);
1695 fiy0 = _mm_add_ps(fiy0,ty);
1696 fiz0 = _mm_add_ps(fiz0,tz);
1698 fjx1 = _mm_add_ps(fjx1,tx);
1699 fjy1 = _mm_add_ps(fjy1,ty);
1700 fjz1 = _mm_add_ps(fjz1,tz);
1702 /**************************
1703 * CALCULATE INTERACTIONS *
1704 **************************/
1706 /* COULOMB ELECTROSTATICS */
1707 velec = _mm_mul_ps(qq02,rinv02);
1708 felec = _mm_mul_ps(velec,rinvsq02);
1712 fscal = _mm_andnot_ps(dummy_mask,fscal);
1714 /* Calculate temporary vectorial force */
1715 tx = _mm_mul_ps(fscal,dx02);
1716 ty = _mm_mul_ps(fscal,dy02);
1717 tz = _mm_mul_ps(fscal,dz02);
1719 /* Update vectorial force */
1720 fix0 = _mm_add_ps(fix0,tx);
1721 fiy0 = _mm_add_ps(fiy0,ty);
1722 fiz0 = _mm_add_ps(fiz0,tz);
1724 fjx2 = _mm_add_ps(fjx2,tx);
1725 fjy2 = _mm_add_ps(fjy2,ty);
1726 fjz2 = _mm_add_ps(fjz2,tz);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 /* COULOMB ELECTROSTATICS */
1733 velec = _mm_mul_ps(qq10,rinv10);
1734 felec = _mm_mul_ps(velec,rinvsq10);
1738 fscal = _mm_andnot_ps(dummy_mask,fscal);
1740 /* Calculate temporary vectorial force */
1741 tx = _mm_mul_ps(fscal,dx10);
1742 ty = _mm_mul_ps(fscal,dy10);
1743 tz = _mm_mul_ps(fscal,dz10);
1745 /* Update vectorial force */
1746 fix1 = _mm_add_ps(fix1,tx);
1747 fiy1 = _mm_add_ps(fiy1,ty);
1748 fiz1 = _mm_add_ps(fiz1,tz);
1750 fjx0 = _mm_add_ps(fjx0,tx);
1751 fjy0 = _mm_add_ps(fjy0,ty);
1752 fjz0 = _mm_add_ps(fjz0,tz);
1754 /**************************
1755 * CALCULATE INTERACTIONS *
1756 **************************/
1758 /* COULOMB ELECTROSTATICS */
1759 velec = _mm_mul_ps(qq11,rinv11);
1760 felec = _mm_mul_ps(velec,rinvsq11);
1764 fscal = _mm_andnot_ps(dummy_mask,fscal);
1766 /* Calculate temporary vectorial force */
1767 tx = _mm_mul_ps(fscal,dx11);
1768 ty = _mm_mul_ps(fscal,dy11);
1769 tz = _mm_mul_ps(fscal,dz11);
1771 /* Update vectorial force */
1772 fix1 = _mm_add_ps(fix1,tx);
1773 fiy1 = _mm_add_ps(fiy1,ty);
1774 fiz1 = _mm_add_ps(fiz1,tz);
1776 fjx1 = _mm_add_ps(fjx1,tx);
1777 fjy1 = _mm_add_ps(fjy1,ty);
1778 fjz1 = _mm_add_ps(fjz1,tz);
1780 /**************************
1781 * CALCULATE INTERACTIONS *
1782 **************************/
1784 /* COULOMB ELECTROSTATICS */
1785 velec = _mm_mul_ps(qq12,rinv12);
1786 felec = _mm_mul_ps(velec,rinvsq12);
1790 fscal = _mm_andnot_ps(dummy_mask,fscal);
1792 /* Calculate temporary vectorial force */
1793 tx = _mm_mul_ps(fscal,dx12);
1794 ty = _mm_mul_ps(fscal,dy12);
1795 tz = _mm_mul_ps(fscal,dz12);
1797 /* Update vectorial force */
1798 fix1 = _mm_add_ps(fix1,tx);
1799 fiy1 = _mm_add_ps(fiy1,ty);
1800 fiz1 = _mm_add_ps(fiz1,tz);
1802 fjx2 = _mm_add_ps(fjx2,tx);
1803 fjy2 = _mm_add_ps(fjy2,ty);
1804 fjz2 = _mm_add_ps(fjz2,tz);
1806 /**************************
1807 * CALCULATE INTERACTIONS *
1808 **************************/
1810 /* COULOMB ELECTROSTATICS */
1811 velec = _mm_mul_ps(qq20,rinv20);
1812 felec = _mm_mul_ps(velec,rinvsq20);
1816 fscal = _mm_andnot_ps(dummy_mask,fscal);
1818 /* Calculate temporary vectorial force */
1819 tx = _mm_mul_ps(fscal,dx20);
1820 ty = _mm_mul_ps(fscal,dy20);
1821 tz = _mm_mul_ps(fscal,dz20);
1823 /* Update vectorial force */
1824 fix2 = _mm_add_ps(fix2,tx);
1825 fiy2 = _mm_add_ps(fiy2,ty);
1826 fiz2 = _mm_add_ps(fiz2,tz);
1828 fjx0 = _mm_add_ps(fjx0,tx);
1829 fjy0 = _mm_add_ps(fjy0,ty);
1830 fjz0 = _mm_add_ps(fjz0,tz);
1832 /**************************
1833 * CALCULATE INTERACTIONS *
1834 **************************/
1836 /* COULOMB ELECTROSTATICS */
1837 velec = _mm_mul_ps(qq21,rinv21);
1838 felec = _mm_mul_ps(velec,rinvsq21);
1842 fscal = _mm_andnot_ps(dummy_mask,fscal);
1844 /* Calculate temporary vectorial force */
1845 tx = _mm_mul_ps(fscal,dx21);
1846 ty = _mm_mul_ps(fscal,dy21);
1847 tz = _mm_mul_ps(fscal,dz21);
1849 /* Update vectorial force */
1850 fix2 = _mm_add_ps(fix2,tx);
1851 fiy2 = _mm_add_ps(fiy2,ty);
1852 fiz2 = _mm_add_ps(fiz2,tz);
1854 fjx1 = _mm_add_ps(fjx1,tx);
1855 fjy1 = _mm_add_ps(fjy1,ty);
1856 fjz1 = _mm_add_ps(fjz1,tz);
1858 /**************************
1859 * CALCULATE INTERACTIONS *
1860 **************************/
1862 /* COULOMB ELECTROSTATICS */
1863 velec = _mm_mul_ps(qq22,rinv22);
1864 felec = _mm_mul_ps(velec,rinvsq22);
1868 fscal = _mm_andnot_ps(dummy_mask,fscal);
1870 /* Calculate temporary vectorial force */
1871 tx = _mm_mul_ps(fscal,dx22);
1872 ty = _mm_mul_ps(fscal,dy22);
1873 tz = _mm_mul_ps(fscal,dz22);
1875 /* Update vectorial force */
1876 fix2 = _mm_add_ps(fix2,tx);
1877 fiy2 = _mm_add_ps(fiy2,ty);
1878 fiz2 = _mm_add_ps(fiz2,tz);
1880 fjx2 = _mm_add_ps(fjx2,tx);
1881 fjy2 = _mm_add_ps(fjy2,ty);
1882 fjz2 = _mm_add_ps(fjz2,tz);
1884 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1885 f+j_coord_offsetC,f+j_coord_offsetD,
1886 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1888 /* Inner loop uses 271 flops */
1891 /* End of innermost loop */
1893 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1894 f+i_coord_offset,fshift+i_shift_offset);
1896 /* Increment number of inner iterations */
1897 inneriter += j_index_end - j_index_start;
1899 /* Outer loop uses 27 flops */
1902 /* Increment number of outer iterations */
1905 /* Update outer/inner flops */
1907 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);