2 * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_single
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
/*
 * nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_single
 *
 * Accumulates forces and potential sums (velecsum, vvdwsum) for the nine
 * atom pairs 00..22 between a three-atom i group and three-atom j groups.
 * Four j entries (A,B,C,D) are processed per inner iteration, one per lane
 * of each __m128 value.
 *
 * Every pair gets Coulomb: velec = qq*rinv, felec = velec*rinvsq.
 * Only pair 00 additionally gets a Van der Waals term, evaluated from the
 * table vftab and scaled by c6_00 / c12_00.
 *
 * Parameters, as used by the visible statements:
 *   nlist       - jindex, shift and iinr[0] are read from it.
 *   xx, ff      - not referenced by any visible statement; the coordinate
 *                 and force pointers used below are x and f.
 *   fr          - shift_vec[0], fshift[0], epsfac and ntype are read.
 *   mdatoms     - chargeA and typeA are read.
 *   kernel_data - table_vdw (data, scale) is read; energygrp_elec and
 *                 energygrp_vdw are passed to gmx_mm_update_1pot_ps.
 *   nrnb        - passed to inc_nrnb together with the flop estimate.
 *
 * NOTE(review): the number at the start of each line is the line number of
 * the file this listing was extracted from, and it skips values. Brace
 * lines and several declarations and assignments (for example of vdwparam,
 * vfitab, scratch, x, f, jjnr, nri and ggid) are not visible here. Confirm
 * against the generator output before relying on this listing.
 */
44 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse4_1_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128i ifour = _mm_set1_epi32(4);
100 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
102 __m128 dummy_mask,cutoff_mask;
103 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
104 __m128 one = _mm_set1_ps(1.0);
105 __m128 two = _mm_set1_ps(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm_set1_ps(fr->epsfac);
118 charge = mdatoms->chargeA;
119 nvdwtype = fr->ntype;
121 vdwtype = mdatoms->typeA;
123 vftab = kernel_data->table_vdw->data;
124 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
/* Each i charge is multiplied by facel (fr->epsfac) once here, so every qq product below already includes that factor. */
128 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
129 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
130 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* The j-side charges and VdW type index are read from the same inr+0..2 entries as the i side, so all qq and c6/c12 values are loop invariants. */
133 jq0 = _mm_set1_ps(charge[inr+0]);
134 jq1 = _mm_set1_ps(charge[inr+1]);
135 jq2 = _mm_set1_ps(charge[inr+2]);
136 vdwjidx0A = 2*vdwtype[inr+0];
137 qq00 = _mm_mul_ps(iq0,jq0);
138 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
139 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
140 qq01 = _mm_mul_ps(iq0,jq1);
141 qq02 = _mm_mul_ps(iq0,jq2);
142 qq10 = _mm_mul_ps(iq1,jq0);
143 qq11 = _mm_mul_ps(iq1,jq1);
144 qq12 = _mm_mul_ps(iq1,jq2);
145 qq20 = _mm_mul_ps(iq2,jq0);
146 qq21 = _mm_mul_ps(iq2,jq1);
147 qq22 = _mm_mul_ps(iq2,jq2);
149 /* Avoid stupid compiler warnings */
150 jnrA = jnrB = jnrC = jnrD = 0;
159 for(iidx=0;iidx<4*DIM;iidx++)
164 /* Start outer loop over neighborlists */
165 for(iidx=0; iidx<nri; iidx++)
167 /* Load shift vector for this list */
168 i_shift_offset = DIM*shiftidx[iidx];
170 /* Load limits for loop over neighbors */
171 j_index_start = jindex[iidx];
172 j_index_end = jindex[iidx+1];
174 /* Get outer coordinate index */
176 i_coord_offset = DIM*inr;
178 /* Load i particle coords and add shift vector */
179 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
180 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
182 fix0 = _mm_setzero_ps();
183 fiy0 = _mm_setzero_ps();
184 fiz0 = _mm_setzero_ps();
185 fix1 = _mm_setzero_ps();
186 fiy1 = _mm_setzero_ps();
187 fiz1 = _mm_setzero_ps();
188 fix2 = _mm_setzero_ps();
189 fiy2 = _mm_setzero_ps();
190 fiz2 = _mm_setzero_ps();
192 /* Reset potential sums */
193 velecsum = _mm_setzero_ps();
194 vvdwsum = _mm_setzero_ps();
196 /* Start inner kernel loop */
/* This loop continues only while jjnr[jidx+3] is non-negative, so no lane masking is applied inside it; the code after it, which builds dummy_mask, deals with negative entries. */
197 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200 /* Get j neighbor index, and coordinate index */
205 j_coord_offsetA = DIM*jnrA;
206 j_coord_offsetB = DIM*jnrB;
207 j_coord_offsetC = DIM*jnrC;
208 j_coord_offsetD = DIM*jnrD;
210 /* load j atom coordinates */
211 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
212 x+j_coord_offsetC,x+j_coord_offsetD,
213 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215 /* Calculate displacement vector */
216 dx00 = _mm_sub_ps(ix0,jx0);
217 dy00 = _mm_sub_ps(iy0,jy0);
218 dz00 = _mm_sub_ps(iz0,jz0);
219 dx01 = _mm_sub_ps(ix0,jx1);
220 dy01 = _mm_sub_ps(iy0,jy1);
221 dz01 = _mm_sub_ps(iz0,jz1);
222 dx02 = _mm_sub_ps(ix0,jx2);
223 dy02 = _mm_sub_ps(iy0,jy2);
224 dz02 = _mm_sub_ps(iz0,jz2);
225 dx10 = _mm_sub_ps(ix1,jx0);
226 dy10 = _mm_sub_ps(iy1,jy0);
227 dz10 = _mm_sub_ps(iz1,jz0);
228 dx11 = _mm_sub_ps(ix1,jx1);
229 dy11 = _mm_sub_ps(iy1,jy1);
230 dz11 = _mm_sub_ps(iz1,jz1);
231 dx12 = _mm_sub_ps(ix1,jx2);
232 dy12 = _mm_sub_ps(iy1,jy2);
233 dz12 = _mm_sub_ps(iz1,jz2);
234 dx20 = _mm_sub_ps(ix2,jx0);
235 dy20 = _mm_sub_ps(iy2,jy0);
236 dz20 = _mm_sub_ps(iz2,jz0);
237 dx21 = _mm_sub_ps(ix2,jx1);
238 dy21 = _mm_sub_ps(iy2,jy1);
239 dz21 = _mm_sub_ps(iz2,jz1);
240 dx22 = _mm_sub_ps(ix2,jx2);
241 dy22 = _mm_sub_ps(iy2,jy2);
242 dz22 = _mm_sub_ps(iz2,jz2);
244 /* Calculate squared distance and things based on it */
245 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
246 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
247 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
248 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
249 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
250 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
251 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
252 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
253 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
255 rinv00 = gmx_mm_invsqrt_ps(rsq00);
256 rinv01 = gmx_mm_invsqrt_ps(rsq01);
257 rinv02 = gmx_mm_invsqrt_ps(rsq02);
258 rinv10 = gmx_mm_invsqrt_ps(rsq10);
259 rinv11 = gmx_mm_invsqrt_ps(rsq11);
260 rinv12 = gmx_mm_invsqrt_ps(rsq12);
261 rinv20 = gmx_mm_invsqrt_ps(rsq20);
262 rinv21 = gmx_mm_invsqrt_ps(rsq21);
263 rinv22 = gmx_mm_invsqrt_ps(rsq22);
265 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
266 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
267 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
268 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
269 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
270 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
271 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
272 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
273 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
275 fjx0 = _mm_setzero_ps();
276 fjy0 = _mm_setzero_ps();
277 fjz0 = _mm_setzero_ps();
278 fjx1 = _mm_setzero_ps();
279 fjy1 = _mm_setzero_ps();
280 fjz1 = _mm_setzero_ps();
281 fjx2 = _mm_setzero_ps();
282 fjy2 = _mm_setzero_ps();
283 fjz2 = _mm_setzero_ps();
285 /**************************
286 * CALCULATE INTERACTIONS *
287 **************************/
289 r00 = _mm_mul_ps(rsq00,rinv00);
291 /* Calculate table index by multiplying r with table scale and truncate to integer */
292 rt = _mm_mul_ps(r00,vftabscale);
/* vfeps is rt minus its floor, i.e. the fractional position inside the table interval. The left shift by 3 turns the integer part into an offset counted in units of 8 floats. */
293 vfitab = _mm_cvttps_epi32(rt);
294 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
295 vfitab = _mm_slli_epi32(vfitab,3);
297 /* COULOMB ELECTROSTATICS */
298 velec = _mm_mul_ps(qq00,rinv00);
299 felec = _mm_mul_ps(velec,rinvsq00);
301 /* CUBIC SPLINE TABLE DISPERSION */
302 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
303 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
304 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
305 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
/* Each load above fetched four consecutive table floats for one lane; the transpose regroups them so that Y, F, G and H each hold one coefficient for all four lanes. */
306 _MM_TRANSPOSE4_PS(Y,F,G,H);
307 Heps = _mm_mul_ps(vfeps,H);
308 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
309 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
310 vvdw6 = _mm_mul_ps(c6_00,VV);
311 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
312 fvdw6 = _mm_mul_ps(c6_00,FF);
314 /* CUBIC SPLINE TABLE REPULSION */
/* Adding ifour moves every lane 4 floats further into the table, to the second group of four coefficients at the same table point. */
315 vfitab = _mm_add_epi32(vfitab,ifour);
316 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
317 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
318 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
319 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
320 _MM_TRANSPOSE4_PS(Y,F,G,H);
321 Heps = _mm_mul_ps(vfeps,H);
322 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
323 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
324 vvdw12 = _mm_mul_ps(c12_00,VV);
325 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
326 fvdw12 = _mm_mul_ps(c12_00,FF);
327 vvdw = _mm_add_ps(vvdw12,vvdw6);
/* XOR with signbit flips the sign: fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00. */
328 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velecsum = _mm_add_ps(velecsum,velec);
332 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
334 fscal = _mm_add_ps(felec,fvdw);
336 /* Calculate temporary vectorial force */
337 tx = _mm_mul_ps(fscal,dx00);
338 ty = _mm_mul_ps(fscal,dy00);
339 tz = _mm_mul_ps(fscal,dz00);
341 /* Update vectorial force */
342 fix0 = _mm_add_ps(fix0,tx);
343 fiy0 = _mm_add_ps(fiy0,ty);
344 fiz0 = _mm_add_ps(fiz0,tz);
346 fjx0 = _mm_add_ps(fjx0,tx);
347 fjy0 = _mm_add_ps(fjy0,ty);
348 fjz0 = _mm_add_ps(fjz0,tz);
350 /**************************
351 * CALCULATE INTERACTIONS *
352 **************************/
354 /* COULOMB ELECTROSTATICS */
355 velec = _mm_mul_ps(qq01,rinv01);
356 felec = _mm_mul_ps(velec,rinvsq01);
358 /* Update potential sum for this i atom from the interaction with this j atom. */
359 velecsum = _mm_add_ps(velecsum,velec);
/* NOTE(review): no assignment to fscal is visible between the pair-00 sum above and this use (the embedded numbers jump 359 -> 363). The same gap recurs in every Coulomb-only pair below. Confirm against the original file. */
363 /* Calculate temporary vectorial force */
364 tx = _mm_mul_ps(fscal,dx01);
365 ty = _mm_mul_ps(fscal,dy01);
366 tz = _mm_mul_ps(fscal,dz01);
368 /* Update vectorial force */
369 fix0 = _mm_add_ps(fix0,tx);
370 fiy0 = _mm_add_ps(fiy0,ty);
371 fiz0 = _mm_add_ps(fiz0,tz);
373 fjx1 = _mm_add_ps(fjx1,tx);
374 fjy1 = _mm_add_ps(fjy1,ty);
375 fjz1 = _mm_add_ps(fjz1,tz);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 /* COULOMB ELECTROSTATICS */
382 velec = _mm_mul_ps(qq02,rinv02);
383 felec = _mm_mul_ps(velec,rinvsq02);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velecsum = _mm_add_ps(velecsum,velec);
390 /* Calculate temporary vectorial force */
391 tx = _mm_mul_ps(fscal,dx02);
392 ty = _mm_mul_ps(fscal,dy02);
393 tz = _mm_mul_ps(fscal,dz02);
395 /* Update vectorial force */
396 fix0 = _mm_add_ps(fix0,tx);
397 fiy0 = _mm_add_ps(fiy0,ty);
398 fiz0 = _mm_add_ps(fiz0,tz);
400 fjx2 = _mm_add_ps(fjx2,tx);
401 fjy2 = _mm_add_ps(fjy2,ty);
402 fjz2 = _mm_add_ps(fjz2,tz);
404 /**************************
405 * CALCULATE INTERACTIONS *
406 **************************/
408 /* COULOMB ELECTROSTATICS */
409 velec = _mm_mul_ps(qq10,rinv10);
410 felec = _mm_mul_ps(velec,rinvsq10);
412 /* Update potential sum for this i atom from the interaction with this j atom. */
413 velecsum = _mm_add_ps(velecsum,velec);
417 /* Calculate temporary vectorial force */
418 tx = _mm_mul_ps(fscal,dx10);
419 ty = _mm_mul_ps(fscal,dy10);
420 tz = _mm_mul_ps(fscal,dz10);
422 /* Update vectorial force */
423 fix1 = _mm_add_ps(fix1,tx);
424 fiy1 = _mm_add_ps(fiy1,ty);
425 fiz1 = _mm_add_ps(fiz1,tz);
427 fjx0 = _mm_add_ps(fjx0,tx);
428 fjy0 = _mm_add_ps(fjy0,ty);
429 fjz0 = _mm_add_ps(fjz0,tz);
431 /**************************
432 * CALCULATE INTERACTIONS *
433 **************************/
435 /* COULOMB ELECTROSTATICS */
436 velec = _mm_mul_ps(qq11,rinv11);
437 felec = _mm_mul_ps(velec,rinvsq11);
439 /* Update potential sum for this i atom from the interaction with this j atom. */
440 velecsum = _mm_add_ps(velecsum,velec);
444 /* Calculate temporary vectorial force */
445 tx = _mm_mul_ps(fscal,dx11);
446 ty = _mm_mul_ps(fscal,dy11);
447 tz = _mm_mul_ps(fscal,dz11);
449 /* Update vectorial force */
450 fix1 = _mm_add_ps(fix1,tx);
451 fiy1 = _mm_add_ps(fiy1,ty);
452 fiz1 = _mm_add_ps(fiz1,tz);
454 fjx1 = _mm_add_ps(fjx1,tx);
455 fjy1 = _mm_add_ps(fjy1,ty);
456 fjz1 = _mm_add_ps(fjz1,tz);
458 /**************************
459 * CALCULATE INTERACTIONS *
460 **************************/
462 /* COULOMB ELECTROSTATICS */
463 velec = _mm_mul_ps(qq12,rinv12);
464 felec = _mm_mul_ps(velec,rinvsq12);
466 /* Update potential sum for this i atom from the interaction with this j atom. */
467 velecsum = _mm_add_ps(velecsum,velec);
471 /* Calculate temporary vectorial force */
472 tx = _mm_mul_ps(fscal,dx12);
473 ty = _mm_mul_ps(fscal,dy12);
474 tz = _mm_mul_ps(fscal,dz12);
476 /* Update vectorial force */
477 fix1 = _mm_add_ps(fix1,tx);
478 fiy1 = _mm_add_ps(fiy1,ty);
479 fiz1 = _mm_add_ps(fiz1,tz);
481 fjx2 = _mm_add_ps(fjx2,tx);
482 fjy2 = _mm_add_ps(fjy2,ty);
483 fjz2 = _mm_add_ps(fjz2,tz);
485 /**************************
486 * CALCULATE INTERACTIONS *
487 **************************/
489 /* COULOMB ELECTROSTATICS */
490 velec = _mm_mul_ps(qq20,rinv20);
491 felec = _mm_mul_ps(velec,rinvsq20);
493 /* Update potential sum for this i atom from the interaction with this j atom. */
494 velecsum = _mm_add_ps(velecsum,velec);
498 /* Calculate temporary vectorial force */
499 tx = _mm_mul_ps(fscal,dx20);
500 ty = _mm_mul_ps(fscal,dy20);
501 tz = _mm_mul_ps(fscal,dz20);
503 /* Update vectorial force */
504 fix2 = _mm_add_ps(fix2,tx);
505 fiy2 = _mm_add_ps(fiy2,ty);
506 fiz2 = _mm_add_ps(fiz2,tz);
508 fjx0 = _mm_add_ps(fjx0,tx);
509 fjy0 = _mm_add_ps(fjy0,ty);
510 fjz0 = _mm_add_ps(fjz0,tz);
512 /**************************
513 * CALCULATE INTERACTIONS *
514 **************************/
516 /* COULOMB ELECTROSTATICS */
517 velec = _mm_mul_ps(qq21,rinv21);
518 felec = _mm_mul_ps(velec,rinvsq21);
520 /* Update potential sum for this i atom from the interaction with this j atom. */
521 velecsum = _mm_add_ps(velecsum,velec);
525 /* Calculate temporary vectorial force */
526 tx = _mm_mul_ps(fscal,dx21);
527 ty = _mm_mul_ps(fscal,dy21);
528 tz = _mm_mul_ps(fscal,dz21);
530 /* Update vectorial force */
531 fix2 = _mm_add_ps(fix2,tx);
532 fiy2 = _mm_add_ps(fiy2,ty);
533 fiz2 = _mm_add_ps(fiz2,tz);
535 fjx1 = _mm_add_ps(fjx1,tx);
536 fjy1 = _mm_add_ps(fjy1,ty);
537 fjz1 = _mm_add_ps(fjz1,tz);
539 /**************************
540 * CALCULATE INTERACTIONS *
541 **************************/
543 /* COULOMB ELECTROSTATICS */
544 velec = _mm_mul_ps(qq22,rinv22);
545 felec = _mm_mul_ps(velec,rinvsq22);
547 /* Update potential sum for this i atom from the interaction with this j atom. */
548 velecsum = _mm_add_ps(velecsum,velec);
552 /* Calculate temporary vectorial force */
553 tx = _mm_mul_ps(fscal,dx22);
554 ty = _mm_mul_ps(fscal,dy22);
555 tz = _mm_mul_ps(fscal,dz22);
557 /* Update vectorial force */
558 fix2 = _mm_add_ps(fix2,tx);
559 fiy2 = _mm_add_ps(fiy2,ty);
560 fiz2 = _mm_add_ps(fiz2,tz);
562 fjx2 = _mm_add_ps(fjx2,tx);
563 fjy2 = _mm_add_ps(fjy2,ty);
564 fjz2 = _mm_add_ps(fjz2,tz);
566 fjptrA = f+j_coord_offsetA;
567 fjptrB = f+j_coord_offsetB;
568 fjptrC = f+j_coord_offsetC;
569 fjptrD = f+j_coord_offsetD;
/* NOTE(review): going by its name, this helper presumably subtracts the accumulated fj values from the four force locations; its definition is not in this file, so confirm there. */
571 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
572 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
574 /* Inner loop uses 287 flops */
580 /* Get j neighbor index, and coordinate index */
581 jnrlistA = jjnr[jidx];
582 jnrlistB = jjnr[jidx+1];
583 jnrlistC = jjnr[jidx+2];
584 jnrlistD = jjnr[jidx+3];
585 /* Sign of each element will be negative for non-real atoms.
586 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
587 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
589 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
590 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
591 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
592 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
593 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
594 j_coord_offsetA = DIM*jnrA;
595 j_coord_offsetB = DIM*jnrB;
596 j_coord_offsetC = DIM*jnrC;
597 j_coord_offsetD = DIM*jnrD;
599 /* load j atom coordinates */
600 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
601 x+j_coord_offsetC,x+j_coord_offsetD,
602 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
604 /* Calculate displacement vector */
605 dx00 = _mm_sub_ps(ix0,jx0);
606 dy00 = _mm_sub_ps(iy0,jy0);
607 dz00 = _mm_sub_ps(iz0,jz0);
608 dx01 = _mm_sub_ps(ix0,jx1);
609 dy01 = _mm_sub_ps(iy0,jy1);
610 dz01 = _mm_sub_ps(iz0,jz1);
611 dx02 = _mm_sub_ps(ix0,jx2);
612 dy02 = _mm_sub_ps(iy0,jy2);
613 dz02 = _mm_sub_ps(iz0,jz2);
614 dx10 = _mm_sub_ps(ix1,jx0);
615 dy10 = _mm_sub_ps(iy1,jy0);
616 dz10 = _mm_sub_ps(iz1,jz0);
617 dx11 = _mm_sub_ps(ix1,jx1);
618 dy11 = _mm_sub_ps(iy1,jy1);
619 dz11 = _mm_sub_ps(iz1,jz1);
620 dx12 = _mm_sub_ps(ix1,jx2);
621 dy12 = _mm_sub_ps(iy1,jy2);
622 dz12 = _mm_sub_ps(iz1,jz2);
623 dx20 = _mm_sub_ps(ix2,jx0);
624 dy20 = _mm_sub_ps(iy2,jy0);
625 dz20 = _mm_sub_ps(iz2,jz0);
626 dx21 = _mm_sub_ps(ix2,jx1);
627 dy21 = _mm_sub_ps(iy2,jy1);
628 dz21 = _mm_sub_ps(iz2,jz1);
629 dx22 = _mm_sub_ps(ix2,jx2);
630 dy22 = _mm_sub_ps(iy2,jy2);
631 dz22 = _mm_sub_ps(iz2,jz2);
633 /* Calculate squared distance and things based on it */
634 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
635 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
636 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
637 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
638 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
639 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
640 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
641 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
642 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
644 rinv00 = gmx_mm_invsqrt_ps(rsq00);
645 rinv01 = gmx_mm_invsqrt_ps(rsq01);
646 rinv02 = gmx_mm_invsqrt_ps(rsq02);
647 rinv10 = gmx_mm_invsqrt_ps(rsq10);
648 rinv11 = gmx_mm_invsqrt_ps(rsq11);
649 rinv12 = gmx_mm_invsqrt_ps(rsq12);
650 rinv20 = gmx_mm_invsqrt_ps(rsq20);
651 rinv21 = gmx_mm_invsqrt_ps(rsq21);
652 rinv22 = gmx_mm_invsqrt_ps(rsq22);
654 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
655 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
656 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
657 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
658 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
659 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
660 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
661 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
662 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
664 fjx0 = _mm_setzero_ps();
665 fjy0 = _mm_setzero_ps();
666 fjz0 = _mm_setzero_ps();
667 fjx1 = _mm_setzero_ps();
668 fjy1 = _mm_setzero_ps();
669 fjz1 = _mm_setzero_ps();
670 fjx2 = _mm_setzero_ps();
671 fjy2 = _mm_setzero_ps();
672 fjz2 = _mm_setzero_ps();
674 /**************************
675 * CALCULATE INTERACTIONS *
676 **************************/
678 r00 = _mm_mul_ps(rsq00,rinv00);
/* Clear r in dummy lanes so that rt, and with it the table index derived from it, is zero there. */
679 r00 = _mm_andnot_ps(dummy_mask,r00);
681 /* Calculate table index by multiplying r with table scale and truncate to integer */
682 rt = _mm_mul_ps(r00,vftabscale);
683 vfitab = _mm_cvttps_epi32(rt);
684 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
685 vfitab = _mm_slli_epi32(vfitab,3);
687 /* COULOMB ELECTROSTATICS */
688 velec = _mm_mul_ps(qq00,rinv00);
689 felec = _mm_mul_ps(velec,rinvsq00);
691 /* CUBIC SPLINE TABLE DISPERSION */
692 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
693 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
694 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
695 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
696 _MM_TRANSPOSE4_PS(Y,F,G,H);
697 Heps = _mm_mul_ps(vfeps,H);
698 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
699 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
700 vvdw6 = _mm_mul_ps(c6_00,VV);
701 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
702 fvdw6 = _mm_mul_ps(c6_00,FF);
704 /* CUBIC SPLINE TABLE REPULSION */
705 vfitab = _mm_add_epi32(vfitab,ifour);
706 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
707 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
708 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
709 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
710 _MM_TRANSPOSE4_PS(Y,F,G,H);
711 Heps = _mm_mul_ps(vfeps,H);
712 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
713 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
714 vvdw12 = _mm_mul_ps(c12_00,VV);
715 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
716 fvdw12 = _mm_mul_ps(c12_00,FF);
717 vvdw = _mm_add_ps(vvdw12,vvdw6);
718 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
720 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Dummy lanes are zeroed before each accumulation so they add nothing to the potential sums. */
721 velec = _mm_andnot_ps(dummy_mask,velec);
722 velecsum = _mm_add_ps(velecsum,velec);
723 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
724 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
726 fscal = _mm_add_ps(felec,fvdw);
/* Zero the scalar force in dummy lanes so tx, ty and tz are zero there. */
728 fscal = _mm_andnot_ps(dummy_mask,fscal);
730 /* Calculate temporary vectorial force */
731 tx = _mm_mul_ps(fscal,dx00);
732 ty = _mm_mul_ps(fscal,dy00);
733 tz = _mm_mul_ps(fscal,dz00);
735 /* Update vectorial force */
736 fix0 = _mm_add_ps(fix0,tx);
737 fiy0 = _mm_add_ps(fiy0,ty);
738 fiz0 = _mm_add_ps(fiz0,tz);
740 fjx0 = _mm_add_ps(fjx0,tx);
741 fjy0 = _mm_add_ps(fjy0,ty);
742 fjz0 = _mm_add_ps(fjz0,tz);
744 /**************************
745 * CALCULATE INTERACTIONS *
746 **************************/
748 /* COULOMB ELECTROSTATICS */
749 velec = _mm_mul_ps(qq01,rinv01);
750 felec = _mm_mul_ps(velec,rinvsq01);
752 /* Update potential sum for this i atom from the interaction with this j atom. */
753 velec = _mm_andnot_ps(dummy_mask,velec);
754 velecsum = _mm_add_ps(velecsum,velec);
/* NOTE(review): as in the unmasked loop, only the masking of fscal is visible for this and the following pairs, not an assignment from felec (embedded numbers jump 754 -> 758). Confirm against the original file. */
758 fscal = _mm_andnot_ps(dummy_mask,fscal);
760 /* Calculate temporary vectorial force */
761 tx = _mm_mul_ps(fscal,dx01);
762 ty = _mm_mul_ps(fscal,dy01);
763 tz = _mm_mul_ps(fscal,dz01);
765 /* Update vectorial force */
766 fix0 = _mm_add_ps(fix0,tx);
767 fiy0 = _mm_add_ps(fiy0,ty);
768 fiz0 = _mm_add_ps(fiz0,tz);
770 fjx1 = _mm_add_ps(fjx1,tx);
771 fjy1 = _mm_add_ps(fjy1,ty);
772 fjz1 = _mm_add_ps(fjz1,tz);
774 /**************************
775 * CALCULATE INTERACTIONS *
776 **************************/
778 /* COULOMB ELECTROSTATICS */
779 velec = _mm_mul_ps(qq02,rinv02);
780 felec = _mm_mul_ps(velec,rinvsq02);
782 /* Update potential sum for this i atom from the interaction with this j atom. */
783 velec = _mm_andnot_ps(dummy_mask,velec);
784 velecsum = _mm_add_ps(velecsum,velec);
788 fscal = _mm_andnot_ps(dummy_mask,fscal);
790 /* Calculate temporary vectorial force */
791 tx = _mm_mul_ps(fscal,dx02);
792 ty = _mm_mul_ps(fscal,dy02);
793 tz = _mm_mul_ps(fscal,dz02);
795 /* Update vectorial force */
796 fix0 = _mm_add_ps(fix0,tx);
797 fiy0 = _mm_add_ps(fiy0,ty);
798 fiz0 = _mm_add_ps(fiz0,tz);
800 fjx2 = _mm_add_ps(fjx2,tx);
801 fjy2 = _mm_add_ps(fjy2,ty);
802 fjz2 = _mm_add_ps(fjz2,tz);
804 /**************************
805 * CALCULATE INTERACTIONS *
806 **************************/
808 /* COULOMB ELECTROSTATICS */
809 velec = _mm_mul_ps(qq10,rinv10);
810 felec = _mm_mul_ps(velec,rinvsq10);
812 /* Update potential sum for this i atom from the interaction with this j atom. */
813 velec = _mm_andnot_ps(dummy_mask,velec);
814 velecsum = _mm_add_ps(velecsum,velec);
818 fscal = _mm_andnot_ps(dummy_mask,fscal);
820 /* Calculate temporary vectorial force */
821 tx = _mm_mul_ps(fscal,dx10);
822 ty = _mm_mul_ps(fscal,dy10);
823 tz = _mm_mul_ps(fscal,dz10);
825 /* Update vectorial force */
826 fix1 = _mm_add_ps(fix1,tx);
827 fiy1 = _mm_add_ps(fiy1,ty);
828 fiz1 = _mm_add_ps(fiz1,tz);
830 fjx0 = _mm_add_ps(fjx0,tx);
831 fjy0 = _mm_add_ps(fjy0,ty);
832 fjz0 = _mm_add_ps(fjz0,tz);
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
838 /* COULOMB ELECTROSTATICS */
839 velec = _mm_mul_ps(qq11,rinv11);
840 felec = _mm_mul_ps(velec,rinvsq11);
842 /* Update potential sum for this i atom from the interaction with this j atom. */
843 velec = _mm_andnot_ps(dummy_mask,velec);
844 velecsum = _mm_add_ps(velecsum,velec);
848 fscal = _mm_andnot_ps(dummy_mask,fscal);
850 /* Calculate temporary vectorial force */
851 tx = _mm_mul_ps(fscal,dx11);
852 ty = _mm_mul_ps(fscal,dy11);
853 tz = _mm_mul_ps(fscal,dz11);
855 /* Update vectorial force */
856 fix1 = _mm_add_ps(fix1,tx);
857 fiy1 = _mm_add_ps(fiy1,ty);
858 fiz1 = _mm_add_ps(fiz1,tz);
860 fjx1 = _mm_add_ps(fjx1,tx);
861 fjy1 = _mm_add_ps(fjy1,ty);
862 fjz1 = _mm_add_ps(fjz1,tz);
864 /**************************
865 * CALCULATE INTERACTIONS *
866 **************************/
868 /* COULOMB ELECTROSTATICS */
869 velec = _mm_mul_ps(qq12,rinv12);
870 felec = _mm_mul_ps(velec,rinvsq12);
872 /* Update potential sum for this i atom from the interaction with this j atom. */
873 velec = _mm_andnot_ps(dummy_mask,velec);
874 velecsum = _mm_add_ps(velecsum,velec);
878 fscal = _mm_andnot_ps(dummy_mask,fscal);
880 /* Calculate temporary vectorial force */
881 tx = _mm_mul_ps(fscal,dx12);
882 ty = _mm_mul_ps(fscal,dy12);
883 tz = _mm_mul_ps(fscal,dz12);
885 /* Update vectorial force */
886 fix1 = _mm_add_ps(fix1,tx);
887 fiy1 = _mm_add_ps(fiy1,ty);
888 fiz1 = _mm_add_ps(fiz1,tz);
890 fjx2 = _mm_add_ps(fjx2,tx);
891 fjy2 = _mm_add_ps(fjy2,ty);
892 fjz2 = _mm_add_ps(fjz2,tz);
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 /* COULOMB ELECTROSTATICS */
899 velec = _mm_mul_ps(qq20,rinv20);
900 felec = _mm_mul_ps(velec,rinvsq20);
902 /* Update potential sum for this i atom from the interaction with this j atom. */
903 velec = _mm_andnot_ps(dummy_mask,velec);
904 velecsum = _mm_add_ps(velecsum,velec);
908 fscal = _mm_andnot_ps(dummy_mask,fscal);
910 /* Calculate temporary vectorial force */
911 tx = _mm_mul_ps(fscal,dx20);
912 ty = _mm_mul_ps(fscal,dy20);
913 tz = _mm_mul_ps(fscal,dz20);
915 /* Update vectorial force */
916 fix2 = _mm_add_ps(fix2,tx);
917 fiy2 = _mm_add_ps(fiy2,ty);
918 fiz2 = _mm_add_ps(fiz2,tz);
920 fjx0 = _mm_add_ps(fjx0,tx);
921 fjy0 = _mm_add_ps(fjy0,ty);
922 fjz0 = _mm_add_ps(fjz0,tz);
924 /**************************
925 * CALCULATE INTERACTIONS *
926 **************************/
928 /* COULOMB ELECTROSTATICS */
929 velec = _mm_mul_ps(qq21,rinv21);
930 felec = _mm_mul_ps(velec,rinvsq21);
932 /* Update potential sum for this i atom from the interaction with this j atom. */
933 velec = _mm_andnot_ps(dummy_mask,velec);
934 velecsum = _mm_add_ps(velecsum,velec);
938 fscal = _mm_andnot_ps(dummy_mask,fscal);
940 /* Calculate temporary vectorial force */
941 tx = _mm_mul_ps(fscal,dx21);
942 ty = _mm_mul_ps(fscal,dy21);
943 tz = _mm_mul_ps(fscal,dz21);
945 /* Update vectorial force */
946 fix2 = _mm_add_ps(fix2,tx);
947 fiy2 = _mm_add_ps(fiy2,ty);
948 fiz2 = _mm_add_ps(fiz2,tz);
950 fjx1 = _mm_add_ps(fjx1,tx);
951 fjy1 = _mm_add_ps(fjy1,ty);
952 fjz1 = _mm_add_ps(fjz1,tz);
954 /**************************
955 * CALCULATE INTERACTIONS *
956 **************************/
958 /* COULOMB ELECTROSTATICS */
959 velec = _mm_mul_ps(qq22,rinv22);
960 felec = _mm_mul_ps(velec,rinvsq22);
962 /* Update potential sum for this i atom from the interaction with this j atom. */
963 velec = _mm_andnot_ps(dummy_mask,velec);
964 velecsum = _mm_add_ps(velecsum,velec);
968 fscal = _mm_andnot_ps(dummy_mask,fscal);
970 /* Calculate temporary vectorial force */
971 tx = _mm_mul_ps(fscal,dx22);
972 ty = _mm_mul_ps(fscal,dy22);
973 tz = _mm_mul_ps(fscal,dz22);
975 /* Update vectorial force */
976 fix2 = _mm_add_ps(fix2,tx);
977 fiy2 = _mm_add_ps(fiy2,ty);
978 fiz2 = _mm_add_ps(fiz2,tz);
980 fjx2 = _mm_add_ps(fjx2,tx);
981 fjy2 = _mm_add_ps(fjy2,ty);
982 fjz2 = _mm_add_ps(fjz2,tz);
/* Lanes whose list entry is negative are pointed at scratch instead of f, so the update below does not touch the force array for them. */
984 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
985 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
986 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
987 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
989 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
990 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
992 /* Inner loop uses 288 flops */
995 /* End of innermost loop */
997 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
998 f+i_coord_offset,fshift+i_shift_offset);
1001 /* Update potential energies */
1002 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1003 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1005 /* Increment number of inner iterations */
/* The full index range of this list is added, so entries handled by the masked block are counted too. */
1006 inneriter += j_index_end - j_index_start;
1008 /* Outer loop uses 20 flops */
1011 /* Increment number of outer iterations */
1014 /* Update outer/inner flops */
/* All inner iterations are charged at 288 flops here, although the unmasked loop is annotated above as using 287. */
1016 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*288);
1019 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_single
1020 * Electrostatics interaction: Coulomb
1021 * VdW interaction: CubicSplineTable
1022 * Geometry: Water3-Water3
1023 * Calculate force/pot: Force
1026 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_single
1027 (t_nblist * gmx_restrict nlist,
1028 rvec * gmx_restrict xx,
1029 rvec * gmx_restrict ff,
1030 t_forcerec * gmx_restrict fr,
1031 t_mdatoms * gmx_restrict mdatoms,
1032 nb_kernel_data_t * gmx_restrict kernel_data,
1033 t_nrnb * gmx_restrict nrnb)
1035 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1036 * just 0 for non-waters.
1037 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1038 * jnr indices corresponding to data put in the four positions in the SIMD register.
1040 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1041 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1042 int jnrA,jnrB,jnrC,jnrD;
1043 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1044 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1045 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1046 real rcutoff_scalar;
1047 real *shiftvec,*fshift,*x,*f;
1048 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1049 real scratch[4*DIM];
1050 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1052 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1054 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1056 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1057 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1058 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1059 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1060 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1061 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1062 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1063 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1064 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1065 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1066 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1067 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1068 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1069 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1070 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1071 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1072 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1075 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1078 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1079 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1081 __m128i ifour = _mm_set1_epi32(4);
1082 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1084 __m128 dummy_mask,cutoff_mask;
1085 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1086 __m128 one = _mm_set1_ps(1.0);
1087 __m128 two = _mm_set1_ps(2.0);
1093 jindex = nlist->jindex;
1095 shiftidx = nlist->shift;
1097 shiftvec = fr->shift_vec[0];
1098 fshift = fr->fshift[0];
1099 facel = _mm_set1_ps(fr->epsfac);
1100 charge = mdatoms->chargeA;
1101 nvdwtype = fr->ntype;
1102 vdwparam = fr->nbfp;
1103 vdwtype = mdatoms->typeA;
1105 vftab = kernel_data->table_vdw->data;
1106 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
1108 /* Setup water-specific parameters */
1109 inr = nlist->iinr[0];
1110 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1111 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1112 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1113 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1115 jq0 = _mm_set1_ps(charge[inr+0]);
1116 jq1 = _mm_set1_ps(charge[inr+1]);
1117 jq2 = _mm_set1_ps(charge[inr+2]);
1118 vdwjidx0A = 2*vdwtype[inr+0];
1119 qq00 = _mm_mul_ps(iq0,jq0);
1120 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1121 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1122 qq01 = _mm_mul_ps(iq0,jq1);
1123 qq02 = _mm_mul_ps(iq0,jq2);
1124 qq10 = _mm_mul_ps(iq1,jq0);
1125 qq11 = _mm_mul_ps(iq1,jq1);
1126 qq12 = _mm_mul_ps(iq1,jq2);
1127 qq20 = _mm_mul_ps(iq2,jq0);
1128 qq21 = _mm_mul_ps(iq2,jq1);
1129 qq22 = _mm_mul_ps(iq2,jq2);
1132 /* Zero the j indices and coordinate offsets up front to silence compiler warnings (presumably about possibly-uninitialized use) */
1132 jnrA = jnrB = jnrC = jnrD = 0;
1133 j_coord_offsetA = 0;
1134 j_coord_offsetB = 0;
1135 j_coord_offsetC = 0;
1136 j_coord_offsetD = 0;
1141 for(iidx=0;iidx<4*DIM;iidx++)
1143 scratch[iidx] = 0.0;
1146 /* Start outer loop over neighborlists */
1147 for(iidx=0; iidx<nri; iidx++)
1149 /* Load shift vector for this list */
1150 i_shift_offset = DIM*shiftidx[iidx];
1152 /* Load limits for loop over neighbors */
1153 j_index_start = jindex[iidx];
1154 j_index_end = jindex[iidx+1];
1156 /* Get outer coordinate index */
1158 i_coord_offset = DIM*inr;
1160 /* Load i particle coords and add shift vector */
1161 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1162 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1164 fix0 = _mm_setzero_ps();
1165 fiy0 = _mm_setzero_ps();
1166 fiz0 = _mm_setzero_ps();
1167 fix1 = _mm_setzero_ps();
1168 fiy1 = _mm_setzero_ps();
1169 fiz1 = _mm_setzero_ps();
1170 fix2 = _mm_setzero_ps();
1171 fiy2 = _mm_setzero_ps();
1172 fiz2 = _mm_setzero_ps();
1174 /* Start inner kernel loop */
1175 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1178 /* Get j neighbor index, and coordinate index */
1180 jnrB = jjnr[jidx+1];
1181 jnrC = jjnr[jidx+2];
1182 jnrD = jjnr[jidx+3];
1183 j_coord_offsetA = DIM*jnrA;
1184 j_coord_offsetB = DIM*jnrB;
1185 j_coord_offsetC = DIM*jnrC;
1186 j_coord_offsetD = DIM*jnrD;
1188 /* load j atom coordinates */
1189 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1190 x+j_coord_offsetC,x+j_coord_offsetD,
1191 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1193 /* Calculate displacement vector */
1194 dx00 = _mm_sub_ps(ix0,jx0);
1195 dy00 = _mm_sub_ps(iy0,jy0);
1196 dz00 = _mm_sub_ps(iz0,jz0);
1197 dx01 = _mm_sub_ps(ix0,jx1);
1198 dy01 = _mm_sub_ps(iy0,jy1);
1199 dz01 = _mm_sub_ps(iz0,jz1);
1200 dx02 = _mm_sub_ps(ix0,jx2);
1201 dy02 = _mm_sub_ps(iy0,jy2);
1202 dz02 = _mm_sub_ps(iz0,jz2);
1203 dx10 = _mm_sub_ps(ix1,jx0);
1204 dy10 = _mm_sub_ps(iy1,jy0);
1205 dz10 = _mm_sub_ps(iz1,jz0);
1206 dx11 = _mm_sub_ps(ix1,jx1);
1207 dy11 = _mm_sub_ps(iy1,jy1);
1208 dz11 = _mm_sub_ps(iz1,jz1);
1209 dx12 = _mm_sub_ps(ix1,jx2);
1210 dy12 = _mm_sub_ps(iy1,jy2);
1211 dz12 = _mm_sub_ps(iz1,jz2);
1212 dx20 = _mm_sub_ps(ix2,jx0);
1213 dy20 = _mm_sub_ps(iy2,jy0);
1214 dz20 = _mm_sub_ps(iz2,jz0);
1215 dx21 = _mm_sub_ps(ix2,jx1);
1216 dy21 = _mm_sub_ps(iy2,jy1);
1217 dz21 = _mm_sub_ps(iz2,jz1);
1218 dx22 = _mm_sub_ps(ix2,jx2);
1219 dy22 = _mm_sub_ps(iy2,jy2);
1220 dz22 = _mm_sub_ps(iz2,jz2);
1222 /* Calculate squared distance and things based on it */
1223 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1224 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1225 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1226 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1227 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1228 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1229 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1230 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1231 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1233 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1234 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1235 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1236 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1237 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1238 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1239 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1240 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1241 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1243 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1244 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1245 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1246 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1247 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1248 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1249 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1250 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1251 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1253 fjx0 = _mm_setzero_ps();
1254 fjy0 = _mm_setzero_ps();
1255 fjz0 = _mm_setzero_ps();
1256 fjx1 = _mm_setzero_ps();
1257 fjy1 = _mm_setzero_ps();
1258 fjz1 = _mm_setzero_ps();
1259 fjx2 = _mm_setzero_ps();
1260 fjy2 = _mm_setzero_ps();
1261 fjz2 = _mm_setzero_ps();
1263 /**************************
1264 * CALCULATE INTERACTIONS *
1265 **************************/
1267 r00 = _mm_mul_ps(rsq00,rinv00);
1269 /* Calculate table index by multiplying r with table scale and truncate to integer */
1270 rt = _mm_mul_ps(r00,vftabscale);
1271 vfitab = _mm_cvttps_epi32(rt);
1272 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1273 vfitab = _mm_slli_epi32(vfitab,3);
1275 /* COULOMB ELECTROSTATICS */
1276 velec = _mm_mul_ps(qq00,rinv00);
1277 felec = _mm_mul_ps(velec,rinvsq00);
1279 /* CUBIC SPLINE TABLE DISPERSION */
1280 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1281 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1282 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1283 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1284 _MM_TRANSPOSE4_PS(Y,F,G,H);
1285 Heps = _mm_mul_ps(vfeps,H);
1286 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1287 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1288 fvdw6 = _mm_mul_ps(c6_00,FF);
1290 /* CUBIC SPLINE TABLE REPULSION */
1291 vfitab = _mm_add_epi32(vfitab,ifour);
1292 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1293 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1294 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1295 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1296 _MM_TRANSPOSE4_PS(Y,F,G,H);
1297 Heps = _mm_mul_ps(vfeps,H);
1298 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1299 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1300 fvdw12 = _mm_mul_ps(c12_00,FF);
1301 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1303 fscal = _mm_add_ps(felec,fvdw);
1305 /* Calculate temporary vectorial force */
1306 tx = _mm_mul_ps(fscal,dx00);
1307 ty = _mm_mul_ps(fscal,dy00);
1308 tz = _mm_mul_ps(fscal,dz00);
1310 /* Update vectorial force */
1311 fix0 = _mm_add_ps(fix0,tx);
1312 fiy0 = _mm_add_ps(fiy0,ty);
1313 fiz0 = _mm_add_ps(fiz0,tz);
1315 fjx0 = _mm_add_ps(fjx0,tx);
1316 fjy0 = _mm_add_ps(fjy0,ty);
1317 fjz0 = _mm_add_ps(fjz0,tz);
1319 /**************************
1320 * CALCULATE INTERACTIONS *
1321 **************************/
1323 /* COULOMB ELECTROSTATICS */
1324 velec = _mm_mul_ps(qq01,rinv01);
1325 felec = _mm_mul_ps(velec,rinvsq01);
1329 /* Calculate temporary vectorial force */
1330 tx = _mm_mul_ps(fscal,dx01);
1331 ty = _mm_mul_ps(fscal,dy01);
1332 tz = _mm_mul_ps(fscal,dz01);
1334 /* Update vectorial force */
1335 fix0 = _mm_add_ps(fix0,tx);
1336 fiy0 = _mm_add_ps(fiy0,ty);
1337 fiz0 = _mm_add_ps(fiz0,tz);
1339 fjx1 = _mm_add_ps(fjx1,tx);
1340 fjy1 = _mm_add_ps(fjy1,ty);
1341 fjz1 = _mm_add_ps(fjz1,tz);
1343 /**************************
1344 * CALCULATE INTERACTIONS *
1345 **************************/
1347 /* COULOMB ELECTROSTATICS */
1348 velec = _mm_mul_ps(qq02,rinv02);
1349 felec = _mm_mul_ps(velec,rinvsq02);
1353 /* Calculate temporary vectorial force */
1354 tx = _mm_mul_ps(fscal,dx02);
1355 ty = _mm_mul_ps(fscal,dy02);
1356 tz = _mm_mul_ps(fscal,dz02);
1358 /* Update vectorial force */
1359 fix0 = _mm_add_ps(fix0,tx);
1360 fiy0 = _mm_add_ps(fiy0,ty);
1361 fiz0 = _mm_add_ps(fiz0,tz);
1363 fjx2 = _mm_add_ps(fjx2,tx);
1364 fjy2 = _mm_add_ps(fjy2,ty);
1365 fjz2 = _mm_add_ps(fjz2,tz);
1367 /**************************
1368 * CALCULATE INTERACTIONS *
1369 **************************/
1371 /* COULOMB ELECTROSTATICS */
1372 velec = _mm_mul_ps(qq10,rinv10);
1373 felec = _mm_mul_ps(velec,rinvsq10);
1377 /* Calculate temporary vectorial force */
1378 tx = _mm_mul_ps(fscal,dx10);
1379 ty = _mm_mul_ps(fscal,dy10);
1380 tz = _mm_mul_ps(fscal,dz10);
1382 /* Update vectorial force */
1383 fix1 = _mm_add_ps(fix1,tx);
1384 fiy1 = _mm_add_ps(fiy1,ty);
1385 fiz1 = _mm_add_ps(fiz1,tz);
1387 fjx0 = _mm_add_ps(fjx0,tx);
1388 fjy0 = _mm_add_ps(fjy0,ty);
1389 fjz0 = _mm_add_ps(fjz0,tz);
1391 /**************************
1392 * CALCULATE INTERACTIONS *
1393 **************************/
1395 /* COULOMB ELECTROSTATICS */
1396 velec = _mm_mul_ps(qq11,rinv11);
1397 felec = _mm_mul_ps(velec,rinvsq11);
1401 /* Calculate temporary vectorial force */
1402 tx = _mm_mul_ps(fscal,dx11);
1403 ty = _mm_mul_ps(fscal,dy11);
1404 tz = _mm_mul_ps(fscal,dz11);
1406 /* Update vectorial force */
1407 fix1 = _mm_add_ps(fix1,tx);
1408 fiy1 = _mm_add_ps(fiy1,ty);
1409 fiz1 = _mm_add_ps(fiz1,tz);
1411 fjx1 = _mm_add_ps(fjx1,tx);
1412 fjy1 = _mm_add_ps(fjy1,ty);
1413 fjz1 = _mm_add_ps(fjz1,tz);
1415 /**************************
1416 * CALCULATE INTERACTIONS *
1417 **************************/
1419 /* COULOMB ELECTROSTATICS */
1420 velec = _mm_mul_ps(qq12,rinv12);
1421 felec = _mm_mul_ps(velec,rinvsq12);
1425 /* Calculate temporary vectorial force */
1426 tx = _mm_mul_ps(fscal,dx12);
1427 ty = _mm_mul_ps(fscal,dy12);
1428 tz = _mm_mul_ps(fscal,dz12);
1430 /* Update vectorial force */
1431 fix1 = _mm_add_ps(fix1,tx);
1432 fiy1 = _mm_add_ps(fiy1,ty);
1433 fiz1 = _mm_add_ps(fiz1,tz);
1435 fjx2 = _mm_add_ps(fjx2,tx);
1436 fjy2 = _mm_add_ps(fjy2,ty);
1437 fjz2 = _mm_add_ps(fjz2,tz);
1439 /**************************
1440 * CALCULATE INTERACTIONS *
1441 **************************/
1443 /* COULOMB ELECTROSTATICS */
1444 velec = _mm_mul_ps(qq20,rinv20);
1445 felec = _mm_mul_ps(velec,rinvsq20);
1449 /* Calculate temporary vectorial force */
1450 tx = _mm_mul_ps(fscal,dx20);
1451 ty = _mm_mul_ps(fscal,dy20);
1452 tz = _mm_mul_ps(fscal,dz20);
1454 /* Update vectorial force */
1455 fix2 = _mm_add_ps(fix2,tx);
1456 fiy2 = _mm_add_ps(fiy2,ty);
1457 fiz2 = _mm_add_ps(fiz2,tz);
1459 fjx0 = _mm_add_ps(fjx0,tx);
1460 fjy0 = _mm_add_ps(fjy0,ty);
1461 fjz0 = _mm_add_ps(fjz0,tz);
1463 /**************************
1464 * CALCULATE INTERACTIONS *
1465 **************************/
1467 /* COULOMB ELECTROSTATICS */
1468 velec = _mm_mul_ps(qq21,rinv21);
1469 felec = _mm_mul_ps(velec,rinvsq21);
1473 /* Calculate temporary vectorial force */
1474 tx = _mm_mul_ps(fscal,dx21);
1475 ty = _mm_mul_ps(fscal,dy21);
1476 tz = _mm_mul_ps(fscal,dz21);
1478 /* Update vectorial force */
1479 fix2 = _mm_add_ps(fix2,tx);
1480 fiy2 = _mm_add_ps(fiy2,ty);
1481 fiz2 = _mm_add_ps(fiz2,tz);
1483 fjx1 = _mm_add_ps(fjx1,tx);
1484 fjy1 = _mm_add_ps(fjy1,ty);
1485 fjz1 = _mm_add_ps(fjz1,tz);
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 /* COULOMB ELECTROSTATICS */
1492 velec = _mm_mul_ps(qq22,rinv22);
1493 felec = _mm_mul_ps(velec,rinvsq22);
1497 /* Calculate temporary vectorial force */
1498 tx = _mm_mul_ps(fscal,dx22);
1499 ty = _mm_mul_ps(fscal,dy22);
1500 tz = _mm_mul_ps(fscal,dz22);
1502 /* Update vectorial force */
1503 fix2 = _mm_add_ps(fix2,tx);
1504 fiy2 = _mm_add_ps(fiy2,ty);
1505 fiz2 = _mm_add_ps(fiz2,tz);
1507 fjx2 = _mm_add_ps(fjx2,tx);
1508 fjy2 = _mm_add_ps(fjy2,ty);
1509 fjz2 = _mm_add_ps(fjz2,tz);
1511 fjptrA = f+j_coord_offsetA;
1512 fjptrB = f+j_coord_offsetB;
1513 fjptrC = f+j_coord_offsetC;
1514 fjptrD = f+j_coord_offsetD;
1516 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1517 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1519 /* Inner loop uses 270 flops */
1522 if(jidx<j_index_end)
1525 /* Get j neighbor index, and coordinate index */
1526 jnrlistA = jjnr[jidx];
1527 jnrlistB = jjnr[jidx+1];
1528 jnrlistC = jjnr[jidx+2];
1529 jnrlistD = jjnr[jidx+3];
1530 /* Sign of each element will be negative for non-real atoms.
1531 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1532 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1534 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1535 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1536 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1537 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1538 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1539 j_coord_offsetA = DIM*jnrA;
1540 j_coord_offsetB = DIM*jnrB;
1541 j_coord_offsetC = DIM*jnrC;
1542 j_coord_offsetD = DIM*jnrD;
1544 /* load j atom coordinates */
1545 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1546 x+j_coord_offsetC,x+j_coord_offsetD,
1547 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1549 /* Calculate displacement vector */
1550 dx00 = _mm_sub_ps(ix0,jx0);
1551 dy00 = _mm_sub_ps(iy0,jy0);
1552 dz00 = _mm_sub_ps(iz0,jz0);
1553 dx01 = _mm_sub_ps(ix0,jx1);
1554 dy01 = _mm_sub_ps(iy0,jy1);
1555 dz01 = _mm_sub_ps(iz0,jz1);
1556 dx02 = _mm_sub_ps(ix0,jx2);
1557 dy02 = _mm_sub_ps(iy0,jy2);
1558 dz02 = _mm_sub_ps(iz0,jz2);
1559 dx10 = _mm_sub_ps(ix1,jx0);
1560 dy10 = _mm_sub_ps(iy1,jy0);
1561 dz10 = _mm_sub_ps(iz1,jz0);
1562 dx11 = _mm_sub_ps(ix1,jx1);
1563 dy11 = _mm_sub_ps(iy1,jy1);
1564 dz11 = _mm_sub_ps(iz1,jz1);
1565 dx12 = _mm_sub_ps(ix1,jx2);
1566 dy12 = _mm_sub_ps(iy1,jy2);
1567 dz12 = _mm_sub_ps(iz1,jz2);
1568 dx20 = _mm_sub_ps(ix2,jx0);
1569 dy20 = _mm_sub_ps(iy2,jy0);
1570 dz20 = _mm_sub_ps(iz2,jz0);
1571 dx21 = _mm_sub_ps(ix2,jx1);
1572 dy21 = _mm_sub_ps(iy2,jy1);
1573 dz21 = _mm_sub_ps(iz2,jz1);
1574 dx22 = _mm_sub_ps(ix2,jx2);
1575 dy22 = _mm_sub_ps(iy2,jy2);
1576 dz22 = _mm_sub_ps(iz2,jz2);
1578 /* Calculate squared distance and things based on it */
1579 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1580 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1581 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1582 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1583 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1584 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1585 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1586 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1587 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1589 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1590 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1591 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1592 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1593 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1594 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1595 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1596 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1597 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1599 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1600 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1601 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1602 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1603 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1604 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1605 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1606 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1607 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1609 fjx0 = _mm_setzero_ps();
1610 fjy0 = _mm_setzero_ps();
1611 fjz0 = _mm_setzero_ps();
1612 fjx1 = _mm_setzero_ps();
1613 fjy1 = _mm_setzero_ps();
1614 fjz1 = _mm_setzero_ps();
1615 fjx2 = _mm_setzero_ps();
1616 fjy2 = _mm_setzero_ps();
1617 fjz2 = _mm_setzero_ps();
1619 /**************************
1620 * CALCULATE INTERACTIONS *
1621 **************************/
1623 r00 = _mm_mul_ps(rsq00,rinv00);
1624 r00 = _mm_andnot_ps(dummy_mask,r00);
1626 /* Calculate table index by multiplying r with table scale and truncate to integer */
1627 rt = _mm_mul_ps(r00,vftabscale);
1628 vfitab = _mm_cvttps_epi32(rt);
1629 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1630 vfitab = _mm_slli_epi32(vfitab,3);
1632 /* COULOMB ELECTROSTATICS */
1633 velec = _mm_mul_ps(qq00,rinv00);
1634 felec = _mm_mul_ps(velec,rinvsq00);
1636 /* CUBIC SPLINE TABLE DISPERSION */
1637 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1638 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1639 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1640 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1641 _MM_TRANSPOSE4_PS(Y,F,G,H);
1642 Heps = _mm_mul_ps(vfeps,H);
1643 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1644 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1645 fvdw6 = _mm_mul_ps(c6_00,FF);
1647 /* CUBIC SPLINE TABLE REPULSION */
1648 vfitab = _mm_add_epi32(vfitab,ifour);
1649 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1650 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1651 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1652 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1653 _MM_TRANSPOSE4_PS(Y,F,G,H);
1654 Heps = _mm_mul_ps(vfeps,H);
1655 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1656 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1657 fvdw12 = _mm_mul_ps(c12_00,FF);
1658 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1660 fscal = _mm_add_ps(felec,fvdw);
1662 fscal = _mm_andnot_ps(dummy_mask,fscal);
1664 /* Calculate temporary vectorial force */
1665 tx = _mm_mul_ps(fscal,dx00);
1666 ty = _mm_mul_ps(fscal,dy00);
1667 tz = _mm_mul_ps(fscal,dz00);
1669 /* Update vectorial force */
1670 fix0 = _mm_add_ps(fix0,tx);
1671 fiy0 = _mm_add_ps(fiy0,ty);
1672 fiz0 = _mm_add_ps(fiz0,tz);
1674 fjx0 = _mm_add_ps(fjx0,tx);
1675 fjy0 = _mm_add_ps(fjy0,ty);
1676 fjz0 = _mm_add_ps(fjz0,tz);
1678 /**************************
1679 * CALCULATE INTERACTIONS *
1680 **************************/
1682 /* COULOMB ELECTROSTATICS */
1683 velec = _mm_mul_ps(qq01,rinv01);
1684 felec = _mm_mul_ps(velec,rinvsq01);
1688 fscal = _mm_andnot_ps(dummy_mask,fscal);
1690 /* Calculate temporary vectorial force */
1691 tx = _mm_mul_ps(fscal,dx01);
1692 ty = _mm_mul_ps(fscal,dy01);
1693 tz = _mm_mul_ps(fscal,dz01);
1695 /* Update vectorial force */
1696 fix0 = _mm_add_ps(fix0,tx);
1697 fiy0 = _mm_add_ps(fiy0,ty);
1698 fiz0 = _mm_add_ps(fiz0,tz);
1700 fjx1 = _mm_add_ps(fjx1,tx);
1701 fjy1 = _mm_add_ps(fjy1,ty);
1702 fjz1 = _mm_add_ps(fjz1,tz);
1704 /**************************
1705 * CALCULATE INTERACTIONS *
1706 **************************/
1708 /* COULOMB ELECTROSTATICS */
1709 velec = _mm_mul_ps(qq02,rinv02);
1710 felec = _mm_mul_ps(velec,rinvsq02);
1714 fscal = _mm_andnot_ps(dummy_mask,fscal);
1716 /* Calculate temporary vectorial force */
1717 tx = _mm_mul_ps(fscal,dx02);
1718 ty = _mm_mul_ps(fscal,dy02);
1719 tz = _mm_mul_ps(fscal,dz02);
1721 /* Update vectorial force */
1722 fix0 = _mm_add_ps(fix0,tx);
1723 fiy0 = _mm_add_ps(fiy0,ty);
1724 fiz0 = _mm_add_ps(fiz0,tz);
1726 fjx2 = _mm_add_ps(fjx2,tx);
1727 fjy2 = _mm_add_ps(fjy2,ty);
1728 fjz2 = _mm_add_ps(fjz2,tz);
1730 /**************************
1731 * CALCULATE INTERACTIONS *
1732 **************************/
1734 /* COULOMB ELECTROSTATICS */
1735 velec = _mm_mul_ps(qq10,rinv10);
1736 felec = _mm_mul_ps(velec,rinvsq10);
1740 fscal = _mm_andnot_ps(dummy_mask,fscal);
1742 /* Calculate temporary vectorial force */
1743 tx = _mm_mul_ps(fscal,dx10);
1744 ty = _mm_mul_ps(fscal,dy10);
1745 tz = _mm_mul_ps(fscal,dz10);
1747 /* Update vectorial force */
1748 fix1 = _mm_add_ps(fix1,tx);
1749 fiy1 = _mm_add_ps(fiy1,ty);
1750 fiz1 = _mm_add_ps(fiz1,tz);
1752 fjx0 = _mm_add_ps(fjx0,tx);
1753 fjy0 = _mm_add_ps(fjy0,ty);
1754 fjz0 = _mm_add_ps(fjz0,tz);
1756 /**************************
1757 * CALCULATE INTERACTIONS *
1758 **************************/
1760 /* COULOMB ELECTROSTATICS */
1761 velec = _mm_mul_ps(qq11,rinv11);
1762 felec = _mm_mul_ps(velec,rinvsq11);
1766 fscal = _mm_andnot_ps(dummy_mask,fscal);
1768 /* Calculate temporary vectorial force */
1769 tx = _mm_mul_ps(fscal,dx11);
1770 ty = _mm_mul_ps(fscal,dy11);
1771 tz = _mm_mul_ps(fscal,dz11);
1773 /* Update vectorial force */
1774 fix1 = _mm_add_ps(fix1,tx);
1775 fiy1 = _mm_add_ps(fiy1,ty);
1776 fiz1 = _mm_add_ps(fiz1,tz);
1778 fjx1 = _mm_add_ps(fjx1,tx);
1779 fjy1 = _mm_add_ps(fjy1,ty);
1780 fjz1 = _mm_add_ps(fjz1,tz);
1782 /**************************
1783 * CALCULATE INTERACTIONS *
1784 **************************/
1786 /* COULOMB ELECTROSTATICS */
1787 velec = _mm_mul_ps(qq12,rinv12);
1788 felec = _mm_mul_ps(velec,rinvsq12);
1792 fscal = _mm_andnot_ps(dummy_mask,fscal);
1794 /* Calculate temporary vectorial force */
1795 tx = _mm_mul_ps(fscal,dx12);
1796 ty = _mm_mul_ps(fscal,dy12);
1797 tz = _mm_mul_ps(fscal,dz12);
1799 /* Update vectorial force */
1800 fix1 = _mm_add_ps(fix1,tx);
1801 fiy1 = _mm_add_ps(fiy1,ty);
1802 fiz1 = _mm_add_ps(fiz1,tz);
1804 fjx2 = _mm_add_ps(fjx2,tx);
1805 fjy2 = _mm_add_ps(fjy2,ty);
1806 fjz2 = _mm_add_ps(fjz2,tz);
1808 /**************************
1809 * CALCULATE INTERACTIONS *
1810 **************************/
1812 /* COULOMB ELECTROSTATICS */
1813 velec = _mm_mul_ps(qq20,rinv20);
1814 felec = _mm_mul_ps(velec,rinvsq20);
1818 fscal = _mm_andnot_ps(dummy_mask,fscal);
1820 /* Calculate temporary vectorial force */
1821 tx = _mm_mul_ps(fscal,dx20);
1822 ty = _mm_mul_ps(fscal,dy20);
1823 tz = _mm_mul_ps(fscal,dz20);
1825 /* Update vectorial force */
1826 fix2 = _mm_add_ps(fix2,tx);
1827 fiy2 = _mm_add_ps(fiy2,ty);
1828 fiz2 = _mm_add_ps(fiz2,tz);
1830 fjx0 = _mm_add_ps(fjx0,tx);
1831 fjy0 = _mm_add_ps(fjy0,ty);
1832 fjz0 = _mm_add_ps(fjz0,tz);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 /* COULOMB ELECTROSTATICS */
1839 velec = _mm_mul_ps(qq21,rinv21);
1840 felec = _mm_mul_ps(velec,rinvsq21);
1844 fscal = _mm_andnot_ps(dummy_mask,fscal);
1846 /* Calculate temporary vectorial force */
1847 tx = _mm_mul_ps(fscal,dx21);
1848 ty = _mm_mul_ps(fscal,dy21);
1849 tz = _mm_mul_ps(fscal,dz21);
1851 /* Update vectorial force */
1852 fix2 = _mm_add_ps(fix2,tx);
1853 fiy2 = _mm_add_ps(fiy2,ty);
1854 fiz2 = _mm_add_ps(fiz2,tz);
1856 fjx1 = _mm_add_ps(fjx1,tx);
1857 fjy1 = _mm_add_ps(fjy1,ty);
1858 fjz1 = _mm_add_ps(fjz1,tz);
1860 /**************************
1861 * CALCULATE INTERACTIONS *
1862 **************************/
1864 /* COULOMB ELECTROSTATICS */
1865 velec = _mm_mul_ps(qq22,rinv22);
1866 felec = _mm_mul_ps(velec,rinvsq22);
1870 fscal = _mm_andnot_ps(dummy_mask,fscal);
1872 /* Calculate temporary vectorial force */
1873 tx = _mm_mul_ps(fscal,dx22);
1874 ty = _mm_mul_ps(fscal,dy22);
1875 tz = _mm_mul_ps(fscal,dz22);
1877 /* Update vectorial force */
1878 fix2 = _mm_add_ps(fix2,tx);
1879 fiy2 = _mm_add_ps(fiy2,ty);
1880 fiz2 = _mm_add_ps(fiz2,tz);
1882 fjx2 = _mm_add_ps(fjx2,tx);
1883 fjy2 = _mm_add_ps(fjy2,ty);
1884 fjz2 = _mm_add_ps(fjz2,tz);
1886 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1887 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1888 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1889 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1891 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1892 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1894 /* Inner loop uses 271 flops */
1897 /* End of innermost loop */
1899 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1900 f+i_coord_offset,fshift+i_shift_offset);
1902 /* Increment number of inner iterations */
1903 inneriter += j_index_end - j_index_start;
1905 /* Outer loop uses 18 flops */
1908 /* Increment number of outer iterations */
1911 /* Update outer/inner flops */
1913 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*271);