2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_double
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
/* Potential-and-force kernel for three-site water against three-site water.
 *
 * For every i water in the neighbor list this code evaluates plain Coulomb
 * energy and force for all nine site pairs (i sites 0-2 against j sites 0-2)
 * and a cubic-spline tabulated Van der Waals term for site pair 0-0 only.
 * Two j waters are handled per SIMD register (lanes A and B).
 *
 * Inputs as used in the visible lines:
 *   nlist       - jindex, shift and iinr are read.
 *   fr          - shift_vec, fshift, epsfac and ntype are read.
 *   mdatoms     - chargeA and typeA are read.
 *   kernel_data - table_vdw (data, scale) is read; energygrp_elec and
 *                 energygrp_vdw receive the summed potentials.
 *   nrnb        - passed to inc_nrnb together with the flop estimate.
 *   xx, ff      - not referenced by name in the visible lines; coordinates
 *                 and forces are accessed through the locals x and f,
 *                 presumably initialised from xx and ff - TODO confirm.
 *
 * NOTE(review): the visible text is incomplete. Names such as vfitab, vftab,
 * charge, vdwtype, vdwparam, vdwioffset0, nvdwtype, jnrA and jnrB are used
 * without a visible declaration, and nri, ggid, x, f, outeriter and inneriter
 * are used without a visible initialisation. The embedded original line
 * numbers skip values, so lines appear to have been lost; compare against the
 * kernel generator output before relying on this text.
 */
44 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
99 __m128d dummy_mask,cutoff_mask;
100 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one = _mm_set1_pd(1.0);
102 __m128d two = _mm_set1_pd(2.0);
108 jindex = nlist->jindex;
110 shiftidx = nlist->shift;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_pd(fr->epsfac);
115 charge = mdatoms->chargeA;
116 nvdwtype = fr->ntype;
118 vdwtype = mdatoms->typeA;
120 vftab = kernel_data->table_vdw->data;
121 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
123 /* Setup water-specific parameters */
/* Charges and the VdW type are read once, from the atoms of the first i
 * water (nlist->iinr[0]); the same values are reused for the j side below.
 * The i charges iq0..iq2 are pre-multiplied by facel (fr->epsfac).
 * NOTE(review): this presumably relies on every water in the list having
 * identical parameters - confirm with the caller. */
124 inr = nlist->iinr[0];
125 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
126 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
/* Row offset into vdwparam for the i type; parameters are read below as
 * consecutive (c6, c12) pairs, hence the factor 2. */
128 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
130 jq0 = _mm_set1_pd(charge[inr+0]);
131 jq1 = _mm_set1_pd(charge[inr+1]);
132 jq2 = _mm_set1_pd(charge[inr+2]);
133 vdwjidx0A = 2*vdwtype[inr+0];
/* Charge products for the nine site pairs; c6/c12 are only set up for
 * pair 0-0, the only pair with a VdW term in this kernel. */
134 qq00 = _mm_mul_pd(iq0,jq0);
135 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
137 qq01 = _mm_mul_pd(iq0,jq1);
138 qq02 = _mm_mul_pd(iq0,jq2);
139 qq10 = _mm_mul_pd(iq1,jq0);
140 qq11 = _mm_mul_pd(iq1,jq1);
141 qq12 = _mm_mul_pd(iq1,jq2);
142 qq20 = _mm_mul_pd(iq2,jq0);
143 qq21 = _mm_mul_pd(iq2,jq1);
144 qq22 = _mm_mul_pd(iq2,jq2);
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
/* NOTE(review): no per-iteration assignment of inr is visible here, so as
 * shown inr still holds nlist->iinr[0] - verify against the original. */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Clear the force accumulators of the three i sites for this list entry. */
172 fix0 = _mm_setzero_pd();
173 fiy0 = _mm_setzero_pd();
174 fiz0 = _mm_setzero_pd();
175 fix1 = _mm_setzero_pd();
176 fiy1 = _mm_setzero_pd();
177 fiz1 = _mm_setzero_pd();
178 fix2 = _mm_setzero_pd();
179 fiy2 = _mm_setzero_pd();
180 fiz2 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
184 vvdwsum = _mm_setzero_pd();
186 /* Start inner kernel loop */
/* Two j entries are consumed per iteration (jidx advances by 2). The bound
 * j_index_end-1 stops before an odd last entry, which the single-j code
 * further down handles. */
187 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
190 /* Get j neighbor index, and coordinate index */
/* NOTE(review): the assignments of jnrA and jnrB are not visible here. */
193 j_coord_offsetA = DIM*jnrA;
194 j_coord_offsetB = DIM*jnrB;
196 /* load j atom coordinates */
197 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
198 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
200 /* Calculate displacement vector */
/* dXY = position of i site X minus position of j site Y. */
201 dx00 = _mm_sub_pd(ix0,jx0);
202 dy00 = _mm_sub_pd(iy0,jy0);
203 dz00 = _mm_sub_pd(iz0,jz0);
204 dx01 = _mm_sub_pd(ix0,jx1);
205 dy01 = _mm_sub_pd(iy0,jy1);
206 dz01 = _mm_sub_pd(iz0,jz1);
207 dx02 = _mm_sub_pd(ix0,jx2);
208 dy02 = _mm_sub_pd(iy0,jy2);
209 dz02 = _mm_sub_pd(iz0,jz2);
210 dx10 = _mm_sub_pd(ix1,jx0);
211 dy10 = _mm_sub_pd(iy1,jy0);
212 dz10 = _mm_sub_pd(iz1,jz0);
213 dx11 = _mm_sub_pd(ix1,jx1);
214 dy11 = _mm_sub_pd(iy1,jy1);
215 dz11 = _mm_sub_pd(iz1,jz1);
216 dx12 = _mm_sub_pd(ix1,jx2);
217 dy12 = _mm_sub_pd(iy1,jy2);
218 dz12 = _mm_sub_pd(iz1,jz2);
219 dx20 = _mm_sub_pd(ix2,jx0);
220 dy20 = _mm_sub_pd(iy2,jy0);
221 dz20 = _mm_sub_pd(iz2,jz0);
222 dx21 = _mm_sub_pd(ix2,jx1);
223 dy21 = _mm_sub_pd(iy2,jy1);
224 dz21 = _mm_sub_pd(iz2,jz1);
225 dx22 = _mm_sub_pd(ix2,jx2);
226 dy22 = _mm_sub_pd(iy2,jy2);
227 dz22 = _mm_sub_pd(iz2,jz2);
229 /* Calculate squared distance and things based on it */
230 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
232 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
233 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
234 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
235 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
236 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
237 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
238 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
240 rinv00 = gmx_mm_invsqrt_pd(rsq00);
241 rinv01 = gmx_mm_invsqrt_pd(rsq01);
242 rinv02 = gmx_mm_invsqrt_pd(rsq02);
243 rinv10 = gmx_mm_invsqrt_pd(rsq10);
244 rinv11 = gmx_mm_invsqrt_pd(rsq11);
245 rinv12 = gmx_mm_invsqrt_pd(rsq12);
246 rinv20 = gmx_mm_invsqrt_pd(rsq20);
247 rinv21 = gmx_mm_invsqrt_pd(rsq21);
248 rinv22 = gmx_mm_invsqrt_pd(rsq22);
250 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
251 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
252 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
253 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
254 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
255 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
256 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
257 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
258 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
/* Per-iteration force accumulators for the three sites of the j waters. */
260 fjx0 = _mm_setzero_pd();
261 fjy0 = _mm_setzero_pd();
262 fjz0 = _mm_setzero_pd();
263 fjx1 = _mm_setzero_pd();
264 fjy1 = _mm_setzero_pd();
265 fjz1 = _mm_setzero_pd();
266 fjx2 = _mm_setzero_pd();
267 fjy2 = _mm_setzero_pd();
268 fjz2 = _mm_setzero_pd();
270 /**************************
271 * CALCULATE INTERACTIONS *
272 **************************/
/* Pair i0-j0: Coulomb plus tabulated VdW. r00 = rsq * (1/r) = r. */
274 r00 = _mm_mul_pd(rsq00,rinv00);
276 /* Calculate table index by multiplying r with table scale and truncate to integer */
/* vfeps is the fractional part of rt. The left shift by 3 multiplies the
 * index by 8, matching the 4+4 values read per table point below
 * (dispersion at offsets 0..3, repulsion at offsets 4..7). */
277 rt = _mm_mul_pd(r00,vftabscale);
278 vfitab = _mm_cvttpd_epi32(rt);
279 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
280 vfitab = _mm_slli_epi32(vfitab,3);
282 /* COULOMB ELECTROSTATICS */
/* velec = qq/r, felec = velec/r^2 (scalar force divided by r). */
283 velec = _mm_mul_pd(qq00,rinv00);
284 felec = _mm_mul_pd(velec,rinvsq00);
286 /* CUBIC SPLINE TABLE DISPERSION */
/* Two values are loaded per lane (index element 0 for lane A, element 1 for
 * lane B) and then transposed so that each register holds one quantity for
 * both j waters - presumably what GMX_MM_TRANSPOSE2_PD does; likewise for
 * G and H. VV = Y + eps*(F + eps*(G + eps*H)) is the tabulated potential
 * and FF = F + 2*eps*G + 3*eps^2*H its derivative with respect to eps. */
287 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
288 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
289 GMX_MM_TRANSPOSE2_PD(Y,F);
290 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
291 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
292 GMX_MM_TRANSPOSE2_PD(G,H);
293 Heps = _mm_mul_pd(vfeps,H);
294 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
295 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
296 vvdw6 = _mm_mul_pd(c6_00,VV);
297 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
298 fvdw6 = _mm_mul_pd(c6_00,FF);
300 /* CUBIC SPLINE TABLE REPULSION */
/* Advance the table index by 4 to reach the repulsion entries, then repeat
 * the same spline evaluation with c12. */
301 vfitab = _mm_add_epi32(vfitab,ifour);
302 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
303 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
304 GMX_MM_TRANSPOSE2_PD(Y,F);
305 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
306 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
307 GMX_MM_TRANSPOSE2_PD(G,H);
308 Heps = _mm_mul_pd(vfeps,H);
309 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
310 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
311 vvdw12 = _mm_mul_pd(c12_00,VV);
312 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
313 fvdw12 = _mm_mul_pd(c12_00,FF);
314 vvdw = _mm_add_pd(vvdw12,vvdw6);
/* The xor with signbit flips the sign of both lanes, giving
 * fvdw = -(fvdw6+fvdw12)*vftabscale/r. */
315 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
317 /* Update potential sum for this i atom from the interaction with this j atom. */
318 velecsum = _mm_add_pd(velecsum,velec);
319 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
321 fscal = _mm_add_pd(felec,fvdw);
323 /* Calculate temporary vectorial force */
324 tx = _mm_mul_pd(fscal,dx00);
325 ty = _mm_mul_pd(fscal,dy00);
326 tz = _mm_mul_pd(fscal,dz00);
328 /* Update vectorial force */
/* The same vector is added to both the i and the j accumulator; the j sums
 * are handed to the "decrement" helper at the end of the iteration. */
329 fix0 = _mm_add_pd(fix0,tx);
330 fiy0 = _mm_add_pd(fiy0,ty);
331 fiz0 = _mm_add_pd(fiz0,tz);
333 fjx0 = _mm_add_pd(fjx0,tx);
334 fjy0 = _mm_add_pd(fjy0,ty);
335 fjz0 = _mm_add_pd(fjz0,tz);
337 /**************************
338 * CALCULATE INTERACTIONS *
339 **************************/
/* Pair i0-j1: Coulomb only. */
341 /* COULOMB ELECTROSTATICS */
342 velec = _mm_mul_pd(qq01,rinv01);
343 felec = _mm_mul_pd(velec,rinvsq01);
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): in the visible lines fscal is not reassigned after felec is
 * computed, so as shown it still holds the value from pair 0-0. The same
 * holds for all eight Coulomb-only pairs in this loop. The embedded line
 * numbers skip here, so an assignment of fscal from felec may have been
 * lost - verify against the generator output. */
350 /* Calculate temporary vectorial force */
351 tx = _mm_mul_pd(fscal,dx01);
352 ty = _mm_mul_pd(fscal,dy01);
353 tz = _mm_mul_pd(fscal,dz01);
355 /* Update vectorial force */
356 fix0 = _mm_add_pd(fix0,tx);
357 fiy0 = _mm_add_pd(fiy0,ty);
358 fiz0 = _mm_add_pd(fiz0,tz);
360 fjx1 = _mm_add_pd(fjx1,tx);
361 fjy1 = _mm_add_pd(fjy1,ty);
362 fjz1 = _mm_add_pd(fjz1,tz);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
/* Pair i0-j2: Coulomb only. */
368 /* COULOMB ELECTROSTATICS */
369 velec = _mm_mul_pd(qq02,rinv02);
370 felec = _mm_mul_pd(velec,rinvsq02);
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 velecsum = _mm_add_pd(velecsum,velec);
377 /* Calculate temporary vectorial force */
378 tx = _mm_mul_pd(fscal,dx02);
379 ty = _mm_mul_pd(fscal,dy02);
380 tz = _mm_mul_pd(fscal,dz02);
382 /* Update vectorial force */
383 fix0 = _mm_add_pd(fix0,tx);
384 fiy0 = _mm_add_pd(fiy0,ty);
385 fiz0 = _mm_add_pd(fiz0,tz);
387 fjx2 = _mm_add_pd(fjx2,tx);
388 fjy2 = _mm_add_pd(fjy2,ty);
389 fjz2 = _mm_add_pd(fjz2,tz);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
/* Pair i1-j0: Coulomb only. */
395 /* COULOMB ELECTROSTATICS */
396 velec = _mm_mul_pd(qq10,rinv10);
397 felec = _mm_mul_pd(velec,rinvsq10);
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm_add_pd(velecsum,velec);
404 /* Calculate temporary vectorial force */
405 tx = _mm_mul_pd(fscal,dx10);
406 ty = _mm_mul_pd(fscal,dy10);
407 tz = _mm_mul_pd(fscal,dz10);
409 /* Update vectorial force */
410 fix1 = _mm_add_pd(fix1,tx);
411 fiy1 = _mm_add_pd(fiy1,ty);
412 fiz1 = _mm_add_pd(fiz1,tz);
414 fjx0 = _mm_add_pd(fjx0,tx);
415 fjy0 = _mm_add_pd(fjy0,ty);
416 fjz0 = _mm_add_pd(fjz0,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
/* Pair i1-j1: Coulomb only. */
422 /* COULOMB ELECTROSTATICS */
423 velec = _mm_mul_pd(qq11,rinv11);
424 felec = _mm_mul_pd(velec,rinvsq11);
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velecsum = _mm_add_pd(velecsum,velec);
431 /* Calculate temporary vectorial force */
432 tx = _mm_mul_pd(fscal,dx11);
433 ty = _mm_mul_pd(fscal,dy11);
434 tz = _mm_mul_pd(fscal,dz11);
436 /* Update vectorial force */
437 fix1 = _mm_add_pd(fix1,tx);
438 fiy1 = _mm_add_pd(fiy1,ty);
439 fiz1 = _mm_add_pd(fiz1,tz);
441 fjx1 = _mm_add_pd(fjx1,tx);
442 fjy1 = _mm_add_pd(fjy1,ty);
443 fjz1 = _mm_add_pd(fjz1,tz);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
/* Pair i1-j2: Coulomb only. */
449 /* COULOMB ELECTROSTATICS */
450 velec = _mm_mul_pd(qq12,rinv12);
451 felec = _mm_mul_pd(velec,rinvsq12);
453 /* Update potential sum for this i atom from the interaction with this j atom. */
454 velecsum = _mm_add_pd(velecsum,velec);
458 /* Calculate temporary vectorial force */
459 tx = _mm_mul_pd(fscal,dx12);
460 ty = _mm_mul_pd(fscal,dy12);
461 tz = _mm_mul_pd(fscal,dz12);
463 /* Update vectorial force */
464 fix1 = _mm_add_pd(fix1,tx);
465 fiy1 = _mm_add_pd(fiy1,ty);
466 fiz1 = _mm_add_pd(fiz1,tz);
468 fjx2 = _mm_add_pd(fjx2,tx);
469 fjy2 = _mm_add_pd(fjy2,ty);
470 fjz2 = _mm_add_pd(fjz2,tz);
472 /**************************
473 * CALCULATE INTERACTIONS *
474 **************************/
/* Pair i2-j0: Coulomb only. */
476 /* COULOMB ELECTROSTATICS */
477 velec = _mm_mul_pd(qq20,rinv20);
478 felec = _mm_mul_pd(velec,rinvsq20);
480 /* Update potential sum for this i atom from the interaction with this j atom. */
481 velecsum = _mm_add_pd(velecsum,velec);
485 /* Calculate temporary vectorial force */
486 tx = _mm_mul_pd(fscal,dx20);
487 ty = _mm_mul_pd(fscal,dy20);
488 tz = _mm_mul_pd(fscal,dz20);
490 /* Update vectorial force */
491 fix2 = _mm_add_pd(fix2,tx);
492 fiy2 = _mm_add_pd(fiy2,ty);
493 fiz2 = _mm_add_pd(fiz2,tz);
495 fjx0 = _mm_add_pd(fjx0,tx);
496 fjy0 = _mm_add_pd(fjy0,ty);
497 fjz0 = _mm_add_pd(fjz0,tz);
499 /**************************
500 * CALCULATE INTERACTIONS *
501 **************************/
/* Pair i2-j1: Coulomb only. */
503 /* COULOMB ELECTROSTATICS */
504 velec = _mm_mul_pd(qq21,rinv21);
505 felec = _mm_mul_pd(velec,rinvsq21);
507 /* Update potential sum for this i atom from the interaction with this j atom. */
508 velecsum = _mm_add_pd(velecsum,velec);
512 /* Calculate temporary vectorial force */
513 tx = _mm_mul_pd(fscal,dx21);
514 ty = _mm_mul_pd(fscal,dy21);
515 tz = _mm_mul_pd(fscal,dz21);
517 /* Update vectorial force */
518 fix2 = _mm_add_pd(fix2,tx);
519 fiy2 = _mm_add_pd(fiy2,ty);
520 fiz2 = _mm_add_pd(fiz2,tz);
522 fjx1 = _mm_add_pd(fjx1,tx);
523 fjy1 = _mm_add_pd(fjy1,ty);
524 fjz1 = _mm_add_pd(fjz1,tz);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
/* Pair i2-j2: Coulomb only. */
530 /* COULOMB ELECTROSTATICS */
531 velec = _mm_mul_pd(qq22,rinv22);
532 felec = _mm_mul_pd(velec,rinvsq22);
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velecsum = _mm_add_pd(velecsum,velec);
539 /* Calculate temporary vectorial force */
540 tx = _mm_mul_pd(fscal,dx22);
541 ty = _mm_mul_pd(fscal,dy22);
542 tz = _mm_mul_pd(fscal,dz22);
544 /* Update vectorial force */
545 fix2 = _mm_add_pd(fix2,tx);
546 fiy2 = _mm_add_pd(fiy2,ty);
547 fiz2 = _mm_add_pd(fiz2,tz);
549 fjx2 = _mm_add_pd(fjx2,tx);
550 fjy2 = _mm_add_pd(fjy2,ty);
551 fjz2 = _mm_add_pd(fjz2,tz);
/* Hand the accumulated j forces for both j waters to the force array f. */
553 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
555 /* Inner loop uses 287 flops */
/* Single-j section: the same nine interactions for one j water, using the
 * 1ptr load/decrement helpers and only j_coord_offsetA. The upper SIMD lane
 * carries no real j data; energies and fscal are cleared in that lane
 * through the unpacklo-with-zero calls below.
 * NOTE(review): the end of the paired loop, the guard that selects this
 * section and the assignment of jnrA are not visible here. */
562 j_coord_offsetA = DIM*jnrA;
564 /* load j atom coordinates */
565 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
566 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
568 /* Calculate displacement vector */
569 dx00 = _mm_sub_pd(ix0,jx0);
570 dy00 = _mm_sub_pd(iy0,jy0);
571 dz00 = _mm_sub_pd(iz0,jz0);
572 dx01 = _mm_sub_pd(ix0,jx1);
573 dy01 = _mm_sub_pd(iy0,jy1);
574 dz01 = _mm_sub_pd(iz0,jz1);
575 dx02 = _mm_sub_pd(ix0,jx2);
576 dy02 = _mm_sub_pd(iy0,jy2);
577 dz02 = _mm_sub_pd(iz0,jz2);
578 dx10 = _mm_sub_pd(ix1,jx0);
579 dy10 = _mm_sub_pd(iy1,jy0);
580 dz10 = _mm_sub_pd(iz1,jz0);
581 dx11 = _mm_sub_pd(ix1,jx1);
582 dy11 = _mm_sub_pd(iy1,jy1);
583 dz11 = _mm_sub_pd(iz1,jz1);
584 dx12 = _mm_sub_pd(ix1,jx2);
585 dy12 = _mm_sub_pd(iy1,jy2);
586 dz12 = _mm_sub_pd(iz1,jz2);
587 dx20 = _mm_sub_pd(ix2,jx0);
588 dy20 = _mm_sub_pd(iy2,jy0);
589 dz20 = _mm_sub_pd(iz2,jz0);
590 dx21 = _mm_sub_pd(ix2,jx1);
591 dy21 = _mm_sub_pd(iy2,jy1);
592 dz21 = _mm_sub_pd(iz2,jz1);
593 dx22 = _mm_sub_pd(ix2,jx2);
594 dy22 = _mm_sub_pd(iy2,jy2);
595 dz22 = _mm_sub_pd(iz2,jz2);
597 /* Calculate squared distance and things based on it */
598 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
599 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
600 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
601 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
602 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
603 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
604 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
605 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
606 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
608 rinv00 = gmx_mm_invsqrt_pd(rsq00);
609 rinv01 = gmx_mm_invsqrt_pd(rsq01);
610 rinv02 = gmx_mm_invsqrt_pd(rsq02);
611 rinv10 = gmx_mm_invsqrt_pd(rsq10);
612 rinv11 = gmx_mm_invsqrt_pd(rsq11);
613 rinv12 = gmx_mm_invsqrt_pd(rsq12);
614 rinv20 = gmx_mm_invsqrt_pd(rsq20);
615 rinv21 = gmx_mm_invsqrt_pd(rsq21);
616 rinv22 = gmx_mm_invsqrt_pd(rsq22);
618 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
619 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
620 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
621 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
622 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
623 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
624 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
625 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
626 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
628 fjx0 = _mm_setzero_pd();
629 fjy0 = _mm_setzero_pd();
630 fjz0 = _mm_setzero_pd();
631 fjx1 = _mm_setzero_pd();
632 fjy1 = _mm_setzero_pd();
633 fjz1 = _mm_setzero_pd();
634 fjx2 = _mm_setzero_pd();
635 fjy2 = _mm_setzero_pd();
636 fjz2 = _mm_setzero_pd();
638 /**************************
639 * CALCULATE INTERACTIONS *
640 **************************/
/* Pair i0-j0 (single j): Coulomb plus tabulated VdW. */
642 r00 = _mm_mul_pd(rsq00,rinv00);
644 /* Calculate table index by multiplying r with table scale and truncate to integer */
645 rt = _mm_mul_pd(r00,vftabscale);
646 vfitab = _mm_cvttpd_epi32(rt);
647 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
648 vfitab = _mm_slli_epi32(vfitab,3);
650 /* COULOMB ELECTROSTATICS */
651 velec = _mm_mul_pd(qq00,rinv00);
652 felec = _mm_mul_pd(velec,rinvsq00);
654 /* CUBIC SPLINE TABLE DISPERSION */
/* Only index element 0 is used for table loads; the registers that held
 * the second j water's data in the paired loop are set to zero instead. */
655 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
656 F = _mm_setzero_pd();
657 GMX_MM_TRANSPOSE2_PD(Y,F);
658 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
659 H = _mm_setzero_pd();
660 GMX_MM_TRANSPOSE2_PD(G,H);
661 Heps = _mm_mul_pd(vfeps,H);
662 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
663 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
664 vvdw6 = _mm_mul_pd(c6_00,VV);
665 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
666 fvdw6 = _mm_mul_pd(c6_00,FF);
668 /* CUBIC SPLINE TABLE REPULSION */
669 vfitab = _mm_add_epi32(vfitab,ifour);
670 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
671 F = _mm_setzero_pd();
672 GMX_MM_TRANSPOSE2_PD(Y,F);
673 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
674 H = _mm_setzero_pd();
675 GMX_MM_TRANSPOSE2_PD(G,H);
676 Heps = _mm_mul_pd(vfeps,H);
677 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
678 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
679 vvdw12 = _mm_mul_pd(c12_00,VV);
680 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
681 fvdw12 = _mm_mul_pd(c12_00,FF);
682 vvdw = _mm_add_pd(vvdw12,vvdw6);
683 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
685 /* Update potential sum for this i atom from the interaction with this j atom. */
/* unpacklo with a zero register keeps the low lane and zeroes the high
 * lane, so the unused lane adds nothing to the sums. */
686 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
687 velecsum = _mm_add_pd(velecsum,velec);
688 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
689 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
691 fscal = _mm_add_pd(felec,fvdw);
693 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
695 /* Calculate temporary vectorial force */
696 tx = _mm_mul_pd(fscal,dx00);
697 ty = _mm_mul_pd(fscal,dy00);
698 tz = _mm_mul_pd(fscal,dz00);
700 /* Update vectorial force */
701 fix0 = _mm_add_pd(fix0,tx);
702 fiy0 = _mm_add_pd(fiy0,ty);
703 fiz0 = _mm_add_pd(fiz0,tz);
705 fjx0 = _mm_add_pd(fjx0,tx);
706 fjy0 = _mm_add_pd(fjy0,ty);
707 fjz0 = _mm_add_pd(fjz0,tz);
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
/* Pair i0-j1 (single j): Coulomb only. */
713 /* COULOMB ELECTROSTATICS */
714 velec = _mm_mul_pd(qq01,rinv01);
715 felec = _mm_mul_pd(velec,rinvsq01);
717 /* Update potential sum for this i atom from the interaction with this j atom. */
718 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
719 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): as in the paired loop, no assignment of fscal from felec is
 * visible for the eight Coulomb-only pairs of this section; only the lane
 * masking of the previous fscal value is shown. Verify against the
 * generator output. */
723 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
725 /* Calculate temporary vectorial force */
726 tx = _mm_mul_pd(fscal,dx01);
727 ty = _mm_mul_pd(fscal,dy01);
728 tz = _mm_mul_pd(fscal,dz01);
730 /* Update vectorial force */
731 fix0 = _mm_add_pd(fix0,tx);
732 fiy0 = _mm_add_pd(fiy0,ty);
733 fiz0 = _mm_add_pd(fiz0,tz);
735 fjx1 = _mm_add_pd(fjx1,tx);
736 fjy1 = _mm_add_pd(fjy1,ty);
737 fjz1 = _mm_add_pd(fjz1,tz);
739 /**************************
740 * CALCULATE INTERACTIONS *
741 **************************/
/* Pair i0-j2 (single j): Coulomb only. */
743 /* COULOMB ELECTROSTATICS */
744 velec = _mm_mul_pd(qq02,rinv02);
745 felec = _mm_mul_pd(velec,rinvsq02);
747 /* Update potential sum for this i atom from the interaction with this j atom. */
748 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
749 velecsum = _mm_add_pd(velecsum,velec);
753 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
755 /* Calculate temporary vectorial force */
756 tx = _mm_mul_pd(fscal,dx02);
757 ty = _mm_mul_pd(fscal,dy02);
758 tz = _mm_mul_pd(fscal,dz02);
760 /* Update vectorial force */
761 fix0 = _mm_add_pd(fix0,tx);
762 fiy0 = _mm_add_pd(fiy0,ty);
763 fiz0 = _mm_add_pd(fiz0,tz);
765 fjx2 = _mm_add_pd(fjx2,tx);
766 fjy2 = _mm_add_pd(fjy2,ty);
767 fjz2 = _mm_add_pd(fjz2,tz);
769 /**************************
770 * CALCULATE INTERACTIONS *
771 **************************/
/* Pair i1-j0 (single j): Coulomb only. */
773 /* COULOMB ELECTROSTATICS */
774 velec = _mm_mul_pd(qq10,rinv10);
775 felec = _mm_mul_pd(velec,rinvsq10);
777 /* Update potential sum for this i atom from the interaction with this j atom. */
778 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
779 velecsum = _mm_add_pd(velecsum,velec);
783 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
785 /* Calculate temporary vectorial force */
786 tx = _mm_mul_pd(fscal,dx10);
787 ty = _mm_mul_pd(fscal,dy10);
788 tz = _mm_mul_pd(fscal,dz10);
790 /* Update vectorial force */
791 fix1 = _mm_add_pd(fix1,tx);
792 fiy1 = _mm_add_pd(fiy1,ty);
793 fiz1 = _mm_add_pd(fiz1,tz);
795 fjx0 = _mm_add_pd(fjx0,tx);
796 fjy0 = _mm_add_pd(fjy0,ty);
797 fjz0 = _mm_add_pd(fjz0,tz);
799 /**************************
800 * CALCULATE INTERACTIONS *
801 **************************/
/* Pair i1-j1 (single j): Coulomb only. */
803 /* COULOMB ELECTROSTATICS */
804 velec = _mm_mul_pd(qq11,rinv11);
805 felec = _mm_mul_pd(velec,rinvsq11);
807 /* Update potential sum for this i atom from the interaction with this j atom. */
808 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
809 velecsum = _mm_add_pd(velecsum,velec);
813 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
815 /* Calculate temporary vectorial force */
816 tx = _mm_mul_pd(fscal,dx11);
817 ty = _mm_mul_pd(fscal,dy11);
818 tz = _mm_mul_pd(fscal,dz11);
820 /* Update vectorial force */
821 fix1 = _mm_add_pd(fix1,tx);
822 fiy1 = _mm_add_pd(fiy1,ty);
823 fiz1 = _mm_add_pd(fiz1,tz);
825 fjx1 = _mm_add_pd(fjx1,tx);
826 fjy1 = _mm_add_pd(fjy1,ty);
827 fjz1 = _mm_add_pd(fjz1,tz);
829 /**************************
830 * CALCULATE INTERACTIONS *
831 **************************/
/* Pair i1-j2 (single j): Coulomb only. */
833 /* COULOMB ELECTROSTATICS */
834 velec = _mm_mul_pd(qq12,rinv12);
835 felec = _mm_mul_pd(velec,rinvsq12);
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
839 velecsum = _mm_add_pd(velecsum,velec);
843 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
845 /* Calculate temporary vectorial force */
846 tx = _mm_mul_pd(fscal,dx12);
847 ty = _mm_mul_pd(fscal,dy12);
848 tz = _mm_mul_pd(fscal,dz12);
850 /* Update vectorial force */
851 fix1 = _mm_add_pd(fix1,tx);
852 fiy1 = _mm_add_pd(fiy1,ty);
853 fiz1 = _mm_add_pd(fiz1,tz);
855 fjx2 = _mm_add_pd(fjx2,tx);
856 fjy2 = _mm_add_pd(fjy2,ty);
857 fjz2 = _mm_add_pd(fjz2,tz);
859 /**************************
860 * CALCULATE INTERACTIONS *
861 **************************/
/* Pair i2-j0 (single j): Coulomb only. */
863 /* COULOMB ELECTROSTATICS */
864 velec = _mm_mul_pd(qq20,rinv20);
865 felec = _mm_mul_pd(velec,rinvsq20);
867 /* Update potential sum for this i atom from the interaction with this j atom. */
868 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
869 velecsum = _mm_add_pd(velecsum,velec);
873 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
875 /* Calculate temporary vectorial force */
876 tx = _mm_mul_pd(fscal,dx20);
877 ty = _mm_mul_pd(fscal,dy20);
878 tz = _mm_mul_pd(fscal,dz20);
880 /* Update vectorial force */
881 fix2 = _mm_add_pd(fix2,tx);
882 fiy2 = _mm_add_pd(fiy2,ty);
883 fiz2 = _mm_add_pd(fiz2,tz);
885 fjx0 = _mm_add_pd(fjx0,tx);
886 fjy0 = _mm_add_pd(fjy0,ty);
887 fjz0 = _mm_add_pd(fjz0,tz);
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
/* Pair i2-j1 (single j): Coulomb only. */
893 /* COULOMB ELECTROSTATICS */
894 velec = _mm_mul_pd(qq21,rinv21);
895 felec = _mm_mul_pd(velec,rinvsq21);
897 /* Update potential sum for this i atom from the interaction with this j atom. */
898 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
899 velecsum = _mm_add_pd(velecsum,velec);
903 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
905 /* Calculate temporary vectorial force */
906 tx = _mm_mul_pd(fscal,dx21);
907 ty = _mm_mul_pd(fscal,dy21);
908 tz = _mm_mul_pd(fscal,dz21);
910 /* Update vectorial force */
911 fix2 = _mm_add_pd(fix2,tx);
912 fiy2 = _mm_add_pd(fiy2,ty);
913 fiz2 = _mm_add_pd(fiz2,tz);
915 fjx1 = _mm_add_pd(fjx1,tx);
916 fjy1 = _mm_add_pd(fjy1,ty);
917 fjz1 = _mm_add_pd(fjz1,tz);
919 /**************************
920 * CALCULATE INTERACTIONS *
921 **************************/
/* Pair i2-j2 (single j): Coulomb only. */
923 /* COULOMB ELECTROSTATICS */
924 velec = _mm_mul_pd(qq22,rinv22);
925 felec = _mm_mul_pd(velec,rinvsq22);
927 /* Update potential sum for this i atom from the interaction with this j atom. */
928 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
929 velecsum = _mm_add_pd(velecsum,velec);
933 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
935 /* Calculate temporary vectorial force */
936 tx = _mm_mul_pd(fscal,dx22);
937 ty = _mm_mul_pd(fscal,dy22);
938 tz = _mm_mul_pd(fscal,dz22);
940 /* Update vectorial force */
941 fix2 = _mm_add_pd(fix2,tx);
942 fiy2 = _mm_add_pd(fiy2,ty);
943 fiz2 = _mm_add_pd(fiz2,tz);
945 fjx2 = _mm_add_pd(fjx2,tx);
946 fjy2 = _mm_add_pd(fjy2,ty);
947 fjz2 = _mm_add_pd(fjz2,tz);
/* Hand the accumulated j forces for the single j water to the force array f. */
949 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
951 /* Inner loop uses 287 flops */
954 /* End of innermost loop */
/* Pass the accumulated i forces to the helper together with the force and
 * shift-force locations for this i water. */
956 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
957 f+i_coord_offset,fshift+i_shift_offset);
960 /* Update potential energies */
/* NOTE(review): the assignment of ggid is not visible in these lines. */
961 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
962 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
964 /* Increment number of inner iterations */
965 inneriter += j_index_end - j_index_start;
967 /* Outer loop uses 20 flops */
970 /* Increment number of outer iterations */
973 /* Update outer/inner flops */
/* Flop estimate: 20 per outer iteration plus 287 per inner iteration. */
975 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*287);
978 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_double
979 * Electrostatics interaction: Coulomb
980 * VdW interaction: CubicSplineTable
981 * Geometry: Water3-Water3
982 * Calculate force/pot: Force
985 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_double
986 (t_nblist * gmx_restrict nlist,
987 rvec * gmx_restrict xx,
988 rvec * gmx_restrict ff,
989 t_forcerec * gmx_restrict fr,
990 t_mdatoms * gmx_restrict mdatoms,
991 nb_kernel_data_t * gmx_restrict kernel_data,
992 t_nrnb * gmx_restrict nrnb)
994 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
995 * just 0 for non-waters.
996 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
997 * jnr indices corresponding to data put in the four positions in the SIMD register.
999 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1000 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1002 int j_coord_offsetA,j_coord_offsetB;
1003 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1004 real rcutoff_scalar;
1005 real *shiftvec,*fshift,*x,*f;
1006 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1008 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1010 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1012 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1013 int vdwjidx0A,vdwjidx0B;
1014 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1015 int vdwjidx1A,vdwjidx1B;
1016 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1017 int vdwjidx2A,vdwjidx2B;
1018 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1019 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1020 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1021 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1022 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1023 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1024 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1025 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1026 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1027 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1028 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1031 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1034 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1035 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1037 __m128i ifour = _mm_set1_epi32(4);
1038 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1040 __m128d dummy_mask,cutoff_mask;
1041 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1042 __m128d one = _mm_set1_pd(1.0);
1043 __m128d two = _mm_set1_pd(2.0);
1049 jindex = nlist->jindex;
1051 shiftidx = nlist->shift;
1053 shiftvec = fr->shift_vec[0];
1054 fshift = fr->fshift[0];
1055 facel = _mm_set1_pd(fr->epsfac);
1056 charge = mdatoms->chargeA;
1057 nvdwtype = fr->ntype;
1058 vdwparam = fr->nbfp;
1059 vdwtype = mdatoms->typeA;
1061 vftab = kernel_data->table_vdw->data;
1062 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1064 /* Setup water-specific parameters */
1065 inr = nlist->iinr[0];
1066 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1067 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1068 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1069 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1071 jq0 = _mm_set1_pd(charge[inr+0]);
1072 jq1 = _mm_set1_pd(charge[inr+1]);
1073 jq2 = _mm_set1_pd(charge[inr+2]);
1074 vdwjidx0A = 2*vdwtype[inr+0];
1075 qq00 = _mm_mul_pd(iq0,jq0);
1076 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1077 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1078 qq01 = _mm_mul_pd(iq0,jq1);
1079 qq02 = _mm_mul_pd(iq0,jq2);
1080 qq10 = _mm_mul_pd(iq1,jq0);
1081 qq11 = _mm_mul_pd(iq1,jq1);
1082 qq12 = _mm_mul_pd(iq1,jq2);
1083 qq20 = _mm_mul_pd(iq2,jq0);
1084 qq21 = _mm_mul_pd(iq2,jq1);
1085 qq22 = _mm_mul_pd(iq2,jq2);
1087 /* Avoid stupid compiler warnings */
1089 j_coord_offsetA = 0;
1090 j_coord_offsetB = 0;
1095 /* Start outer loop over neighborlists */
1096 for(iidx=0; iidx<nri; iidx++)
1098 /* Load shift vector for this list */
1099 i_shift_offset = DIM*shiftidx[iidx];
1101 /* Load limits for loop over neighbors */
1102 j_index_start = jindex[iidx];
1103 j_index_end = jindex[iidx+1];
1105 /* Get outer coordinate index */
1107 i_coord_offset = DIM*inr;
1109 /* Load i particle coords and add shift vector */
1110 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1111 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1113 fix0 = _mm_setzero_pd();
1114 fiy0 = _mm_setzero_pd();
1115 fiz0 = _mm_setzero_pd();
1116 fix1 = _mm_setzero_pd();
1117 fiy1 = _mm_setzero_pd();
1118 fiz1 = _mm_setzero_pd();
1119 fix2 = _mm_setzero_pd();
1120 fiy2 = _mm_setzero_pd();
1121 fiz2 = _mm_setzero_pd();
1123 /* Start inner kernel loop */
1124 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1127 /* Get j neighbor index, and coordinate index */
1129 jnrB = jjnr[jidx+1];
1130 j_coord_offsetA = DIM*jnrA;
1131 j_coord_offsetB = DIM*jnrB;
1133 /* load j atom coordinates */
1134 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1135 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1137 /* Calculate displacement vector */
1138 dx00 = _mm_sub_pd(ix0,jx0);
1139 dy00 = _mm_sub_pd(iy0,jy0);
1140 dz00 = _mm_sub_pd(iz0,jz0);
1141 dx01 = _mm_sub_pd(ix0,jx1);
1142 dy01 = _mm_sub_pd(iy0,jy1);
1143 dz01 = _mm_sub_pd(iz0,jz1);
1144 dx02 = _mm_sub_pd(ix0,jx2);
1145 dy02 = _mm_sub_pd(iy0,jy2);
1146 dz02 = _mm_sub_pd(iz0,jz2);
1147 dx10 = _mm_sub_pd(ix1,jx0);
1148 dy10 = _mm_sub_pd(iy1,jy0);
1149 dz10 = _mm_sub_pd(iz1,jz0);
1150 dx11 = _mm_sub_pd(ix1,jx1);
1151 dy11 = _mm_sub_pd(iy1,jy1);
1152 dz11 = _mm_sub_pd(iz1,jz1);
1153 dx12 = _mm_sub_pd(ix1,jx2);
1154 dy12 = _mm_sub_pd(iy1,jy2);
1155 dz12 = _mm_sub_pd(iz1,jz2);
1156 dx20 = _mm_sub_pd(ix2,jx0);
1157 dy20 = _mm_sub_pd(iy2,jy0);
1158 dz20 = _mm_sub_pd(iz2,jz0);
1159 dx21 = _mm_sub_pd(ix2,jx1);
1160 dy21 = _mm_sub_pd(iy2,jy1);
1161 dz21 = _mm_sub_pd(iz2,jz1);
1162 dx22 = _mm_sub_pd(ix2,jx2);
1163 dy22 = _mm_sub_pd(iy2,jy2);
1164 dz22 = _mm_sub_pd(iz2,jz2);
1166 /* Calculate squared distance and things based on it */
1167 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1168 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1169 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1170 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1171 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1172 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1173 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1174 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1175 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1177 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1178 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1179 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1180 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1181 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1182 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1183 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1184 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1185 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1187 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1188 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1189 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1190 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1191 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1192 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1193 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1194 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1195 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1197 fjx0 = _mm_setzero_pd();
1198 fjy0 = _mm_setzero_pd();
1199 fjz0 = _mm_setzero_pd();
1200 fjx1 = _mm_setzero_pd();
1201 fjy1 = _mm_setzero_pd();
1202 fjz1 = _mm_setzero_pd();
1203 fjx2 = _mm_setzero_pd();
1204 fjy2 = _mm_setzero_pd();
1205 fjz2 = _mm_setzero_pd();
1207 /**************************
1208 * CALCULATE INTERACTIONS *
1209 **************************/
1211 r00 = _mm_mul_pd(rsq00,rinv00);
1213 /* Calculate table index by multiplying r with table scale and truncate to integer */
1214 rt = _mm_mul_pd(r00,vftabscale);
1215 vfitab = _mm_cvttpd_epi32(rt);
1216 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1217 vfitab = _mm_slli_epi32(vfitab,3);
1219 /* COULOMB ELECTROSTATICS */
1220 velec = _mm_mul_pd(qq00,rinv00);
1221 felec = _mm_mul_pd(velec,rinvsq00);
1223 /* CUBIC SPLINE TABLE DISPERSION */
1224 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1225 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1226 GMX_MM_TRANSPOSE2_PD(Y,F);
1227 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1228 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1229 GMX_MM_TRANSPOSE2_PD(G,H);
1230 Heps = _mm_mul_pd(vfeps,H);
1231 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1232 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1233 fvdw6 = _mm_mul_pd(c6_00,FF);
1235 /* CUBIC SPLINE TABLE REPULSION */
1236 vfitab = _mm_add_epi32(vfitab,ifour);
1237 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1238 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1239 GMX_MM_TRANSPOSE2_PD(Y,F);
1240 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1241 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1242 GMX_MM_TRANSPOSE2_PD(G,H);
1243 Heps = _mm_mul_pd(vfeps,H);
1244 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1245 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1246 fvdw12 = _mm_mul_pd(c12_00,FF);
1247 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1249 fscal = _mm_add_pd(felec,fvdw);
1251 /* Calculate temporary vectorial force */
1252 tx = _mm_mul_pd(fscal,dx00);
1253 ty = _mm_mul_pd(fscal,dy00);
1254 tz = _mm_mul_pd(fscal,dz00);
1256 /* Update vectorial force */
1257 fix0 = _mm_add_pd(fix0,tx);
1258 fiy0 = _mm_add_pd(fiy0,ty);
1259 fiz0 = _mm_add_pd(fiz0,tz);
1261 fjx0 = _mm_add_pd(fjx0,tx);
1262 fjy0 = _mm_add_pd(fjy0,ty);
1263 fjz0 = _mm_add_pd(fjz0,tz);
1265 /**************************
1266 * CALCULATE INTERACTIONS *
1267 **************************/
1269 /* COULOMB ELECTROSTATICS */
1270 velec = _mm_mul_pd(qq01,rinv01);
1271 felec = _mm_mul_pd(velec,rinvsq01);
1275 /* Calculate temporary vectorial force */
1276 tx = _mm_mul_pd(fscal,dx01);
1277 ty = _mm_mul_pd(fscal,dy01);
1278 tz = _mm_mul_pd(fscal,dz01);
1280 /* Update vectorial force */
1281 fix0 = _mm_add_pd(fix0,tx);
1282 fiy0 = _mm_add_pd(fiy0,ty);
1283 fiz0 = _mm_add_pd(fiz0,tz);
1285 fjx1 = _mm_add_pd(fjx1,tx);
1286 fjy1 = _mm_add_pd(fjy1,ty);
1287 fjz1 = _mm_add_pd(fjz1,tz);
1289 /**************************
1290 * CALCULATE INTERACTIONS *
1291 **************************/
1293 /* COULOMB ELECTROSTATICS */
1294 velec = _mm_mul_pd(qq02,rinv02);
1295 felec = _mm_mul_pd(velec,rinvsq02);
1299 /* Calculate temporary vectorial force */
1300 tx = _mm_mul_pd(fscal,dx02);
1301 ty = _mm_mul_pd(fscal,dy02);
1302 tz = _mm_mul_pd(fscal,dz02);
1304 /* Update vectorial force */
1305 fix0 = _mm_add_pd(fix0,tx);
1306 fiy0 = _mm_add_pd(fiy0,ty);
1307 fiz0 = _mm_add_pd(fiz0,tz);
1309 fjx2 = _mm_add_pd(fjx2,tx);
1310 fjy2 = _mm_add_pd(fjy2,ty);
1311 fjz2 = _mm_add_pd(fjz2,tz);
1313 /**************************
1314 * CALCULATE INTERACTIONS *
1315 **************************/
1317 /* COULOMB ELECTROSTATICS */
1318 velec = _mm_mul_pd(qq10,rinv10);
1319 felec = _mm_mul_pd(velec,rinvsq10);
1323 /* Calculate temporary vectorial force */
1324 tx = _mm_mul_pd(fscal,dx10);
1325 ty = _mm_mul_pd(fscal,dy10);
1326 tz = _mm_mul_pd(fscal,dz10);
1328 /* Update vectorial force */
1329 fix1 = _mm_add_pd(fix1,tx);
1330 fiy1 = _mm_add_pd(fiy1,ty);
1331 fiz1 = _mm_add_pd(fiz1,tz);
1333 fjx0 = _mm_add_pd(fjx0,tx);
1334 fjy0 = _mm_add_pd(fjy0,ty);
1335 fjz0 = _mm_add_pd(fjz0,tz);
1337 /**************************
1338 * CALCULATE INTERACTIONS *
1339 **************************/
1341 /* COULOMB ELECTROSTATICS */
1342 velec = _mm_mul_pd(qq11,rinv11);
1343 felec = _mm_mul_pd(velec,rinvsq11);
1347 /* Calculate temporary vectorial force */
1348 tx = _mm_mul_pd(fscal,dx11);
1349 ty = _mm_mul_pd(fscal,dy11);
1350 tz = _mm_mul_pd(fscal,dz11);
1352 /* Update vectorial force */
1353 fix1 = _mm_add_pd(fix1,tx);
1354 fiy1 = _mm_add_pd(fiy1,ty);
1355 fiz1 = _mm_add_pd(fiz1,tz);
1357 fjx1 = _mm_add_pd(fjx1,tx);
1358 fjy1 = _mm_add_pd(fjy1,ty);
1359 fjz1 = _mm_add_pd(fjz1,tz);
1361 /**************************
1362 * CALCULATE INTERACTIONS *
1363 **************************/
1365 /* COULOMB ELECTROSTATICS */
1366 velec = _mm_mul_pd(qq12,rinv12);
1367 felec = _mm_mul_pd(velec,rinvsq12);
1371 /* Calculate temporary vectorial force */
1372 tx = _mm_mul_pd(fscal,dx12);
1373 ty = _mm_mul_pd(fscal,dy12);
1374 tz = _mm_mul_pd(fscal,dz12);
1376 /* Update vectorial force */
1377 fix1 = _mm_add_pd(fix1,tx);
1378 fiy1 = _mm_add_pd(fiy1,ty);
1379 fiz1 = _mm_add_pd(fiz1,tz);
1381 fjx2 = _mm_add_pd(fjx2,tx);
1382 fjy2 = _mm_add_pd(fjy2,ty);
1383 fjz2 = _mm_add_pd(fjz2,tz);
1385 /**************************
1386 * CALCULATE INTERACTIONS *
1387 **************************/
1389 /* COULOMB ELECTROSTATICS */
1390 velec = _mm_mul_pd(qq20,rinv20);
1391 felec = _mm_mul_pd(velec,rinvsq20);
1395 /* Calculate temporary vectorial force */
1396 tx = _mm_mul_pd(fscal,dx20);
1397 ty = _mm_mul_pd(fscal,dy20);
1398 tz = _mm_mul_pd(fscal,dz20);
1400 /* Update vectorial force */
1401 fix2 = _mm_add_pd(fix2,tx);
1402 fiy2 = _mm_add_pd(fiy2,ty);
1403 fiz2 = _mm_add_pd(fiz2,tz);
1405 fjx0 = _mm_add_pd(fjx0,tx);
1406 fjy0 = _mm_add_pd(fjy0,ty);
1407 fjz0 = _mm_add_pd(fjz0,tz);
1409 /**************************
1410 * CALCULATE INTERACTIONS *
1411 **************************/
1413 /* COULOMB ELECTROSTATICS */
1414 velec = _mm_mul_pd(qq21,rinv21);
1415 felec = _mm_mul_pd(velec,rinvsq21);
1419 /* Calculate temporary vectorial force */
1420 tx = _mm_mul_pd(fscal,dx21);
1421 ty = _mm_mul_pd(fscal,dy21);
1422 tz = _mm_mul_pd(fscal,dz21);
1424 /* Update vectorial force */
1425 fix2 = _mm_add_pd(fix2,tx);
1426 fiy2 = _mm_add_pd(fiy2,ty);
1427 fiz2 = _mm_add_pd(fiz2,tz);
1429 fjx1 = _mm_add_pd(fjx1,tx);
1430 fjy1 = _mm_add_pd(fjy1,ty);
1431 fjz1 = _mm_add_pd(fjz1,tz);
1433 /**************************
1434 * CALCULATE INTERACTIONS *
1435 **************************/
1437 /* COULOMB ELECTROSTATICS */
1438 velec = _mm_mul_pd(qq22,rinv22);
1439 felec = _mm_mul_pd(velec,rinvsq22);
1443 /* Calculate temporary vectorial force */
1444 tx = _mm_mul_pd(fscal,dx22);
1445 ty = _mm_mul_pd(fscal,dy22);
1446 tz = _mm_mul_pd(fscal,dz22);
1448 /* Update vectorial force */
1449 fix2 = _mm_add_pd(fix2,tx);
1450 fiy2 = _mm_add_pd(fiy2,ty);
1451 fiz2 = _mm_add_pd(fiz2,tz);
1453 fjx2 = _mm_add_pd(fjx2,tx);
1454 fjy2 = _mm_add_pd(fjy2,ty);
1455 fjz2 = _mm_add_pd(fjz2,tz);
1457 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1459 /* Inner loop uses 270 flops */
1462 if(jidx<j_index_end)
1466 j_coord_offsetA = DIM*jnrA;
1468 /* load j atom coordinates */
1469 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1470 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1472 /* Calculate displacement vector */
1473 dx00 = _mm_sub_pd(ix0,jx0);
1474 dy00 = _mm_sub_pd(iy0,jy0);
1475 dz00 = _mm_sub_pd(iz0,jz0);
1476 dx01 = _mm_sub_pd(ix0,jx1);
1477 dy01 = _mm_sub_pd(iy0,jy1);
1478 dz01 = _mm_sub_pd(iz0,jz1);
1479 dx02 = _mm_sub_pd(ix0,jx2);
1480 dy02 = _mm_sub_pd(iy0,jy2);
1481 dz02 = _mm_sub_pd(iz0,jz2);
1482 dx10 = _mm_sub_pd(ix1,jx0);
1483 dy10 = _mm_sub_pd(iy1,jy0);
1484 dz10 = _mm_sub_pd(iz1,jz0);
1485 dx11 = _mm_sub_pd(ix1,jx1);
1486 dy11 = _mm_sub_pd(iy1,jy1);
1487 dz11 = _mm_sub_pd(iz1,jz1);
1488 dx12 = _mm_sub_pd(ix1,jx2);
1489 dy12 = _mm_sub_pd(iy1,jy2);
1490 dz12 = _mm_sub_pd(iz1,jz2);
1491 dx20 = _mm_sub_pd(ix2,jx0);
1492 dy20 = _mm_sub_pd(iy2,jy0);
1493 dz20 = _mm_sub_pd(iz2,jz0);
1494 dx21 = _mm_sub_pd(ix2,jx1);
1495 dy21 = _mm_sub_pd(iy2,jy1);
1496 dz21 = _mm_sub_pd(iz2,jz1);
1497 dx22 = _mm_sub_pd(ix2,jx2);
1498 dy22 = _mm_sub_pd(iy2,jy2);
1499 dz22 = _mm_sub_pd(iz2,jz2);
1501 /* Calculate squared distance and things based on it */
1502 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1503 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1504 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1505 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1506 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1507 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1508 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1509 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1510 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1512 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1513 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1514 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1515 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1516 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1517 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1518 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1519 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1520 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1522 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1523 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1524 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1525 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1526 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1527 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1528 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1529 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1530 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1532 fjx0 = _mm_setzero_pd();
1533 fjy0 = _mm_setzero_pd();
1534 fjz0 = _mm_setzero_pd();
1535 fjx1 = _mm_setzero_pd();
1536 fjy1 = _mm_setzero_pd();
1537 fjz1 = _mm_setzero_pd();
1538 fjx2 = _mm_setzero_pd();
1539 fjy2 = _mm_setzero_pd();
1540 fjz2 = _mm_setzero_pd();
1542 /**************************
1543 * CALCULATE INTERACTIONS *
1544 **************************/
1546 r00 = _mm_mul_pd(rsq00,rinv00);
1548 /* Calculate table index by multiplying r with table scale and truncate to integer */
1549 rt = _mm_mul_pd(r00,vftabscale);
1550 vfitab = _mm_cvttpd_epi32(rt);
1551 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1552 vfitab = _mm_slli_epi32(vfitab,3);
1554 /* COULOMB ELECTROSTATICS */
1555 velec = _mm_mul_pd(qq00,rinv00);
1556 felec = _mm_mul_pd(velec,rinvsq00);
1558 /* CUBIC SPLINE TABLE DISPERSION */
1559 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1560 F = _mm_setzero_pd();
1561 GMX_MM_TRANSPOSE2_PD(Y,F);
1562 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1563 H = _mm_setzero_pd();
1564 GMX_MM_TRANSPOSE2_PD(G,H);
1565 Heps = _mm_mul_pd(vfeps,H);
1566 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1567 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1568 fvdw6 = _mm_mul_pd(c6_00,FF);
1570 /* CUBIC SPLINE TABLE REPULSION */
1571 vfitab = _mm_add_epi32(vfitab,ifour);
1572 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1573 F = _mm_setzero_pd();
1574 GMX_MM_TRANSPOSE2_PD(Y,F);
1575 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1576 H = _mm_setzero_pd();
1577 GMX_MM_TRANSPOSE2_PD(G,H);
1578 Heps = _mm_mul_pd(vfeps,H);
1579 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1580 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1581 fvdw12 = _mm_mul_pd(c12_00,FF);
1582 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1584 fscal = _mm_add_pd(felec,fvdw);
1586 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1588 /* Calculate temporary vectorial force */
1589 tx = _mm_mul_pd(fscal,dx00);
1590 ty = _mm_mul_pd(fscal,dy00);
1591 tz = _mm_mul_pd(fscal,dz00);
1593 /* Update vectorial force */
1594 fix0 = _mm_add_pd(fix0,tx);
1595 fiy0 = _mm_add_pd(fiy0,ty);
1596 fiz0 = _mm_add_pd(fiz0,tz);
1598 fjx0 = _mm_add_pd(fjx0,tx);
1599 fjy0 = _mm_add_pd(fjy0,ty);
1600 fjz0 = _mm_add_pd(fjz0,tz);
1602 /**************************
1603 * CALCULATE INTERACTIONS *
1604 **************************/
1606 /* COULOMB ELECTROSTATICS */
1607 velec = _mm_mul_pd(qq01,rinv01);
1608 felec = _mm_mul_pd(velec,rinvsq01);
1612 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1614 /* Calculate temporary vectorial force */
1615 tx = _mm_mul_pd(fscal,dx01);
1616 ty = _mm_mul_pd(fscal,dy01);
1617 tz = _mm_mul_pd(fscal,dz01);
1619 /* Update vectorial force */
1620 fix0 = _mm_add_pd(fix0,tx);
1621 fiy0 = _mm_add_pd(fiy0,ty);
1622 fiz0 = _mm_add_pd(fiz0,tz);
1624 fjx1 = _mm_add_pd(fjx1,tx);
1625 fjy1 = _mm_add_pd(fjy1,ty);
1626 fjz1 = _mm_add_pd(fjz1,tz);
1628 /**************************
1629 * CALCULATE INTERACTIONS *
1630 **************************/
1632 /* COULOMB ELECTROSTATICS */
1633 velec = _mm_mul_pd(qq02,rinv02);
1634 felec = _mm_mul_pd(velec,rinvsq02);
1638 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1640 /* Calculate temporary vectorial force */
1641 tx = _mm_mul_pd(fscal,dx02);
1642 ty = _mm_mul_pd(fscal,dy02);
1643 tz = _mm_mul_pd(fscal,dz02);
1645 /* Update vectorial force */
1646 fix0 = _mm_add_pd(fix0,tx);
1647 fiy0 = _mm_add_pd(fiy0,ty);
1648 fiz0 = _mm_add_pd(fiz0,tz);
1650 fjx2 = _mm_add_pd(fjx2,tx);
1651 fjy2 = _mm_add_pd(fjy2,ty);
1652 fjz2 = _mm_add_pd(fjz2,tz);
1654 /**************************
1655 * CALCULATE INTERACTIONS *
1656 **************************/
1658 /* COULOMB ELECTROSTATICS */
1659 velec = _mm_mul_pd(qq10,rinv10);
1660 felec = _mm_mul_pd(velec,rinvsq10);
1664 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1666 /* Calculate temporary vectorial force */
1667 tx = _mm_mul_pd(fscal,dx10);
1668 ty = _mm_mul_pd(fscal,dy10);
1669 tz = _mm_mul_pd(fscal,dz10);
1671 /* Update vectorial force */
1672 fix1 = _mm_add_pd(fix1,tx);
1673 fiy1 = _mm_add_pd(fiy1,ty);
1674 fiz1 = _mm_add_pd(fiz1,tz);
1676 fjx0 = _mm_add_pd(fjx0,tx);
1677 fjy0 = _mm_add_pd(fjy0,ty);
1678 fjz0 = _mm_add_pd(fjz0,tz);
1680 /**************************
1681 * CALCULATE INTERACTIONS *
1682 **************************/
1684 /* COULOMB ELECTROSTATICS */
1685 velec = _mm_mul_pd(qq11,rinv11);
1686 felec = _mm_mul_pd(velec,rinvsq11);
1690 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1692 /* Calculate temporary vectorial force */
1693 tx = _mm_mul_pd(fscal,dx11);
1694 ty = _mm_mul_pd(fscal,dy11);
1695 tz = _mm_mul_pd(fscal,dz11);
1697 /* Update vectorial force */
1698 fix1 = _mm_add_pd(fix1,tx);
1699 fiy1 = _mm_add_pd(fiy1,ty);
1700 fiz1 = _mm_add_pd(fiz1,tz);
1702 fjx1 = _mm_add_pd(fjx1,tx);
1703 fjy1 = _mm_add_pd(fjy1,ty);
1704 fjz1 = _mm_add_pd(fjz1,tz);
1706 /**************************
1707 * CALCULATE INTERACTIONS *
1708 **************************/
1710 /* COULOMB ELECTROSTATICS */
1711 velec = _mm_mul_pd(qq12,rinv12);
1712 felec = _mm_mul_pd(velec,rinvsq12);
1716 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1718 /* Calculate temporary vectorial force */
1719 tx = _mm_mul_pd(fscal,dx12);
1720 ty = _mm_mul_pd(fscal,dy12);
1721 tz = _mm_mul_pd(fscal,dz12);
1723 /* Update vectorial force */
1724 fix1 = _mm_add_pd(fix1,tx);
1725 fiy1 = _mm_add_pd(fiy1,ty);
1726 fiz1 = _mm_add_pd(fiz1,tz);
1728 fjx2 = _mm_add_pd(fjx2,tx);
1729 fjy2 = _mm_add_pd(fjy2,ty);
1730 fjz2 = _mm_add_pd(fjz2,tz);
1732 /**************************
1733 * CALCULATE INTERACTIONS *
1734 **************************/
1736 /* COULOMB ELECTROSTATICS */
1737 velec = _mm_mul_pd(qq20,rinv20);
1738 felec = _mm_mul_pd(velec,rinvsq20);
1742 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1744 /* Calculate temporary vectorial force */
1745 tx = _mm_mul_pd(fscal,dx20);
1746 ty = _mm_mul_pd(fscal,dy20);
1747 tz = _mm_mul_pd(fscal,dz20);
1749 /* Update vectorial force */
1750 fix2 = _mm_add_pd(fix2,tx);
1751 fiy2 = _mm_add_pd(fiy2,ty);
1752 fiz2 = _mm_add_pd(fiz2,tz);
1754 fjx0 = _mm_add_pd(fjx0,tx);
1755 fjy0 = _mm_add_pd(fjy0,ty);
1756 fjz0 = _mm_add_pd(fjz0,tz);
1758 /**************************
1759 * CALCULATE INTERACTIONS *
1760 **************************/
1762 /* COULOMB ELECTROSTATICS */
1763 velec = _mm_mul_pd(qq21,rinv21);
1764 felec = _mm_mul_pd(velec,rinvsq21);
1768 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1770 /* Calculate temporary vectorial force */
1771 tx = _mm_mul_pd(fscal,dx21);
1772 ty = _mm_mul_pd(fscal,dy21);
1773 tz = _mm_mul_pd(fscal,dz21);
1775 /* Update vectorial force */
1776 fix2 = _mm_add_pd(fix2,tx);
1777 fiy2 = _mm_add_pd(fiy2,ty);
1778 fiz2 = _mm_add_pd(fiz2,tz);
1780 fjx1 = _mm_add_pd(fjx1,tx);
1781 fjy1 = _mm_add_pd(fjy1,ty);
1782 fjz1 = _mm_add_pd(fjz1,tz);
1784 /**************************
1785 * CALCULATE INTERACTIONS *
1786 **************************/
1788 /* COULOMB ELECTROSTATICS */
1789 velec = _mm_mul_pd(qq22,rinv22);
1790 felec = _mm_mul_pd(velec,rinvsq22);
1794 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1796 /* Calculate temporary vectorial force */
1797 tx = _mm_mul_pd(fscal,dx22);
1798 ty = _mm_mul_pd(fscal,dy22);
1799 tz = _mm_mul_pd(fscal,dz22);
1801 /* Update vectorial force */
1802 fix2 = _mm_add_pd(fix2,tx);
1803 fiy2 = _mm_add_pd(fiy2,ty);
1804 fiz2 = _mm_add_pd(fiz2,tz);
1806 fjx2 = _mm_add_pd(fjx2,tx);
1807 fjy2 = _mm_add_pd(fjy2,ty);
1808 fjz2 = _mm_add_pd(fjz2,tz);
1810 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1812 /* Inner loop uses 270 flops */
1815 /* End of innermost loop */
1817 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1818 f+i_coord_offset,fshift+i_shift_offset);
1820 /* Increment number of inner iterations */
1821 inneriter += j_index_end - j_index_start;
1823 /* Outer loop uses 18 flops */
1826 /* Increment number of outer iterations */
1829 /* Update outer/inner flops */
1831 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*270);