2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
99 __m128d dummy_mask,cutoff_mask;
100 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one = _mm_set1_pd(1.0);
102 __m128d two = _mm_set1_pd(2.0);
108 jindex = nlist->jindex;
110 shiftidx = nlist->shift;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_pd(fr->epsfac);
115 charge = mdatoms->chargeA;
116 nvdwtype = fr->ntype;
118 vdwtype = mdatoms->typeA;
120 vftab = kernel_data->table_vdw->data;
121 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
123 /* Setup water-specific parameters */
124 inr = nlist->iinr[0];
125 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
126 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
128 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
130 jq0 = _mm_set1_pd(charge[inr+0]);
131 jq1 = _mm_set1_pd(charge[inr+1]);
132 jq2 = _mm_set1_pd(charge[inr+2]);
133 vdwjidx0A = 2*vdwtype[inr+0];
134 qq00 = _mm_mul_pd(iq0,jq0);
135 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
137 qq01 = _mm_mul_pd(iq0,jq1);
138 qq02 = _mm_mul_pd(iq0,jq2);
139 qq10 = _mm_mul_pd(iq1,jq0);
140 qq11 = _mm_mul_pd(iq1,jq1);
141 qq12 = _mm_mul_pd(iq1,jq2);
142 qq20 = _mm_mul_pd(iq2,jq0);
143 qq21 = _mm_mul_pd(iq2,jq1);
144 qq22 = _mm_mul_pd(iq2,jq2);
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
172 fix0 = _mm_setzero_pd();
173 fiy0 = _mm_setzero_pd();
174 fiz0 = _mm_setzero_pd();
175 fix1 = _mm_setzero_pd();
176 fiy1 = _mm_setzero_pd();
177 fiz1 = _mm_setzero_pd();
178 fix2 = _mm_setzero_pd();
179 fiy2 = _mm_setzero_pd();
180 fiz2 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
184 vvdwsum = _mm_setzero_pd();
186 /* Start inner kernel loop */
187 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
190 /* Get j neighbor index, and coordinate index */
193 j_coord_offsetA = DIM*jnrA;
194 j_coord_offsetB = DIM*jnrB;
196 /* load j atom coordinates */
197 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
198 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
200 /* Calculate displacement vector */
201 dx00 = _mm_sub_pd(ix0,jx0);
202 dy00 = _mm_sub_pd(iy0,jy0);
203 dz00 = _mm_sub_pd(iz0,jz0);
204 dx01 = _mm_sub_pd(ix0,jx1);
205 dy01 = _mm_sub_pd(iy0,jy1);
206 dz01 = _mm_sub_pd(iz0,jz1);
207 dx02 = _mm_sub_pd(ix0,jx2);
208 dy02 = _mm_sub_pd(iy0,jy2);
209 dz02 = _mm_sub_pd(iz0,jz2);
210 dx10 = _mm_sub_pd(ix1,jx0);
211 dy10 = _mm_sub_pd(iy1,jy0);
212 dz10 = _mm_sub_pd(iz1,jz0);
213 dx11 = _mm_sub_pd(ix1,jx1);
214 dy11 = _mm_sub_pd(iy1,jy1);
215 dz11 = _mm_sub_pd(iz1,jz1);
216 dx12 = _mm_sub_pd(ix1,jx2);
217 dy12 = _mm_sub_pd(iy1,jy2);
218 dz12 = _mm_sub_pd(iz1,jz2);
219 dx20 = _mm_sub_pd(ix2,jx0);
220 dy20 = _mm_sub_pd(iy2,jy0);
221 dz20 = _mm_sub_pd(iz2,jz0);
222 dx21 = _mm_sub_pd(ix2,jx1);
223 dy21 = _mm_sub_pd(iy2,jy1);
224 dz21 = _mm_sub_pd(iz2,jz1);
225 dx22 = _mm_sub_pd(ix2,jx2);
226 dy22 = _mm_sub_pd(iy2,jy2);
227 dz22 = _mm_sub_pd(iz2,jz2);
229 /* Calculate squared distance and things based on it */
230 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
232 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
233 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
234 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
235 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
236 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
237 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
238 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
240 rinv00 = gmx_mm_invsqrt_pd(rsq00);
241 rinv01 = gmx_mm_invsqrt_pd(rsq01);
242 rinv02 = gmx_mm_invsqrt_pd(rsq02);
243 rinv10 = gmx_mm_invsqrt_pd(rsq10);
244 rinv11 = gmx_mm_invsqrt_pd(rsq11);
245 rinv12 = gmx_mm_invsqrt_pd(rsq12);
246 rinv20 = gmx_mm_invsqrt_pd(rsq20);
247 rinv21 = gmx_mm_invsqrt_pd(rsq21);
248 rinv22 = gmx_mm_invsqrt_pd(rsq22);
250 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
251 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
252 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
253 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
254 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
255 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
256 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
257 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
258 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
260 fjx0 = _mm_setzero_pd();
261 fjy0 = _mm_setzero_pd();
262 fjz0 = _mm_setzero_pd();
263 fjx1 = _mm_setzero_pd();
264 fjy1 = _mm_setzero_pd();
265 fjz1 = _mm_setzero_pd();
266 fjx2 = _mm_setzero_pd();
267 fjy2 = _mm_setzero_pd();
268 fjz2 = _mm_setzero_pd();
270 /**************************
271 * CALCULATE INTERACTIONS *
272 **************************/
274 r00 = _mm_mul_pd(rsq00,rinv00);
276 /* Calculate table index by multiplying r with table scale and truncate to integer */
277 rt = _mm_mul_pd(r00,vftabscale);
278 vfitab = _mm_cvttpd_epi32(rt);
280 vfeps = _mm_frcz_pd(rt);
282 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
284 twovfeps = _mm_add_pd(vfeps,vfeps);
285 vfitab = _mm_slli_epi32(vfitab,3);
287 /* COULOMB ELECTROSTATICS */
288 velec = _mm_mul_pd(qq00,rinv00);
289 felec = _mm_mul_pd(velec,rinvsq00);
291 /* CUBIC SPLINE TABLE DISPERSION */
292 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
293 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
294 GMX_MM_TRANSPOSE2_PD(Y,F);
295 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
296 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
297 GMX_MM_TRANSPOSE2_PD(G,H);
298 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
299 VV = _mm_macc_pd(vfeps,Fp,Y);
300 vvdw6 = _mm_mul_pd(c6_00,VV);
301 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
302 fvdw6 = _mm_mul_pd(c6_00,FF);
304 /* CUBIC SPLINE TABLE REPULSION */
305 vfitab = _mm_add_epi32(vfitab,ifour);
306 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
307 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
308 GMX_MM_TRANSPOSE2_PD(Y,F);
309 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
310 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
311 GMX_MM_TRANSPOSE2_PD(G,H);
312 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
313 VV = _mm_macc_pd(vfeps,Fp,Y);
314 vvdw12 = _mm_mul_pd(c12_00,VV);
315 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
316 fvdw12 = _mm_mul_pd(c12_00,FF);
317 vvdw = _mm_add_pd(vvdw12,vvdw6);
318 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
320 /* Update potential sum for this i atom from the interaction with this j atom. */
321 velecsum = _mm_add_pd(velecsum,velec);
322 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
324 fscal = _mm_add_pd(felec,fvdw);
326 /* Update vectorial force */
327 fix0 = _mm_macc_pd(dx00,fscal,fix0);
328 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
329 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
331 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
332 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
333 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
335 /**************************
336 * CALCULATE INTERACTIONS *
337 **************************/
339 /* COULOMB ELECTROSTATICS */
340 velec = _mm_mul_pd(qq01,rinv01);
341 felec = _mm_mul_pd(velec,rinvsq01);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velecsum = _mm_add_pd(velecsum,velec);
348 /* Update vectorial force */
349 fix0 = _mm_macc_pd(dx01,fscal,fix0);
350 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
351 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
353 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
354 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
355 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
357 /**************************
358 * CALCULATE INTERACTIONS *
359 **************************/
361 /* COULOMB ELECTROSTATICS */
362 velec = _mm_mul_pd(qq02,rinv02);
363 felec = _mm_mul_pd(velec,rinvsq02);
365 /* Update potential sum for this i atom from the interaction with this j atom. */
366 velecsum = _mm_add_pd(velecsum,velec);
370 /* Update vectorial force */
371 fix0 = _mm_macc_pd(dx02,fscal,fix0);
372 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
373 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
375 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
376 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
377 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 /* COULOMB ELECTROSTATICS */
384 velec = _mm_mul_pd(qq10,rinv10);
385 felec = _mm_mul_pd(velec,rinvsq10);
387 /* Update potential sum for this i atom from the interaction with this j atom. */
388 velecsum = _mm_add_pd(velecsum,velec);
392 /* Update vectorial force */
393 fix1 = _mm_macc_pd(dx10,fscal,fix1);
394 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
395 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
397 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
398 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
399 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
401 /**************************
402 * CALCULATE INTERACTIONS *
403 **************************/
405 /* COULOMB ELECTROSTATICS */
406 velec = _mm_mul_pd(qq11,rinv11);
407 felec = _mm_mul_pd(velec,rinvsq11);
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velecsum = _mm_add_pd(velecsum,velec);
414 /* Update vectorial force */
415 fix1 = _mm_macc_pd(dx11,fscal,fix1);
416 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
417 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
419 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
420 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
421 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
423 /**************************
424 * CALCULATE INTERACTIONS *
425 **************************/
427 /* COULOMB ELECTROSTATICS */
428 velec = _mm_mul_pd(qq12,rinv12);
429 felec = _mm_mul_pd(velec,rinvsq12);
431 /* Update potential sum for this i atom from the interaction with this j atom. */
432 velecsum = _mm_add_pd(velecsum,velec);
436 /* Update vectorial force */
437 fix1 = _mm_macc_pd(dx12,fscal,fix1);
438 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
439 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
441 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
442 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
443 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
449 /* COULOMB ELECTROSTATICS */
450 velec = _mm_mul_pd(qq20,rinv20);
451 felec = _mm_mul_pd(velec,rinvsq20);
453 /* Update potential sum for this i atom from the interaction with this j atom. */
454 velecsum = _mm_add_pd(velecsum,velec);
458 /* Update vectorial force */
459 fix2 = _mm_macc_pd(dx20,fscal,fix2);
460 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
461 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
463 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
464 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
465 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
467 /**************************
468 * CALCULATE INTERACTIONS *
469 **************************/
471 /* COULOMB ELECTROSTATICS */
472 velec = _mm_mul_pd(qq21,rinv21);
473 felec = _mm_mul_pd(velec,rinvsq21);
475 /* Update potential sum for this i atom from the interaction with this j atom. */
476 velecsum = _mm_add_pd(velecsum,velec);
480 /* Update vectorial force */
481 fix2 = _mm_macc_pd(dx21,fscal,fix2);
482 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
483 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
485 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
486 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
487 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 /* COULOMB ELECTROSTATICS */
494 velec = _mm_mul_pd(qq22,rinv22);
495 felec = _mm_mul_pd(velec,rinvsq22);
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velecsum = _mm_add_pd(velecsum,velec);
502 /* Update vectorial force */
503 fix2 = _mm_macc_pd(dx22,fscal,fix2);
504 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
505 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
507 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
508 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
509 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
511 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
513 /* Inner loop uses 314 flops */
520 j_coord_offsetA = DIM*jnrA;
522 /* load j atom coordinates */
523 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
524 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
526 /* Calculate displacement vector */
527 dx00 = _mm_sub_pd(ix0,jx0);
528 dy00 = _mm_sub_pd(iy0,jy0);
529 dz00 = _mm_sub_pd(iz0,jz0);
530 dx01 = _mm_sub_pd(ix0,jx1);
531 dy01 = _mm_sub_pd(iy0,jy1);
532 dz01 = _mm_sub_pd(iz0,jz1);
533 dx02 = _mm_sub_pd(ix0,jx2);
534 dy02 = _mm_sub_pd(iy0,jy2);
535 dz02 = _mm_sub_pd(iz0,jz2);
536 dx10 = _mm_sub_pd(ix1,jx0);
537 dy10 = _mm_sub_pd(iy1,jy0);
538 dz10 = _mm_sub_pd(iz1,jz0);
539 dx11 = _mm_sub_pd(ix1,jx1);
540 dy11 = _mm_sub_pd(iy1,jy1);
541 dz11 = _mm_sub_pd(iz1,jz1);
542 dx12 = _mm_sub_pd(ix1,jx2);
543 dy12 = _mm_sub_pd(iy1,jy2);
544 dz12 = _mm_sub_pd(iz1,jz2);
545 dx20 = _mm_sub_pd(ix2,jx0);
546 dy20 = _mm_sub_pd(iy2,jy0);
547 dz20 = _mm_sub_pd(iz2,jz0);
548 dx21 = _mm_sub_pd(ix2,jx1);
549 dy21 = _mm_sub_pd(iy2,jy1);
550 dz21 = _mm_sub_pd(iz2,jz1);
551 dx22 = _mm_sub_pd(ix2,jx2);
552 dy22 = _mm_sub_pd(iy2,jy2);
553 dz22 = _mm_sub_pd(iz2,jz2);
555 /* Calculate squared distance and things based on it */
556 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
557 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
558 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
559 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
560 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
561 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
562 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
563 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
564 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
566 rinv00 = gmx_mm_invsqrt_pd(rsq00);
567 rinv01 = gmx_mm_invsqrt_pd(rsq01);
568 rinv02 = gmx_mm_invsqrt_pd(rsq02);
569 rinv10 = gmx_mm_invsqrt_pd(rsq10);
570 rinv11 = gmx_mm_invsqrt_pd(rsq11);
571 rinv12 = gmx_mm_invsqrt_pd(rsq12);
572 rinv20 = gmx_mm_invsqrt_pd(rsq20);
573 rinv21 = gmx_mm_invsqrt_pd(rsq21);
574 rinv22 = gmx_mm_invsqrt_pd(rsq22);
576 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
577 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
578 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
579 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
580 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
581 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
582 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
583 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
584 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
586 fjx0 = _mm_setzero_pd();
587 fjy0 = _mm_setzero_pd();
588 fjz0 = _mm_setzero_pd();
589 fjx1 = _mm_setzero_pd();
590 fjy1 = _mm_setzero_pd();
591 fjz1 = _mm_setzero_pd();
592 fjx2 = _mm_setzero_pd();
593 fjy2 = _mm_setzero_pd();
594 fjz2 = _mm_setzero_pd();
596 /**************************
597 * CALCULATE INTERACTIONS *
598 **************************/
600 r00 = _mm_mul_pd(rsq00,rinv00);
602 /* Calculate table index by multiplying r with table scale and truncate to integer */
603 rt = _mm_mul_pd(r00,vftabscale);
604 vfitab = _mm_cvttpd_epi32(rt);
606 vfeps = _mm_frcz_pd(rt);
608 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
610 twovfeps = _mm_add_pd(vfeps,vfeps);
611 vfitab = _mm_slli_epi32(vfitab,3);
613 /* COULOMB ELECTROSTATICS */
614 velec = _mm_mul_pd(qq00,rinv00);
615 felec = _mm_mul_pd(velec,rinvsq00);
617 /* CUBIC SPLINE TABLE DISPERSION */
618 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
619 F = _mm_setzero_pd();
620 GMX_MM_TRANSPOSE2_PD(Y,F);
621 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
622 H = _mm_setzero_pd();
623 GMX_MM_TRANSPOSE2_PD(G,H);
624 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
625 VV = _mm_macc_pd(vfeps,Fp,Y);
626 vvdw6 = _mm_mul_pd(c6_00,VV);
627 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
628 fvdw6 = _mm_mul_pd(c6_00,FF);
630 /* CUBIC SPLINE TABLE REPULSION */
631 vfitab = _mm_add_epi32(vfitab,ifour);
632 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
633 F = _mm_setzero_pd();
634 GMX_MM_TRANSPOSE2_PD(Y,F);
635 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
636 H = _mm_setzero_pd();
637 GMX_MM_TRANSPOSE2_PD(G,H);
638 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
639 VV = _mm_macc_pd(vfeps,Fp,Y);
640 vvdw12 = _mm_mul_pd(c12_00,VV);
641 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
642 fvdw12 = _mm_mul_pd(c12_00,FF);
643 vvdw = _mm_add_pd(vvdw12,vvdw6);
644 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
646 /* Update potential sum for this i atom from the interaction with this j atom. */
647 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
648 velecsum = _mm_add_pd(velecsum,velec);
649 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
650 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
652 fscal = _mm_add_pd(felec,fvdw);
654 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
656 /* Update vectorial force */
657 fix0 = _mm_macc_pd(dx00,fscal,fix0);
658 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
659 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
661 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
662 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
663 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
665 /**************************
666 * CALCULATE INTERACTIONS *
667 **************************/
669 /* COULOMB ELECTROSTATICS */
670 velec = _mm_mul_pd(qq01,rinv01);
671 felec = _mm_mul_pd(velec,rinvsq01);
673 /* Update potential sum for this i atom from the interaction with this j atom. */
674 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
675 velecsum = _mm_add_pd(velecsum,velec);
679 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
681 /* Update vectorial force */
682 fix0 = _mm_macc_pd(dx01,fscal,fix0);
683 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
684 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
686 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
687 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
688 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
690 /**************************
691 * CALCULATE INTERACTIONS *
692 **************************/
694 /* COULOMB ELECTROSTATICS */
695 velec = _mm_mul_pd(qq02,rinv02);
696 felec = _mm_mul_pd(velec,rinvsq02);
698 /* Update potential sum for this i atom from the interaction with this j atom. */
699 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
700 velecsum = _mm_add_pd(velecsum,velec);
704 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
706 /* Update vectorial force */
707 fix0 = _mm_macc_pd(dx02,fscal,fix0);
708 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
709 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
711 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
712 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
713 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
715 /**************************
716 * CALCULATE INTERACTIONS *
717 **************************/
719 /* COULOMB ELECTROSTATICS */
720 velec = _mm_mul_pd(qq10,rinv10);
721 felec = _mm_mul_pd(velec,rinvsq10);
723 /* Update potential sum for this i atom from the interaction with this j atom. */
724 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
725 velecsum = _mm_add_pd(velecsum,velec);
729 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
731 /* Update vectorial force */
732 fix1 = _mm_macc_pd(dx10,fscal,fix1);
733 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
734 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
736 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
737 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
738 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
740 /**************************
741 * CALCULATE INTERACTIONS *
742 **************************/
744 /* COULOMB ELECTROSTATICS */
745 velec = _mm_mul_pd(qq11,rinv11);
746 felec = _mm_mul_pd(velec,rinvsq11);
748 /* Update potential sum for this i atom from the interaction with this j atom. */
749 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
750 velecsum = _mm_add_pd(velecsum,velec);
754 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
756 /* Update vectorial force */
757 fix1 = _mm_macc_pd(dx11,fscal,fix1);
758 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
759 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
761 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
762 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
763 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
765 /**************************
766 * CALCULATE INTERACTIONS *
767 **************************/
769 /* COULOMB ELECTROSTATICS */
770 velec = _mm_mul_pd(qq12,rinv12);
771 felec = _mm_mul_pd(velec,rinvsq12);
773 /* Update potential sum for this i atom from the interaction with this j atom. */
774 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
775 velecsum = _mm_add_pd(velecsum,velec);
779 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
781 /* Update vectorial force */
782 fix1 = _mm_macc_pd(dx12,fscal,fix1);
783 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
784 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
786 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
787 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
788 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
790 /**************************
791 * CALCULATE INTERACTIONS *
792 **************************/
794 /* COULOMB ELECTROSTATICS */
795 velec = _mm_mul_pd(qq20,rinv20);
796 felec = _mm_mul_pd(velec,rinvsq20);
798 /* Update potential sum for this i atom from the interaction with this j atom. */
799 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
800 velecsum = _mm_add_pd(velecsum,velec);
804 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
806 /* Update vectorial force */
807 fix2 = _mm_macc_pd(dx20,fscal,fix2);
808 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
809 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
811 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
812 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
813 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
815 /**************************
816 * CALCULATE INTERACTIONS *
817 **************************/
819 /* COULOMB ELECTROSTATICS */
820 velec = _mm_mul_pd(qq21,rinv21);
821 felec = _mm_mul_pd(velec,rinvsq21);
823 /* Update potential sum for this i atom from the interaction with this j atom. */
824 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
825 velecsum = _mm_add_pd(velecsum,velec);
829 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
831 /* Update vectorial force */
832 fix2 = _mm_macc_pd(dx21,fscal,fix2);
833 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
834 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
836 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
837 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
838 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
840 /**************************
841 * CALCULATE INTERACTIONS *
842 **************************/
844 /* COULOMB ELECTROSTATICS */
845 velec = _mm_mul_pd(qq22,rinv22);
846 felec = _mm_mul_pd(velec,rinvsq22);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
850 velecsum = _mm_add_pd(velecsum,velec);
854 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
856 /* Update vectorial force */
857 fix2 = _mm_macc_pd(dx22,fscal,fix2);
858 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
859 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
861 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
862 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
863 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
865 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
867 /* Inner loop uses 314 flops */
870 /* End of innermost loop */
872 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
873 f+i_coord_offset,fshift+i_shift_offset);
876 /* Update potential energies */
877 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
878 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
880 /* Increment number of inner iterations */
881 inneriter += j_index_end - j_index_start;
883 /* Outer loop uses 20 flops */
886 /* Increment number of outer iterations */
889 /* Update outer/inner flops */
891 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*314);
894 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_double
895 * Electrostatics interaction: Coulomb
896 * VdW interaction: CubicSplineTable
897 * Geometry: Water3-Water3
898 * Calculate force/pot: Force
901 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_double
902 (t_nblist * gmx_restrict nlist,
903 rvec * gmx_restrict xx,
904 rvec * gmx_restrict ff,
905 t_forcerec * gmx_restrict fr,
906 t_mdatoms * gmx_restrict mdatoms,
907 nb_kernel_data_t * gmx_restrict kernel_data,
908 t_nrnb * gmx_restrict nrnb)
910 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
911 * just 0 for non-waters.
912 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
913 * jnr indices corresponding to data put in the four positions in the SIMD register.
915 int i_shift_offset,i_coord_offset,outeriter,inneriter;
916 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
918 int j_coord_offsetA,j_coord_offsetB;
919 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
921 real *shiftvec,*fshift,*x,*f;
922 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
924 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
926 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
928 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
929 int vdwjidx0A,vdwjidx0B;
930 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
931 int vdwjidx1A,vdwjidx1B;
932 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
933 int vdwjidx2A,vdwjidx2B;
934 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
935 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
936 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
937 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
938 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
939 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
940 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
941 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
942 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
943 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
944 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
947 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
950 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
951 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
953 __m128i ifour = _mm_set1_epi32(4);
954 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
956 __m128d dummy_mask,cutoff_mask;
957 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
958 __m128d one = _mm_set1_pd(1.0);
959 __m128d two = _mm_set1_pd(2.0);
965 jindex = nlist->jindex;
967 shiftidx = nlist->shift;
969 shiftvec = fr->shift_vec[0];
970 fshift = fr->fshift[0];
971 facel = _mm_set1_pd(fr->epsfac);
972 charge = mdatoms->chargeA;
973 nvdwtype = fr->ntype;
975 vdwtype = mdatoms->typeA;
977 vftab = kernel_data->table_vdw->data;
978 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
980 /* Setup water-specific parameters */
981 inr = nlist->iinr[0];
982 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
983 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
984 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
985 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
987 jq0 = _mm_set1_pd(charge[inr+0]);
988 jq1 = _mm_set1_pd(charge[inr+1]);
989 jq2 = _mm_set1_pd(charge[inr+2]);
990 vdwjidx0A = 2*vdwtype[inr+0];
991 qq00 = _mm_mul_pd(iq0,jq0);
992 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
993 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
994 qq01 = _mm_mul_pd(iq0,jq1);
995 qq02 = _mm_mul_pd(iq0,jq2);
996 qq10 = _mm_mul_pd(iq1,jq0);
997 qq11 = _mm_mul_pd(iq1,jq1);
998 qq12 = _mm_mul_pd(iq1,jq2);
999 qq20 = _mm_mul_pd(iq2,jq0);
1000 qq21 = _mm_mul_pd(iq2,jq1);
1001 qq22 = _mm_mul_pd(iq2,jq2);
1003 /* Avoid stupid compiler warnings */
1005 j_coord_offsetA = 0;
1006 j_coord_offsetB = 0;
1011 /* Start outer loop over neighborlists */
1012 for(iidx=0; iidx<nri; iidx++)
1014 /* Load shift vector for this list */
1015 i_shift_offset = DIM*shiftidx[iidx];
1017 /* Load limits for loop over neighbors */
1018 j_index_start = jindex[iidx];
1019 j_index_end = jindex[iidx+1];
1021 /* Get outer coordinate index */
1023 i_coord_offset = DIM*inr;
1025 /* Load i particle coords and add shift vector */
1026 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1027 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1029 fix0 = _mm_setzero_pd();
1030 fiy0 = _mm_setzero_pd();
1031 fiz0 = _mm_setzero_pd();
1032 fix1 = _mm_setzero_pd();
1033 fiy1 = _mm_setzero_pd();
1034 fiz1 = _mm_setzero_pd();
1035 fix2 = _mm_setzero_pd();
1036 fiy2 = _mm_setzero_pd();
1037 fiz2 = _mm_setzero_pd();
1039 /* Start inner kernel loop */
1040 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1043 /* Get j neighbor index, and coordinate index */
1045 jnrB = jjnr[jidx+1];
1046 j_coord_offsetA = DIM*jnrA;
1047 j_coord_offsetB = DIM*jnrB;
1049 /* load j atom coordinates */
1050 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1051 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1053 /* Calculate displacement vector */
1054 dx00 = _mm_sub_pd(ix0,jx0);
1055 dy00 = _mm_sub_pd(iy0,jy0);
1056 dz00 = _mm_sub_pd(iz0,jz0);
1057 dx01 = _mm_sub_pd(ix0,jx1);
1058 dy01 = _mm_sub_pd(iy0,jy1);
1059 dz01 = _mm_sub_pd(iz0,jz1);
1060 dx02 = _mm_sub_pd(ix0,jx2);
1061 dy02 = _mm_sub_pd(iy0,jy2);
1062 dz02 = _mm_sub_pd(iz0,jz2);
1063 dx10 = _mm_sub_pd(ix1,jx0);
1064 dy10 = _mm_sub_pd(iy1,jy0);
1065 dz10 = _mm_sub_pd(iz1,jz0);
1066 dx11 = _mm_sub_pd(ix1,jx1);
1067 dy11 = _mm_sub_pd(iy1,jy1);
1068 dz11 = _mm_sub_pd(iz1,jz1);
1069 dx12 = _mm_sub_pd(ix1,jx2);
1070 dy12 = _mm_sub_pd(iy1,jy2);
1071 dz12 = _mm_sub_pd(iz1,jz2);
1072 dx20 = _mm_sub_pd(ix2,jx0);
1073 dy20 = _mm_sub_pd(iy2,jy0);
1074 dz20 = _mm_sub_pd(iz2,jz0);
1075 dx21 = _mm_sub_pd(ix2,jx1);
1076 dy21 = _mm_sub_pd(iy2,jy1);
1077 dz21 = _mm_sub_pd(iz2,jz1);
1078 dx22 = _mm_sub_pd(ix2,jx2);
1079 dy22 = _mm_sub_pd(iy2,jy2);
1080 dz22 = _mm_sub_pd(iz2,jz2);
1082 /* Calculate squared distance and things based on it */
1083 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1084 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1085 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1086 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1087 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1088 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1089 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1090 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1091 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1093 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1094 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1095 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1096 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1097 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1098 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1099 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1100 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1101 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1103 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1104 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1105 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1106 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1107 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1108 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1109 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1110 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1111 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1113 fjx0 = _mm_setzero_pd();
1114 fjy0 = _mm_setzero_pd();
1115 fjz0 = _mm_setzero_pd();
1116 fjx1 = _mm_setzero_pd();
1117 fjy1 = _mm_setzero_pd();
1118 fjz1 = _mm_setzero_pd();
1119 fjx2 = _mm_setzero_pd();
1120 fjy2 = _mm_setzero_pd();
1121 fjz2 = _mm_setzero_pd();
1123 /**************************
1124 * CALCULATE INTERACTIONS *
1125 **************************/
1127 r00 = _mm_mul_pd(rsq00,rinv00);
1129 /* Calculate table index by multiplying r with table scale and truncate to integer */
1130 rt = _mm_mul_pd(r00,vftabscale);
1131 vfitab = _mm_cvttpd_epi32(rt);
1133 vfeps = _mm_frcz_pd(rt);
1135 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1137 twovfeps = _mm_add_pd(vfeps,vfeps);
1138 vfitab = _mm_slli_epi32(vfitab,3);
1140 /* COULOMB ELECTROSTATICS */
1141 velec = _mm_mul_pd(qq00,rinv00);
1142 felec = _mm_mul_pd(velec,rinvsq00);
1144 /* CUBIC SPLINE TABLE DISPERSION */
1145 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1146 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1147 GMX_MM_TRANSPOSE2_PD(Y,F);
1148 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1149 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1150 GMX_MM_TRANSPOSE2_PD(G,H);
1151 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1152 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1153 fvdw6 = _mm_mul_pd(c6_00,FF);
1155 /* CUBIC SPLINE TABLE REPULSION */
1156 vfitab = _mm_add_epi32(vfitab,ifour);
1157 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1158 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1159 GMX_MM_TRANSPOSE2_PD(Y,F);
1160 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1161 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1162 GMX_MM_TRANSPOSE2_PD(G,H);
1163 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1164 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1165 fvdw12 = _mm_mul_pd(c12_00,FF);
1166 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1168 fscal = _mm_add_pd(felec,fvdw);
1170 /* Update vectorial force */
1171 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1172 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1173 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1175 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1176 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1177 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1179 /**************************
1180 * CALCULATE INTERACTIONS *
1181 **************************/
1183 /* COULOMB ELECTROSTATICS */
1184 velec = _mm_mul_pd(qq01,rinv01);
1185 felec = _mm_mul_pd(velec,rinvsq01);
1189 /* Update vectorial force */
1190 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1191 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1192 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1194 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1195 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1196 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1198 /**************************
1199 * CALCULATE INTERACTIONS *
1200 **************************/
1202 /* COULOMB ELECTROSTATICS */
1203 velec = _mm_mul_pd(qq02,rinv02);
1204 felec = _mm_mul_pd(velec,rinvsq02);
1208 /* Update vectorial force */
1209 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1210 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1211 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1213 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1214 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1215 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1217 /**************************
1218 * CALCULATE INTERACTIONS *
1219 **************************/
1221 /* COULOMB ELECTROSTATICS */
1222 velec = _mm_mul_pd(qq10,rinv10);
1223 felec = _mm_mul_pd(velec,rinvsq10);
1227 /* Update vectorial force */
1228 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1229 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1230 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1232 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1233 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1234 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1236 /**************************
1237 * CALCULATE INTERACTIONS *
1238 **************************/
1240 /* COULOMB ELECTROSTATICS */
1241 velec = _mm_mul_pd(qq11,rinv11);
1242 felec = _mm_mul_pd(velec,rinvsq11);
1246 /* Update vectorial force */
1247 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1248 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1249 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1251 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1252 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1253 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1255 /**************************
1256 * CALCULATE INTERACTIONS *
1257 **************************/
1259 /* COULOMB ELECTROSTATICS */
1260 velec = _mm_mul_pd(qq12,rinv12);
1261 felec = _mm_mul_pd(velec,rinvsq12);
1265 /* Update vectorial force */
1266 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1267 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1268 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1270 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1271 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1272 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1274 /**************************
1275 * CALCULATE INTERACTIONS *
1276 **************************/
1278 /* COULOMB ELECTROSTATICS */
1279 velec = _mm_mul_pd(qq20,rinv20);
1280 felec = _mm_mul_pd(velec,rinvsq20);
1284 /* Update vectorial force */
1285 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1286 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1287 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1289 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1290 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1291 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 /* COULOMB ELECTROSTATICS */
1298 velec = _mm_mul_pd(qq21,rinv21);
1299 felec = _mm_mul_pd(velec,rinvsq21);
1303 /* Update vectorial force */
1304 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1305 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1306 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1308 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1309 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1310 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1312 /**************************
1313 * CALCULATE INTERACTIONS *
1314 **************************/
1316 /* COULOMB ELECTROSTATICS */
1317 velec = _mm_mul_pd(qq22,rinv22);
1318 felec = _mm_mul_pd(velec,rinvsq22);
1322 /* Update vectorial force */
1323 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1324 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1325 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1327 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1328 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1329 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1331 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1333 /* Inner loop uses 297 flops */
1336 if(jidx<j_index_end)
1340 j_coord_offsetA = DIM*jnrA;
1342 /* load j atom coordinates */
1343 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1344 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1346 /* Calculate displacement vector */
1347 dx00 = _mm_sub_pd(ix0,jx0);
1348 dy00 = _mm_sub_pd(iy0,jy0);
1349 dz00 = _mm_sub_pd(iz0,jz0);
1350 dx01 = _mm_sub_pd(ix0,jx1);
1351 dy01 = _mm_sub_pd(iy0,jy1);
1352 dz01 = _mm_sub_pd(iz0,jz1);
1353 dx02 = _mm_sub_pd(ix0,jx2);
1354 dy02 = _mm_sub_pd(iy0,jy2);
1355 dz02 = _mm_sub_pd(iz0,jz2);
1356 dx10 = _mm_sub_pd(ix1,jx0);
1357 dy10 = _mm_sub_pd(iy1,jy0);
1358 dz10 = _mm_sub_pd(iz1,jz0);
1359 dx11 = _mm_sub_pd(ix1,jx1);
1360 dy11 = _mm_sub_pd(iy1,jy1);
1361 dz11 = _mm_sub_pd(iz1,jz1);
1362 dx12 = _mm_sub_pd(ix1,jx2);
1363 dy12 = _mm_sub_pd(iy1,jy2);
1364 dz12 = _mm_sub_pd(iz1,jz2);
1365 dx20 = _mm_sub_pd(ix2,jx0);
1366 dy20 = _mm_sub_pd(iy2,jy0);
1367 dz20 = _mm_sub_pd(iz2,jz0);
1368 dx21 = _mm_sub_pd(ix2,jx1);
1369 dy21 = _mm_sub_pd(iy2,jy1);
1370 dz21 = _mm_sub_pd(iz2,jz1);
1371 dx22 = _mm_sub_pd(ix2,jx2);
1372 dy22 = _mm_sub_pd(iy2,jy2);
1373 dz22 = _mm_sub_pd(iz2,jz2);
1375 /* Calculate squared distance and things based on it */
1376 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1377 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1378 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1379 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1380 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1381 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1382 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1383 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1384 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1386 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1387 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1388 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1389 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1390 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1391 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1392 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1393 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1394 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1396 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1397 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1398 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1399 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1400 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1401 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1402 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1403 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1404 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1406 fjx0 = _mm_setzero_pd();
1407 fjy0 = _mm_setzero_pd();
1408 fjz0 = _mm_setzero_pd();
1409 fjx1 = _mm_setzero_pd();
1410 fjy1 = _mm_setzero_pd();
1411 fjz1 = _mm_setzero_pd();
1412 fjx2 = _mm_setzero_pd();
1413 fjy2 = _mm_setzero_pd();
1414 fjz2 = _mm_setzero_pd();
1416 /**************************
1417 * CALCULATE INTERACTIONS *
1418 **************************/
1420 r00 = _mm_mul_pd(rsq00,rinv00);
1422 /* Calculate table index by multiplying r with table scale and truncate to integer */
1423 rt = _mm_mul_pd(r00,vftabscale);
1424 vfitab = _mm_cvttpd_epi32(rt);
1426 vfeps = _mm_frcz_pd(rt);
1428 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1430 twovfeps = _mm_add_pd(vfeps,vfeps);
1431 vfitab = _mm_slli_epi32(vfitab,3);
1433 /* COULOMB ELECTROSTATICS */
1434 velec = _mm_mul_pd(qq00,rinv00);
1435 felec = _mm_mul_pd(velec,rinvsq00);
1437 /* CUBIC SPLINE TABLE DISPERSION */
1438 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1439 F = _mm_setzero_pd();
1440 GMX_MM_TRANSPOSE2_PD(Y,F);
1441 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1442 H = _mm_setzero_pd();
1443 GMX_MM_TRANSPOSE2_PD(G,H);
1444 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1445 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1446 fvdw6 = _mm_mul_pd(c6_00,FF);
1448 /* CUBIC SPLINE TABLE REPULSION */
1449 vfitab = _mm_add_epi32(vfitab,ifour);
1450 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1451 F = _mm_setzero_pd();
1452 GMX_MM_TRANSPOSE2_PD(Y,F);
1453 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1454 H = _mm_setzero_pd();
1455 GMX_MM_TRANSPOSE2_PD(G,H);
1456 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1457 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1458 fvdw12 = _mm_mul_pd(c12_00,FF);
1459 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1461 fscal = _mm_add_pd(felec,fvdw);
1463 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1465 /* Update vectorial force */
1466 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1467 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1468 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1470 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1471 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1472 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1474 /**************************
1475 * CALCULATE INTERACTIONS *
1476 **************************/
1478 /* COULOMB ELECTROSTATICS */
1479 velec = _mm_mul_pd(qq01,rinv01);
1480 felec = _mm_mul_pd(velec,rinvsq01);
1484 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1486 /* Update vectorial force */
1487 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1488 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1489 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1491 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1492 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1493 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1495 /**************************
1496 * CALCULATE INTERACTIONS *
1497 **************************/
1499 /* COULOMB ELECTROSTATICS */
1500 velec = _mm_mul_pd(qq02,rinv02);
1501 felec = _mm_mul_pd(velec,rinvsq02);
1505 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1507 /* Update vectorial force */
1508 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1509 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1510 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1512 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1513 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1514 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1516 /**************************
1517 * CALCULATE INTERACTIONS *
1518 **************************/
1520 /* COULOMB ELECTROSTATICS */
1521 velec = _mm_mul_pd(qq10,rinv10);
1522 felec = _mm_mul_pd(velec,rinvsq10);
1526 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1528 /* Update vectorial force */
1529 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1530 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1531 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1533 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1534 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1535 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1537 /**************************
1538 * CALCULATE INTERACTIONS *
1539 **************************/
1541 /* COULOMB ELECTROSTATICS */
1542 velec = _mm_mul_pd(qq11,rinv11);
1543 felec = _mm_mul_pd(velec,rinvsq11);
1547 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1549 /* Update vectorial force */
1550 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1551 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1552 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1554 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1555 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1556 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1558 /**************************
1559 * CALCULATE INTERACTIONS *
1560 **************************/
1562 /* COULOMB ELECTROSTATICS */
1563 velec = _mm_mul_pd(qq12,rinv12);
1564 felec = _mm_mul_pd(velec,rinvsq12);
1568 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1570 /* Update vectorial force */
1571 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1572 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1573 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1575 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1576 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1577 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1579 /**************************
1580 * CALCULATE INTERACTIONS *
1581 **************************/
1583 /* COULOMB ELECTROSTATICS */
1584 velec = _mm_mul_pd(qq20,rinv20);
1585 felec = _mm_mul_pd(velec,rinvsq20);
1589 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1591 /* Update vectorial force */
1592 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1593 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1594 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1596 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1597 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1598 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1600 /**************************
1601 * CALCULATE INTERACTIONS *
1602 **************************/
1604 /* COULOMB ELECTROSTATICS */
1605 velec = _mm_mul_pd(qq21,rinv21);
1606 felec = _mm_mul_pd(velec,rinvsq21);
1610 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1612 /* Update vectorial force */
1613 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1614 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1615 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1617 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1618 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1619 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 /* COULOMB ELECTROSTATICS */
1626 velec = _mm_mul_pd(qq22,rinv22);
1627 felec = _mm_mul_pd(velec,rinvsq22);
1631 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1633 /* Update vectorial force */
1634 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1635 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1636 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1638 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1639 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1640 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1642 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1644 /* Inner loop uses 297 flops */
1647 /* End of innermost loop */
1649 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1650 f+i_coord_offset,fshift+i_shift_offset);
1652 /* Increment number of inner iterations */
1653 inneriter += j_index_end - j_index_start;
1655 /* Outer loop uses 18 flops */
1658 /* Increment number of outer iterations */
1661 /* Update outer/inner flops */
1663 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);