2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
99 __m128d dummy_mask,cutoff_mask;
100 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one = _mm_set1_pd(1.0);
102 __m128d two = _mm_set1_pd(2.0);
108 jindex = nlist->jindex;
110 shiftidx = nlist->shift;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_pd(fr->epsfac);
115 charge = mdatoms->chargeA;
116 nvdwtype = fr->ntype;
118 vdwtype = mdatoms->typeA;
120 vftab = kernel_data->table_elec->data;
121 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
123 /* Setup water-specific parameters */
124 inr = nlist->iinr[0];
125 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
126 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
128 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
130 jq0 = _mm_set1_pd(charge[inr+0]);
131 jq1 = _mm_set1_pd(charge[inr+1]);
132 jq2 = _mm_set1_pd(charge[inr+2]);
133 vdwjidx0A = 2*vdwtype[inr+0];
134 qq00 = _mm_mul_pd(iq0,jq0);
135 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
137 qq01 = _mm_mul_pd(iq0,jq1);
138 qq02 = _mm_mul_pd(iq0,jq2);
139 qq10 = _mm_mul_pd(iq1,jq0);
140 qq11 = _mm_mul_pd(iq1,jq1);
141 qq12 = _mm_mul_pd(iq1,jq2);
142 qq20 = _mm_mul_pd(iq2,jq0);
143 qq21 = _mm_mul_pd(iq2,jq1);
144 qq22 = _mm_mul_pd(iq2,jq2);
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
172 fix0 = _mm_setzero_pd();
173 fiy0 = _mm_setzero_pd();
174 fiz0 = _mm_setzero_pd();
175 fix1 = _mm_setzero_pd();
176 fiy1 = _mm_setzero_pd();
177 fiz1 = _mm_setzero_pd();
178 fix2 = _mm_setzero_pd();
179 fiy2 = _mm_setzero_pd();
180 fiz2 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
184 vvdwsum = _mm_setzero_pd();
186 /* Start inner kernel loop */
187 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
190 /* Get j neighbor index, and coordinate index */
193 j_coord_offsetA = DIM*jnrA;
194 j_coord_offsetB = DIM*jnrB;
196 /* load j atom coordinates */
197 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
198 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
200 /* Calculate displacement vector */
201 dx00 = _mm_sub_pd(ix0,jx0);
202 dy00 = _mm_sub_pd(iy0,jy0);
203 dz00 = _mm_sub_pd(iz0,jz0);
204 dx01 = _mm_sub_pd(ix0,jx1);
205 dy01 = _mm_sub_pd(iy0,jy1);
206 dz01 = _mm_sub_pd(iz0,jz1);
207 dx02 = _mm_sub_pd(ix0,jx2);
208 dy02 = _mm_sub_pd(iy0,jy2);
209 dz02 = _mm_sub_pd(iz0,jz2);
210 dx10 = _mm_sub_pd(ix1,jx0);
211 dy10 = _mm_sub_pd(iy1,jy0);
212 dz10 = _mm_sub_pd(iz1,jz0);
213 dx11 = _mm_sub_pd(ix1,jx1);
214 dy11 = _mm_sub_pd(iy1,jy1);
215 dz11 = _mm_sub_pd(iz1,jz1);
216 dx12 = _mm_sub_pd(ix1,jx2);
217 dy12 = _mm_sub_pd(iy1,jy2);
218 dz12 = _mm_sub_pd(iz1,jz2);
219 dx20 = _mm_sub_pd(ix2,jx0);
220 dy20 = _mm_sub_pd(iy2,jy0);
221 dz20 = _mm_sub_pd(iz2,jz0);
222 dx21 = _mm_sub_pd(ix2,jx1);
223 dy21 = _mm_sub_pd(iy2,jy1);
224 dz21 = _mm_sub_pd(iz2,jz1);
225 dx22 = _mm_sub_pd(ix2,jx2);
226 dy22 = _mm_sub_pd(iy2,jy2);
227 dz22 = _mm_sub_pd(iz2,jz2);
229 /* Calculate squared distance and things based on it */
230 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
232 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
233 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
234 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
235 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
236 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
237 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
238 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
240 rinv00 = gmx_mm_invsqrt_pd(rsq00);
241 rinv01 = gmx_mm_invsqrt_pd(rsq01);
242 rinv02 = gmx_mm_invsqrt_pd(rsq02);
243 rinv10 = gmx_mm_invsqrt_pd(rsq10);
244 rinv11 = gmx_mm_invsqrt_pd(rsq11);
245 rinv12 = gmx_mm_invsqrt_pd(rsq12);
246 rinv20 = gmx_mm_invsqrt_pd(rsq20);
247 rinv21 = gmx_mm_invsqrt_pd(rsq21);
248 rinv22 = gmx_mm_invsqrt_pd(rsq22);
250 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
252 fjx0 = _mm_setzero_pd();
253 fjy0 = _mm_setzero_pd();
254 fjz0 = _mm_setzero_pd();
255 fjx1 = _mm_setzero_pd();
256 fjy1 = _mm_setzero_pd();
257 fjz1 = _mm_setzero_pd();
258 fjx2 = _mm_setzero_pd();
259 fjy2 = _mm_setzero_pd();
260 fjz2 = _mm_setzero_pd();
262 /**************************
263 * CALCULATE INTERACTIONS *
264 **************************/
266 r00 = _mm_mul_pd(rsq00,rinv00);
268 /* Calculate table index by multiplying r with table scale and truncate to integer */
269 rt = _mm_mul_pd(r00,vftabscale);
270 vfitab = _mm_cvttpd_epi32(rt);
272 vfeps = _mm_frcz_pd(rt);
274 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
276 twovfeps = _mm_add_pd(vfeps,vfeps);
277 vfitab = _mm_slli_epi32(vfitab,2);
279 /* CUBIC SPLINE TABLE ELECTROSTATICS */
280 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
281 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
282 GMX_MM_TRANSPOSE2_PD(Y,F);
283 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
284 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
285 GMX_MM_TRANSPOSE2_PD(G,H);
286 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
287 VV = _mm_macc_pd(vfeps,Fp,Y);
288 velec = _mm_mul_pd(qq00,VV);
289 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
290 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
292 /* LENNARD-JONES DISPERSION/REPULSION */
294 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
295 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
296 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
297 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
298 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
300 /* Update potential sum for this i atom from the interaction with this j atom. */
301 velecsum = _mm_add_pd(velecsum,velec);
302 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
304 fscal = _mm_add_pd(felec,fvdw);
306 /* Update vectorial force */
307 fix0 = _mm_macc_pd(dx00,fscal,fix0);
308 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
309 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
311 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
312 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
313 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
315 /**************************
316 * CALCULATE INTERACTIONS *
317 **************************/
319 r01 = _mm_mul_pd(rsq01,rinv01);
321 /* Calculate table index by multiplying r with table scale and truncate to integer */
322 rt = _mm_mul_pd(r01,vftabscale);
323 vfitab = _mm_cvttpd_epi32(rt);
325 vfeps = _mm_frcz_pd(rt);
327 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
329 twovfeps = _mm_add_pd(vfeps,vfeps);
330 vfitab = _mm_slli_epi32(vfitab,2);
332 /* CUBIC SPLINE TABLE ELECTROSTATICS */
333 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
334 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
335 GMX_MM_TRANSPOSE2_PD(Y,F);
336 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
337 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
338 GMX_MM_TRANSPOSE2_PD(G,H);
339 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
340 VV = _mm_macc_pd(vfeps,Fp,Y);
341 velec = _mm_mul_pd(qq01,VV);
342 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
343 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velecsum = _mm_add_pd(velecsum,velec);
350 /* Update vectorial force */
351 fix0 = _mm_macc_pd(dx01,fscal,fix0);
352 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
353 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
355 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
356 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
357 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
359 /**************************
360 * CALCULATE INTERACTIONS *
361 **************************/
363 r02 = _mm_mul_pd(rsq02,rinv02);
365 /* Calculate table index by multiplying r with table scale and truncate to integer */
366 rt = _mm_mul_pd(r02,vftabscale);
367 vfitab = _mm_cvttpd_epi32(rt);
369 vfeps = _mm_frcz_pd(rt);
371 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
373 twovfeps = _mm_add_pd(vfeps,vfeps);
374 vfitab = _mm_slli_epi32(vfitab,2);
376 /* CUBIC SPLINE TABLE ELECTROSTATICS */
377 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
378 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
379 GMX_MM_TRANSPOSE2_PD(Y,F);
380 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
381 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
382 GMX_MM_TRANSPOSE2_PD(G,H);
383 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
384 VV = _mm_macc_pd(vfeps,Fp,Y);
385 velec = _mm_mul_pd(qq02,VV);
386 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
387 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velecsum = _mm_add_pd(velecsum,velec);
394 /* Update vectorial force */
395 fix0 = _mm_macc_pd(dx02,fscal,fix0);
396 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
397 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
399 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
400 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
401 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
403 /**************************
404 * CALCULATE INTERACTIONS *
405 **************************/
407 r10 = _mm_mul_pd(rsq10,rinv10);
409 /* Calculate table index by multiplying r with table scale and truncate to integer */
410 rt = _mm_mul_pd(r10,vftabscale);
411 vfitab = _mm_cvttpd_epi32(rt);
413 vfeps = _mm_frcz_pd(rt);
415 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
417 twovfeps = _mm_add_pd(vfeps,vfeps);
418 vfitab = _mm_slli_epi32(vfitab,2);
420 /* CUBIC SPLINE TABLE ELECTROSTATICS */
421 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
422 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
423 GMX_MM_TRANSPOSE2_PD(Y,F);
424 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
425 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
426 GMX_MM_TRANSPOSE2_PD(G,H);
427 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
428 VV = _mm_macc_pd(vfeps,Fp,Y);
429 velec = _mm_mul_pd(qq10,VV);
430 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
431 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velecsum = _mm_add_pd(velecsum,velec);
438 /* Update vectorial force */
439 fix1 = _mm_macc_pd(dx10,fscal,fix1);
440 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
441 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
443 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
444 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
445 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 r11 = _mm_mul_pd(rsq11,rinv11);
453 /* Calculate table index by multiplying r with table scale and truncate to integer */
454 rt = _mm_mul_pd(r11,vftabscale);
455 vfitab = _mm_cvttpd_epi32(rt);
457 vfeps = _mm_frcz_pd(rt);
459 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
461 twovfeps = _mm_add_pd(vfeps,vfeps);
462 vfitab = _mm_slli_epi32(vfitab,2);
464 /* CUBIC SPLINE TABLE ELECTROSTATICS */
465 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
466 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
467 GMX_MM_TRANSPOSE2_PD(Y,F);
468 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
469 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
470 GMX_MM_TRANSPOSE2_PD(G,H);
471 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
472 VV = _mm_macc_pd(vfeps,Fp,Y);
473 velec = _mm_mul_pd(qq11,VV);
474 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
475 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velecsum = _mm_add_pd(velecsum,velec);
482 /* Update vectorial force */
483 fix1 = _mm_macc_pd(dx11,fscal,fix1);
484 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
485 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
487 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
488 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
489 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 r12 = _mm_mul_pd(rsq12,rinv12);
497 /* Calculate table index by multiplying r with table scale and truncate to integer */
498 rt = _mm_mul_pd(r12,vftabscale);
499 vfitab = _mm_cvttpd_epi32(rt);
501 vfeps = _mm_frcz_pd(rt);
503 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
505 twovfeps = _mm_add_pd(vfeps,vfeps);
506 vfitab = _mm_slli_epi32(vfitab,2);
508 /* CUBIC SPLINE TABLE ELECTROSTATICS */
509 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
510 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
511 GMX_MM_TRANSPOSE2_PD(Y,F);
512 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
513 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
514 GMX_MM_TRANSPOSE2_PD(G,H);
515 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
516 VV = _mm_macc_pd(vfeps,Fp,Y);
517 velec = _mm_mul_pd(qq12,VV);
518 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
519 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
521 /* Update potential sum for this i atom from the interaction with this j atom. */
522 velecsum = _mm_add_pd(velecsum,velec);
526 /* Update vectorial force */
527 fix1 = _mm_macc_pd(dx12,fscal,fix1);
528 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
529 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
531 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
532 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
533 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
535 /**************************
536 * CALCULATE INTERACTIONS *
537 **************************/
539 r20 = _mm_mul_pd(rsq20,rinv20);
541 /* Calculate table index by multiplying r with table scale and truncate to integer */
542 rt = _mm_mul_pd(r20,vftabscale);
543 vfitab = _mm_cvttpd_epi32(rt);
545 vfeps = _mm_frcz_pd(rt);
547 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
549 twovfeps = _mm_add_pd(vfeps,vfeps);
550 vfitab = _mm_slli_epi32(vfitab,2);
552 /* CUBIC SPLINE TABLE ELECTROSTATICS */
553 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
554 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
555 GMX_MM_TRANSPOSE2_PD(Y,F);
556 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
557 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
558 GMX_MM_TRANSPOSE2_PD(G,H);
559 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
560 VV = _mm_macc_pd(vfeps,Fp,Y);
561 velec = _mm_mul_pd(qq20,VV);
562 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
563 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
565 /* Update potential sum for this i atom from the interaction with this j atom. */
566 velecsum = _mm_add_pd(velecsum,velec);
570 /* Update vectorial force */
571 fix2 = _mm_macc_pd(dx20,fscal,fix2);
572 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
573 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
575 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
576 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
577 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 r21 = _mm_mul_pd(rsq21,rinv21);
585 /* Calculate table index by multiplying r with table scale and truncate to integer */
586 rt = _mm_mul_pd(r21,vftabscale);
587 vfitab = _mm_cvttpd_epi32(rt);
589 vfeps = _mm_frcz_pd(rt);
591 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
593 twovfeps = _mm_add_pd(vfeps,vfeps);
594 vfitab = _mm_slli_epi32(vfitab,2);
596 /* CUBIC SPLINE TABLE ELECTROSTATICS */
597 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
598 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
599 GMX_MM_TRANSPOSE2_PD(Y,F);
600 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
601 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
602 GMX_MM_TRANSPOSE2_PD(G,H);
603 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
604 VV = _mm_macc_pd(vfeps,Fp,Y);
605 velec = _mm_mul_pd(qq21,VV);
606 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
607 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velecsum = _mm_add_pd(velecsum,velec);
614 /* Update vectorial force */
615 fix2 = _mm_macc_pd(dx21,fscal,fix2);
616 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
617 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
619 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
620 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
621 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
627 r22 = _mm_mul_pd(rsq22,rinv22);
629 /* Calculate table index by multiplying r with table scale and truncate to integer */
630 rt = _mm_mul_pd(r22,vftabscale);
631 vfitab = _mm_cvttpd_epi32(rt);
633 vfeps = _mm_frcz_pd(rt);
635 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
637 twovfeps = _mm_add_pd(vfeps,vfeps);
638 vfitab = _mm_slli_epi32(vfitab,2);
640 /* CUBIC SPLINE TABLE ELECTROSTATICS */
641 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
642 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
643 GMX_MM_TRANSPOSE2_PD(Y,F);
644 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
645 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
646 GMX_MM_TRANSPOSE2_PD(G,H);
647 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
648 VV = _mm_macc_pd(vfeps,Fp,Y);
649 velec = _mm_mul_pd(qq22,VV);
650 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
651 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
653 /* Update potential sum for this i atom from the interaction with this j atom. */
654 velecsum = _mm_add_pd(velecsum,velec);
658 /* Update vectorial force */
659 fix2 = _mm_macc_pd(dx22,fscal,fix2);
660 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
661 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
663 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
664 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
665 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
667 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
669 /* Inner loop uses 427 flops */
676 j_coord_offsetA = DIM*jnrA;
678 /* load j atom coordinates */
679 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
680 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
682 /* Calculate displacement vector */
683 dx00 = _mm_sub_pd(ix0,jx0);
684 dy00 = _mm_sub_pd(iy0,jy0);
685 dz00 = _mm_sub_pd(iz0,jz0);
686 dx01 = _mm_sub_pd(ix0,jx1);
687 dy01 = _mm_sub_pd(iy0,jy1);
688 dz01 = _mm_sub_pd(iz0,jz1);
689 dx02 = _mm_sub_pd(ix0,jx2);
690 dy02 = _mm_sub_pd(iy0,jy2);
691 dz02 = _mm_sub_pd(iz0,jz2);
692 dx10 = _mm_sub_pd(ix1,jx0);
693 dy10 = _mm_sub_pd(iy1,jy0);
694 dz10 = _mm_sub_pd(iz1,jz0);
695 dx11 = _mm_sub_pd(ix1,jx1);
696 dy11 = _mm_sub_pd(iy1,jy1);
697 dz11 = _mm_sub_pd(iz1,jz1);
698 dx12 = _mm_sub_pd(ix1,jx2);
699 dy12 = _mm_sub_pd(iy1,jy2);
700 dz12 = _mm_sub_pd(iz1,jz2);
701 dx20 = _mm_sub_pd(ix2,jx0);
702 dy20 = _mm_sub_pd(iy2,jy0);
703 dz20 = _mm_sub_pd(iz2,jz0);
704 dx21 = _mm_sub_pd(ix2,jx1);
705 dy21 = _mm_sub_pd(iy2,jy1);
706 dz21 = _mm_sub_pd(iz2,jz1);
707 dx22 = _mm_sub_pd(ix2,jx2);
708 dy22 = _mm_sub_pd(iy2,jy2);
709 dz22 = _mm_sub_pd(iz2,jz2);
711 /* Calculate squared distance and things based on it */
712 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
713 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
714 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
715 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
716 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
717 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
718 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
719 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
720 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
722 rinv00 = gmx_mm_invsqrt_pd(rsq00);
723 rinv01 = gmx_mm_invsqrt_pd(rsq01);
724 rinv02 = gmx_mm_invsqrt_pd(rsq02);
725 rinv10 = gmx_mm_invsqrt_pd(rsq10);
726 rinv11 = gmx_mm_invsqrt_pd(rsq11);
727 rinv12 = gmx_mm_invsqrt_pd(rsq12);
728 rinv20 = gmx_mm_invsqrt_pd(rsq20);
729 rinv21 = gmx_mm_invsqrt_pd(rsq21);
730 rinv22 = gmx_mm_invsqrt_pd(rsq22);
732 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
734 fjx0 = _mm_setzero_pd();
735 fjy0 = _mm_setzero_pd();
736 fjz0 = _mm_setzero_pd();
737 fjx1 = _mm_setzero_pd();
738 fjy1 = _mm_setzero_pd();
739 fjz1 = _mm_setzero_pd();
740 fjx2 = _mm_setzero_pd();
741 fjy2 = _mm_setzero_pd();
742 fjz2 = _mm_setzero_pd();
744 /**************************
745 * CALCULATE INTERACTIONS *
746 **************************/
748 r00 = _mm_mul_pd(rsq00,rinv00);
750 /* Calculate table index by multiplying r with table scale and truncate to integer */
751 rt = _mm_mul_pd(r00,vftabscale);
752 vfitab = _mm_cvttpd_epi32(rt);
754 vfeps = _mm_frcz_pd(rt);
756 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
758 twovfeps = _mm_add_pd(vfeps,vfeps);
759 vfitab = _mm_slli_epi32(vfitab,2);
761 /* CUBIC SPLINE TABLE ELECTROSTATICS */
762 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
763 F = _mm_setzero_pd();
764 GMX_MM_TRANSPOSE2_PD(Y,F);
765 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
766 H = _mm_setzero_pd();
767 GMX_MM_TRANSPOSE2_PD(G,H);
768 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
769 VV = _mm_macc_pd(vfeps,Fp,Y);
770 velec = _mm_mul_pd(qq00,VV);
771 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
772 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
774 /* LENNARD-JONES DISPERSION/REPULSION */
776 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
777 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
778 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
779 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
780 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
782 /* Update potential sum for this i atom from the interaction with this j atom. */
783 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
784 velecsum = _mm_add_pd(velecsum,velec);
785 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
786 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
788 fscal = _mm_add_pd(felec,fvdw);
790 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
792 /* Update vectorial force */
793 fix0 = _mm_macc_pd(dx00,fscal,fix0);
794 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
795 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
797 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
798 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
799 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
801 /**************************
802 * CALCULATE INTERACTIONS *
803 **************************/
805 r01 = _mm_mul_pd(rsq01,rinv01);
807 /* Calculate table index by multiplying r with table scale and truncate to integer */
808 rt = _mm_mul_pd(r01,vftabscale);
809 vfitab = _mm_cvttpd_epi32(rt);
811 vfeps = _mm_frcz_pd(rt);
813 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
815 twovfeps = _mm_add_pd(vfeps,vfeps);
816 vfitab = _mm_slli_epi32(vfitab,2);
818 /* CUBIC SPLINE TABLE ELECTROSTATICS */
819 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
820 F = _mm_setzero_pd();
821 GMX_MM_TRANSPOSE2_PD(Y,F);
822 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
823 H = _mm_setzero_pd();
824 GMX_MM_TRANSPOSE2_PD(G,H);
825 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
826 VV = _mm_macc_pd(vfeps,Fp,Y);
827 velec = _mm_mul_pd(qq01,VV);
828 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
829 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
831 /* Update potential sum for this i atom from the interaction with this j atom. */
832 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
833 velecsum = _mm_add_pd(velecsum,velec);
837 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
839 /* Update vectorial force */
840 fix0 = _mm_macc_pd(dx01,fscal,fix0);
841 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
842 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
844 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
845 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
846 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
848 /**************************
849 * CALCULATE INTERACTIONS *
850 **************************/
852 r02 = _mm_mul_pd(rsq02,rinv02);
854 /* Calculate table index by multiplying r with table scale and truncate to integer */
855 rt = _mm_mul_pd(r02,vftabscale);
856 vfitab = _mm_cvttpd_epi32(rt);
858 vfeps = _mm_frcz_pd(rt);
860 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
862 twovfeps = _mm_add_pd(vfeps,vfeps);
863 vfitab = _mm_slli_epi32(vfitab,2);
865 /* CUBIC SPLINE TABLE ELECTROSTATICS */
866 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
867 F = _mm_setzero_pd();
868 GMX_MM_TRANSPOSE2_PD(Y,F);
869 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
870 H = _mm_setzero_pd();
871 GMX_MM_TRANSPOSE2_PD(G,H);
872 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
873 VV = _mm_macc_pd(vfeps,Fp,Y);
874 velec = _mm_mul_pd(qq02,VV);
875 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
876 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
878 /* Update potential sum for this i atom from the interaction with this j atom. */
879 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
880 velecsum = _mm_add_pd(velecsum,velec);
884 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
886 /* Update vectorial force */
887 fix0 = _mm_macc_pd(dx02,fscal,fix0);
888 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
889 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
891 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
892 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
893 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 r10 = _mm_mul_pd(rsq10,rinv10);
901 /* Calculate table index by multiplying r with table scale and truncate to integer */
902 rt = _mm_mul_pd(r10,vftabscale);
903 vfitab = _mm_cvttpd_epi32(rt);
905 vfeps = _mm_frcz_pd(rt);
907 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
909 twovfeps = _mm_add_pd(vfeps,vfeps);
910 vfitab = _mm_slli_epi32(vfitab,2);
912 /* CUBIC SPLINE TABLE ELECTROSTATICS */
913 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
914 F = _mm_setzero_pd();
915 GMX_MM_TRANSPOSE2_PD(Y,F);
916 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
917 H = _mm_setzero_pd();
918 GMX_MM_TRANSPOSE2_PD(G,H);
919 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
920 VV = _mm_macc_pd(vfeps,Fp,Y);
921 velec = _mm_mul_pd(qq10,VV);
922 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
923 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
925 /* Update potential sum for this i atom from the interaction with this j atom. */
926 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
927 velecsum = _mm_add_pd(velecsum,velec);
931 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
933 /* Update vectorial force */
934 fix1 = _mm_macc_pd(dx10,fscal,fix1);
935 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
936 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
938 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
939 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
940 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
942 /**************************
943 * CALCULATE INTERACTIONS *
944 **************************/
946 r11 = _mm_mul_pd(rsq11,rinv11);
948 /* Calculate table index by multiplying r with table scale and truncate to integer */
949 rt = _mm_mul_pd(r11,vftabscale);
950 vfitab = _mm_cvttpd_epi32(rt);
952 vfeps = _mm_frcz_pd(rt);
954 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
956 twovfeps = _mm_add_pd(vfeps,vfeps);
957 vfitab = _mm_slli_epi32(vfitab,2);
959 /* CUBIC SPLINE TABLE ELECTROSTATICS */
960 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
961 F = _mm_setzero_pd();
962 GMX_MM_TRANSPOSE2_PD(Y,F);
963 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
964 H = _mm_setzero_pd();
965 GMX_MM_TRANSPOSE2_PD(G,H);
966 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
967 VV = _mm_macc_pd(vfeps,Fp,Y);
968 velec = _mm_mul_pd(qq11,VV);
969 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
970 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
972 /* Update potential sum for this i atom from the interaction with this j atom. */
973 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
974 velecsum = _mm_add_pd(velecsum,velec);
978 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
980 /* Update vectorial force */
981 fix1 = _mm_macc_pd(dx11,fscal,fix1);
982 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
983 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
985 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
986 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
987 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
989 /**************************
990 * CALCULATE INTERACTIONS *
991 **************************/
993 r12 = _mm_mul_pd(rsq12,rinv12);
995 /* Calculate table index by multiplying r with table scale and truncate to integer */
996 rt = _mm_mul_pd(r12,vftabscale);
997 vfitab = _mm_cvttpd_epi32(rt);
999 vfeps = _mm_frcz_pd(rt);
1001 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1003 twovfeps = _mm_add_pd(vfeps,vfeps);
1004 vfitab = _mm_slli_epi32(vfitab,2);
1006 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1007 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1008 F = _mm_setzero_pd();
1009 GMX_MM_TRANSPOSE2_PD(Y,F);
1010 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1011 H = _mm_setzero_pd();
1012 GMX_MM_TRANSPOSE2_PD(G,H);
1013 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1014 VV = _mm_macc_pd(vfeps,Fp,Y);
1015 velec = _mm_mul_pd(qq12,VV);
1016 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1017 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1019 /* Update potential sum for this i atom from the interaction with this j atom. */
1020 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1021 velecsum = _mm_add_pd(velecsum,velec);
1025 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1027 /* Update vectorial force */
1028 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1029 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1030 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1032 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1033 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1034 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1036 /**************************
1037 * CALCULATE INTERACTIONS *
1038 **************************/
1040 r20 = _mm_mul_pd(rsq20,rinv20);
1042 /* Calculate table index by multiplying r with table scale and truncate to integer */
1043 rt = _mm_mul_pd(r20,vftabscale);
1044 vfitab = _mm_cvttpd_epi32(rt);
1046 vfeps = _mm_frcz_pd(rt);
1048 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1050 twovfeps = _mm_add_pd(vfeps,vfeps);
1051 vfitab = _mm_slli_epi32(vfitab,2);
1053 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1054 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1055 F = _mm_setzero_pd();
1056 GMX_MM_TRANSPOSE2_PD(Y,F);
1057 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1058 H = _mm_setzero_pd();
1059 GMX_MM_TRANSPOSE2_PD(G,H);
1060 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1061 VV = _mm_macc_pd(vfeps,Fp,Y);
1062 velec = _mm_mul_pd(qq20,VV);
1063 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1064 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1066 /* Update potential sum for this i atom from the interaction with this j atom. */
1067 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1068 velecsum = _mm_add_pd(velecsum,velec);
1072 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1074 /* Update vectorial force */
1075 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1076 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1077 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1079 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1080 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1081 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1083 /**************************
1084 * CALCULATE INTERACTIONS *
1085 **************************/
1087 r21 = _mm_mul_pd(rsq21,rinv21);
1089 /* Calculate table index by multiplying r with table scale and truncate to integer */
1090 rt = _mm_mul_pd(r21,vftabscale);
1091 vfitab = _mm_cvttpd_epi32(rt);
1093 vfeps = _mm_frcz_pd(rt);
1095 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1097 twovfeps = _mm_add_pd(vfeps,vfeps);
1098 vfitab = _mm_slli_epi32(vfitab,2);
1100 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1101 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1102 F = _mm_setzero_pd();
1103 GMX_MM_TRANSPOSE2_PD(Y,F);
1104 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1105 H = _mm_setzero_pd();
1106 GMX_MM_TRANSPOSE2_PD(G,H);
1107 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1108 VV = _mm_macc_pd(vfeps,Fp,Y);
1109 velec = _mm_mul_pd(qq21,VV);
1110 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1111 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1113 /* Update potential sum for this i atom from the interaction with this j atom. */
1114 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1115 velecsum = _mm_add_pd(velecsum,velec);
1119 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1121 /* Update vectorial force */
1122 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1123 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1124 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1126 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1127 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1128 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1130 /**************************
1131 * CALCULATE INTERACTIONS *
1132 **************************/
1134 r22 = _mm_mul_pd(rsq22,rinv22);
1136 /* Calculate table index by multiplying r with table scale and truncate to integer */
1137 rt = _mm_mul_pd(r22,vftabscale);
1138 vfitab = _mm_cvttpd_epi32(rt);
1140 vfeps = _mm_frcz_pd(rt);
1142 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1144 twovfeps = _mm_add_pd(vfeps,vfeps);
1145 vfitab = _mm_slli_epi32(vfitab,2);
1147 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1149 F = _mm_setzero_pd();
1150 GMX_MM_TRANSPOSE2_PD(Y,F);
1151 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1152 H = _mm_setzero_pd();
1153 GMX_MM_TRANSPOSE2_PD(G,H);
1154 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1155 VV = _mm_macc_pd(vfeps,Fp,Y);
1156 velec = _mm_mul_pd(qq22,VV);
1157 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1158 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1160 /* Update potential sum for this i atom from the interaction with this j atom. */
1161 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1162 velecsum = _mm_add_pd(velecsum,velec);
1166 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1168 /* Update vectorial force */
1169 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1170 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1171 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1173 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1174 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1175 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1177 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1179 /* Inner loop uses 427 flops */
1182 /* End of innermost loop */
1184 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1185 f+i_coord_offset,fshift+i_shift_offset);
1188 /* Update potential energies */
1189 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1190 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1192 /* Increment number of inner iterations */
1193 inneriter += j_index_end - j_index_start;
1195 /* Outer loop uses 20 flops */
1198 /* Increment number of outer iterations */
1201 /* Update outer/inner flops */
1203 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*427);
1206 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_double
1207 * Electrostatics interaction: CubicSplineTable
1208 * VdW interaction: LennardJones
1209 * Geometry: Water3-Water3
1210 * Calculate force/pot: Force
1213 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_double
1214 (t_nblist * gmx_restrict nlist,
1215 rvec * gmx_restrict xx,
1216 rvec * gmx_restrict ff,
1217 t_forcerec * gmx_restrict fr,
1218 t_mdatoms * gmx_restrict mdatoms,
1219 nb_kernel_data_t * gmx_restrict kernel_data,
1220 t_nrnb * gmx_restrict nrnb)
1222 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1223 * just 0 for non-waters.
1224 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1225 * jnr indices corresponding to data put in the four positions in the SIMD register.
1227 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1228 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1230 int j_coord_offsetA,j_coord_offsetB;
1231 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1232 real rcutoff_scalar;
1233 real *shiftvec,*fshift,*x,*f;
1234 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1236 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1238 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1240 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1241 int vdwjidx0A,vdwjidx0B;
1242 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1243 int vdwjidx1A,vdwjidx1B;
1244 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1245 int vdwjidx2A,vdwjidx2B;
1246 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1247 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1248 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1249 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1250 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1251 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1252 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1253 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1254 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1255 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1256 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1259 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1262 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1263 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1265 __m128i ifour = _mm_set1_epi32(4);
1266 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1268 __m128d dummy_mask,cutoff_mask;
1269 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1270 __m128d one = _mm_set1_pd(1.0);
1271 __m128d two = _mm_set1_pd(2.0);
1277 jindex = nlist->jindex;
1279 shiftidx = nlist->shift;
1281 shiftvec = fr->shift_vec[0];
1282 fshift = fr->fshift[0];
1283 facel = _mm_set1_pd(fr->epsfac);
1284 charge = mdatoms->chargeA;
1285 nvdwtype = fr->ntype;
1286 vdwparam = fr->nbfp;
1287 vdwtype = mdatoms->typeA;
1289 vftab = kernel_data->table_elec->data;
1290 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1292 /* Setup water-specific parameters */
1293 inr = nlist->iinr[0];
1294 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1295 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1296 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1297 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1299 jq0 = _mm_set1_pd(charge[inr+0]);
1300 jq1 = _mm_set1_pd(charge[inr+1]);
1301 jq2 = _mm_set1_pd(charge[inr+2]);
1302 vdwjidx0A = 2*vdwtype[inr+0];
1303 qq00 = _mm_mul_pd(iq0,jq0);
1304 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1305 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1306 qq01 = _mm_mul_pd(iq0,jq1);
1307 qq02 = _mm_mul_pd(iq0,jq2);
1308 qq10 = _mm_mul_pd(iq1,jq0);
1309 qq11 = _mm_mul_pd(iq1,jq1);
1310 qq12 = _mm_mul_pd(iq1,jq2);
1311 qq20 = _mm_mul_pd(iq2,jq0);
1312 qq21 = _mm_mul_pd(iq2,jq1);
1313 qq22 = _mm_mul_pd(iq2,jq2);
1315 /* Avoid stupid compiler warnings */
1317 j_coord_offsetA = 0;
1318 j_coord_offsetB = 0;
1323 /* Start outer loop over neighborlists */
1324 for(iidx=0; iidx<nri; iidx++)
1326 /* Load shift vector for this list */
1327 i_shift_offset = DIM*shiftidx[iidx];
1329 /* Load limits for loop over neighbors */
1330 j_index_start = jindex[iidx];
1331 j_index_end = jindex[iidx+1];
1333 /* Get outer coordinate index */
1335 i_coord_offset = DIM*inr;
1337 /* Load i particle coords and add shift vector */
1338 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1339 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1341 fix0 = _mm_setzero_pd();
1342 fiy0 = _mm_setzero_pd();
1343 fiz0 = _mm_setzero_pd();
1344 fix1 = _mm_setzero_pd();
1345 fiy1 = _mm_setzero_pd();
1346 fiz1 = _mm_setzero_pd();
1347 fix2 = _mm_setzero_pd();
1348 fiy2 = _mm_setzero_pd();
1349 fiz2 = _mm_setzero_pd();
1351 /* Start inner kernel loop */
1352 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1355 /* Get j neighbor index, and coordinate index */
1357 jnrB = jjnr[jidx+1];
1358 j_coord_offsetA = DIM*jnrA;
1359 j_coord_offsetB = DIM*jnrB;
1361 /* load j atom coordinates */
1362 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1363 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1365 /* Calculate displacement vector */
1366 dx00 = _mm_sub_pd(ix0,jx0);
1367 dy00 = _mm_sub_pd(iy0,jy0);
1368 dz00 = _mm_sub_pd(iz0,jz0);
1369 dx01 = _mm_sub_pd(ix0,jx1);
1370 dy01 = _mm_sub_pd(iy0,jy1);
1371 dz01 = _mm_sub_pd(iz0,jz1);
1372 dx02 = _mm_sub_pd(ix0,jx2);
1373 dy02 = _mm_sub_pd(iy0,jy2);
1374 dz02 = _mm_sub_pd(iz0,jz2);
1375 dx10 = _mm_sub_pd(ix1,jx0);
1376 dy10 = _mm_sub_pd(iy1,jy0);
1377 dz10 = _mm_sub_pd(iz1,jz0);
1378 dx11 = _mm_sub_pd(ix1,jx1);
1379 dy11 = _mm_sub_pd(iy1,jy1);
1380 dz11 = _mm_sub_pd(iz1,jz1);
1381 dx12 = _mm_sub_pd(ix1,jx2);
1382 dy12 = _mm_sub_pd(iy1,jy2);
1383 dz12 = _mm_sub_pd(iz1,jz2);
1384 dx20 = _mm_sub_pd(ix2,jx0);
1385 dy20 = _mm_sub_pd(iy2,jy0);
1386 dz20 = _mm_sub_pd(iz2,jz0);
1387 dx21 = _mm_sub_pd(ix2,jx1);
1388 dy21 = _mm_sub_pd(iy2,jy1);
1389 dz21 = _mm_sub_pd(iz2,jz1);
1390 dx22 = _mm_sub_pd(ix2,jx2);
1391 dy22 = _mm_sub_pd(iy2,jy2);
1392 dz22 = _mm_sub_pd(iz2,jz2);
1394 /* Calculate squared distance and things based on it */
1395 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1396 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1397 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1398 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1399 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1400 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1401 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1402 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1403 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1405 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1406 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1407 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1408 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1409 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1410 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1411 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1412 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1413 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1415 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1417 fjx0 = _mm_setzero_pd();
1418 fjy0 = _mm_setzero_pd();
1419 fjz0 = _mm_setzero_pd();
1420 fjx1 = _mm_setzero_pd();
1421 fjy1 = _mm_setzero_pd();
1422 fjz1 = _mm_setzero_pd();
1423 fjx2 = _mm_setzero_pd();
1424 fjy2 = _mm_setzero_pd();
1425 fjz2 = _mm_setzero_pd();
1427 /**************************
1428 * CALCULATE INTERACTIONS *
1429 **************************/
1431 r00 = _mm_mul_pd(rsq00,rinv00);
1433 /* Calculate table index by multiplying r with table scale and truncate to integer */
1434 rt = _mm_mul_pd(r00,vftabscale);
1435 vfitab = _mm_cvttpd_epi32(rt);
1437 vfeps = _mm_frcz_pd(rt);
1439 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1441 twovfeps = _mm_add_pd(vfeps,vfeps);
1442 vfitab = _mm_slli_epi32(vfitab,2);
1444 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1445 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1446 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1447 GMX_MM_TRANSPOSE2_PD(Y,F);
1448 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1449 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1450 GMX_MM_TRANSPOSE2_PD(G,H);
1451 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1452 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1453 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1455 /* LENNARD-JONES DISPERSION/REPULSION */
1457 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1458 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1460 fscal = _mm_add_pd(felec,fvdw);
1462 /* Update vectorial force */
1463 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1464 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1465 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1467 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1468 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1469 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1471 /**************************
1472 * CALCULATE INTERACTIONS *
1473 **************************/
1475 r01 = _mm_mul_pd(rsq01,rinv01);
1477 /* Calculate table index by multiplying r with table scale and truncate to integer */
1478 rt = _mm_mul_pd(r01,vftabscale);
1479 vfitab = _mm_cvttpd_epi32(rt);
1481 vfeps = _mm_frcz_pd(rt);
1483 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1485 twovfeps = _mm_add_pd(vfeps,vfeps);
1486 vfitab = _mm_slli_epi32(vfitab,2);
1488 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1489 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1490 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1491 GMX_MM_TRANSPOSE2_PD(Y,F);
1492 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1493 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1494 GMX_MM_TRANSPOSE2_PD(G,H);
1495 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1496 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1497 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1501 /* Update vectorial force */
1502 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1503 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1504 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1506 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1507 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1508 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1510 /**************************
1511 * CALCULATE INTERACTIONS *
1512 **************************/
1514 r02 = _mm_mul_pd(rsq02,rinv02);
1516 /* Calculate table index by multiplying r with table scale and truncate to integer */
1517 rt = _mm_mul_pd(r02,vftabscale);
1518 vfitab = _mm_cvttpd_epi32(rt);
1520 vfeps = _mm_frcz_pd(rt);
1522 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1524 twovfeps = _mm_add_pd(vfeps,vfeps);
1525 vfitab = _mm_slli_epi32(vfitab,2);
1527 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1528 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1529 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1530 GMX_MM_TRANSPOSE2_PD(Y,F);
1531 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1532 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1533 GMX_MM_TRANSPOSE2_PD(G,H);
1534 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1535 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1536 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1540 /* Update vectorial force */
1541 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1542 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1543 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1545 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1546 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1547 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1549 /**************************
1550 * CALCULATE INTERACTIONS *
1551 **************************/
1553 r10 = _mm_mul_pd(rsq10,rinv10);
1555 /* Calculate table index by multiplying r with table scale and truncate to integer */
1556 rt = _mm_mul_pd(r10,vftabscale);
1557 vfitab = _mm_cvttpd_epi32(rt);
1559 vfeps = _mm_frcz_pd(rt);
1561 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1563 twovfeps = _mm_add_pd(vfeps,vfeps);
1564 vfitab = _mm_slli_epi32(vfitab,2);
1566 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1567 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1568 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1569 GMX_MM_TRANSPOSE2_PD(Y,F);
1570 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1571 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1572 GMX_MM_TRANSPOSE2_PD(G,H);
1573 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1574 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1575 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1579 /* Update vectorial force */
1580 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1581 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1582 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1584 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1585 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1586 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1588 /**************************
1589 * CALCULATE INTERACTIONS *
1590 **************************/
1592 r11 = _mm_mul_pd(rsq11,rinv11);
1594 /* Calculate table index by multiplying r with table scale and truncate to integer */
1595 rt = _mm_mul_pd(r11,vftabscale);
1596 vfitab = _mm_cvttpd_epi32(rt);
1598 vfeps = _mm_frcz_pd(rt);
1600 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1602 twovfeps = _mm_add_pd(vfeps,vfeps);
1603 vfitab = _mm_slli_epi32(vfitab,2);
1605 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1606 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1607 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1608 GMX_MM_TRANSPOSE2_PD(Y,F);
1609 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1610 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1611 GMX_MM_TRANSPOSE2_PD(G,H);
1612 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1613 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1614 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1618 /* Update vectorial force */
1619 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1620 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1621 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1623 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1624 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1625 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1627 /**************************
1628 * CALCULATE INTERACTIONS *
1629 **************************/
1631 r12 = _mm_mul_pd(rsq12,rinv12);
1633 /* Calculate table index by multiplying r with table scale and truncate to integer */
1634 rt = _mm_mul_pd(r12,vftabscale);
1635 vfitab = _mm_cvttpd_epi32(rt);
1637 vfeps = _mm_frcz_pd(rt);
1639 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1641 twovfeps = _mm_add_pd(vfeps,vfeps);
1642 vfitab = _mm_slli_epi32(vfitab,2);
1644 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1645 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1646 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1647 GMX_MM_TRANSPOSE2_PD(Y,F);
1648 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1649 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1650 GMX_MM_TRANSPOSE2_PD(G,H);
1651 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1652 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1653 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1657 /* Update vectorial force */
1658 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1659 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1660 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1662 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1663 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1664 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1666 /**************************
1667 * CALCULATE INTERACTIONS *
1668 **************************/
1670 r20 = _mm_mul_pd(rsq20,rinv20);
1672 /* Calculate table index by multiplying r with table scale and truncate to integer */
1673 rt = _mm_mul_pd(r20,vftabscale);
1674 vfitab = _mm_cvttpd_epi32(rt);
1676 vfeps = _mm_frcz_pd(rt);
1678 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1680 twovfeps = _mm_add_pd(vfeps,vfeps);
1681 vfitab = _mm_slli_epi32(vfitab,2);
1683 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1684 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1685 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1686 GMX_MM_TRANSPOSE2_PD(Y,F);
1687 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1688 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1689 GMX_MM_TRANSPOSE2_PD(G,H);
1690 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1691 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1692 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1696 /* Update vectorial force */
1697 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1698 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1699 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1701 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1702 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1703 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1705 /**************************
1706 * CALCULATE INTERACTIONS *
1707 **************************/
1709 r21 = _mm_mul_pd(rsq21,rinv21);
1711 /* Calculate table index by multiplying r with table scale and truncate to integer */
1712 rt = _mm_mul_pd(r21,vftabscale);
1713 vfitab = _mm_cvttpd_epi32(rt);
1715 vfeps = _mm_frcz_pd(rt);
1717 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1719 twovfeps = _mm_add_pd(vfeps,vfeps);
1720 vfitab = _mm_slli_epi32(vfitab,2);
1722 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1723 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1724 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1725 GMX_MM_TRANSPOSE2_PD(Y,F);
1726 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1727 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1728 GMX_MM_TRANSPOSE2_PD(G,H);
1729 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1730 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1731 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1735 /* Update vectorial force */
1736 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1737 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1738 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1740 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1741 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1742 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1744 /**************************
1745 * CALCULATE INTERACTIONS *
1746 **************************/
1748 r22 = _mm_mul_pd(rsq22,rinv22);
1750 /* Calculate table index by multiplying r with table scale and truncate to integer */
1751 rt = _mm_mul_pd(r22,vftabscale);
1752 vfitab = _mm_cvttpd_epi32(rt);
1754 vfeps = _mm_frcz_pd(rt);
1756 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1758 twovfeps = _mm_add_pd(vfeps,vfeps);
1759 vfitab = _mm_slli_epi32(vfitab,2);
1761 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1762 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1763 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1764 GMX_MM_TRANSPOSE2_PD(Y,F);
1765 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1766 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1767 GMX_MM_TRANSPOSE2_PD(G,H);
1768 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1769 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1770 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1774 /* Update vectorial force */
1775 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1776 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1777 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1779 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1780 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1781 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1783 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1785 /* Inner loop uses 386 flops */
1788 if(jidx<j_index_end)
1792 j_coord_offsetA = DIM*jnrA;
1794 /* load j atom coordinates */
1795 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1796 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1798 /* Calculate displacement vector */
1799 dx00 = _mm_sub_pd(ix0,jx0);
1800 dy00 = _mm_sub_pd(iy0,jy0);
1801 dz00 = _mm_sub_pd(iz0,jz0);
1802 dx01 = _mm_sub_pd(ix0,jx1);
1803 dy01 = _mm_sub_pd(iy0,jy1);
1804 dz01 = _mm_sub_pd(iz0,jz1);
1805 dx02 = _mm_sub_pd(ix0,jx2);
1806 dy02 = _mm_sub_pd(iy0,jy2);
1807 dz02 = _mm_sub_pd(iz0,jz2);
1808 dx10 = _mm_sub_pd(ix1,jx0);
1809 dy10 = _mm_sub_pd(iy1,jy0);
1810 dz10 = _mm_sub_pd(iz1,jz0);
1811 dx11 = _mm_sub_pd(ix1,jx1);
1812 dy11 = _mm_sub_pd(iy1,jy1);
1813 dz11 = _mm_sub_pd(iz1,jz1);
1814 dx12 = _mm_sub_pd(ix1,jx2);
1815 dy12 = _mm_sub_pd(iy1,jy2);
1816 dz12 = _mm_sub_pd(iz1,jz2);
1817 dx20 = _mm_sub_pd(ix2,jx0);
1818 dy20 = _mm_sub_pd(iy2,jy0);
1819 dz20 = _mm_sub_pd(iz2,jz0);
1820 dx21 = _mm_sub_pd(ix2,jx1);
1821 dy21 = _mm_sub_pd(iy2,jy1);
1822 dz21 = _mm_sub_pd(iz2,jz1);
1823 dx22 = _mm_sub_pd(ix2,jx2);
1824 dy22 = _mm_sub_pd(iy2,jy2);
1825 dz22 = _mm_sub_pd(iz2,jz2);
1827 /* Calculate squared distance and things based on it */
1828 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1829 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1830 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1831 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1832 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1833 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1834 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1835 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1836 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1838 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1839 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1840 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1841 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1842 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1843 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1844 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1845 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1846 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1848 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1850 fjx0 = _mm_setzero_pd();
1851 fjy0 = _mm_setzero_pd();
1852 fjz0 = _mm_setzero_pd();
1853 fjx1 = _mm_setzero_pd();
1854 fjy1 = _mm_setzero_pd();
1855 fjz1 = _mm_setzero_pd();
1856 fjx2 = _mm_setzero_pd();
1857 fjy2 = _mm_setzero_pd();
1858 fjz2 = _mm_setzero_pd();
1860 /**************************
1861 * CALCULATE INTERACTIONS *
1862 **************************/
1864 r00 = _mm_mul_pd(rsq00,rinv00);
1866 /* Calculate table index by multiplying r with table scale and truncate to integer */
1867 rt = _mm_mul_pd(r00,vftabscale);
1868 vfitab = _mm_cvttpd_epi32(rt);
1870 vfeps = _mm_frcz_pd(rt);
1872 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1874 twovfeps = _mm_add_pd(vfeps,vfeps);
1875 vfitab = _mm_slli_epi32(vfitab,2);
1877 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1878 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1879 F = _mm_setzero_pd();
1880 GMX_MM_TRANSPOSE2_PD(Y,F);
1881 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1882 H = _mm_setzero_pd();
1883 GMX_MM_TRANSPOSE2_PD(G,H);
1884 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1885 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1886 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1888 /* LENNARD-JONES DISPERSION/REPULSION */
1890 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1891 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1893 fscal = _mm_add_pd(felec,fvdw);
1895 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1897 /* Update vectorial force */
1898 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1899 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1900 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1902 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1903 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1904 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1906 /**************************
1907 * CALCULATE INTERACTIONS *
1908 **************************/
1910 r01 = _mm_mul_pd(rsq01,rinv01);
1912 /* Calculate table index by multiplying r with table scale and truncate to integer */
1913 rt = _mm_mul_pd(r01,vftabscale);
1914 vfitab = _mm_cvttpd_epi32(rt);
1916 vfeps = _mm_frcz_pd(rt);
1918 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1920 twovfeps = _mm_add_pd(vfeps,vfeps);
1921 vfitab = _mm_slli_epi32(vfitab,2);
1923 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1924 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1925 F = _mm_setzero_pd();
1926 GMX_MM_TRANSPOSE2_PD(Y,F);
1927 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1928 H = _mm_setzero_pd();
1929 GMX_MM_TRANSPOSE2_PD(G,H);
1930 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1931 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1932 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1936 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1938 /* Update vectorial force */
1939 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1940 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1941 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1943 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1944 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1945 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1947 /**************************
1948 * CALCULATE INTERACTIONS *
1949 **************************/
1951 r02 = _mm_mul_pd(rsq02,rinv02);
1953 /* Calculate table index by multiplying r with table scale and truncate to integer */
1954 rt = _mm_mul_pd(r02,vftabscale);
1955 vfitab = _mm_cvttpd_epi32(rt);
1957 vfeps = _mm_frcz_pd(rt);
1959 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1961 twovfeps = _mm_add_pd(vfeps,vfeps);
1962 vfitab = _mm_slli_epi32(vfitab,2);
1964 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1965 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1966 F = _mm_setzero_pd();
1967 GMX_MM_TRANSPOSE2_PD(Y,F);
1968 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1969 H = _mm_setzero_pd();
1970 GMX_MM_TRANSPOSE2_PD(G,H);
1971 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1972 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1973 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1977 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1979 /* Update vectorial force */
1980 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1981 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1982 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1984 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1985 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1986 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1988 /**************************
1989 * CALCULATE INTERACTIONS *
1990 **************************/
1992 r10 = _mm_mul_pd(rsq10,rinv10);
1994 /* Calculate table index by multiplying r with table scale and truncate to integer */
1995 rt = _mm_mul_pd(r10,vftabscale);
1996 vfitab = _mm_cvttpd_epi32(rt);
1998 vfeps = _mm_frcz_pd(rt);
2000 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2002 twovfeps = _mm_add_pd(vfeps,vfeps);
2003 vfitab = _mm_slli_epi32(vfitab,2);
2005 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2006 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2007 F = _mm_setzero_pd();
2008 GMX_MM_TRANSPOSE2_PD(Y,F);
2009 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2010 H = _mm_setzero_pd();
2011 GMX_MM_TRANSPOSE2_PD(G,H);
2012 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2013 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2014 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2018 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2020 /* Update vectorial force */
2021 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2022 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2023 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2025 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2026 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2027 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2029 /**************************
2030 * CALCULATE INTERACTIONS *
2031 **************************/
2033 r11 = _mm_mul_pd(rsq11,rinv11);
2035 /* Calculate table index by multiplying r with table scale and truncate to integer */
2036 rt = _mm_mul_pd(r11,vftabscale);
2037 vfitab = _mm_cvttpd_epi32(rt);
2039 vfeps = _mm_frcz_pd(rt);
2041 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2043 twovfeps = _mm_add_pd(vfeps,vfeps);
2044 vfitab = _mm_slli_epi32(vfitab,2);
2046 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2047 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2048 F = _mm_setzero_pd();
2049 GMX_MM_TRANSPOSE2_PD(Y,F);
2050 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2051 H = _mm_setzero_pd();
2052 GMX_MM_TRANSPOSE2_PD(G,H);
2053 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2054 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2055 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2059 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2061 /* Update vectorial force */
2062 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2063 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2064 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2066 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2067 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2068 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2070 /**************************
2071 * CALCULATE INTERACTIONS *
2072 **************************/
2074 r12 = _mm_mul_pd(rsq12,rinv12);
2076 /* Calculate table index by multiplying r with table scale and truncate to integer */
2077 rt = _mm_mul_pd(r12,vftabscale);
2078 vfitab = _mm_cvttpd_epi32(rt);
2080 vfeps = _mm_frcz_pd(rt);
2082 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2084 twovfeps = _mm_add_pd(vfeps,vfeps);
2085 vfitab = _mm_slli_epi32(vfitab,2);
2087 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2088 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2089 F = _mm_setzero_pd();
2090 GMX_MM_TRANSPOSE2_PD(Y,F);
2091 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2092 H = _mm_setzero_pd();
2093 GMX_MM_TRANSPOSE2_PD(G,H);
2094 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2095 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2096 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2100 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2102 /* Update vectorial force */
2103 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2104 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2105 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2107 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2108 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2109 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2115 r20 = _mm_mul_pd(rsq20,rinv20);
2117 /* Calculate table index by multiplying r with table scale and truncate to integer */
2118 rt = _mm_mul_pd(r20,vftabscale);
2119 vfitab = _mm_cvttpd_epi32(rt);
2121 vfeps = _mm_frcz_pd(rt);
2123 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2125 twovfeps = _mm_add_pd(vfeps,vfeps);
2126 vfitab = _mm_slli_epi32(vfitab,2);
2128 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2129 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2130 F = _mm_setzero_pd();
2131 GMX_MM_TRANSPOSE2_PD(Y,F);
2132 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2133 H = _mm_setzero_pd();
2134 GMX_MM_TRANSPOSE2_PD(G,H);
2135 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2136 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2137 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2141 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2143 /* Update vectorial force */
2144 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2145 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2146 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2148 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2149 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2150 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2152 /**************************
2153 * CALCULATE INTERACTIONS *
2154 **************************/
2156 r21 = _mm_mul_pd(rsq21,rinv21);
2158 /* Calculate table index by multiplying r with table scale and truncate to integer */
2159 rt = _mm_mul_pd(r21,vftabscale);
2160 vfitab = _mm_cvttpd_epi32(rt);
2162 vfeps = _mm_frcz_pd(rt);
2164 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2166 twovfeps = _mm_add_pd(vfeps,vfeps);
2167 vfitab = _mm_slli_epi32(vfitab,2);
2169 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2170 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2171 F = _mm_setzero_pd();
2172 GMX_MM_TRANSPOSE2_PD(Y,F);
2173 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2174 H = _mm_setzero_pd();
2175 GMX_MM_TRANSPOSE2_PD(G,H);
2176 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2177 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2178 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2182 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2184 /* Update vectorial force */
2185 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2186 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2187 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2189 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2190 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2191 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2193 /**************************
2194 * CALCULATE INTERACTIONS *
2195 **************************/
2197 r22 = _mm_mul_pd(rsq22,rinv22);
2199 /* Calculate table index by multiplying r with table scale and truncate to integer */
2200 rt = _mm_mul_pd(r22,vftabscale);
2201 vfitab = _mm_cvttpd_epi32(rt);
2203 vfeps = _mm_frcz_pd(rt);
2205 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2207 twovfeps = _mm_add_pd(vfeps,vfeps);
2208 vfitab = _mm_slli_epi32(vfitab,2);
2210 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2211 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2212 F = _mm_setzero_pd();
2213 GMX_MM_TRANSPOSE2_PD(Y,F);
2214 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2215 H = _mm_setzero_pd();
2216 GMX_MM_TRANSPOSE2_PD(G,H);
2217 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2218 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2219 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2223 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2225 /* Update vectorial force */
2226 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2227 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2228 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2230 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2231 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2232 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2234 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2236 /* Inner loop uses 386 flops */
2239 /* End of innermost loop */
2241 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2242 f+i_coord_offset,fshift+i_shift_offset);
2244 /* Increment number of inner iterations */
2245 inneriter += j_index_end - j_index_start;
2247 /* Outer loop uses 18 flops */
2250 /* Increment number of outer iterations */
2253 /* Update outer/inner flops */
2255 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*386);