2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the two positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
99 __m128d dummy_mask,cutoff_mask;
100 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one = _mm_set1_pd(1.0);
102 __m128d two = _mm_set1_pd(2.0);
108 jindex = nlist->jindex;
110 shiftidx = nlist->shift;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_pd(fr->epsfac);
115 charge = mdatoms->chargeA;
116 nvdwtype = fr->ntype;
118 vdwtype = mdatoms->typeA;
120 vftab = kernel_data->table_elec_vdw->data;
121 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
123 /* Setup water-specific parameters */
124 inr = nlist->iinr[0];
125 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
126 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
128 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
130 jq0 = _mm_set1_pd(charge[inr+0]);
131 jq1 = _mm_set1_pd(charge[inr+1]);
132 jq2 = _mm_set1_pd(charge[inr+2]);
133 vdwjidx0A = 2*vdwtype[inr+0];
134 qq00 = _mm_mul_pd(iq0,jq0);
135 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
137 qq01 = _mm_mul_pd(iq0,jq1);
138 qq02 = _mm_mul_pd(iq0,jq2);
139 qq10 = _mm_mul_pd(iq1,jq0);
140 qq11 = _mm_mul_pd(iq1,jq1);
141 qq12 = _mm_mul_pd(iq1,jq2);
142 qq20 = _mm_mul_pd(iq2,jq0);
143 qq21 = _mm_mul_pd(iq2,jq1);
144 qq22 = _mm_mul_pd(iq2,jq2);
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
172 fix0 = _mm_setzero_pd();
173 fiy0 = _mm_setzero_pd();
174 fiz0 = _mm_setzero_pd();
175 fix1 = _mm_setzero_pd();
176 fiy1 = _mm_setzero_pd();
177 fiz1 = _mm_setzero_pd();
178 fix2 = _mm_setzero_pd();
179 fiy2 = _mm_setzero_pd();
180 fiz2 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
184 vvdwsum = _mm_setzero_pd();
186 /* Start inner kernel loop */
187 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
190 /* Get j neighbor index, and coordinate index */
193 j_coord_offsetA = DIM*jnrA;
194 j_coord_offsetB = DIM*jnrB;
196 /* load j atom coordinates */
197 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
198 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
200 /* Calculate displacement vector */
201 dx00 = _mm_sub_pd(ix0,jx0);
202 dy00 = _mm_sub_pd(iy0,jy0);
203 dz00 = _mm_sub_pd(iz0,jz0);
204 dx01 = _mm_sub_pd(ix0,jx1);
205 dy01 = _mm_sub_pd(iy0,jy1);
206 dz01 = _mm_sub_pd(iz0,jz1);
207 dx02 = _mm_sub_pd(ix0,jx2);
208 dy02 = _mm_sub_pd(iy0,jy2);
209 dz02 = _mm_sub_pd(iz0,jz2);
210 dx10 = _mm_sub_pd(ix1,jx0);
211 dy10 = _mm_sub_pd(iy1,jy0);
212 dz10 = _mm_sub_pd(iz1,jz0);
213 dx11 = _mm_sub_pd(ix1,jx1);
214 dy11 = _mm_sub_pd(iy1,jy1);
215 dz11 = _mm_sub_pd(iz1,jz1);
216 dx12 = _mm_sub_pd(ix1,jx2);
217 dy12 = _mm_sub_pd(iy1,jy2);
218 dz12 = _mm_sub_pd(iz1,jz2);
219 dx20 = _mm_sub_pd(ix2,jx0);
220 dy20 = _mm_sub_pd(iy2,jy0);
221 dz20 = _mm_sub_pd(iz2,jz0);
222 dx21 = _mm_sub_pd(ix2,jx1);
223 dy21 = _mm_sub_pd(iy2,jy1);
224 dz21 = _mm_sub_pd(iz2,jz1);
225 dx22 = _mm_sub_pd(ix2,jx2);
226 dy22 = _mm_sub_pd(iy2,jy2);
227 dz22 = _mm_sub_pd(iz2,jz2);
229 /* Calculate squared distance and things based on it */
230 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
232 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
233 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
234 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
235 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
236 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
237 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
238 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
240 rinv00 = gmx_mm_invsqrt_pd(rsq00);
241 rinv01 = gmx_mm_invsqrt_pd(rsq01);
242 rinv02 = gmx_mm_invsqrt_pd(rsq02);
243 rinv10 = gmx_mm_invsqrt_pd(rsq10);
244 rinv11 = gmx_mm_invsqrt_pd(rsq11);
245 rinv12 = gmx_mm_invsqrt_pd(rsq12);
246 rinv20 = gmx_mm_invsqrt_pd(rsq20);
247 rinv21 = gmx_mm_invsqrt_pd(rsq21);
248 rinv22 = gmx_mm_invsqrt_pd(rsq22);
250 fjx0 = _mm_setzero_pd();
251 fjy0 = _mm_setzero_pd();
252 fjz0 = _mm_setzero_pd();
253 fjx1 = _mm_setzero_pd();
254 fjy1 = _mm_setzero_pd();
255 fjz1 = _mm_setzero_pd();
256 fjx2 = _mm_setzero_pd();
257 fjy2 = _mm_setzero_pd();
258 fjz2 = _mm_setzero_pd();
260 /**************************
261 * CALCULATE INTERACTIONS *
262 **************************/
264 r00 = _mm_mul_pd(rsq00,rinv00);
266 /* Calculate table index by multiplying r with table scale and truncate to integer */
267 rt = _mm_mul_pd(r00,vftabscale);
268 vfitab = _mm_cvttpd_epi32(rt);
270 vfeps = _mm_frcz_pd(rt);
272 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
274 twovfeps = _mm_add_pd(vfeps,vfeps);
275 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
277 /* CUBIC SPLINE TABLE ELECTROSTATICS */
278 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
279 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
280 GMX_MM_TRANSPOSE2_PD(Y,F);
281 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
282 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
283 GMX_MM_TRANSPOSE2_PD(G,H);
284 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
285 VV = _mm_macc_pd(vfeps,Fp,Y);
286 velec = _mm_mul_pd(qq00,VV);
287 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
288 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
290 /* CUBIC SPLINE TABLE DISPERSION */
291 vfitab = _mm_add_epi32(vfitab,ifour);
292 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
293 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
294 GMX_MM_TRANSPOSE2_PD(Y,F);
295 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
296 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
297 GMX_MM_TRANSPOSE2_PD(G,H);
298 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
299 VV = _mm_macc_pd(vfeps,Fp,Y);
300 vvdw6 = _mm_mul_pd(c6_00,VV);
301 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
302 fvdw6 = _mm_mul_pd(c6_00,FF);
304 /* CUBIC SPLINE TABLE REPULSION */
305 vfitab = _mm_add_epi32(vfitab,ifour);
306 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
307 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
308 GMX_MM_TRANSPOSE2_PD(Y,F);
309 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
310 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
311 GMX_MM_TRANSPOSE2_PD(G,H);
312 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
313 VV = _mm_macc_pd(vfeps,Fp,Y);
314 vvdw12 = _mm_mul_pd(c12_00,VV);
315 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
316 fvdw12 = _mm_mul_pd(c12_00,FF);
317 vvdw = _mm_add_pd(vvdw12,vvdw6);
318 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
320 /* Update potential sum for this i atom from the interaction with this j atom. */
321 velecsum = _mm_add_pd(velecsum,velec);
322 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
324 fscal = _mm_add_pd(felec,fvdw);
326 /* Update vectorial force */
327 fix0 = _mm_macc_pd(dx00,fscal,fix0);
328 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
329 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
331 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
332 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
333 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
335 /**************************
336 * CALCULATE INTERACTIONS *
337 **************************/
339 r01 = _mm_mul_pd(rsq01,rinv01);
341 /* Calculate table index by multiplying r with table scale and truncate to integer */
342 rt = _mm_mul_pd(r01,vftabscale);
343 vfitab = _mm_cvttpd_epi32(rt);
345 vfeps = _mm_frcz_pd(rt);
347 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
349 twovfeps = _mm_add_pd(vfeps,vfeps);
350 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
352 /* CUBIC SPLINE TABLE ELECTROSTATICS */
353 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
354 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
355 GMX_MM_TRANSPOSE2_PD(Y,F);
356 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
357 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
358 GMX_MM_TRANSPOSE2_PD(G,H);
359 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
360 VV = _mm_macc_pd(vfeps,Fp,Y);
361 velec = _mm_mul_pd(qq01,VV);
362 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
363 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
365 /* Update potential sum for this i atom from the interaction with this j atom. */
366 velecsum = _mm_add_pd(velecsum,velec);
370 /* Update vectorial force */
371 fix0 = _mm_macc_pd(dx01,fscal,fix0);
372 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
373 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
375 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
376 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
377 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 r02 = _mm_mul_pd(rsq02,rinv02);
385 /* Calculate table index by multiplying r with table scale and truncate to integer */
386 rt = _mm_mul_pd(r02,vftabscale);
387 vfitab = _mm_cvttpd_epi32(rt);
389 vfeps = _mm_frcz_pd(rt);
391 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
393 twovfeps = _mm_add_pd(vfeps,vfeps);
394 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
396 /* CUBIC SPLINE TABLE ELECTROSTATICS */
397 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
398 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
399 GMX_MM_TRANSPOSE2_PD(Y,F);
400 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
401 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
402 GMX_MM_TRANSPOSE2_PD(G,H);
403 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
404 VV = _mm_macc_pd(vfeps,Fp,Y);
405 velec = _mm_mul_pd(qq02,VV);
406 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
407 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velecsum = _mm_add_pd(velecsum,velec);
414 /* Update vectorial force */
415 fix0 = _mm_macc_pd(dx02,fscal,fix0);
416 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
417 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
419 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
420 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
421 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
423 /**************************
424 * CALCULATE INTERACTIONS *
425 **************************/
427 r10 = _mm_mul_pd(rsq10,rinv10);
429 /* Calculate table index by multiplying r with table scale and truncate to integer */
430 rt = _mm_mul_pd(r10,vftabscale);
431 vfitab = _mm_cvttpd_epi32(rt);
433 vfeps = _mm_frcz_pd(rt);
435 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
437 twovfeps = _mm_add_pd(vfeps,vfeps);
438 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
440 /* CUBIC SPLINE TABLE ELECTROSTATICS */
441 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
442 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
443 GMX_MM_TRANSPOSE2_PD(Y,F);
444 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
445 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
446 GMX_MM_TRANSPOSE2_PD(G,H);
447 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
448 VV = _mm_macc_pd(vfeps,Fp,Y);
449 velec = _mm_mul_pd(qq10,VV);
450 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
451 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
453 /* Update potential sum for this i atom from the interaction with this j atom. */
454 velecsum = _mm_add_pd(velecsum,velec);
458 /* Update vectorial force */
459 fix1 = _mm_macc_pd(dx10,fscal,fix1);
460 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
461 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
463 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
464 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
465 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
467 /**************************
468 * CALCULATE INTERACTIONS *
469 **************************/
471 r11 = _mm_mul_pd(rsq11,rinv11);
473 /* Calculate table index by multiplying r with table scale and truncate to integer */
474 rt = _mm_mul_pd(r11,vftabscale);
475 vfitab = _mm_cvttpd_epi32(rt);
477 vfeps = _mm_frcz_pd(rt);
479 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
481 twovfeps = _mm_add_pd(vfeps,vfeps);
482 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
484 /* CUBIC SPLINE TABLE ELECTROSTATICS */
485 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
486 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
487 GMX_MM_TRANSPOSE2_PD(Y,F);
488 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
489 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
490 GMX_MM_TRANSPOSE2_PD(G,H);
491 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
492 VV = _mm_macc_pd(vfeps,Fp,Y);
493 velec = _mm_mul_pd(qq11,VV);
494 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
495 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velecsum = _mm_add_pd(velecsum,velec);
502 /* Update vectorial force */
503 fix1 = _mm_macc_pd(dx11,fscal,fix1);
504 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
505 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
507 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
508 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
509 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 r12 = _mm_mul_pd(rsq12,rinv12);
517 /* Calculate table index by multiplying r with table scale and truncate to integer */
518 rt = _mm_mul_pd(r12,vftabscale);
519 vfitab = _mm_cvttpd_epi32(rt);
521 vfeps = _mm_frcz_pd(rt);
523 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
525 twovfeps = _mm_add_pd(vfeps,vfeps);
526 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
528 /* CUBIC SPLINE TABLE ELECTROSTATICS */
529 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
530 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
531 GMX_MM_TRANSPOSE2_PD(Y,F);
532 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
533 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
534 GMX_MM_TRANSPOSE2_PD(G,H);
535 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
536 VV = _mm_macc_pd(vfeps,Fp,Y);
537 velec = _mm_mul_pd(qq12,VV);
538 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
539 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
541 /* Update potential sum for this i atom from the interaction with this j atom. */
542 velecsum = _mm_add_pd(velecsum,velec);
546 /* Update vectorial force */
547 fix1 = _mm_macc_pd(dx12,fscal,fix1);
548 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
549 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
551 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
552 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
553 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 r20 = _mm_mul_pd(rsq20,rinv20);
561 /* Calculate table index by multiplying r with table scale and truncate to integer */
562 rt = _mm_mul_pd(r20,vftabscale);
563 vfitab = _mm_cvttpd_epi32(rt);
565 vfeps = _mm_frcz_pd(rt);
567 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
569 twovfeps = _mm_add_pd(vfeps,vfeps);
570 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
572 /* CUBIC SPLINE TABLE ELECTROSTATICS */
573 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
574 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
575 GMX_MM_TRANSPOSE2_PD(Y,F);
576 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
577 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
578 GMX_MM_TRANSPOSE2_PD(G,H);
579 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
580 VV = _mm_macc_pd(vfeps,Fp,Y);
581 velec = _mm_mul_pd(qq20,VV);
582 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
583 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velecsum = _mm_add_pd(velecsum,velec);
590 /* Update vectorial force */
591 fix2 = _mm_macc_pd(dx20,fscal,fix2);
592 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
593 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
595 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
596 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
597 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
599 /**************************
600 * CALCULATE INTERACTIONS *
601 **************************/
603 r21 = _mm_mul_pd(rsq21,rinv21);
605 /* Calculate table index by multiplying r with table scale and truncate to integer */
606 rt = _mm_mul_pd(r21,vftabscale);
607 vfitab = _mm_cvttpd_epi32(rt);
609 vfeps = _mm_frcz_pd(rt);
611 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
613 twovfeps = _mm_add_pd(vfeps,vfeps);
614 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
616 /* CUBIC SPLINE TABLE ELECTROSTATICS */
617 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
618 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
619 GMX_MM_TRANSPOSE2_PD(Y,F);
620 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
621 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
622 GMX_MM_TRANSPOSE2_PD(G,H);
623 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
624 VV = _mm_macc_pd(vfeps,Fp,Y);
625 velec = _mm_mul_pd(qq21,VV);
626 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
627 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
629 /* Update potential sum for this i atom from the interaction with this j atom. */
630 velecsum = _mm_add_pd(velecsum,velec);
634 /* Update vectorial force */
635 fix2 = _mm_macc_pd(dx21,fscal,fix2);
636 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
637 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
639 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
640 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
641 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
643 /**************************
644 * CALCULATE INTERACTIONS *
645 **************************/
647 r22 = _mm_mul_pd(rsq22,rinv22);
649 /* Calculate table index by multiplying r with table scale and truncate to integer */
650 rt = _mm_mul_pd(r22,vftabscale);
651 vfitab = _mm_cvttpd_epi32(rt);
653 vfeps = _mm_frcz_pd(rt);
655 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
657 twovfeps = _mm_add_pd(vfeps,vfeps);
658 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
660 /* CUBIC SPLINE TABLE ELECTROSTATICS */
661 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
662 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
663 GMX_MM_TRANSPOSE2_PD(Y,F);
664 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
665 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
666 GMX_MM_TRANSPOSE2_PD(G,H);
667 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
668 VV = _mm_macc_pd(vfeps,Fp,Y);
669 velec = _mm_mul_pd(qq22,VV);
670 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
671 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
673 /* Update potential sum for this i atom from the interaction with this j atom. */
674 velecsum = _mm_add_pd(velecsum,velec);
678 /* Update vectorial force */
679 fix2 = _mm_macc_pd(dx22,fscal,fix2);
680 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
681 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
683 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
684 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
685 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
687 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
689 /* Inner loop uses 444 flops */
696 j_coord_offsetA = DIM*jnrA;
698 /* load j atom coordinates */
699 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
700 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
702 /* Calculate displacement vector */
703 dx00 = _mm_sub_pd(ix0,jx0);
704 dy00 = _mm_sub_pd(iy0,jy0);
705 dz00 = _mm_sub_pd(iz0,jz0);
706 dx01 = _mm_sub_pd(ix0,jx1);
707 dy01 = _mm_sub_pd(iy0,jy1);
708 dz01 = _mm_sub_pd(iz0,jz1);
709 dx02 = _mm_sub_pd(ix0,jx2);
710 dy02 = _mm_sub_pd(iy0,jy2);
711 dz02 = _mm_sub_pd(iz0,jz2);
712 dx10 = _mm_sub_pd(ix1,jx0);
713 dy10 = _mm_sub_pd(iy1,jy0);
714 dz10 = _mm_sub_pd(iz1,jz0);
715 dx11 = _mm_sub_pd(ix1,jx1);
716 dy11 = _mm_sub_pd(iy1,jy1);
717 dz11 = _mm_sub_pd(iz1,jz1);
718 dx12 = _mm_sub_pd(ix1,jx2);
719 dy12 = _mm_sub_pd(iy1,jy2);
720 dz12 = _mm_sub_pd(iz1,jz2);
721 dx20 = _mm_sub_pd(ix2,jx0);
722 dy20 = _mm_sub_pd(iy2,jy0);
723 dz20 = _mm_sub_pd(iz2,jz0);
724 dx21 = _mm_sub_pd(ix2,jx1);
725 dy21 = _mm_sub_pd(iy2,jy1);
726 dz21 = _mm_sub_pd(iz2,jz1);
727 dx22 = _mm_sub_pd(ix2,jx2);
728 dy22 = _mm_sub_pd(iy2,jy2);
729 dz22 = _mm_sub_pd(iz2,jz2);
731 /* Calculate squared distance and things based on it */
732 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
733 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
734 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
735 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
736 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
737 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
738 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
739 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
740 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
742 rinv00 = gmx_mm_invsqrt_pd(rsq00);
743 rinv01 = gmx_mm_invsqrt_pd(rsq01);
744 rinv02 = gmx_mm_invsqrt_pd(rsq02);
745 rinv10 = gmx_mm_invsqrt_pd(rsq10);
746 rinv11 = gmx_mm_invsqrt_pd(rsq11);
747 rinv12 = gmx_mm_invsqrt_pd(rsq12);
748 rinv20 = gmx_mm_invsqrt_pd(rsq20);
749 rinv21 = gmx_mm_invsqrt_pd(rsq21);
750 rinv22 = gmx_mm_invsqrt_pd(rsq22);
752 fjx0 = _mm_setzero_pd();
753 fjy0 = _mm_setzero_pd();
754 fjz0 = _mm_setzero_pd();
755 fjx1 = _mm_setzero_pd();
756 fjy1 = _mm_setzero_pd();
757 fjz1 = _mm_setzero_pd();
758 fjx2 = _mm_setzero_pd();
759 fjy2 = _mm_setzero_pd();
760 fjz2 = _mm_setzero_pd();
762 /**************************
763 * CALCULATE INTERACTIONS *
764 **************************/
766 r00 = _mm_mul_pd(rsq00,rinv00);
768 /* Calculate table index by multiplying r with table scale and truncate to integer */
769 rt = _mm_mul_pd(r00,vftabscale);
770 vfitab = _mm_cvttpd_epi32(rt);
772 vfeps = _mm_frcz_pd(rt);
774 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
776 twovfeps = _mm_add_pd(vfeps,vfeps);
777 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
779 /* CUBIC SPLINE TABLE ELECTROSTATICS */
780 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
781 F = _mm_setzero_pd();
782 GMX_MM_TRANSPOSE2_PD(Y,F);
783 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
784 H = _mm_setzero_pd();
785 GMX_MM_TRANSPOSE2_PD(G,H);
786 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
787 VV = _mm_macc_pd(vfeps,Fp,Y);
788 velec = _mm_mul_pd(qq00,VV);
789 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
790 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
792 /* CUBIC SPLINE TABLE DISPERSION */
793 vfitab = _mm_add_epi32(vfitab,ifour);
794 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
795 F = _mm_setzero_pd();
796 GMX_MM_TRANSPOSE2_PD(Y,F);
797 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
798 H = _mm_setzero_pd();
799 GMX_MM_TRANSPOSE2_PD(G,H);
800 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
801 VV = _mm_macc_pd(vfeps,Fp,Y);
802 vvdw6 = _mm_mul_pd(c6_00,VV);
803 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
804 fvdw6 = _mm_mul_pd(c6_00,FF);
806 /* CUBIC SPLINE TABLE REPULSION */
807 vfitab = _mm_add_epi32(vfitab,ifour);
808 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
809 F = _mm_setzero_pd();
810 GMX_MM_TRANSPOSE2_PD(Y,F);
811 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
812 H = _mm_setzero_pd();
813 GMX_MM_TRANSPOSE2_PD(G,H);
814 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
815 VV = _mm_macc_pd(vfeps,Fp,Y);
816 vvdw12 = _mm_mul_pd(c12_00,VV);
817 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
818 fvdw12 = _mm_mul_pd(c12_00,FF);
819 vvdw = _mm_add_pd(vvdw12,vvdw6);
820 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
822 /* Update potential sum for this i atom from the interaction with this j atom. */
823 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
824 velecsum = _mm_add_pd(velecsum,velec);
825 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
826 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
828 fscal = _mm_add_pd(felec,fvdw);
830 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
832 /* Update vectorial force */
833 fix0 = _mm_macc_pd(dx00,fscal,fix0);
834 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
835 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
837 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
838 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
839 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
841 /**************************
842 * CALCULATE INTERACTIONS *
843 **************************/
845 r01 = _mm_mul_pd(rsq01,rinv01);
847 /* Calculate table index by multiplying r with table scale and truncate to integer */
848 rt = _mm_mul_pd(r01,vftabscale);
849 vfitab = _mm_cvttpd_epi32(rt);
851 vfeps = _mm_frcz_pd(rt);
853 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
855 twovfeps = _mm_add_pd(vfeps,vfeps);
856 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
858 /* CUBIC SPLINE TABLE ELECTROSTATICS */
859 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
860 F = _mm_setzero_pd();
861 GMX_MM_TRANSPOSE2_PD(Y,F);
862 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
863 H = _mm_setzero_pd();
864 GMX_MM_TRANSPOSE2_PD(G,H);
865 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
866 VV = _mm_macc_pd(vfeps,Fp,Y);
867 velec = _mm_mul_pd(qq01,VV);
868 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
869 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
871 /* Update potential sum for this i atom from the interaction with this j atom. */
872 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
873 velecsum = _mm_add_pd(velecsum,velec);
877 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
879 /* Update vectorial force */
880 fix0 = _mm_macc_pd(dx01,fscal,fix0);
881 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
882 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
884 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
885 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
886 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
888 /**************************
889 * CALCULATE INTERACTIONS *
890 **************************/
892 r02 = _mm_mul_pd(rsq02,rinv02);
894 /* Calculate table index by multiplying r with table scale and truncate to integer */
895 rt = _mm_mul_pd(r02,vftabscale);
896 vfitab = _mm_cvttpd_epi32(rt);
898 vfeps = _mm_frcz_pd(rt);
900 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
902 twovfeps = _mm_add_pd(vfeps,vfeps);
903 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
905 /* CUBIC SPLINE TABLE ELECTROSTATICS */
906 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
907 F = _mm_setzero_pd();
908 GMX_MM_TRANSPOSE2_PD(Y,F);
909 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
910 H = _mm_setzero_pd();
911 GMX_MM_TRANSPOSE2_PD(G,H);
912 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
913 VV = _mm_macc_pd(vfeps,Fp,Y);
914 velec = _mm_mul_pd(qq02,VV);
915 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
916 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
918 /* Update potential sum for this i atom from the interaction with this j atom. */
919 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
920 velecsum = _mm_add_pd(velecsum,velec);
924 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
926 /* Update vectorial force */
927 fix0 = _mm_macc_pd(dx02,fscal,fix0);
928 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
929 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
931 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
932 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
933 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 r10 = _mm_mul_pd(rsq10,rinv10);
941 /* Calculate table index by multiplying r with table scale and truncate to integer */
942 rt = _mm_mul_pd(r10,vftabscale);
943 vfitab = _mm_cvttpd_epi32(rt);
945 vfeps = _mm_frcz_pd(rt);
947 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
949 twovfeps = _mm_add_pd(vfeps,vfeps);
950 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
952 /* CUBIC SPLINE TABLE ELECTROSTATICS */
953 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
954 F = _mm_setzero_pd();
955 GMX_MM_TRANSPOSE2_PD(Y,F);
956 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
957 H = _mm_setzero_pd();
958 GMX_MM_TRANSPOSE2_PD(G,H);
959 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
960 VV = _mm_macc_pd(vfeps,Fp,Y);
961 velec = _mm_mul_pd(qq10,VV);
962 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
963 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
965 /* Update potential sum for this i atom from the interaction with this j atom. */
966 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
967 velecsum = _mm_add_pd(velecsum,velec);
971 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
973 /* Update vectorial force */
974 fix1 = _mm_macc_pd(dx10,fscal,fix1);
975 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
976 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
978 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
979 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
980 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
982 /**************************
983 * CALCULATE INTERACTIONS *
984 **************************/
986 r11 = _mm_mul_pd(rsq11,rinv11);
988 /* Calculate table index by multiplying r with table scale and truncate to integer */
989 rt = _mm_mul_pd(r11,vftabscale);
990 vfitab = _mm_cvttpd_epi32(rt);
992 vfeps = _mm_frcz_pd(rt);
994 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
996 twovfeps = _mm_add_pd(vfeps,vfeps);
997 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
999 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1000 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1001 F = _mm_setzero_pd();
1002 GMX_MM_TRANSPOSE2_PD(Y,F);
1003 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1004 H = _mm_setzero_pd();
1005 GMX_MM_TRANSPOSE2_PD(G,H);
1006 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1007 VV = _mm_macc_pd(vfeps,Fp,Y);
1008 velec = _mm_mul_pd(qq11,VV);
1009 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1010 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1012 /* Update potential sum for this i atom from the interaction with this j atom. */
1013 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1014 velecsum = _mm_add_pd(velecsum,velec);
1018 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1020 /* Update vectorial force */
1021 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1022 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1023 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1025 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1026 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1027 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1029 /**************************
1030 * CALCULATE INTERACTIONS *
1031 **************************/
1033 r12 = _mm_mul_pd(rsq12,rinv12);
1035 /* Calculate table index by multiplying r with table scale and truncate to integer */
1036 rt = _mm_mul_pd(r12,vftabscale);
1037 vfitab = _mm_cvttpd_epi32(rt);
1039 vfeps = _mm_frcz_pd(rt);
1041 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1043 twovfeps = _mm_add_pd(vfeps,vfeps);
1044 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1046 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1047 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1048 F = _mm_setzero_pd();
1049 GMX_MM_TRANSPOSE2_PD(Y,F);
1050 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1051 H = _mm_setzero_pd();
1052 GMX_MM_TRANSPOSE2_PD(G,H);
1053 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1054 VV = _mm_macc_pd(vfeps,Fp,Y);
1055 velec = _mm_mul_pd(qq12,VV);
1056 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1057 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1059 /* Update potential sum for this i atom from the interaction with this j atom. */
1060 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1061 velecsum = _mm_add_pd(velecsum,velec);
1065 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1067 /* Update vectorial force */
1068 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1069 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1070 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1072 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1073 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1074 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1076 /**************************
1077 * CALCULATE INTERACTIONS *
1078 **************************/
1080 r20 = _mm_mul_pd(rsq20,rinv20);
1082 /* Calculate table index by multiplying r with table scale and truncate to integer */
1083 rt = _mm_mul_pd(r20,vftabscale);
1084 vfitab = _mm_cvttpd_epi32(rt);
1086 vfeps = _mm_frcz_pd(rt);
1088 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1090 twovfeps = _mm_add_pd(vfeps,vfeps);
1091 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1093 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1094 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1095 F = _mm_setzero_pd();
1096 GMX_MM_TRANSPOSE2_PD(Y,F);
1097 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1098 H = _mm_setzero_pd();
1099 GMX_MM_TRANSPOSE2_PD(G,H);
1100 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1101 VV = _mm_macc_pd(vfeps,Fp,Y);
1102 velec = _mm_mul_pd(qq20,VV);
1103 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1104 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1106 /* Update potential sum for this i atom from the interaction with this j atom. */
1107 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1108 velecsum = _mm_add_pd(velecsum,velec);
1112 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1114 /* Update vectorial force */
1115 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1116 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1117 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1119 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1120 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1121 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1123 /**************************
1124 * CALCULATE INTERACTIONS *
1125 **************************/
1127 r21 = _mm_mul_pd(rsq21,rinv21);
1129 /* Calculate table index by multiplying r with table scale and truncate to integer */
1130 rt = _mm_mul_pd(r21,vftabscale);
1131 vfitab = _mm_cvttpd_epi32(rt);
1133 vfeps = _mm_frcz_pd(rt);
1135 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1137 twovfeps = _mm_add_pd(vfeps,vfeps);
1138 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1140 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1141 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1142 F = _mm_setzero_pd();
1143 GMX_MM_TRANSPOSE2_PD(Y,F);
1144 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1145 H = _mm_setzero_pd();
1146 GMX_MM_TRANSPOSE2_PD(G,H);
1147 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1148 VV = _mm_macc_pd(vfeps,Fp,Y);
1149 velec = _mm_mul_pd(qq21,VV);
1150 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1151 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1153 /* Update potential sum for this i atom from the interaction with this j atom. */
1154 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1155 velecsum = _mm_add_pd(velecsum,velec);
1159 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1161 /* Update vectorial force */
1162 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1163 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1164 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1166 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1167 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1168 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1170 /**************************
1171 * CALCULATE INTERACTIONS *
1172 **************************/
1174 r22 = _mm_mul_pd(rsq22,rinv22);
1176 /* Calculate table index by multiplying r with table scale and truncate to integer */
1177 rt = _mm_mul_pd(r22,vftabscale);
1178 vfitab = _mm_cvttpd_epi32(rt);
1180 vfeps = _mm_frcz_pd(rt);
1182 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1184 twovfeps = _mm_add_pd(vfeps,vfeps);
1185 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1187 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1188 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1189 F = _mm_setzero_pd();
1190 GMX_MM_TRANSPOSE2_PD(Y,F);
1191 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1192 H = _mm_setzero_pd();
1193 GMX_MM_TRANSPOSE2_PD(G,H);
1194 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1195 VV = _mm_macc_pd(vfeps,Fp,Y);
1196 velec = _mm_mul_pd(qq22,VV);
1197 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1198 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1200 /* Update potential sum for this i atom from the interaction with this j atom. */
1201 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1202 velecsum = _mm_add_pd(velecsum,velec);
1206 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1208 /* Update vectorial force */
1209 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1210 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1211 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1213 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1214 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1215 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1217 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1219 /* Inner loop uses 444 flops */
1222 /* End of innermost loop */
1224 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1225 f+i_coord_offset,fshift+i_shift_offset);
1228 /* Update potential energies */
1229 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1230 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1232 /* Increment number of inner iterations */
1233 inneriter += j_index_end - j_index_start;
1235 /* Outer loop uses 20 flops */
1238 /* Increment number of outer iterations */
1241 /* Update outer/inner flops */
1243 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*444);
1246 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1247 * Electrostatics interaction: CubicSplineTable
1248 * VdW interaction: CubicSplineTable
1249 * Geometry: Water3-Water3
1250 * Calculate force/pot: Force
1253 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1254 (t_nblist * gmx_restrict nlist,
1255 rvec * gmx_restrict xx,
1256 rvec * gmx_restrict ff,
1257 t_forcerec * gmx_restrict fr,
1258 t_mdatoms * gmx_restrict mdatoms,
1259 nb_kernel_data_t * gmx_restrict kernel_data,
1260 t_nrnb * gmx_restrict nrnb)
1262 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1263 * just 0 for non-waters.
1264 * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. for the two different
1265 * jnr indices corresponding to data put in the two positions in the SIMD register.
1267 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1268 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1270 int j_coord_offsetA,j_coord_offsetB;
1271 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1272 real rcutoff_scalar;
1273 real *shiftvec,*fshift,*x,*f;
1274 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1276 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1278 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1280 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1281 int vdwjidx0A,vdwjidx0B;
1282 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1283 int vdwjidx1A,vdwjidx1B;
1284 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1285 int vdwjidx2A,vdwjidx2B;
1286 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1287 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1288 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1289 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1290 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1291 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1292 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1293 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1294 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1295 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1296 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1299 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1302 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1303 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1305 __m128i ifour = _mm_set1_epi32(4);
1306 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1308 __m128d dummy_mask,cutoff_mask;
1309 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1310 __m128d one = _mm_set1_pd(1.0);
1311 __m128d two = _mm_set1_pd(2.0);
1317 jindex = nlist->jindex;
1319 shiftidx = nlist->shift;
1321 shiftvec = fr->shift_vec[0];
1322 fshift = fr->fshift[0];
1323 facel = _mm_set1_pd(fr->epsfac);
1324 charge = mdatoms->chargeA;
1325 nvdwtype = fr->ntype;
1326 vdwparam = fr->nbfp;
1327 vdwtype = mdatoms->typeA;
1329 vftab = kernel_data->table_elec_vdw->data;
1330 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1332 /* Setup water-specific parameters */
1333 inr = nlist->iinr[0];
1334 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1335 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1336 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1337 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1339 jq0 = _mm_set1_pd(charge[inr+0]);
1340 jq1 = _mm_set1_pd(charge[inr+1]);
1341 jq2 = _mm_set1_pd(charge[inr+2]);
1342 vdwjidx0A = 2*vdwtype[inr+0];
1343 qq00 = _mm_mul_pd(iq0,jq0);
1344 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1345 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1346 qq01 = _mm_mul_pd(iq0,jq1);
1347 qq02 = _mm_mul_pd(iq0,jq2);
1348 qq10 = _mm_mul_pd(iq1,jq0);
1349 qq11 = _mm_mul_pd(iq1,jq1);
1350 qq12 = _mm_mul_pd(iq1,jq2);
1351 qq20 = _mm_mul_pd(iq2,jq0);
1352 qq21 = _mm_mul_pd(iq2,jq1);
1353 qq22 = _mm_mul_pd(iq2,jq2);
1355 /* Zero-initialize the j offsets to avoid spurious compiler warnings */
1357 j_coord_offsetA = 0;
1358 j_coord_offsetB = 0;
1363 /* Start outer loop over neighborlists */
1364 for(iidx=0; iidx<nri; iidx++)
1366 /* Load shift vector for this list */
1367 i_shift_offset = DIM*shiftidx[iidx];
1369 /* Load limits for loop over neighbors */
1370 j_index_start = jindex[iidx];
1371 j_index_end = jindex[iidx+1];
1373 /* Get outer coordinate index */
1375 i_coord_offset = DIM*inr;
1377 /* Load i particle coords and add shift vector */
1378 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1379 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1381 fix0 = _mm_setzero_pd();
1382 fiy0 = _mm_setzero_pd();
1383 fiz0 = _mm_setzero_pd();
1384 fix1 = _mm_setzero_pd();
1385 fiy1 = _mm_setzero_pd();
1386 fiz1 = _mm_setzero_pd();
1387 fix2 = _mm_setzero_pd();
1388 fiy2 = _mm_setzero_pd();
1389 fiz2 = _mm_setzero_pd();
1391 /* Start inner kernel loop */
1392 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1395 /* Get j neighbor index, and coordinate index */
1397 jnrB = jjnr[jidx+1];
1398 j_coord_offsetA = DIM*jnrA;
1399 j_coord_offsetB = DIM*jnrB;
1401 /* load j atom coordinates */
1402 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1403 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1405 /* Calculate displacement vector */
1406 dx00 = _mm_sub_pd(ix0,jx0);
1407 dy00 = _mm_sub_pd(iy0,jy0);
1408 dz00 = _mm_sub_pd(iz0,jz0);
1409 dx01 = _mm_sub_pd(ix0,jx1);
1410 dy01 = _mm_sub_pd(iy0,jy1);
1411 dz01 = _mm_sub_pd(iz0,jz1);
1412 dx02 = _mm_sub_pd(ix0,jx2);
1413 dy02 = _mm_sub_pd(iy0,jy2);
1414 dz02 = _mm_sub_pd(iz0,jz2);
1415 dx10 = _mm_sub_pd(ix1,jx0);
1416 dy10 = _mm_sub_pd(iy1,jy0);
1417 dz10 = _mm_sub_pd(iz1,jz0);
1418 dx11 = _mm_sub_pd(ix1,jx1);
1419 dy11 = _mm_sub_pd(iy1,jy1);
1420 dz11 = _mm_sub_pd(iz1,jz1);
1421 dx12 = _mm_sub_pd(ix1,jx2);
1422 dy12 = _mm_sub_pd(iy1,jy2);
1423 dz12 = _mm_sub_pd(iz1,jz2);
1424 dx20 = _mm_sub_pd(ix2,jx0);
1425 dy20 = _mm_sub_pd(iy2,jy0);
1426 dz20 = _mm_sub_pd(iz2,jz0);
1427 dx21 = _mm_sub_pd(ix2,jx1);
1428 dy21 = _mm_sub_pd(iy2,jy1);
1429 dz21 = _mm_sub_pd(iz2,jz1);
1430 dx22 = _mm_sub_pd(ix2,jx2);
1431 dy22 = _mm_sub_pd(iy2,jy2);
1432 dz22 = _mm_sub_pd(iz2,jz2);
1434 /* Calculate squared distance and things based on it */
1435 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1436 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1437 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1438 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1439 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1440 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1441 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1442 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1443 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1445 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1446 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1447 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1448 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1449 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1450 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1451 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1452 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1453 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1455 fjx0 = _mm_setzero_pd();
1456 fjy0 = _mm_setzero_pd();
1457 fjz0 = _mm_setzero_pd();
1458 fjx1 = _mm_setzero_pd();
1459 fjy1 = _mm_setzero_pd();
1460 fjz1 = _mm_setzero_pd();
1461 fjx2 = _mm_setzero_pd();
1462 fjy2 = _mm_setzero_pd();
1463 fjz2 = _mm_setzero_pd();
1465 /**************************
1466 * CALCULATE INTERACTIONS *
1467 **************************/
1469 r00 = _mm_mul_pd(rsq00,rinv00);
1471 /* Calculate table index by multiplying r with table scale and truncate to integer */
1472 rt = _mm_mul_pd(r00,vftabscale);
1473 vfitab = _mm_cvttpd_epi32(rt);
1475 vfeps = _mm_frcz_pd(rt);
1477 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1479 twovfeps = _mm_add_pd(vfeps,vfeps);
1480 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1482 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1483 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1484 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1485 GMX_MM_TRANSPOSE2_PD(Y,F);
1486 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1487 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1488 GMX_MM_TRANSPOSE2_PD(G,H);
1489 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1490 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1491 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1493 /* CUBIC SPLINE TABLE DISPERSION */
1494 vfitab = _mm_add_epi32(vfitab,ifour);
1495 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1496 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1497 GMX_MM_TRANSPOSE2_PD(Y,F);
1498 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1499 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1500 GMX_MM_TRANSPOSE2_PD(G,H);
1501 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1502 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1503 fvdw6 = _mm_mul_pd(c6_00,FF);
1505 /* CUBIC SPLINE TABLE REPULSION */
1506 vfitab = _mm_add_epi32(vfitab,ifour);
1507 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1508 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1509 GMX_MM_TRANSPOSE2_PD(Y,F);
1510 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1511 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1512 GMX_MM_TRANSPOSE2_PD(G,H);
1513 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1514 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1515 fvdw12 = _mm_mul_pd(c12_00,FF);
1516 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1518 fscal = _mm_add_pd(felec,fvdw);
1520 /* Update vectorial force */
1521 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1522 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1523 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1525 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1526 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1527 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1529 /**************************
1530 * CALCULATE INTERACTIONS *
1531 **************************/
1533 r01 = _mm_mul_pd(rsq01,rinv01);
1535 /* Calculate table index by multiplying r with table scale and truncate to integer */
1536 rt = _mm_mul_pd(r01,vftabscale);
1537 vfitab = _mm_cvttpd_epi32(rt);
1539 vfeps = _mm_frcz_pd(rt);
1541 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1543 twovfeps = _mm_add_pd(vfeps,vfeps);
1544 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1546 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1547 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1548 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1549 GMX_MM_TRANSPOSE2_PD(Y,F);
1550 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1551 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1552 GMX_MM_TRANSPOSE2_PD(G,H);
1553 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1554 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1555 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1559 /* Update vectorial force */
1560 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1561 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1562 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1564 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1565 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1566 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1568 /**************************
1569 * CALCULATE INTERACTIONS *
1570 **************************/
1572 r02 = _mm_mul_pd(rsq02,rinv02);
1574 /* Calculate table index by multiplying r with table scale and truncate to integer */
1575 rt = _mm_mul_pd(r02,vftabscale);
1576 vfitab = _mm_cvttpd_epi32(rt);
1578 vfeps = _mm_frcz_pd(rt);
1580 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1582 twovfeps = _mm_add_pd(vfeps,vfeps);
1583 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1585 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1586 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1587 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1588 GMX_MM_TRANSPOSE2_PD(Y,F);
1589 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1590 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1591 GMX_MM_TRANSPOSE2_PD(G,H);
1592 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1593 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1594 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1598 /* Update vectorial force */
1599 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1600 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1601 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1603 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1604 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1605 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1607 /**************************
1608 * CALCULATE INTERACTIONS *
1609 **************************/
1611 r10 = _mm_mul_pd(rsq10,rinv10);
1613 /* Calculate table index by multiplying r with table scale and truncate to integer */
1614 rt = _mm_mul_pd(r10,vftabscale);
1615 vfitab = _mm_cvttpd_epi32(rt);
1617 vfeps = _mm_frcz_pd(rt);
1619 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1621 twovfeps = _mm_add_pd(vfeps,vfeps);
1622 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1624 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1625 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1626 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1627 GMX_MM_TRANSPOSE2_PD(Y,F);
1628 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1629 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1630 GMX_MM_TRANSPOSE2_PD(G,H);
1631 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1632 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1633 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1637 /* Update vectorial force */
1638 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1639 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1640 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1642 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1643 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1644 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1646 /**************************
1647 * CALCULATE INTERACTIONS *
1648 **************************/
1650 r11 = _mm_mul_pd(rsq11,rinv11);
1652 /* Calculate table index by multiplying r with table scale and truncate to integer */
1653 rt = _mm_mul_pd(r11,vftabscale);
1654 vfitab = _mm_cvttpd_epi32(rt);
1656 vfeps = _mm_frcz_pd(rt);
1658 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1660 twovfeps = _mm_add_pd(vfeps,vfeps);
1661 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1663 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1664 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1665 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1666 GMX_MM_TRANSPOSE2_PD(Y,F);
1667 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1668 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1669 GMX_MM_TRANSPOSE2_PD(G,H);
1670 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1671 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1672 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1676 /* Update vectorial force */
1677 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1678 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1679 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1681 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1682 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1683 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1685 /**************************
1686 * CALCULATE INTERACTIONS *
1687 **************************/
1689 r12 = _mm_mul_pd(rsq12,rinv12);
1691 /* Calculate table index by multiplying r with table scale and truncate to integer */
1692 rt = _mm_mul_pd(r12,vftabscale);
1693 vfitab = _mm_cvttpd_epi32(rt);
1695 vfeps = _mm_frcz_pd(rt);
1697 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1699 twovfeps = _mm_add_pd(vfeps,vfeps);
1700 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1702 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1703 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1704 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1705 GMX_MM_TRANSPOSE2_PD(Y,F);
1706 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1707 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1708 GMX_MM_TRANSPOSE2_PD(G,H);
1709 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1710 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1711 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1715 /* Update vectorial force */
1716 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1717 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1718 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1720 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1721 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1722 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1724 /**************************
1725 * CALCULATE INTERACTIONS *
1726 **************************/
1728 r20 = _mm_mul_pd(rsq20,rinv20);
1730 /* Calculate table index by multiplying r with table scale and truncate to integer */
1731 rt = _mm_mul_pd(r20,vftabscale);
1732 vfitab = _mm_cvttpd_epi32(rt);
1734 vfeps = _mm_frcz_pd(rt);
1736 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1738 twovfeps = _mm_add_pd(vfeps,vfeps);
1739 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1741 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1742 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1743 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1744 GMX_MM_TRANSPOSE2_PD(Y,F);
1745 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1746 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1747 GMX_MM_TRANSPOSE2_PD(G,H);
1748 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1749 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1750 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1754 /* Update vectorial force */
1755 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1756 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1757 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1759 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1760 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1761 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1763 /**************************
1764 * CALCULATE INTERACTIONS *
1765 **************************/
1767 r21 = _mm_mul_pd(rsq21,rinv21);
1769 /* Calculate table index by multiplying r with table scale and truncate to integer */
1770 rt = _mm_mul_pd(r21,vftabscale);
1771 vfitab = _mm_cvttpd_epi32(rt);
1773 vfeps = _mm_frcz_pd(rt);
1775 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1777 twovfeps = _mm_add_pd(vfeps,vfeps);
1778 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1780 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1781 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1782 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1783 GMX_MM_TRANSPOSE2_PD(Y,F);
1784 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1785 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1786 GMX_MM_TRANSPOSE2_PD(G,H);
1787 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1788 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1789 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1793 /* Update vectorial force */
1794 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1795 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1796 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1798 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1799 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1800 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1802 /**************************
1803 * CALCULATE INTERACTIONS *
1804 **************************/
1806 r22 = _mm_mul_pd(rsq22,rinv22);
1808 /* Calculate table index by multiplying r with table scale and truncate to integer */
1809 rt = _mm_mul_pd(r22,vftabscale);
1810 vfitab = _mm_cvttpd_epi32(rt);
1812 vfeps = _mm_frcz_pd(rt);
1814 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1816 twovfeps = _mm_add_pd(vfeps,vfeps);
1817 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1819 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1820 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1821 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1822 GMX_MM_TRANSPOSE2_PD(Y,F);
1823 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1824 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1825 GMX_MM_TRANSPOSE2_PD(G,H);
1826 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1827 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1828 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1832 /* Update vectorial force */
1833 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1834 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1835 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1837 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1838 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1839 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1841 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1843 /* Inner loop uses 400 flops */
1846 if(jidx<j_index_end)
1850 j_coord_offsetA = DIM*jnrA;
1852 /* load j atom coordinates */
1853 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1854 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1856 /* Calculate displacement vector */
1857 dx00 = _mm_sub_pd(ix0,jx0);
1858 dy00 = _mm_sub_pd(iy0,jy0);
1859 dz00 = _mm_sub_pd(iz0,jz0);
1860 dx01 = _mm_sub_pd(ix0,jx1);
1861 dy01 = _mm_sub_pd(iy0,jy1);
1862 dz01 = _mm_sub_pd(iz0,jz1);
1863 dx02 = _mm_sub_pd(ix0,jx2);
1864 dy02 = _mm_sub_pd(iy0,jy2);
1865 dz02 = _mm_sub_pd(iz0,jz2);
1866 dx10 = _mm_sub_pd(ix1,jx0);
1867 dy10 = _mm_sub_pd(iy1,jy0);
1868 dz10 = _mm_sub_pd(iz1,jz0);
1869 dx11 = _mm_sub_pd(ix1,jx1);
1870 dy11 = _mm_sub_pd(iy1,jy1);
1871 dz11 = _mm_sub_pd(iz1,jz1);
1872 dx12 = _mm_sub_pd(ix1,jx2);
1873 dy12 = _mm_sub_pd(iy1,jy2);
1874 dz12 = _mm_sub_pd(iz1,jz2);
1875 dx20 = _mm_sub_pd(ix2,jx0);
1876 dy20 = _mm_sub_pd(iy2,jy0);
1877 dz20 = _mm_sub_pd(iz2,jz0);
1878 dx21 = _mm_sub_pd(ix2,jx1);
1879 dy21 = _mm_sub_pd(iy2,jy1);
1880 dz21 = _mm_sub_pd(iz2,jz1);
1881 dx22 = _mm_sub_pd(ix2,jx2);
1882 dy22 = _mm_sub_pd(iy2,jy2);
1883 dz22 = _mm_sub_pd(iz2,jz2);
1885 /* Calculate squared distance and things based on it */
1886 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1887 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1888 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1889 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1890 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1891 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1892 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1893 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1894 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1896 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1897 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1898 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1899 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1900 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1901 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1902 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1903 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1904 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1906 fjx0 = _mm_setzero_pd();
1907 fjy0 = _mm_setzero_pd();
1908 fjz0 = _mm_setzero_pd();
1909 fjx1 = _mm_setzero_pd();
1910 fjy1 = _mm_setzero_pd();
1911 fjz1 = _mm_setzero_pd();
1912 fjx2 = _mm_setzero_pd();
1913 fjy2 = _mm_setzero_pd();
1914 fjz2 = _mm_setzero_pd();
1916 /**************************
1917 * CALCULATE INTERACTIONS *
1918 **************************/
1920 r00 = _mm_mul_pd(rsq00,rinv00);
1922 /* Calculate table index by multiplying r with table scale and truncate to integer */
1923 rt = _mm_mul_pd(r00,vftabscale);
1924 vfitab = _mm_cvttpd_epi32(rt);
1926 vfeps = _mm_frcz_pd(rt);
1928 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1930 twovfeps = _mm_add_pd(vfeps,vfeps);
1931 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1933 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1934 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1935 F = _mm_setzero_pd();
1936 GMX_MM_TRANSPOSE2_PD(Y,F);
1937 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1938 H = _mm_setzero_pd();
1939 GMX_MM_TRANSPOSE2_PD(G,H);
1940 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1941 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1942 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1944 /* CUBIC SPLINE TABLE DISPERSION */
1945 vfitab = _mm_add_epi32(vfitab,ifour);
1946 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1947 F = _mm_setzero_pd();
1948 GMX_MM_TRANSPOSE2_PD(Y,F);
1949 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1950 H = _mm_setzero_pd();
1951 GMX_MM_TRANSPOSE2_PD(G,H);
1952 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1953 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1954 fvdw6 = _mm_mul_pd(c6_00,FF);
1956 /* CUBIC SPLINE TABLE REPULSION */
1957 vfitab = _mm_add_epi32(vfitab,ifour);
1958 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1959 F = _mm_setzero_pd();
1960 GMX_MM_TRANSPOSE2_PD(Y,F);
1961 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1962 H = _mm_setzero_pd();
1963 GMX_MM_TRANSPOSE2_PD(G,H);
1964 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1965 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1966 fvdw12 = _mm_mul_pd(c12_00,FF);
1967 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1969 fscal = _mm_add_pd(felec,fvdw);
1971 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1973 /* Update vectorial force */
1974 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1975 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1976 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1978 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1979 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1980 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1982 /**************************
1983 * CALCULATE INTERACTIONS *
1984 **************************/
1986 r01 = _mm_mul_pd(rsq01,rinv01);
1988 /* Calculate table index by multiplying r with table scale and truncate to integer */
1989 rt = _mm_mul_pd(r01,vftabscale);
1990 vfitab = _mm_cvttpd_epi32(rt);
1992 vfeps = _mm_frcz_pd(rt);
1994 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1996 twovfeps = _mm_add_pd(vfeps,vfeps);
1997 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1999 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2000 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2001 F = _mm_setzero_pd();
2002 GMX_MM_TRANSPOSE2_PD(Y,F);
2003 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2004 H = _mm_setzero_pd();
2005 GMX_MM_TRANSPOSE2_PD(G,H);
2006 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2007 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2008 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
2012 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2014 /* Update vectorial force */
2015 fix0 = _mm_macc_pd(dx01,fscal,fix0);
2016 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
2017 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
2019 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
2020 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
2021 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
2023 /**************************
2024 * CALCULATE INTERACTIONS *
2025 **************************/
2027 r02 = _mm_mul_pd(rsq02,rinv02);
2029 /* Calculate table index by multiplying r with table scale and truncate to integer */
2030 rt = _mm_mul_pd(r02,vftabscale);
2031 vfitab = _mm_cvttpd_epi32(rt);
2033 vfeps = _mm_frcz_pd(rt);
2035 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2037 twovfeps = _mm_add_pd(vfeps,vfeps);
2038 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2040 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2041 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2042 F = _mm_setzero_pd();
2043 GMX_MM_TRANSPOSE2_PD(Y,F);
2044 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2045 H = _mm_setzero_pd();
2046 GMX_MM_TRANSPOSE2_PD(G,H);
2047 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2048 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2049 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2053 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2055 /* Update vectorial force */
2056 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2057 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2058 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2060 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2061 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2062 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2064 /**************************
2065 * CALCULATE INTERACTIONS *
2066 **************************/
2068 r10 = _mm_mul_pd(rsq10,rinv10);
2070 /* Calculate table index by multiplying r with table scale and truncate to integer */
2071 rt = _mm_mul_pd(r10,vftabscale);
2072 vfitab = _mm_cvttpd_epi32(rt);
2074 vfeps = _mm_frcz_pd(rt);
2076 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2078 twovfeps = _mm_add_pd(vfeps,vfeps);
2079 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2081 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2082 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2083 F = _mm_setzero_pd();
2084 GMX_MM_TRANSPOSE2_PD(Y,F);
2085 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2086 H = _mm_setzero_pd();
2087 GMX_MM_TRANSPOSE2_PD(G,H);
2088 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2089 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2090 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2094 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2096 /* Update vectorial force */
2097 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2098 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2099 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2101 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2102 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2103 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2105 /**************************
2106 * CALCULATE INTERACTIONS *
2107 **************************/
2109 r11 = _mm_mul_pd(rsq11,rinv11);
2111 /* Calculate table index by multiplying r with table scale and truncate to integer */
2112 rt = _mm_mul_pd(r11,vftabscale);
2113 vfitab = _mm_cvttpd_epi32(rt);
2115 vfeps = _mm_frcz_pd(rt);
2117 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2119 twovfeps = _mm_add_pd(vfeps,vfeps);
2120 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2122 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2123 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2124 F = _mm_setzero_pd();
2125 GMX_MM_TRANSPOSE2_PD(Y,F);
2126 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2127 H = _mm_setzero_pd();
2128 GMX_MM_TRANSPOSE2_PD(G,H);
2129 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2130 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2131 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2135 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2137 /* Update vectorial force */
2138 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2139 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2140 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2142 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2143 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2144 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2146 /**************************
2147 * CALCULATE INTERACTIONS *
2148 **************************/
2150 r12 = _mm_mul_pd(rsq12,rinv12);
2152 /* Calculate table index by multiplying r with table scale and truncate to integer */
2153 rt = _mm_mul_pd(r12,vftabscale);
2154 vfitab = _mm_cvttpd_epi32(rt);
2156 vfeps = _mm_frcz_pd(rt);
2158 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2160 twovfeps = _mm_add_pd(vfeps,vfeps);
2161 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2163 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2164 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2165 F = _mm_setzero_pd();
2166 GMX_MM_TRANSPOSE2_PD(Y,F);
2167 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2168 H = _mm_setzero_pd();
2169 GMX_MM_TRANSPOSE2_PD(G,H);
2170 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2171 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2172 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2176 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2178 /* Update vectorial force */
2179 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2180 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2181 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2183 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2184 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2185 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2187 /**************************
2188 * CALCULATE INTERACTIONS *
2189 **************************/
2191 r20 = _mm_mul_pd(rsq20,rinv20);
2193 /* Calculate table index by multiplying r with table scale and truncate to integer */
2194 rt = _mm_mul_pd(r20,vftabscale);
2195 vfitab = _mm_cvttpd_epi32(rt);
2197 vfeps = _mm_frcz_pd(rt);
2199 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2201 twovfeps = _mm_add_pd(vfeps,vfeps);
2202 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2204 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2205 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2206 F = _mm_setzero_pd();
2207 GMX_MM_TRANSPOSE2_PD(Y,F);
2208 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2209 H = _mm_setzero_pd();
2210 GMX_MM_TRANSPOSE2_PD(G,H);
2211 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2212 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2213 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2217 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2219 /* Update vectorial force */
2220 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2221 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2222 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2224 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2225 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2226 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2228 /**************************
2229 * CALCULATE INTERACTIONS *
2230 **************************/
2232 r21 = _mm_mul_pd(rsq21,rinv21);
2234 /* Calculate table index by multiplying r with table scale and truncate to integer */
2235 rt = _mm_mul_pd(r21,vftabscale);
2236 vfitab = _mm_cvttpd_epi32(rt);
2238 vfeps = _mm_frcz_pd(rt);
2240 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2242 twovfeps = _mm_add_pd(vfeps,vfeps);
2243 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2245 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2246 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2247 F = _mm_setzero_pd();
2248 GMX_MM_TRANSPOSE2_PD(Y,F);
2249 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2250 H = _mm_setzero_pd();
2251 GMX_MM_TRANSPOSE2_PD(G,H);
2252 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2253 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2254 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2258 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2260 /* Update vectorial force */
2261 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2262 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2263 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2265 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2266 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2267 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2269 /**************************
2270 * CALCULATE INTERACTIONS *
2271 **************************/
2273 r22 = _mm_mul_pd(rsq22,rinv22);
2275 /* Calculate table index by multiplying r with table scale and truncate to integer */
2276 rt = _mm_mul_pd(r22,vftabscale);
2277 vfitab = _mm_cvttpd_epi32(rt);
2279 vfeps = _mm_frcz_pd(rt);
2281 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2283 twovfeps = _mm_add_pd(vfeps,vfeps);
2284 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2286 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2287 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2288 F = _mm_setzero_pd();
2289 GMX_MM_TRANSPOSE2_PD(Y,F);
2290 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2291 H = _mm_setzero_pd();
2292 GMX_MM_TRANSPOSE2_PD(G,H);
2293 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2294 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2295 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2299 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2301 /* Update vectorial force */
2302 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2303 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2304 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2306 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2307 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2308 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2310 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2312 /* Inner loop uses 400 flops */
2315 /* End of innermost loop */
2317 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2318 f+i_coord_offset,fshift+i_shift_offset);
2320 /* Increment number of inner iterations */
2321 inneriter += j_index_end - j_index_start;
2323 /* Outer loop uses 18 flops */
2326 /* Increment number of outer iterations */
2329 /* Update outer/inner flops */
2331 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*400);