2 * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_double.h"
34 #include "kernelutil_x86_sse4_1_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
99 __m128d dummy_mask,cutoff_mask;
100 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one = _mm_set1_pd(1.0);
102 __m128d two = _mm_set1_pd(2.0);
108 jindex = nlist->jindex;
110 shiftidx = nlist->shift;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_pd(fr->epsfac);
115 charge = mdatoms->chargeA;
116 nvdwtype = fr->ntype;
118 vdwtype = mdatoms->typeA;
120 vftab = kernel_data->table_elec->data;
121 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
123 /* Setup water-specific parameters */
124 inr = nlist->iinr[0];
125 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
126 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
128 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
130 jq0 = _mm_set1_pd(charge[inr+0]);
131 jq1 = _mm_set1_pd(charge[inr+1]);
132 jq2 = _mm_set1_pd(charge[inr+2]);
133 vdwjidx0A = 2*vdwtype[inr+0];
134 qq00 = _mm_mul_pd(iq0,jq0);
135 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
137 qq01 = _mm_mul_pd(iq0,jq1);
138 qq02 = _mm_mul_pd(iq0,jq2);
139 qq10 = _mm_mul_pd(iq1,jq0);
140 qq11 = _mm_mul_pd(iq1,jq1);
141 qq12 = _mm_mul_pd(iq1,jq2);
142 qq20 = _mm_mul_pd(iq2,jq0);
143 qq21 = _mm_mul_pd(iq2,jq1);
144 qq22 = _mm_mul_pd(iq2,jq2);
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
172 fix0 = _mm_setzero_pd();
173 fiy0 = _mm_setzero_pd();
174 fiz0 = _mm_setzero_pd();
175 fix1 = _mm_setzero_pd();
176 fiy1 = _mm_setzero_pd();
177 fiz1 = _mm_setzero_pd();
178 fix2 = _mm_setzero_pd();
179 fiy2 = _mm_setzero_pd();
180 fiz2 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
184 vvdwsum = _mm_setzero_pd();
186 /* Start inner kernel loop */
187 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
190 /* Get j neighbor index, and coordinate index */
193 j_coord_offsetA = DIM*jnrA;
194 j_coord_offsetB = DIM*jnrB;
196 /* load j atom coordinates */
197 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
198 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
200 /* Calculate displacement vector */
201 dx00 = _mm_sub_pd(ix0,jx0);
202 dy00 = _mm_sub_pd(iy0,jy0);
203 dz00 = _mm_sub_pd(iz0,jz0);
204 dx01 = _mm_sub_pd(ix0,jx1);
205 dy01 = _mm_sub_pd(iy0,jy1);
206 dz01 = _mm_sub_pd(iz0,jz1);
207 dx02 = _mm_sub_pd(ix0,jx2);
208 dy02 = _mm_sub_pd(iy0,jy2);
209 dz02 = _mm_sub_pd(iz0,jz2);
210 dx10 = _mm_sub_pd(ix1,jx0);
211 dy10 = _mm_sub_pd(iy1,jy0);
212 dz10 = _mm_sub_pd(iz1,jz0);
213 dx11 = _mm_sub_pd(ix1,jx1);
214 dy11 = _mm_sub_pd(iy1,jy1);
215 dz11 = _mm_sub_pd(iz1,jz1);
216 dx12 = _mm_sub_pd(ix1,jx2);
217 dy12 = _mm_sub_pd(iy1,jy2);
218 dz12 = _mm_sub_pd(iz1,jz2);
219 dx20 = _mm_sub_pd(ix2,jx0);
220 dy20 = _mm_sub_pd(iy2,jy0);
221 dz20 = _mm_sub_pd(iz2,jz0);
222 dx21 = _mm_sub_pd(ix2,jx1);
223 dy21 = _mm_sub_pd(iy2,jy1);
224 dz21 = _mm_sub_pd(iz2,jz1);
225 dx22 = _mm_sub_pd(ix2,jx2);
226 dy22 = _mm_sub_pd(iy2,jy2);
227 dz22 = _mm_sub_pd(iz2,jz2);
229 /* Calculate squared distance and things based on it */
230 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
232 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
233 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
234 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
235 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
236 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
237 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
238 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
240 rinv00 = gmx_mm_invsqrt_pd(rsq00);
241 rinv01 = gmx_mm_invsqrt_pd(rsq01);
242 rinv02 = gmx_mm_invsqrt_pd(rsq02);
243 rinv10 = gmx_mm_invsqrt_pd(rsq10);
244 rinv11 = gmx_mm_invsqrt_pd(rsq11);
245 rinv12 = gmx_mm_invsqrt_pd(rsq12);
246 rinv20 = gmx_mm_invsqrt_pd(rsq20);
247 rinv21 = gmx_mm_invsqrt_pd(rsq21);
248 rinv22 = gmx_mm_invsqrt_pd(rsq22);
250 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
252 fjx0 = _mm_setzero_pd();
253 fjy0 = _mm_setzero_pd();
254 fjz0 = _mm_setzero_pd();
255 fjx1 = _mm_setzero_pd();
256 fjy1 = _mm_setzero_pd();
257 fjz1 = _mm_setzero_pd();
258 fjx2 = _mm_setzero_pd();
259 fjy2 = _mm_setzero_pd();
260 fjz2 = _mm_setzero_pd();
262 /**************************
263 * CALCULATE INTERACTIONS *
264 **************************/
266 r00 = _mm_mul_pd(rsq00,rinv00);
268 /* Calculate table index by multiplying r with table scale and truncate to integer */
269 rt = _mm_mul_pd(r00,vftabscale);
270 vfitab = _mm_cvttpd_epi32(rt);
271 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
272 vfitab = _mm_slli_epi32(vfitab,2);
274 /* CUBIC SPLINE TABLE ELECTROSTATICS */
275 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
276 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
277 GMX_MM_TRANSPOSE2_PD(Y,F);
278 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
279 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
280 GMX_MM_TRANSPOSE2_PD(G,H);
281 Heps = _mm_mul_pd(vfeps,H);
282 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
283 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
284 velec = _mm_mul_pd(qq00,VV);
285 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
286 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
288 /* LENNARD-JONES DISPERSION/REPULSION */
290 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
291 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
292 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
293 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
294 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
296 /* Update potential sum for this i atom from the interaction with this j atom. */
297 velecsum = _mm_add_pd(velecsum,velec);
298 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
300 fscal = _mm_add_pd(felec,fvdw);
302 /* Calculate temporary vectorial force */
303 tx = _mm_mul_pd(fscal,dx00);
304 ty = _mm_mul_pd(fscal,dy00);
305 tz = _mm_mul_pd(fscal,dz00);
307 /* Update vectorial force */
308 fix0 = _mm_add_pd(fix0,tx);
309 fiy0 = _mm_add_pd(fiy0,ty);
310 fiz0 = _mm_add_pd(fiz0,tz);
312 fjx0 = _mm_add_pd(fjx0,tx);
313 fjy0 = _mm_add_pd(fjy0,ty);
314 fjz0 = _mm_add_pd(fjz0,tz);
316 /**************************
317 * CALCULATE INTERACTIONS *
318 **************************/
320 r01 = _mm_mul_pd(rsq01,rinv01);
322 /* Calculate table index by multiplying r with table scale and truncate to integer */
323 rt = _mm_mul_pd(r01,vftabscale);
324 vfitab = _mm_cvttpd_epi32(rt);
325 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
326 vfitab = _mm_slli_epi32(vfitab,2);
328 /* CUBIC SPLINE TABLE ELECTROSTATICS */
329 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
330 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
331 GMX_MM_TRANSPOSE2_PD(Y,F);
332 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
333 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
334 GMX_MM_TRANSPOSE2_PD(G,H);
335 Heps = _mm_mul_pd(vfeps,H);
336 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
337 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
338 velec = _mm_mul_pd(qq01,VV);
339 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
340 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
342 /* Update potential sum for this i atom from the interaction with this j atom. */
343 velecsum = _mm_add_pd(velecsum,velec);
347 /* Calculate temporary vectorial force */
348 tx = _mm_mul_pd(fscal,dx01);
349 ty = _mm_mul_pd(fscal,dy01);
350 tz = _mm_mul_pd(fscal,dz01);
352 /* Update vectorial force */
353 fix0 = _mm_add_pd(fix0,tx);
354 fiy0 = _mm_add_pd(fiy0,ty);
355 fiz0 = _mm_add_pd(fiz0,tz);
357 fjx1 = _mm_add_pd(fjx1,tx);
358 fjy1 = _mm_add_pd(fjy1,ty);
359 fjz1 = _mm_add_pd(fjz1,tz);
361 /**************************
362 * CALCULATE INTERACTIONS *
363 **************************/
365 r02 = _mm_mul_pd(rsq02,rinv02);
367 /* Calculate table index by multiplying r with table scale and truncate to integer */
368 rt = _mm_mul_pd(r02,vftabscale);
369 vfitab = _mm_cvttpd_epi32(rt);
370 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
371 vfitab = _mm_slli_epi32(vfitab,2);
373 /* CUBIC SPLINE TABLE ELECTROSTATICS */
374 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
375 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
376 GMX_MM_TRANSPOSE2_PD(Y,F);
377 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
378 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
379 GMX_MM_TRANSPOSE2_PD(G,H);
380 Heps = _mm_mul_pd(vfeps,H);
381 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
382 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
383 velec = _mm_mul_pd(qq02,VV);
384 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
385 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
387 /* Update potential sum for this i atom from the interaction with this j atom. */
388 velecsum = _mm_add_pd(velecsum,velec);
392 /* Calculate temporary vectorial force */
393 tx = _mm_mul_pd(fscal,dx02);
394 ty = _mm_mul_pd(fscal,dy02);
395 tz = _mm_mul_pd(fscal,dz02);
397 /* Update vectorial force */
398 fix0 = _mm_add_pd(fix0,tx);
399 fiy0 = _mm_add_pd(fiy0,ty);
400 fiz0 = _mm_add_pd(fiz0,tz);
402 fjx2 = _mm_add_pd(fjx2,tx);
403 fjy2 = _mm_add_pd(fjy2,ty);
404 fjz2 = _mm_add_pd(fjz2,tz);
406 /**************************
407 * CALCULATE INTERACTIONS *
408 **************************/
410 r10 = _mm_mul_pd(rsq10,rinv10);
412 /* Calculate table index by multiplying r with table scale and truncate to integer */
413 rt = _mm_mul_pd(r10,vftabscale);
414 vfitab = _mm_cvttpd_epi32(rt);
415 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
416 vfitab = _mm_slli_epi32(vfitab,2);
418 /* CUBIC SPLINE TABLE ELECTROSTATICS */
419 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
420 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
421 GMX_MM_TRANSPOSE2_PD(Y,F);
422 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
423 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
424 GMX_MM_TRANSPOSE2_PD(G,H);
425 Heps = _mm_mul_pd(vfeps,H);
426 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
427 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
428 velec = _mm_mul_pd(qq10,VV);
429 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
430 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velecsum = _mm_add_pd(velecsum,velec);
437 /* Calculate temporary vectorial force */
438 tx = _mm_mul_pd(fscal,dx10);
439 ty = _mm_mul_pd(fscal,dy10);
440 tz = _mm_mul_pd(fscal,dz10);
442 /* Update vectorial force */
443 fix1 = _mm_add_pd(fix1,tx);
444 fiy1 = _mm_add_pd(fiy1,ty);
445 fiz1 = _mm_add_pd(fiz1,tz);
447 fjx0 = _mm_add_pd(fjx0,tx);
448 fjy0 = _mm_add_pd(fjy0,ty);
449 fjz0 = _mm_add_pd(fjz0,tz);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 r11 = _mm_mul_pd(rsq11,rinv11);
457 /* Calculate table index by multiplying r with table scale and truncate to integer */
458 rt = _mm_mul_pd(r11,vftabscale);
459 vfitab = _mm_cvttpd_epi32(rt);
460 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
461 vfitab = _mm_slli_epi32(vfitab,2);
463 /* CUBIC SPLINE TABLE ELECTROSTATICS */
464 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
465 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
466 GMX_MM_TRANSPOSE2_PD(Y,F);
467 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
468 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
469 GMX_MM_TRANSPOSE2_PD(G,H);
470 Heps = _mm_mul_pd(vfeps,H);
471 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
472 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
473 velec = _mm_mul_pd(qq11,VV);
474 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
475 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velecsum = _mm_add_pd(velecsum,velec);
482 /* Calculate temporary vectorial force */
483 tx = _mm_mul_pd(fscal,dx11);
484 ty = _mm_mul_pd(fscal,dy11);
485 tz = _mm_mul_pd(fscal,dz11);
487 /* Update vectorial force */
488 fix1 = _mm_add_pd(fix1,tx);
489 fiy1 = _mm_add_pd(fiy1,ty);
490 fiz1 = _mm_add_pd(fiz1,tz);
492 fjx1 = _mm_add_pd(fjx1,tx);
493 fjy1 = _mm_add_pd(fjy1,ty);
494 fjz1 = _mm_add_pd(fjz1,tz);
496 /**************************
497 * CALCULATE INTERACTIONS *
498 **************************/
500 r12 = _mm_mul_pd(rsq12,rinv12);
502 /* Calculate table index by multiplying r with table scale and truncate to integer */
503 rt = _mm_mul_pd(r12,vftabscale);
504 vfitab = _mm_cvttpd_epi32(rt);
505 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
506 vfitab = _mm_slli_epi32(vfitab,2);
508 /* CUBIC SPLINE TABLE ELECTROSTATICS */
509 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
510 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
511 GMX_MM_TRANSPOSE2_PD(Y,F);
512 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
513 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
514 GMX_MM_TRANSPOSE2_PD(G,H);
515 Heps = _mm_mul_pd(vfeps,H);
516 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
517 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
518 velec = _mm_mul_pd(qq12,VV);
519 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
520 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velecsum = _mm_add_pd(velecsum,velec);
527 /* Calculate temporary vectorial force */
528 tx = _mm_mul_pd(fscal,dx12);
529 ty = _mm_mul_pd(fscal,dy12);
530 tz = _mm_mul_pd(fscal,dz12);
532 /* Update vectorial force */
533 fix1 = _mm_add_pd(fix1,tx);
534 fiy1 = _mm_add_pd(fiy1,ty);
535 fiz1 = _mm_add_pd(fiz1,tz);
537 fjx2 = _mm_add_pd(fjx2,tx);
538 fjy2 = _mm_add_pd(fjy2,ty);
539 fjz2 = _mm_add_pd(fjz2,tz);
541 /**************************
542 * CALCULATE INTERACTIONS *
543 **************************/
545 r20 = _mm_mul_pd(rsq20,rinv20);
547 /* Calculate table index by multiplying r with table scale and truncate to integer */
548 rt = _mm_mul_pd(r20,vftabscale);
549 vfitab = _mm_cvttpd_epi32(rt);
550 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
551 vfitab = _mm_slli_epi32(vfitab,2);
553 /* CUBIC SPLINE TABLE ELECTROSTATICS */
554 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
555 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
556 GMX_MM_TRANSPOSE2_PD(Y,F);
557 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
558 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
559 GMX_MM_TRANSPOSE2_PD(G,H);
560 Heps = _mm_mul_pd(vfeps,H);
561 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
562 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
563 velec = _mm_mul_pd(qq20,VV);
564 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
565 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum = _mm_add_pd(velecsum,velec);
572 /* Calculate temporary vectorial force */
573 tx = _mm_mul_pd(fscal,dx20);
574 ty = _mm_mul_pd(fscal,dy20);
575 tz = _mm_mul_pd(fscal,dz20);
577 /* Update vectorial force */
578 fix2 = _mm_add_pd(fix2,tx);
579 fiy2 = _mm_add_pd(fiy2,ty);
580 fiz2 = _mm_add_pd(fiz2,tz);
582 fjx0 = _mm_add_pd(fjx0,tx);
583 fjy0 = _mm_add_pd(fjy0,ty);
584 fjz0 = _mm_add_pd(fjz0,tz);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 r21 = _mm_mul_pd(rsq21,rinv21);
592 /* Calculate table index by multiplying r with table scale and truncate to integer */
593 rt = _mm_mul_pd(r21,vftabscale);
594 vfitab = _mm_cvttpd_epi32(rt);
595 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
596 vfitab = _mm_slli_epi32(vfitab,2);
598 /* CUBIC SPLINE TABLE ELECTROSTATICS */
599 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
600 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
601 GMX_MM_TRANSPOSE2_PD(Y,F);
602 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
603 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
604 GMX_MM_TRANSPOSE2_PD(G,H);
605 Heps = _mm_mul_pd(vfeps,H);
606 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
607 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
608 velec = _mm_mul_pd(qq21,VV);
609 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
610 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
612 /* Update potential sum for this i atom from the interaction with this j atom. */
613 velecsum = _mm_add_pd(velecsum,velec);
617 /* Calculate temporary vectorial force */
618 tx = _mm_mul_pd(fscal,dx21);
619 ty = _mm_mul_pd(fscal,dy21);
620 tz = _mm_mul_pd(fscal,dz21);
622 /* Update vectorial force */
623 fix2 = _mm_add_pd(fix2,tx);
624 fiy2 = _mm_add_pd(fiy2,ty);
625 fiz2 = _mm_add_pd(fiz2,tz);
627 fjx1 = _mm_add_pd(fjx1,tx);
628 fjy1 = _mm_add_pd(fjy1,ty);
629 fjz1 = _mm_add_pd(fjz1,tz);
631 /**************************
632 * CALCULATE INTERACTIONS *
633 **************************/
635 r22 = _mm_mul_pd(rsq22,rinv22);
637 /* Calculate table index by multiplying r with table scale and truncate to integer */
638 rt = _mm_mul_pd(r22,vftabscale);
639 vfitab = _mm_cvttpd_epi32(rt);
640 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
641 vfitab = _mm_slli_epi32(vfitab,2);
643 /* CUBIC SPLINE TABLE ELECTROSTATICS */
644 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
645 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
646 GMX_MM_TRANSPOSE2_PD(Y,F);
647 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
648 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
649 GMX_MM_TRANSPOSE2_PD(G,H);
650 Heps = _mm_mul_pd(vfeps,H);
651 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
652 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
653 velec = _mm_mul_pd(qq22,VV);
654 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
655 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velecsum = _mm_add_pd(velecsum,velec);
662 /* Calculate temporary vectorial force */
663 tx = _mm_mul_pd(fscal,dx22);
664 ty = _mm_mul_pd(fscal,dy22);
665 tz = _mm_mul_pd(fscal,dz22);
667 /* Update vectorial force */
668 fix2 = _mm_add_pd(fix2,tx);
669 fiy2 = _mm_add_pd(fiy2,ty);
670 fiz2 = _mm_add_pd(fiz2,tz);
672 fjx2 = _mm_add_pd(fjx2,tx);
673 fjy2 = _mm_add_pd(fjy2,ty);
674 fjz2 = _mm_add_pd(fjz2,tz);
676 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
678 /* Inner loop uses 400 flops */
685 j_coord_offsetA = DIM*jnrA;
687 /* load j atom coordinates */
688 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
689 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
691 /* Calculate displacement vector */
692 dx00 = _mm_sub_pd(ix0,jx0);
693 dy00 = _mm_sub_pd(iy0,jy0);
694 dz00 = _mm_sub_pd(iz0,jz0);
695 dx01 = _mm_sub_pd(ix0,jx1);
696 dy01 = _mm_sub_pd(iy0,jy1);
697 dz01 = _mm_sub_pd(iz0,jz1);
698 dx02 = _mm_sub_pd(ix0,jx2);
699 dy02 = _mm_sub_pd(iy0,jy2);
700 dz02 = _mm_sub_pd(iz0,jz2);
701 dx10 = _mm_sub_pd(ix1,jx0);
702 dy10 = _mm_sub_pd(iy1,jy0);
703 dz10 = _mm_sub_pd(iz1,jz0);
704 dx11 = _mm_sub_pd(ix1,jx1);
705 dy11 = _mm_sub_pd(iy1,jy1);
706 dz11 = _mm_sub_pd(iz1,jz1);
707 dx12 = _mm_sub_pd(ix1,jx2);
708 dy12 = _mm_sub_pd(iy1,jy2);
709 dz12 = _mm_sub_pd(iz1,jz2);
710 dx20 = _mm_sub_pd(ix2,jx0);
711 dy20 = _mm_sub_pd(iy2,jy0);
712 dz20 = _mm_sub_pd(iz2,jz0);
713 dx21 = _mm_sub_pd(ix2,jx1);
714 dy21 = _mm_sub_pd(iy2,jy1);
715 dz21 = _mm_sub_pd(iz2,jz1);
716 dx22 = _mm_sub_pd(ix2,jx2);
717 dy22 = _mm_sub_pd(iy2,jy2);
718 dz22 = _mm_sub_pd(iz2,jz2);
720 /* Calculate squared distance and things based on it */
721 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
722 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
723 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
724 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
725 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
726 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
727 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
728 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
729 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
731 rinv00 = gmx_mm_invsqrt_pd(rsq00);
732 rinv01 = gmx_mm_invsqrt_pd(rsq01);
733 rinv02 = gmx_mm_invsqrt_pd(rsq02);
734 rinv10 = gmx_mm_invsqrt_pd(rsq10);
735 rinv11 = gmx_mm_invsqrt_pd(rsq11);
736 rinv12 = gmx_mm_invsqrt_pd(rsq12);
737 rinv20 = gmx_mm_invsqrt_pd(rsq20);
738 rinv21 = gmx_mm_invsqrt_pd(rsq21);
739 rinv22 = gmx_mm_invsqrt_pd(rsq22);
741 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
743 fjx0 = _mm_setzero_pd();
744 fjy0 = _mm_setzero_pd();
745 fjz0 = _mm_setzero_pd();
746 fjx1 = _mm_setzero_pd();
747 fjy1 = _mm_setzero_pd();
748 fjz1 = _mm_setzero_pd();
749 fjx2 = _mm_setzero_pd();
750 fjy2 = _mm_setzero_pd();
751 fjz2 = _mm_setzero_pd();
753 /**************************
754 * CALCULATE INTERACTIONS *
755 **************************/
757 r00 = _mm_mul_pd(rsq00,rinv00);
759 /* Calculate table index by multiplying r with table scale and truncate to integer */
760 rt = _mm_mul_pd(r00,vftabscale);
761 vfitab = _mm_cvttpd_epi32(rt);
762 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
763 vfitab = _mm_slli_epi32(vfitab,2);
765 /* CUBIC SPLINE TABLE ELECTROSTATICS */
766 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
767 F = _mm_setzero_pd();
768 GMX_MM_TRANSPOSE2_PD(Y,F);
769 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
770 H = _mm_setzero_pd();
771 GMX_MM_TRANSPOSE2_PD(G,H);
772 Heps = _mm_mul_pd(vfeps,H);
773 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
774 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
775 velec = _mm_mul_pd(qq00,VV);
776 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
777 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
779 /* LENNARD-JONES DISPERSION/REPULSION */
781 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
782 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
783 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
784 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
785 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
787 /* Update potential sum for this i atom from the interaction with this j atom. */
788 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
789 velecsum = _mm_add_pd(velecsum,velec);
790 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
791 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
793 fscal = _mm_add_pd(felec,fvdw);
795 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
797 /* Calculate temporary vectorial force */
798 tx = _mm_mul_pd(fscal,dx00);
799 ty = _mm_mul_pd(fscal,dy00);
800 tz = _mm_mul_pd(fscal,dz00);
802 /* Update vectorial force */
803 fix0 = _mm_add_pd(fix0,tx);
804 fiy0 = _mm_add_pd(fiy0,ty);
805 fiz0 = _mm_add_pd(fiz0,tz);
807 fjx0 = _mm_add_pd(fjx0,tx);
808 fjy0 = _mm_add_pd(fjy0,ty);
809 fjz0 = _mm_add_pd(fjz0,tz);
811 /**************************
812 * CALCULATE INTERACTIONS *
813 **************************/
815 r01 = _mm_mul_pd(rsq01,rinv01);
817 /* Calculate table index by multiplying r with table scale and truncate to integer */
818 rt = _mm_mul_pd(r01,vftabscale);
819 vfitab = _mm_cvttpd_epi32(rt);
820 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
821 vfitab = _mm_slli_epi32(vfitab,2);
823 /* CUBIC SPLINE TABLE ELECTROSTATICS */
824 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
825 F = _mm_setzero_pd();
826 GMX_MM_TRANSPOSE2_PD(Y,F);
827 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
828 H = _mm_setzero_pd();
829 GMX_MM_TRANSPOSE2_PD(G,H);
830 Heps = _mm_mul_pd(vfeps,H);
831 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
832 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
833 velec = _mm_mul_pd(qq01,VV);
834 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
835 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
839 velecsum = _mm_add_pd(velecsum,velec);
843 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
845 /* Calculate temporary vectorial force */
846 tx = _mm_mul_pd(fscal,dx01);
847 ty = _mm_mul_pd(fscal,dy01);
848 tz = _mm_mul_pd(fscal,dz01);
850 /* Update vectorial force */
851 fix0 = _mm_add_pd(fix0,tx);
852 fiy0 = _mm_add_pd(fiy0,ty);
853 fiz0 = _mm_add_pd(fiz0,tz);
855 fjx1 = _mm_add_pd(fjx1,tx);
856 fjy1 = _mm_add_pd(fjy1,ty);
857 fjz1 = _mm_add_pd(fjz1,tz);
859 /**************************
860 * CALCULATE INTERACTIONS *
861 **************************/
863 r02 = _mm_mul_pd(rsq02,rinv02);
865 /* Calculate table index by multiplying r with table scale and truncate to integer */
866 rt = _mm_mul_pd(r02,vftabscale);
867 vfitab = _mm_cvttpd_epi32(rt);
868 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
869 vfitab = _mm_slli_epi32(vfitab,2);
871 /* CUBIC SPLINE TABLE ELECTROSTATICS */
872 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
873 F = _mm_setzero_pd();
874 GMX_MM_TRANSPOSE2_PD(Y,F);
875 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
876 H = _mm_setzero_pd();
877 GMX_MM_TRANSPOSE2_PD(G,H);
878 Heps = _mm_mul_pd(vfeps,H);
879 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
880 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
881 velec = _mm_mul_pd(qq02,VV);
882 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
883 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
885 /* Update potential sum for this i atom from the interaction with this j atom. */
886 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
887 velecsum = _mm_add_pd(velecsum,velec);
891 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
893 /* Calculate temporary vectorial force */
894 tx = _mm_mul_pd(fscal,dx02);
895 ty = _mm_mul_pd(fscal,dy02);
896 tz = _mm_mul_pd(fscal,dz02);
898 /* Update vectorial force */
899 fix0 = _mm_add_pd(fix0,tx);
900 fiy0 = _mm_add_pd(fiy0,ty);
901 fiz0 = _mm_add_pd(fiz0,tz);
903 fjx2 = _mm_add_pd(fjx2,tx);
904 fjy2 = _mm_add_pd(fjy2,ty);
905 fjz2 = _mm_add_pd(fjz2,tz);
907 /**************************
908 * CALCULATE INTERACTIONS *
909 **************************/
911 r10 = _mm_mul_pd(rsq10,rinv10);
913 /* Calculate table index by multiplying r with table scale and truncate to integer */
914 rt = _mm_mul_pd(r10,vftabscale);
915 vfitab = _mm_cvttpd_epi32(rt);
916 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
917 vfitab = _mm_slli_epi32(vfitab,2);
919 /* CUBIC SPLINE TABLE ELECTROSTATICS */
920 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
921 F = _mm_setzero_pd();
922 GMX_MM_TRANSPOSE2_PD(Y,F);
923 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
924 H = _mm_setzero_pd();
925 GMX_MM_TRANSPOSE2_PD(G,H);
926 Heps = _mm_mul_pd(vfeps,H);
927 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
928 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
929 velec = _mm_mul_pd(qq10,VV);
930 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
931 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
933 /* Update potential sum for this i atom from the interaction with this j atom. */
934 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
935 velecsum = _mm_add_pd(velecsum,velec);
939 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
941 /* Calculate temporary vectorial force */
942 tx = _mm_mul_pd(fscal,dx10);
943 ty = _mm_mul_pd(fscal,dy10);
944 tz = _mm_mul_pd(fscal,dz10);
946 /* Update vectorial force */
947 fix1 = _mm_add_pd(fix1,tx);
948 fiy1 = _mm_add_pd(fiy1,ty);
949 fiz1 = _mm_add_pd(fiz1,tz);
951 fjx0 = _mm_add_pd(fjx0,tx);
952 fjy0 = _mm_add_pd(fjy0,ty);
953 fjz0 = _mm_add_pd(fjz0,tz);
955 /**************************
956 * CALCULATE INTERACTIONS *
957 **************************/
959 r11 = _mm_mul_pd(rsq11,rinv11);
961 /* Calculate table index by multiplying r with table scale and truncate to integer */
962 rt = _mm_mul_pd(r11,vftabscale);
963 vfitab = _mm_cvttpd_epi32(rt);
964 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
965 vfitab = _mm_slli_epi32(vfitab,2);
967 /* CUBIC SPLINE TABLE ELECTROSTATICS */
968 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
969 F = _mm_setzero_pd();
970 GMX_MM_TRANSPOSE2_PD(Y,F);
971 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
972 H = _mm_setzero_pd();
973 GMX_MM_TRANSPOSE2_PD(G,H);
974 Heps = _mm_mul_pd(vfeps,H);
975 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
976 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
977 velec = _mm_mul_pd(qq11,VV);
978 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
979 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
981 /* Update potential sum for this i atom from the interaction with this j atom. */
982 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
983 velecsum = _mm_add_pd(velecsum,velec);
987 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
989 /* Calculate temporary vectorial force */
990 tx = _mm_mul_pd(fscal,dx11);
991 ty = _mm_mul_pd(fscal,dy11);
992 tz = _mm_mul_pd(fscal,dz11);
994 /* Update vectorial force */
995 fix1 = _mm_add_pd(fix1,tx);
996 fiy1 = _mm_add_pd(fiy1,ty);
997 fiz1 = _mm_add_pd(fiz1,tz);
999 fjx1 = _mm_add_pd(fjx1,tx);
1000 fjy1 = _mm_add_pd(fjy1,ty);
1001 fjz1 = _mm_add_pd(fjz1,tz);
1003 /**************************
1004 * CALCULATE INTERACTIONS *
1005 **************************/
1007 r12 = _mm_mul_pd(rsq12,rinv12);
1009 /* Calculate table index by multiplying r with table scale and truncate to integer */
1010 rt = _mm_mul_pd(r12,vftabscale);
1011 vfitab = _mm_cvttpd_epi32(rt);
1012 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1013 vfitab = _mm_slli_epi32(vfitab,2);
1015 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1016 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1017 F = _mm_setzero_pd();
1018 GMX_MM_TRANSPOSE2_PD(Y,F);
1019 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1020 H = _mm_setzero_pd();
1021 GMX_MM_TRANSPOSE2_PD(G,H);
1022 Heps = _mm_mul_pd(vfeps,H);
1023 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1024 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1025 velec = _mm_mul_pd(qq12,VV);
1026 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1027 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1029 /* Update potential sum for this i atom from the interaction with this j atom. */
1030 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1031 velecsum = _mm_add_pd(velecsum,velec);
1035 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1037 /* Calculate temporary vectorial force */
1038 tx = _mm_mul_pd(fscal,dx12);
1039 ty = _mm_mul_pd(fscal,dy12);
1040 tz = _mm_mul_pd(fscal,dz12);
1042 /* Update vectorial force */
1043 fix1 = _mm_add_pd(fix1,tx);
1044 fiy1 = _mm_add_pd(fiy1,ty);
1045 fiz1 = _mm_add_pd(fiz1,tz);
1047 fjx2 = _mm_add_pd(fjx2,tx);
1048 fjy2 = _mm_add_pd(fjy2,ty);
1049 fjz2 = _mm_add_pd(fjz2,tz);
1051 /**************************
1052 * CALCULATE INTERACTIONS *
1053 **************************/
1055 r20 = _mm_mul_pd(rsq20,rinv20);
1057 /* Calculate table index by multiplying r with table scale and truncate to integer */
1058 rt = _mm_mul_pd(r20,vftabscale);
1059 vfitab = _mm_cvttpd_epi32(rt);
1060 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1061 vfitab = _mm_slli_epi32(vfitab,2);
1063 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1064 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1065 F = _mm_setzero_pd();
1066 GMX_MM_TRANSPOSE2_PD(Y,F);
1067 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1068 H = _mm_setzero_pd();
1069 GMX_MM_TRANSPOSE2_PD(G,H);
1070 Heps = _mm_mul_pd(vfeps,H);
1071 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1072 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1073 velec = _mm_mul_pd(qq20,VV);
1074 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1075 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1077 /* Update potential sum for this i atom from the interaction with this j atom. */
1078 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1079 velecsum = _mm_add_pd(velecsum,velec);
1083 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1085 /* Calculate temporary vectorial force */
1086 tx = _mm_mul_pd(fscal,dx20);
1087 ty = _mm_mul_pd(fscal,dy20);
1088 tz = _mm_mul_pd(fscal,dz20);
1090 /* Update vectorial force */
1091 fix2 = _mm_add_pd(fix2,tx);
1092 fiy2 = _mm_add_pd(fiy2,ty);
1093 fiz2 = _mm_add_pd(fiz2,tz);
1095 fjx0 = _mm_add_pd(fjx0,tx);
1096 fjy0 = _mm_add_pd(fjy0,ty);
1097 fjz0 = _mm_add_pd(fjz0,tz);
1099 /**************************
1100 * CALCULATE INTERACTIONS *
1101 **************************/
1103 r21 = _mm_mul_pd(rsq21,rinv21);
1105 /* Calculate table index by multiplying r with table scale and truncate to integer */
1106 rt = _mm_mul_pd(r21,vftabscale);
1107 vfitab = _mm_cvttpd_epi32(rt);
1108 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1109 vfitab = _mm_slli_epi32(vfitab,2);
1111 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1112 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1113 F = _mm_setzero_pd();
1114 GMX_MM_TRANSPOSE2_PD(Y,F);
1115 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1116 H = _mm_setzero_pd();
1117 GMX_MM_TRANSPOSE2_PD(G,H);
1118 Heps = _mm_mul_pd(vfeps,H);
1119 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1120 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1121 velec = _mm_mul_pd(qq21,VV);
1122 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1123 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1125 /* Update potential sum for this i atom from the interaction with this j atom. */
1126 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1127 velecsum = _mm_add_pd(velecsum,velec);
1131 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1133 /* Calculate temporary vectorial force */
1134 tx = _mm_mul_pd(fscal,dx21);
1135 ty = _mm_mul_pd(fscal,dy21);
1136 tz = _mm_mul_pd(fscal,dz21);
1138 /* Update vectorial force */
1139 fix2 = _mm_add_pd(fix2,tx);
1140 fiy2 = _mm_add_pd(fiy2,ty);
1141 fiz2 = _mm_add_pd(fiz2,tz);
1143 fjx1 = _mm_add_pd(fjx1,tx);
1144 fjy1 = _mm_add_pd(fjy1,ty);
1145 fjz1 = _mm_add_pd(fjz1,tz);
1147 /**************************
1148 * CALCULATE INTERACTIONS *
1149 **************************/
1151 r22 = _mm_mul_pd(rsq22,rinv22);
1153 /* Calculate table index by multiplying r with table scale and truncate to integer */
1154 rt = _mm_mul_pd(r22,vftabscale);
1155 vfitab = _mm_cvttpd_epi32(rt);
1156 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1157 vfitab = _mm_slli_epi32(vfitab,2);
1159 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1160 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1161 F = _mm_setzero_pd();
1162 GMX_MM_TRANSPOSE2_PD(Y,F);
1163 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1164 H = _mm_setzero_pd();
1165 GMX_MM_TRANSPOSE2_PD(G,H);
1166 Heps = _mm_mul_pd(vfeps,H);
1167 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1168 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1169 velec = _mm_mul_pd(qq22,VV);
1170 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1171 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1173 /* Update potential sum for this i atom from the interaction with this j atom. */
1174 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1175 velecsum = _mm_add_pd(velecsum,velec);
1179 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1181 /* Calculate temporary vectorial force */
1182 tx = _mm_mul_pd(fscal,dx22);
1183 ty = _mm_mul_pd(fscal,dy22);
1184 tz = _mm_mul_pd(fscal,dz22);
1186 /* Update vectorial force */
1187 fix2 = _mm_add_pd(fix2,tx);
1188 fiy2 = _mm_add_pd(fiy2,ty);
1189 fiz2 = _mm_add_pd(fiz2,tz);
1191 fjx2 = _mm_add_pd(fjx2,tx);
1192 fjy2 = _mm_add_pd(fjy2,ty);
1193 fjz2 = _mm_add_pd(fjz2,tz);
1195 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1197 /* Inner loop uses 400 flops */
1200 /* End of innermost loop */
1202 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1203 f+i_coord_offset,fshift+i_shift_offset);
1206 /* Update potential energies */
1207 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1208 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1210 /* Increment number of inner iterations */
1211 inneriter += j_index_end - j_index_start;
1213 /* Outer loop uses 20 flops */
1216 /* Increment number of outer iterations */
1219 /* Update outer/inner flops */
1221 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*400);
1224 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double
1225 * Electrostatics interaction: CubicSplineTable
1226 * VdW interaction: LennardJones
1227 * Geometry: Water3-Water3
1228 * Calculate force/pot: Force
1231 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double
1232 (t_nblist * gmx_restrict nlist,
1233 rvec * gmx_restrict xx,
1234 rvec * gmx_restrict ff,
1235 t_forcerec * gmx_restrict fr,
1236 t_mdatoms * gmx_restrict mdatoms,
1237 nb_kernel_data_t * gmx_restrict kernel_data,
1238 t_nrnb * gmx_restrict nrnb)
1240 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1241 * just 0 for non-waters.
1242 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1243 * jnr indices corresponding to data put in the four positions in the SIMD register.
1245 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1246 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1248 int j_coord_offsetA,j_coord_offsetB;
1249 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1250 real rcutoff_scalar;
1251 real *shiftvec,*fshift,*x,*f;
1252 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1254 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1256 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1258 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1259 int vdwjidx0A,vdwjidx0B;
1260 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1261 int vdwjidx1A,vdwjidx1B;
1262 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1263 int vdwjidx2A,vdwjidx2B;
1264 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1265 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1266 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1267 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1268 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1269 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1270 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1271 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1272 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1273 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1274 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1277 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1280 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1281 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1283 __m128i ifour = _mm_set1_epi32(4);
1284 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1286 __m128d dummy_mask,cutoff_mask;
1287 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1288 __m128d one = _mm_set1_pd(1.0);
1289 __m128d two = _mm_set1_pd(2.0);
1295 jindex = nlist->jindex;
1297 shiftidx = nlist->shift;
1299 shiftvec = fr->shift_vec[0];
1300 fshift = fr->fshift[0];
1301 facel = _mm_set1_pd(fr->epsfac);
1302 charge = mdatoms->chargeA;
1303 nvdwtype = fr->ntype;
1304 vdwparam = fr->nbfp;
1305 vdwtype = mdatoms->typeA;
1307 vftab = kernel_data->table_elec->data;
1308 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1310 /* Setup water-specific parameters */
1311 inr = nlist->iinr[0];
1312 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1313 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1314 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1315 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1317 jq0 = _mm_set1_pd(charge[inr+0]);
1318 jq1 = _mm_set1_pd(charge[inr+1]);
1319 jq2 = _mm_set1_pd(charge[inr+2]);
1320 vdwjidx0A = 2*vdwtype[inr+0];
1321 qq00 = _mm_mul_pd(iq0,jq0);
1322 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1323 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1324 qq01 = _mm_mul_pd(iq0,jq1);
1325 qq02 = _mm_mul_pd(iq0,jq2);
1326 qq10 = _mm_mul_pd(iq1,jq0);
1327 qq11 = _mm_mul_pd(iq1,jq1);
1328 qq12 = _mm_mul_pd(iq1,jq2);
1329 qq20 = _mm_mul_pd(iq2,jq0);
1330 qq21 = _mm_mul_pd(iq2,jq1);
1331 qq22 = _mm_mul_pd(iq2,jq2);
1333 /* Avoid stupid compiler warnings */
1335 j_coord_offsetA = 0;
1336 j_coord_offsetB = 0;
1341 /* Start outer loop over neighborlists */
1342 for(iidx=0; iidx<nri; iidx++)
1344 /* Load shift vector for this list */
1345 i_shift_offset = DIM*shiftidx[iidx];
1347 /* Load limits for loop over neighbors */
1348 j_index_start = jindex[iidx];
1349 j_index_end = jindex[iidx+1];
1351 /* Get outer coordinate index */
1353 i_coord_offset = DIM*inr;
1355 /* Load i particle coords and add shift vector */
1356 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1357 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1359 fix0 = _mm_setzero_pd();
1360 fiy0 = _mm_setzero_pd();
1361 fiz0 = _mm_setzero_pd();
1362 fix1 = _mm_setzero_pd();
1363 fiy1 = _mm_setzero_pd();
1364 fiz1 = _mm_setzero_pd();
1365 fix2 = _mm_setzero_pd();
1366 fiy2 = _mm_setzero_pd();
1367 fiz2 = _mm_setzero_pd();
1369 /* Start inner kernel loop */
1370 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1373 /* Get j neighbor index, and coordinate index */
1375 jnrB = jjnr[jidx+1];
1376 j_coord_offsetA = DIM*jnrA;
1377 j_coord_offsetB = DIM*jnrB;
1379 /* load j atom coordinates */
1380 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1381 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1383 /* Calculate displacement vector */
1384 dx00 = _mm_sub_pd(ix0,jx0);
1385 dy00 = _mm_sub_pd(iy0,jy0);
1386 dz00 = _mm_sub_pd(iz0,jz0);
1387 dx01 = _mm_sub_pd(ix0,jx1);
1388 dy01 = _mm_sub_pd(iy0,jy1);
1389 dz01 = _mm_sub_pd(iz0,jz1);
1390 dx02 = _mm_sub_pd(ix0,jx2);
1391 dy02 = _mm_sub_pd(iy0,jy2);
1392 dz02 = _mm_sub_pd(iz0,jz2);
1393 dx10 = _mm_sub_pd(ix1,jx0);
1394 dy10 = _mm_sub_pd(iy1,jy0);
1395 dz10 = _mm_sub_pd(iz1,jz0);
1396 dx11 = _mm_sub_pd(ix1,jx1);
1397 dy11 = _mm_sub_pd(iy1,jy1);
1398 dz11 = _mm_sub_pd(iz1,jz1);
1399 dx12 = _mm_sub_pd(ix1,jx2);
1400 dy12 = _mm_sub_pd(iy1,jy2);
1401 dz12 = _mm_sub_pd(iz1,jz2);
1402 dx20 = _mm_sub_pd(ix2,jx0);
1403 dy20 = _mm_sub_pd(iy2,jy0);
1404 dz20 = _mm_sub_pd(iz2,jz0);
1405 dx21 = _mm_sub_pd(ix2,jx1);
1406 dy21 = _mm_sub_pd(iy2,jy1);
1407 dz21 = _mm_sub_pd(iz2,jz1);
1408 dx22 = _mm_sub_pd(ix2,jx2);
1409 dy22 = _mm_sub_pd(iy2,jy2);
1410 dz22 = _mm_sub_pd(iz2,jz2);
1412 /* Calculate squared distance and things based on it */
1413 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1414 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1415 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1416 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1417 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1418 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1419 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1420 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1421 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1423 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1424 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1425 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1426 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1427 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1428 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1429 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1430 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1431 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1433 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1435 fjx0 = _mm_setzero_pd();
1436 fjy0 = _mm_setzero_pd();
1437 fjz0 = _mm_setzero_pd();
1438 fjx1 = _mm_setzero_pd();
1439 fjy1 = _mm_setzero_pd();
1440 fjz1 = _mm_setzero_pd();
1441 fjx2 = _mm_setzero_pd();
1442 fjy2 = _mm_setzero_pd();
1443 fjz2 = _mm_setzero_pd();
1445 /**************************
1446 * CALCULATE INTERACTIONS *
1447 **************************/
1449 r00 = _mm_mul_pd(rsq00,rinv00);
1451 /* Calculate table index by multiplying r with table scale and truncate to integer */
1452 rt = _mm_mul_pd(r00,vftabscale);
1453 vfitab = _mm_cvttpd_epi32(rt);
1454 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1455 vfitab = _mm_slli_epi32(vfitab,2);
1457 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1458 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1459 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1460 GMX_MM_TRANSPOSE2_PD(Y,F);
1461 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1462 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1463 GMX_MM_TRANSPOSE2_PD(G,H);
1464 Heps = _mm_mul_pd(vfeps,H);
1465 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1466 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1467 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1469 /* LENNARD-JONES DISPERSION/REPULSION */
1471 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1472 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1474 fscal = _mm_add_pd(felec,fvdw);
1476 /* Calculate temporary vectorial force */
1477 tx = _mm_mul_pd(fscal,dx00);
1478 ty = _mm_mul_pd(fscal,dy00);
1479 tz = _mm_mul_pd(fscal,dz00);
1481 /* Update vectorial force */
1482 fix0 = _mm_add_pd(fix0,tx);
1483 fiy0 = _mm_add_pd(fiy0,ty);
1484 fiz0 = _mm_add_pd(fiz0,tz);
1486 fjx0 = _mm_add_pd(fjx0,tx);
1487 fjy0 = _mm_add_pd(fjy0,ty);
1488 fjz0 = _mm_add_pd(fjz0,tz);
1490 /**************************
1491 * CALCULATE INTERACTIONS *
1492 **************************/
1494 r01 = _mm_mul_pd(rsq01,rinv01);
1496 /* Calculate table index by multiplying r with table scale and truncate to integer */
1497 rt = _mm_mul_pd(r01,vftabscale);
1498 vfitab = _mm_cvttpd_epi32(rt);
1499 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1500 vfitab = _mm_slli_epi32(vfitab,2);
1502 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1503 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1504 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1505 GMX_MM_TRANSPOSE2_PD(Y,F);
1506 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1507 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1508 GMX_MM_TRANSPOSE2_PD(G,H);
1509 Heps = _mm_mul_pd(vfeps,H);
1510 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1511 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1512 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1516 /* Calculate temporary vectorial force */
1517 tx = _mm_mul_pd(fscal,dx01);
1518 ty = _mm_mul_pd(fscal,dy01);
1519 tz = _mm_mul_pd(fscal,dz01);
1521 /* Update vectorial force */
1522 fix0 = _mm_add_pd(fix0,tx);
1523 fiy0 = _mm_add_pd(fiy0,ty);
1524 fiz0 = _mm_add_pd(fiz0,tz);
1526 fjx1 = _mm_add_pd(fjx1,tx);
1527 fjy1 = _mm_add_pd(fjy1,ty);
1528 fjz1 = _mm_add_pd(fjz1,tz);
1530 /**************************
1531 * CALCULATE INTERACTIONS *
1532 **************************/
1534 r02 = _mm_mul_pd(rsq02,rinv02);
1536 /* Calculate table index by multiplying r with table scale and truncate to integer */
1537 rt = _mm_mul_pd(r02,vftabscale);
1538 vfitab = _mm_cvttpd_epi32(rt);
1539 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1540 vfitab = _mm_slli_epi32(vfitab,2);
1542 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1543 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1544 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1545 GMX_MM_TRANSPOSE2_PD(Y,F);
1546 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1547 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1548 GMX_MM_TRANSPOSE2_PD(G,H);
1549 Heps = _mm_mul_pd(vfeps,H);
1550 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1551 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1552 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1556 /* Calculate temporary vectorial force */
1557 tx = _mm_mul_pd(fscal,dx02);
1558 ty = _mm_mul_pd(fscal,dy02);
1559 tz = _mm_mul_pd(fscal,dz02);
1561 /* Update vectorial force */
1562 fix0 = _mm_add_pd(fix0,tx);
1563 fiy0 = _mm_add_pd(fiy0,ty);
1564 fiz0 = _mm_add_pd(fiz0,tz);
1566 fjx2 = _mm_add_pd(fjx2,tx);
1567 fjy2 = _mm_add_pd(fjy2,ty);
1568 fjz2 = _mm_add_pd(fjz2,tz);
1570 /**************************
1571 * CALCULATE INTERACTIONS *
1572 **************************/
1574 r10 = _mm_mul_pd(rsq10,rinv10);
1576 /* Calculate table index by multiplying r with table scale and truncate to integer */
1577 rt = _mm_mul_pd(r10,vftabscale);
1578 vfitab = _mm_cvttpd_epi32(rt);
1579 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1580 vfitab = _mm_slli_epi32(vfitab,2);
1582 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1583 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1584 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1585 GMX_MM_TRANSPOSE2_PD(Y,F);
1586 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1587 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1588 GMX_MM_TRANSPOSE2_PD(G,H);
1589 Heps = _mm_mul_pd(vfeps,H);
1590 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1591 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1592 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1596 /* Calculate temporary vectorial force */
1597 tx = _mm_mul_pd(fscal,dx10);
1598 ty = _mm_mul_pd(fscal,dy10);
1599 tz = _mm_mul_pd(fscal,dz10);
1601 /* Update vectorial force */
1602 fix1 = _mm_add_pd(fix1,tx);
1603 fiy1 = _mm_add_pd(fiy1,ty);
1604 fiz1 = _mm_add_pd(fiz1,tz);
1606 fjx0 = _mm_add_pd(fjx0,tx);
1607 fjy0 = _mm_add_pd(fjy0,ty);
1608 fjz0 = _mm_add_pd(fjz0,tz);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 r11 = _mm_mul_pd(rsq11,rinv11);
1616 /* Calculate table index by multiplying r with table scale and truncate to integer */
1617 rt = _mm_mul_pd(r11,vftabscale);
1618 vfitab = _mm_cvttpd_epi32(rt);
1619 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1620 vfitab = _mm_slli_epi32(vfitab,2);
1622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1623 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1624 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1625 GMX_MM_TRANSPOSE2_PD(Y,F);
1626 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1627 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1628 GMX_MM_TRANSPOSE2_PD(G,H);
1629 Heps = _mm_mul_pd(vfeps,H);
1630 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1631 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1632 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1636 /* Calculate temporary vectorial force */
1637 tx = _mm_mul_pd(fscal,dx11);
1638 ty = _mm_mul_pd(fscal,dy11);
1639 tz = _mm_mul_pd(fscal,dz11);
1641 /* Update vectorial force */
1642 fix1 = _mm_add_pd(fix1,tx);
1643 fiy1 = _mm_add_pd(fiy1,ty);
1644 fiz1 = _mm_add_pd(fiz1,tz);
1646 fjx1 = _mm_add_pd(fjx1,tx);
1647 fjy1 = _mm_add_pd(fjy1,ty);
1648 fjz1 = _mm_add_pd(fjz1,tz);
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1654 r12 = _mm_mul_pd(rsq12,rinv12);
1656 /* Calculate table index by multiplying r with table scale and truncate to integer */
1657 rt = _mm_mul_pd(r12,vftabscale);
1658 vfitab = _mm_cvttpd_epi32(rt);
1659 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1660 vfitab = _mm_slli_epi32(vfitab,2);
1662 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1663 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1664 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1665 GMX_MM_TRANSPOSE2_PD(Y,F);
1666 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1667 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1668 GMX_MM_TRANSPOSE2_PD(G,H);
1669 Heps = _mm_mul_pd(vfeps,H);
1670 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1671 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1672 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1676 /* Calculate temporary vectorial force */
1677 tx = _mm_mul_pd(fscal,dx12);
1678 ty = _mm_mul_pd(fscal,dy12);
1679 tz = _mm_mul_pd(fscal,dz12);
1681 /* Update vectorial force */
1682 fix1 = _mm_add_pd(fix1,tx);
1683 fiy1 = _mm_add_pd(fiy1,ty);
1684 fiz1 = _mm_add_pd(fiz1,tz);
1686 fjx2 = _mm_add_pd(fjx2,tx);
1687 fjy2 = _mm_add_pd(fjy2,ty);
1688 fjz2 = _mm_add_pd(fjz2,tz);
1690 /**************************
1691 * CALCULATE INTERACTIONS *
1692 **************************/
1694 r20 = _mm_mul_pd(rsq20,rinv20);
1696 /* Calculate table index by multiplying r with table scale and truncate to integer */
1697 rt = _mm_mul_pd(r20,vftabscale);
1698 vfitab = _mm_cvttpd_epi32(rt);
1699 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1700 vfitab = _mm_slli_epi32(vfitab,2);
1702 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1703 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1704 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1705 GMX_MM_TRANSPOSE2_PD(Y,F);
1706 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1707 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1708 GMX_MM_TRANSPOSE2_PD(G,H);
1709 Heps = _mm_mul_pd(vfeps,H);
1710 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1711 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1712 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1716 /* Calculate temporary vectorial force */
1717 tx = _mm_mul_pd(fscal,dx20);
1718 ty = _mm_mul_pd(fscal,dy20);
1719 tz = _mm_mul_pd(fscal,dz20);
1721 /* Update vectorial force */
1722 fix2 = _mm_add_pd(fix2,tx);
1723 fiy2 = _mm_add_pd(fiy2,ty);
1724 fiz2 = _mm_add_pd(fiz2,tz);
1726 fjx0 = _mm_add_pd(fjx0,tx);
1727 fjy0 = _mm_add_pd(fjy0,ty);
1728 fjz0 = _mm_add_pd(fjz0,tz);
1730 /**************************
1731 * CALCULATE INTERACTIONS *
1732 **************************/
1734 r21 = _mm_mul_pd(rsq21,rinv21);
1736 /* Calculate table index by multiplying r with table scale and truncate to integer */
1737 rt = _mm_mul_pd(r21,vftabscale);
1738 vfitab = _mm_cvttpd_epi32(rt);
1739 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1740 vfitab = _mm_slli_epi32(vfitab,2);
1742 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1743 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1744 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1745 GMX_MM_TRANSPOSE2_PD(Y,F);
1746 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1747 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1748 GMX_MM_TRANSPOSE2_PD(G,H);
1749 Heps = _mm_mul_pd(vfeps,H);
1750 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1751 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1752 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1756 /* Calculate temporary vectorial force */
1757 tx = _mm_mul_pd(fscal,dx21);
1758 ty = _mm_mul_pd(fscal,dy21);
1759 tz = _mm_mul_pd(fscal,dz21);
1761 /* Update vectorial force */
1762 fix2 = _mm_add_pd(fix2,tx);
1763 fiy2 = _mm_add_pd(fiy2,ty);
1764 fiz2 = _mm_add_pd(fiz2,tz);
1766 fjx1 = _mm_add_pd(fjx1,tx);
1767 fjy1 = _mm_add_pd(fjy1,ty);
1768 fjz1 = _mm_add_pd(fjz1,tz);
1770 /**************************
1771 * CALCULATE INTERACTIONS *
1772 **************************/
1774 r22 = _mm_mul_pd(rsq22,rinv22);
1776 /* Calculate table index by multiplying r with table scale and truncate to integer */
1777 rt = _mm_mul_pd(r22,vftabscale);
1778 vfitab = _mm_cvttpd_epi32(rt);
1779 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1780 vfitab = _mm_slli_epi32(vfitab,2);
1782 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1783 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1784 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1785 GMX_MM_TRANSPOSE2_PD(Y,F);
1786 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1787 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1788 GMX_MM_TRANSPOSE2_PD(G,H);
1789 Heps = _mm_mul_pd(vfeps,H);
1790 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1791 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1792 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1796 /* Calculate temporary vectorial force */
1797 tx = _mm_mul_pd(fscal,dx22);
1798 ty = _mm_mul_pd(fscal,dy22);
1799 tz = _mm_mul_pd(fscal,dz22);
1801 /* Update vectorial force */
1802 fix2 = _mm_add_pd(fix2,tx);
1803 fiy2 = _mm_add_pd(fiy2,ty);
1804 fiz2 = _mm_add_pd(fiz2,tz);
1806 fjx2 = _mm_add_pd(fjx2,tx);
1807 fjy2 = _mm_add_pd(fjy2,ty);
1808 fjz2 = _mm_add_pd(fjz2,tz);
1810 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1812 /* Inner loop uses 359 flops */
1815 if(jidx<j_index_end)
1819 j_coord_offsetA = DIM*jnrA;
1821 /* load j atom coordinates */
1822 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1823 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1825 /* Calculate displacement vector */
1826 dx00 = _mm_sub_pd(ix0,jx0);
1827 dy00 = _mm_sub_pd(iy0,jy0);
1828 dz00 = _mm_sub_pd(iz0,jz0);
1829 dx01 = _mm_sub_pd(ix0,jx1);
1830 dy01 = _mm_sub_pd(iy0,jy1);
1831 dz01 = _mm_sub_pd(iz0,jz1);
1832 dx02 = _mm_sub_pd(ix0,jx2);
1833 dy02 = _mm_sub_pd(iy0,jy2);
1834 dz02 = _mm_sub_pd(iz0,jz2);
1835 dx10 = _mm_sub_pd(ix1,jx0);
1836 dy10 = _mm_sub_pd(iy1,jy0);
1837 dz10 = _mm_sub_pd(iz1,jz0);
1838 dx11 = _mm_sub_pd(ix1,jx1);
1839 dy11 = _mm_sub_pd(iy1,jy1);
1840 dz11 = _mm_sub_pd(iz1,jz1);
1841 dx12 = _mm_sub_pd(ix1,jx2);
1842 dy12 = _mm_sub_pd(iy1,jy2);
1843 dz12 = _mm_sub_pd(iz1,jz2);
1844 dx20 = _mm_sub_pd(ix2,jx0);
1845 dy20 = _mm_sub_pd(iy2,jy0);
1846 dz20 = _mm_sub_pd(iz2,jz0);
1847 dx21 = _mm_sub_pd(ix2,jx1);
1848 dy21 = _mm_sub_pd(iy2,jy1);
1849 dz21 = _mm_sub_pd(iz2,jz1);
1850 dx22 = _mm_sub_pd(ix2,jx2);
1851 dy22 = _mm_sub_pd(iy2,jy2);
1852 dz22 = _mm_sub_pd(iz2,jz2);
1854 /* Calculate squared distance and things based on it */
1855 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1856 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1857 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1858 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1859 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1860 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1861 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1862 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1863 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1865 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1866 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1867 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1868 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1869 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1870 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1871 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1872 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1873 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1875 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1877 fjx0 = _mm_setzero_pd();
1878 fjy0 = _mm_setzero_pd();
1879 fjz0 = _mm_setzero_pd();
1880 fjx1 = _mm_setzero_pd();
1881 fjy1 = _mm_setzero_pd();
1882 fjz1 = _mm_setzero_pd();
1883 fjx2 = _mm_setzero_pd();
1884 fjy2 = _mm_setzero_pd();
1885 fjz2 = _mm_setzero_pd();
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 r00 = _mm_mul_pd(rsq00,rinv00);
1893 /* Calculate table index by multiplying r with table scale and truncate to integer */
1894 rt = _mm_mul_pd(r00,vftabscale);
1895 vfitab = _mm_cvttpd_epi32(rt);
1896 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1897 vfitab = _mm_slli_epi32(vfitab,2);
1899 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1900 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1901 F = _mm_setzero_pd();
1902 GMX_MM_TRANSPOSE2_PD(Y,F);
1903 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1904 H = _mm_setzero_pd();
1905 GMX_MM_TRANSPOSE2_PD(G,H);
1906 Heps = _mm_mul_pd(vfeps,H);
1907 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1908 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1909 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1911 /* LENNARD-JONES DISPERSION/REPULSION */
1913 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1914 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1916 fscal = _mm_add_pd(felec,fvdw);
1918 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1920 /* Calculate temporary vectorial force */
1921 tx = _mm_mul_pd(fscal,dx00);
1922 ty = _mm_mul_pd(fscal,dy00);
1923 tz = _mm_mul_pd(fscal,dz00);
1925 /* Update vectorial force */
1926 fix0 = _mm_add_pd(fix0,tx);
1927 fiy0 = _mm_add_pd(fiy0,ty);
1928 fiz0 = _mm_add_pd(fiz0,tz);
1930 fjx0 = _mm_add_pd(fjx0,tx);
1931 fjy0 = _mm_add_pd(fjy0,ty);
1932 fjz0 = _mm_add_pd(fjz0,tz);
1934 /**************************
1935 * CALCULATE INTERACTIONS *
1936 **************************/
1938 r01 = _mm_mul_pd(rsq01,rinv01);
1940 /* Calculate table index by multiplying r with table scale and truncate to integer */
1941 rt = _mm_mul_pd(r01,vftabscale);
1942 vfitab = _mm_cvttpd_epi32(rt);
1943 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1944 vfitab = _mm_slli_epi32(vfitab,2);
1946 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1947 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1948 F = _mm_setzero_pd();
1949 GMX_MM_TRANSPOSE2_PD(Y,F);
1950 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1951 H = _mm_setzero_pd();
1952 GMX_MM_TRANSPOSE2_PD(G,H);
1953 Heps = _mm_mul_pd(vfeps,H);
1954 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1955 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1956 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1960 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1962 /* Calculate temporary vectorial force */
1963 tx = _mm_mul_pd(fscal,dx01);
1964 ty = _mm_mul_pd(fscal,dy01);
1965 tz = _mm_mul_pd(fscal,dz01);
1967 /* Update vectorial force */
1968 fix0 = _mm_add_pd(fix0,tx);
1969 fiy0 = _mm_add_pd(fiy0,ty);
1970 fiz0 = _mm_add_pd(fiz0,tz);
1972 fjx1 = _mm_add_pd(fjx1,tx);
1973 fjy1 = _mm_add_pd(fjy1,ty);
1974 fjz1 = _mm_add_pd(fjz1,tz);
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 r02 = _mm_mul_pd(rsq02,rinv02);
1982 /* Calculate table index by multiplying r with table scale and truncate to integer */
1983 rt = _mm_mul_pd(r02,vftabscale);
1984 vfitab = _mm_cvttpd_epi32(rt);
1985 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1986 vfitab = _mm_slli_epi32(vfitab,2);
1988 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1989 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1990 F = _mm_setzero_pd();
1991 GMX_MM_TRANSPOSE2_PD(Y,F);
1992 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1993 H = _mm_setzero_pd();
1994 GMX_MM_TRANSPOSE2_PD(G,H);
1995 Heps = _mm_mul_pd(vfeps,H);
1996 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1997 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1998 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2002 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2004 /* Calculate temporary vectorial force */
2005 tx = _mm_mul_pd(fscal,dx02);
2006 ty = _mm_mul_pd(fscal,dy02);
2007 tz = _mm_mul_pd(fscal,dz02);
2009 /* Update vectorial force */
2010 fix0 = _mm_add_pd(fix0,tx);
2011 fiy0 = _mm_add_pd(fiy0,ty);
2012 fiz0 = _mm_add_pd(fiz0,tz);
2014 fjx2 = _mm_add_pd(fjx2,tx);
2015 fjy2 = _mm_add_pd(fjy2,ty);
2016 fjz2 = _mm_add_pd(fjz2,tz);
2018 /**************************
2019 * CALCULATE INTERACTIONS *
2020 **************************/
2022 r10 = _mm_mul_pd(rsq10,rinv10);
2024 /* Calculate table index by multiplying r with table scale and truncate to integer */
2025 rt = _mm_mul_pd(r10,vftabscale);
2026 vfitab = _mm_cvttpd_epi32(rt);
2027 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2028 vfitab = _mm_slli_epi32(vfitab,2);
2030 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2031 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2032 F = _mm_setzero_pd();
2033 GMX_MM_TRANSPOSE2_PD(Y,F);
2034 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2035 H = _mm_setzero_pd();
2036 GMX_MM_TRANSPOSE2_PD(G,H);
2037 Heps = _mm_mul_pd(vfeps,H);
2038 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2039 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2040 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2044 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2046 /* Calculate temporary vectorial force */
2047 tx = _mm_mul_pd(fscal,dx10);
2048 ty = _mm_mul_pd(fscal,dy10);
2049 tz = _mm_mul_pd(fscal,dz10);
2051 /* Update vectorial force */
2052 fix1 = _mm_add_pd(fix1,tx);
2053 fiy1 = _mm_add_pd(fiy1,ty);
2054 fiz1 = _mm_add_pd(fiz1,tz);
2056 fjx0 = _mm_add_pd(fjx0,tx);
2057 fjy0 = _mm_add_pd(fjy0,ty);
2058 fjz0 = _mm_add_pd(fjz0,tz);
2060 /**************************
2061 * CALCULATE INTERACTIONS *
2062 **************************/
2064 r11 = _mm_mul_pd(rsq11,rinv11);
2066 /* Calculate table index by multiplying r with table scale and truncate to integer */
2067 rt = _mm_mul_pd(r11,vftabscale);
2068 vfitab = _mm_cvttpd_epi32(rt);
2069 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2070 vfitab = _mm_slli_epi32(vfitab,2);
2072 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2073 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2074 F = _mm_setzero_pd();
2075 GMX_MM_TRANSPOSE2_PD(Y,F);
2076 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2077 H = _mm_setzero_pd();
2078 GMX_MM_TRANSPOSE2_PD(G,H);
2079 Heps = _mm_mul_pd(vfeps,H);
2080 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2081 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2082 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2086 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2088 /* Calculate temporary vectorial force */
2089 tx = _mm_mul_pd(fscal,dx11);
2090 ty = _mm_mul_pd(fscal,dy11);
2091 tz = _mm_mul_pd(fscal,dz11);
2093 /* Update vectorial force */
2094 fix1 = _mm_add_pd(fix1,tx);
2095 fiy1 = _mm_add_pd(fiy1,ty);
2096 fiz1 = _mm_add_pd(fiz1,tz);
2098 fjx1 = _mm_add_pd(fjx1,tx);
2099 fjy1 = _mm_add_pd(fjy1,ty);
2100 fjz1 = _mm_add_pd(fjz1,tz);
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2106 r12 = _mm_mul_pd(rsq12,rinv12);
2108 /* Calculate table index by multiplying r with table scale and truncate to integer */
2109 rt = _mm_mul_pd(r12,vftabscale);
2110 vfitab = _mm_cvttpd_epi32(rt);
2111 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2112 vfitab = _mm_slli_epi32(vfitab,2);
2114 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2115 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2116 F = _mm_setzero_pd();
2117 GMX_MM_TRANSPOSE2_PD(Y,F);
2118 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2119 H = _mm_setzero_pd();
2120 GMX_MM_TRANSPOSE2_PD(G,H);
2121 Heps = _mm_mul_pd(vfeps,H);
2122 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2123 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2124 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2128 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2130 /* Calculate temporary vectorial force */
2131 tx = _mm_mul_pd(fscal,dx12);
2132 ty = _mm_mul_pd(fscal,dy12);
2133 tz = _mm_mul_pd(fscal,dz12);
2135 /* Update vectorial force */
2136 fix1 = _mm_add_pd(fix1,tx);
2137 fiy1 = _mm_add_pd(fiy1,ty);
2138 fiz1 = _mm_add_pd(fiz1,tz);
2140 fjx2 = _mm_add_pd(fjx2,tx);
2141 fjy2 = _mm_add_pd(fjy2,ty);
2142 fjz2 = _mm_add_pd(fjz2,tz);
2144 /**************************
2145 * CALCULATE INTERACTIONS *
2146 **************************/
2148 r20 = _mm_mul_pd(rsq20,rinv20);
2150 /* Calculate table index by multiplying r with table scale and truncate to integer */
2151 rt = _mm_mul_pd(r20,vftabscale);
2152 vfitab = _mm_cvttpd_epi32(rt);
2153 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2154 vfitab = _mm_slli_epi32(vfitab,2);
2156 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2157 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2158 F = _mm_setzero_pd();
2159 GMX_MM_TRANSPOSE2_PD(Y,F);
2160 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2161 H = _mm_setzero_pd();
2162 GMX_MM_TRANSPOSE2_PD(G,H);
2163 Heps = _mm_mul_pd(vfeps,H);
2164 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2165 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2166 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2170 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2172 /* Calculate temporary vectorial force */
2173 tx = _mm_mul_pd(fscal,dx20);
2174 ty = _mm_mul_pd(fscal,dy20);
2175 tz = _mm_mul_pd(fscal,dz20);
2177 /* Update vectorial force */
2178 fix2 = _mm_add_pd(fix2,tx);
2179 fiy2 = _mm_add_pd(fiy2,ty);
2180 fiz2 = _mm_add_pd(fiz2,tz);
2182 fjx0 = _mm_add_pd(fjx0,tx);
2183 fjy0 = _mm_add_pd(fjy0,ty);
2184 fjz0 = _mm_add_pd(fjz0,tz);
2186 /**************************
2187 * CALCULATE INTERACTIONS *
2188 **************************/
2190 r21 = _mm_mul_pd(rsq21,rinv21);
2192 /* Calculate table index by multiplying r with table scale and truncate to integer */
2193 rt = _mm_mul_pd(r21,vftabscale);
2194 vfitab = _mm_cvttpd_epi32(rt);
2195 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2196 vfitab = _mm_slli_epi32(vfitab,2);
2198 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2199 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2200 F = _mm_setzero_pd();
2201 GMX_MM_TRANSPOSE2_PD(Y,F);
2202 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2203 H = _mm_setzero_pd();
2204 GMX_MM_TRANSPOSE2_PD(G,H);
2205 Heps = _mm_mul_pd(vfeps,H);
2206 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2207 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2208 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2212 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2214 /* Calculate temporary vectorial force */
2215 tx = _mm_mul_pd(fscal,dx21);
2216 ty = _mm_mul_pd(fscal,dy21);
2217 tz = _mm_mul_pd(fscal,dz21);
2219 /* Update vectorial force */
2220 fix2 = _mm_add_pd(fix2,tx);
2221 fiy2 = _mm_add_pd(fiy2,ty);
2222 fiz2 = _mm_add_pd(fiz2,tz);
2224 fjx1 = _mm_add_pd(fjx1,tx);
2225 fjy1 = _mm_add_pd(fjy1,ty);
2226 fjz1 = _mm_add_pd(fjz1,tz);
2228 /**************************
2229 * CALCULATE INTERACTIONS *
2230 **************************/
2232 r22 = _mm_mul_pd(rsq22,rinv22);
2234 /* Calculate table index by multiplying r with table scale and truncate to integer */
2235 rt = _mm_mul_pd(r22,vftabscale);
2236 vfitab = _mm_cvttpd_epi32(rt);
2237 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2238 vfitab = _mm_slli_epi32(vfitab,2);
2240 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2241 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2242 F = _mm_setzero_pd();
2243 GMX_MM_TRANSPOSE2_PD(Y,F);
2244 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2245 H = _mm_setzero_pd();
2246 GMX_MM_TRANSPOSE2_PD(G,H);
2247 Heps = _mm_mul_pd(vfeps,H);
2248 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2249 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2250 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2254 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2256 /* Calculate temporary vectorial force */
2257 tx = _mm_mul_pd(fscal,dx22);
2258 ty = _mm_mul_pd(fscal,dy22);
2259 tz = _mm_mul_pd(fscal,dz22);
2261 /* Update vectorial force */
2262 fix2 = _mm_add_pd(fix2,tx);
2263 fiy2 = _mm_add_pd(fiy2,ty);
2264 fiz2 = _mm_add_pd(fiz2,tz);
2266 fjx2 = _mm_add_pd(fjx2,tx);
2267 fjy2 = _mm_add_pd(fjy2,ty);
2268 fjz2 = _mm_add_pd(fjz2,tz);
2270 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2272 /* Inner loop uses 359 flops */
2275 /* End of innermost loop */
2277 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2278 f+i_coord_offset,fshift+i_shift_offset);
2280 /* Increment number of inner iterations */
2281 inneriter += j_index_end - j_index_start;
2283 /* Outer loop uses 18 flops */
2286 /* Increment number of outer iterations */
2289 /* Update outer/inner flops */
2291 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);