2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the two positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
104 __m128d dummy_mask,cutoff_mask;
105 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106 __m128d one = _mm_set1_pd(1.0);
107 __m128d two = _mm_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 vftab = kernel_data->table_elec->data;
126 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
128 /* Setup water-specific parameters */
129 inr = nlist->iinr[0];
130 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
131 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
132 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
133 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
135 jq1 = _mm_set1_pd(charge[inr+1]);
136 jq2 = _mm_set1_pd(charge[inr+2]);
137 jq3 = _mm_set1_pd(charge[inr+3]);
138 vdwjidx0A = 2*vdwtype[inr+0];
139 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
140 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
141 qq11 = _mm_mul_pd(iq1,jq1);
142 qq12 = _mm_mul_pd(iq1,jq2);
143 qq13 = _mm_mul_pd(iq1,jq3);
144 qq21 = _mm_mul_pd(iq2,jq1);
145 qq22 = _mm_mul_pd(iq2,jq2);
146 qq23 = _mm_mul_pd(iq2,jq3);
147 qq31 = _mm_mul_pd(iq3,jq1);
148 qq32 = _mm_mul_pd(iq3,jq2);
149 qq33 = _mm_mul_pd(iq3,jq3);
151 /* Avoid stupid compiler warnings */
159 /* Start outer loop over neighborlists */
160 for(iidx=0; iidx<nri; iidx++)
162 /* Load shift vector for this list */
163 i_shift_offset = DIM*shiftidx[iidx];
165 /* Load limits for loop over neighbors */
166 j_index_start = jindex[iidx];
167 j_index_end = jindex[iidx+1];
169 /* Get outer coordinate index */
171 i_coord_offset = DIM*inr;
173 /* Load i particle coords and add shift vector */
174 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
175 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
177 fix0 = _mm_setzero_pd();
178 fiy0 = _mm_setzero_pd();
179 fiz0 = _mm_setzero_pd();
180 fix1 = _mm_setzero_pd();
181 fiy1 = _mm_setzero_pd();
182 fiz1 = _mm_setzero_pd();
183 fix2 = _mm_setzero_pd();
184 fiy2 = _mm_setzero_pd();
185 fiz2 = _mm_setzero_pd();
186 fix3 = _mm_setzero_pd();
187 fiy3 = _mm_setzero_pd();
188 fiz3 = _mm_setzero_pd();
190 /* Reset potential sums */
191 velecsum = _mm_setzero_pd();
192 vvdwsum = _mm_setzero_pd();
194 /* Start inner kernel loop */
195 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
198 /* Get j neighbor index, and coordinate index */
201 j_coord_offsetA = DIM*jnrA;
202 j_coord_offsetB = DIM*jnrB;
204 /* load j atom coordinates */
205 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
206 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
207 &jy2,&jz2,&jx3,&jy3,&jz3);
209 /* Calculate displacement vector */
210 dx00 = _mm_sub_pd(ix0,jx0);
211 dy00 = _mm_sub_pd(iy0,jy0);
212 dz00 = _mm_sub_pd(iz0,jz0);
213 dx11 = _mm_sub_pd(ix1,jx1);
214 dy11 = _mm_sub_pd(iy1,jy1);
215 dz11 = _mm_sub_pd(iz1,jz1);
216 dx12 = _mm_sub_pd(ix1,jx2);
217 dy12 = _mm_sub_pd(iy1,jy2);
218 dz12 = _mm_sub_pd(iz1,jz2);
219 dx13 = _mm_sub_pd(ix1,jx3);
220 dy13 = _mm_sub_pd(iy1,jy3);
221 dz13 = _mm_sub_pd(iz1,jz3);
222 dx21 = _mm_sub_pd(ix2,jx1);
223 dy21 = _mm_sub_pd(iy2,jy1);
224 dz21 = _mm_sub_pd(iz2,jz1);
225 dx22 = _mm_sub_pd(ix2,jx2);
226 dy22 = _mm_sub_pd(iy2,jy2);
227 dz22 = _mm_sub_pd(iz2,jz2);
228 dx23 = _mm_sub_pd(ix2,jx3);
229 dy23 = _mm_sub_pd(iy2,jy3);
230 dz23 = _mm_sub_pd(iz2,jz3);
231 dx31 = _mm_sub_pd(ix3,jx1);
232 dy31 = _mm_sub_pd(iy3,jy1);
233 dz31 = _mm_sub_pd(iz3,jz1);
234 dx32 = _mm_sub_pd(ix3,jx2);
235 dy32 = _mm_sub_pd(iy3,jy2);
236 dz32 = _mm_sub_pd(iz3,jz2);
237 dx33 = _mm_sub_pd(ix3,jx3);
238 dy33 = _mm_sub_pd(iy3,jy3);
239 dz33 = _mm_sub_pd(iz3,jz3);
241 /* Calculate squared distance and things based on it */
242 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
243 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
244 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
245 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
246 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
247 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
248 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
249 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
250 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
251 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
253 rinv11 = gmx_mm_invsqrt_pd(rsq11);
254 rinv12 = gmx_mm_invsqrt_pd(rsq12);
255 rinv13 = gmx_mm_invsqrt_pd(rsq13);
256 rinv21 = gmx_mm_invsqrt_pd(rsq21);
257 rinv22 = gmx_mm_invsqrt_pd(rsq22);
258 rinv23 = gmx_mm_invsqrt_pd(rsq23);
259 rinv31 = gmx_mm_invsqrt_pd(rsq31);
260 rinv32 = gmx_mm_invsqrt_pd(rsq32);
261 rinv33 = gmx_mm_invsqrt_pd(rsq33);
263 rinvsq00 = gmx_mm_inv_pd(rsq00);
265 fjx0 = _mm_setzero_pd();
266 fjy0 = _mm_setzero_pd();
267 fjz0 = _mm_setzero_pd();
268 fjx1 = _mm_setzero_pd();
269 fjy1 = _mm_setzero_pd();
270 fjz1 = _mm_setzero_pd();
271 fjx2 = _mm_setzero_pd();
272 fjy2 = _mm_setzero_pd();
273 fjz2 = _mm_setzero_pd();
274 fjx3 = _mm_setzero_pd();
275 fjy3 = _mm_setzero_pd();
276 fjz3 = _mm_setzero_pd();
278 /**************************
279 * CALCULATE INTERACTIONS *
280 **************************/
282 /* LENNARD-JONES DISPERSION/REPULSION */
284 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
285 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
286 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
287 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
288 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
290 /* Update potential sum for this i atom from the interaction with this j atom. */
291 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
295 /* Calculate temporary vectorial force */
296 tx = _mm_mul_pd(fscal,dx00);
297 ty = _mm_mul_pd(fscal,dy00);
298 tz = _mm_mul_pd(fscal,dz00);
300 /* Update vectorial force */
301 fix0 = _mm_add_pd(fix0,tx);
302 fiy0 = _mm_add_pd(fiy0,ty);
303 fiz0 = _mm_add_pd(fiz0,tz);
305 fjx0 = _mm_add_pd(fjx0,tx);
306 fjy0 = _mm_add_pd(fjy0,ty);
307 fjz0 = _mm_add_pd(fjz0,tz);
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 r11 = _mm_mul_pd(rsq11,rinv11);
315 /* Calculate table index by multiplying r with table scale and truncate to integer */
316 rt = _mm_mul_pd(r11,vftabscale);
317 vfitab = _mm_cvttpd_epi32(rt);
318 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
319 vfitab = _mm_slli_epi32(vfitab,2);
321 /* CUBIC SPLINE TABLE ELECTROSTATICS */
322 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
323 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
324 GMX_MM_TRANSPOSE2_PD(Y,F);
325 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
326 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
327 GMX_MM_TRANSPOSE2_PD(G,H);
328 Heps = _mm_mul_pd(vfeps,H);
329 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
330 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
331 velec = _mm_mul_pd(qq11,VV);
332 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
333 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
335 /* Update potential sum for this i atom from the interaction with this j atom. */
336 velecsum = _mm_add_pd(velecsum,velec);
340 /* Calculate temporary vectorial force */
341 tx = _mm_mul_pd(fscal,dx11);
342 ty = _mm_mul_pd(fscal,dy11);
343 tz = _mm_mul_pd(fscal,dz11);
345 /* Update vectorial force */
346 fix1 = _mm_add_pd(fix1,tx);
347 fiy1 = _mm_add_pd(fiy1,ty);
348 fiz1 = _mm_add_pd(fiz1,tz);
350 fjx1 = _mm_add_pd(fjx1,tx);
351 fjy1 = _mm_add_pd(fjy1,ty);
352 fjz1 = _mm_add_pd(fjz1,tz);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 r12 = _mm_mul_pd(rsq12,rinv12);
360 /* Calculate table index by multiplying r with table scale and truncate to integer */
361 rt = _mm_mul_pd(r12,vftabscale);
362 vfitab = _mm_cvttpd_epi32(rt);
363 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
364 vfitab = _mm_slli_epi32(vfitab,2);
366 /* CUBIC SPLINE TABLE ELECTROSTATICS */
367 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
368 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
369 GMX_MM_TRANSPOSE2_PD(Y,F);
370 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
371 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
372 GMX_MM_TRANSPOSE2_PD(G,H);
373 Heps = _mm_mul_pd(vfeps,H);
374 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
375 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
376 velec = _mm_mul_pd(qq12,VV);
377 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
378 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
380 /* Update potential sum for this i atom from the interaction with this j atom. */
381 velecsum = _mm_add_pd(velecsum,velec);
385 /* Calculate temporary vectorial force */
386 tx = _mm_mul_pd(fscal,dx12);
387 ty = _mm_mul_pd(fscal,dy12);
388 tz = _mm_mul_pd(fscal,dz12);
390 /* Update vectorial force */
391 fix1 = _mm_add_pd(fix1,tx);
392 fiy1 = _mm_add_pd(fiy1,ty);
393 fiz1 = _mm_add_pd(fiz1,tz);
395 fjx2 = _mm_add_pd(fjx2,tx);
396 fjy2 = _mm_add_pd(fjy2,ty);
397 fjz2 = _mm_add_pd(fjz2,tz);
399 /**************************
400 * CALCULATE INTERACTIONS *
401 **************************/
403 r13 = _mm_mul_pd(rsq13,rinv13);
405 /* Calculate table index by multiplying r with table scale and truncate to integer */
406 rt = _mm_mul_pd(r13,vftabscale);
407 vfitab = _mm_cvttpd_epi32(rt);
408 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
409 vfitab = _mm_slli_epi32(vfitab,2);
411 /* CUBIC SPLINE TABLE ELECTROSTATICS */
412 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
413 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
414 GMX_MM_TRANSPOSE2_PD(Y,F);
415 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
416 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
417 GMX_MM_TRANSPOSE2_PD(G,H);
418 Heps = _mm_mul_pd(vfeps,H);
419 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
420 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
421 velec = _mm_mul_pd(qq13,VV);
422 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
423 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
425 /* Update potential sum for this i atom from the interaction with this j atom. */
426 velecsum = _mm_add_pd(velecsum,velec);
430 /* Calculate temporary vectorial force */
431 tx = _mm_mul_pd(fscal,dx13);
432 ty = _mm_mul_pd(fscal,dy13);
433 tz = _mm_mul_pd(fscal,dz13);
435 /* Update vectorial force */
436 fix1 = _mm_add_pd(fix1,tx);
437 fiy1 = _mm_add_pd(fiy1,ty);
438 fiz1 = _mm_add_pd(fiz1,tz);
440 fjx3 = _mm_add_pd(fjx3,tx);
441 fjy3 = _mm_add_pd(fjy3,ty);
442 fjz3 = _mm_add_pd(fjz3,tz);
444 /**************************
445 * CALCULATE INTERACTIONS *
446 **************************/
448 r21 = _mm_mul_pd(rsq21,rinv21);
450 /* Calculate table index by multiplying r with table scale and truncate to integer */
451 rt = _mm_mul_pd(r21,vftabscale);
452 vfitab = _mm_cvttpd_epi32(rt);
453 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
454 vfitab = _mm_slli_epi32(vfitab,2);
456 /* CUBIC SPLINE TABLE ELECTROSTATICS */
457 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
458 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
459 GMX_MM_TRANSPOSE2_PD(Y,F);
460 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
461 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
462 GMX_MM_TRANSPOSE2_PD(G,H);
463 Heps = _mm_mul_pd(vfeps,H);
464 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
465 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
466 velec = _mm_mul_pd(qq21,VV);
467 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
468 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velecsum = _mm_add_pd(velecsum,velec);
475 /* Calculate temporary vectorial force */
476 tx = _mm_mul_pd(fscal,dx21);
477 ty = _mm_mul_pd(fscal,dy21);
478 tz = _mm_mul_pd(fscal,dz21);
480 /* Update vectorial force */
481 fix2 = _mm_add_pd(fix2,tx);
482 fiy2 = _mm_add_pd(fiy2,ty);
483 fiz2 = _mm_add_pd(fiz2,tz);
485 fjx1 = _mm_add_pd(fjx1,tx);
486 fjy1 = _mm_add_pd(fjy1,ty);
487 fjz1 = _mm_add_pd(fjz1,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 r22 = _mm_mul_pd(rsq22,rinv22);
495 /* Calculate table index by multiplying r with table scale and truncate to integer */
496 rt = _mm_mul_pd(r22,vftabscale);
497 vfitab = _mm_cvttpd_epi32(rt);
498 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
499 vfitab = _mm_slli_epi32(vfitab,2);
501 /* CUBIC SPLINE TABLE ELECTROSTATICS */
502 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
503 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
504 GMX_MM_TRANSPOSE2_PD(Y,F);
505 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
506 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
507 GMX_MM_TRANSPOSE2_PD(G,H);
508 Heps = _mm_mul_pd(vfeps,H);
509 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
510 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
511 velec = _mm_mul_pd(qq22,VV);
512 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
513 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
515 /* Update potential sum for this i atom from the interaction with this j atom. */
516 velecsum = _mm_add_pd(velecsum,velec);
520 /* Calculate temporary vectorial force */
521 tx = _mm_mul_pd(fscal,dx22);
522 ty = _mm_mul_pd(fscal,dy22);
523 tz = _mm_mul_pd(fscal,dz22);
525 /* Update vectorial force */
526 fix2 = _mm_add_pd(fix2,tx);
527 fiy2 = _mm_add_pd(fiy2,ty);
528 fiz2 = _mm_add_pd(fiz2,tz);
530 fjx2 = _mm_add_pd(fjx2,tx);
531 fjy2 = _mm_add_pd(fjy2,ty);
532 fjz2 = _mm_add_pd(fjz2,tz);
534 /**************************
535 * CALCULATE INTERACTIONS *
536 **************************/
538 r23 = _mm_mul_pd(rsq23,rinv23);
540 /* Calculate table index by multiplying r with table scale and truncate to integer */
541 rt = _mm_mul_pd(r23,vftabscale);
542 vfitab = _mm_cvttpd_epi32(rt);
543 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
544 vfitab = _mm_slli_epi32(vfitab,2);
546 /* CUBIC SPLINE TABLE ELECTROSTATICS */
547 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
548 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
549 GMX_MM_TRANSPOSE2_PD(Y,F);
550 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
551 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
552 GMX_MM_TRANSPOSE2_PD(G,H);
553 Heps = _mm_mul_pd(vfeps,H);
554 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
555 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
556 velec = _mm_mul_pd(qq23,VV);
557 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
558 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 velecsum = _mm_add_pd(velecsum,velec);
565 /* Calculate temporary vectorial force */
566 tx = _mm_mul_pd(fscal,dx23);
567 ty = _mm_mul_pd(fscal,dy23);
568 tz = _mm_mul_pd(fscal,dz23);
570 /* Update vectorial force */
571 fix2 = _mm_add_pd(fix2,tx);
572 fiy2 = _mm_add_pd(fiy2,ty);
573 fiz2 = _mm_add_pd(fiz2,tz);
575 fjx3 = _mm_add_pd(fjx3,tx);
576 fjy3 = _mm_add_pd(fjy3,ty);
577 fjz3 = _mm_add_pd(fjz3,tz);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 r31 = _mm_mul_pd(rsq31,rinv31);
585 /* Calculate table index by multiplying r with table scale and truncate to integer */
586 rt = _mm_mul_pd(r31,vftabscale);
587 vfitab = _mm_cvttpd_epi32(rt);
588 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
589 vfitab = _mm_slli_epi32(vfitab,2);
591 /* CUBIC SPLINE TABLE ELECTROSTATICS */
592 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
593 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
594 GMX_MM_TRANSPOSE2_PD(Y,F);
595 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
596 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
597 GMX_MM_TRANSPOSE2_PD(G,H);
598 Heps = _mm_mul_pd(vfeps,H);
599 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
600 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
601 velec = _mm_mul_pd(qq31,VV);
602 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
603 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
605 /* Update potential sum for this i atom from the interaction with this j atom. */
606 velecsum = _mm_add_pd(velecsum,velec);
610 /* Calculate temporary vectorial force */
611 tx = _mm_mul_pd(fscal,dx31);
612 ty = _mm_mul_pd(fscal,dy31);
613 tz = _mm_mul_pd(fscal,dz31);
615 /* Update vectorial force */
616 fix3 = _mm_add_pd(fix3,tx);
617 fiy3 = _mm_add_pd(fiy3,ty);
618 fiz3 = _mm_add_pd(fiz3,tz);
620 fjx1 = _mm_add_pd(fjx1,tx);
621 fjy1 = _mm_add_pd(fjy1,ty);
622 fjz1 = _mm_add_pd(fjz1,tz);
624 /**************************
625 * CALCULATE INTERACTIONS *
626 **************************/
628 r32 = _mm_mul_pd(rsq32,rinv32);
630 /* Calculate table index by multiplying r with table scale and truncate to integer */
631 rt = _mm_mul_pd(r32,vftabscale);
632 vfitab = _mm_cvttpd_epi32(rt);
633 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
634 vfitab = _mm_slli_epi32(vfitab,2);
636 /* CUBIC SPLINE TABLE ELECTROSTATICS */
637 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
638 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
639 GMX_MM_TRANSPOSE2_PD(Y,F);
640 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
641 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
642 GMX_MM_TRANSPOSE2_PD(G,H);
643 Heps = _mm_mul_pd(vfeps,H);
644 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
645 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
646 velec = _mm_mul_pd(qq32,VV);
647 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
648 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
650 /* Update potential sum for this i atom from the interaction with this j atom. */
651 velecsum = _mm_add_pd(velecsum,velec);
655 /* Calculate temporary vectorial force */
656 tx = _mm_mul_pd(fscal,dx32);
657 ty = _mm_mul_pd(fscal,dy32);
658 tz = _mm_mul_pd(fscal,dz32);
660 /* Update vectorial force */
661 fix3 = _mm_add_pd(fix3,tx);
662 fiy3 = _mm_add_pd(fiy3,ty);
663 fiz3 = _mm_add_pd(fiz3,tz);
665 fjx2 = _mm_add_pd(fjx2,tx);
666 fjy2 = _mm_add_pd(fjy2,ty);
667 fjz2 = _mm_add_pd(fjz2,tz);
669 /**************************
670 * CALCULATE INTERACTIONS *
671 **************************/
673 r33 = _mm_mul_pd(rsq33,rinv33);
675 /* Calculate table index by multiplying r with table scale and truncate to integer */
676 rt = _mm_mul_pd(r33,vftabscale);
677 vfitab = _mm_cvttpd_epi32(rt);
678 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
679 vfitab = _mm_slli_epi32(vfitab,2);
681 /* CUBIC SPLINE TABLE ELECTROSTATICS */
682 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
683 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
684 GMX_MM_TRANSPOSE2_PD(Y,F);
685 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
686 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
687 GMX_MM_TRANSPOSE2_PD(G,H);
688 Heps = _mm_mul_pd(vfeps,H);
689 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
690 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
691 velec = _mm_mul_pd(qq33,VV);
692 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
693 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
695 /* Update potential sum for this i atom from the interaction with this j atom. */
696 velecsum = _mm_add_pd(velecsum,velec);
700 /* Calculate temporary vectorial force */
701 tx = _mm_mul_pd(fscal,dx33);
702 ty = _mm_mul_pd(fscal,dy33);
703 tz = _mm_mul_pd(fscal,dz33);
705 /* Update vectorial force */
706 fix3 = _mm_add_pd(fix3,tx);
707 fiy3 = _mm_add_pd(fiy3,ty);
708 fiz3 = _mm_add_pd(fiz3,tz);
710 fjx3 = _mm_add_pd(fjx3,tx);
711 fjy3 = _mm_add_pd(fjy3,ty);
712 fjz3 = _mm_add_pd(fjz3,tz);
714 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
716 /* Inner loop uses 422 flops */
723 j_coord_offsetA = DIM*jnrA;
725 /* load j atom coordinates */
726 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
727 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
728 &jy2,&jz2,&jx3,&jy3,&jz3);
730 /* Calculate displacement vector */
731 dx00 = _mm_sub_pd(ix0,jx0);
732 dy00 = _mm_sub_pd(iy0,jy0);
733 dz00 = _mm_sub_pd(iz0,jz0);
734 dx11 = _mm_sub_pd(ix1,jx1);
735 dy11 = _mm_sub_pd(iy1,jy1);
736 dz11 = _mm_sub_pd(iz1,jz1);
737 dx12 = _mm_sub_pd(ix1,jx2);
738 dy12 = _mm_sub_pd(iy1,jy2);
739 dz12 = _mm_sub_pd(iz1,jz2);
740 dx13 = _mm_sub_pd(ix1,jx3);
741 dy13 = _mm_sub_pd(iy1,jy3);
742 dz13 = _mm_sub_pd(iz1,jz3);
743 dx21 = _mm_sub_pd(ix2,jx1);
744 dy21 = _mm_sub_pd(iy2,jy1);
745 dz21 = _mm_sub_pd(iz2,jz1);
746 dx22 = _mm_sub_pd(ix2,jx2);
747 dy22 = _mm_sub_pd(iy2,jy2);
748 dz22 = _mm_sub_pd(iz2,jz2);
749 dx23 = _mm_sub_pd(ix2,jx3);
750 dy23 = _mm_sub_pd(iy2,jy3);
751 dz23 = _mm_sub_pd(iz2,jz3);
752 dx31 = _mm_sub_pd(ix3,jx1);
753 dy31 = _mm_sub_pd(iy3,jy1);
754 dz31 = _mm_sub_pd(iz3,jz1);
755 dx32 = _mm_sub_pd(ix3,jx2);
756 dy32 = _mm_sub_pd(iy3,jy2);
757 dz32 = _mm_sub_pd(iz3,jz2);
758 dx33 = _mm_sub_pd(ix3,jx3);
759 dy33 = _mm_sub_pd(iy3,jy3);
760 dz33 = _mm_sub_pd(iz3,jz3);
762 /* Calculate squared distance and things based on it */
763 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
764 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
765 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
766 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
767 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
768 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
769 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
770 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
771 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
772 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
774 rinv11 = gmx_mm_invsqrt_pd(rsq11);
775 rinv12 = gmx_mm_invsqrt_pd(rsq12);
776 rinv13 = gmx_mm_invsqrt_pd(rsq13);
777 rinv21 = gmx_mm_invsqrt_pd(rsq21);
778 rinv22 = gmx_mm_invsqrt_pd(rsq22);
779 rinv23 = gmx_mm_invsqrt_pd(rsq23);
780 rinv31 = gmx_mm_invsqrt_pd(rsq31);
781 rinv32 = gmx_mm_invsqrt_pd(rsq32);
782 rinv33 = gmx_mm_invsqrt_pd(rsq33);
784 rinvsq00 = gmx_mm_inv_pd(rsq00);
786 fjx0 = _mm_setzero_pd();
787 fjy0 = _mm_setzero_pd();
788 fjz0 = _mm_setzero_pd();
789 fjx1 = _mm_setzero_pd();
790 fjy1 = _mm_setzero_pd();
791 fjz1 = _mm_setzero_pd();
792 fjx2 = _mm_setzero_pd();
793 fjy2 = _mm_setzero_pd();
794 fjz2 = _mm_setzero_pd();
795 fjx3 = _mm_setzero_pd();
796 fjy3 = _mm_setzero_pd();
797 fjz3 = _mm_setzero_pd();
799 /**************************
800 * CALCULATE INTERACTIONS *
801 **************************/
803 /* LENNARD-JONES DISPERSION/REPULSION */
805 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
806 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
807 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
808 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
809 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
811 /* Update potential sum for this i atom from the interaction with this j atom. */
812 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
813 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
817 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
819 /* Calculate temporary vectorial force */
820 tx = _mm_mul_pd(fscal,dx00);
821 ty = _mm_mul_pd(fscal,dy00);
822 tz = _mm_mul_pd(fscal,dz00);
824 /* Update vectorial force */
825 fix0 = _mm_add_pd(fix0,tx);
826 fiy0 = _mm_add_pd(fiy0,ty);
827 fiz0 = _mm_add_pd(fiz0,tz);
829 fjx0 = _mm_add_pd(fjx0,tx);
830 fjy0 = _mm_add_pd(fjy0,ty);
831 fjz0 = _mm_add_pd(fjz0,tz);
833 /**************************
834 * CALCULATE INTERACTIONS *
835 **************************/
837 r11 = _mm_mul_pd(rsq11,rinv11);
839 /* Calculate table index by multiplying r with table scale and truncate to integer */
840 rt = _mm_mul_pd(r11,vftabscale);
841 vfitab = _mm_cvttpd_epi32(rt);
842 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
843 vfitab = _mm_slli_epi32(vfitab,2);
845 /* CUBIC SPLINE TABLE ELECTROSTATICS */
846 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
847 F = _mm_setzero_pd();
848 GMX_MM_TRANSPOSE2_PD(Y,F);
849 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
850 H = _mm_setzero_pd();
851 GMX_MM_TRANSPOSE2_PD(G,H);
852 Heps = _mm_mul_pd(vfeps,H);
853 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
854 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
855 velec = _mm_mul_pd(qq11,VV);
856 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
857 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
859 /* Update potential sum for this i atom from the interaction with this j atom. */
860 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
861 velecsum = _mm_add_pd(velecsum,velec);
865 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
867 /* Calculate temporary vectorial force */
868 tx = _mm_mul_pd(fscal,dx11);
869 ty = _mm_mul_pd(fscal,dy11);
870 tz = _mm_mul_pd(fscal,dz11);
872 /* Update vectorial force */
873 fix1 = _mm_add_pd(fix1,tx);
874 fiy1 = _mm_add_pd(fiy1,ty);
875 fiz1 = _mm_add_pd(fiz1,tz);
877 fjx1 = _mm_add_pd(fjx1,tx);
878 fjy1 = _mm_add_pd(fjy1,ty);
879 fjz1 = _mm_add_pd(fjz1,tz);
881 /**************************
882 * CALCULATE INTERACTIONS *
883 **************************/
885 r12 = _mm_mul_pd(rsq12,rinv12);
887 /* Calculate table index by multiplying r with table scale and truncate to integer */
888 rt = _mm_mul_pd(r12,vftabscale);
889 vfitab = _mm_cvttpd_epi32(rt);
890 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
891 vfitab = _mm_slli_epi32(vfitab,2);
893 /* CUBIC SPLINE TABLE ELECTROSTATICS */
894 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
895 F = _mm_setzero_pd();
896 GMX_MM_TRANSPOSE2_PD(Y,F);
897 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
898 H = _mm_setzero_pd();
899 GMX_MM_TRANSPOSE2_PD(G,H);
900 Heps = _mm_mul_pd(vfeps,H);
901 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
902 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
903 velec = _mm_mul_pd(qq12,VV);
904 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
905 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
907 /* Update potential sum for this i atom from the interaction with this j atom. */
908 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
909 velecsum = _mm_add_pd(velecsum,velec);
913 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
915 /* Calculate temporary vectorial force */
916 tx = _mm_mul_pd(fscal,dx12);
917 ty = _mm_mul_pd(fscal,dy12);
918 tz = _mm_mul_pd(fscal,dz12);
920 /* Update vectorial force */
921 fix1 = _mm_add_pd(fix1,tx);
922 fiy1 = _mm_add_pd(fiy1,ty);
923 fiz1 = _mm_add_pd(fiz1,tz);
925 fjx2 = _mm_add_pd(fjx2,tx);
926 fjy2 = _mm_add_pd(fjy2,ty);
927 fjz2 = _mm_add_pd(fjz2,tz);
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
933 r13 = _mm_mul_pd(rsq13,rinv13);
935 /* Calculate table index by multiplying r with table scale and truncate to integer */
936 rt = _mm_mul_pd(r13,vftabscale);
937 vfitab = _mm_cvttpd_epi32(rt);
938 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
939 vfitab = _mm_slli_epi32(vfitab,2);
941 /* CUBIC SPLINE TABLE ELECTROSTATICS */
942 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
943 F = _mm_setzero_pd();
944 GMX_MM_TRANSPOSE2_PD(Y,F);
945 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
946 H = _mm_setzero_pd();
947 GMX_MM_TRANSPOSE2_PD(G,H);
948 Heps = _mm_mul_pd(vfeps,H);
949 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
950 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
951 velec = _mm_mul_pd(qq13,VV);
952 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
953 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
955 /* Update potential sum for this i atom from the interaction with this j atom. */
956 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
957 velecsum = _mm_add_pd(velecsum,velec);
961 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
963 /* Calculate temporary vectorial force */
964 tx = _mm_mul_pd(fscal,dx13);
965 ty = _mm_mul_pd(fscal,dy13);
966 tz = _mm_mul_pd(fscal,dz13);
968 /* Update vectorial force */
969 fix1 = _mm_add_pd(fix1,tx);
970 fiy1 = _mm_add_pd(fiy1,ty);
971 fiz1 = _mm_add_pd(fiz1,tz);
973 fjx3 = _mm_add_pd(fjx3,tx);
974 fjy3 = _mm_add_pd(fjy3,ty);
975 fjz3 = _mm_add_pd(fjz3,tz);
977 /**************************
978 * CALCULATE INTERACTIONS *
979 **************************/
981 r21 = _mm_mul_pd(rsq21,rinv21);
983 /* Calculate table index by multiplying r with table scale and truncate to integer */
984 rt = _mm_mul_pd(r21,vftabscale);
985 vfitab = _mm_cvttpd_epi32(rt);
986 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
987 vfitab = _mm_slli_epi32(vfitab,2);
989 /* CUBIC SPLINE TABLE ELECTROSTATICS */
990 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
991 F = _mm_setzero_pd();
992 GMX_MM_TRANSPOSE2_PD(Y,F);
993 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
994 H = _mm_setzero_pd();
995 GMX_MM_TRANSPOSE2_PD(G,H);
996 Heps = _mm_mul_pd(vfeps,H);
997 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
998 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
999 velec = _mm_mul_pd(qq21,VV);
1000 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1001 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1003 /* Update potential sum for this i atom from the interaction with this j atom. */
1004 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1005 velecsum = _mm_add_pd(velecsum,velec);
1009 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1011 /* Calculate temporary vectorial force */
1012 tx = _mm_mul_pd(fscal,dx21);
1013 ty = _mm_mul_pd(fscal,dy21);
1014 tz = _mm_mul_pd(fscal,dz21);
1016 /* Update vectorial force */
1017 fix2 = _mm_add_pd(fix2,tx);
1018 fiy2 = _mm_add_pd(fiy2,ty);
1019 fiz2 = _mm_add_pd(fiz2,tz);
1021 fjx1 = _mm_add_pd(fjx1,tx);
1022 fjy1 = _mm_add_pd(fjy1,ty);
1023 fjz1 = _mm_add_pd(fjz1,tz);
1025 /**************************
1026 * CALCULATE INTERACTIONS *
1027 **************************/
1029 r22 = _mm_mul_pd(rsq22,rinv22);
1031 /* Calculate table index by multiplying r with table scale and truncate to integer */
1032 rt = _mm_mul_pd(r22,vftabscale);
1033 vfitab = _mm_cvttpd_epi32(rt);
1034 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1035 vfitab = _mm_slli_epi32(vfitab,2);
1037 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1038 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1039 F = _mm_setzero_pd();
1040 GMX_MM_TRANSPOSE2_PD(Y,F);
1041 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1042 H = _mm_setzero_pd();
1043 GMX_MM_TRANSPOSE2_PD(G,H);
1044 Heps = _mm_mul_pd(vfeps,H);
1045 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1046 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1047 velec = _mm_mul_pd(qq22,VV);
1048 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1049 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1053 velecsum = _mm_add_pd(velecsum,velec);
1057 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1059 /* Calculate temporary vectorial force */
1060 tx = _mm_mul_pd(fscal,dx22);
1061 ty = _mm_mul_pd(fscal,dy22);
1062 tz = _mm_mul_pd(fscal,dz22);
1064 /* Update vectorial force */
1065 fix2 = _mm_add_pd(fix2,tx);
1066 fiy2 = _mm_add_pd(fiy2,ty);
1067 fiz2 = _mm_add_pd(fiz2,tz);
1069 fjx2 = _mm_add_pd(fjx2,tx);
1070 fjy2 = _mm_add_pd(fjy2,ty);
1071 fjz2 = _mm_add_pd(fjz2,tz);
1073 /**************************
1074 * CALCULATE INTERACTIONS *
1075 **************************/
1077 r23 = _mm_mul_pd(rsq23,rinv23);
1079 /* Calculate table index by multiplying r with table scale and truncate to integer */
1080 rt = _mm_mul_pd(r23,vftabscale);
1081 vfitab = _mm_cvttpd_epi32(rt);
1082 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1083 vfitab = _mm_slli_epi32(vfitab,2);
1085 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1086 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1087 F = _mm_setzero_pd();
1088 GMX_MM_TRANSPOSE2_PD(Y,F);
1089 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1090 H = _mm_setzero_pd();
1091 GMX_MM_TRANSPOSE2_PD(G,H);
1092 Heps = _mm_mul_pd(vfeps,H);
1093 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1094 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1095 velec = _mm_mul_pd(qq23,VV);
1096 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1097 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1099 /* Update potential sum for this i atom from the interaction with this j atom. */
1100 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1101 velecsum = _mm_add_pd(velecsum,velec);
1105 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1107 /* Calculate temporary vectorial force */
1108 tx = _mm_mul_pd(fscal,dx23);
1109 ty = _mm_mul_pd(fscal,dy23);
1110 tz = _mm_mul_pd(fscal,dz23);
1112 /* Update vectorial force */
1113 fix2 = _mm_add_pd(fix2,tx);
1114 fiy2 = _mm_add_pd(fiy2,ty);
1115 fiz2 = _mm_add_pd(fiz2,tz);
1117 fjx3 = _mm_add_pd(fjx3,tx);
1118 fjy3 = _mm_add_pd(fjy3,ty);
1119 fjz3 = _mm_add_pd(fjz3,tz);
1121 /**************************
1122 * CALCULATE INTERACTIONS *
1123 **************************/
1125 r31 = _mm_mul_pd(rsq31,rinv31);
1127 /* Calculate table index by multiplying r with table scale and truncate to integer */
1128 rt = _mm_mul_pd(r31,vftabscale);
1129 vfitab = _mm_cvttpd_epi32(rt);
1130 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1131 vfitab = _mm_slli_epi32(vfitab,2);
1133 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1134 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1135 F = _mm_setzero_pd();
1136 GMX_MM_TRANSPOSE2_PD(Y,F);
1137 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1138 H = _mm_setzero_pd();
1139 GMX_MM_TRANSPOSE2_PD(G,H);
1140 Heps = _mm_mul_pd(vfeps,H);
1141 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1142 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1143 velec = _mm_mul_pd(qq31,VV);
1144 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1145 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1147 /* Update potential sum for this i atom from the interaction with this j atom. */
1148 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1149 velecsum = _mm_add_pd(velecsum,velec);
1153 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1155 /* Calculate temporary vectorial force */
1156 tx = _mm_mul_pd(fscal,dx31);
1157 ty = _mm_mul_pd(fscal,dy31);
1158 tz = _mm_mul_pd(fscal,dz31);
1160 /* Update vectorial force */
1161 fix3 = _mm_add_pd(fix3,tx);
1162 fiy3 = _mm_add_pd(fiy3,ty);
1163 fiz3 = _mm_add_pd(fiz3,tz);
1165 fjx1 = _mm_add_pd(fjx1,tx);
1166 fjy1 = _mm_add_pd(fjy1,ty);
1167 fjz1 = _mm_add_pd(fjz1,tz);
1169 /**************************
1170 * CALCULATE INTERACTIONS *
1171 **************************/
1173 r32 = _mm_mul_pd(rsq32,rinv32);
1175 /* Calculate table index by multiplying r with table scale and truncate to integer */
1176 rt = _mm_mul_pd(r32,vftabscale);
1177 vfitab = _mm_cvttpd_epi32(rt);
1178 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1179 vfitab = _mm_slli_epi32(vfitab,2);
1181 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1182 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1183 F = _mm_setzero_pd();
1184 GMX_MM_TRANSPOSE2_PD(Y,F);
1185 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1186 H = _mm_setzero_pd();
1187 GMX_MM_TRANSPOSE2_PD(G,H);
1188 Heps = _mm_mul_pd(vfeps,H);
1189 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1190 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1191 velec = _mm_mul_pd(qq32,VV);
1192 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1193 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1195 /* Update potential sum for this i atom from the interaction with this j atom. */
1196 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1197 velecsum = _mm_add_pd(velecsum,velec);
1201 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1203 /* Calculate temporary vectorial force */
1204 tx = _mm_mul_pd(fscal,dx32);
1205 ty = _mm_mul_pd(fscal,dy32);
1206 tz = _mm_mul_pd(fscal,dz32);
1208 /* Update vectorial force */
1209 fix3 = _mm_add_pd(fix3,tx);
1210 fiy3 = _mm_add_pd(fiy3,ty);
1211 fiz3 = _mm_add_pd(fiz3,tz);
1213 fjx2 = _mm_add_pd(fjx2,tx);
1214 fjy2 = _mm_add_pd(fjy2,ty);
1215 fjz2 = _mm_add_pd(fjz2,tz);
1217 /**************************
1218 * CALCULATE INTERACTIONS *
1219 **************************/
1221 r33 = _mm_mul_pd(rsq33,rinv33);
1223 /* Calculate table index by multiplying r with table scale and truncate to integer */
1224 rt = _mm_mul_pd(r33,vftabscale);
1225 vfitab = _mm_cvttpd_epi32(rt);
1226 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1227 vfitab = _mm_slli_epi32(vfitab,2);
1229 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1230 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1231 F = _mm_setzero_pd();
1232 GMX_MM_TRANSPOSE2_PD(Y,F);
1233 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1234 H = _mm_setzero_pd();
1235 GMX_MM_TRANSPOSE2_PD(G,H);
1236 Heps = _mm_mul_pd(vfeps,H);
1237 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1238 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1239 velec = _mm_mul_pd(qq33,VV);
1240 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1241 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1243 /* Update potential sum for this i atom from the interaction with this j atom. */
1244 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1245 velecsum = _mm_add_pd(velecsum,velec);
1249 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1251 /* Calculate temporary vectorial force */
1252 tx = _mm_mul_pd(fscal,dx33);
1253 ty = _mm_mul_pd(fscal,dy33);
1254 tz = _mm_mul_pd(fscal,dz33);
1256 /* Update vectorial force */
1257 fix3 = _mm_add_pd(fix3,tx);
1258 fiy3 = _mm_add_pd(fiy3,ty);
1259 fiz3 = _mm_add_pd(fiz3,tz);
1261 fjx3 = _mm_add_pd(fjx3,tx);
1262 fjy3 = _mm_add_pd(fjy3,ty);
1263 fjz3 = _mm_add_pd(fjz3,tz);
1265 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1267 /* Inner loop uses 422 flops */
1270 /* End of innermost loop */
1272 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1273 f+i_coord_offset,fshift+i_shift_offset);
1276 /* Update potential energies */
1277 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1278 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1280 /* Increment number of inner iterations */
1281 inneriter += j_index_end - j_index_start;
1283 /* Outer loop uses 26 flops */
1286 /* Increment number of outer iterations */
1289 /* Update outer/inner flops */
1291 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*422);
1294 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double
1295 * Electrostatics interaction: CubicSplineTable
1296 * VdW interaction: LennardJones
1297 * Geometry: Water4-Water4
1298 * Calculate force/pot: Force
1301 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double
1302 (t_nblist * gmx_restrict nlist,
1303 rvec * gmx_restrict xx,
1304 rvec * gmx_restrict ff,
1305 t_forcerec * gmx_restrict fr,
1306 t_mdatoms * gmx_restrict mdatoms,
1307 nb_kernel_data_t * gmx_restrict kernel_data,
1308 t_nrnb * gmx_restrict nrnb)
1310 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1311 * just 0 for non-waters.
1312 * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different
1313 * jnr indices whose data are placed in the two positions of the SIMD register.
1315 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1316 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1318 int j_coord_offsetA,j_coord_offsetB;
1319 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1320 real rcutoff_scalar;
1321 real *shiftvec,*fshift,*x,*f;
1322 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1324 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1326 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1328 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1330 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1331 int vdwjidx0A,vdwjidx0B;
1332 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1333 int vdwjidx1A,vdwjidx1B;
1334 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1335 int vdwjidx2A,vdwjidx2B;
1336 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1337 int vdwjidx3A,vdwjidx3B;
1338 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1339 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1340 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1341 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1342 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1343 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1344 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1345 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1346 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1347 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1348 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1349 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1352 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1355 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1356 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1358 __m128i ifour = _mm_set1_epi32(4);
1359 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1361 __m128d dummy_mask,cutoff_mask;
1362 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1363 __m128d one = _mm_set1_pd(1.0);
1364 __m128d two = _mm_set1_pd(2.0);
1370 jindex = nlist->jindex;
1372 shiftidx = nlist->shift;
1374 shiftvec = fr->shift_vec[0];
1375 fshift = fr->fshift[0];
1376 facel = _mm_set1_pd(fr->epsfac);
1377 charge = mdatoms->chargeA;
1378 nvdwtype = fr->ntype;
1379 vdwparam = fr->nbfp;
1380 vdwtype = mdatoms->typeA;
1382 vftab = kernel_data->table_elec->data;
1383 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1385 /* Setup water-specific parameters */
1386 inr = nlist->iinr[0];
1387 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1388 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1389 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1390 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1392 jq1 = _mm_set1_pd(charge[inr+1]);
1393 jq2 = _mm_set1_pd(charge[inr+2]);
1394 jq3 = _mm_set1_pd(charge[inr+3]);
1395 vdwjidx0A = 2*vdwtype[inr+0];
1396 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1397 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1398 qq11 = _mm_mul_pd(iq1,jq1);
1399 qq12 = _mm_mul_pd(iq1,jq2);
1400 qq13 = _mm_mul_pd(iq1,jq3);
1401 qq21 = _mm_mul_pd(iq2,jq1);
1402 qq22 = _mm_mul_pd(iq2,jq2);
1403 qq23 = _mm_mul_pd(iq2,jq3);
1404 qq31 = _mm_mul_pd(iq3,jq1);
1405 qq32 = _mm_mul_pd(iq3,jq2);
1406 qq33 = _mm_mul_pd(iq3,jq3);
1408 /* Avoid stupid compiler warnings */
1410 j_coord_offsetA = 0;
1411 j_coord_offsetB = 0;
1416 /* Start outer loop over neighborlists */
1417 for(iidx=0; iidx<nri; iidx++)
1419 /* Load shift vector for this list */
1420 i_shift_offset = DIM*shiftidx[iidx];
1422 /* Load limits for loop over neighbors */
1423 j_index_start = jindex[iidx];
1424 j_index_end = jindex[iidx+1];
1426 /* Get outer coordinate index */
1428 i_coord_offset = DIM*inr;
1430 /* Load i particle coords and add shift vector */
1431 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1432 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1434 fix0 = _mm_setzero_pd();
1435 fiy0 = _mm_setzero_pd();
1436 fiz0 = _mm_setzero_pd();
1437 fix1 = _mm_setzero_pd();
1438 fiy1 = _mm_setzero_pd();
1439 fiz1 = _mm_setzero_pd();
1440 fix2 = _mm_setzero_pd();
1441 fiy2 = _mm_setzero_pd();
1442 fiz2 = _mm_setzero_pd();
1443 fix3 = _mm_setzero_pd();
1444 fiy3 = _mm_setzero_pd();
1445 fiz3 = _mm_setzero_pd();
1447 /* Start inner kernel loop */
1448 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1451 /* Get j neighbor index, and coordinate index */
1453 jnrB = jjnr[jidx+1];
1454 j_coord_offsetA = DIM*jnrA;
1455 j_coord_offsetB = DIM*jnrB;
1457 /* load j atom coordinates */
1458 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1459 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1460 &jy2,&jz2,&jx3,&jy3,&jz3);
1462 /* Calculate displacement vector */
1463 dx00 = _mm_sub_pd(ix0,jx0);
1464 dy00 = _mm_sub_pd(iy0,jy0);
1465 dz00 = _mm_sub_pd(iz0,jz0);
1466 dx11 = _mm_sub_pd(ix1,jx1);
1467 dy11 = _mm_sub_pd(iy1,jy1);
1468 dz11 = _mm_sub_pd(iz1,jz1);
1469 dx12 = _mm_sub_pd(ix1,jx2);
1470 dy12 = _mm_sub_pd(iy1,jy2);
1471 dz12 = _mm_sub_pd(iz1,jz2);
1472 dx13 = _mm_sub_pd(ix1,jx3);
1473 dy13 = _mm_sub_pd(iy1,jy3);
1474 dz13 = _mm_sub_pd(iz1,jz3);
1475 dx21 = _mm_sub_pd(ix2,jx1);
1476 dy21 = _mm_sub_pd(iy2,jy1);
1477 dz21 = _mm_sub_pd(iz2,jz1);
1478 dx22 = _mm_sub_pd(ix2,jx2);
1479 dy22 = _mm_sub_pd(iy2,jy2);
1480 dz22 = _mm_sub_pd(iz2,jz2);
1481 dx23 = _mm_sub_pd(ix2,jx3);
1482 dy23 = _mm_sub_pd(iy2,jy3);
1483 dz23 = _mm_sub_pd(iz2,jz3);
1484 dx31 = _mm_sub_pd(ix3,jx1);
1485 dy31 = _mm_sub_pd(iy3,jy1);
1486 dz31 = _mm_sub_pd(iz3,jz1);
1487 dx32 = _mm_sub_pd(ix3,jx2);
1488 dy32 = _mm_sub_pd(iy3,jy2);
1489 dz32 = _mm_sub_pd(iz3,jz2);
1490 dx33 = _mm_sub_pd(ix3,jx3);
1491 dy33 = _mm_sub_pd(iy3,jy3);
1492 dz33 = _mm_sub_pd(iz3,jz3);
1494 /* Calculate squared distance and things based on it */
1495 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1496 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1497 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1498 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1499 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1500 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1501 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1502 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1503 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1504 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1506 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1507 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1508 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1509 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1510 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1511 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1512 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1513 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1514 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1516 rinvsq00 = gmx_mm_inv_pd(rsq00);
1518 fjx0 = _mm_setzero_pd();
1519 fjy0 = _mm_setzero_pd();
1520 fjz0 = _mm_setzero_pd();
1521 fjx1 = _mm_setzero_pd();
1522 fjy1 = _mm_setzero_pd();
1523 fjz1 = _mm_setzero_pd();
1524 fjx2 = _mm_setzero_pd();
1525 fjy2 = _mm_setzero_pd();
1526 fjz2 = _mm_setzero_pd();
1527 fjx3 = _mm_setzero_pd();
1528 fjy3 = _mm_setzero_pd();
1529 fjz3 = _mm_setzero_pd();
1531 /**************************
1532 * CALCULATE INTERACTIONS *
1533 **************************/
1535 /* LENNARD-JONES DISPERSION/REPULSION */
1537 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1538 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1542 /* Calculate temporary vectorial force */
1543 tx = _mm_mul_pd(fscal,dx00);
1544 ty = _mm_mul_pd(fscal,dy00);
1545 tz = _mm_mul_pd(fscal,dz00);
1547 /* Update vectorial force */
1548 fix0 = _mm_add_pd(fix0,tx);
1549 fiy0 = _mm_add_pd(fiy0,ty);
1550 fiz0 = _mm_add_pd(fiz0,tz);
1552 fjx0 = _mm_add_pd(fjx0,tx);
1553 fjy0 = _mm_add_pd(fjy0,ty);
1554 fjz0 = _mm_add_pd(fjz0,tz);
1556 /**************************
1557 * CALCULATE INTERACTIONS *
1558 **************************/
1560 r11 = _mm_mul_pd(rsq11,rinv11);
1562 /* Calculate table index by multiplying r with table scale and truncate to integer */
1563 rt = _mm_mul_pd(r11,vftabscale);
1564 vfitab = _mm_cvttpd_epi32(rt);
1565 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1566 vfitab = _mm_slli_epi32(vfitab,2);
1568 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1569 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1570 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1571 GMX_MM_TRANSPOSE2_PD(Y,F);
1572 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1573 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1574 GMX_MM_TRANSPOSE2_PD(G,H);
1575 Heps = _mm_mul_pd(vfeps,H);
1576 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1577 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1578 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1582 /* Calculate temporary vectorial force */
1583 tx = _mm_mul_pd(fscal,dx11);
1584 ty = _mm_mul_pd(fscal,dy11);
1585 tz = _mm_mul_pd(fscal,dz11);
1587 /* Update vectorial force */
1588 fix1 = _mm_add_pd(fix1,tx);
1589 fiy1 = _mm_add_pd(fiy1,ty);
1590 fiz1 = _mm_add_pd(fiz1,tz);
1592 fjx1 = _mm_add_pd(fjx1,tx);
1593 fjy1 = _mm_add_pd(fjy1,ty);
1594 fjz1 = _mm_add_pd(fjz1,tz);
1596 /**************************
1597 * CALCULATE INTERACTIONS *
1598 **************************/
1600 r12 = _mm_mul_pd(rsq12,rinv12);
1602 /* Calculate table index by multiplying r with table scale and truncate to integer */
1603 rt = _mm_mul_pd(r12,vftabscale);
1604 vfitab = _mm_cvttpd_epi32(rt);
1605 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1606 vfitab = _mm_slli_epi32(vfitab,2);
1608 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1609 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1610 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1611 GMX_MM_TRANSPOSE2_PD(Y,F);
1612 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1613 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1614 GMX_MM_TRANSPOSE2_PD(G,H);
1615 Heps = _mm_mul_pd(vfeps,H);
1616 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1617 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1618 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1622 /* Calculate temporary vectorial force */
1623 tx = _mm_mul_pd(fscal,dx12);
1624 ty = _mm_mul_pd(fscal,dy12);
1625 tz = _mm_mul_pd(fscal,dz12);
1627 /* Update vectorial force */
1628 fix1 = _mm_add_pd(fix1,tx);
1629 fiy1 = _mm_add_pd(fiy1,ty);
1630 fiz1 = _mm_add_pd(fiz1,tz);
1632 fjx2 = _mm_add_pd(fjx2,tx);
1633 fjy2 = _mm_add_pd(fjy2,ty);
1634 fjz2 = _mm_add_pd(fjz2,tz);
1636 /**************************
1637 * CALCULATE INTERACTIONS *
1638 **************************/
1640 r13 = _mm_mul_pd(rsq13,rinv13);
1642 /* Calculate table index by multiplying r with table scale and truncate to integer */
1643 rt = _mm_mul_pd(r13,vftabscale);
1644 vfitab = _mm_cvttpd_epi32(rt);
1645 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1646 vfitab = _mm_slli_epi32(vfitab,2);
1648 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1649 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1650 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1651 GMX_MM_TRANSPOSE2_PD(Y,F);
1652 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1653 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1654 GMX_MM_TRANSPOSE2_PD(G,H);
1655 Heps = _mm_mul_pd(vfeps,H);
1656 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1657 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1658 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1662 /* Calculate temporary vectorial force */
1663 tx = _mm_mul_pd(fscal,dx13);
1664 ty = _mm_mul_pd(fscal,dy13);
1665 tz = _mm_mul_pd(fscal,dz13);
1667 /* Update vectorial force */
1668 fix1 = _mm_add_pd(fix1,tx);
1669 fiy1 = _mm_add_pd(fiy1,ty);
1670 fiz1 = _mm_add_pd(fiz1,tz);
1672 fjx3 = _mm_add_pd(fjx3,tx);
1673 fjy3 = _mm_add_pd(fjy3,ty);
1674 fjz3 = _mm_add_pd(fjz3,tz);
1676 /**************************
1677 * CALCULATE INTERACTIONS *
1678 **************************/
1680 r21 = _mm_mul_pd(rsq21,rinv21);
1682 /* Calculate table index by multiplying r with table scale and truncate to integer */
1683 rt = _mm_mul_pd(r21,vftabscale);
1684 vfitab = _mm_cvttpd_epi32(rt);
1685 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1686 vfitab = _mm_slli_epi32(vfitab,2);
1688 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1689 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1690 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1691 GMX_MM_TRANSPOSE2_PD(Y,F);
1692 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1693 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1694 GMX_MM_TRANSPOSE2_PD(G,H);
1695 Heps = _mm_mul_pd(vfeps,H);
1696 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1697 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1698 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1702 /* Calculate temporary vectorial force */
1703 tx = _mm_mul_pd(fscal,dx21);
1704 ty = _mm_mul_pd(fscal,dy21);
1705 tz = _mm_mul_pd(fscal,dz21);
1707 /* Update vectorial force */
1708 fix2 = _mm_add_pd(fix2,tx);
1709 fiy2 = _mm_add_pd(fiy2,ty);
1710 fiz2 = _mm_add_pd(fiz2,tz);
1712 fjx1 = _mm_add_pd(fjx1,tx);
1713 fjy1 = _mm_add_pd(fjy1,ty);
1714 fjz1 = _mm_add_pd(fjz1,tz);
1716 /**************************
1717 * CALCULATE INTERACTIONS *
1718 **************************/
1720 r22 = _mm_mul_pd(rsq22,rinv22);
1722 /* Calculate table index by multiplying r with table scale and truncate to integer */
1723 rt = _mm_mul_pd(r22,vftabscale);
1724 vfitab = _mm_cvttpd_epi32(rt);
1725 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1726 vfitab = _mm_slli_epi32(vfitab,2);
1728 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1729 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1730 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1731 GMX_MM_TRANSPOSE2_PD(Y,F);
1732 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1733 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1734 GMX_MM_TRANSPOSE2_PD(G,H);
1735 Heps = _mm_mul_pd(vfeps,H);
1736 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1737 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1738 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1742 /* Calculate temporary vectorial force */
1743 tx = _mm_mul_pd(fscal,dx22);
1744 ty = _mm_mul_pd(fscal,dy22);
1745 tz = _mm_mul_pd(fscal,dz22);
1747 /* Update vectorial force */
1748 fix2 = _mm_add_pd(fix2,tx);
1749 fiy2 = _mm_add_pd(fiy2,ty);
1750 fiz2 = _mm_add_pd(fiz2,tz);
1752 fjx2 = _mm_add_pd(fjx2,tx);
1753 fjy2 = _mm_add_pd(fjy2,ty);
1754 fjz2 = _mm_add_pd(fjz2,tz);
1756 /**************************
1757 * CALCULATE INTERACTIONS *
1758 **************************/
1760 r23 = _mm_mul_pd(rsq23,rinv23);
1762 /* Calculate table index by multiplying r with table scale and truncate to integer */
1763 rt = _mm_mul_pd(r23,vftabscale);
1764 vfitab = _mm_cvttpd_epi32(rt);
1765 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1766 vfitab = _mm_slli_epi32(vfitab,2);
1768 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1769 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1770 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1771 GMX_MM_TRANSPOSE2_PD(Y,F);
1772 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1773 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1774 GMX_MM_TRANSPOSE2_PD(G,H);
1775 Heps = _mm_mul_pd(vfeps,H);
1776 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1777 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1778 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1782 /* Calculate temporary vectorial force */
1783 tx = _mm_mul_pd(fscal,dx23);
1784 ty = _mm_mul_pd(fscal,dy23);
1785 tz = _mm_mul_pd(fscal,dz23);
1787 /* Update vectorial force */
1788 fix2 = _mm_add_pd(fix2,tx);
1789 fiy2 = _mm_add_pd(fiy2,ty);
1790 fiz2 = _mm_add_pd(fiz2,tz);
1792 fjx3 = _mm_add_pd(fjx3,tx);
1793 fjy3 = _mm_add_pd(fjy3,ty);
1794 fjz3 = _mm_add_pd(fjz3,tz);
1796 /**************************
1797 * CALCULATE INTERACTIONS *
1798 **************************/
1800 r31 = _mm_mul_pd(rsq31,rinv31);
1802 /* Calculate table index by multiplying r with table scale and truncate to integer */
1803 rt = _mm_mul_pd(r31,vftabscale);
1804 vfitab = _mm_cvttpd_epi32(rt);
1805 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1806 vfitab = _mm_slli_epi32(vfitab,2);
1808 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1809 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1810 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1811 GMX_MM_TRANSPOSE2_PD(Y,F);
1812 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1813 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1814 GMX_MM_TRANSPOSE2_PD(G,H);
1815 Heps = _mm_mul_pd(vfeps,H);
1816 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1817 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1818 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1822 /* Calculate temporary vectorial force */
1823 tx = _mm_mul_pd(fscal,dx31);
1824 ty = _mm_mul_pd(fscal,dy31);
1825 tz = _mm_mul_pd(fscal,dz31);
1827 /* Update vectorial force */
1828 fix3 = _mm_add_pd(fix3,tx);
1829 fiy3 = _mm_add_pd(fiy3,ty);
1830 fiz3 = _mm_add_pd(fiz3,tz);
1832 fjx1 = _mm_add_pd(fjx1,tx);
1833 fjy1 = _mm_add_pd(fjy1,ty);
1834 fjz1 = _mm_add_pd(fjz1,tz);
1836 /**************************
1837 * CALCULATE INTERACTIONS *
1838 **************************/
1840 r32 = _mm_mul_pd(rsq32,rinv32);
1842 /* Calculate table index by multiplying r with table scale and truncate to integer */
1843 rt = _mm_mul_pd(r32,vftabscale);
1844 vfitab = _mm_cvttpd_epi32(rt);
1845 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1846 vfitab = _mm_slli_epi32(vfitab,2);
1848 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1849 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1850 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1851 GMX_MM_TRANSPOSE2_PD(Y,F);
1852 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1853 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1854 GMX_MM_TRANSPOSE2_PD(G,H);
1855 Heps = _mm_mul_pd(vfeps,H);
1856 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1857 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1858 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1862 /* Calculate temporary vectorial force */
1863 tx = _mm_mul_pd(fscal,dx32);
1864 ty = _mm_mul_pd(fscal,dy32);
1865 tz = _mm_mul_pd(fscal,dz32);
1867 /* Update vectorial force */
1868 fix3 = _mm_add_pd(fix3,tx);
1869 fiy3 = _mm_add_pd(fiy3,ty);
1870 fiz3 = _mm_add_pd(fiz3,tz);
1872 fjx2 = _mm_add_pd(fjx2,tx);
1873 fjy2 = _mm_add_pd(fjy2,ty);
1874 fjz2 = _mm_add_pd(fjz2,tz);
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 r33 = _mm_mul_pd(rsq33,rinv33);
1882 /* Calculate table index by multiplying r with table scale and truncate to integer */
1883 rt = _mm_mul_pd(r33,vftabscale);
1884 vfitab = _mm_cvttpd_epi32(rt);
1885 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1886 vfitab = _mm_slli_epi32(vfitab,2);
1888 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1889 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1890 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1891 GMX_MM_TRANSPOSE2_PD(Y,F);
1892 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1893 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1894 GMX_MM_TRANSPOSE2_PD(G,H);
1895 Heps = _mm_mul_pd(vfeps,H);
1896 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1897 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1898 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1902 /* Calculate temporary vectorial force */
1903 tx = _mm_mul_pd(fscal,dx33);
1904 ty = _mm_mul_pd(fscal,dy33);
1905 tz = _mm_mul_pd(fscal,dz33);
1907 /* Update vectorial force */
1908 fix3 = _mm_add_pd(fix3,tx);
1909 fiy3 = _mm_add_pd(fiy3,ty);
1910 fiz3 = _mm_add_pd(fiz3,tz);
1912 fjx3 = _mm_add_pd(fjx3,tx);
1913 fjy3 = _mm_add_pd(fjy3,ty);
1914 fjz3 = _mm_add_pd(fjz3,tz);
1916 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1918 /* Inner loop uses 381 flops */
1921 if(jidx<j_index_end)
1925 j_coord_offsetA = DIM*jnrA;
1927 /* load j atom coordinates */
1928 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1929 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1930 &jy2,&jz2,&jx3,&jy3,&jz3);
1932 /* Calculate displacement vector */
1933 dx00 = _mm_sub_pd(ix0,jx0);
1934 dy00 = _mm_sub_pd(iy0,jy0);
1935 dz00 = _mm_sub_pd(iz0,jz0);
1936 dx11 = _mm_sub_pd(ix1,jx1);
1937 dy11 = _mm_sub_pd(iy1,jy1);
1938 dz11 = _mm_sub_pd(iz1,jz1);
1939 dx12 = _mm_sub_pd(ix1,jx2);
1940 dy12 = _mm_sub_pd(iy1,jy2);
1941 dz12 = _mm_sub_pd(iz1,jz2);
1942 dx13 = _mm_sub_pd(ix1,jx3);
1943 dy13 = _mm_sub_pd(iy1,jy3);
1944 dz13 = _mm_sub_pd(iz1,jz3);
1945 dx21 = _mm_sub_pd(ix2,jx1);
1946 dy21 = _mm_sub_pd(iy2,jy1);
1947 dz21 = _mm_sub_pd(iz2,jz1);
1948 dx22 = _mm_sub_pd(ix2,jx2);
1949 dy22 = _mm_sub_pd(iy2,jy2);
1950 dz22 = _mm_sub_pd(iz2,jz2);
1951 dx23 = _mm_sub_pd(ix2,jx3);
1952 dy23 = _mm_sub_pd(iy2,jy3);
1953 dz23 = _mm_sub_pd(iz2,jz3);
1954 dx31 = _mm_sub_pd(ix3,jx1);
1955 dy31 = _mm_sub_pd(iy3,jy1);
1956 dz31 = _mm_sub_pd(iz3,jz1);
1957 dx32 = _mm_sub_pd(ix3,jx2);
1958 dy32 = _mm_sub_pd(iy3,jy2);
1959 dz32 = _mm_sub_pd(iz3,jz2);
1960 dx33 = _mm_sub_pd(ix3,jx3);
1961 dy33 = _mm_sub_pd(iy3,jy3);
1962 dz33 = _mm_sub_pd(iz3,jz3);
1964 /* Calculate squared distance and things based on it */
1965 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1966 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1967 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1968 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1969 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1970 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1971 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1972 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1973 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1974 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1976 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1977 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1978 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1979 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1980 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1981 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1982 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1983 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1984 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1986 rinvsq00 = gmx_mm_inv_pd(rsq00);
1988 fjx0 = _mm_setzero_pd();
1989 fjy0 = _mm_setzero_pd();
1990 fjz0 = _mm_setzero_pd();
1991 fjx1 = _mm_setzero_pd();
1992 fjy1 = _mm_setzero_pd();
1993 fjz1 = _mm_setzero_pd();
1994 fjx2 = _mm_setzero_pd();
1995 fjy2 = _mm_setzero_pd();
1996 fjz2 = _mm_setzero_pd();
1997 fjx3 = _mm_setzero_pd();
1998 fjy3 = _mm_setzero_pd();
1999 fjz3 = _mm_setzero_pd();
2001 /**************************
2002 * CALCULATE INTERACTIONS *
2003 **************************/
2005 /* LENNARD-JONES DISPERSION/REPULSION */
2007 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2008 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2012 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2014 /* Calculate temporary vectorial force */
2015 tx = _mm_mul_pd(fscal,dx00);
2016 ty = _mm_mul_pd(fscal,dy00);
2017 tz = _mm_mul_pd(fscal,dz00);
2019 /* Update vectorial force */
2020 fix0 = _mm_add_pd(fix0,tx);
2021 fiy0 = _mm_add_pd(fiy0,ty);
2022 fiz0 = _mm_add_pd(fiz0,tz);
2024 fjx0 = _mm_add_pd(fjx0,tx);
2025 fjy0 = _mm_add_pd(fjy0,ty);
2026 fjz0 = _mm_add_pd(fjz0,tz);
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2032 r11 = _mm_mul_pd(rsq11,rinv11);
2034 /* Calculate table index by multiplying r with table scale and truncate to integer */
2035 rt = _mm_mul_pd(r11,vftabscale);
2036 vfitab = _mm_cvttpd_epi32(rt);
2037 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2038 vfitab = _mm_slli_epi32(vfitab,2);
2040 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2041 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2042 F = _mm_setzero_pd();
2043 GMX_MM_TRANSPOSE2_PD(Y,F);
2044 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2045 H = _mm_setzero_pd();
2046 GMX_MM_TRANSPOSE2_PD(G,H);
2047 Heps = _mm_mul_pd(vfeps,H);
2048 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2049 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2050 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2054 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2056 /* Calculate temporary vectorial force */
2057 tx = _mm_mul_pd(fscal,dx11);
2058 ty = _mm_mul_pd(fscal,dy11);
2059 tz = _mm_mul_pd(fscal,dz11);
2061 /* Update vectorial force */
2062 fix1 = _mm_add_pd(fix1,tx);
2063 fiy1 = _mm_add_pd(fiy1,ty);
2064 fiz1 = _mm_add_pd(fiz1,tz);
2066 fjx1 = _mm_add_pd(fjx1,tx);
2067 fjy1 = _mm_add_pd(fjy1,ty);
2068 fjz1 = _mm_add_pd(fjz1,tz);
2070 /**************************
2071 * CALCULATE INTERACTIONS *
2072 **************************/
2074 r12 = _mm_mul_pd(rsq12,rinv12);
2076 /* Calculate table index by multiplying r with table scale and truncate to integer */
2077 rt = _mm_mul_pd(r12,vftabscale);
2078 vfitab = _mm_cvttpd_epi32(rt);
2079 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2080 vfitab = _mm_slli_epi32(vfitab,2);
2082 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2083 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2084 F = _mm_setzero_pd();
2085 GMX_MM_TRANSPOSE2_PD(Y,F);
2086 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2087 H = _mm_setzero_pd();
2088 GMX_MM_TRANSPOSE2_PD(G,H);
2089 Heps = _mm_mul_pd(vfeps,H);
2090 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2091 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2092 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2096 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2098 /* Calculate temporary vectorial force */
2099 tx = _mm_mul_pd(fscal,dx12);
2100 ty = _mm_mul_pd(fscal,dy12);
2101 tz = _mm_mul_pd(fscal,dz12);
2103 /* Update vectorial force */
2104 fix1 = _mm_add_pd(fix1,tx);
2105 fiy1 = _mm_add_pd(fiy1,ty);
2106 fiz1 = _mm_add_pd(fiz1,tz);
2108 fjx2 = _mm_add_pd(fjx2,tx);
2109 fjy2 = _mm_add_pd(fjy2,ty);
2110 fjz2 = _mm_add_pd(fjz2,tz);
2112 /**************************
2113 * CALCULATE INTERACTIONS *
2114 **************************/
2116 r13 = _mm_mul_pd(rsq13,rinv13);
2118 /* Calculate table index by multiplying r with table scale and truncate to integer */
2119 rt = _mm_mul_pd(r13,vftabscale);
2120 vfitab = _mm_cvttpd_epi32(rt);
2121 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2122 vfitab = _mm_slli_epi32(vfitab,2);
2124 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2125 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2126 F = _mm_setzero_pd();
2127 GMX_MM_TRANSPOSE2_PD(Y,F);
2128 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2129 H = _mm_setzero_pd();
2130 GMX_MM_TRANSPOSE2_PD(G,H);
2131 Heps = _mm_mul_pd(vfeps,H);
2132 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2133 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2134 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
2138 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2140 /* Calculate temporary vectorial force */
2141 tx = _mm_mul_pd(fscal,dx13);
2142 ty = _mm_mul_pd(fscal,dy13);
2143 tz = _mm_mul_pd(fscal,dz13);
2145 /* Update vectorial force */
2146 fix1 = _mm_add_pd(fix1,tx);
2147 fiy1 = _mm_add_pd(fiy1,ty);
2148 fiz1 = _mm_add_pd(fiz1,tz);
2150 fjx3 = _mm_add_pd(fjx3,tx);
2151 fjy3 = _mm_add_pd(fjy3,ty);
2152 fjz3 = _mm_add_pd(fjz3,tz);
2154 /**************************
2155 * CALCULATE INTERACTIONS *
2156 **************************/
2158 r21 = _mm_mul_pd(rsq21,rinv21);
2160 /* Calculate table index by multiplying r with table scale and truncate to integer */
2161 rt = _mm_mul_pd(r21,vftabscale);
2162 vfitab = _mm_cvttpd_epi32(rt);
2163 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2164 vfitab = _mm_slli_epi32(vfitab,2);
2166 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2167 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2168 F = _mm_setzero_pd();
2169 GMX_MM_TRANSPOSE2_PD(Y,F);
2170 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2171 H = _mm_setzero_pd();
2172 GMX_MM_TRANSPOSE2_PD(G,H);
2173 Heps = _mm_mul_pd(vfeps,H);
2174 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2175 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2176 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2180 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2182 /* Calculate temporary vectorial force */
2183 tx = _mm_mul_pd(fscal,dx21);
2184 ty = _mm_mul_pd(fscal,dy21);
2185 tz = _mm_mul_pd(fscal,dz21);
2187 /* Update vectorial force */
2188 fix2 = _mm_add_pd(fix2,tx);
2189 fiy2 = _mm_add_pd(fiy2,ty);
2190 fiz2 = _mm_add_pd(fiz2,tz);
2192 fjx1 = _mm_add_pd(fjx1,tx);
2193 fjy1 = _mm_add_pd(fjy1,ty);
2194 fjz1 = _mm_add_pd(fjz1,tz);
2196 /**************************
2197 * CALCULATE INTERACTIONS *
2198 **************************/
2200 r22 = _mm_mul_pd(rsq22,rinv22);
2202 /* Calculate table index by multiplying r with table scale and truncate to integer */
2203 rt = _mm_mul_pd(r22,vftabscale);
2204 vfitab = _mm_cvttpd_epi32(rt);
2205 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2206 vfitab = _mm_slli_epi32(vfitab,2);
2208 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2209 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2210 F = _mm_setzero_pd();
2211 GMX_MM_TRANSPOSE2_PD(Y,F);
2212 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2213 H = _mm_setzero_pd();
2214 GMX_MM_TRANSPOSE2_PD(G,H);
2215 Heps = _mm_mul_pd(vfeps,H);
2216 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2217 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2218 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2222 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2224 /* Calculate temporary vectorial force */
2225 tx = _mm_mul_pd(fscal,dx22);
2226 ty = _mm_mul_pd(fscal,dy22);
2227 tz = _mm_mul_pd(fscal,dz22);
2229 /* Update vectorial force */
2230 fix2 = _mm_add_pd(fix2,tx);
2231 fiy2 = _mm_add_pd(fiy2,ty);
2232 fiz2 = _mm_add_pd(fiz2,tz);
2234 fjx2 = _mm_add_pd(fjx2,tx);
2235 fjy2 = _mm_add_pd(fjy2,ty);
2236 fjz2 = _mm_add_pd(fjz2,tz);
2238 /**************************
2239 * CALCULATE INTERACTIONS *
2240 **************************/
2242 r23 = _mm_mul_pd(rsq23,rinv23);
2244 /* Calculate table index by multiplying r with table scale and truncate to integer */
2245 rt = _mm_mul_pd(r23,vftabscale);
2246 vfitab = _mm_cvttpd_epi32(rt);
2247 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2248 vfitab = _mm_slli_epi32(vfitab,2);
2250 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2251 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2252 F = _mm_setzero_pd();
2253 GMX_MM_TRANSPOSE2_PD(Y,F);
2254 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2255 H = _mm_setzero_pd();
2256 GMX_MM_TRANSPOSE2_PD(G,H);
2257 Heps = _mm_mul_pd(vfeps,H);
2258 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2259 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2260 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2264 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2266 /* Calculate temporary vectorial force */
2267 tx = _mm_mul_pd(fscal,dx23);
2268 ty = _mm_mul_pd(fscal,dy23);
2269 tz = _mm_mul_pd(fscal,dz23);
2271 /* Update vectorial force */
2272 fix2 = _mm_add_pd(fix2,tx);
2273 fiy2 = _mm_add_pd(fiy2,ty);
2274 fiz2 = _mm_add_pd(fiz2,tz);
2276 fjx3 = _mm_add_pd(fjx3,tx);
2277 fjy3 = _mm_add_pd(fjy3,ty);
2278 fjz3 = _mm_add_pd(fjz3,tz);
2280 /**************************
2281 * CALCULATE INTERACTIONS *
2282 **************************/
2284 r31 = _mm_mul_pd(rsq31,rinv31);
2286 /* Calculate table index by multiplying r with table scale and truncate to integer */
2287 rt = _mm_mul_pd(r31,vftabscale);
2288 vfitab = _mm_cvttpd_epi32(rt);
2289 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2290 vfitab = _mm_slli_epi32(vfitab,2);
2292 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2293 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2294 F = _mm_setzero_pd();
2295 GMX_MM_TRANSPOSE2_PD(Y,F);
2296 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2297 H = _mm_setzero_pd();
2298 GMX_MM_TRANSPOSE2_PD(G,H);
2299 Heps = _mm_mul_pd(vfeps,H);
2300 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2301 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2302 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2306 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2308 /* Calculate temporary vectorial force */
2309 tx = _mm_mul_pd(fscal,dx31);
2310 ty = _mm_mul_pd(fscal,dy31);
2311 tz = _mm_mul_pd(fscal,dz31);
2313 /* Update vectorial force */
2314 fix3 = _mm_add_pd(fix3,tx);
2315 fiy3 = _mm_add_pd(fiy3,ty);
2316 fiz3 = _mm_add_pd(fiz3,tz);
2318 fjx1 = _mm_add_pd(fjx1,tx);
2319 fjy1 = _mm_add_pd(fjy1,ty);
2320 fjz1 = _mm_add_pd(fjz1,tz);
2322 /**************************
2323 * CALCULATE INTERACTIONS *
2324 **************************/
2326 r32 = _mm_mul_pd(rsq32,rinv32);
2328 /* Calculate table index by multiplying r with table scale and truncate to integer */
2329 rt = _mm_mul_pd(r32,vftabscale);
2330 vfitab = _mm_cvttpd_epi32(rt);
2331 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2332 vfitab = _mm_slli_epi32(vfitab,2);
2334 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2335 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2336 F = _mm_setzero_pd();
2337 GMX_MM_TRANSPOSE2_PD(Y,F);
2338 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2339 H = _mm_setzero_pd();
2340 GMX_MM_TRANSPOSE2_PD(G,H);
2341 Heps = _mm_mul_pd(vfeps,H);
2342 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2343 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2344 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2348 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2350 /* Calculate temporary vectorial force */
2351 tx = _mm_mul_pd(fscal,dx32);
2352 ty = _mm_mul_pd(fscal,dy32);
2353 tz = _mm_mul_pd(fscal,dz32);
2355 /* Update vectorial force */
2356 fix3 = _mm_add_pd(fix3,tx);
2357 fiy3 = _mm_add_pd(fiy3,ty);
2358 fiz3 = _mm_add_pd(fiz3,tz);
2360 fjx2 = _mm_add_pd(fjx2,tx);
2361 fjy2 = _mm_add_pd(fjy2,ty);
2362 fjz2 = _mm_add_pd(fjz2,tz);
2364 /**************************
2365 * CALCULATE INTERACTIONS *
2366 **************************/
2368 r33 = _mm_mul_pd(rsq33,rinv33);
2370 /* Calculate table index by multiplying r with table scale and truncate to integer */
2371 rt = _mm_mul_pd(r33,vftabscale);
2372 vfitab = _mm_cvttpd_epi32(rt);
2373 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2374 vfitab = _mm_slli_epi32(vfitab,2);
2376 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2377 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2378 F = _mm_setzero_pd();
2379 GMX_MM_TRANSPOSE2_PD(Y,F);
2380 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2381 H = _mm_setzero_pd();
2382 GMX_MM_TRANSPOSE2_PD(G,H);
2383 Heps = _mm_mul_pd(vfeps,H);
2384 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2385 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2386 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2390 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2392 /* Calculate temporary vectorial force */
2393 tx = _mm_mul_pd(fscal,dx33);
2394 ty = _mm_mul_pd(fscal,dy33);
2395 tz = _mm_mul_pd(fscal,dz33);
2397 /* Update vectorial force */
2398 fix3 = _mm_add_pd(fix3,tx);
2399 fiy3 = _mm_add_pd(fiy3,ty);
2400 fiz3 = _mm_add_pd(fiz3,tz);
2402 fjx3 = _mm_add_pd(fjx3,tx);
2403 fjy3 = _mm_add_pd(fjy3,ty);
2404 fjz3 = _mm_add_pd(fjz3,tz);
2406 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2408 /* Inner loop uses 381 flops */
2411 /* End of innermost loop */
2413 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2414 f+i_coord_offset,fshift+i_shift_offset);
2416 /* Increment number of inner iterations */
2417 inneriter += j_index_end - j_index_start;
2419 /* Outer loop uses 24 flops */
2422 /* Increment number of outer iterations */
2425 /* Update outer/inner flops */
2427 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*381);