2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_double
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
104 __m128d dummy_mask,cutoff_mask;
105 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106 __m128d one = _mm_set1_pd(1.0);
107 __m128d two = _mm_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 vftab = kernel_data->table_vdw->data;
126 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
128 /* Setup water-specific parameters */
129 inr = nlist->iinr[0];
130 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
131 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
132 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
133 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
135 jq1 = _mm_set1_pd(charge[inr+1]);
136 jq2 = _mm_set1_pd(charge[inr+2]);
137 jq3 = _mm_set1_pd(charge[inr+3]);
138 vdwjidx0A = 2*vdwtype[inr+0];
139 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
140 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
141 qq11 = _mm_mul_pd(iq1,jq1);
142 qq12 = _mm_mul_pd(iq1,jq2);
143 qq13 = _mm_mul_pd(iq1,jq3);
144 qq21 = _mm_mul_pd(iq2,jq1);
145 qq22 = _mm_mul_pd(iq2,jq2);
146 qq23 = _mm_mul_pd(iq2,jq3);
147 qq31 = _mm_mul_pd(iq3,jq1);
148 qq32 = _mm_mul_pd(iq3,jq2);
149 qq33 = _mm_mul_pd(iq3,jq3);
151 /* Avoid stupid compiler warnings */
159 /* Start outer loop over neighborlists */
160 for(iidx=0; iidx<nri; iidx++)
162 /* Load shift vector for this list */
163 i_shift_offset = DIM*shiftidx[iidx];
165 /* Load limits for loop over neighbors */
166 j_index_start = jindex[iidx];
167 j_index_end = jindex[iidx+1];
169 /* Get outer coordinate index */
171 i_coord_offset = DIM*inr;
173 /* Load i particle coords and add shift vector */
174 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
175 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
177 fix0 = _mm_setzero_pd();
178 fiy0 = _mm_setzero_pd();
179 fiz0 = _mm_setzero_pd();
180 fix1 = _mm_setzero_pd();
181 fiy1 = _mm_setzero_pd();
182 fiz1 = _mm_setzero_pd();
183 fix2 = _mm_setzero_pd();
184 fiy2 = _mm_setzero_pd();
185 fiz2 = _mm_setzero_pd();
186 fix3 = _mm_setzero_pd();
187 fiy3 = _mm_setzero_pd();
188 fiz3 = _mm_setzero_pd();
190 /* Reset potential sums */
191 velecsum = _mm_setzero_pd();
192 vvdwsum = _mm_setzero_pd();
194 /* Start inner kernel loop */
195 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
198 /* Get j neighbor index, and coordinate index */
201 j_coord_offsetA = DIM*jnrA;
202 j_coord_offsetB = DIM*jnrB;
204 /* load j atom coordinates */
205 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
206 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
207 &jy2,&jz2,&jx3,&jy3,&jz3);
209 /* Calculate displacement vector */
210 dx00 = _mm_sub_pd(ix0,jx0);
211 dy00 = _mm_sub_pd(iy0,jy0);
212 dz00 = _mm_sub_pd(iz0,jz0);
213 dx11 = _mm_sub_pd(ix1,jx1);
214 dy11 = _mm_sub_pd(iy1,jy1);
215 dz11 = _mm_sub_pd(iz1,jz1);
216 dx12 = _mm_sub_pd(ix1,jx2);
217 dy12 = _mm_sub_pd(iy1,jy2);
218 dz12 = _mm_sub_pd(iz1,jz2);
219 dx13 = _mm_sub_pd(ix1,jx3);
220 dy13 = _mm_sub_pd(iy1,jy3);
221 dz13 = _mm_sub_pd(iz1,jz3);
222 dx21 = _mm_sub_pd(ix2,jx1);
223 dy21 = _mm_sub_pd(iy2,jy1);
224 dz21 = _mm_sub_pd(iz2,jz1);
225 dx22 = _mm_sub_pd(ix2,jx2);
226 dy22 = _mm_sub_pd(iy2,jy2);
227 dz22 = _mm_sub_pd(iz2,jz2);
228 dx23 = _mm_sub_pd(ix2,jx3);
229 dy23 = _mm_sub_pd(iy2,jy3);
230 dz23 = _mm_sub_pd(iz2,jz3);
231 dx31 = _mm_sub_pd(ix3,jx1);
232 dy31 = _mm_sub_pd(iy3,jy1);
233 dz31 = _mm_sub_pd(iz3,jz1);
234 dx32 = _mm_sub_pd(ix3,jx2);
235 dy32 = _mm_sub_pd(iy3,jy2);
236 dz32 = _mm_sub_pd(iz3,jz2);
237 dx33 = _mm_sub_pd(ix3,jx3);
238 dy33 = _mm_sub_pd(iy3,jy3);
239 dz33 = _mm_sub_pd(iz3,jz3);
241 /* Calculate squared distance and things based on it */
242 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
243 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
244 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
245 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
246 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
247 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
248 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
249 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
250 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
251 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
253 rinv00 = gmx_mm_invsqrt_pd(rsq00);
254 rinv11 = gmx_mm_invsqrt_pd(rsq11);
255 rinv12 = gmx_mm_invsqrt_pd(rsq12);
256 rinv13 = gmx_mm_invsqrt_pd(rsq13);
257 rinv21 = gmx_mm_invsqrt_pd(rsq21);
258 rinv22 = gmx_mm_invsqrt_pd(rsq22);
259 rinv23 = gmx_mm_invsqrt_pd(rsq23);
260 rinv31 = gmx_mm_invsqrt_pd(rsq31);
261 rinv32 = gmx_mm_invsqrt_pd(rsq32);
262 rinv33 = gmx_mm_invsqrt_pd(rsq33);
264 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
265 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
266 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
267 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
268 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
269 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
270 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
271 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
272 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
274 fjx0 = _mm_setzero_pd();
275 fjy0 = _mm_setzero_pd();
276 fjz0 = _mm_setzero_pd();
277 fjx1 = _mm_setzero_pd();
278 fjy1 = _mm_setzero_pd();
279 fjz1 = _mm_setzero_pd();
280 fjx2 = _mm_setzero_pd();
281 fjy2 = _mm_setzero_pd();
282 fjz2 = _mm_setzero_pd();
283 fjx3 = _mm_setzero_pd();
284 fjy3 = _mm_setzero_pd();
285 fjz3 = _mm_setzero_pd();
287 /**************************
288 * CALCULATE INTERACTIONS *
289 **************************/
291 r00 = _mm_mul_pd(rsq00,rinv00);
293 /* Calculate table index by multiplying r with table scale and truncate to integer */
294 rt = _mm_mul_pd(r00,vftabscale);
295 vfitab = _mm_cvttpd_epi32(rt);
296 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
297 vfitab = _mm_slli_epi32(vfitab,3);
299 /* CUBIC SPLINE TABLE DISPERSION */
300 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
301 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
302 GMX_MM_TRANSPOSE2_PD(Y,F);
303 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
304 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
305 GMX_MM_TRANSPOSE2_PD(G,H);
306 Heps = _mm_mul_pd(vfeps,H);
307 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
308 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
309 vvdw6 = _mm_mul_pd(c6_00,VV);
310 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
311 fvdw6 = _mm_mul_pd(c6_00,FF);
313 /* CUBIC SPLINE TABLE REPULSION */
314 vfitab = _mm_add_epi32(vfitab,ifour);
315 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
316 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
317 GMX_MM_TRANSPOSE2_PD(Y,F);
318 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
319 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
320 GMX_MM_TRANSPOSE2_PD(G,H);
321 Heps = _mm_mul_pd(vfeps,H);
322 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
323 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
324 vvdw12 = _mm_mul_pd(c12_00,VV);
325 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
326 fvdw12 = _mm_mul_pd(c12_00,FF);
327 vvdw = _mm_add_pd(vvdw12,vvdw6);
328 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
335 /* Calculate temporary vectorial force */
336 tx = _mm_mul_pd(fscal,dx00);
337 ty = _mm_mul_pd(fscal,dy00);
338 tz = _mm_mul_pd(fscal,dz00);
340 /* Update vectorial force */
341 fix0 = _mm_add_pd(fix0,tx);
342 fiy0 = _mm_add_pd(fiy0,ty);
343 fiz0 = _mm_add_pd(fiz0,tz);
345 fjx0 = _mm_add_pd(fjx0,tx);
346 fjy0 = _mm_add_pd(fjy0,ty);
347 fjz0 = _mm_add_pd(fjz0,tz);
349 /**************************
350 * CALCULATE INTERACTIONS *
351 **************************/
353 /* COULOMB ELECTROSTATICS */
354 velec = _mm_mul_pd(qq11,rinv11);
355 felec = _mm_mul_pd(velec,rinvsq11);
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velecsum = _mm_add_pd(velecsum,velec);
362 /* Calculate temporary vectorial force */
363 tx = _mm_mul_pd(fscal,dx11);
364 ty = _mm_mul_pd(fscal,dy11);
365 tz = _mm_mul_pd(fscal,dz11);
367 /* Update vectorial force */
368 fix1 = _mm_add_pd(fix1,tx);
369 fiy1 = _mm_add_pd(fiy1,ty);
370 fiz1 = _mm_add_pd(fiz1,tz);
372 fjx1 = _mm_add_pd(fjx1,tx);
373 fjy1 = _mm_add_pd(fjy1,ty);
374 fjz1 = _mm_add_pd(fjz1,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 /* COULOMB ELECTROSTATICS */
381 velec = _mm_mul_pd(qq12,rinv12);
382 felec = _mm_mul_pd(velec,rinvsq12);
384 /* Update potential sum for this i atom from the interaction with this j atom. */
385 velecsum = _mm_add_pd(velecsum,velec);
389 /* Calculate temporary vectorial force */
390 tx = _mm_mul_pd(fscal,dx12);
391 ty = _mm_mul_pd(fscal,dy12);
392 tz = _mm_mul_pd(fscal,dz12);
394 /* Update vectorial force */
395 fix1 = _mm_add_pd(fix1,tx);
396 fiy1 = _mm_add_pd(fiy1,ty);
397 fiz1 = _mm_add_pd(fiz1,tz);
399 fjx2 = _mm_add_pd(fjx2,tx);
400 fjy2 = _mm_add_pd(fjy2,ty);
401 fjz2 = _mm_add_pd(fjz2,tz);
403 /**************************
404 * CALCULATE INTERACTIONS *
405 **************************/
407 /* COULOMB ELECTROSTATICS */
408 velec = _mm_mul_pd(qq13,rinv13);
409 felec = _mm_mul_pd(velec,rinvsq13);
411 /* Update potential sum for this i atom from the interaction with this j atom. */
412 velecsum = _mm_add_pd(velecsum,velec);
416 /* Calculate temporary vectorial force */
417 tx = _mm_mul_pd(fscal,dx13);
418 ty = _mm_mul_pd(fscal,dy13);
419 tz = _mm_mul_pd(fscal,dz13);
421 /* Update vectorial force */
422 fix1 = _mm_add_pd(fix1,tx);
423 fiy1 = _mm_add_pd(fiy1,ty);
424 fiz1 = _mm_add_pd(fiz1,tz);
426 fjx3 = _mm_add_pd(fjx3,tx);
427 fjy3 = _mm_add_pd(fjy3,ty);
428 fjz3 = _mm_add_pd(fjz3,tz);
430 /**************************
431 * CALCULATE INTERACTIONS *
432 **************************/
434 /* COULOMB ELECTROSTATICS */
435 velec = _mm_mul_pd(qq21,rinv21);
436 felec = _mm_mul_pd(velec,rinvsq21);
438 /* Update potential sum for this i atom from the interaction with this j atom. */
439 velecsum = _mm_add_pd(velecsum,velec);
443 /* Calculate temporary vectorial force */
444 tx = _mm_mul_pd(fscal,dx21);
445 ty = _mm_mul_pd(fscal,dy21);
446 tz = _mm_mul_pd(fscal,dz21);
448 /* Update vectorial force */
449 fix2 = _mm_add_pd(fix2,tx);
450 fiy2 = _mm_add_pd(fiy2,ty);
451 fiz2 = _mm_add_pd(fiz2,tz);
453 fjx1 = _mm_add_pd(fjx1,tx);
454 fjy1 = _mm_add_pd(fjy1,ty);
455 fjz1 = _mm_add_pd(fjz1,tz);
457 /**************************
458 * CALCULATE INTERACTIONS *
459 **************************/
461 /* COULOMB ELECTROSTATICS */
462 velec = _mm_mul_pd(qq22,rinv22);
463 felec = _mm_mul_pd(velec,rinvsq22);
465 /* Update potential sum for this i atom from the interaction with this j atom. */
466 velecsum = _mm_add_pd(velecsum,velec);
470 /* Calculate temporary vectorial force */
471 tx = _mm_mul_pd(fscal,dx22);
472 ty = _mm_mul_pd(fscal,dy22);
473 tz = _mm_mul_pd(fscal,dz22);
475 /* Update vectorial force */
476 fix2 = _mm_add_pd(fix2,tx);
477 fiy2 = _mm_add_pd(fiy2,ty);
478 fiz2 = _mm_add_pd(fiz2,tz);
480 fjx2 = _mm_add_pd(fjx2,tx);
481 fjy2 = _mm_add_pd(fjy2,ty);
482 fjz2 = _mm_add_pd(fjz2,tz);
484 /**************************
485 * CALCULATE INTERACTIONS *
486 **************************/
488 /* COULOMB ELECTROSTATICS */
489 velec = _mm_mul_pd(qq23,rinv23);
490 felec = _mm_mul_pd(velec,rinvsq23);
492 /* Update potential sum for this i atom from the interaction with this j atom. */
493 velecsum = _mm_add_pd(velecsum,velec);
497 /* Calculate temporary vectorial force */
498 tx = _mm_mul_pd(fscal,dx23);
499 ty = _mm_mul_pd(fscal,dy23);
500 tz = _mm_mul_pd(fscal,dz23);
502 /* Update vectorial force */
503 fix2 = _mm_add_pd(fix2,tx);
504 fiy2 = _mm_add_pd(fiy2,ty);
505 fiz2 = _mm_add_pd(fiz2,tz);
507 fjx3 = _mm_add_pd(fjx3,tx);
508 fjy3 = _mm_add_pd(fjy3,ty);
509 fjz3 = _mm_add_pd(fjz3,tz);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 /* COULOMB ELECTROSTATICS */
516 velec = _mm_mul_pd(qq31,rinv31);
517 felec = _mm_mul_pd(velec,rinvsq31);
519 /* Update potential sum for this i atom from the interaction with this j atom. */
520 velecsum = _mm_add_pd(velecsum,velec);
524 /* Calculate temporary vectorial force */
525 tx = _mm_mul_pd(fscal,dx31);
526 ty = _mm_mul_pd(fscal,dy31);
527 tz = _mm_mul_pd(fscal,dz31);
529 /* Update vectorial force */
530 fix3 = _mm_add_pd(fix3,tx);
531 fiy3 = _mm_add_pd(fiy3,ty);
532 fiz3 = _mm_add_pd(fiz3,tz);
534 fjx1 = _mm_add_pd(fjx1,tx);
535 fjy1 = _mm_add_pd(fjy1,ty);
536 fjz1 = _mm_add_pd(fjz1,tz);
538 /**************************
539 * CALCULATE INTERACTIONS *
540 **************************/
542 /* COULOMB ELECTROSTATICS */
543 velec = _mm_mul_pd(qq32,rinv32);
544 felec = _mm_mul_pd(velec,rinvsq32);
546 /* Update potential sum for this i atom from the interaction with this j atom. */
547 velecsum = _mm_add_pd(velecsum,velec);
551 /* Calculate temporary vectorial force */
552 tx = _mm_mul_pd(fscal,dx32);
553 ty = _mm_mul_pd(fscal,dy32);
554 tz = _mm_mul_pd(fscal,dz32);
556 /* Update vectorial force */
557 fix3 = _mm_add_pd(fix3,tx);
558 fiy3 = _mm_add_pd(fiy3,ty);
559 fiz3 = _mm_add_pd(fiz3,tz);
561 fjx2 = _mm_add_pd(fjx2,tx);
562 fjy2 = _mm_add_pd(fjy2,ty);
563 fjz2 = _mm_add_pd(fjz2,tz);
565 /**************************
566 * CALCULATE INTERACTIONS *
567 **************************/
569 /* COULOMB ELECTROSTATICS */
570 velec = _mm_mul_pd(qq33,rinv33);
571 felec = _mm_mul_pd(velec,rinvsq33);
573 /* Update potential sum for this i atom from the interaction with this j atom. */
574 velecsum = _mm_add_pd(velecsum,velec);
578 /* Calculate temporary vectorial force */
579 tx = _mm_mul_pd(fscal,dx33);
580 ty = _mm_mul_pd(fscal,dy33);
581 tz = _mm_mul_pd(fscal,dz33);
583 /* Update vectorial force */
584 fix3 = _mm_add_pd(fix3,tx);
585 fiy3 = _mm_add_pd(fiy3,ty);
586 fiz3 = _mm_add_pd(fiz3,tz);
588 fjx3 = _mm_add_pd(fjx3,tx);
589 fjy3 = _mm_add_pd(fjy3,ty);
590 fjz3 = _mm_add_pd(fjz3,tz);
592 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
594 /* Inner loop uses 311 flops */
601 j_coord_offsetA = DIM*jnrA;
603 /* load j atom coordinates */
604 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
605 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
606 &jy2,&jz2,&jx3,&jy3,&jz3);
608 /* Calculate displacement vector */
609 dx00 = _mm_sub_pd(ix0,jx0);
610 dy00 = _mm_sub_pd(iy0,jy0);
611 dz00 = _mm_sub_pd(iz0,jz0);
612 dx11 = _mm_sub_pd(ix1,jx1);
613 dy11 = _mm_sub_pd(iy1,jy1);
614 dz11 = _mm_sub_pd(iz1,jz1);
615 dx12 = _mm_sub_pd(ix1,jx2);
616 dy12 = _mm_sub_pd(iy1,jy2);
617 dz12 = _mm_sub_pd(iz1,jz2);
618 dx13 = _mm_sub_pd(ix1,jx3);
619 dy13 = _mm_sub_pd(iy1,jy3);
620 dz13 = _mm_sub_pd(iz1,jz3);
621 dx21 = _mm_sub_pd(ix2,jx1);
622 dy21 = _mm_sub_pd(iy2,jy1);
623 dz21 = _mm_sub_pd(iz2,jz1);
624 dx22 = _mm_sub_pd(ix2,jx2);
625 dy22 = _mm_sub_pd(iy2,jy2);
626 dz22 = _mm_sub_pd(iz2,jz2);
627 dx23 = _mm_sub_pd(ix2,jx3);
628 dy23 = _mm_sub_pd(iy2,jy3);
629 dz23 = _mm_sub_pd(iz2,jz3);
630 dx31 = _mm_sub_pd(ix3,jx1);
631 dy31 = _mm_sub_pd(iy3,jy1);
632 dz31 = _mm_sub_pd(iz3,jz1);
633 dx32 = _mm_sub_pd(ix3,jx2);
634 dy32 = _mm_sub_pd(iy3,jy2);
635 dz32 = _mm_sub_pd(iz3,jz2);
636 dx33 = _mm_sub_pd(ix3,jx3);
637 dy33 = _mm_sub_pd(iy3,jy3);
638 dz33 = _mm_sub_pd(iz3,jz3);
640 /* Calculate squared distance and things based on it */
641 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
642 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
643 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
644 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
645 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
646 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
647 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
648 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
649 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
650 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
652 rinv00 = gmx_mm_invsqrt_pd(rsq00);
653 rinv11 = gmx_mm_invsqrt_pd(rsq11);
654 rinv12 = gmx_mm_invsqrt_pd(rsq12);
655 rinv13 = gmx_mm_invsqrt_pd(rsq13);
656 rinv21 = gmx_mm_invsqrt_pd(rsq21);
657 rinv22 = gmx_mm_invsqrt_pd(rsq22);
658 rinv23 = gmx_mm_invsqrt_pd(rsq23);
659 rinv31 = gmx_mm_invsqrt_pd(rsq31);
660 rinv32 = gmx_mm_invsqrt_pd(rsq32);
661 rinv33 = gmx_mm_invsqrt_pd(rsq33);
663 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
664 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
665 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
666 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
667 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
668 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
669 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
670 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
671 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
673 fjx0 = _mm_setzero_pd();
674 fjy0 = _mm_setzero_pd();
675 fjz0 = _mm_setzero_pd();
676 fjx1 = _mm_setzero_pd();
677 fjy1 = _mm_setzero_pd();
678 fjz1 = _mm_setzero_pd();
679 fjx2 = _mm_setzero_pd();
680 fjy2 = _mm_setzero_pd();
681 fjz2 = _mm_setzero_pd();
682 fjx3 = _mm_setzero_pd();
683 fjy3 = _mm_setzero_pd();
684 fjz3 = _mm_setzero_pd();
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 r00 = _mm_mul_pd(rsq00,rinv00);
692 /* Calculate table index by multiplying r with table scale and truncate to integer */
693 rt = _mm_mul_pd(r00,vftabscale);
694 vfitab = _mm_cvttpd_epi32(rt);
695 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
696 vfitab = _mm_slli_epi32(vfitab,3);
698 /* CUBIC SPLINE TABLE DISPERSION */
699 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
700 F = _mm_setzero_pd();
701 GMX_MM_TRANSPOSE2_PD(Y,F);
702 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
703 H = _mm_setzero_pd();
704 GMX_MM_TRANSPOSE2_PD(G,H);
705 Heps = _mm_mul_pd(vfeps,H);
706 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
707 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
708 vvdw6 = _mm_mul_pd(c6_00,VV);
709 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
710 fvdw6 = _mm_mul_pd(c6_00,FF);
712 /* CUBIC SPLINE TABLE REPULSION */
713 vfitab = _mm_add_epi32(vfitab,ifour);
714 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
715 F = _mm_setzero_pd();
716 GMX_MM_TRANSPOSE2_PD(Y,F);
717 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
718 H = _mm_setzero_pd();
719 GMX_MM_TRANSPOSE2_PD(G,H);
720 Heps = _mm_mul_pd(vfeps,H);
721 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
722 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
723 vvdw12 = _mm_mul_pd(c12_00,VV);
724 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
725 fvdw12 = _mm_mul_pd(c12_00,FF);
726 vvdw = _mm_add_pd(vvdw12,vvdw6);
727 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
729 /* Update potential sum for this i atom from the interaction with this j atom. */
730 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
731 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
735 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
737 /* Calculate temporary vectorial force */
738 tx = _mm_mul_pd(fscal,dx00);
739 ty = _mm_mul_pd(fscal,dy00);
740 tz = _mm_mul_pd(fscal,dz00);
742 /* Update vectorial force */
743 fix0 = _mm_add_pd(fix0,tx);
744 fiy0 = _mm_add_pd(fiy0,ty);
745 fiz0 = _mm_add_pd(fiz0,tz);
747 fjx0 = _mm_add_pd(fjx0,tx);
748 fjy0 = _mm_add_pd(fjy0,ty);
749 fjz0 = _mm_add_pd(fjz0,tz);
751 /**************************
752 * CALCULATE INTERACTIONS *
753 **************************/
755 /* COULOMB ELECTROSTATICS */
756 velec = _mm_mul_pd(qq11,rinv11);
757 felec = _mm_mul_pd(velec,rinvsq11);
759 /* Update potential sum for this i atom from the interaction with this j atom. */
760 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
761 velecsum = _mm_add_pd(velecsum,velec);
765 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
767 /* Calculate temporary vectorial force */
768 tx = _mm_mul_pd(fscal,dx11);
769 ty = _mm_mul_pd(fscal,dy11);
770 tz = _mm_mul_pd(fscal,dz11);
772 /* Update vectorial force */
773 fix1 = _mm_add_pd(fix1,tx);
774 fiy1 = _mm_add_pd(fiy1,ty);
775 fiz1 = _mm_add_pd(fiz1,tz);
777 fjx1 = _mm_add_pd(fjx1,tx);
778 fjy1 = _mm_add_pd(fjy1,ty);
779 fjz1 = _mm_add_pd(fjz1,tz);
781 /**************************
782 * CALCULATE INTERACTIONS *
783 **************************/
785 /* COULOMB ELECTROSTATICS */
786 velec = _mm_mul_pd(qq12,rinv12);
787 felec = _mm_mul_pd(velec,rinvsq12);
789 /* Update potential sum for this i atom from the interaction with this j atom. */
790 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
791 velecsum = _mm_add_pd(velecsum,velec);
795 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
797 /* Calculate temporary vectorial force */
798 tx = _mm_mul_pd(fscal,dx12);
799 ty = _mm_mul_pd(fscal,dy12);
800 tz = _mm_mul_pd(fscal,dz12);
802 /* Update vectorial force */
803 fix1 = _mm_add_pd(fix1,tx);
804 fiy1 = _mm_add_pd(fiy1,ty);
805 fiz1 = _mm_add_pd(fiz1,tz);
807 fjx2 = _mm_add_pd(fjx2,tx);
808 fjy2 = _mm_add_pd(fjy2,ty);
809 fjz2 = _mm_add_pd(fjz2,tz);
811 /**************************
812 * CALCULATE INTERACTIONS *
813 **************************/
815 /* COULOMB ELECTROSTATICS */
816 velec = _mm_mul_pd(qq13,rinv13);
817 felec = _mm_mul_pd(velec,rinvsq13);
819 /* Update potential sum for this i atom from the interaction with this j atom. */
820 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
821 velecsum = _mm_add_pd(velecsum,velec);
825 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
827 /* Calculate temporary vectorial force */
828 tx = _mm_mul_pd(fscal,dx13);
829 ty = _mm_mul_pd(fscal,dy13);
830 tz = _mm_mul_pd(fscal,dz13);
832 /* Update vectorial force */
833 fix1 = _mm_add_pd(fix1,tx);
834 fiy1 = _mm_add_pd(fiy1,ty);
835 fiz1 = _mm_add_pd(fiz1,tz);
837 fjx3 = _mm_add_pd(fjx3,tx);
838 fjy3 = _mm_add_pd(fjy3,ty);
839 fjz3 = _mm_add_pd(fjz3,tz);
841 /**************************
842 * CALCULATE INTERACTIONS *
843 **************************/
845 /* COULOMB ELECTROSTATICS */
846 velec = _mm_mul_pd(qq21,rinv21);
847 felec = _mm_mul_pd(velec,rinvsq21);
849 /* Update potential sum for this i atom from the interaction with this j atom. */
850 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
851 velecsum = _mm_add_pd(velecsum,velec);
855 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
857 /* Calculate temporary vectorial force */
858 tx = _mm_mul_pd(fscal,dx21);
859 ty = _mm_mul_pd(fscal,dy21);
860 tz = _mm_mul_pd(fscal,dz21);
862 /* Update vectorial force */
863 fix2 = _mm_add_pd(fix2,tx);
864 fiy2 = _mm_add_pd(fiy2,ty);
865 fiz2 = _mm_add_pd(fiz2,tz);
867 fjx1 = _mm_add_pd(fjx1,tx);
868 fjy1 = _mm_add_pd(fjy1,ty);
869 fjz1 = _mm_add_pd(fjz1,tz);
871 /**************************
872 * CALCULATE INTERACTIONS *
873 **************************/
875 /* COULOMB ELECTROSTATICS */
876 velec = _mm_mul_pd(qq22,rinv22);
877 felec = _mm_mul_pd(velec,rinvsq22);
879 /* Update potential sum for this i atom from the interaction with this j atom. */
880 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
881 velecsum = _mm_add_pd(velecsum,velec);
885 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
887 /* Calculate temporary vectorial force */
888 tx = _mm_mul_pd(fscal,dx22);
889 ty = _mm_mul_pd(fscal,dy22);
890 tz = _mm_mul_pd(fscal,dz22);
892 /* Update vectorial force */
893 fix2 = _mm_add_pd(fix2,tx);
894 fiy2 = _mm_add_pd(fiy2,ty);
895 fiz2 = _mm_add_pd(fiz2,tz);
897 fjx2 = _mm_add_pd(fjx2,tx);
898 fjy2 = _mm_add_pd(fjy2,ty);
899 fjz2 = _mm_add_pd(fjz2,tz);
901 /**************************
902 * CALCULATE INTERACTIONS *
903 **************************/
905 /* COULOMB ELECTROSTATICS */
906 velec = _mm_mul_pd(qq23,rinv23);
907 felec = _mm_mul_pd(velec,rinvsq23);
909 /* Update potential sum for this i atom from the interaction with this j atom. */
910 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
911 velecsum = _mm_add_pd(velecsum,velec);
915 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
917 /* Calculate temporary vectorial force */
918 tx = _mm_mul_pd(fscal,dx23);
919 ty = _mm_mul_pd(fscal,dy23);
920 tz = _mm_mul_pd(fscal,dz23);
922 /* Update vectorial force */
923 fix2 = _mm_add_pd(fix2,tx);
924 fiy2 = _mm_add_pd(fiy2,ty);
925 fiz2 = _mm_add_pd(fiz2,tz);
927 fjx3 = _mm_add_pd(fjx3,tx);
928 fjy3 = _mm_add_pd(fjy3,ty);
929 fjz3 = _mm_add_pd(fjz3,tz);
931 /**************************
932 * CALCULATE INTERACTIONS *
933 **************************/
935 /* COULOMB ELECTROSTATICS */
936 velec = _mm_mul_pd(qq31,rinv31);
937 felec = _mm_mul_pd(velec,rinvsq31);
939 /* Update potential sum for this i atom from the interaction with this j atom. */
940 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
941 velecsum = _mm_add_pd(velecsum,velec);
945 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
947 /* Calculate temporary vectorial force */
948 tx = _mm_mul_pd(fscal,dx31);
949 ty = _mm_mul_pd(fscal,dy31);
950 tz = _mm_mul_pd(fscal,dz31);
952 /* Update vectorial force */
953 fix3 = _mm_add_pd(fix3,tx);
954 fiy3 = _mm_add_pd(fiy3,ty);
955 fiz3 = _mm_add_pd(fiz3,tz);
957 fjx1 = _mm_add_pd(fjx1,tx);
958 fjy1 = _mm_add_pd(fjy1,ty);
959 fjz1 = _mm_add_pd(fjz1,tz);
961 /**************************
962 * CALCULATE INTERACTIONS *
963 **************************/
965 /* COULOMB ELECTROSTATICS */
966 velec = _mm_mul_pd(qq32,rinv32);
967 felec = _mm_mul_pd(velec,rinvsq32);
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
971 velecsum = _mm_add_pd(velecsum,velec);
975 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
977 /* Calculate temporary vectorial force */
978 tx = _mm_mul_pd(fscal,dx32);
979 ty = _mm_mul_pd(fscal,dy32);
980 tz = _mm_mul_pd(fscal,dz32);
982 /* Update vectorial force */
983 fix3 = _mm_add_pd(fix3,tx);
984 fiy3 = _mm_add_pd(fiy3,ty);
985 fiz3 = _mm_add_pd(fiz3,tz);
987 fjx2 = _mm_add_pd(fjx2,tx);
988 fjy2 = _mm_add_pd(fjy2,ty);
989 fjz2 = _mm_add_pd(fjz2,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 /* COULOMB ELECTROSTATICS */
996 velec = _mm_mul_pd(qq33,rinv33);
997 felec = _mm_mul_pd(velec,rinvsq33);
999 /* Update potential sum for this i atom from the interaction with this j atom. */
1000 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1001 velecsum = _mm_add_pd(velecsum,velec);
1005 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1007 /* Calculate temporary vectorial force */
1008 tx = _mm_mul_pd(fscal,dx33);
1009 ty = _mm_mul_pd(fscal,dy33);
1010 tz = _mm_mul_pd(fscal,dz33);
1012 /* Update vectorial force */
1013 fix3 = _mm_add_pd(fix3,tx);
1014 fiy3 = _mm_add_pd(fiy3,ty);
1015 fiz3 = _mm_add_pd(fiz3,tz);
1017 fjx3 = _mm_add_pd(fjx3,tx);
1018 fjy3 = _mm_add_pd(fjy3,ty);
1019 fjz3 = _mm_add_pd(fjz3,tz);
1021 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1023 /* Inner loop uses 311 flops */
1026 /* End of innermost loop */
1028 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1029 f+i_coord_offset,fshift+i_shift_offset);
1032 /* Update potential energies */
1033 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1034 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1036 /* Increment number of inner iterations */
1037 inneriter += j_index_end - j_index_start;
1039 /* Outer loop uses 26 flops */
1042 /* Increment number of outer iterations */
1045 /* Update outer/inner flops */
1047 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*311);
1050 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_double
1051 * Electrostatics interaction: Coulomb
1052 * VdW interaction: CubicSplineTable
1053 * Geometry: Water4-Water4
1054 * Calculate force/pot: Force
1057 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_double
1058 (t_nblist * gmx_restrict nlist,
1059 rvec * gmx_restrict xx,
1060 rvec * gmx_restrict ff,
1061 t_forcerec * gmx_restrict fr,
1062 t_mdatoms * gmx_restrict mdatoms,
1063 nb_kernel_data_t * gmx_restrict kernel_data,
1064 t_nrnb * gmx_restrict nrnb)
1066 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1067 * just 0 for non-waters.
1068 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1069 * jnr indices corresponding to data put in the four positions in the SIMD register.
1071 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1072 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1074 int j_coord_offsetA,j_coord_offsetB;
1075 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1076 real rcutoff_scalar;
1077 real *shiftvec,*fshift,*x,*f;
1078 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1080 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1082 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1084 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1086 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1087 int vdwjidx0A,vdwjidx0B;
1088 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1089 int vdwjidx1A,vdwjidx1B;
1090 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1091 int vdwjidx2A,vdwjidx2B;
1092 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1093 int vdwjidx3A,vdwjidx3B;
1094 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1095 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1096 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1097 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1098 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1099 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1101 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1102 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1103 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1104 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1105 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1108 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1111 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1112 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1114 __m128i ifour = _mm_set1_epi32(4);
1115 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1117 __m128d dummy_mask,cutoff_mask;
1118 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1119 __m128d one = _mm_set1_pd(1.0);
1120 __m128d two = _mm_set1_pd(2.0);
1126 jindex = nlist->jindex;
1128 shiftidx = nlist->shift;
1130 shiftvec = fr->shift_vec[0];
1131 fshift = fr->fshift[0];
1132 facel = _mm_set1_pd(fr->epsfac);
1133 charge = mdatoms->chargeA;
1134 nvdwtype = fr->ntype;
1135 vdwparam = fr->nbfp;
1136 vdwtype = mdatoms->typeA;
1138 vftab = kernel_data->table_vdw->data;
1139 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1141 /* Setup water-specific parameters */
1142 inr = nlist->iinr[0];
1143 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1144 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1145 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1148 jq1 = _mm_set1_pd(charge[inr+1]);
1149 jq2 = _mm_set1_pd(charge[inr+2]);
1150 jq3 = _mm_set1_pd(charge[inr+3]);
1151 vdwjidx0A = 2*vdwtype[inr+0];
1152 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1153 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1154 qq11 = _mm_mul_pd(iq1,jq1);
1155 qq12 = _mm_mul_pd(iq1,jq2);
1156 qq13 = _mm_mul_pd(iq1,jq3);
1157 qq21 = _mm_mul_pd(iq2,jq1);
1158 qq22 = _mm_mul_pd(iq2,jq2);
1159 qq23 = _mm_mul_pd(iq2,jq3);
1160 qq31 = _mm_mul_pd(iq3,jq1);
1161 qq32 = _mm_mul_pd(iq3,jq2);
1162 qq33 = _mm_mul_pd(iq3,jq3);
1164 /* Avoid stupid compiler warnings */
1166 j_coord_offsetA = 0;
1167 j_coord_offsetB = 0;
1172 /* Start outer loop over neighborlists */
1173 for(iidx=0; iidx<nri; iidx++)
1175 /* Load shift vector for this list */
1176 i_shift_offset = DIM*shiftidx[iidx];
1178 /* Load limits for loop over neighbors */
1179 j_index_start = jindex[iidx];
1180 j_index_end = jindex[iidx+1];
1182 /* Get outer coordinate index */
1184 i_coord_offset = DIM*inr;
1186 /* Load i particle coords and add shift vector */
1187 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1188 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1190 fix0 = _mm_setzero_pd();
1191 fiy0 = _mm_setzero_pd();
1192 fiz0 = _mm_setzero_pd();
1193 fix1 = _mm_setzero_pd();
1194 fiy1 = _mm_setzero_pd();
1195 fiz1 = _mm_setzero_pd();
1196 fix2 = _mm_setzero_pd();
1197 fiy2 = _mm_setzero_pd();
1198 fiz2 = _mm_setzero_pd();
1199 fix3 = _mm_setzero_pd();
1200 fiy3 = _mm_setzero_pd();
1201 fiz3 = _mm_setzero_pd();
1203 /* Start inner kernel loop */
1204 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1207 /* Get j neighbor index, and coordinate index */
1209 jnrB = jjnr[jidx+1];
1210 j_coord_offsetA = DIM*jnrA;
1211 j_coord_offsetB = DIM*jnrB;
1213 /* load j atom coordinates */
1214 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1215 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1216 &jy2,&jz2,&jx3,&jy3,&jz3);
1218 /* Calculate displacement vector */
1219 dx00 = _mm_sub_pd(ix0,jx0);
1220 dy00 = _mm_sub_pd(iy0,jy0);
1221 dz00 = _mm_sub_pd(iz0,jz0);
1222 dx11 = _mm_sub_pd(ix1,jx1);
1223 dy11 = _mm_sub_pd(iy1,jy1);
1224 dz11 = _mm_sub_pd(iz1,jz1);
1225 dx12 = _mm_sub_pd(ix1,jx2);
1226 dy12 = _mm_sub_pd(iy1,jy2);
1227 dz12 = _mm_sub_pd(iz1,jz2);
1228 dx13 = _mm_sub_pd(ix1,jx3);
1229 dy13 = _mm_sub_pd(iy1,jy3);
1230 dz13 = _mm_sub_pd(iz1,jz3);
1231 dx21 = _mm_sub_pd(ix2,jx1);
1232 dy21 = _mm_sub_pd(iy2,jy1);
1233 dz21 = _mm_sub_pd(iz2,jz1);
1234 dx22 = _mm_sub_pd(ix2,jx2);
1235 dy22 = _mm_sub_pd(iy2,jy2);
1236 dz22 = _mm_sub_pd(iz2,jz2);
1237 dx23 = _mm_sub_pd(ix2,jx3);
1238 dy23 = _mm_sub_pd(iy2,jy3);
1239 dz23 = _mm_sub_pd(iz2,jz3);
1240 dx31 = _mm_sub_pd(ix3,jx1);
1241 dy31 = _mm_sub_pd(iy3,jy1);
1242 dz31 = _mm_sub_pd(iz3,jz1);
1243 dx32 = _mm_sub_pd(ix3,jx2);
1244 dy32 = _mm_sub_pd(iy3,jy2);
1245 dz32 = _mm_sub_pd(iz3,jz2);
1246 dx33 = _mm_sub_pd(ix3,jx3);
1247 dy33 = _mm_sub_pd(iy3,jy3);
1248 dz33 = _mm_sub_pd(iz3,jz3);
1250 /* Calculate squared distance and things based on it */
1251 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1252 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1253 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1254 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1255 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1256 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1257 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1258 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1259 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1260 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1262 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1263 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1264 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1265 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1266 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1267 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1268 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1269 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1270 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1271 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1273 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1274 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1275 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1276 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1277 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1278 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1279 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1280 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1281 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1283 fjx0 = _mm_setzero_pd();
1284 fjy0 = _mm_setzero_pd();
1285 fjz0 = _mm_setzero_pd();
1286 fjx1 = _mm_setzero_pd();
1287 fjy1 = _mm_setzero_pd();
1288 fjz1 = _mm_setzero_pd();
1289 fjx2 = _mm_setzero_pd();
1290 fjy2 = _mm_setzero_pd();
1291 fjz2 = _mm_setzero_pd();
1292 fjx3 = _mm_setzero_pd();
1293 fjy3 = _mm_setzero_pd();
1294 fjz3 = _mm_setzero_pd();
1296 /**************************
1297 * CALCULATE INTERACTIONS *
1298 **************************/
1300 r00 = _mm_mul_pd(rsq00,rinv00);
1302 /* Calculate table index by multiplying r with table scale and truncate to integer */
1303 rt = _mm_mul_pd(r00,vftabscale);
1304 vfitab = _mm_cvttpd_epi32(rt);
1305 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1306 vfitab = _mm_slli_epi32(vfitab,3);
1308 /* CUBIC SPLINE TABLE DISPERSION */
1309 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1310 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1311 GMX_MM_TRANSPOSE2_PD(Y,F);
1312 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1313 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1314 GMX_MM_TRANSPOSE2_PD(G,H);
1315 Heps = _mm_mul_pd(vfeps,H);
1316 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1317 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1318 fvdw6 = _mm_mul_pd(c6_00,FF);
1320 /* CUBIC SPLINE TABLE REPULSION */
1321 vfitab = _mm_add_epi32(vfitab,ifour);
1322 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1323 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1324 GMX_MM_TRANSPOSE2_PD(Y,F);
1325 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1326 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1327 GMX_MM_TRANSPOSE2_PD(G,H);
1328 Heps = _mm_mul_pd(vfeps,H);
1329 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1330 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1331 fvdw12 = _mm_mul_pd(c12_00,FF);
1332 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1336 /* Calculate temporary vectorial force */
1337 tx = _mm_mul_pd(fscal,dx00);
1338 ty = _mm_mul_pd(fscal,dy00);
1339 tz = _mm_mul_pd(fscal,dz00);
1341 /* Update vectorial force */
1342 fix0 = _mm_add_pd(fix0,tx);
1343 fiy0 = _mm_add_pd(fiy0,ty);
1344 fiz0 = _mm_add_pd(fiz0,tz);
1346 fjx0 = _mm_add_pd(fjx0,tx);
1347 fjy0 = _mm_add_pd(fjy0,ty);
1348 fjz0 = _mm_add_pd(fjz0,tz);
1350 /**************************
1351 * CALCULATE INTERACTIONS *
1352 **************************/
1354 /* COULOMB ELECTROSTATICS */
1355 velec = _mm_mul_pd(qq11,rinv11);
1356 felec = _mm_mul_pd(velec,rinvsq11);
1360 /* Calculate temporary vectorial force */
1361 tx = _mm_mul_pd(fscal,dx11);
1362 ty = _mm_mul_pd(fscal,dy11);
1363 tz = _mm_mul_pd(fscal,dz11);
1365 /* Update vectorial force */
1366 fix1 = _mm_add_pd(fix1,tx);
1367 fiy1 = _mm_add_pd(fiy1,ty);
1368 fiz1 = _mm_add_pd(fiz1,tz);
1370 fjx1 = _mm_add_pd(fjx1,tx);
1371 fjy1 = _mm_add_pd(fjy1,ty);
1372 fjz1 = _mm_add_pd(fjz1,tz);
1374 /**************************
1375 * CALCULATE INTERACTIONS *
1376 **************************/
1378 /* COULOMB ELECTROSTATICS */
1379 velec = _mm_mul_pd(qq12,rinv12);
1380 felec = _mm_mul_pd(velec,rinvsq12);
1384 /* Calculate temporary vectorial force */
1385 tx = _mm_mul_pd(fscal,dx12);
1386 ty = _mm_mul_pd(fscal,dy12);
1387 tz = _mm_mul_pd(fscal,dz12);
1389 /* Update vectorial force */
1390 fix1 = _mm_add_pd(fix1,tx);
1391 fiy1 = _mm_add_pd(fiy1,ty);
1392 fiz1 = _mm_add_pd(fiz1,tz);
1394 fjx2 = _mm_add_pd(fjx2,tx);
1395 fjy2 = _mm_add_pd(fjy2,ty);
1396 fjz2 = _mm_add_pd(fjz2,tz);
1398 /**************************
1399 * CALCULATE INTERACTIONS *
1400 **************************/
1402 /* COULOMB ELECTROSTATICS */
1403 velec = _mm_mul_pd(qq13,rinv13);
1404 felec = _mm_mul_pd(velec,rinvsq13);
1408 /* Calculate temporary vectorial force */
1409 tx = _mm_mul_pd(fscal,dx13);
1410 ty = _mm_mul_pd(fscal,dy13);
1411 tz = _mm_mul_pd(fscal,dz13);
1413 /* Update vectorial force */
1414 fix1 = _mm_add_pd(fix1,tx);
1415 fiy1 = _mm_add_pd(fiy1,ty);
1416 fiz1 = _mm_add_pd(fiz1,tz);
1418 fjx3 = _mm_add_pd(fjx3,tx);
1419 fjy3 = _mm_add_pd(fjy3,ty);
1420 fjz3 = _mm_add_pd(fjz3,tz);
1422 /**************************
1423 * CALCULATE INTERACTIONS *
1424 **************************/
1426 /* COULOMB ELECTROSTATICS */
1427 velec = _mm_mul_pd(qq21,rinv21);
1428 felec = _mm_mul_pd(velec,rinvsq21);
1432 /* Calculate temporary vectorial force */
1433 tx = _mm_mul_pd(fscal,dx21);
1434 ty = _mm_mul_pd(fscal,dy21);
1435 tz = _mm_mul_pd(fscal,dz21);
1437 /* Update vectorial force */
1438 fix2 = _mm_add_pd(fix2,tx);
1439 fiy2 = _mm_add_pd(fiy2,ty);
1440 fiz2 = _mm_add_pd(fiz2,tz);
1442 fjx1 = _mm_add_pd(fjx1,tx);
1443 fjy1 = _mm_add_pd(fjy1,ty);
1444 fjz1 = _mm_add_pd(fjz1,tz);
1446 /**************************
1447 * CALCULATE INTERACTIONS *
1448 **************************/
1450 /* COULOMB ELECTROSTATICS */
1451 velec = _mm_mul_pd(qq22,rinv22);
1452 felec = _mm_mul_pd(velec,rinvsq22);
1456 /* Calculate temporary vectorial force */
1457 tx = _mm_mul_pd(fscal,dx22);
1458 ty = _mm_mul_pd(fscal,dy22);
1459 tz = _mm_mul_pd(fscal,dz22);
1461 /* Update vectorial force */
1462 fix2 = _mm_add_pd(fix2,tx);
1463 fiy2 = _mm_add_pd(fiy2,ty);
1464 fiz2 = _mm_add_pd(fiz2,tz);
1466 fjx2 = _mm_add_pd(fjx2,tx);
1467 fjy2 = _mm_add_pd(fjy2,ty);
1468 fjz2 = _mm_add_pd(fjz2,tz);
1470 /**************************
1471 * CALCULATE INTERACTIONS *
1472 **************************/
1474 /* COULOMB ELECTROSTATICS */
1475 velec = _mm_mul_pd(qq23,rinv23);
1476 felec = _mm_mul_pd(velec,rinvsq23);
1480 /* Calculate temporary vectorial force */
1481 tx = _mm_mul_pd(fscal,dx23);
1482 ty = _mm_mul_pd(fscal,dy23);
1483 tz = _mm_mul_pd(fscal,dz23);
1485 /* Update vectorial force */
1486 fix2 = _mm_add_pd(fix2,tx);
1487 fiy2 = _mm_add_pd(fiy2,ty);
1488 fiz2 = _mm_add_pd(fiz2,tz);
1490 fjx3 = _mm_add_pd(fjx3,tx);
1491 fjy3 = _mm_add_pd(fjy3,ty);
1492 fjz3 = _mm_add_pd(fjz3,tz);
1494 /**************************
1495 * CALCULATE INTERACTIONS *
1496 **************************/
1498 /* COULOMB ELECTROSTATICS */
1499 velec = _mm_mul_pd(qq31,rinv31);
1500 felec = _mm_mul_pd(velec,rinvsq31);
1504 /* Calculate temporary vectorial force */
1505 tx = _mm_mul_pd(fscal,dx31);
1506 ty = _mm_mul_pd(fscal,dy31);
1507 tz = _mm_mul_pd(fscal,dz31);
1509 /* Update vectorial force */
1510 fix3 = _mm_add_pd(fix3,tx);
1511 fiy3 = _mm_add_pd(fiy3,ty);
1512 fiz3 = _mm_add_pd(fiz3,tz);
1514 fjx1 = _mm_add_pd(fjx1,tx);
1515 fjy1 = _mm_add_pd(fjy1,ty);
1516 fjz1 = _mm_add_pd(fjz1,tz);
1518 /**************************
1519 * CALCULATE INTERACTIONS *
1520 **************************/
1522 /* COULOMB ELECTROSTATICS */
1523 velec = _mm_mul_pd(qq32,rinv32);
1524 felec = _mm_mul_pd(velec,rinvsq32);
1528 /* Calculate temporary vectorial force */
1529 tx = _mm_mul_pd(fscal,dx32);
1530 ty = _mm_mul_pd(fscal,dy32);
1531 tz = _mm_mul_pd(fscal,dz32);
1533 /* Update vectorial force */
1534 fix3 = _mm_add_pd(fix3,tx);
1535 fiy3 = _mm_add_pd(fiy3,ty);
1536 fiz3 = _mm_add_pd(fiz3,tz);
1538 fjx2 = _mm_add_pd(fjx2,tx);
1539 fjy2 = _mm_add_pd(fjy2,ty);
1540 fjz2 = _mm_add_pd(fjz2,tz);
1542 /**************************
1543 * CALCULATE INTERACTIONS *
1544 **************************/
1546 /* COULOMB ELECTROSTATICS */
1547 velec = _mm_mul_pd(qq33,rinv33);
1548 felec = _mm_mul_pd(velec,rinvsq33);
1552 /* Calculate temporary vectorial force */
1553 tx = _mm_mul_pd(fscal,dx33);
1554 ty = _mm_mul_pd(fscal,dy33);
1555 tz = _mm_mul_pd(fscal,dz33);
1557 /* Update vectorial force */
1558 fix3 = _mm_add_pd(fix3,tx);
1559 fiy3 = _mm_add_pd(fiy3,ty);
1560 fiz3 = _mm_add_pd(fiz3,tz);
1562 fjx3 = _mm_add_pd(fjx3,tx);
1563 fjy3 = _mm_add_pd(fjy3,ty);
1564 fjz3 = _mm_add_pd(fjz3,tz);
1566 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1568 /* Inner loop uses 294 flops */
1571 if(jidx<j_index_end)
1575 j_coord_offsetA = DIM*jnrA;
1577 /* load j atom coordinates */
1578 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1579 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1580 &jy2,&jz2,&jx3,&jy3,&jz3);
1582 /* Calculate displacement vector */
1583 dx00 = _mm_sub_pd(ix0,jx0);
1584 dy00 = _mm_sub_pd(iy0,jy0);
1585 dz00 = _mm_sub_pd(iz0,jz0);
1586 dx11 = _mm_sub_pd(ix1,jx1);
1587 dy11 = _mm_sub_pd(iy1,jy1);
1588 dz11 = _mm_sub_pd(iz1,jz1);
1589 dx12 = _mm_sub_pd(ix1,jx2);
1590 dy12 = _mm_sub_pd(iy1,jy2);
1591 dz12 = _mm_sub_pd(iz1,jz2);
1592 dx13 = _mm_sub_pd(ix1,jx3);
1593 dy13 = _mm_sub_pd(iy1,jy3);
1594 dz13 = _mm_sub_pd(iz1,jz3);
1595 dx21 = _mm_sub_pd(ix2,jx1);
1596 dy21 = _mm_sub_pd(iy2,jy1);
1597 dz21 = _mm_sub_pd(iz2,jz1);
1598 dx22 = _mm_sub_pd(ix2,jx2);
1599 dy22 = _mm_sub_pd(iy2,jy2);
1600 dz22 = _mm_sub_pd(iz2,jz2);
1601 dx23 = _mm_sub_pd(ix2,jx3);
1602 dy23 = _mm_sub_pd(iy2,jy3);
1603 dz23 = _mm_sub_pd(iz2,jz3);
1604 dx31 = _mm_sub_pd(ix3,jx1);
1605 dy31 = _mm_sub_pd(iy3,jy1);
1606 dz31 = _mm_sub_pd(iz3,jz1);
1607 dx32 = _mm_sub_pd(ix3,jx2);
1608 dy32 = _mm_sub_pd(iy3,jy2);
1609 dz32 = _mm_sub_pd(iz3,jz2);
1610 dx33 = _mm_sub_pd(ix3,jx3);
1611 dy33 = _mm_sub_pd(iy3,jy3);
1612 dz33 = _mm_sub_pd(iz3,jz3);
1614 /* Calculate squared distance and things based on it */
1615 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1616 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1617 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1618 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1619 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1620 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1621 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1622 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1623 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1624 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1626 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1627 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1628 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1629 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1630 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1631 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1632 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1633 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1634 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1635 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1637 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1638 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1639 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1640 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1641 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1642 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1643 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1644 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1645 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1647 fjx0 = _mm_setzero_pd();
1648 fjy0 = _mm_setzero_pd();
1649 fjz0 = _mm_setzero_pd();
1650 fjx1 = _mm_setzero_pd();
1651 fjy1 = _mm_setzero_pd();
1652 fjz1 = _mm_setzero_pd();
1653 fjx2 = _mm_setzero_pd();
1654 fjy2 = _mm_setzero_pd();
1655 fjz2 = _mm_setzero_pd();
1656 fjx3 = _mm_setzero_pd();
1657 fjy3 = _mm_setzero_pd();
1658 fjz3 = _mm_setzero_pd();
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
1664 r00 = _mm_mul_pd(rsq00,rinv00);
1666 /* Calculate table index by multiplying r with table scale and truncate to integer */
1667 rt = _mm_mul_pd(r00,vftabscale);
1668 vfitab = _mm_cvttpd_epi32(rt);
1669 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1670 vfitab = _mm_slli_epi32(vfitab,3);
1672 /* CUBIC SPLINE TABLE DISPERSION */
1673 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1674 F = _mm_setzero_pd();
1675 GMX_MM_TRANSPOSE2_PD(Y,F);
1676 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1677 H = _mm_setzero_pd();
1678 GMX_MM_TRANSPOSE2_PD(G,H);
1679 Heps = _mm_mul_pd(vfeps,H);
1680 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1681 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1682 fvdw6 = _mm_mul_pd(c6_00,FF);
1684 /* CUBIC SPLINE TABLE REPULSION */
1685 vfitab = _mm_add_epi32(vfitab,ifour);
1686 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1687 F = _mm_setzero_pd();
1688 GMX_MM_TRANSPOSE2_PD(Y,F);
1689 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1690 H = _mm_setzero_pd();
1691 GMX_MM_TRANSPOSE2_PD(G,H);
1692 Heps = _mm_mul_pd(vfeps,H);
1693 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1694 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1695 fvdw12 = _mm_mul_pd(c12_00,FF);
1696 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1700 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1702 /* Calculate temporary vectorial force */
1703 tx = _mm_mul_pd(fscal,dx00);
1704 ty = _mm_mul_pd(fscal,dy00);
1705 tz = _mm_mul_pd(fscal,dz00);
1707 /* Update vectorial force */
1708 fix0 = _mm_add_pd(fix0,tx);
1709 fiy0 = _mm_add_pd(fiy0,ty);
1710 fiz0 = _mm_add_pd(fiz0,tz);
1712 fjx0 = _mm_add_pd(fjx0,tx);
1713 fjy0 = _mm_add_pd(fjy0,ty);
1714 fjz0 = _mm_add_pd(fjz0,tz);
1716 /**************************
1717 * CALCULATE INTERACTIONS *
1718 **************************/
1720 /* COULOMB ELECTROSTATICS */
1721 velec = _mm_mul_pd(qq11,rinv11);
1722 felec = _mm_mul_pd(velec,rinvsq11);
1726 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1728 /* Calculate temporary vectorial force */
1729 tx = _mm_mul_pd(fscal,dx11);
1730 ty = _mm_mul_pd(fscal,dy11);
1731 tz = _mm_mul_pd(fscal,dz11);
1733 /* Update vectorial force */
1734 fix1 = _mm_add_pd(fix1,tx);
1735 fiy1 = _mm_add_pd(fiy1,ty);
1736 fiz1 = _mm_add_pd(fiz1,tz);
1738 fjx1 = _mm_add_pd(fjx1,tx);
1739 fjy1 = _mm_add_pd(fjy1,ty);
1740 fjz1 = _mm_add_pd(fjz1,tz);
1742 /**************************
1743 * CALCULATE INTERACTIONS *
1744 **************************/
1746 /* COULOMB ELECTROSTATICS */
1747 velec = _mm_mul_pd(qq12,rinv12);
1748 felec = _mm_mul_pd(velec,rinvsq12);
1752 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1754 /* Calculate temporary vectorial force */
1755 tx = _mm_mul_pd(fscal,dx12);
1756 ty = _mm_mul_pd(fscal,dy12);
1757 tz = _mm_mul_pd(fscal,dz12);
1759 /* Update vectorial force */
1760 fix1 = _mm_add_pd(fix1,tx);
1761 fiy1 = _mm_add_pd(fiy1,ty);
1762 fiz1 = _mm_add_pd(fiz1,tz);
1764 fjx2 = _mm_add_pd(fjx2,tx);
1765 fjy2 = _mm_add_pd(fjy2,ty);
1766 fjz2 = _mm_add_pd(fjz2,tz);
1768 /**************************
1769 * CALCULATE INTERACTIONS *
1770 **************************/
1772 /* COULOMB ELECTROSTATICS */
1773 velec = _mm_mul_pd(qq13,rinv13);
1774 felec = _mm_mul_pd(velec,rinvsq13);
1778 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1780 /* Calculate temporary vectorial force */
1781 tx = _mm_mul_pd(fscal,dx13);
1782 ty = _mm_mul_pd(fscal,dy13);
1783 tz = _mm_mul_pd(fscal,dz13);
1785 /* Update vectorial force */
1786 fix1 = _mm_add_pd(fix1,tx);
1787 fiy1 = _mm_add_pd(fiy1,ty);
1788 fiz1 = _mm_add_pd(fiz1,tz);
1790 fjx3 = _mm_add_pd(fjx3,tx);
1791 fjy3 = _mm_add_pd(fjy3,ty);
1792 fjz3 = _mm_add_pd(fjz3,tz);
1794 /**************************
1795 * CALCULATE INTERACTIONS *
1796 **************************/
1798 /* COULOMB ELECTROSTATICS */
1799 velec = _mm_mul_pd(qq21,rinv21);
1800 felec = _mm_mul_pd(velec,rinvsq21);
1804 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1806 /* Calculate temporary vectorial force */
1807 tx = _mm_mul_pd(fscal,dx21);
1808 ty = _mm_mul_pd(fscal,dy21);
1809 tz = _mm_mul_pd(fscal,dz21);
1811 /* Update vectorial force */
1812 fix2 = _mm_add_pd(fix2,tx);
1813 fiy2 = _mm_add_pd(fiy2,ty);
1814 fiz2 = _mm_add_pd(fiz2,tz);
1816 fjx1 = _mm_add_pd(fjx1,tx);
1817 fjy1 = _mm_add_pd(fjy1,ty);
1818 fjz1 = _mm_add_pd(fjz1,tz);
1820 /**************************
1821 * CALCULATE INTERACTIONS *
1822 **************************/
1824 /* COULOMB ELECTROSTATICS */
1825 velec = _mm_mul_pd(qq22,rinv22);
1826 felec = _mm_mul_pd(velec,rinvsq22);
1830 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1832 /* Calculate temporary vectorial force */
1833 tx = _mm_mul_pd(fscal,dx22);
1834 ty = _mm_mul_pd(fscal,dy22);
1835 tz = _mm_mul_pd(fscal,dz22);
1837 /* Update vectorial force */
1838 fix2 = _mm_add_pd(fix2,tx);
1839 fiy2 = _mm_add_pd(fiy2,ty);
1840 fiz2 = _mm_add_pd(fiz2,tz);
1842 fjx2 = _mm_add_pd(fjx2,tx);
1843 fjy2 = _mm_add_pd(fjy2,ty);
1844 fjz2 = _mm_add_pd(fjz2,tz);
1846 /**************************
1847 * CALCULATE INTERACTIONS *
1848 **************************/
1850 /* COULOMB ELECTROSTATICS */
1851 velec = _mm_mul_pd(qq23,rinv23);
1852 felec = _mm_mul_pd(velec,rinvsq23);
1856 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1858 /* Calculate temporary vectorial force */
1859 tx = _mm_mul_pd(fscal,dx23);
1860 ty = _mm_mul_pd(fscal,dy23);
1861 tz = _mm_mul_pd(fscal,dz23);
1863 /* Update vectorial force */
1864 fix2 = _mm_add_pd(fix2,tx);
1865 fiy2 = _mm_add_pd(fiy2,ty);
1866 fiz2 = _mm_add_pd(fiz2,tz);
1868 fjx3 = _mm_add_pd(fjx3,tx);
1869 fjy3 = _mm_add_pd(fjy3,ty);
1870 fjz3 = _mm_add_pd(fjz3,tz);
1872 /**************************
1873 * CALCULATE INTERACTIONS *
1874 **************************/
1876 /* COULOMB ELECTROSTATICS */
1877 velec = _mm_mul_pd(qq31,rinv31);
1878 felec = _mm_mul_pd(velec,rinvsq31);
1882 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1884 /* Calculate temporary vectorial force */
1885 tx = _mm_mul_pd(fscal,dx31);
1886 ty = _mm_mul_pd(fscal,dy31);
1887 tz = _mm_mul_pd(fscal,dz31);
1889 /* Update vectorial force */
1890 fix3 = _mm_add_pd(fix3,tx);
1891 fiy3 = _mm_add_pd(fiy3,ty);
1892 fiz3 = _mm_add_pd(fiz3,tz);
1894 fjx1 = _mm_add_pd(fjx1,tx);
1895 fjy1 = _mm_add_pd(fjy1,ty);
1896 fjz1 = _mm_add_pd(fjz1,tz);
1898 /**************************
1899 * CALCULATE INTERACTIONS *
1900 **************************/
1902 /* COULOMB ELECTROSTATICS */
1903 velec = _mm_mul_pd(qq32,rinv32);
1904 felec = _mm_mul_pd(velec,rinvsq32);
1908 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1910 /* Calculate temporary vectorial force */
1911 tx = _mm_mul_pd(fscal,dx32);
1912 ty = _mm_mul_pd(fscal,dy32);
1913 tz = _mm_mul_pd(fscal,dz32);
1915 /* Update vectorial force */
1916 fix3 = _mm_add_pd(fix3,tx);
1917 fiy3 = _mm_add_pd(fiy3,ty);
1918 fiz3 = _mm_add_pd(fiz3,tz);
1920 fjx2 = _mm_add_pd(fjx2,tx);
1921 fjy2 = _mm_add_pd(fjy2,ty);
1922 fjz2 = _mm_add_pd(fjz2,tz);
1924 /**************************
1925 * CALCULATE INTERACTIONS *
1926 **************************/
1928 /* COULOMB ELECTROSTATICS */
1929 velec = _mm_mul_pd(qq33,rinv33);
1930 felec = _mm_mul_pd(velec,rinvsq33);
1934 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1936 /* Calculate temporary vectorial force */
1937 tx = _mm_mul_pd(fscal,dx33);
1938 ty = _mm_mul_pd(fscal,dy33);
1939 tz = _mm_mul_pd(fscal,dz33);
1941 /* Update vectorial force */
1942 fix3 = _mm_add_pd(fix3,tx);
1943 fiy3 = _mm_add_pd(fiy3,ty);
1944 fiz3 = _mm_add_pd(fiz3,tz);
1946 fjx3 = _mm_add_pd(fjx3,tx);
1947 fjy3 = _mm_add_pd(fjy3,ty);
1948 fjz3 = _mm_add_pd(fjz3,tz);
1950 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1952 /* Inner loop uses 294 flops */
1955 /* End of innermost loop */
1957 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1958 f+i_coord_offset,fshift+i_shift_offset);
1960 /* Increment number of inner iterations */
1961 inneriter += j_index_end - j_index_start;
1963 /* Outer loop uses 24 flops */
1966 /* Increment number of outer iterations */
1969 /* Update outer/inner flops */
1971 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*294);