2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_double.h"
48 #include "kernelutil_x86_sse2_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_double
52 * Electrostatics interaction: Coulomb
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 int vdwjidx0A,vdwjidx0B;
87 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88 int vdwjidx1A,vdwjidx1B;
89 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B;
91 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
104 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
110 __m128i ifour = _mm_set1_epi32(4);
111 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
113 __m128d dummy_mask,cutoff_mask;
114 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one = _mm_set1_pd(1.0);
116 __m128d two = _mm_set1_pd(2.0);
122 jindex = nlist->jindex;
124 shiftidx = nlist->shift;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm_set1_pd(fr->epsfac);
129 charge = mdatoms->chargeA;
130 nvdwtype = fr->ntype;
132 vdwtype = mdatoms->typeA;
134 vftab = kernel_data->table_vdw->data;
135 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
137 /* Setup water-specific parameters */
138 inr = nlist->iinr[0];
139 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
140 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
141 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
142 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
144 jq0 = _mm_set1_pd(charge[inr+0]);
145 jq1 = _mm_set1_pd(charge[inr+1]);
146 jq2 = _mm_set1_pd(charge[inr+2]);
147 vdwjidx0A = 2*vdwtype[inr+0];
148 qq00 = _mm_mul_pd(iq0,jq0);
149 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
150 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
151 qq01 = _mm_mul_pd(iq0,jq1);
152 qq02 = _mm_mul_pd(iq0,jq2);
153 qq10 = _mm_mul_pd(iq1,jq0);
154 qq11 = _mm_mul_pd(iq1,jq1);
155 qq12 = _mm_mul_pd(iq1,jq2);
156 qq20 = _mm_mul_pd(iq2,jq0);
157 qq21 = _mm_mul_pd(iq2,jq1);
158 qq22 = _mm_mul_pd(iq2,jq2);
160 /* Avoid stupid compiler warnings */
168 /* Start outer loop over neighborlists */
169 for(iidx=0; iidx<nri; iidx++)
171 /* Load shift vector for this list */
172 i_shift_offset = DIM*shiftidx[iidx];
174 /* Load limits for loop over neighbors */
175 j_index_start = jindex[iidx];
176 j_index_end = jindex[iidx+1];
178 /* Get outer coordinate index */
180 i_coord_offset = DIM*inr;
182 /* Load i particle coords and add shift vector */
183 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
184 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
186 fix0 = _mm_setzero_pd();
187 fiy0 = _mm_setzero_pd();
188 fiz0 = _mm_setzero_pd();
189 fix1 = _mm_setzero_pd();
190 fiy1 = _mm_setzero_pd();
191 fiz1 = _mm_setzero_pd();
192 fix2 = _mm_setzero_pd();
193 fiy2 = _mm_setzero_pd();
194 fiz2 = _mm_setzero_pd();
196 /* Reset potential sums */
197 velecsum = _mm_setzero_pd();
198 vvdwsum = _mm_setzero_pd();
200 /* Start inner kernel loop */
201 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
204 /* Get j neighbor index, and coordinate index */
207 j_coord_offsetA = DIM*jnrA;
208 j_coord_offsetB = DIM*jnrB;
210 /* load j atom coordinates */
211 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
212 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
214 /* Calculate displacement vector */
215 dx00 = _mm_sub_pd(ix0,jx0);
216 dy00 = _mm_sub_pd(iy0,jy0);
217 dz00 = _mm_sub_pd(iz0,jz0);
218 dx01 = _mm_sub_pd(ix0,jx1);
219 dy01 = _mm_sub_pd(iy0,jy1);
220 dz01 = _mm_sub_pd(iz0,jz1);
221 dx02 = _mm_sub_pd(ix0,jx2);
222 dy02 = _mm_sub_pd(iy0,jy2);
223 dz02 = _mm_sub_pd(iz0,jz2);
224 dx10 = _mm_sub_pd(ix1,jx0);
225 dy10 = _mm_sub_pd(iy1,jy0);
226 dz10 = _mm_sub_pd(iz1,jz0);
227 dx11 = _mm_sub_pd(ix1,jx1);
228 dy11 = _mm_sub_pd(iy1,jy1);
229 dz11 = _mm_sub_pd(iz1,jz1);
230 dx12 = _mm_sub_pd(ix1,jx2);
231 dy12 = _mm_sub_pd(iy1,jy2);
232 dz12 = _mm_sub_pd(iz1,jz2);
233 dx20 = _mm_sub_pd(ix2,jx0);
234 dy20 = _mm_sub_pd(iy2,jy0);
235 dz20 = _mm_sub_pd(iz2,jz0);
236 dx21 = _mm_sub_pd(ix2,jx1);
237 dy21 = _mm_sub_pd(iy2,jy1);
238 dz21 = _mm_sub_pd(iz2,jz1);
239 dx22 = _mm_sub_pd(ix2,jx2);
240 dy22 = _mm_sub_pd(iy2,jy2);
241 dz22 = _mm_sub_pd(iz2,jz2);
243 /* Calculate squared distance and things based on it */
244 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
245 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
246 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
247 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
248 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
249 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
250 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
251 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
252 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
254 rinv00 = gmx_mm_invsqrt_pd(rsq00);
255 rinv01 = gmx_mm_invsqrt_pd(rsq01);
256 rinv02 = gmx_mm_invsqrt_pd(rsq02);
257 rinv10 = gmx_mm_invsqrt_pd(rsq10);
258 rinv11 = gmx_mm_invsqrt_pd(rsq11);
259 rinv12 = gmx_mm_invsqrt_pd(rsq12);
260 rinv20 = gmx_mm_invsqrt_pd(rsq20);
261 rinv21 = gmx_mm_invsqrt_pd(rsq21);
262 rinv22 = gmx_mm_invsqrt_pd(rsq22);
264 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
265 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
266 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
267 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
268 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
269 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
270 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
271 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
272 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
274 fjx0 = _mm_setzero_pd();
275 fjy0 = _mm_setzero_pd();
276 fjz0 = _mm_setzero_pd();
277 fjx1 = _mm_setzero_pd();
278 fjy1 = _mm_setzero_pd();
279 fjz1 = _mm_setzero_pd();
280 fjx2 = _mm_setzero_pd();
281 fjy2 = _mm_setzero_pd();
282 fjz2 = _mm_setzero_pd();
284 /**************************
285 * CALCULATE INTERACTIONS *
286 **************************/
288 r00 = _mm_mul_pd(rsq00,rinv00);
290 /* Calculate table index by multiplying r with table scale and truncate to integer */
291 rt = _mm_mul_pd(r00,vftabscale);
292 vfitab = _mm_cvttpd_epi32(rt);
293 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
294 vfitab = _mm_slli_epi32(vfitab,3);
296 /* COULOMB ELECTROSTATICS */
297 velec = _mm_mul_pd(qq00,rinv00);
298 felec = _mm_mul_pd(velec,rinvsq00);
300 /* CUBIC SPLINE TABLE DISPERSION */
301 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
302 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
303 GMX_MM_TRANSPOSE2_PD(Y,F);
304 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
305 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
306 GMX_MM_TRANSPOSE2_PD(G,H);
307 Heps = _mm_mul_pd(vfeps,H);
308 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
309 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
310 vvdw6 = _mm_mul_pd(c6_00,VV);
311 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
312 fvdw6 = _mm_mul_pd(c6_00,FF);
314 /* CUBIC SPLINE TABLE REPULSION */
315 vfitab = _mm_add_epi32(vfitab,ifour);
316 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
317 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
318 GMX_MM_TRANSPOSE2_PD(Y,F);
319 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
320 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
321 GMX_MM_TRANSPOSE2_PD(G,H);
322 Heps = _mm_mul_pd(vfeps,H);
323 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
324 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
325 vvdw12 = _mm_mul_pd(c12_00,VV);
326 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
327 fvdw12 = _mm_mul_pd(c12_00,FF);
328 vvdw = _mm_add_pd(vvdw12,vvdw6);
329 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
331 /* Update potential sum for this i atom from the interaction with this j atom. */
332 velecsum = _mm_add_pd(velecsum,velec);
333 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
335 fscal = _mm_add_pd(felec,fvdw);
337 /* Calculate temporary vectorial force */
338 tx = _mm_mul_pd(fscal,dx00);
339 ty = _mm_mul_pd(fscal,dy00);
340 tz = _mm_mul_pd(fscal,dz00);
342 /* Update vectorial force */
343 fix0 = _mm_add_pd(fix0,tx);
344 fiy0 = _mm_add_pd(fiy0,ty);
345 fiz0 = _mm_add_pd(fiz0,tz);
347 fjx0 = _mm_add_pd(fjx0,tx);
348 fjy0 = _mm_add_pd(fjy0,ty);
349 fjz0 = _mm_add_pd(fjz0,tz);
351 /**************************
352 * CALCULATE INTERACTIONS *
353 **************************/
355 /* COULOMB ELECTROSTATICS */
356 velec = _mm_mul_pd(qq01,rinv01);
357 felec = _mm_mul_pd(velec,rinvsq01);
359 /* Update potential sum for this i atom from the interaction with this j atom. */
360 velecsum = _mm_add_pd(velecsum,velec);
364 /* Calculate temporary vectorial force */
365 tx = _mm_mul_pd(fscal,dx01);
366 ty = _mm_mul_pd(fscal,dy01);
367 tz = _mm_mul_pd(fscal,dz01);
369 /* Update vectorial force */
370 fix0 = _mm_add_pd(fix0,tx);
371 fiy0 = _mm_add_pd(fiy0,ty);
372 fiz0 = _mm_add_pd(fiz0,tz);
374 fjx1 = _mm_add_pd(fjx1,tx);
375 fjy1 = _mm_add_pd(fjy1,ty);
376 fjz1 = _mm_add_pd(fjz1,tz);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 /* COULOMB ELECTROSTATICS */
383 velec = _mm_mul_pd(qq02,rinv02);
384 felec = _mm_mul_pd(velec,rinvsq02);
386 /* Update potential sum for this i atom from the interaction with this j atom. */
387 velecsum = _mm_add_pd(velecsum,velec);
391 /* Calculate temporary vectorial force */
392 tx = _mm_mul_pd(fscal,dx02);
393 ty = _mm_mul_pd(fscal,dy02);
394 tz = _mm_mul_pd(fscal,dz02);
396 /* Update vectorial force */
397 fix0 = _mm_add_pd(fix0,tx);
398 fiy0 = _mm_add_pd(fiy0,ty);
399 fiz0 = _mm_add_pd(fiz0,tz);
401 fjx2 = _mm_add_pd(fjx2,tx);
402 fjy2 = _mm_add_pd(fjy2,ty);
403 fjz2 = _mm_add_pd(fjz2,tz);
405 /**************************
406 * CALCULATE INTERACTIONS *
407 **************************/
409 /* COULOMB ELECTROSTATICS */
410 velec = _mm_mul_pd(qq10,rinv10);
411 felec = _mm_mul_pd(velec,rinvsq10);
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velecsum = _mm_add_pd(velecsum,velec);
418 /* Calculate temporary vectorial force */
419 tx = _mm_mul_pd(fscal,dx10);
420 ty = _mm_mul_pd(fscal,dy10);
421 tz = _mm_mul_pd(fscal,dz10);
423 /* Update vectorial force */
424 fix1 = _mm_add_pd(fix1,tx);
425 fiy1 = _mm_add_pd(fiy1,ty);
426 fiz1 = _mm_add_pd(fiz1,tz);
428 fjx0 = _mm_add_pd(fjx0,tx);
429 fjy0 = _mm_add_pd(fjy0,ty);
430 fjz0 = _mm_add_pd(fjz0,tz);
432 /**************************
433 * CALCULATE INTERACTIONS *
434 **************************/
436 /* COULOMB ELECTROSTATICS */
437 velec = _mm_mul_pd(qq11,rinv11);
438 felec = _mm_mul_pd(velec,rinvsq11);
440 /* Update potential sum for this i atom from the interaction with this j atom. */
441 velecsum = _mm_add_pd(velecsum,velec);
445 /* Calculate temporary vectorial force */
446 tx = _mm_mul_pd(fscal,dx11);
447 ty = _mm_mul_pd(fscal,dy11);
448 tz = _mm_mul_pd(fscal,dz11);
450 /* Update vectorial force */
451 fix1 = _mm_add_pd(fix1,tx);
452 fiy1 = _mm_add_pd(fiy1,ty);
453 fiz1 = _mm_add_pd(fiz1,tz);
455 fjx1 = _mm_add_pd(fjx1,tx);
456 fjy1 = _mm_add_pd(fjy1,ty);
457 fjz1 = _mm_add_pd(fjz1,tz);
459 /**************************
460 * CALCULATE INTERACTIONS *
461 **************************/
463 /* COULOMB ELECTROSTATICS */
464 velec = _mm_mul_pd(qq12,rinv12);
465 felec = _mm_mul_pd(velec,rinvsq12);
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velecsum = _mm_add_pd(velecsum,velec);
472 /* Calculate temporary vectorial force */
473 tx = _mm_mul_pd(fscal,dx12);
474 ty = _mm_mul_pd(fscal,dy12);
475 tz = _mm_mul_pd(fscal,dz12);
477 /* Update vectorial force */
478 fix1 = _mm_add_pd(fix1,tx);
479 fiy1 = _mm_add_pd(fiy1,ty);
480 fiz1 = _mm_add_pd(fiz1,tz);
482 fjx2 = _mm_add_pd(fjx2,tx);
483 fjy2 = _mm_add_pd(fjy2,ty);
484 fjz2 = _mm_add_pd(fjz2,tz);
486 /**************************
487 * CALCULATE INTERACTIONS *
488 **************************/
490 /* COULOMB ELECTROSTATICS */
491 velec = _mm_mul_pd(qq20,rinv20);
492 felec = _mm_mul_pd(velec,rinvsq20);
494 /* Update potential sum for this i atom from the interaction with this j atom. */
495 velecsum = _mm_add_pd(velecsum,velec);
499 /* Calculate temporary vectorial force */
500 tx = _mm_mul_pd(fscal,dx20);
501 ty = _mm_mul_pd(fscal,dy20);
502 tz = _mm_mul_pd(fscal,dz20);
504 /* Update vectorial force */
505 fix2 = _mm_add_pd(fix2,tx);
506 fiy2 = _mm_add_pd(fiy2,ty);
507 fiz2 = _mm_add_pd(fiz2,tz);
509 fjx0 = _mm_add_pd(fjx0,tx);
510 fjy0 = _mm_add_pd(fjy0,ty);
511 fjz0 = _mm_add_pd(fjz0,tz);
513 /**************************
514 * CALCULATE INTERACTIONS *
515 **************************/
517 /* COULOMB ELECTROSTATICS */
518 velec = _mm_mul_pd(qq21,rinv21);
519 felec = _mm_mul_pd(velec,rinvsq21);
521 /* Update potential sum for this i atom from the interaction with this j atom. */
522 velecsum = _mm_add_pd(velecsum,velec);
526 /* Calculate temporary vectorial force */
527 tx = _mm_mul_pd(fscal,dx21);
528 ty = _mm_mul_pd(fscal,dy21);
529 tz = _mm_mul_pd(fscal,dz21);
531 /* Update vectorial force */
532 fix2 = _mm_add_pd(fix2,tx);
533 fiy2 = _mm_add_pd(fiy2,ty);
534 fiz2 = _mm_add_pd(fiz2,tz);
536 fjx1 = _mm_add_pd(fjx1,tx);
537 fjy1 = _mm_add_pd(fjy1,ty);
538 fjz1 = _mm_add_pd(fjz1,tz);
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
544 /* COULOMB ELECTROSTATICS */
545 velec = _mm_mul_pd(qq22,rinv22);
546 felec = _mm_mul_pd(velec,rinvsq22);
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velecsum = _mm_add_pd(velecsum,velec);
553 /* Calculate temporary vectorial force */
554 tx = _mm_mul_pd(fscal,dx22);
555 ty = _mm_mul_pd(fscal,dy22);
556 tz = _mm_mul_pd(fscal,dz22);
558 /* Update vectorial force */
559 fix2 = _mm_add_pd(fix2,tx);
560 fiy2 = _mm_add_pd(fiy2,ty);
561 fiz2 = _mm_add_pd(fiz2,tz);
563 fjx2 = _mm_add_pd(fjx2,tx);
564 fjy2 = _mm_add_pd(fjy2,ty);
565 fjz2 = _mm_add_pd(fjz2,tz);
567 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
569 /* Inner loop uses 287 flops */
576 j_coord_offsetA = DIM*jnrA;
578 /* load j atom coordinates */
579 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
580 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
582 /* Calculate displacement vector */
583 dx00 = _mm_sub_pd(ix0,jx0);
584 dy00 = _mm_sub_pd(iy0,jy0);
585 dz00 = _mm_sub_pd(iz0,jz0);
586 dx01 = _mm_sub_pd(ix0,jx1);
587 dy01 = _mm_sub_pd(iy0,jy1);
588 dz01 = _mm_sub_pd(iz0,jz1);
589 dx02 = _mm_sub_pd(ix0,jx2);
590 dy02 = _mm_sub_pd(iy0,jy2);
591 dz02 = _mm_sub_pd(iz0,jz2);
592 dx10 = _mm_sub_pd(ix1,jx0);
593 dy10 = _mm_sub_pd(iy1,jy0);
594 dz10 = _mm_sub_pd(iz1,jz0);
595 dx11 = _mm_sub_pd(ix1,jx1);
596 dy11 = _mm_sub_pd(iy1,jy1);
597 dz11 = _mm_sub_pd(iz1,jz1);
598 dx12 = _mm_sub_pd(ix1,jx2);
599 dy12 = _mm_sub_pd(iy1,jy2);
600 dz12 = _mm_sub_pd(iz1,jz2);
601 dx20 = _mm_sub_pd(ix2,jx0);
602 dy20 = _mm_sub_pd(iy2,jy0);
603 dz20 = _mm_sub_pd(iz2,jz0);
604 dx21 = _mm_sub_pd(ix2,jx1);
605 dy21 = _mm_sub_pd(iy2,jy1);
606 dz21 = _mm_sub_pd(iz2,jz1);
607 dx22 = _mm_sub_pd(ix2,jx2);
608 dy22 = _mm_sub_pd(iy2,jy2);
609 dz22 = _mm_sub_pd(iz2,jz2);
611 /* Calculate squared distance and things based on it */
612 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
613 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
614 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
615 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
616 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
617 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
618 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
619 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
620 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
622 rinv00 = gmx_mm_invsqrt_pd(rsq00);
623 rinv01 = gmx_mm_invsqrt_pd(rsq01);
624 rinv02 = gmx_mm_invsqrt_pd(rsq02);
625 rinv10 = gmx_mm_invsqrt_pd(rsq10);
626 rinv11 = gmx_mm_invsqrt_pd(rsq11);
627 rinv12 = gmx_mm_invsqrt_pd(rsq12);
628 rinv20 = gmx_mm_invsqrt_pd(rsq20);
629 rinv21 = gmx_mm_invsqrt_pd(rsq21);
630 rinv22 = gmx_mm_invsqrt_pd(rsq22);
632 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
633 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
634 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
635 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
636 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
637 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
638 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
639 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
640 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
642 fjx0 = _mm_setzero_pd();
643 fjy0 = _mm_setzero_pd();
644 fjz0 = _mm_setzero_pd();
645 fjx1 = _mm_setzero_pd();
646 fjy1 = _mm_setzero_pd();
647 fjz1 = _mm_setzero_pd();
648 fjx2 = _mm_setzero_pd();
649 fjy2 = _mm_setzero_pd();
650 fjz2 = _mm_setzero_pd();
652 /**************************
653 * CALCULATE INTERACTIONS *
654 **************************/
656 r00 = _mm_mul_pd(rsq00,rinv00);
658 /* Calculate table index by multiplying r with table scale and truncate to integer */
659 rt = _mm_mul_pd(r00,vftabscale);
660 vfitab = _mm_cvttpd_epi32(rt);
661 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
662 vfitab = _mm_slli_epi32(vfitab,3);
664 /* COULOMB ELECTROSTATICS */
665 velec = _mm_mul_pd(qq00,rinv00);
666 felec = _mm_mul_pd(velec,rinvsq00);
668 /* CUBIC SPLINE TABLE DISPERSION */
669 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
670 F = _mm_setzero_pd();
671 GMX_MM_TRANSPOSE2_PD(Y,F);
672 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
673 H = _mm_setzero_pd();
674 GMX_MM_TRANSPOSE2_PD(G,H);
675 Heps = _mm_mul_pd(vfeps,H);
676 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
677 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
678 vvdw6 = _mm_mul_pd(c6_00,VV);
679 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
680 fvdw6 = _mm_mul_pd(c6_00,FF);
682 /* CUBIC SPLINE TABLE REPULSION */
683 vfitab = _mm_add_epi32(vfitab,ifour);
684 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
685 F = _mm_setzero_pd();
686 GMX_MM_TRANSPOSE2_PD(Y,F);
687 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
688 H = _mm_setzero_pd();
689 GMX_MM_TRANSPOSE2_PD(G,H);
690 Heps = _mm_mul_pd(vfeps,H);
691 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
692 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
693 vvdw12 = _mm_mul_pd(c12_00,VV);
694 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
695 fvdw12 = _mm_mul_pd(c12_00,FF);
696 vvdw = _mm_add_pd(vvdw12,vvdw6);
697 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
699 /* Update potential sum for this i atom from the interaction with this j atom. */
700 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
701 velecsum = _mm_add_pd(velecsum,velec);
702 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
703 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
705 fscal = _mm_add_pd(felec,fvdw);
707 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
709 /* Calculate temporary vectorial force */
710 tx = _mm_mul_pd(fscal,dx00);
711 ty = _mm_mul_pd(fscal,dy00);
712 tz = _mm_mul_pd(fscal,dz00);
714 /* Update vectorial force */
715 fix0 = _mm_add_pd(fix0,tx);
716 fiy0 = _mm_add_pd(fiy0,ty);
717 fiz0 = _mm_add_pd(fiz0,tz);
719 fjx0 = _mm_add_pd(fjx0,tx);
720 fjy0 = _mm_add_pd(fjy0,ty);
721 fjz0 = _mm_add_pd(fjz0,tz);
723 /**************************
724 * CALCULATE INTERACTIONS *
725 **************************/
727 /* COULOMB ELECTROSTATICS */
728 velec = _mm_mul_pd(qq01,rinv01);
729 felec = _mm_mul_pd(velec,rinvsq01);
731 /* Update potential sum for this i atom from the interaction with this j atom. */
732 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
733 velecsum = _mm_add_pd(velecsum,velec);
737 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
739 /* Calculate temporary vectorial force */
740 tx = _mm_mul_pd(fscal,dx01);
741 ty = _mm_mul_pd(fscal,dy01);
742 tz = _mm_mul_pd(fscal,dz01);
744 /* Update vectorial force */
745 fix0 = _mm_add_pd(fix0,tx);
746 fiy0 = _mm_add_pd(fiy0,ty);
747 fiz0 = _mm_add_pd(fiz0,tz);
749 fjx1 = _mm_add_pd(fjx1,tx);
750 fjy1 = _mm_add_pd(fjy1,ty);
751 fjz1 = _mm_add_pd(fjz1,tz);
753 /**************************
754 * CALCULATE INTERACTIONS *
755 **************************/
757 /* COULOMB ELECTROSTATICS */
758 velec = _mm_mul_pd(qq02,rinv02);
759 felec = _mm_mul_pd(velec,rinvsq02);
761 /* Update potential sum for this i atom from the interaction with this j atom. */
762 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
763 velecsum = _mm_add_pd(velecsum,velec);
767 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
769 /* Calculate temporary vectorial force */
770 tx = _mm_mul_pd(fscal,dx02);
771 ty = _mm_mul_pd(fscal,dy02);
772 tz = _mm_mul_pd(fscal,dz02);
774 /* Update vectorial force */
775 fix0 = _mm_add_pd(fix0,tx);
776 fiy0 = _mm_add_pd(fiy0,ty);
777 fiz0 = _mm_add_pd(fiz0,tz);
779 fjx2 = _mm_add_pd(fjx2,tx);
780 fjy2 = _mm_add_pd(fjy2,ty);
781 fjz2 = _mm_add_pd(fjz2,tz);
783 /**************************
784 * CALCULATE INTERACTIONS *
785 **************************/
787 /* COULOMB ELECTROSTATICS */
788 velec = _mm_mul_pd(qq10,rinv10);
789 felec = _mm_mul_pd(velec,rinvsq10);
791 /* Update potential sum for this i atom from the interaction with this j atom. */
792 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
793 velecsum = _mm_add_pd(velecsum,velec);
797 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
799 /* Calculate temporary vectorial force */
800 tx = _mm_mul_pd(fscal,dx10);
801 ty = _mm_mul_pd(fscal,dy10);
802 tz = _mm_mul_pd(fscal,dz10);
804 /* Update vectorial force */
805 fix1 = _mm_add_pd(fix1,tx);
806 fiy1 = _mm_add_pd(fiy1,ty);
807 fiz1 = _mm_add_pd(fiz1,tz);
809 fjx0 = _mm_add_pd(fjx0,tx);
810 fjy0 = _mm_add_pd(fjy0,ty);
811 fjz0 = _mm_add_pd(fjz0,tz);
813 /**************************
814 * CALCULATE INTERACTIONS *
815 **************************/
817 /* COULOMB ELECTROSTATICS */
818 velec = _mm_mul_pd(qq11,rinv11);
819 felec = _mm_mul_pd(velec,rinvsq11);
821 /* Update potential sum for this i atom from the interaction with this j atom. */
822 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
823 velecsum = _mm_add_pd(velecsum,velec);
827 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
829 /* Calculate temporary vectorial force */
830 tx = _mm_mul_pd(fscal,dx11);
831 ty = _mm_mul_pd(fscal,dy11);
832 tz = _mm_mul_pd(fscal,dz11);
834 /* Update vectorial force */
835 fix1 = _mm_add_pd(fix1,tx);
836 fiy1 = _mm_add_pd(fiy1,ty);
837 fiz1 = _mm_add_pd(fiz1,tz);
839 fjx1 = _mm_add_pd(fjx1,tx);
840 fjy1 = _mm_add_pd(fjy1,ty);
841 fjz1 = _mm_add_pd(fjz1,tz);
843 /**************************
844 * CALCULATE INTERACTIONS *
845 **************************/
847 /* COULOMB ELECTROSTATICS */
848 velec = _mm_mul_pd(qq12,rinv12);
849 felec = _mm_mul_pd(velec,rinvsq12);
851 /* Update potential sum for this i atom from the interaction with this j atom. */
852 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
853 velecsum = _mm_add_pd(velecsum,velec);
857 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
859 /* Calculate temporary vectorial force */
860 tx = _mm_mul_pd(fscal,dx12);
861 ty = _mm_mul_pd(fscal,dy12);
862 tz = _mm_mul_pd(fscal,dz12);
864 /* Update vectorial force */
865 fix1 = _mm_add_pd(fix1,tx);
866 fiy1 = _mm_add_pd(fiy1,ty);
867 fiz1 = _mm_add_pd(fiz1,tz);
869 fjx2 = _mm_add_pd(fjx2,tx);
870 fjy2 = _mm_add_pd(fjy2,ty);
871 fjz2 = _mm_add_pd(fjz2,tz);
873 /**************************
874 * CALCULATE INTERACTIONS *
875 **************************/
877 /* COULOMB ELECTROSTATICS */
878 velec = _mm_mul_pd(qq20,rinv20);
879 felec = _mm_mul_pd(velec,rinvsq20);
881 /* Update potential sum for this i atom from the interaction with this j atom. */
882 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
883 velecsum = _mm_add_pd(velecsum,velec);
887 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
889 /* Calculate temporary vectorial force */
890 tx = _mm_mul_pd(fscal,dx20);
891 ty = _mm_mul_pd(fscal,dy20);
892 tz = _mm_mul_pd(fscal,dz20);
894 /* Update vectorial force */
895 fix2 = _mm_add_pd(fix2,tx);
896 fiy2 = _mm_add_pd(fiy2,ty);
897 fiz2 = _mm_add_pd(fiz2,tz);
899 fjx0 = _mm_add_pd(fjx0,tx);
900 fjy0 = _mm_add_pd(fjy0,ty);
901 fjz0 = _mm_add_pd(fjz0,tz);
903 /**************************
904 * CALCULATE INTERACTIONS *
905 **************************/
907 /* COULOMB ELECTROSTATICS */
908 velec = _mm_mul_pd(qq21,rinv21);
909 felec = _mm_mul_pd(velec,rinvsq21);
911 /* Update potential sum for this i atom from the interaction with this j atom. */
912 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
913 velecsum = _mm_add_pd(velecsum,velec);
917 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
919 /* Calculate temporary vectorial force */
920 tx = _mm_mul_pd(fscal,dx21);
921 ty = _mm_mul_pd(fscal,dy21);
922 tz = _mm_mul_pd(fscal,dz21);
924 /* Update vectorial force */
925 fix2 = _mm_add_pd(fix2,tx);
926 fiy2 = _mm_add_pd(fiy2,ty);
927 fiz2 = _mm_add_pd(fiz2,tz);
929 fjx1 = _mm_add_pd(fjx1,tx);
930 fjy1 = _mm_add_pd(fjy1,ty);
931 fjz1 = _mm_add_pd(fjz1,tz);
933 /**************************
934 * CALCULATE INTERACTIONS *
935 **************************/
937 /* COULOMB ELECTROSTATICS */
938 velec = _mm_mul_pd(qq22,rinv22);
939 felec = _mm_mul_pd(velec,rinvsq22);
941 /* Update potential sum for this i atom from the interaction with this j atom. */
942 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
943 velecsum = _mm_add_pd(velecsum,velec);
947 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
949 /* Calculate temporary vectorial force */
950 tx = _mm_mul_pd(fscal,dx22);
951 ty = _mm_mul_pd(fscal,dy22);
952 tz = _mm_mul_pd(fscal,dz22);
954 /* Update vectorial force */
955 fix2 = _mm_add_pd(fix2,tx);
956 fiy2 = _mm_add_pd(fiy2,ty);
957 fiz2 = _mm_add_pd(fiz2,tz);
959 fjx2 = _mm_add_pd(fjx2,tx);
960 fjy2 = _mm_add_pd(fjy2,ty);
961 fjz2 = _mm_add_pd(fjz2,tz);
963 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
965 /* Inner loop uses 287 flops */
968 /* End of innermost loop */
970 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
971 f+i_coord_offset,fshift+i_shift_offset);
974 /* Update potential energies */
975 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
976 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
978 /* Increment number of inner iterations */
979 inneriter += j_index_end - j_index_start;
981 /* Outer loop uses 20 flops */
984 /* Increment number of outer iterations */
987 /* Update outer/inner flops */
989 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*287);
992 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_double
993 * Electrostatics interaction: Coulomb
994 * VdW interaction: CubicSplineTable
995 * Geometry: Water3-Water3
996 * Calculate force/pot: Force
999 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_double
1000 (t_nblist * gmx_restrict nlist,
1001 rvec * gmx_restrict xx,
1002 rvec * gmx_restrict ff,
1003 t_forcerec * gmx_restrict fr,
1004 t_mdatoms * gmx_restrict mdatoms,
1005 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1006 t_nrnb * gmx_restrict nrnb)
1008 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1009 * just 0 for non-waters.
1010 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1011 * jnr indices corresponding to data put in the four positions in the SIMD register.
1013 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1014 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1016 int j_coord_offsetA,j_coord_offsetB;
1017 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1018 real rcutoff_scalar;
1019 real *shiftvec,*fshift,*x,*f;
1020 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1022 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1024 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1026 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1027 int vdwjidx0A,vdwjidx0B;
1028 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1029 int vdwjidx1A,vdwjidx1B;
1030 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1031 int vdwjidx2A,vdwjidx2B;
1032 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1033 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1034 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1035 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1036 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1037 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1038 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1039 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1040 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1041 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1042 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1045 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1048 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1049 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1051 __m128i ifour = _mm_set1_epi32(4);
1052 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1054 __m128d dummy_mask,cutoff_mask;
1055 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1056 __m128d one = _mm_set1_pd(1.0);
1057 __m128d two = _mm_set1_pd(2.0);
1063 jindex = nlist->jindex;
1065 shiftidx = nlist->shift;
1067 shiftvec = fr->shift_vec[0];
1068 fshift = fr->fshift[0];
1069 facel = _mm_set1_pd(fr->epsfac);
1070 charge = mdatoms->chargeA;
1071 nvdwtype = fr->ntype;
1072 vdwparam = fr->nbfp;
1073 vdwtype = mdatoms->typeA;
1075 vftab = kernel_data->table_vdw->data;
1076 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1078 /* Setup water-specific parameters */
1079 inr = nlist->iinr[0];
1080 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1081 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1082 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1083 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1085 jq0 = _mm_set1_pd(charge[inr+0]);
1086 jq1 = _mm_set1_pd(charge[inr+1]);
1087 jq2 = _mm_set1_pd(charge[inr+2]);
1088 vdwjidx0A = 2*vdwtype[inr+0];
1089 qq00 = _mm_mul_pd(iq0,jq0);
1090 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1091 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1092 qq01 = _mm_mul_pd(iq0,jq1);
1093 qq02 = _mm_mul_pd(iq0,jq2);
1094 qq10 = _mm_mul_pd(iq1,jq0);
1095 qq11 = _mm_mul_pd(iq1,jq1);
1096 qq12 = _mm_mul_pd(iq1,jq2);
1097 qq20 = _mm_mul_pd(iq2,jq0);
1098 qq21 = _mm_mul_pd(iq2,jq1);
1099 qq22 = _mm_mul_pd(iq2,jq2);
1101 /* Avoid stupid compiler warnings */
1103 j_coord_offsetA = 0;
1104 j_coord_offsetB = 0;
1109 /* Start outer loop over neighborlists */
1110 for(iidx=0; iidx<nri; iidx++)
1112 /* Load shift vector for this list */
1113 i_shift_offset = DIM*shiftidx[iidx];
1115 /* Load limits for loop over neighbors */
1116 j_index_start = jindex[iidx];
1117 j_index_end = jindex[iidx+1];
1119 /* Get outer coordinate index */
1121 i_coord_offset = DIM*inr;
1123 /* Load i particle coords and add shift vector */
1124 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1125 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1127 fix0 = _mm_setzero_pd();
1128 fiy0 = _mm_setzero_pd();
1129 fiz0 = _mm_setzero_pd();
1130 fix1 = _mm_setzero_pd();
1131 fiy1 = _mm_setzero_pd();
1132 fiz1 = _mm_setzero_pd();
1133 fix2 = _mm_setzero_pd();
1134 fiy2 = _mm_setzero_pd();
1135 fiz2 = _mm_setzero_pd();
1137 /* Start inner kernel loop */
1138 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1141 /* Get j neighbor index, and coordinate index */
1143 jnrB = jjnr[jidx+1];
1144 j_coord_offsetA = DIM*jnrA;
1145 j_coord_offsetB = DIM*jnrB;
1147 /* load j atom coordinates */
1148 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1149 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1151 /* Calculate displacement vector */
1152 dx00 = _mm_sub_pd(ix0,jx0);
1153 dy00 = _mm_sub_pd(iy0,jy0);
1154 dz00 = _mm_sub_pd(iz0,jz0);
1155 dx01 = _mm_sub_pd(ix0,jx1);
1156 dy01 = _mm_sub_pd(iy0,jy1);
1157 dz01 = _mm_sub_pd(iz0,jz1);
1158 dx02 = _mm_sub_pd(ix0,jx2);
1159 dy02 = _mm_sub_pd(iy0,jy2);
1160 dz02 = _mm_sub_pd(iz0,jz2);
1161 dx10 = _mm_sub_pd(ix1,jx0);
1162 dy10 = _mm_sub_pd(iy1,jy0);
1163 dz10 = _mm_sub_pd(iz1,jz0);
1164 dx11 = _mm_sub_pd(ix1,jx1);
1165 dy11 = _mm_sub_pd(iy1,jy1);
1166 dz11 = _mm_sub_pd(iz1,jz1);
1167 dx12 = _mm_sub_pd(ix1,jx2);
1168 dy12 = _mm_sub_pd(iy1,jy2);
1169 dz12 = _mm_sub_pd(iz1,jz2);
1170 dx20 = _mm_sub_pd(ix2,jx0);
1171 dy20 = _mm_sub_pd(iy2,jy0);
1172 dz20 = _mm_sub_pd(iz2,jz0);
1173 dx21 = _mm_sub_pd(ix2,jx1);
1174 dy21 = _mm_sub_pd(iy2,jy1);
1175 dz21 = _mm_sub_pd(iz2,jz1);
1176 dx22 = _mm_sub_pd(ix2,jx2);
1177 dy22 = _mm_sub_pd(iy2,jy2);
1178 dz22 = _mm_sub_pd(iz2,jz2);
1180 /* Calculate squared distance and things based on it */
1181 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1182 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1183 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1184 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1185 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1186 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1187 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1188 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1189 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1191 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1192 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1193 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1194 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1195 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1196 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1197 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1198 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1199 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1201 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1202 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1203 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1204 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1205 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1206 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1207 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1208 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1209 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1211 fjx0 = _mm_setzero_pd();
1212 fjy0 = _mm_setzero_pd();
1213 fjz0 = _mm_setzero_pd();
1214 fjx1 = _mm_setzero_pd();
1215 fjy1 = _mm_setzero_pd();
1216 fjz1 = _mm_setzero_pd();
1217 fjx2 = _mm_setzero_pd();
1218 fjy2 = _mm_setzero_pd();
1219 fjz2 = _mm_setzero_pd();
1221 /**************************
1222 * CALCULATE INTERACTIONS *
1223 **************************/
1225 r00 = _mm_mul_pd(rsq00,rinv00);
1227 /* Calculate table index by multiplying r with table scale and truncate to integer */
1228 rt = _mm_mul_pd(r00,vftabscale);
1229 vfitab = _mm_cvttpd_epi32(rt);
1230 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1231 vfitab = _mm_slli_epi32(vfitab,3);
1233 /* COULOMB ELECTROSTATICS */
1234 velec = _mm_mul_pd(qq00,rinv00);
1235 felec = _mm_mul_pd(velec,rinvsq00);
1237 /* CUBIC SPLINE TABLE DISPERSION */
1238 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1239 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1240 GMX_MM_TRANSPOSE2_PD(Y,F);
1241 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1242 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1243 GMX_MM_TRANSPOSE2_PD(G,H);
1244 Heps = _mm_mul_pd(vfeps,H);
1245 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1246 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1247 fvdw6 = _mm_mul_pd(c6_00,FF);
1249 /* CUBIC SPLINE TABLE REPULSION */
1250 vfitab = _mm_add_epi32(vfitab,ifour);
1251 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1252 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1253 GMX_MM_TRANSPOSE2_PD(Y,F);
1254 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1255 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1256 GMX_MM_TRANSPOSE2_PD(G,H);
1257 Heps = _mm_mul_pd(vfeps,H);
1258 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1259 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1260 fvdw12 = _mm_mul_pd(c12_00,FF);
1261 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1263 fscal = _mm_add_pd(felec,fvdw);
1265 /* Calculate temporary vectorial force */
1266 tx = _mm_mul_pd(fscal,dx00);
1267 ty = _mm_mul_pd(fscal,dy00);
1268 tz = _mm_mul_pd(fscal,dz00);
1270 /* Update vectorial force */
1271 fix0 = _mm_add_pd(fix0,tx);
1272 fiy0 = _mm_add_pd(fiy0,ty);
1273 fiz0 = _mm_add_pd(fiz0,tz);
1275 fjx0 = _mm_add_pd(fjx0,tx);
1276 fjy0 = _mm_add_pd(fjy0,ty);
1277 fjz0 = _mm_add_pd(fjz0,tz);
1279 /**************************
1280 * CALCULATE INTERACTIONS *
1281 **************************/
1283 /* COULOMB ELECTROSTATICS */
1284 velec = _mm_mul_pd(qq01,rinv01);
1285 felec = _mm_mul_pd(velec,rinvsq01);
1289 /* Calculate temporary vectorial force */
1290 tx = _mm_mul_pd(fscal,dx01);
1291 ty = _mm_mul_pd(fscal,dy01);
1292 tz = _mm_mul_pd(fscal,dz01);
1294 /* Update vectorial force */
1295 fix0 = _mm_add_pd(fix0,tx);
1296 fiy0 = _mm_add_pd(fiy0,ty);
1297 fiz0 = _mm_add_pd(fiz0,tz);
1299 fjx1 = _mm_add_pd(fjx1,tx);
1300 fjy1 = _mm_add_pd(fjy1,ty);
1301 fjz1 = _mm_add_pd(fjz1,tz);
1303 /**************************
1304 * CALCULATE INTERACTIONS *
1305 **************************/
1307 /* COULOMB ELECTROSTATICS */
1308 velec = _mm_mul_pd(qq02,rinv02);
1309 felec = _mm_mul_pd(velec,rinvsq02);
1313 /* Calculate temporary vectorial force */
1314 tx = _mm_mul_pd(fscal,dx02);
1315 ty = _mm_mul_pd(fscal,dy02);
1316 tz = _mm_mul_pd(fscal,dz02);
1318 /* Update vectorial force */
1319 fix0 = _mm_add_pd(fix0,tx);
1320 fiy0 = _mm_add_pd(fiy0,ty);
1321 fiz0 = _mm_add_pd(fiz0,tz);
1323 fjx2 = _mm_add_pd(fjx2,tx);
1324 fjy2 = _mm_add_pd(fjy2,ty);
1325 fjz2 = _mm_add_pd(fjz2,tz);
1327 /**************************
1328 * CALCULATE INTERACTIONS *
1329 **************************/
1331 /* COULOMB ELECTROSTATICS */
1332 velec = _mm_mul_pd(qq10,rinv10);
1333 felec = _mm_mul_pd(velec,rinvsq10);
1337 /* Calculate temporary vectorial force */
1338 tx = _mm_mul_pd(fscal,dx10);
1339 ty = _mm_mul_pd(fscal,dy10);
1340 tz = _mm_mul_pd(fscal,dz10);
1342 /* Update vectorial force */
1343 fix1 = _mm_add_pd(fix1,tx);
1344 fiy1 = _mm_add_pd(fiy1,ty);
1345 fiz1 = _mm_add_pd(fiz1,tz);
1347 fjx0 = _mm_add_pd(fjx0,tx);
1348 fjy0 = _mm_add_pd(fjy0,ty);
1349 fjz0 = _mm_add_pd(fjz0,tz);
1351 /**************************
1352 * CALCULATE INTERACTIONS *
1353 **************************/
1355 /* COULOMB ELECTROSTATICS */
1356 velec = _mm_mul_pd(qq11,rinv11);
1357 felec = _mm_mul_pd(velec,rinvsq11);
1361 /* Calculate temporary vectorial force */
1362 tx = _mm_mul_pd(fscal,dx11);
1363 ty = _mm_mul_pd(fscal,dy11);
1364 tz = _mm_mul_pd(fscal,dz11);
1366 /* Update vectorial force */
1367 fix1 = _mm_add_pd(fix1,tx);
1368 fiy1 = _mm_add_pd(fiy1,ty);
1369 fiz1 = _mm_add_pd(fiz1,tz);
1371 fjx1 = _mm_add_pd(fjx1,tx);
1372 fjy1 = _mm_add_pd(fjy1,ty);
1373 fjz1 = _mm_add_pd(fjz1,tz);
1375 /**************************
1376 * CALCULATE INTERACTIONS *
1377 **************************/
1379 /* COULOMB ELECTROSTATICS */
1380 velec = _mm_mul_pd(qq12,rinv12);
1381 felec = _mm_mul_pd(velec,rinvsq12);
1385 /* Calculate temporary vectorial force */
1386 tx = _mm_mul_pd(fscal,dx12);
1387 ty = _mm_mul_pd(fscal,dy12);
1388 tz = _mm_mul_pd(fscal,dz12);
1390 /* Update vectorial force */
1391 fix1 = _mm_add_pd(fix1,tx);
1392 fiy1 = _mm_add_pd(fiy1,ty);
1393 fiz1 = _mm_add_pd(fiz1,tz);
1395 fjx2 = _mm_add_pd(fjx2,tx);
1396 fjy2 = _mm_add_pd(fjy2,ty);
1397 fjz2 = _mm_add_pd(fjz2,tz);
1399 /**************************
1400 * CALCULATE INTERACTIONS *
1401 **************************/
1403 /* COULOMB ELECTROSTATICS */
1404 velec = _mm_mul_pd(qq20,rinv20);
1405 felec = _mm_mul_pd(velec,rinvsq20);
1409 /* Calculate temporary vectorial force */
1410 tx = _mm_mul_pd(fscal,dx20);
1411 ty = _mm_mul_pd(fscal,dy20);
1412 tz = _mm_mul_pd(fscal,dz20);
1414 /* Update vectorial force */
1415 fix2 = _mm_add_pd(fix2,tx);
1416 fiy2 = _mm_add_pd(fiy2,ty);
1417 fiz2 = _mm_add_pd(fiz2,tz);
1419 fjx0 = _mm_add_pd(fjx0,tx);
1420 fjy0 = _mm_add_pd(fjy0,ty);
1421 fjz0 = _mm_add_pd(fjz0,tz);
1423 /**************************
1424 * CALCULATE INTERACTIONS *
1425 **************************/
1427 /* COULOMB ELECTROSTATICS */
1428 velec = _mm_mul_pd(qq21,rinv21);
1429 felec = _mm_mul_pd(velec,rinvsq21);
1433 /* Calculate temporary vectorial force */
1434 tx = _mm_mul_pd(fscal,dx21);
1435 ty = _mm_mul_pd(fscal,dy21);
1436 tz = _mm_mul_pd(fscal,dz21);
1438 /* Update vectorial force */
1439 fix2 = _mm_add_pd(fix2,tx);
1440 fiy2 = _mm_add_pd(fiy2,ty);
1441 fiz2 = _mm_add_pd(fiz2,tz);
1443 fjx1 = _mm_add_pd(fjx1,tx);
1444 fjy1 = _mm_add_pd(fjy1,ty);
1445 fjz1 = _mm_add_pd(fjz1,tz);
1447 /**************************
1448 * CALCULATE INTERACTIONS *
1449 **************************/
1451 /* COULOMB ELECTROSTATICS */
1452 velec = _mm_mul_pd(qq22,rinv22);
1453 felec = _mm_mul_pd(velec,rinvsq22);
1457 /* Calculate temporary vectorial force */
1458 tx = _mm_mul_pd(fscal,dx22);
1459 ty = _mm_mul_pd(fscal,dy22);
1460 tz = _mm_mul_pd(fscal,dz22);
1462 /* Update vectorial force */
1463 fix2 = _mm_add_pd(fix2,tx);
1464 fiy2 = _mm_add_pd(fiy2,ty);
1465 fiz2 = _mm_add_pd(fiz2,tz);
1467 fjx2 = _mm_add_pd(fjx2,tx);
1468 fjy2 = _mm_add_pd(fjy2,ty);
1469 fjz2 = _mm_add_pd(fjz2,tz);
1471 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1473 /* Inner loop uses 270 flops */
1476 if(jidx<j_index_end)
1480 j_coord_offsetA = DIM*jnrA;
1482 /* load j atom coordinates */
1483 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1484 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1486 /* Calculate displacement vector */
1487 dx00 = _mm_sub_pd(ix0,jx0);
1488 dy00 = _mm_sub_pd(iy0,jy0);
1489 dz00 = _mm_sub_pd(iz0,jz0);
1490 dx01 = _mm_sub_pd(ix0,jx1);
1491 dy01 = _mm_sub_pd(iy0,jy1);
1492 dz01 = _mm_sub_pd(iz0,jz1);
1493 dx02 = _mm_sub_pd(ix0,jx2);
1494 dy02 = _mm_sub_pd(iy0,jy2);
1495 dz02 = _mm_sub_pd(iz0,jz2);
1496 dx10 = _mm_sub_pd(ix1,jx0);
1497 dy10 = _mm_sub_pd(iy1,jy0);
1498 dz10 = _mm_sub_pd(iz1,jz0);
1499 dx11 = _mm_sub_pd(ix1,jx1);
1500 dy11 = _mm_sub_pd(iy1,jy1);
1501 dz11 = _mm_sub_pd(iz1,jz1);
1502 dx12 = _mm_sub_pd(ix1,jx2);
1503 dy12 = _mm_sub_pd(iy1,jy2);
1504 dz12 = _mm_sub_pd(iz1,jz2);
1505 dx20 = _mm_sub_pd(ix2,jx0);
1506 dy20 = _mm_sub_pd(iy2,jy0);
1507 dz20 = _mm_sub_pd(iz2,jz0);
1508 dx21 = _mm_sub_pd(ix2,jx1);
1509 dy21 = _mm_sub_pd(iy2,jy1);
1510 dz21 = _mm_sub_pd(iz2,jz1);
1511 dx22 = _mm_sub_pd(ix2,jx2);
1512 dy22 = _mm_sub_pd(iy2,jy2);
1513 dz22 = _mm_sub_pd(iz2,jz2);
1515 /* Calculate squared distance and things based on it */
1516 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1517 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1518 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1519 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1520 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1521 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1522 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1523 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1524 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1526 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1527 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1528 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1529 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1530 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1531 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1532 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1533 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1534 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1536 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1537 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1538 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1539 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1540 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1541 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1542 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1543 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1544 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1546 fjx0 = _mm_setzero_pd();
1547 fjy0 = _mm_setzero_pd();
1548 fjz0 = _mm_setzero_pd();
1549 fjx1 = _mm_setzero_pd();
1550 fjy1 = _mm_setzero_pd();
1551 fjz1 = _mm_setzero_pd();
1552 fjx2 = _mm_setzero_pd();
1553 fjy2 = _mm_setzero_pd();
1554 fjz2 = _mm_setzero_pd();
1556 /**************************
1557 * CALCULATE INTERACTIONS *
1558 **************************/
1560 r00 = _mm_mul_pd(rsq00,rinv00);
1562 /* Calculate table index by multiplying r with table scale and truncate to integer */
1563 rt = _mm_mul_pd(r00,vftabscale);
1564 vfitab = _mm_cvttpd_epi32(rt);
1565 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1566 vfitab = _mm_slli_epi32(vfitab,3);
1568 /* COULOMB ELECTROSTATICS */
1569 velec = _mm_mul_pd(qq00,rinv00);
1570 felec = _mm_mul_pd(velec,rinvsq00);
1572 /* CUBIC SPLINE TABLE DISPERSION */
1573 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1574 F = _mm_setzero_pd();
1575 GMX_MM_TRANSPOSE2_PD(Y,F);
1576 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1577 H = _mm_setzero_pd();
1578 GMX_MM_TRANSPOSE2_PD(G,H);
1579 Heps = _mm_mul_pd(vfeps,H);
1580 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1581 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1582 fvdw6 = _mm_mul_pd(c6_00,FF);
1584 /* CUBIC SPLINE TABLE REPULSION */
1585 vfitab = _mm_add_epi32(vfitab,ifour);
1586 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1587 F = _mm_setzero_pd();
1588 GMX_MM_TRANSPOSE2_PD(Y,F);
1589 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1590 H = _mm_setzero_pd();
1591 GMX_MM_TRANSPOSE2_PD(G,H);
1592 Heps = _mm_mul_pd(vfeps,H);
1593 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1594 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1595 fvdw12 = _mm_mul_pd(c12_00,FF);
1596 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1598 fscal = _mm_add_pd(felec,fvdw);
1600 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1602 /* Calculate temporary vectorial force */
1603 tx = _mm_mul_pd(fscal,dx00);
1604 ty = _mm_mul_pd(fscal,dy00);
1605 tz = _mm_mul_pd(fscal,dz00);
1607 /* Update vectorial force */
1608 fix0 = _mm_add_pd(fix0,tx);
1609 fiy0 = _mm_add_pd(fiy0,ty);
1610 fiz0 = _mm_add_pd(fiz0,tz);
1612 fjx0 = _mm_add_pd(fjx0,tx);
1613 fjy0 = _mm_add_pd(fjy0,ty);
1614 fjz0 = _mm_add_pd(fjz0,tz);
1616 /**************************
1617 * CALCULATE INTERACTIONS *
1618 **************************/
1620 /* COULOMB ELECTROSTATICS */
1621 velec = _mm_mul_pd(qq01,rinv01);
1622 felec = _mm_mul_pd(velec,rinvsq01);
1626 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1628 /* Calculate temporary vectorial force */
1629 tx = _mm_mul_pd(fscal,dx01);
1630 ty = _mm_mul_pd(fscal,dy01);
1631 tz = _mm_mul_pd(fscal,dz01);
1633 /* Update vectorial force */
1634 fix0 = _mm_add_pd(fix0,tx);
1635 fiy0 = _mm_add_pd(fiy0,ty);
1636 fiz0 = _mm_add_pd(fiz0,tz);
1638 fjx1 = _mm_add_pd(fjx1,tx);
1639 fjy1 = _mm_add_pd(fjy1,ty);
1640 fjz1 = _mm_add_pd(fjz1,tz);
1642 /**************************
1643 * CALCULATE INTERACTIONS *
1644 **************************/
1646 /* COULOMB ELECTROSTATICS */
1647 velec = _mm_mul_pd(qq02,rinv02);
1648 felec = _mm_mul_pd(velec,rinvsq02);
1652 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1654 /* Calculate temporary vectorial force */
1655 tx = _mm_mul_pd(fscal,dx02);
1656 ty = _mm_mul_pd(fscal,dy02);
1657 tz = _mm_mul_pd(fscal,dz02);
1659 /* Update vectorial force */
1660 fix0 = _mm_add_pd(fix0,tx);
1661 fiy0 = _mm_add_pd(fiy0,ty);
1662 fiz0 = _mm_add_pd(fiz0,tz);
1664 fjx2 = _mm_add_pd(fjx2,tx);
1665 fjy2 = _mm_add_pd(fjy2,ty);
1666 fjz2 = _mm_add_pd(fjz2,tz);
1668 /**************************
1669 * CALCULATE INTERACTIONS *
1670 **************************/
1672 /* COULOMB ELECTROSTATICS */
1673 velec = _mm_mul_pd(qq10,rinv10);
1674 felec = _mm_mul_pd(velec,rinvsq10);
1678 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1680 /* Calculate temporary vectorial force */
1681 tx = _mm_mul_pd(fscal,dx10);
1682 ty = _mm_mul_pd(fscal,dy10);
1683 tz = _mm_mul_pd(fscal,dz10);
1685 /* Update vectorial force */
1686 fix1 = _mm_add_pd(fix1,tx);
1687 fiy1 = _mm_add_pd(fiy1,ty);
1688 fiz1 = _mm_add_pd(fiz1,tz);
1690 fjx0 = _mm_add_pd(fjx0,tx);
1691 fjy0 = _mm_add_pd(fjy0,ty);
1692 fjz0 = _mm_add_pd(fjz0,tz);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 /* COULOMB ELECTROSTATICS */
1699 velec = _mm_mul_pd(qq11,rinv11);
1700 felec = _mm_mul_pd(velec,rinvsq11);
1704 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1706 /* Calculate temporary vectorial force */
1707 tx = _mm_mul_pd(fscal,dx11);
1708 ty = _mm_mul_pd(fscal,dy11);
1709 tz = _mm_mul_pd(fscal,dz11);
1711 /* Update vectorial force */
1712 fix1 = _mm_add_pd(fix1,tx);
1713 fiy1 = _mm_add_pd(fiy1,ty);
1714 fiz1 = _mm_add_pd(fiz1,tz);
1716 fjx1 = _mm_add_pd(fjx1,tx);
1717 fjy1 = _mm_add_pd(fjy1,ty);
1718 fjz1 = _mm_add_pd(fjz1,tz);
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 /* COULOMB ELECTROSTATICS */
1725 velec = _mm_mul_pd(qq12,rinv12);
1726 felec = _mm_mul_pd(velec,rinvsq12);
1730 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1732 /* Calculate temporary vectorial force */
1733 tx = _mm_mul_pd(fscal,dx12);
1734 ty = _mm_mul_pd(fscal,dy12);
1735 tz = _mm_mul_pd(fscal,dz12);
1737 /* Update vectorial force */
1738 fix1 = _mm_add_pd(fix1,tx);
1739 fiy1 = _mm_add_pd(fiy1,ty);
1740 fiz1 = _mm_add_pd(fiz1,tz);
1742 fjx2 = _mm_add_pd(fjx2,tx);
1743 fjy2 = _mm_add_pd(fjy2,ty);
1744 fjz2 = _mm_add_pd(fjz2,tz);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 /* COULOMB ELECTROSTATICS */
1751 velec = _mm_mul_pd(qq20,rinv20);
1752 felec = _mm_mul_pd(velec,rinvsq20);
1756 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1758 /* Calculate temporary vectorial force */
1759 tx = _mm_mul_pd(fscal,dx20);
1760 ty = _mm_mul_pd(fscal,dy20);
1761 tz = _mm_mul_pd(fscal,dz20);
1763 /* Update vectorial force */
1764 fix2 = _mm_add_pd(fix2,tx);
1765 fiy2 = _mm_add_pd(fiy2,ty);
1766 fiz2 = _mm_add_pd(fiz2,tz);
1768 fjx0 = _mm_add_pd(fjx0,tx);
1769 fjy0 = _mm_add_pd(fjy0,ty);
1770 fjz0 = _mm_add_pd(fjz0,tz);
1772 /**************************
1773 * CALCULATE INTERACTIONS *
1774 **************************/
1776 /* COULOMB ELECTROSTATICS */
1777 velec = _mm_mul_pd(qq21,rinv21);
1778 felec = _mm_mul_pd(velec,rinvsq21);
1782 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1784 /* Calculate temporary vectorial force */
1785 tx = _mm_mul_pd(fscal,dx21);
1786 ty = _mm_mul_pd(fscal,dy21);
1787 tz = _mm_mul_pd(fscal,dz21);
1789 /* Update vectorial force */
1790 fix2 = _mm_add_pd(fix2,tx);
1791 fiy2 = _mm_add_pd(fiy2,ty);
1792 fiz2 = _mm_add_pd(fiz2,tz);
1794 fjx1 = _mm_add_pd(fjx1,tx);
1795 fjy1 = _mm_add_pd(fjy1,ty);
1796 fjz1 = _mm_add_pd(fjz1,tz);
1798 /**************************
1799 * CALCULATE INTERACTIONS *
1800 **************************/
1802 /* COULOMB ELECTROSTATICS */
1803 velec = _mm_mul_pd(qq22,rinv22);
1804 felec = _mm_mul_pd(velec,rinvsq22);
1808 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1810 /* Calculate temporary vectorial force */
1811 tx = _mm_mul_pd(fscal,dx22);
1812 ty = _mm_mul_pd(fscal,dy22);
1813 tz = _mm_mul_pd(fscal,dz22);
1815 /* Update vectorial force */
1816 fix2 = _mm_add_pd(fix2,tx);
1817 fiy2 = _mm_add_pd(fiy2,ty);
1818 fiz2 = _mm_add_pd(fiz2,tz);
1820 fjx2 = _mm_add_pd(fjx2,tx);
1821 fjy2 = _mm_add_pd(fjy2,ty);
1822 fjz2 = _mm_add_pd(fjz2,tz);
1824 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1826 /* Inner loop uses 270 flops */
1829 /* End of innermost loop */
1831 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1832 f+i_coord_offset,fshift+i_shift_offset);
1834 /* Increment number of inner iterations */
1835 inneriter += j_index_end - j_index_start;
1837 /* Outer loop uses 18 flops */
1840 /* Increment number of outer iterations */
1843 /* Update outer/inner flops */
1845 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*270);