2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_sse4_1_double.h"
50 #include "kernelutil_x86_sse4_1_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
110 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
112 __m128i ifour = _mm_set1_epi32(4);
113 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
115 __m128d dummy_mask,cutoff_mask;
116 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
117 __m128d one = _mm_set1_pd(1.0);
118 __m128d two = _mm_set1_pd(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_pd(fr->epsfac);
131 charge = mdatoms->chargeA;
132 nvdwtype = fr->ntype;
134 vdwtype = mdatoms->typeA;
136 vftab = kernel_data->table_elec->data;
137 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
142 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
143 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
144 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
146 jq0 = _mm_set1_pd(charge[inr+0]);
147 jq1 = _mm_set1_pd(charge[inr+1]);
148 jq2 = _mm_set1_pd(charge[inr+2]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 qq00 = _mm_mul_pd(iq0,jq0);
151 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
152 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
153 qq01 = _mm_mul_pd(iq0,jq1);
154 qq02 = _mm_mul_pd(iq0,jq2);
155 qq10 = _mm_mul_pd(iq1,jq0);
156 qq11 = _mm_mul_pd(iq1,jq1);
157 qq12 = _mm_mul_pd(iq1,jq2);
158 qq20 = _mm_mul_pd(iq2,jq0);
159 qq21 = _mm_mul_pd(iq2,jq1);
160 qq22 = _mm_mul_pd(iq2,jq2);
162 /* Avoid stupid compiler warnings */
170 /* Start outer loop over neighborlists */
171 for(iidx=0; iidx<nri; iidx++)
173 /* Load shift vector for this list */
174 i_shift_offset = DIM*shiftidx[iidx];
176 /* Load limits for loop over neighbors */
177 j_index_start = jindex[iidx];
178 j_index_end = jindex[iidx+1];
180 /* Get outer coordinate index */
182 i_coord_offset = DIM*inr;
184 /* Load i particle coords and add shift vector */
185 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
186 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
188 fix0 = _mm_setzero_pd();
189 fiy0 = _mm_setzero_pd();
190 fiz0 = _mm_setzero_pd();
191 fix1 = _mm_setzero_pd();
192 fiy1 = _mm_setzero_pd();
193 fiz1 = _mm_setzero_pd();
194 fix2 = _mm_setzero_pd();
195 fiy2 = _mm_setzero_pd();
196 fiz2 = _mm_setzero_pd();
198 /* Reset potential sums */
199 velecsum = _mm_setzero_pd();
200 vvdwsum = _mm_setzero_pd();
202 /* Start inner kernel loop */
203 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
206 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
212 /* load j atom coordinates */
213 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
216 /* Calculate displacement vector */
217 dx00 = _mm_sub_pd(ix0,jx0);
218 dy00 = _mm_sub_pd(iy0,jy0);
219 dz00 = _mm_sub_pd(iz0,jz0);
220 dx01 = _mm_sub_pd(ix0,jx1);
221 dy01 = _mm_sub_pd(iy0,jy1);
222 dz01 = _mm_sub_pd(iz0,jz1);
223 dx02 = _mm_sub_pd(ix0,jx2);
224 dy02 = _mm_sub_pd(iy0,jy2);
225 dz02 = _mm_sub_pd(iz0,jz2);
226 dx10 = _mm_sub_pd(ix1,jx0);
227 dy10 = _mm_sub_pd(iy1,jy0);
228 dz10 = _mm_sub_pd(iz1,jz0);
229 dx11 = _mm_sub_pd(ix1,jx1);
230 dy11 = _mm_sub_pd(iy1,jy1);
231 dz11 = _mm_sub_pd(iz1,jz1);
232 dx12 = _mm_sub_pd(ix1,jx2);
233 dy12 = _mm_sub_pd(iy1,jy2);
234 dz12 = _mm_sub_pd(iz1,jz2);
235 dx20 = _mm_sub_pd(ix2,jx0);
236 dy20 = _mm_sub_pd(iy2,jy0);
237 dz20 = _mm_sub_pd(iz2,jz0);
238 dx21 = _mm_sub_pd(ix2,jx1);
239 dy21 = _mm_sub_pd(iy2,jy1);
240 dz21 = _mm_sub_pd(iz2,jz1);
241 dx22 = _mm_sub_pd(ix2,jx2);
242 dy22 = _mm_sub_pd(iy2,jy2);
243 dz22 = _mm_sub_pd(iz2,jz2);
245 /* Calculate squared distance and things based on it */
246 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
247 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
248 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
249 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
250 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
251 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
252 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
253 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
254 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
256 rinv00 = gmx_mm_invsqrt_pd(rsq00);
257 rinv01 = gmx_mm_invsqrt_pd(rsq01);
258 rinv02 = gmx_mm_invsqrt_pd(rsq02);
259 rinv10 = gmx_mm_invsqrt_pd(rsq10);
260 rinv11 = gmx_mm_invsqrt_pd(rsq11);
261 rinv12 = gmx_mm_invsqrt_pd(rsq12);
262 rinv20 = gmx_mm_invsqrt_pd(rsq20);
263 rinv21 = gmx_mm_invsqrt_pd(rsq21);
264 rinv22 = gmx_mm_invsqrt_pd(rsq22);
266 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
268 fjx0 = _mm_setzero_pd();
269 fjy0 = _mm_setzero_pd();
270 fjz0 = _mm_setzero_pd();
271 fjx1 = _mm_setzero_pd();
272 fjy1 = _mm_setzero_pd();
273 fjz1 = _mm_setzero_pd();
274 fjx2 = _mm_setzero_pd();
275 fjy2 = _mm_setzero_pd();
276 fjz2 = _mm_setzero_pd();
278 /**************************
279 * CALCULATE INTERACTIONS *
280 **************************/
282 r00 = _mm_mul_pd(rsq00,rinv00);
284 /* Calculate table index by multiplying r with table scale and truncate to integer */
285 rt = _mm_mul_pd(r00,vftabscale);
286 vfitab = _mm_cvttpd_epi32(rt);
287 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
288 vfitab = _mm_slli_epi32(vfitab,2);
290 /* CUBIC SPLINE TABLE ELECTROSTATICS */
291 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
292 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
293 GMX_MM_TRANSPOSE2_PD(Y,F);
294 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
295 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
296 GMX_MM_TRANSPOSE2_PD(G,H);
297 Heps = _mm_mul_pd(vfeps,H);
298 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
299 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
300 velec = _mm_mul_pd(qq00,VV);
301 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
302 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
304 /* LENNARD-JONES DISPERSION/REPULSION */
306 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
307 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
308 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
309 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
310 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
312 /* Update potential sum for this i atom from the interaction with this j atom. */
313 velecsum = _mm_add_pd(velecsum,velec);
314 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
316 fscal = _mm_add_pd(felec,fvdw);
318 /* Calculate temporary vectorial force */
319 tx = _mm_mul_pd(fscal,dx00);
320 ty = _mm_mul_pd(fscal,dy00);
321 tz = _mm_mul_pd(fscal,dz00);
323 /* Update vectorial force */
324 fix0 = _mm_add_pd(fix0,tx);
325 fiy0 = _mm_add_pd(fiy0,ty);
326 fiz0 = _mm_add_pd(fiz0,tz);
328 fjx0 = _mm_add_pd(fjx0,tx);
329 fjy0 = _mm_add_pd(fjy0,ty);
330 fjz0 = _mm_add_pd(fjz0,tz);
332 /**************************
333 * CALCULATE INTERACTIONS *
334 **************************/
336 r01 = _mm_mul_pd(rsq01,rinv01);
338 /* Calculate table index by multiplying r with table scale and truncate to integer */
339 rt = _mm_mul_pd(r01,vftabscale);
340 vfitab = _mm_cvttpd_epi32(rt);
341 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
342 vfitab = _mm_slli_epi32(vfitab,2);
344 /* CUBIC SPLINE TABLE ELECTROSTATICS */
345 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
346 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
347 GMX_MM_TRANSPOSE2_PD(Y,F);
348 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
349 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
350 GMX_MM_TRANSPOSE2_PD(G,H);
351 Heps = _mm_mul_pd(vfeps,H);
352 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
353 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
354 velec = _mm_mul_pd(qq01,VV);
355 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
356 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
358 /* Update potential sum for this i atom from the interaction with this j atom. */
359 velecsum = _mm_add_pd(velecsum,velec);
363 /* Calculate temporary vectorial force */
364 tx = _mm_mul_pd(fscal,dx01);
365 ty = _mm_mul_pd(fscal,dy01);
366 tz = _mm_mul_pd(fscal,dz01);
368 /* Update vectorial force */
369 fix0 = _mm_add_pd(fix0,tx);
370 fiy0 = _mm_add_pd(fiy0,ty);
371 fiz0 = _mm_add_pd(fiz0,tz);
373 fjx1 = _mm_add_pd(fjx1,tx);
374 fjy1 = _mm_add_pd(fjy1,ty);
375 fjz1 = _mm_add_pd(fjz1,tz);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 r02 = _mm_mul_pd(rsq02,rinv02);
383 /* Calculate table index by multiplying r with table scale and truncate to integer */
384 rt = _mm_mul_pd(r02,vftabscale);
385 vfitab = _mm_cvttpd_epi32(rt);
386 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
387 vfitab = _mm_slli_epi32(vfitab,2);
389 /* CUBIC SPLINE TABLE ELECTROSTATICS */
390 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
391 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
392 GMX_MM_TRANSPOSE2_PD(Y,F);
393 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
394 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
395 GMX_MM_TRANSPOSE2_PD(G,H);
396 Heps = _mm_mul_pd(vfeps,H);
397 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
398 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
399 velec = _mm_mul_pd(qq02,VV);
400 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
401 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
403 /* Update potential sum for this i atom from the interaction with this j atom. */
404 velecsum = _mm_add_pd(velecsum,velec);
408 /* Calculate temporary vectorial force */
409 tx = _mm_mul_pd(fscal,dx02);
410 ty = _mm_mul_pd(fscal,dy02);
411 tz = _mm_mul_pd(fscal,dz02);
413 /* Update vectorial force */
414 fix0 = _mm_add_pd(fix0,tx);
415 fiy0 = _mm_add_pd(fiy0,ty);
416 fiz0 = _mm_add_pd(fiz0,tz);
418 fjx2 = _mm_add_pd(fjx2,tx);
419 fjy2 = _mm_add_pd(fjy2,ty);
420 fjz2 = _mm_add_pd(fjz2,tz);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 r10 = _mm_mul_pd(rsq10,rinv10);
428 /* Calculate table index by multiplying r with table scale and truncate to integer */
429 rt = _mm_mul_pd(r10,vftabscale);
430 vfitab = _mm_cvttpd_epi32(rt);
431 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
432 vfitab = _mm_slli_epi32(vfitab,2);
434 /* CUBIC SPLINE TABLE ELECTROSTATICS */
435 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
436 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
437 GMX_MM_TRANSPOSE2_PD(Y,F);
438 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
439 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
440 GMX_MM_TRANSPOSE2_PD(G,H);
441 Heps = _mm_mul_pd(vfeps,H);
442 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
443 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
444 velec = _mm_mul_pd(qq10,VV);
445 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
446 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
448 /* Update potential sum for this i atom from the interaction with this j atom. */
449 velecsum = _mm_add_pd(velecsum,velec);
453 /* Calculate temporary vectorial force */
454 tx = _mm_mul_pd(fscal,dx10);
455 ty = _mm_mul_pd(fscal,dy10);
456 tz = _mm_mul_pd(fscal,dz10);
458 /* Update vectorial force */
459 fix1 = _mm_add_pd(fix1,tx);
460 fiy1 = _mm_add_pd(fiy1,ty);
461 fiz1 = _mm_add_pd(fiz1,tz);
463 fjx0 = _mm_add_pd(fjx0,tx);
464 fjy0 = _mm_add_pd(fjy0,ty);
465 fjz0 = _mm_add_pd(fjz0,tz);
467 /**************************
468 * CALCULATE INTERACTIONS *
469 **************************/
471 r11 = _mm_mul_pd(rsq11,rinv11);
473 /* Calculate table index by multiplying r with table scale and truncate to integer */
474 rt = _mm_mul_pd(r11,vftabscale);
475 vfitab = _mm_cvttpd_epi32(rt);
476 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
477 vfitab = _mm_slli_epi32(vfitab,2);
479 /* CUBIC SPLINE TABLE ELECTROSTATICS */
480 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
481 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
482 GMX_MM_TRANSPOSE2_PD(Y,F);
483 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
484 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
485 GMX_MM_TRANSPOSE2_PD(G,H);
486 Heps = _mm_mul_pd(vfeps,H);
487 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
488 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
489 velec = _mm_mul_pd(qq11,VV);
490 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
491 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
493 /* Update potential sum for this i atom from the interaction with this j atom. */
494 velecsum = _mm_add_pd(velecsum,velec);
498 /* Calculate temporary vectorial force */
499 tx = _mm_mul_pd(fscal,dx11);
500 ty = _mm_mul_pd(fscal,dy11);
501 tz = _mm_mul_pd(fscal,dz11);
503 /* Update vectorial force */
504 fix1 = _mm_add_pd(fix1,tx);
505 fiy1 = _mm_add_pd(fiy1,ty);
506 fiz1 = _mm_add_pd(fiz1,tz);
508 fjx1 = _mm_add_pd(fjx1,tx);
509 fjy1 = _mm_add_pd(fjy1,ty);
510 fjz1 = _mm_add_pd(fjz1,tz);
512 /**************************
513 * CALCULATE INTERACTIONS *
514 **************************/
516 r12 = _mm_mul_pd(rsq12,rinv12);
518 /* Calculate table index by multiplying r with table scale and truncate to integer */
519 rt = _mm_mul_pd(r12,vftabscale);
520 vfitab = _mm_cvttpd_epi32(rt);
521 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
522 vfitab = _mm_slli_epi32(vfitab,2);
524 /* CUBIC SPLINE TABLE ELECTROSTATICS */
525 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
526 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
527 GMX_MM_TRANSPOSE2_PD(Y,F);
528 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
529 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
530 GMX_MM_TRANSPOSE2_PD(G,H);
531 Heps = _mm_mul_pd(vfeps,H);
532 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
533 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
534 velec = _mm_mul_pd(qq12,VV);
535 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
536 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velecsum = _mm_add_pd(velecsum,velec);
543 /* Calculate temporary vectorial force */
544 tx = _mm_mul_pd(fscal,dx12);
545 ty = _mm_mul_pd(fscal,dy12);
546 tz = _mm_mul_pd(fscal,dz12);
548 /* Update vectorial force */
549 fix1 = _mm_add_pd(fix1,tx);
550 fiy1 = _mm_add_pd(fiy1,ty);
551 fiz1 = _mm_add_pd(fiz1,tz);
553 fjx2 = _mm_add_pd(fjx2,tx);
554 fjy2 = _mm_add_pd(fjy2,ty);
555 fjz2 = _mm_add_pd(fjz2,tz);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 r20 = _mm_mul_pd(rsq20,rinv20);
563 /* Calculate table index by multiplying r with table scale and truncate to integer */
564 rt = _mm_mul_pd(r20,vftabscale);
565 vfitab = _mm_cvttpd_epi32(rt);
566 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
567 vfitab = _mm_slli_epi32(vfitab,2);
569 /* CUBIC SPLINE TABLE ELECTROSTATICS */
570 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
571 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
572 GMX_MM_TRANSPOSE2_PD(Y,F);
573 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
574 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
575 GMX_MM_TRANSPOSE2_PD(G,H);
576 Heps = _mm_mul_pd(vfeps,H);
577 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
578 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
579 velec = _mm_mul_pd(qq20,VV);
580 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
581 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
583 /* Update potential sum for this i atom from the interaction with this j atom. */
584 velecsum = _mm_add_pd(velecsum,velec);
588 /* Calculate temporary vectorial force */
589 tx = _mm_mul_pd(fscal,dx20);
590 ty = _mm_mul_pd(fscal,dy20);
591 tz = _mm_mul_pd(fscal,dz20);
593 /* Update vectorial force */
594 fix2 = _mm_add_pd(fix2,tx);
595 fiy2 = _mm_add_pd(fiy2,ty);
596 fiz2 = _mm_add_pd(fiz2,tz);
598 fjx0 = _mm_add_pd(fjx0,tx);
599 fjy0 = _mm_add_pd(fjy0,ty);
600 fjz0 = _mm_add_pd(fjz0,tz);
602 /**************************
603 * CALCULATE INTERACTIONS *
604 **************************/
606 r21 = _mm_mul_pd(rsq21,rinv21);
608 /* Calculate table index by multiplying r with table scale and truncate to integer */
609 rt = _mm_mul_pd(r21,vftabscale);
610 vfitab = _mm_cvttpd_epi32(rt);
611 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
612 vfitab = _mm_slli_epi32(vfitab,2);
614 /* CUBIC SPLINE TABLE ELECTROSTATICS */
615 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
616 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
617 GMX_MM_TRANSPOSE2_PD(Y,F);
618 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
619 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
620 GMX_MM_TRANSPOSE2_PD(G,H);
621 Heps = _mm_mul_pd(vfeps,H);
622 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
623 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
624 velec = _mm_mul_pd(qq21,VV);
625 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
626 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
628 /* Update potential sum for this i atom from the interaction with this j atom. */
629 velecsum = _mm_add_pd(velecsum,velec);
633 /* Calculate temporary vectorial force */
634 tx = _mm_mul_pd(fscal,dx21);
635 ty = _mm_mul_pd(fscal,dy21);
636 tz = _mm_mul_pd(fscal,dz21);
638 /* Update vectorial force */
639 fix2 = _mm_add_pd(fix2,tx);
640 fiy2 = _mm_add_pd(fiy2,ty);
641 fiz2 = _mm_add_pd(fiz2,tz);
643 fjx1 = _mm_add_pd(fjx1,tx);
644 fjy1 = _mm_add_pd(fjy1,ty);
645 fjz1 = _mm_add_pd(fjz1,tz);
647 /**************************
648 * CALCULATE INTERACTIONS *
649 **************************/
651 r22 = _mm_mul_pd(rsq22,rinv22);
653 /* Calculate table index by multiplying r with table scale and truncate to integer */
654 rt = _mm_mul_pd(r22,vftabscale);
655 vfitab = _mm_cvttpd_epi32(rt);
656 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
657 vfitab = _mm_slli_epi32(vfitab,2);
659 /* CUBIC SPLINE TABLE ELECTROSTATICS */
660 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
661 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
662 GMX_MM_TRANSPOSE2_PD(Y,F);
663 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
664 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
665 GMX_MM_TRANSPOSE2_PD(G,H);
666 Heps = _mm_mul_pd(vfeps,H);
667 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
668 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
669 velec = _mm_mul_pd(qq22,VV);
670 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
671 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
673 /* Update potential sum for this i atom from the interaction with this j atom. */
674 velecsum = _mm_add_pd(velecsum,velec);
678 /* Calculate temporary vectorial force */
679 tx = _mm_mul_pd(fscal,dx22);
680 ty = _mm_mul_pd(fscal,dy22);
681 tz = _mm_mul_pd(fscal,dz22);
683 /* Update vectorial force */
684 fix2 = _mm_add_pd(fix2,tx);
685 fiy2 = _mm_add_pd(fiy2,ty);
686 fiz2 = _mm_add_pd(fiz2,tz);
688 fjx2 = _mm_add_pd(fjx2,tx);
689 fjy2 = _mm_add_pd(fjy2,ty);
690 fjz2 = _mm_add_pd(fjz2,tz);
692 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
694 /* Inner loop uses 400 flops */
701 j_coord_offsetA = DIM*jnrA;
703 /* load j atom coordinates */
704 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
705 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
707 /* Calculate displacement vector */
708 dx00 = _mm_sub_pd(ix0,jx0);
709 dy00 = _mm_sub_pd(iy0,jy0);
710 dz00 = _mm_sub_pd(iz0,jz0);
711 dx01 = _mm_sub_pd(ix0,jx1);
712 dy01 = _mm_sub_pd(iy0,jy1);
713 dz01 = _mm_sub_pd(iz0,jz1);
714 dx02 = _mm_sub_pd(ix0,jx2);
715 dy02 = _mm_sub_pd(iy0,jy2);
716 dz02 = _mm_sub_pd(iz0,jz2);
717 dx10 = _mm_sub_pd(ix1,jx0);
718 dy10 = _mm_sub_pd(iy1,jy0);
719 dz10 = _mm_sub_pd(iz1,jz0);
720 dx11 = _mm_sub_pd(ix1,jx1);
721 dy11 = _mm_sub_pd(iy1,jy1);
722 dz11 = _mm_sub_pd(iz1,jz1);
723 dx12 = _mm_sub_pd(ix1,jx2);
724 dy12 = _mm_sub_pd(iy1,jy2);
725 dz12 = _mm_sub_pd(iz1,jz2);
726 dx20 = _mm_sub_pd(ix2,jx0);
727 dy20 = _mm_sub_pd(iy2,jy0);
728 dz20 = _mm_sub_pd(iz2,jz0);
729 dx21 = _mm_sub_pd(ix2,jx1);
730 dy21 = _mm_sub_pd(iy2,jy1);
731 dz21 = _mm_sub_pd(iz2,jz1);
732 dx22 = _mm_sub_pd(ix2,jx2);
733 dy22 = _mm_sub_pd(iy2,jy2);
734 dz22 = _mm_sub_pd(iz2,jz2);
736 /* Calculate squared distance and things based on it */
737 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
738 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
739 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
740 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
741 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
742 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
743 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
744 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
745 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
747 rinv00 = gmx_mm_invsqrt_pd(rsq00);
748 rinv01 = gmx_mm_invsqrt_pd(rsq01);
749 rinv02 = gmx_mm_invsqrt_pd(rsq02);
750 rinv10 = gmx_mm_invsqrt_pd(rsq10);
751 rinv11 = gmx_mm_invsqrt_pd(rsq11);
752 rinv12 = gmx_mm_invsqrt_pd(rsq12);
753 rinv20 = gmx_mm_invsqrt_pd(rsq20);
754 rinv21 = gmx_mm_invsqrt_pd(rsq21);
755 rinv22 = gmx_mm_invsqrt_pd(rsq22);
757 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
759 fjx0 = _mm_setzero_pd();
760 fjy0 = _mm_setzero_pd();
761 fjz0 = _mm_setzero_pd();
762 fjx1 = _mm_setzero_pd();
763 fjy1 = _mm_setzero_pd();
764 fjz1 = _mm_setzero_pd();
765 fjx2 = _mm_setzero_pd();
766 fjy2 = _mm_setzero_pd();
767 fjz2 = _mm_setzero_pd();
769 /**************************
770 * CALCULATE INTERACTIONS *
771 **************************/
773 r00 = _mm_mul_pd(rsq00,rinv00);
775 /* Calculate table index by multiplying r with table scale and truncate to integer */
776 rt = _mm_mul_pd(r00,vftabscale);
777 vfitab = _mm_cvttpd_epi32(rt);
778 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
779 vfitab = _mm_slli_epi32(vfitab,2);
781 /* CUBIC SPLINE TABLE ELECTROSTATICS */
782 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
783 F = _mm_setzero_pd();
784 GMX_MM_TRANSPOSE2_PD(Y,F);
785 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
786 H = _mm_setzero_pd();
787 GMX_MM_TRANSPOSE2_PD(G,H);
788 Heps = _mm_mul_pd(vfeps,H);
789 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
790 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
791 velec = _mm_mul_pd(qq00,VV);
792 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
793 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
795 /* LENNARD-JONES DISPERSION/REPULSION */
797 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
798 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
799 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
800 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
801 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
803 /* Update potential sum for this i atom from the interaction with this j atom. */
804 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
805 velecsum = _mm_add_pd(velecsum,velec);
806 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
807 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
809 fscal = _mm_add_pd(felec,fvdw);
811 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
813 /* Calculate temporary vectorial force */
814 tx = _mm_mul_pd(fscal,dx00);
815 ty = _mm_mul_pd(fscal,dy00);
816 tz = _mm_mul_pd(fscal,dz00);
818 /* Update vectorial force */
819 fix0 = _mm_add_pd(fix0,tx);
820 fiy0 = _mm_add_pd(fiy0,ty);
821 fiz0 = _mm_add_pd(fiz0,tz);
823 fjx0 = _mm_add_pd(fjx0,tx);
824 fjy0 = _mm_add_pd(fjy0,ty);
825 fjz0 = _mm_add_pd(fjz0,tz);
827 /**************************
828 * CALCULATE INTERACTIONS *
829 **************************/
831 r01 = _mm_mul_pd(rsq01,rinv01);
833 /* Calculate table index by multiplying r with table scale and truncate to integer */
834 rt = _mm_mul_pd(r01,vftabscale);
835 vfitab = _mm_cvttpd_epi32(rt);
836 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
837 vfitab = _mm_slli_epi32(vfitab,2);
839 /* CUBIC SPLINE TABLE ELECTROSTATICS */
840 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
841 F = _mm_setzero_pd();
842 GMX_MM_TRANSPOSE2_PD(Y,F);
843 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
844 H = _mm_setzero_pd();
845 GMX_MM_TRANSPOSE2_PD(G,H);
846 Heps = _mm_mul_pd(vfeps,H);
847 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
848 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
849 velec = _mm_mul_pd(qq01,VV);
850 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
851 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
853 /* Update potential sum for this i atom from the interaction with this j atom. */
854 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
855 velecsum = _mm_add_pd(velecsum,velec);
859 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
861 /* Calculate temporary vectorial force */
862 tx = _mm_mul_pd(fscal,dx01);
863 ty = _mm_mul_pd(fscal,dy01);
864 tz = _mm_mul_pd(fscal,dz01);
866 /* Update vectorial force */
867 fix0 = _mm_add_pd(fix0,tx);
868 fiy0 = _mm_add_pd(fiy0,ty);
869 fiz0 = _mm_add_pd(fiz0,tz);
871 fjx1 = _mm_add_pd(fjx1,tx);
872 fjy1 = _mm_add_pd(fjy1,ty);
873 fjz1 = _mm_add_pd(fjz1,tz);
875 /**************************
876 * CALCULATE INTERACTIONS *
877 **************************/
879 r02 = _mm_mul_pd(rsq02,rinv02);
881 /* Calculate table index by multiplying r with table scale and truncate to integer */
882 rt = _mm_mul_pd(r02,vftabscale);
883 vfitab = _mm_cvttpd_epi32(rt);
884 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
885 vfitab = _mm_slli_epi32(vfitab,2);
887 /* CUBIC SPLINE TABLE ELECTROSTATICS */
888 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
889 F = _mm_setzero_pd();
890 GMX_MM_TRANSPOSE2_PD(Y,F);
891 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
892 H = _mm_setzero_pd();
893 GMX_MM_TRANSPOSE2_PD(G,H);
894 Heps = _mm_mul_pd(vfeps,H);
895 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
896 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
897 velec = _mm_mul_pd(qq02,VV);
898 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
899 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
901 /* Update potential sum for this i atom from the interaction with this j atom. */
902 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
903 velecsum = _mm_add_pd(velecsum,velec);
907 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
909 /* Calculate temporary vectorial force */
910 tx = _mm_mul_pd(fscal,dx02);
911 ty = _mm_mul_pd(fscal,dy02);
912 tz = _mm_mul_pd(fscal,dz02);
914 /* Update vectorial force */
915 fix0 = _mm_add_pd(fix0,tx);
916 fiy0 = _mm_add_pd(fiy0,ty);
917 fiz0 = _mm_add_pd(fiz0,tz);
919 fjx2 = _mm_add_pd(fjx2,tx);
920 fjy2 = _mm_add_pd(fjy2,ty);
921 fjz2 = _mm_add_pd(fjz2,tz);
923 /**************************
924 * CALCULATE INTERACTIONS *
925 **************************/
927 r10 = _mm_mul_pd(rsq10,rinv10);
929 /* Calculate table index by multiplying r with table scale and truncate to integer */
930 rt = _mm_mul_pd(r10,vftabscale);
931 vfitab = _mm_cvttpd_epi32(rt);
932 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
933 vfitab = _mm_slli_epi32(vfitab,2);
935 /* CUBIC SPLINE TABLE ELECTROSTATICS */
936 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
937 F = _mm_setzero_pd();
938 GMX_MM_TRANSPOSE2_PD(Y,F);
939 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
940 H = _mm_setzero_pd();
941 GMX_MM_TRANSPOSE2_PD(G,H);
942 Heps = _mm_mul_pd(vfeps,H);
943 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
944 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
945 velec = _mm_mul_pd(qq10,VV);
946 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
947 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
949 /* Update potential sum for this i atom from the interaction with this j atom. */
950 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
951 velecsum = _mm_add_pd(velecsum,velec);
955 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
957 /* Calculate temporary vectorial force */
958 tx = _mm_mul_pd(fscal,dx10);
959 ty = _mm_mul_pd(fscal,dy10);
960 tz = _mm_mul_pd(fscal,dz10);
962 /* Update vectorial force */
963 fix1 = _mm_add_pd(fix1,tx);
964 fiy1 = _mm_add_pd(fiy1,ty);
965 fiz1 = _mm_add_pd(fiz1,tz);
967 fjx0 = _mm_add_pd(fjx0,tx);
968 fjy0 = _mm_add_pd(fjy0,ty);
969 fjz0 = _mm_add_pd(fjz0,tz);
971 /**************************
972 * CALCULATE INTERACTIONS *
973 **************************/
975 r11 = _mm_mul_pd(rsq11,rinv11);
977 /* Calculate table index by multiplying r with table scale and truncate to integer */
978 rt = _mm_mul_pd(r11,vftabscale);
979 vfitab = _mm_cvttpd_epi32(rt);
980 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
981 vfitab = _mm_slli_epi32(vfitab,2);
983 /* CUBIC SPLINE TABLE ELECTROSTATICS */
984 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
985 F = _mm_setzero_pd();
986 GMX_MM_TRANSPOSE2_PD(Y,F);
987 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
988 H = _mm_setzero_pd();
989 GMX_MM_TRANSPOSE2_PD(G,H);
990 Heps = _mm_mul_pd(vfeps,H);
991 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
992 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
993 velec = _mm_mul_pd(qq11,VV);
994 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
995 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
997 /* Update potential sum for this i atom from the interaction with this j atom. */
998 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
999 velecsum = _mm_add_pd(velecsum,velec);
1003 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1005 /* Calculate temporary vectorial force */
1006 tx = _mm_mul_pd(fscal,dx11);
1007 ty = _mm_mul_pd(fscal,dy11);
1008 tz = _mm_mul_pd(fscal,dz11);
1010 /* Update vectorial force */
1011 fix1 = _mm_add_pd(fix1,tx);
1012 fiy1 = _mm_add_pd(fiy1,ty);
1013 fiz1 = _mm_add_pd(fiz1,tz);
1015 fjx1 = _mm_add_pd(fjx1,tx);
1016 fjy1 = _mm_add_pd(fjy1,ty);
1017 fjz1 = _mm_add_pd(fjz1,tz);
1019 /**************************
1020 * CALCULATE INTERACTIONS *
1021 **************************/
1023 r12 = _mm_mul_pd(rsq12,rinv12);
1025 /* Calculate table index by multiplying r with table scale and truncate to integer */
1026 rt = _mm_mul_pd(r12,vftabscale);
1027 vfitab = _mm_cvttpd_epi32(rt);
1028 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1029 vfitab = _mm_slli_epi32(vfitab,2);
1031 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1032 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1033 F = _mm_setzero_pd();
1034 GMX_MM_TRANSPOSE2_PD(Y,F);
1035 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1036 H = _mm_setzero_pd();
1037 GMX_MM_TRANSPOSE2_PD(G,H);
1038 Heps = _mm_mul_pd(vfeps,H);
1039 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1040 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1041 velec = _mm_mul_pd(qq12,VV);
1042 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1043 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1045 /* Update potential sum for this i atom from the interaction with this j atom. */
1046 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1047 velecsum = _mm_add_pd(velecsum,velec);
1051 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1053 /* Calculate temporary vectorial force */
1054 tx = _mm_mul_pd(fscal,dx12);
1055 ty = _mm_mul_pd(fscal,dy12);
1056 tz = _mm_mul_pd(fscal,dz12);
1058 /* Update vectorial force */
1059 fix1 = _mm_add_pd(fix1,tx);
1060 fiy1 = _mm_add_pd(fiy1,ty);
1061 fiz1 = _mm_add_pd(fiz1,tz);
1063 fjx2 = _mm_add_pd(fjx2,tx);
1064 fjy2 = _mm_add_pd(fjy2,ty);
1065 fjz2 = _mm_add_pd(fjz2,tz);
1067 /**************************
1068 * CALCULATE INTERACTIONS *
1069 **************************/
1071 r20 = _mm_mul_pd(rsq20,rinv20);
1073 /* Calculate table index by multiplying r with table scale and truncate to integer */
1074 rt = _mm_mul_pd(r20,vftabscale);
1075 vfitab = _mm_cvttpd_epi32(rt);
1076 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1077 vfitab = _mm_slli_epi32(vfitab,2);
1079 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1080 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1081 F = _mm_setzero_pd();
1082 GMX_MM_TRANSPOSE2_PD(Y,F);
1083 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1084 H = _mm_setzero_pd();
1085 GMX_MM_TRANSPOSE2_PD(G,H);
1086 Heps = _mm_mul_pd(vfeps,H);
1087 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1088 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1089 velec = _mm_mul_pd(qq20,VV);
1090 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1091 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1093 /* Update potential sum for this i atom from the interaction with this j atom. */
1094 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1095 velecsum = _mm_add_pd(velecsum,velec);
1099 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1101 /* Calculate temporary vectorial force */
1102 tx = _mm_mul_pd(fscal,dx20);
1103 ty = _mm_mul_pd(fscal,dy20);
1104 tz = _mm_mul_pd(fscal,dz20);
1106 /* Update vectorial force */
1107 fix2 = _mm_add_pd(fix2,tx);
1108 fiy2 = _mm_add_pd(fiy2,ty);
1109 fiz2 = _mm_add_pd(fiz2,tz);
1111 fjx0 = _mm_add_pd(fjx0,tx);
1112 fjy0 = _mm_add_pd(fjy0,ty);
1113 fjz0 = _mm_add_pd(fjz0,tz);
1115 /**************************
1116 * CALCULATE INTERACTIONS *
1117 **************************/
1119 r21 = _mm_mul_pd(rsq21,rinv21);
1121 /* Calculate table index by multiplying r with table scale and truncate to integer */
1122 rt = _mm_mul_pd(r21,vftabscale);
1123 vfitab = _mm_cvttpd_epi32(rt);
1124 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1125 vfitab = _mm_slli_epi32(vfitab,2);
1127 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1128 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1129 F = _mm_setzero_pd();
1130 GMX_MM_TRANSPOSE2_PD(Y,F);
1131 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1132 H = _mm_setzero_pd();
1133 GMX_MM_TRANSPOSE2_PD(G,H);
1134 Heps = _mm_mul_pd(vfeps,H);
1135 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1136 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1137 velec = _mm_mul_pd(qq21,VV);
1138 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1139 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1141 /* Update potential sum for this i atom from the interaction with this j atom. */
1142 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1143 velecsum = _mm_add_pd(velecsum,velec);
1147 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1149 /* Calculate temporary vectorial force */
1150 tx = _mm_mul_pd(fscal,dx21);
1151 ty = _mm_mul_pd(fscal,dy21);
1152 tz = _mm_mul_pd(fscal,dz21);
1154 /* Update vectorial force */
1155 fix2 = _mm_add_pd(fix2,tx);
1156 fiy2 = _mm_add_pd(fiy2,ty);
1157 fiz2 = _mm_add_pd(fiz2,tz);
1159 fjx1 = _mm_add_pd(fjx1,tx);
1160 fjy1 = _mm_add_pd(fjy1,ty);
1161 fjz1 = _mm_add_pd(fjz1,tz);
1163 /**************************
1164 * CALCULATE INTERACTIONS *
1165 **************************/
1167 r22 = _mm_mul_pd(rsq22,rinv22);
1169 /* Calculate table index by multiplying r with table scale and truncate to integer */
1170 rt = _mm_mul_pd(r22,vftabscale);
1171 vfitab = _mm_cvttpd_epi32(rt);
1172 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1173 vfitab = _mm_slli_epi32(vfitab,2);
1175 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1176 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1177 F = _mm_setzero_pd();
1178 GMX_MM_TRANSPOSE2_PD(Y,F);
1179 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1180 H = _mm_setzero_pd();
1181 GMX_MM_TRANSPOSE2_PD(G,H);
1182 Heps = _mm_mul_pd(vfeps,H);
1183 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1184 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1185 velec = _mm_mul_pd(qq22,VV);
1186 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1187 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1189 /* Update potential sum for this i atom from the interaction with this j atom. */
1190 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1191 velecsum = _mm_add_pd(velecsum,velec);
1195 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1197 /* Calculate temporary vectorial force */
1198 tx = _mm_mul_pd(fscal,dx22);
1199 ty = _mm_mul_pd(fscal,dy22);
1200 tz = _mm_mul_pd(fscal,dz22);
1202 /* Update vectorial force */
1203 fix2 = _mm_add_pd(fix2,tx);
1204 fiy2 = _mm_add_pd(fiy2,ty);
1205 fiz2 = _mm_add_pd(fiz2,tz);
1207 fjx2 = _mm_add_pd(fjx2,tx);
1208 fjy2 = _mm_add_pd(fjy2,ty);
1209 fjz2 = _mm_add_pd(fjz2,tz);
1211 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1213 /* Inner loop uses 400 flops */
1216 /* End of innermost loop */
1218 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1219 f+i_coord_offset,fshift+i_shift_offset);
1222 /* Update potential energies */
1223 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1224 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1226 /* Increment number of inner iterations */
1227 inneriter += j_index_end - j_index_start;
1229 /* Outer loop uses 20 flops */
1232 /* Increment number of outer iterations */
1235 /* Update outer/inner flops */
1237 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*400);
1240 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double
1241 * Electrostatics interaction: CubicSplineTable
1242 * VdW interaction: LennardJones
1243 * Geometry: Water3-Water3
1244 * Calculate force/pot: Force
1247 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double
1248 (t_nblist * gmx_restrict nlist,
1249 rvec * gmx_restrict xx,
1250 rvec * gmx_restrict ff,
1251 t_forcerec * gmx_restrict fr,
1252 t_mdatoms * gmx_restrict mdatoms,
1253 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1254 t_nrnb * gmx_restrict nrnb)
1256 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1257 * just 0 for non-waters.
1258 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1259 * jnr indices corresponding to data put in the four positions in the SIMD register.
1261 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1262 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1264 int j_coord_offsetA,j_coord_offsetB;
1265 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1266 real rcutoff_scalar;
1267 real *shiftvec,*fshift,*x,*f;
1268 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1270 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1272 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1274 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1275 int vdwjidx0A,vdwjidx0B;
1276 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1277 int vdwjidx1A,vdwjidx1B;
1278 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1279 int vdwjidx2A,vdwjidx2B;
1280 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1281 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1282 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1283 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1284 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1285 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1286 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1287 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1288 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1289 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1290 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1293 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1296 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1297 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1299 __m128i ifour = _mm_set1_epi32(4);
1300 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1302 __m128d dummy_mask,cutoff_mask;
1303 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1304 __m128d one = _mm_set1_pd(1.0);
1305 __m128d two = _mm_set1_pd(2.0);
1311 jindex = nlist->jindex;
1313 shiftidx = nlist->shift;
1315 shiftvec = fr->shift_vec[0];
1316 fshift = fr->fshift[0];
1317 facel = _mm_set1_pd(fr->epsfac);
1318 charge = mdatoms->chargeA;
1319 nvdwtype = fr->ntype;
1320 vdwparam = fr->nbfp;
1321 vdwtype = mdatoms->typeA;
1323 vftab = kernel_data->table_elec->data;
1324 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1326 /* Setup water-specific parameters */
1327 inr = nlist->iinr[0];
1328 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1329 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1330 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1331 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1333 jq0 = _mm_set1_pd(charge[inr+0]);
1334 jq1 = _mm_set1_pd(charge[inr+1]);
1335 jq2 = _mm_set1_pd(charge[inr+2]);
1336 vdwjidx0A = 2*vdwtype[inr+0];
1337 qq00 = _mm_mul_pd(iq0,jq0);
1338 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1339 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1340 qq01 = _mm_mul_pd(iq0,jq1);
1341 qq02 = _mm_mul_pd(iq0,jq2);
1342 qq10 = _mm_mul_pd(iq1,jq0);
1343 qq11 = _mm_mul_pd(iq1,jq1);
1344 qq12 = _mm_mul_pd(iq1,jq2);
1345 qq20 = _mm_mul_pd(iq2,jq0);
1346 qq21 = _mm_mul_pd(iq2,jq1);
1347 qq22 = _mm_mul_pd(iq2,jq2);
1349 /* Avoid stupid compiler warnings */
1351 j_coord_offsetA = 0;
1352 j_coord_offsetB = 0;
1357 /* Start outer loop over neighborlists */
1358 for(iidx=0; iidx<nri; iidx++)
1360 /* Load shift vector for this list */
1361 i_shift_offset = DIM*shiftidx[iidx];
1363 /* Load limits for loop over neighbors */
1364 j_index_start = jindex[iidx];
1365 j_index_end = jindex[iidx+1];
1367 /* Get outer coordinate index */
1369 i_coord_offset = DIM*inr;
1371 /* Load i particle coords and add shift vector */
1372 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1373 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1375 fix0 = _mm_setzero_pd();
1376 fiy0 = _mm_setzero_pd();
1377 fiz0 = _mm_setzero_pd();
1378 fix1 = _mm_setzero_pd();
1379 fiy1 = _mm_setzero_pd();
1380 fiz1 = _mm_setzero_pd();
1381 fix2 = _mm_setzero_pd();
1382 fiy2 = _mm_setzero_pd();
1383 fiz2 = _mm_setzero_pd();
1385 /* Start inner kernel loop */
1386 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1389 /* Get j neighbor index, and coordinate index */
1391 jnrB = jjnr[jidx+1];
1392 j_coord_offsetA = DIM*jnrA;
1393 j_coord_offsetB = DIM*jnrB;
1395 /* load j atom coordinates */
1396 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1397 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1399 /* Calculate displacement vector */
1400 dx00 = _mm_sub_pd(ix0,jx0);
1401 dy00 = _mm_sub_pd(iy0,jy0);
1402 dz00 = _mm_sub_pd(iz0,jz0);
1403 dx01 = _mm_sub_pd(ix0,jx1);
1404 dy01 = _mm_sub_pd(iy0,jy1);
1405 dz01 = _mm_sub_pd(iz0,jz1);
1406 dx02 = _mm_sub_pd(ix0,jx2);
1407 dy02 = _mm_sub_pd(iy0,jy2);
1408 dz02 = _mm_sub_pd(iz0,jz2);
1409 dx10 = _mm_sub_pd(ix1,jx0);
1410 dy10 = _mm_sub_pd(iy1,jy0);
1411 dz10 = _mm_sub_pd(iz1,jz0);
1412 dx11 = _mm_sub_pd(ix1,jx1);
1413 dy11 = _mm_sub_pd(iy1,jy1);
1414 dz11 = _mm_sub_pd(iz1,jz1);
1415 dx12 = _mm_sub_pd(ix1,jx2);
1416 dy12 = _mm_sub_pd(iy1,jy2);
1417 dz12 = _mm_sub_pd(iz1,jz2);
1418 dx20 = _mm_sub_pd(ix2,jx0);
1419 dy20 = _mm_sub_pd(iy2,jy0);
1420 dz20 = _mm_sub_pd(iz2,jz0);
1421 dx21 = _mm_sub_pd(ix2,jx1);
1422 dy21 = _mm_sub_pd(iy2,jy1);
1423 dz21 = _mm_sub_pd(iz2,jz1);
1424 dx22 = _mm_sub_pd(ix2,jx2);
1425 dy22 = _mm_sub_pd(iy2,jy2);
1426 dz22 = _mm_sub_pd(iz2,jz2);
1428 /* Calculate squared distance and things based on it */
1429 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1430 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1431 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1432 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1433 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1434 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1435 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1436 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1437 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1439 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1440 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1441 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1442 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1443 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1444 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1445 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1446 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1447 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1449 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1451 fjx0 = _mm_setzero_pd();
1452 fjy0 = _mm_setzero_pd();
1453 fjz0 = _mm_setzero_pd();
1454 fjx1 = _mm_setzero_pd();
1455 fjy1 = _mm_setzero_pd();
1456 fjz1 = _mm_setzero_pd();
1457 fjx2 = _mm_setzero_pd();
1458 fjy2 = _mm_setzero_pd();
1459 fjz2 = _mm_setzero_pd();
1461 /**************************
1462 * CALCULATE INTERACTIONS *
1463 **************************/
1465 r00 = _mm_mul_pd(rsq00,rinv00);
1467 /* Calculate table index by multiplying r with table scale and truncate to integer */
1468 rt = _mm_mul_pd(r00,vftabscale);
1469 vfitab = _mm_cvttpd_epi32(rt);
1470 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1471 vfitab = _mm_slli_epi32(vfitab,2);
1473 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1474 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1475 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1476 GMX_MM_TRANSPOSE2_PD(Y,F);
1477 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1478 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1479 GMX_MM_TRANSPOSE2_PD(G,H);
1480 Heps = _mm_mul_pd(vfeps,H);
1481 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1482 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1483 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1485 /* LENNARD-JONES DISPERSION/REPULSION */
1487 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1488 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1490 fscal = _mm_add_pd(felec,fvdw);
1492 /* Calculate temporary vectorial force */
1493 tx = _mm_mul_pd(fscal,dx00);
1494 ty = _mm_mul_pd(fscal,dy00);
1495 tz = _mm_mul_pd(fscal,dz00);
1497 /* Update vectorial force */
1498 fix0 = _mm_add_pd(fix0,tx);
1499 fiy0 = _mm_add_pd(fiy0,ty);
1500 fiz0 = _mm_add_pd(fiz0,tz);
1502 fjx0 = _mm_add_pd(fjx0,tx);
1503 fjy0 = _mm_add_pd(fjy0,ty);
1504 fjz0 = _mm_add_pd(fjz0,tz);
1506 /**************************
1507 * CALCULATE INTERACTIONS *
1508 **************************/
1510 r01 = _mm_mul_pd(rsq01,rinv01);
1512 /* Calculate table index by multiplying r with table scale and truncate to integer */
1513 rt = _mm_mul_pd(r01,vftabscale);
1514 vfitab = _mm_cvttpd_epi32(rt);
1515 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1516 vfitab = _mm_slli_epi32(vfitab,2);
1518 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1519 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1520 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1521 GMX_MM_TRANSPOSE2_PD(Y,F);
1522 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1523 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1524 GMX_MM_TRANSPOSE2_PD(G,H);
1525 Heps = _mm_mul_pd(vfeps,H);
1526 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1527 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1528 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1532 /* Calculate temporary vectorial force */
1533 tx = _mm_mul_pd(fscal,dx01);
1534 ty = _mm_mul_pd(fscal,dy01);
1535 tz = _mm_mul_pd(fscal,dz01);
1537 /* Update vectorial force */
1538 fix0 = _mm_add_pd(fix0,tx);
1539 fiy0 = _mm_add_pd(fiy0,ty);
1540 fiz0 = _mm_add_pd(fiz0,tz);
1542 fjx1 = _mm_add_pd(fjx1,tx);
1543 fjy1 = _mm_add_pd(fjy1,ty);
1544 fjz1 = _mm_add_pd(fjz1,tz);
1546 /**************************
1547 * CALCULATE INTERACTIONS *
1548 **************************/
1550 r02 = _mm_mul_pd(rsq02,rinv02);
1552 /* Calculate table index by multiplying r with table scale and truncate to integer */
1553 rt = _mm_mul_pd(r02,vftabscale);
1554 vfitab = _mm_cvttpd_epi32(rt);
1555 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1556 vfitab = _mm_slli_epi32(vfitab,2);
1558 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1559 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1560 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1561 GMX_MM_TRANSPOSE2_PD(Y,F);
1562 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1563 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1564 GMX_MM_TRANSPOSE2_PD(G,H);
1565 Heps = _mm_mul_pd(vfeps,H);
1566 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1567 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1568 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1572 /* Calculate temporary vectorial force */
1573 tx = _mm_mul_pd(fscal,dx02);
1574 ty = _mm_mul_pd(fscal,dy02);
1575 tz = _mm_mul_pd(fscal,dz02);
1577 /* Update vectorial force */
1578 fix0 = _mm_add_pd(fix0,tx);
1579 fiy0 = _mm_add_pd(fiy0,ty);
1580 fiz0 = _mm_add_pd(fiz0,tz);
1582 fjx2 = _mm_add_pd(fjx2,tx);
1583 fjy2 = _mm_add_pd(fjy2,ty);
1584 fjz2 = _mm_add_pd(fjz2,tz);
1586 /**************************
1587 * CALCULATE INTERACTIONS *
1588 **************************/
1590 r10 = _mm_mul_pd(rsq10,rinv10);
1592 /* Calculate table index by multiplying r with table scale and truncate to integer */
1593 rt = _mm_mul_pd(r10,vftabscale);
1594 vfitab = _mm_cvttpd_epi32(rt);
1595 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1596 vfitab = _mm_slli_epi32(vfitab,2);
1598 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1599 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1600 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1601 GMX_MM_TRANSPOSE2_PD(Y,F);
1602 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1603 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1604 GMX_MM_TRANSPOSE2_PD(G,H);
1605 Heps = _mm_mul_pd(vfeps,H);
1606 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1607 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1608 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1612 /* Calculate temporary vectorial force */
1613 tx = _mm_mul_pd(fscal,dx10);
1614 ty = _mm_mul_pd(fscal,dy10);
1615 tz = _mm_mul_pd(fscal,dz10);
1617 /* Update vectorial force */
1618 fix1 = _mm_add_pd(fix1,tx);
1619 fiy1 = _mm_add_pd(fiy1,ty);
1620 fiz1 = _mm_add_pd(fiz1,tz);
1622 fjx0 = _mm_add_pd(fjx0,tx);
1623 fjy0 = _mm_add_pd(fjy0,ty);
1624 fjz0 = _mm_add_pd(fjz0,tz);
1626 /**************************
1627 * CALCULATE INTERACTIONS *
1628 **************************/
1630 r11 = _mm_mul_pd(rsq11,rinv11);
1632 /* Calculate table index by multiplying r with table scale and truncate to integer */
1633 rt = _mm_mul_pd(r11,vftabscale);
1634 vfitab = _mm_cvttpd_epi32(rt);
1635 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1636 vfitab = _mm_slli_epi32(vfitab,2);
1638 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1639 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1640 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1641 GMX_MM_TRANSPOSE2_PD(Y,F);
1642 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1643 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1644 GMX_MM_TRANSPOSE2_PD(G,H);
1645 Heps = _mm_mul_pd(vfeps,H);
1646 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1647 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1648 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1652 /* Calculate temporary vectorial force */
1653 tx = _mm_mul_pd(fscal,dx11);
1654 ty = _mm_mul_pd(fscal,dy11);
1655 tz = _mm_mul_pd(fscal,dz11);
1657 /* Update vectorial force */
1658 fix1 = _mm_add_pd(fix1,tx);
1659 fiy1 = _mm_add_pd(fiy1,ty);
1660 fiz1 = _mm_add_pd(fiz1,tz);
1662 fjx1 = _mm_add_pd(fjx1,tx);
1663 fjy1 = _mm_add_pd(fjy1,ty);
1664 fjz1 = _mm_add_pd(fjz1,tz);
1666 /**************************
1667 * CALCULATE INTERACTIONS *
1668 **************************/
1670 r12 = _mm_mul_pd(rsq12,rinv12);
1672 /* Calculate table index by multiplying r with table scale and truncate to integer */
1673 rt = _mm_mul_pd(r12,vftabscale);
1674 vfitab = _mm_cvttpd_epi32(rt);
1675 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1676 vfitab = _mm_slli_epi32(vfitab,2);
1678 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1679 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1680 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1681 GMX_MM_TRANSPOSE2_PD(Y,F);
1682 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1683 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1684 GMX_MM_TRANSPOSE2_PD(G,H);
1685 Heps = _mm_mul_pd(vfeps,H);
1686 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1687 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1688 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1692 /* Calculate temporary vectorial force */
1693 tx = _mm_mul_pd(fscal,dx12);
1694 ty = _mm_mul_pd(fscal,dy12);
1695 tz = _mm_mul_pd(fscal,dz12);
1697 /* Update vectorial force */
1698 fix1 = _mm_add_pd(fix1,tx);
1699 fiy1 = _mm_add_pd(fiy1,ty);
1700 fiz1 = _mm_add_pd(fiz1,tz);
1702 fjx2 = _mm_add_pd(fjx2,tx);
1703 fjy2 = _mm_add_pd(fjy2,ty);
1704 fjz2 = _mm_add_pd(fjz2,tz);
1706 /**************************
1707 * CALCULATE INTERACTIONS *
1708 **************************/
1710 r20 = _mm_mul_pd(rsq20,rinv20);
1712 /* Calculate table index by multiplying r with table scale and truncate to integer */
1713 rt = _mm_mul_pd(r20,vftabscale);
1714 vfitab = _mm_cvttpd_epi32(rt);
1715 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1716 vfitab = _mm_slli_epi32(vfitab,2);
1718 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1719 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1720 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1721 GMX_MM_TRANSPOSE2_PD(Y,F);
1722 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1723 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1724 GMX_MM_TRANSPOSE2_PD(G,H);
1725 Heps = _mm_mul_pd(vfeps,H);
1726 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1727 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1728 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1732 /* Calculate temporary vectorial force */
1733 tx = _mm_mul_pd(fscal,dx20);
1734 ty = _mm_mul_pd(fscal,dy20);
1735 tz = _mm_mul_pd(fscal,dz20);
1737 /* Update vectorial force */
1738 fix2 = _mm_add_pd(fix2,tx);
1739 fiy2 = _mm_add_pd(fiy2,ty);
1740 fiz2 = _mm_add_pd(fiz2,tz);
1742 fjx0 = _mm_add_pd(fjx0,tx);
1743 fjy0 = _mm_add_pd(fjy0,ty);
1744 fjz0 = _mm_add_pd(fjz0,tz);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 r21 = _mm_mul_pd(rsq21,rinv21);
1752 /* Calculate table index by multiplying r with table scale and truncate to integer */
1753 rt = _mm_mul_pd(r21,vftabscale);
1754 vfitab = _mm_cvttpd_epi32(rt);
1755 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1756 vfitab = _mm_slli_epi32(vfitab,2);
1758 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1759 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1760 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1761 GMX_MM_TRANSPOSE2_PD(Y,F);
1762 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1763 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1764 GMX_MM_TRANSPOSE2_PD(G,H);
1765 Heps = _mm_mul_pd(vfeps,H);
1766 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1767 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1768 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1772 /* Calculate temporary vectorial force */
1773 tx = _mm_mul_pd(fscal,dx21);
1774 ty = _mm_mul_pd(fscal,dy21);
1775 tz = _mm_mul_pd(fscal,dz21);
1777 /* Update vectorial force */
1778 fix2 = _mm_add_pd(fix2,tx);
1779 fiy2 = _mm_add_pd(fiy2,ty);
1780 fiz2 = _mm_add_pd(fiz2,tz);
1782 fjx1 = _mm_add_pd(fjx1,tx);
1783 fjy1 = _mm_add_pd(fjy1,ty);
1784 fjz1 = _mm_add_pd(fjz1,tz);
1786 /**************************
1787 * CALCULATE INTERACTIONS *
1788 **************************/
1790 r22 = _mm_mul_pd(rsq22,rinv22);
1792 /* Calculate table index by multiplying r with table scale and truncate to integer */
1793 rt = _mm_mul_pd(r22,vftabscale);
1794 vfitab = _mm_cvttpd_epi32(rt);
1795 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1796 vfitab = _mm_slli_epi32(vfitab,2);
1798 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1799 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1800 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1801 GMX_MM_TRANSPOSE2_PD(Y,F);
1802 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1803 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1804 GMX_MM_TRANSPOSE2_PD(G,H);
1805 Heps = _mm_mul_pd(vfeps,H);
1806 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1807 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1808 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1812 /* Calculate temporary vectorial force */
1813 tx = _mm_mul_pd(fscal,dx22);
1814 ty = _mm_mul_pd(fscal,dy22);
1815 tz = _mm_mul_pd(fscal,dz22);
1817 /* Update vectorial force */
1818 fix2 = _mm_add_pd(fix2,tx);
1819 fiy2 = _mm_add_pd(fiy2,ty);
1820 fiz2 = _mm_add_pd(fiz2,tz);
1822 fjx2 = _mm_add_pd(fjx2,tx);
1823 fjy2 = _mm_add_pd(fjy2,ty);
1824 fjz2 = _mm_add_pd(fjz2,tz);
1826 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1828 /* Inner loop uses 359 flops */
1831 if(jidx<j_index_end)
1835 j_coord_offsetA = DIM*jnrA;
1837 /* load j atom coordinates */
1838 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1839 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1841 /* Calculate displacement vector */
1842 dx00 = _mm_sub_pd(ix0,jx0);
1843 dy00 = _mm_sub_pd(iy0,jy0);
1844 dz00 = _mm_sub_pd(iz0,jz0);
1845 dx01 = _mm_sub_pd(ix0,jx1);
1846 dy01 = _mm_sub_pd(iy0,jy1);
1847 dz01 = _mm_sub_pd(iz0,jz1);
1848 dx02 = _mm_sub_pd(ix0,jx2);
1849 dy02 = _mm_sub_pd(iy0,jy2);
1850 dz02 = _mm_sub_pd(iz0,jz2);
1851 dx10 = _mm_sub_pd(ix1,jx0);
1852 dy10 = _mm_sub_pd(iy1,jy0);
1853 dz10 = _mm_sub_pd(iz1,jz0);
1854 dx11 = _mm_sub_pd(ix1,jx1);
1855 dy11 = _mm_sub_pd(iy1,jy1);
1856 dz11 = _mm_sub_pd(iz1,jz1);
1857 dx12 = _mm_sub_pd(ix1,jx2);
1858 dy12 = _mm_sub_pd(iy1,jy2);
1859 dz12 = _mm_sub_pd(iz1,jz2);
1860 dx20 = _mm_sub_pd(ix2,jx0);
1861 dy20 = _mm_sub_pd(iy2,jy0);
1862 dz20 = _mm_sub_pd(iz2,jz0);
1863 dx21 = _mm_sub_pd(ix2,jx1);
1864 dy21 = _mm_sub_pd(iy2,jy1);
1865 dz21 = _mm_sub_pd(iz2,jz1);
1866 dx22 = _mm_sub_pd(ix2,jx2);
1867 dy22 = _mm_sub_pd(iy2,jy2);
1868 dz22 = _mm_sub_pd(iz2,jz2);
1870 /* Calculate squared distance and things based on it */
1871 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1872 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1873 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1874 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1875 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1876 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1877 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1878 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1879 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1881 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1882 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1883 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1884 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1885 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1886 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1887 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1888 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1889 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1891 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1893 fjx0 = _mm_setzero_pd();
1894 fjy0 = _mm_setzero_pd();
1895 fjz0 = _mm_setzero_pd();
1896 fjx1 = _mm_setzero_pd();
1897 fjy1 = _mm_setzero_pd();
1898 fjz1 = _mm_setzero_pd();
1899 fjx2 = _mm_setzero_pd();
1900 fjy2 = _mm_setzero_pd();
1901 fjz2 = _mm_setzero_pd();
1903 /**************************
1904 * CALCULATE INTERACTIONS *
1905 **************************/
1907 r00 = _mm_mul_pd(rsq00,rinv00);
1909 /* Calculate table index by multiplying r with table scale and truncate to integer */
1910 rt = _mm_mul_pd(r00,vftabscale);
1911 vfitab = _mm_cvttpd_epi32(rt);
1912 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1913 vfitab = _mm_slli_epi32(vfitab,2);
1915 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1916 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1917 F = _mm_setzero_pd();
1918 GMX_MM_TRANSPOSE2_PD(Y,F);
1919 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1920 H = _mm_setzero_pd();
1921 GMX_MM_TRANSPOSE2_PD(G,H);
1922 Heps = _mm_mul_pd(vfeps,H);
1923 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1924 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1925 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1927 /* LENNARD-JONES DISPERSION/REPULSION */
1929 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1930 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1932 fscal = _mm_add_pd(felec,fvdw);
1934 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1936 /* Calculate temporary vectorial force */
1937 tx = _mm_mul_pd(fscal,dx00);
1938 ty = _mm_mul_pd(fscal,dy00);
1939 tz = _mm_mul_pd(fscal,dz00);
1941 /* Update vectorial force */
1942 fix0 = _mm_add_pd(fix0,tx);
1943 fiy0 = _mm_add_pd(fiy0,ty);
1944 fiz0 = _mm_add_pd(fiz0,tz);
1946 fjx0 = _mm_add_pd(fjx0,tx);
1947 fjy0 = _mm_add_pd(fjy0,ty);
1948 fjz0 = _mm_add_pd(fjz0,tz);
1950 /**************************
1951 * CALCULATE INTERACTIONS *
1952 **************************/
1954 r01 = _mm_mul_pd(rsq01,rinv01);
1956 /* Calculate table index by multiplying r with table scale and truncate to integer */
1957 rt = _mm_mul_pd(r01,vftabscale);
1958 vfitab = _mm_cvttpd_epi32(rt);
1959 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1960 vfitab = _mm_slli_epi32(vfitab,2);
1962 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1963 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1964 F = _mm_setzero_pd();
1965 GMX_MM_TRANSPOSE2_PD(Y,F);
1966 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1967 H = _mm_setzero_pd();
1968 GMX_MM_TRANSPOSE2_PD(G,H);
1969 Heps = _mm_mul_pd(vfeps,H);
1970 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1971 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1972 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1976 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1978 /* Calculate temporary vectorial force */
1979 tx = _mm_mul_pd(fscal,dx01);
1980 ty = _mm_mul_pd(fscal,dy01);
1981 tz = _mm_mul_pd(fscal,dz01);
1983 /* Update vectorial force */
1984 fix0 = _mm_add_pd(fix0,tx);
1985 fiy0 = _mm_add_pd(fiy0,ty);
1986 fiz0 = _mm_add_pd(fiz0,tz);
1988 fjx1 = _mm_add_pd(fjx1,tx);
1989 fjy1 = _mm_add_pd(fjy1,ty);
1990 fjz1 = _mm_add_pd(fjz1,tz);
1992 /**************************
1993 * CALCULATE INTERACTIONS *
1994 **************************/
1996 r02 = _mm_mul_pd(rsq02,rinv02);
1998 /* Calculate table index by multiplying r with table scale and truncate to integer */
1999 rt = _mm_mul_pd(r02,vftabscale);
2000 vfitab = _mm_cvttpd_epi32(rt);
2001 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2002 vfitab = _mm_slli_epi32(vfitab,2);
2004 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2005 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2006 F = _mm_setzero_pd();
2007 GMX_MM_TRANSPOSE2_PD(Y,F);
2008 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2009 H = _mm_setzero_pd();
2010 GMX_MM_TRANSPOSE2_PD(G,H);
2011 Heps = _mm_mul_pd(vfeps,H);
2012 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2013 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2014 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2018 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2020 /* Calculate temporary vectorial force */
2021 tx = _mm_mul_pd(fscal,dx02);
2022 ty = _mm_mul_pd(fscal,dy02);
2023 tz = _mm_mul_pd(fscal,dz02);
2025 /* Update vectorial force */
2026 fix0 = _mm_add_pd(fix0,tx);
2027 fiy0 = _mm_add_pd(fiy0,ty);
2028 fiz0 = _mm_add_pd(fiz0,tz);
2030 fjx2 = _mm_add_pd(fjx2,tx);
2031 fjy2 = _mm_add_pd(fjy2,ty);
2032 fjz2 = _mm_add_pd(fjz2,tz);
2034 /**************************
2035 * CALCULATE INTERACTIONS *
2036 **************************/
2038 r10 = _mm_mul_pd(rsq10,rinv10);
2040 /* Calculate table index by multiplying r with table scale and truncate to integer */
2041 rt = _mm_mul_pd(r10,vftabscale);
2042 vfitab = _mm_cvttpd_epi32(rt);
2043 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2044 vfitab = _mm_slli_epi32(vfitab,2);
2046 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2047 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2048 F = _mm_setzero_pd();
2049 GMX_MM_TRANSPOSE2_PD(Y,F);
2050 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2051 H = _mm_setzero_pd();
2052 GMX_MM_TRANSPOSE2_PD(G,H);
2053 Heps = _mm_mul_pd(vfeps,H);
2054 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2055 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2056 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2060 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2062 /* Calculate temporary vectorial force */
2063 tx = _mm_mul_pd(fscal,dx10);
2064 ty = _mm_mul_pd(fscal,dy10);
2065 tz = _mm_mul_pd(fscal,dz10);
2067 /* Update vectorial force */
2068 fix1 = _mm_add_pd(fix1,tx);
2069 fiy1 = _mm_add_pd(fiy1,ty);
2070 fiz1 = _mm_add_pd(fiz1,tz);
2072 fjx0 = _mm_add_pd(fjx0,tx);
2073 fjy0 = _mm_add_pd(fjy0,ty);
2074 fjz0 = _mm_add_pd(fjz0,tz);
2076 /**************************
2077 * CALCULATE INTERACTIONS *
2078 **************************/
2080 r11 = _mm_mul_pd(rsq11,rinv11);
2082 /* Calculate table index by multiplying r with table scale and truncate to integer */
2083 rt = _mm_mul_pd(r11,vftabscale);
2084 vfitab = _mm_cvttpd_epi32(rt);
2085 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2086 vfitab = _mm_slli_epi32(vfitab,2);
2088 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2089 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2090 F = _mm_setzero_pd();
2091 GMX_MM_TRANSPOSE2_PD(Y,F);
2092 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2093 H = _mm_setzero_pd();
2094 GMX_MM_TRANSPOSE2_PD(G,H);
2095 Heps = _mm_mul_pd(vfeps,H);
2096 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2097 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2098 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2102 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2104 /* Calculate temporary vectorial force */
2105 tx = _mm_mul_pd(fscal,dx11);
2106 ty = _mm_mul_pd(fscal,dy11);
2107 tz = _mm_mul_pd(fscal,dz11);
2109 /* Update vectorial force */
2110 fix1 = _mm_add_pd(fix1,tx);
2111 fiy1 = _mm_add_pd(fiy1,ty);
2112 fiz1 = _mm_add_pd(fiz1,tz);
2114 fjx1 = _mm_add_pd(fjx1,tx);
2115 fjy1 = _mm_add_pd(fjy1,ty);
2116 fjz1 = _mm_add_pd(fjz1,tz);
2118 /**************************
2119 * CALCULATE INTERACTIONS *
2120 **************************/
2122 r12 = _mm_mul_pd(rsq12,rinv12);
2124 /* Calculate table index by multiplying r with table scale and truncate to integer */
2125 rt = _mm_mul_pd(r12,vftabscale);
2126 vfitab = _mm_cvttpd_epi32(rt);
2127 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2128 vfitab = _mm_slli_epi32(vfitab,2);
2130 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2131 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2132 F = _mm_setzero_pd();
2133 GMX_MM_TRANSPOSE2_PD(Y,F);
2134 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2135 H = _mm_setzero_pd();
2136 GMX_MM_TRANSPOSE2_PD(G,H);
2137 Heps = _mm_mul_pd(vfeps,H);
2138 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2139 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2140 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2144 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2146 /* Calculate temporary vectorial force */
2147 tx = _mm_mul_pd(fscal,dx12);
2148 ty = _mm_mul_pd(fscal,dy12);
2149 tz = _mm_mul_pd(fscal,dz12);
2151 /* Update vectorial force */
2152 fix1 = _mm_add_pd(fix1,tx);
2153 fiy1 = _mm_add_pd(fiy1,ty);
2154 fiz1 = _mm_add_pd(fiz1,tz);
2156 fjx2 = _mm_add_pd(fjx2,tx);
2157 fjy2 = _mm_add_pd(fjy2,ty);
2158 fjz2 = _mm_add_pd(fjz2,tz);
2160 /**************************
2161 * CALCULATE INTERACTIONS *
2162 **************************/
2164 r20 = _mm_mul_pd(rsq20,rinv20);
2166 /* Calculate table index by multiplying r with table scale and truncate to integer */
2167 rt = _mm_mul_pd(r20,vftabscale);
2168 vfitab = _mm_cvttpd_epi32(rt);
2169 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2170 vfitab = _mm_slli_epi32(vfitab,2);
2172 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2173 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2174 F = _mm_setzero_pd();
2175 GMX_MM_TRANSPOSE2_PD(Y,F);
2176 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2177 H = _mm_setzero_pd();
2178 GMX_MM_TRANSPOSE2_PD(G,H);
2179 Heps = _mm_mul_pd(vfeps,H);
2180 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2181 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2182 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2186 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2188 /* Calculate temporary vectorial force */
2189 tx = _mm_mul_pd(fscal,dx20);
2190 ty = _mm_mul_pd(fscal,dy20);
2191 tz = _mm_mul_pd(fscal,dz20);
2193 /* Update vectorial force */
2194 fix2 = _mm_add_pd(fix2,tx);
2195 fiy2 = _mm_add_pd(fiy2,ty);
2196 fiz2 = _mm_add_pd(fiz2,tz);
2198 fjx0 = _mm_add_pd(fjx0,tx);
2199 fjy0 = _mm_add_pd(fjy0,ty);
2200 fjz0 = _mm_add_pd(fjz0,tz);
2202 /**************************
2203 * CALCULATE INTERACTIONS *
2204 **************************/
2206 r21 = _mm_mul_pd(rsq21,rinv21);
2208 /* Calculate table index by multiplying r with table scale and truncate to integer */
2209 rt = _mm_mul_pd(r21,vftabscale);
2210 vfitab = _mm_cvttpd_epi32(rt);
2211 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2212 vfitab = _mm_slli_epi32(vfitab,2);
2214 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2215 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2216 F = _mm_setzero_pd();
2217 GMX_MM_TRANSPOSE2_PD(Y,F);
2218 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2219 H = _mm_setzero_pd();
2220 GMX_MM_TRANSPOSE2_PD(G,H);
2221 Heps = _mm_mul_pd(vfeps,H);
2222 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2223 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2224 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2228 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2230 /* Calculate temporary vectorial force */
2231 tx = _mm_mul_pd(fscal,dx21);
2232 ty = _mm_mul_pd(fscal,dy21);
2233 tz = _mm_mul_pd(fscal,dz21);
2235 /* Update vectorial force */
2236 fix2 = _mm_add_pd(fix2,tx);
2237 fiy2 = _mm_add_pd(fiy2,ty);
2238 fiz2 = _mm_add_pd(fiz2,tz);
2240 fjx1 = _mm_add_pd(fjx1,tx);
2241 fjy1 = _mm_add_pd(fjy1,ty);
2242 fjz1 = _mm_add_pd(fjz1,tz);
2244 /**************************
2245 * CALCULATE INTERACTIONS *
2246 **************************/
2248 r22 = _mm_mul_pd(rsq22,rinv22);
2250 /* Calculate table index by multiplying r with table scale and truncate to integer */
2251 rt = _mm_mul_pd(r22,vftabscale);
2252 vfitab = _mm_cvttpd_epi32(rt);
2253 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2254 vfitab = _mm_slli_epi32(vfitab,2);
2256 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2257 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2258 F = _mm_setzero_pd();
2259 GMX_MM_TRANSPOSE2_PD(Y,F);
2260 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2261 H = _mm_setzero_pd();
2262 GMX_MM_TRANSPOSE2_PD(G,H);
2263 Heps = _mm_mul_pd(vfeps,H);
2264 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2265 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2266 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2270 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2272 /* Calculate temporary vectorial force */
2273 tx = _mm_mul_pd(fscal,dx22);
2274 ty = _mm_mul_pd(fscal,dy22);
2275 tz = _mm_mul_pd(fscal,dz22);
2277 /* Update vectorial force */
2278 fix2 = _mm_add_pd(fix2,tx);
2279 fiy2 = _mm_add_pd(fiy2,ty);
2280 fiz2 = _mm_add_pd(fiz2,tz);
2282 fjx2 = _mm_add_pd(fjx2,tx);
2283 fjy2 = _mm_add_pd(fjy2,ty);
2284 fjz2 = _mm_add_pd(fjz2,tz);
2286 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2288 /* Inner loop uses 359 flops */
2291 /* End of innermost loop */
2293 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2294 f+i_coord_offset,fshift+i_shift_offset);
2296 /* Increment number of inner iterations */
2297 inneriter += j_index_end - j_index_start;
2299 /* Outer loop uses 18 flops */
2302 /* Increment number of outer iterations */
2305 /* Update outer/inner flops */
2307 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);