2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 int vdwjidx0A,vdwjidx0B;
87 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88 int vdwjidx1A,vdwjidx1B;
89 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B;
91 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
104 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
110 __m128i ifour = _mm_set1_epi32(4);
111 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
113 __m128d dummy_mask,cutoff_mask;
114 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one = _mm_set1_pd(1.0);
116 __m128d two = _mm_set1_pd(2.0);
122 jindex = nlist->jindex;
124 shiftidx = nlist->shift;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm_set1_pd(fr->epsfac);
129 charge = mdatoms->chargeA;
130 nvdwtype = fr->ntype;
132 vdwtype = mdatoms->typeA;
134 vftab = kernel_data->table_elec_vdw->data;
135 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
137 /* Setup water-specific parameters */
138 inr = nlist->iinr[0];
139 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
140 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
141 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
142 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
144 jq0 = _mm_set1_pd(charge[inr+0]);
145 jq1 = _mm_set1_pd(charge[inr+1]);
146 jq2 = _mm_set1_pd(charge[inr+2]);
147 vdwjidx0A = 2*vdwtype[inr+0];
148 qq00 = _mm_mul_pd(iq0,jq0);
149 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
150 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
151 qq01 = _mm_mul_pd(iq0,jq1);
152 qq02 = _mm_mul_pd(iq0,jq2);
153 qq10 = _mm_mul_pd(iq1,jq0);
154 qq11 = _mm_mul_pd(iq1,jq1);
155 qq12 = _mm_mul_pd(iq1,jq2);
156 qq20 = _mm_mul_pd(iq2,jq0);
157 qq21 = _mm_mul_pd(iq2,jq1);
158 qq22 = _mm_mul_pd(iq2,jq2);
160 /* Avoid stupid compiler warnings */
168 /* Start outer loop over neighborlists */
169 for(iidx=0; iidx<nri; iidx++)
171 /* Load shift vector for this list */
172 i_shift_offset = DIM*shiftidx[iidx];
174 /* Load limits for loop over neighbors */
175 j_index_start = jindex[iidx];
176 j_index_end = jindex[iidx+1];
178 /* Get outer coordinate index */
180 i_coord_offset = DIM*inr;
182 /* Load i particle coords and add shift vector */
183 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
184 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
186 fix0 = _mm_setzero_pd();
187 fiy0 = _mm_setzero_pd();
188 fiz0 = _mm_setzero_pd();
189 fix1 = _mm_setzero_pd();
190 fiy1 = _mm_setzero_pd();
191 fiz1 = _mm_setzero_pd();
192 fix2 = _mm_setzero_pd();
193 fiy2 = _mm_setzero_pd();
194 fiz2 = _mm_setzero_pd();
196 /* Reset potential sums */
197 velecsum = _mm_setzero_pd();
198 vvdwsum = _mm_setzero_pd();
200 /* Start inner kernel loop */
201 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
204 /* Get j neighbor index, and coordinate index */
207 j_coord_offsetA = DIM*jnrA;
208 j_coord_offsetB = DIM*jnrB;
210 /* load j atom coordinates */
211 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
212 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
214 /* Calculate displacement vector */
215 dx00 = _mm_sub_pd(ix0,jx0);
216 dy00 = _mm_sub_pd(iy0,jy0);
217 dz00 = _mm_sub_pd(iz0,jz0);
218 dx01 = _mm_sub_pd(ix0,jx1);
219 dy01 = _mm_sub_pd(iy0,jy1);
220 dz01 = _mm_sub_pd(iz0,jz1);
221 dx02 = _mm_sub_pd(ix0,jx2);
222 dy02 = _mm_sub_pd(iy0,jy2);
223 dz02 = _mm_sub_pd(iz0,jz2);
224 dx10 = _mm_sub_pd(ix1,jx0);
225 dy10 = _mm_sub_pd(iy1,jy0);
226 dz10 = _mm_sub_pd(iz1,jz0);
227 dx11 = _mm_sub_pd(ix1,jx1);
228 dy11 = _mm_sub_pd(iy1,jy1);
229 dz11 = _mm_sub_pd(iz1,jz1);
230 dx12 = _mm_sub_pd(ix1,jx2);
231 dy12 = _mm_sub_pd(iy1,jy2);
232 dz12 = _mm_sub_pd(iz1,jz2);
233 dx20 = _mm_sub_pd(ix2,jx0);
234 dy20 = _mm_sub_pd(iy2,jy0);
235 dz20 = _mm_sub_pd(iz2,jz0);
236 dx21 = _mm_sub_pd(ix2,jx1);
237 dy21 = _mm_sub_pd(iy2,jy1);
238 dz21 = _mm_sub_pd(iz2,jz1);
239 dx22 = _mm_sub_pd(ix2,jx2);
240 dy22 = _mm_sub_pd(iy2,jy2);
241 dz22 = _mm_sub_pd(iz2,jz2);
243 /* Calculate squared distance and things based on it */
244 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
245 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
246 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
247 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
248 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
249 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
250 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
251 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
252 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
254 rinv00 = gmx_mm_invsqrt_pd(rsq00);
255 rinv01 = gmx_mm_invsqrt_pd(rsq01);
256 rinv02 = gmx_mm_invsqrt_pd(rsq02);
257 rinv10 = gmx_mm_invsqrt_pd(rsq10);
258 rinv11 = gmx_mm_invsqrt_pd(rsq11);
259 rinv12 = gmx_mm_invsqrt_pd(rsq12);
260 rinv20 = gmx_mm_invsqrt_pd(rsq20);
261 rinv21 = gmx_mm_invsqrt_pd(rsq21);
262 rinv22 = gmx_mm_invsqrt_pd(rsq22);
264 fjx0 = _mm_setzero_pd();
265 fjy0 = _mm_setzero_pd();
266 fjz0 = _mm_setzero_pd();
267 fjx1 = _mm_setzero_pd();
268 fjy1 = _mm_setzero_pd();
269 fjz1 = _mm_setzero_pd();
270 fjx2 = _mm_setzero_pd();
271 fjy2 = _mm_setzero_pd();
272 fjz2 = _mm_setzero_pd();
274 /**************************
275 * CALCULATE INTERACTIONS *
276 **************************/
278 r00 = _mm_mul_pd(rsq00,rinv00);
280 /* Calculate table index by multiplying r with table scale and truncate to integer */
281 rt = _mm_mul_pd(r00,vftabscale);
282 vfitab = _mm_cvttpd_epi32(rt);
284 vfeps = _mm_frcz_pd(rt);
286 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
288 twovfeps = _mm_add_pd(vfeps,vfeps);
289 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
291 /* CUBIC SPLINE TABLE ELECTROSTATICS */
292 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
293 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
294 GMX_MM_TRANSPOSE2_PD(Y,F);
295 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
296 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
297 GMX_MM_TRANSPOSE2_PD(G,H);
298 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
299 VV = _mm_macc_pd(vfeps,Fp,Y);
300 velec = _mm_mul_pd(qq00,VV);
301 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
302 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
304 /* CUBIC SPLINE TABLE DISPERSION */
305 vfitab = _mm_add_epi32(vfitab,ifour);
306 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
307 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
308 GMX_MM_TRANSPOSE2_PD(Y,F);
309 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
310 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
311 GMX_MM_TRANSPOSE2_PD(G,H);
312 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
313 VV = _mm_macc_pd(vfeps,Fp,Y);
314 vvdw6 = _mm_mul_pd(c6_00,VV);
315 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
316 fvdw6 = _mm_mul_pd(c6_00,FF);
318 /* CUBIC SPLINE TABLE REPULSION */
319 vfitab = _mm_add_epi32(vfitab,ifour);
320 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
321 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
322 GMX_MM_TRANSPOSE2_PD(Y,F);
323 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
324 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
325 GMX_MM_TRANSPOSE2_PD(G,H);
326 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
327 VV = _mm_macc_pd(vfeps,Fp,Y);
328 vvdw12 = _mm_mul_pd(c12_00,VV);
329 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
330 fvdw12 = _mm_mul_pd(c12_00,FF);
331 vvdw = _mm_add_pd(vvdw12,vvdw6);
332 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
334 /* Update potential sum for this i atom from the interaction with this j atom. */
335 velecsum = _mm_add_pd(velecsum,velec);
336 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
338 fscal = _mm_add_pd(felec,fvdw);
340 /* Update vectorial force */
341 fix0 = _mm_macc_pd(dx00,fscal,fix0);
342 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
343 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
345 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
346 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
347 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
349 /**************************
350 * CALCULATE INTERACTIONS *
351 **************************/
353 r01 = _mm_mul_pd(rsq01,rinv01);
355 /* Calculate table index by multiplying r with table scale and truncate to integer */
356 rt = _mm_mul_pd(r01,vftabscale);
357 vfitab = _mm_cvttpd_epi32(rt);
359 vfeps = _mm_frcz_pd(rt);
361 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
363 twovfeps = _mm_add_pd(vfeps,vfeps);
364 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
366 /* CUBIC SPLINE TABLE ELECTROSTATICS */
367 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
368 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
369 GMX_MM_TRANSPOSE2_PD(Y,F);
370 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
371 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
372 GMX_MM_TRANSPOSE2_PD(G,H);
373 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
374 VV = _mm_macc_pd(vfeps,Fp,Y);
375 velec = _mm_mul_pd(qq01,VV);
376 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
377 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
379 /* Update potential sum for this i atom from the interaction with this j atom. */
380 velecsum = _mm_add_pd(velecsum,velec);
384 /* Update vectorial force */
385 fix0 = _mm_macc_pd(dx01,fscal,fix0);
386 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
387 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
389 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
390 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
391 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
393 /**************************
394 * CALCULATE INTERACTIONS *
395 **************************/
397 r02 = _mm_mul_pd(rsq02,rinv02);
399 /* Calculate table index by multiplying r with table scale and truncate to integer */
400 rt = _mm_mul_pd(r02,vftabscale);
401 vfitab = _mm_cvttpd_epi32(rt);
403 vfeps = _mm_frcz_pd(rt);
405 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
407 twovfeps = _mm_add_pd(vfeps,vfeps);
408 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
410 /* CUBIC SPLINE TABLE ELECTROSTATICS */
411 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
412 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
413 GMX_MM_TRANSPOSE2_PD(Y,F);
414 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
415 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
416 GMX_MM_TRANSPOSE2_PD(G,H);
417 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
418 VV = _mm_macc_pd(vfeps,Fp,Y);
419 velec = _mm_mul_pd(qq02,VV);
420 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
421 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velecsum = _mm_add_pd(velecsum,velec);
428 /* Update vectorial force */
429 fix0 = _mm_macc_pd(dx02,fscal,fix0);
430 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
431 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
433 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
434 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
435 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
437 /**************************
438 * CALCULATE INTERACTIONS *
439 **************************/
441 r10 = _mm_mul_pd(rsq10,rinv10);
443 /* Calculate table index by multiplying r with table scale and truncate to integer */
444 rt = _mm_mul_pd(r10,vftabscale);
445 vfitab = _mm_cvttpd_epi32(rt);
447 vfeps = _mm_frcz_pd(rt);
449 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
451 twovfeps = _mm_add_pd(vfeps,vfeps);
452 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
454 /* CUBIC SPLINE TABLE ELECTROSTATICS */
455 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
456 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
457 GMX_MM_TRANSPOSE2_PD(Y,F);
458 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
459 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
460 GMX_MM_TRANSPOSE2_PD(G,H);
461 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
462 VV = _mm_macc_pd(vfeps,Fp,Y);
463 velec = _mm_mul_pd(qq10,VV);
464 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
465 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velecsum = _mm_add_pd(velecsum,velec);
472 /* Update vectorial force */
473 fix1 = _mm_macc_pd(dx10,fscal,fix1);
474 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
475 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
477 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
478 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
479 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
481 /**************************
482 * CALCULATE INTERACTIONS *
483 **************************/
485 r11 = _mm_mul_pd(rsq11,rinv11);
487 /* Calculate table index by multiplying r with table scale and truncate to integer */
488 rt = _mm_mul_pd(r11,vftabscale);
489 vfitab = _mm_cvttpd_epi32(rt);
491 vfeps = _mm_frcz_pd(rt);
493 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
495 twovfeps = _mm_add_pd(vfeps,vfeps);
496 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
498 /* CUBIC SPLINE TABLE ELECTROSTATICS */
499 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
500 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
501 GMX_MM_TRANSPOSE2_PD(Y,F);
502 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
503 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
504 GMX_MM_TRANSPOSE2_PD(G,H);
505 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
506 VV = _mm_macc_pd(vfeps,Fp,Y);
507 velec = _mm_mul_pd(qq11,VV);
508 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
509 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
511 /* Update potential sum for this i atom from the interaction with this j atom. */
512 velecsum = _mm_add_pd(velecsum,velec);
516 /* Update vectorial force */
517 fix1 = _mm_macc_pd(dx11,fscal,fix1);
518 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
519 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
521 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
522 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
523 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
525 /**************************
526 * CALCULATE INTERACTIONS *
527 **************************/
529 r12 = _mm_mul_pd(rsq12,rinv12);
531 /* Calculate table index by multiplying r with table scale and truncate to integer */
532 rt = _mm_mul_pd(r12,vftabscale);
533 vfitab = _mm_cvttpd_epi32(rt);
535 vfeps = _mm_frcz_pd(rt);
537 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
539 twovfeps = _mm_add_pd(vfeps,vfeps);
540 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
542 /* CUBIC SPLINE TABLE ELECTROSTATICS */
543 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
544 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
545 GMX_MM_TRANSPOSE2_PD(Y,F);
546 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
547 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
548 GMX_MM_TRANSPOSE2_PD(G,H);
549 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
550 VV = _mm_macc_pd(vfeps,Fp,Y);
551 velec = _mm_mul_pd(qq12,VV);
552 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
553 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velecsum = _mm_add_pd(velecsum,velec);
560 /* Update vectorial force */
561 fix1 = _mm_macc_pd(dx12,fscal,fix1);
562 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
563 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
565 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
566 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
567 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
569 /**************************
570 * CALCULATE INTERACTIONS *
571 **************************/
573 r20 = _mm_mul_pd(rsq20,rinv20);
575 /* Calculate table index by multiplying r with table scale and truncate to integer */
576 rt = _mm_mul_pd(r20,vftabscale);
577 vfitab = _mm_cvttpd_epi32(rt);
579 vfeps = _mm_frcz_pd(rt);
581 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
583 twovfeps = _mm_add_pd(vfeps,vfeps);
584 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
586 /* CUBIC SPLINE TABLE ELECTROSTATICS */
587 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
588 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
589 GMX_MM_TRANSPOSE2_PD(Y,F);
590 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
591 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
592 GMX_MM_TRANSPOSE2_PD(G,H);
593 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
594 VV = _mm_macc_pd(vfeps,Fp,Y);
595 velec = _mm_mul_pd(qq20,VV);
596 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
597 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
599 /* Update potential sum for this i atom from the interaction with this j atom. */
600 velecsum = _mm_add_pd(velecsum,velec);
604 /* Update vectorial force */
605 fix2 = _mm_macc_pd(dx20,fscal,fix2);
606 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
607 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
609 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
610 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
611 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
613 /**************************
614 * CALCULATE INTERACTIONS *
615 **************************/
617 r21 = _mm_mul_pd(rsq21,rinv21);
619 /* Calculate table index by multiplying r with table scale and truncate to integer */
620 rt = _mm_mul_pd(r21,vftabscale);
621 vfitab = _mm_cvttpd_epi32(rt);
623 vfeps = _mm_frcz_pd(rt);
625 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
627 twovfeps = _mm_add_pd(vfeps,vfeps);
628 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
630 /* CUBIC SPLINE TABLE ELECTROSTATICS */
631 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
632 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
633 GMX_MM_TRANSPOSE2_PD(Y,F);
634 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
635 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
636 GMX_MM_TRANSPOSE2_PD(G,H);
637 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
638 VV = _mm_macc_pd(vfeps,Fp,Y);
639 velec = _mm_mul_pd(qq21,VV);
640 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
641 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
643 /* Update potential sum for this i atom from the interaction with this j atom. */
644 velecsum = _mm_add_pd(velecsum,velec);
648 /* Update vectorial force */
649 fix2 = _mm_macc_pd(dx21,fscal,fix2);
650 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
651 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
653 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
654 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
655 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
657 /**************************
658 * CALCULATE INTERACTIONS *
659 **************************/
661 r22 = _mm_mul_pd(rsq22,rinv22);
663 /* Calculate table index by multiplying r with table scale and truncate to integer */
664 rt = _mm_mul_pd(r22,vftabscale);
665 vfitab = _mm_cvttpd_epi32(rt);
667 vfeps = _mm_frcz_pd(rt);
669 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
671 twovfeps = _mm_add_pd(vfeps,vfeps);
672 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
674 /* CUBIC SPLINE TABLE ELECTROSTATICS */
675 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
676 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
677 GMX_MM_TRANSPOSE2_PD(Y,F);
678 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
679 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
680 GMX_MM_TRANSPOSE2_PD(G,H);
681 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
682 VV = _mm_macc_pd(vfeps,Fp,Y);
683 velec = _mm_mul_pd(qq22,VV);
684 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
685 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velecsum = _mm_add_pd(velecsum,velec);
692 /* Update vectorial force */
693 fix2 = _mm_macc_pd(dx22,fscal,fix2);
694 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
695 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
697 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
698 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
699 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
701 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
703 /* Inner loop uses 444 flops */
710 j_coord_offsetA = DIM*jnrA;
712 /* load j atom coordinates */
713 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
714 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
716 /* Calculate displacement vector */
717 dx00 = _mm_sub_pd(ix0,jx0);
718 dy00 = _mm_sub_pd(iy0,jy0);
719 dz00 = _mm_sub_pd(iz0,jz0);
720 dx01 = _mm_sub_pd(ix0,jx1);
721 dy01 = _mm_sub_pd(iy0,jy1);
722 dz01 = _mm_sub_pd(iz0,jz1);
723 dx02 = _mm_sub_pd(ix0,jx2);
724 dy02 = _mm_sub_pd(iy0,jy2);
725 dz02 = _mm_sub_pd(iz0,jz2);
726 dx10 = _mm_sub_pd(ix1,jx0);
727 dy10 = _mm_sub_pd(iy1,jy0);
728 dz10 = _mm_sub_pd(iz1,jz0);
729 dx11 = _mm_sub_pd(ix1,jx1);
730 dy11 = _mm_sub_pd(iy1,jy1);
731 dz11 = _mm_sub_pd(iz1,jz1);
732 dx12 = _mm_sub_pd(ix1,jx2);
733 dy12 = _mm_sub_pd(iy1,jy2);
734 dz12 = _mm_sub_pd(iz1,jz2);
735 dx20 = _mm_sub_pd(ix2,jx0);
736 dy20 = _mm_sub_pd(iy2,jy0);
737 dz20 = _mm_sub_pd(iz2,jz0);
738 dx21 = _mm_sub_pd(ix2,jx1);
739 dy21 = _mm_sub_pd(iy2,jy1);
740 dz21 = _mm_sub_pd(iz2,jz1);
741 dx22 = _mm_sub_pd(ix2,jx2);
742 dy22 = _mm_sub_pd(iy2,jy2);
743 dz22 = _mm_sub_pd(iz2,jz2);
745 /* Calculate squared distance and things based on it */
746 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
747 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
748 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
749 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
750 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
751 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
752 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
753 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
754 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
756 rinv00 = gmx_mm_invsqrt_pd(rsq00);
757 rinv01 = gmx_mm_invsqrt_pd(rsq01);
758 rinv02 = gmx_mm_invsqrt_pd(rsq02);
759 rinv10 = gmx_mm_invsqrt_pd(rsq10);
760 rinv11 = gmx_mm_invsqrt_pd(rsq11);
761 rinv12 = gmx_mm_invsqrt_pd(rsq12);
762 rinv20 = gmx_mm_invsqrt_pd(rsq20);
763 rinv21 = gmx_mm_invsqrt_pd(rsq21);
764 rinv22 = gmx_mm_invsqrt_pd(rsq22);
766 fjx0 = _mm_setzero_pd();
767 fjy0 = _mm_setzero_pd();
768 fjz0 = _mm_setzero_pd();
769 fjx1 = _mm_setzero_pd();
770 fjy1 = _mm_setzero_pd();
771 fjz1 = _mm_setzero_pd();
772 fjx2 = _mm_setzero_pd();
773 fjy2 = _mm_setzero_pd();
774 fjz2 = _mm_setzero_pd();
776 /**************************
777 * CALCULATE INTERACTIONS *
778 **************************/
780 r00 = _mm_mul_pd(rsq00,rinv00);
782 /* Calculate table index by multiplying r with table scale and truncate to integer */
783 rt = _mm_mul_pd(r00,vftabscale);
784 vfitab = _mm_cvttpd_epi32(rt);
786 vfeps = _mm_frcz_pd(rt);
788 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
790 twovfeps = _mm_add_pd(vfeps,vfeps);
791 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
793 /* CUBIC SPLINE TABLE ELECTROSTATICS */
794 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
795 F = _mm_setzero_pd();
796 GMX_MM_TRANSPOSE2_PD(Y,F);
797 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
798 H = _mm_setzero_pd();
799 GMX_MM_TRANSPOSE2_PD(G,H);
800 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
801 VV = _mm_macc_pd(vfeps,Fp,Y);
802 velec = _mm_mul_pd(qq00,VV);
803 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
804 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
806 /* CUBIC SPLINE TABLE DISPERSION */
807 vfitab = _mm_add_epi32(vfitab,ifour);
808 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
809 F = _mm_setzero_pd();
810 GMX_MM_TRANSPOSE2_PD(Y,F);
811 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
812 H = _mm_setzero_pd();
813 GMX_MM_TRANSPOSE2_PD(G,H);
814 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
815 VV = _mm_macc_pd(vfeps,Fp,Y);
816 vvdw6 = _mm_mul_pd(c6_00,VV);
817 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
818 fvdw6 = _mm_mul_pd(c6_00,FF);
820 /* CUBIC SPLINE TABLE REPULSION */
821 vfitab = _mm_add_epi32(vfitab,ifour);
822 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
823 F = _mm_setzero_pd();
824 GMX_MM_TRANSPOSE2_PD(Y,F);
825 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
826 H = _mm_setzero_pd();
827 GMX_MM_TRANSPOSE2_PD(G,H);
828 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
829 VV = _mm_macc_pd(vfeps,Fp,Y);
830 vvdw12 = _mm_mul_pd(c12_00,VV);
831 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
832 fvdw12 = _mm_mul_pd(c12_00,FF);
833 vvdw = _mm_add_pd(vvdw12,vvdw6);
834 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
836 /* Update potential sum for this i atom from the interaction with this j atom. */
837 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
838 velecsum = _mm_add_pd(velecsum,velec);
839 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
840 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
842 fscal = _mm_add_pd(felec,fvdw);
844 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
846 /* Update vectorial force */
847 fix0 = _mm_macc_pd(dx00,fscal,fix0);
848 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
849 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
851 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
852 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
853 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
855 /**************************
856 * CALCULATE INTERACTIONS *
857 **************************/
859 r01 = _mm_mul_pd(rsq01,rinv01);
861 /* Calculate table index by multiplying r with table scale and truncate to integer */
862 rt = _mm_mul_pd(r01,vftabscale);
863 vfitab = _mm_cvttpd_epi32(rt);
865 vfeps = _mm_frcz_pd(rt);
867 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
869 twovfeps = _mm_add_pd(vfeps,vfeps);
870 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
872 /* CUBIC SPLINE TABLE ELECTROSTATICS */
873 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
874 F = _mm_setzero_pd();
875 GMX_MM_TRANSPOSE2_PD(Y,F);
876 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
877 H = _mm_setzero_pd();
878 GMX_MM_TRANSPOSE2_PD(G,H);
879 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
880 VV = _mm_macc_pd(vfeps,Fp,Y);
881 velec = _mm_mul_pd(qq01,VV);
882 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
883 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
885 /* Update potential sum for this i atom from the interaction with this j atom. */
886 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
887 velecsum = _mm_add_pd(velecsum,velec);
891 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
893 /* Update vectorial force */
894 fix0 = _mm_macc_pd(dx01,fscal,fix0);
895 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
896 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
898 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
899 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
900 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
902 /**************************
903 * CALCULATE INTERACTIONS *
904 **************************/
906 r02 = _mm_mul_pd(rsq02,rinv02);
908 /* Calculate table index by multiplying r with table scale and truncate to integer */
909 rt = _mm_mul_pd(r02,vftabscale);
910 vfitab = _mm_cvttpd_epi32(rt);
912 vfeps = _mm_frcz_pd(rt);
914 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
916 twovfeps = _mm_add_pd(vfeps,vfeps);
917 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
919 /* CUBIC SPLINE TABLE ELECTROSTATICS */
920 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
921 F = _mm_setzero_pd();
922 GMX_MM_TRANSPOSE2_PD(Y,F);
923 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
924 H = _mm_setzero_pd();
925 GMX_MM_TRANSPOSE2_PD(G,H);
926 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
927 VV = _mm_macc_pd(vfeps,Fp,Y);
928 velec = _mm_mul_pd(qq02,VV);
929 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
930 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
932 /* Update potential sum for this i atom from the interaction with this j atom. */
933 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
934 velecsum = _mm_add_pd(velecsum,velec);
938 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
940 /* Update vectorial force */
941 fix0 = _mm_macc_pd(dx02,fscal,fix0);
942 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
943 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
945 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
946 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
947 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
949 /**************************
950 * CALCULATE INTERACTIONS *
951 **************************/
953 r10 = _mm_mul_pd(rsq10,rinv10);
955 /* Calculate table index by multiplying r with table scale and truncate to integer */
956 rt = _mm_mul_pd(r10,vftabscale);
957 vfitab = _mm_cvttpd_epi32(rt);
959 vfeps = _mm_frcz_pd(rt);
961 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
963 twovfeps = _mm_add_pd(vfeps,vfeps);
964 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
966 /* CUBIC SPLINE TABLE ELECTROSTATICS */
967 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
968 F = _mm_setzero_pd();
969 GMX_MM_TRANSPOSE2_PD(Y,F);
970 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
971 H = _mm_setzero_pd();
972 GMX_MM_TRANSPOSE2_PD(G,H);
973 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
974 VV = _mm_macc_pd(vfeps,Fp,Y);
975 velec = _mm_mul_pd(qq10,VV);
976 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
977 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
981 velecsum = _mm_add_pd(velecsum,velec);
985 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
987 /* Update vectorial force */
988 fix1 = _mm_macc_pd(dx10,fscal,fix1);
989 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
990 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
992 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
993 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
994 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
996 /**************************
997 * CALCULATE INTERACTIONS *
998 **************************/
1000 r11 = _mm_mul_pd(rsq11,rinv11);
1002 /* Calculate table index by multiplying r with table scale and truncate to integer */
1003 rt = _mm_mul_pd(r11,vftabscale);
1004 vfitab = _mm_cvttpd_epi32(rt);
1006 vfeps = _mm_frcz_pd(rt);
1008 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1010 twovfeps = _mm_add_pd(vfeps,vfeps);
1011 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1013 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1014 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1015 F = _mm_setzero_pd();
1016 GMX_MM_TRANSPOSE2_PD(Y,F);
1017 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1018 H = _mm_setzero_pd();
1019 GMX_MM_TRANSPOSE2_PD(G,H);
1020 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1021 VV = _mm_macc_pd(vfeps,Fp,Y);
1022 velec = _mm_mul_pd(qq11,VV);
1023 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1024 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1026 /* Update potential sum for this i atom from the interaction with this j atom. */
1027 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1028 velecsum = _mm_add_pd(velecsum,velec);
1032 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1034 /* Update vectorial force */
1035 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1036 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1037 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1039 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1040 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1041 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1043 /**************************
1044 * CALCULATE INTERACTIONS *
1045 **************************/
1047 r12 = _mm_mul_pd(rsq12,rinv12);
1049 /* Calculate table index by multiplying r with table scale and truncate to integer */
1050 rt = _mm_mul_pd(r12,vftabscale);
1051 vfitab = _mm_cvttpd_epi32(rt);
1053 vfeps = _mm_frcz_pd(rt);
1055 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1057 twovfeps = _mm_add_pd(vfeps,vfeps);
1058 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1060 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1061 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1062 F = _mm_setzero_pd();
1063 GMX_MM_TRANSPOSE2_PD(Y,F);
1064 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1065 H = _mm_setzero_pd();
1066 GMX_MM_TRANSPOSE2_PD(G,H);
1067 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1068 VV = _mm_macc_pd(vfeps,Fp,Y);
1069 velec = _mm_mul_pd(qq12,VV);
1070 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1071 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1073 /* Update potential sum for this i atom from the interaction with this j atom. */
1074 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1075 velecsum = _mm_add_pd(velecsum,velec);
1079 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1081 /* Update vectorial force */
1082 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1083 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1084 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1086 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1087 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1088 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1090 /**************************
1091 * CALCULATE INTERACTIONS *
1092 **************************/
1094 r20 = _mm_mul_pd(rsq20,rinv20);
1096 /* Calculate table index by multiplying r with table scale and truncate to integer */
1097 rt = _mm_mul_pd(r20,vftabscale);
1098 vfitab = _mm_cvttpd_epi32(rt);
1100 vfeps = _mm_frcz_pd(rt);
1102 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1104 twovfeps = _mm_add_pd(vfeps,vfeps);
1105 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1107 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1108 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1109 F = _mm_setzero_pd();
1110 GMX_MM_TRANSPOSE2_PD(Y,F);
1111 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1112 H = _mm_setzero_pd();
1113 GMX_MM_TRANSPOSE2_PD(G,H);
1114 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1115 VV = _mm_macc_pd(vfeps,Fp,Y);
1116 velec = _mm_mul_pd(qq20,VV);
1117 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1118 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1120 /* Update potential sum for this i atom from the interaction with this j atom. */
1121 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1122 velecsum = _mm_add_pd(velecsum,velec);
1126 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1128 /* Update vectorial force */
1129 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1130 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1131 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1133 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1134 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1135 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1137 /**************************
1138 * CALCULATE INTERACTIONS *
1139 **************************/
1141 r21 = _mm_mul_pd(rsq21,rinv21);
1143 /* Calculate table index by multiplying r with table scale and truncate to integer */
1144 rt = _mm_mul_pd(r21,vftabscale);
1145 vfitab = _mm_cvttpd_epi32(rt);
1147 vfeps = _mm_frcz_pd(rt);
1149 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1151 twovfeps = _mm_add_pd(vfeps,vfeps);
1152 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1154 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1155 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1156 F = _mm_setzero_pd();
1157 GMX_MM_TRANSPOSE2_PD(Y,F);
1158 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1159 H = _mm_setzero_pd();
1160 GMX_MM_TRANSPOSE2_PD(G,H);
1161 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1162 VV = _mm_macc_pd(vfeps,Fp,Y);
1163 velec = _mm_mul_pd(qq21,VV);
1164 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1165 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1167 /* Update potential sum for this i atom from the interaction with this j atom. */
1168 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1169 velecsum = _mm_add_pd(velecsum,velec);
1173 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1175 /* Update vectorial force */
1176 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1177 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1178 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1180 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1181 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1182 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1184 /**************************
1185 * CALCULATE INTERACTIONS *
1186 **************************/
1188 r22 = _mm_mul_pd(rsq22,rinv22);
1190 /* Calculate table index by multiplying r with table scale and truncate to integer */
1191 rt = _mm_mul_pd(r22,vftabscale);
1192 vfitab = _mm_cvttpd_epi32(rt);
1194 vfeps = _mm_frcz_pd(rt);
1196 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1198 twovfeps = _mm_add_pd(vfeps,vfeps);
1199 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1201 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1202 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1203 F = _mm_setzero_pd();
1204 GMX_MM_TRANSPOSE2_PD(Y,F);
1205 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1206 H = _mm_setzero_pd();
1207 GMX_MM_TRANSPOSE2_PD(G,H);
1208 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1209 VV = _mm_macc_pd(vfeps,Fp,Y);
1210 velec = _mm_mul_pd(qq22,VV);
1211 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1212 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1214 /* Update potential sum for this i atom from the interaction with this j atom. */
1215 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1216 velecsum = _mm_add_pd(velecsum,velec);
1220 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1222 /* Update vectorial force */
1223 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1224 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1225 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1227 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1228 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1229 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1231 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1233 /* Inner loop uses 444 flops */
1236 /* End of innermost loop */
1238 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1239 f+i_coord_offset,fshift+i_shift_offset);
1242 /* Update potential energies */
1243 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1244 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1246 /* Increment number of inner iterations */
1247 inneriter += j_index_end - j_index_start;
1249 /* Outer loop uses 20 flops */
1252 /* Increment number of outer iterations */
1255 /* Update outer/inner flops */
1257 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*444);
1260 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1261 * Electrostatics interaction: CubicSplineTable
1262 * VdW interaction: CubicSplineTable
1263 * Geometry: Water3-Water3
1264 * Calculate force/pot: Force
1267 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1268 (t_nblist * gmx_restrict nlist,
1269 rvec * gmx_restrict xx,
1270 rvec * gmx_restrict ff,
1271 t_forcerec * gmx_restrict fr,
1272 t_mdatoms * gmx_restrict mdatoms,
1273 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1274 t_nrnb * gmx_restrict nrnb)
1276 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1277 * just 0 for non-waters.
1278 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1279 * jnr indices corresponding to data put in the four positions in the SIMD register.
1281 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1282 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1284 int j_coord_offsetA,j_coord_offsetB;
1285 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1286 real rcutoff_scalar;
1287 real *shiftvec,*fshift,*x,*f;
1288 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1290 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1292 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1294 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1295 int vdwjidx0A,vdwjidx0B;
1296 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1297 int vdwjidx1A,vdwjidx1B;
1298 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1299 int vdwjidx2A,vdwjidx2B;
1300 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1301 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1302 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1303 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1304 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1305 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1306 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1307 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1308 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1309 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1310 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1313 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1316 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1317 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1319 __m128i ifour = _mm_set1_epi32(4);
1320 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1322 __m128d dummy_mask,cutoff_mask;
1323 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1324 __m128d one = _mm_set1_pd(1.0);
1325 __m128d two = _mm_set1_pd(2.0);
1331 jindex = nlist->jindex;
1333 shiftidx = nlist->shift;
1335 shiftvec = fr->shift_vec[0];
1336 fshift = fr->fshift[0];
1337 facel = _mm_set1_pd(fr->epsfac);
1338 charge = mdatoms->chargeA;
1339 nvdwtype = fr->ntype;
1340 vdwparam = fr->nbfp;
1341 vdwtype = mdatoms->typeA;
1343 vftab = kernel_data->table_elec_vdw->data;
1344 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1346 /* Setup water-specific parameters */
1347 inr = nlist->iinr[0];
1348 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1349 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1350 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1351 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1353 jq0 = _mm_set1_pd(charge[inr+0]);
1354 jq1 = _mm_set1_pd(charge[inr+1]);
1355 jq2 = _mm_set1_pd(charge[inr+2]);
1356 vdwjidx0A = 2*vdwtype[inr+0];
1357 qq00 = _mm_mul_pd(iq0,jq0);
1358 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1359 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1360 qq01 = _mm_mul_pd(iq0,jq1);
1361 qq02 = _mm_mul_pd(iq0,jq2);
1362 qq10 = _mm_mul_pd(iq1,jq0);
1363 qq11 = _mm_mul_pd(iq1,jq1);
1364 qq12 = _mm_mul_pd(iq1,jq2);
1365 qq20 = _mm_mul_pd(iq2,jq0);
1366 qq21 = _mm_mul_pd(iq2,jq1);
1367 qq22 = _mm_mul_pd(iq2,jq2);
1369 /* Avoid stupid compiler warnings */
1371 j_coord_offsetA = 0;
1372 j_coord_offsetB = 0;
1377 /* Start outer loop over neighborlists */
1378 for(iidx=0; iidx<nri; iidx++)
1380 /* Load shift vector for this list */
1381 i_shift_offset = DIM*shiftidx[iidx];
1383 /* Load limits for loop over neighbors */
1384 j_index_start = jindex[iidx];
1385 j_index_end = jindex[iidx+1];
1387 /* Get outer coordinate index */
1389 i_coord_offset = DIM*inr;
1391 /* Load i particle coords and add shift vector */
1392 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1393 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1395 fix0 = _mm_setzero_pd();
1396 fiy0 = _mm_setzero_pd();
1397 fiz0 = _mm_setzero_pd();
1398 fix1 = _mm_setzero_pd();
1399 fiy1 = _mm_setzero_pd();
1400 fiz1 = _mm_setzero_pd();
1401 fix2 = _mm_setzero_pd();
1402 fiy2 = _mm_setzero_pd();
1403 fiz2 = _mm_setzero_pd();
1405 /* Start inner kernel loop */
1406 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1409 /* Get j neighbor index, and coordinate index */
1411 jnrB = jjnr[jidx+1];
1412 j_coord_offsetA = DIM*jnrA;
1413 j_coord_offsetB = DIM*jnrB;
1415 /* load j atom coordinates */
1416 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1417 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1419 /* Calculate displacement vector */
1420 dx00 = _mm_sub_pd(ix0,jx0);
1421 dy00 = _mm_sub_pd(iy0,jy0);
1422 dz00 = _mm_sub_pd(iz0,jz0);
1423 dx01 = _mm_sub_pd(ix0,jx1);
1424 dy01 = _mm_sub_pd(iy0,jy1);
1425 dz01 = _mm_sub_pd(iz0,jz1);
1426 dx02 = _mm_sub_pd(ix0,jx2);
1427 dy02 = _mm_sub_pd(iy0,jy2);
1428 dz02 = _mm_sub_pd(iz0,jz2);
1429 dx10 = _mm_sub_pd(ix1,jx0);
1430 dy10 = _mm_sub_pd(iy1,jy0);
1431 dz10 = _mm_sub_pd(iz1,jz0);
1432 dx11 = _mm_sub_pd(ix1,jx1);
1433 dy11 = _mm_sub_pd(iy1,jy1);
1434 dz11 = _mm_sub_pd(iz1,jz1);
1435 dx12 = _mm_sub_pd(ix1,jx2);
1436 dy12 = _mm_sub_pd(iy1,jy2);
1437 dz12 = _mm_sub_pd(iz1,jz2);
1438 dx20 = _mm_sub_pd(ix2,jx0);
1439 dy20 = _mm_sub_pd(iy2,jy0);
1440 dz20 = _mm_sub_pd(iz2,jz0);
1441 dx21 = _mm_sub_pd(ix2,jx1);
1442 dy21 = _mm_sub_pd(iy2,jy1);
1443 dz21 = _mm_sub_pd(iz2,jz1);
1444 dx22 = _mm_sub_pd(ix2,jx2);
1445 dy22 = _mm_sub_pd(iy2,jy2);
1446 dz22 = _mm_sub_pd(iz2,jz2);
1448 /* Calculate squared distance and things based on it */
1449 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1450 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1451 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1452 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1453 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1454 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1455 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1456 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1457 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1459 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1460 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1461 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1462 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1463 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1464 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1465 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1466 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1467 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1469 fjx0 = _mm_setzero_pd();
1470 fjy0 = _mm_setzero_pd();
1471 fjz0 = _mm_setzero_pd();
1472 fjx1 = _mm_setzero_pd();
1473 fjy1 = _mm_setzero_pd();
1474 fjz1 = _mm_setzero_pd();
1475 fjx2 = _mm_setzero_pd();
1476 fjy2 = _mm_setzero_pd();
1477 fjz2 = _mm_setzero_pd();
1479 /**************************
1480 * CALCULATE INTERACTIONS *
1481 **************************/
1483 r00 = _mm_mul_pd(rsq00,rinv00);
1485 /* Calculate table index by multiplying r with table scale and truncate to integer */
1486 rt = _mm_mul_pd(r00,vftabscale);
1487 vfitab = _mm_cvttpd_epi32(rt);
1489 vfeps = _mm_frcz_pd(rt);
1491 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1493 twovfeps = _mm_add_pd(vfeps,vfeps);
1494 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1496 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1497 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1498 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1499 GMX_MM_TRANSPOSE2_PD(Y,F);
1500 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1501 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1502 GMX_MM_TRANSPOSE2_PD(G,H);
1503 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1504 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1505 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1507 /* CUBIC SPLINE TABLE DISPERSION */
1508 vfitab = _mm_add_epi32(vfitab,ifour);
1509 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1510 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1511 GMX_MM_TRANSPOSE2_PD(Y,F);
1512 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1513 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1514 GMX_MM_TRANSPOSE2_PD(G,H);
1515 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1516 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1517 fvdw6 = _mm_mul_pd(c6_00,FF);
1519 /* CUBIC SPLINE TABLE REPULSION */
1520 vfitab = _mm_add_epi32(vfitab,ifour);
1521 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1522 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1523 GMX_MM_TRANSPOSE2_PD(Y,F);
1524 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1525 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1526 GMX_MM_TRANSPOSE2_PD(G,H);
1527 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1528 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1529 fvdw12 = _mm_mul_pd(c12_00,FF);
1530 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1532 fscal = _mm_add_pd(felec,fvdw);
1534 /* Update vectorial force */
1535 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1536 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1537 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1539 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1540 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1541 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1543 /**************************
1544 * CALCULATE INTERACTIONS *
1545 **************************/
1547 r01 = _mm_mul_pd(rsq01,rinv01);
1549 /* Calculate table index by multiplying r with table scale and truncate to integer */
1550 rt = _mm_mul_pd(r01,vftabscale);
1551 vfitab = _mm_cvttpd_epi32(rt);
1553 vfeps = _mm_frcz_pd(rt);
1555 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1557 twovfeps = _mm_add_pd(vfeps,vfeps);
1558 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1560 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1561 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1562 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1563 GMX_MM_TRANSPOSE2_PD(Y,F);
1564 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1565 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1566 GMX_MM_TRANSPOSE2_PD(G,H);
1567 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1568 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1569 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1573 /* Update vectorial force */
1574 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1575 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1576 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1578 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1579 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1580 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1582 /**************************
1583 * CALCULATE INTERACTIONS *
1584 **************************/
1586 r02 = _mm_mul_pd(rsq02,rinv02);
1588 /* Calculate table index by multiplying r with table scale and truncate to integer */
1589 rt = _mm_mul_pd(r02,vftabscale);
1590 vfitab = _mm_cvttpd_epi32(rt);
1592 vfeps = _mm_frcz_pd(rt);
1594 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1596 twovfeps = _mm_add_pd(vfeps,vfeps);
1597 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1599 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1600 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1601 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1602 GMX_MM_TRANSPOSE2_PD(Y,F);
1603 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1604 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1605 GMX_MM_TRANSPOSE2_PD(G,H);
1606 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1607 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1608 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1612 /* Update vectorial force */
1613 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1614 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1615 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1617 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1618 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1619 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 r10 = _mm_mul_pd(rsq10,rinv10);
1627 /* Calculate table index by multiplying r with table scale and truncate to integer */
1628 rt = _mm_mul_pd(r10,vftabscale);
1629 vfitab = _mm_cvttpd_epi32(rt);
1631 vfeps = _mm_frcz_pd(rt);
1633 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1635 twovfeps = _mm_add_pd(vfeps,vfeps);
1636 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1638 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1639 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1640 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1641 GMX_MM_TRANSPOSE2_PD(Y,F);
1642 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1643 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1644 GMX_MM_TRANSPOSE2_PD(G,H);
1645 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1646 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1647 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1651 /* Update vectorial force */
1652 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1653 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1654 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1656 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1657 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1658 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
1664 r11 = _mm_mul_pd(rsq11,rinv11);
1666 /* Calculate table index by multiplying r with table scale and truncate to integer */
1667 rt = _mm_mul_pd(r11,vftabscale);
1668 vfitab = _mm_cvttpd_epi32(rt);
1670 vfeps = _mm_frcz_pd(rt);
1672 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1674 twovfeps = _mm_add_pd(vfeps,vfeps);
1675 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1677 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1678 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1679 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1680 GMX_MM_TRANSPOSE2_PD(Y,F);
1681 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1682 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1683 GMX_MM_TRANSPOSE2_PD(G,H);
1684 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1685 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1686 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1690 /* Update vectorial force */
1691 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1692 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1693 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1695 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1696 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1697 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1699 /**************************
1700 * CALCULATE INTERACTIONS *
1701 **************************/
1703 r12 = _mm_mul_pd(rsq12,rinv12);
1705 /* Calculate table index by multiplying r with table scale and truncate to integer */
1706 rt = _mm_mul_pd(r12,vftabscale);
1707 vfitab = _mm_cvttpd_epi32(rt);
1709 vfeps = _mm_frcz_pd(rt);
1711 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1713 twovfeps = _mm_add_pd(vfeps,vfeps);
1714 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1716 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1717 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1718 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1719 GMX_MM_TRANSPOSE2_PD(Y,F);
1720 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1721 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1722 GMX_MM_TRANSPOSE2_PD(G,H);
1723 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1724 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1725 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1729 /* Update vectorial force */
1730 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1731 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1732 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1734 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1735 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1736 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 r20 = _mm_mul_pd(rsq20,rinv20);
1744 /* Calculate table index by multiplying r with table scale and truncate to integer */
1745 rt = _mm_mul_pd(r20,vftabscale);
1746 vfitab = _mm_cvttpd_epi32(rt);
1748 vfeps = _mm_frcz_pd(rt);
1750 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1752 twovfeps = _mm_add_pd(vfeps,vfeps);
1753 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1755 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1756 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1757 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1758 GMX_MM_TRANSPOSE2_PD(Y,F);
1759 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1760 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1761 GMX_MM_TRANSPOSE2_PD(G,H);
1762 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1763 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1764 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1768 /* Update vectorial force */
1769 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1770 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1771 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1773 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1774 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1775 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1777 /**************************
1778 * CALCULATE INTERACTIONS *
1779 **************************/
1781 r21 = _mm_mul_pd(rsq21,rinv21);
1783 /* Calculate table index by multiplying r with table scale and truncate to integer */
1784 rt = _mm_mul_pd(r21,vftabscale);
1785 vfitab = _mm_cvttpd_epi32(rt);
1787 vfeps = _mm_frcz_pd(rt);
1789 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1791 twovfeps = _mm_add_pd(vfeps,vfeps);
1792 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1794 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1795 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1796 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1797 GMX_MM_TRANSPOSE2_PD(Y,F);
1798 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1799 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1800 GMX_MM_TRANSPOSE2_PD(G,H);
1801 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1802 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1803 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1807 /* Update vectorial force */
1808 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1809 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1810 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1812 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1813 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1814 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1816 /**************************
1817 * CALCULATE INTERACTIONS *
1818 **************************/
1820 r22 = _mm_mul_pd(rsq22,rinv22);
1822 /* Calculate table index by multiplying r with table scale and truncate to integer */
1823 rt = _mm_mul_pd(r22,vftabscale);
1824 vfitab = _mm_cvttpd_epi32(rt);
1826 vfeps = _mm_frcz_pd(rt);
1828 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1830 twovfeps = _mm_add_pd(vfeps,vfeps);
1831 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1833 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1834 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1835 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1836 GMX_MM_TRANSPOSE2_PD(Y,F);
1837 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1838 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1839 GMX_MM_TRANSPOSE2_PD(G,H);
1840 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1841 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1842 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1846 /* Update vectorial force */
1847 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1848 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1849 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1851 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1852 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1853 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1855 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1857 /* Inner loop uses 400 flops */
1860 if(jidx<j_index_end)
1864 j_coord_offsetA = DIM*jnrA;
1866 /* load j atom coordinates */
1867 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1868 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1870 /* Calculate displacement vector */
1871 dx00 = _mm_sub_pd(ix0,jx0);
1872 dy00 = _mm_sub_pd(iy0,jy0);
1873 dz00 = _mm_sub_pd(iz0,jz0);
1874 dx01 = _mm_sub_pd(ix0,jx1);
1875 dy01 = _mm_sub_pd(iy0,jy1);
1876 dz01 = _mm_sub_pd(iz0,jz1);
1877 dx02 = _mm_sub_pd(ix0,jx2);
1878 dy02 = _mm_sub_pd(iy0,jy2);
1879 dz02 = _mm_sub_pd(iz0,jz2);
1880 dx10 = _mm_sub_pd(ix1,jx0);
1881 dy10 = _mm_sub_pd(iy1,jy0);
1882 dz10 = _mm_sub_pd(iz1,jz0);
1883 dx11 = _mm_sub_pd(ix1,jx1);
1884 dy11 = _mm_sub_pd(iy1,jy1);
1885 dz11 = _mm_sub_pd(iz1,jz1);
1886 dx12 = _mm_sub_pd(ix1,jx2);
1887 dy12 = _mm_sub_pd(iy1,jy2);
1888 dz12 = _mm_sub_pd(iz1,jz2);
1889 dx20 = _mm_sub_pd(ix2,jx0);
1890 dy20 = _mm_sub_pd(iy2,jy0);
1891 dz20 = _mm_sub_pd(iz2,jz0);
1892 dx21 = _mm_sub_pd(ix2,jx1);
1893 dy21 = _mm_sub_pd(iy2,jy1);
1894 dz21 = _mm_sub_pd(iz2,jz1);
1895 dx22 = _mm_sub_pd(ix2,jx2);
1896 dy22 = _mm_sub_pd(iy2,jy2);
1897 dz22 = _mm_sub_pd(iz2,jz2);
1899 /* Calculate squared distance and things based on it */
1900 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1901 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1902 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1903 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1904 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1905 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1906 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1907 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1908 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1910 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1911 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1912 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1913 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1914 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1915 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1916 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1917 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1918 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1920 fjx0 = _mm_setzero_pd();
1921 fjy0 = _mm_setzero_pd();
1922 fjz0 = _mm_setzero_pd();
1923 fjx1 = _mm_setzero_pd();
1924 fjy1 = _mm_setzero_pd();
1925 fjz1 = _mm_setzero_pd();
1926 fjx2 = _mm_setzero_pd();
1927 fjy2 = _mm_setzero_pd();
1928 fjz2 = _mm_setzero_pd();
1930 /**************************
1931 * CALCULATE INTERACTIONS *
1932 **************************/
1934 r00 = _mm_mul_pd(rsq00,rinv00);
1936 /* Calculate table index by multiplying r with table scale and truncate to integer */
1937 rt = _mm_mul_pd(r00,vftabscale);
1938 vfitab = _mm_cvttpd_epi32(rt);
1940 vfeps = _mm_frcz_pd(rt);
1942 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1944 twovfeps = _mm_add_pd(vfeps,vfeps);
1945 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1947 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1948 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1949 F = _mm_setzero_pd();
1950 GMX_MM_TRANSPOSE2_PD(Y,F);
1951 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1952 H = _mm_setzero_pd();
1953 GMX_MM_TRANSPOSE2_PD(G,H);
1954 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1955 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1956 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1958 /* CUBIC SPLINE TABLE DISPERSION */
1959 vfitab = _mm_add_epi32(vfitab,ifour);
1960 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1961 F = _mm_setzero_pd();
1962 GMX_MM_TRANSPOSE2_PD(Y,F);
1963 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1964 H = _mm_setzero_pd();
1965 GMX_MM_TRANSPOSE2_PD(G,H);
1966 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1967 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1968 fvdw6 = _mm_mul_pd(c6_00,FF);
1970 /* CUBIC SPLINE TABLE REPULSION */
1971 vfitab = _mm_add_epi32(vfitab,ifour);
1972 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1973 F = _mm_setzero_pd();
1974 GMX_MM_TRANSPOSE2_PD(Y,F);
1975 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1976 H = _mm_setzero_pd();
1977 GMX_MM_TRANSPOSE2_PD(G,H);
1978 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1979 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1980 fvdw12 = _mm_mul_pd(c12_00,FF);
1981 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1983 fscal = _mm_add_pd(felec,fvdw);
1985 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1987 /* Update vectorial force */
1988 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1989 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1990 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1992 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1993 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1994 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1996 /**************************
1997 * CALCULATE INTERACTIONS *
1998 **************************/
2000 r01 = _mm_mul_pd(rsq01,rinv01);
2002 /* Calculate table index by multiplying r with table scale and truncate to integer */
2003 rt = _mm_mul_pd(r01,vftabscale);
2004 vfitab = _mm_cvttpd_epi32(rt);
2006 vfeps = _mm_frcz_pd(rt);
2008 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2010 twovfeps = _mm_add_pd(vfeps,vfeps);
2011 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2013 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2014 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2015 F = _mm_setzero_pd();
2016 GMX_MM_TRANSPOSE2_PD(Y,F);
2017 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2018 H = _mm_setzero_pd();
2019 GMX_MM_TRANSPOSE2_PD(G,H);
2020 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2021 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2022 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
2026 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2028 /* Update vectorial force */
2029 fix0 = _mm_macc_pd(dx01,fscal,fix0);
2030 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
2031 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
2033 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
2034 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
2035 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 r02 = _mm_mul_pd(rsq02,rinv02);
2043 /* Calculate table index by multiplying r with table scale and truncate to integer */
2044 rt = _mm_mul_pd(r02,vftabscale);
2045 vfitab = _mm_cvttpd_epi32(rt);
2047 vfeps = _mm_frcz_pd(rt);
2049 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2051 twovfeps = _mm_add_pd(vfeps,vfeps);
2052 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2054 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2055 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2056 F = _mm_setzero_pd();
2057 GMX_MM_TRANSPOSE2_PD(Y,F);
2058 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2059 H = _mm_setzero_pd();
2060 GMX_MM_TRANSPOSE2_PD(G,H);
2061 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2062 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2063 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2067 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2069 /* Update vectorial force */
2070 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2071 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2072 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2074 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2075 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2076 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2078 /**************************
2079 * CALCULATE INTERACTIONS *
2080 **************************/
2082 r10 = _mm_mul_pd(rsq10,rinv10);
2084 /* Calculate table index by multiplying r with table scale and truncate to integer */
2085 rt = _mm_mul_pd(r10,vftabscale);
2086 vfitab = _mm_cvttpd_epi32(rt);
2088 vfeps = _mm_frcz_pd(rt);
2090 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2092 twovfeps = _mm_add_pd(vfeps,vfeps);
2093 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2095 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2096 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2097 F = _mm_setzero_pd();
2098 GMX_MM_TRANSPOSE2_PD(Y,F);
2099 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2100 H = _mm_setzero_pd();
2101 GMX_MM_TRANSPOSE2_PD(G,H);
2102 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2103 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2104 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2108 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2110 /* Update vectorial force */
2111 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2112 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2113 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2115 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2116 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2117 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2119 /**************************
2120 * CALCULATE INTERACTIONS *
2121 **************************/
2123 r11 = _mm_mul_pd(rsq11,rinv11);
2125 /* Calculate table index by multiplying r with table scale and truncate to integer */
2126 rt = _mm_mul_pd(r11,vftabscale);
2127 vfitab = _mm_cvttpd_epi32(rt);
2129 vfeps = _mm_frcz_pd(rt);
2131 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2133 twovfeps = _mm_add_pd(vfeps,vfeps);
2134 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2136 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2137 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2138 F = _mm_setzero_pd();
2139 GMX_MM_TRANSPOSE2_PD(Y,F);
2140 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2141 H = _mm_setzero_pd();
2142 GMX_MM_TRANSPOSE2_PD(G,H);
2143 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2144 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2145 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2149 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2151 /* Update vectorial force */
2152 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2153 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2154 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2156 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2157 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2158 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2160 /**************************
2161 * CALCULATE INTERACTIONS *
2162 **************************/
2164 r12 = _mm_mul_pd(rsq12,rinv12);
2166 /* Calculate table index by multiplying r with table scale and truncate to integer */
2167 rt = _mm_mul_pd(r12,vftabscale);
2168 vfitab = _mm_cvttpd_epi32(rt);
2170 vfeps = _mm_frcz_pd(rt);
2172 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2174 twovfeps = _mm_add_pd(vfeps,vfeps);
2175 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2177 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2178 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2179 F = _mm_setzero_pd();
2180 GMX_MM_TRANSPOSE2_PD(Y,F);
2181 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2182 H = _mm_setzero_pd();
2183 GMX_MM_TRANSPOSE2_PD(G,H);
2184 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2185 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2186 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2190 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2192 /* Update vectorial force */
2193 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2194 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2195 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2197 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2198 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2199 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2201 /**************************
2202 * CALCULATE INTERACTIONS *
2203 **************************/
2205 r20 = _mm_mul_pd(rsq20,rinv20);
2207 /* Calculate table index by multiplying r with table scale and truncate to integer */
2208 rt = _mm_mul_pd(r20,vftabscale);
2209 vfitab = _mm_cvttpd_epi32(rt);
2211 vfeps = _mm_frcz_pd(rt);
2213 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2215 twovfeps = _mm_add_pd(vfeps,vfeps);
2216 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2218 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2219 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2220 F = _mm_setzero_pd();
2221 GMX_MM_TRANSPOSE2_PD(Y,F);
2222 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2223 H = _mm_setzero_pd();
2224 GMX_MM_TRANSPOSE2_PD(G,H);
2225 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2226 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2227 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2231 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2233 /* Update vectorial force */
2234 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2235 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2236 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2238 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2239 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2240 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2242 /**************************
2243 * CALCULATE INTERACTIONS *
2244 **************************/
2246 r21 = _mm_mul_pd(rsq21,rinv21);
2248 /* Calculate table index by multiplying r with table scale and truncate to integer */
2249 rt = _mm_mul_pd(r21,vftabscale);
2250 vfitab = _mm_cvttpd_epi32(rt);
2252 vfeps = _mm_frcz_pd(rt);
2254 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2256 twovfeps = _mm_add_pd(vfeps,vfeps);
2257 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2259 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2260 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2261 F = _mm_setzero_pd();
2262 GMX_MM_TRANSPOSE2_PD(Y,F);
2263 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2264 H = _mm_setzero_pd();
2265 GMX_MM_TRANSPOSE2_PD(G,H);
2266 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2267 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2268 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2272 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2274 /* Update vectorial force */
2275 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2276 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2277 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2279 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2280 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2281 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2283 /**************************
2284 * CALCULATE INTERACTIONS *
2285 **************************/
2287 r22 = _mm_mul_pd(rsq22,rinv22);
2289 /* Calculate table index by multiplying r with table scale and truncate to integer */
2290 rt = _mm_mul_pd(r22,vftabscale);
2291 vfitab = _mm_cvttpd_epi32(rt);
2293 vfeps = _mm_frcz_pd(rt);
2295 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2297 twovfeps = _mm_add_pd(vfeps,vfeps);
2298 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2300 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2301 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2302 F = _mm_setzero_pd();
2303 GMX_MM_TRANSPOSE2_PD(Y,F);
2304 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2305 H = _mm_setzero_pd();
2306 GMX_MM_TRANSPOSE2_PD(G,H);
2307 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2308 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2309 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2313 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2315 /* Update vectorial force */
2316 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2317 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2318 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2320 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2321 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2322 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2324 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2326 /* Inner loop uses 400 flops */
2329 /* End of innermost loop */
2331 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2332 f+i_coord_offset,fshift+i_shift_offset);
2334 /* Increment number of inner iterations */
2335 inneriter += j_index_end - j_index_start;
2337 /* Outer loop uses 18 flops */
2340 /* Increment number of outer iterations */
2343 /* Update outer/inner flops */
2345 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*400);