2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_double
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: None
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the two positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
82 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
84 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
85 int vdwjidx1A,vdwjidx1B;
86 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
87 int vdwjidx2A,vdwjidx2B;
88 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
89 int vdwjidx3A,vdwjidx3B;
90 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
91 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
92 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
93 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
94 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
95 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
96 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
97 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
98 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
99 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
100 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
103 __m128i ifour = _mm_set1_epi32(4);
104 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
106 __m128d dummy_mask,cutoff_mask;
107 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
108 __m128d one = _mm_set1_pd(1.0);
109 __m128d two = _mm_set1_pd(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm_set1_pd(fr->ic->epsfac);
122 charge = mdatoms->chargeA;
124 vftab = kernel_data->table_elec->data;
125 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
127 /* Setup water-specific parameters */
128 inr = nlist->iinr[0];
129 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
130 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
131 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
133 jq1 = _mm_set1_pd(charge[inr+1]);
134 jq2 = _mm_set1_pd(charge[inr+2]);
135 jq3 = _mm_set1_pd(charge[inr+3]);
136 qq11 = _mm_mul_pd(iq1,jq1);
137 qq12 = _mm_mul_pd(iq1,jq2);
138 qq13 = _mm_mul_pd(iq1,jq3);
139 qq21 = _mm_mul_pd(iq2,jq1);
140 qq22 = _mm_mul_pd(iq2,jq2);
141 qq23 = _mm_mul_pd(iq2,jq3);
142 qq31 = _mm_mul_pd(iq3,jq1);
143 qq32 = _mm_mul_pd(iq3,jq2);
144 qq33 = _mm_mul_pd(iq3,jq3);
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
170 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
172 fix1 = _mm_setzero_pd();
173 fiy1 = _mm_setzero_pd();
174 fiz1 = _mm_setzero_pd();
175 fix2 = _mm_setzero_pd();
176 fiy2 = _mm_setzero_pd();
177 fiz2 = _mm_setzero_pd();
178 fix3 = _mm_setzero_pd();
179 fiy3 = _mm_setzero_pd();
180 fiz3 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
185 /* Start inner kernel loop */
186 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
189 /* Get j neighbor index, and coordinate index */
192 j_coord_offsetA = DIM*jnrA;
193 j_coord_offsetB = DIM*jnrB;
195 /* load j atom coordinates */
196 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
197 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
199 /* Calculate displacement vector */
200 dx11 = _mm_sub_pd(ix1,jx1);
201 dy11 = _mm_sub_pd(iy1,jy1);
202 dz11 = _mm_sub_pd(iz1,jz1);
203 dx12 = _mm_sub_pd(ix1,jx2);
204 dy12 = _mm_sub_pd(iy1,jy2);
205 dz12 = _mm_sub_pd(iz1,jz2);
206 dx13 = _mm_sub_pd(ix1,jx3);
207 dy13 = _mm_sub_pd(iy1,jy3);
208 dz13 = _mm_sub_pd(iz1,jz3);
209 dx21 = _mm_sub_pd(ix2,jx1);
210 dy21 = _mm_sub_pd(iy2,jy1);
211 dz21 = _mm_sub_pd(iz2,jz1);
212 dx22 = _mm_sub_pd(ix2,jx2);
213 dy22 = _mm_sub_pd(iy2,jy2);
214 dz22 = _mm_sub_pd(iz2,jz2);
215 dx23 = _mm_sub_pd(ix2,jx3);
216 dy23 = _mm_sub_pd(iy2,jy3);
217 dz23 = _mm_sub_pd(iz2,jz3);
218 dx31 = _mm_sub_pd(ix3,jx1);
219 dy31 = _mm_sub_pd(iy3,jy1);
220 dz31 = _mm_sub_pd(iz3,jz1);
221 dx32 = _mm_sub_pd(ix3,jx2);
222 dy32 = _mm_sub_pd(iy3,jy2);
223 dz32 = _mm_sub_pd(iz3,jz2);
224 dx33 = _mm_sub_pd(ix3,jx3);
225 dy33 = _mm_sub_pd(iy3,jy3);
226 dz33 = _mm_sub_pd(iz3,jz3);
228 /* Calculate squared distance and things based on it */
229 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
230 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
231 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
232 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
233 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
234 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
235 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
236 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
237 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
239 rinv11 = avx128fma_invsqrt_d(rsq11);
240 rinv12 = avx128fma_invsqrt_d(rsq12);
241 rinv13 = avx128fma_invsqrt_d(rsq13);
242 rinv21 = avx128fma_invsqrt_d(rsq21);
243 rinv22 = avx128fma_invsqrt_d(rsq22);
244 rinv23 = avx128fma_invsqrt_d(rsq23);
245 rinv31 = avx128fma_invsqrt_d(rsq31);
246 rinv32 = avx128fma_invsqrt_d(rsq32);
247 rinv33 = avx128fma_invsqrt_d(rsq33);
249 fjx1 = _mm_setzero_pd();
250 fjy1 = _mm_setzero_pd();
251 fjz1 = _mm_setzero_pd();
252 fjx2 = _mm_setzero_pd();
253 fjy2 = _mm_setzero_pd();
254 fjz2 = _mm_setzero_pd();
255 fjx3 = _mm_setzero_pd();
256 fjy3 = _mm_setzero_pd();
257 fjz3 = _mm_setzero_pd();
259 /**************************
260 * CALCULATE INTERACTIONS *
261 **************************/
263 r11 = _mm_mul_pd(rsq11,rinv11);
265 /* Calculate table index by multiplying r with table scale and truncate to integer */
266 rt = _mm_mul_pd(r11,vftabscale);
267 vfitab = _mm_cvttpd_epi32(rt);
269 vfeps = _mm_frcz_pd(rt);
271 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
273 twovfeps = _mm_add_pd(vfeps,vfeps);
274 vfitab = _mm_slli_epi32(vfitab,2);
276 /* CUBIC SPLINE TABLE ELECTROSTATICS */
277 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
278 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
279 GMX_MM_TRANSPOSE2_PD(Y,F);
280 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
281 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
282 GMX_MM_TRANSPOSE2_PD(G,H);
283 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
284 VV = _mm_macc_pd(vfeps,Fp,Y);
285 velec = _mm_mul_pd(qq11,VV);
286 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
287 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
289 /* Update potential sum for this i atom from the interaction with this j atom. */
290 velecsum = _mm_add_pd(velecsum,velec);
294 /* Update vectorial force */
295 fix1 = _mm_macc_pd(dx11,fscal,fix1);
296 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
297 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
299 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
300 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
301 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 r12 = _mm_mul_pd(rsq12,rinv12);
309 /* Calculate table index by multiplying r with table scale and truncate to integer */
310 rt = _mm_mul_pd(r12,vftabscale);
311 vfitab = _mm_cvttpd_epi32(rt);
313 vfeps = _mm_frcz_pd(rt);
315 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
317 twovfeps = _mm_add_pd(vfeps,vfeps);
318 vfitab = _mm_slli_epi32(vfitab,2);
320 /* CUBIC SPLINE TABLE ELECTROSTATICS */
321 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
322 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
323 GMX_MM_TRANSPOSE2_PD(Y,F);
324 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
325 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
326 GMX_MM_TRANSPOSE2_PD(G,H);
327 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
328 VV = _mm_macc_pd(vfeps,Fp,Y);
329 velec = _mm_mul_pd(qq12,VV);
330 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
331 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velecsum = _mm_add_pd(velecsum,velec);
338 /* Update vectorial force */
339 fix1 = _mm_macc_pd(dx12,fscal,fix1);
340 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
341 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
343 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
344 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
345 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
347 /**************************
348 * CALCULATE INTERACTIONS *
349 **************************/
351 r13 = _mm_mul_pd(rsq13,rinv13);
353 /* Calculate table index by multiplying r with table scale and truncate to integer */
354 rt = _mm_mul_pd(r13,vftabscale);
355 vfitab = _mm_cvttpd_epi32(rt);
357 vfeps = _mm_frcz_pd(rt);
359 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
361 twovfeps = _mm_add_pd(vfeps,vfeps);
362 vfitab = _mm_slli_epi32(vfitab,2);
364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
365 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
366 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
367 GMX_MM_TRANSPOSE2_PD(Y,F);
368 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
369 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
370 GMX_MM_TRANSPOSE2_PD(G,H);
371 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
372 VV = _mm_macc_pd(vfeps,Fp,Y);
373 velec = _mm_mul_pd(qq13,VV);
374 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
375 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
377 /* Update potential sum for this i atom from the interaction with this j atom. */
378 velecsum = _mm_add_pd(velecsum,velec);
382 /* Update vectorial force */
383 fix1 = _mm_macc_pd(dx13,fscal,fix1);
384 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
385 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
387 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
388 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
389 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 r21 = _mm_mul_pd(rsq21,rinv21);
397 /* Calculate table index by multiplying r with table scale and truncate to integer */
398 rt = _mm_mul_pd(r21,vftabscale);
399 vfitab = _mm_cvttpd_epi32(rt);
401 vfeps = _mm_frcz_pd(rt);
403 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
405 twovfeps = _mm_add_pd(vfeps,vfeps);
406 vfitab = _mm_slli_epi32(vfitab,2);
408 /* CUBIC SPLINE TABLE ELECTROSTATICS */
409 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
410 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
411 GMX_MM_TRANSPOSE2_PD(Y,F);
412 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
413 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
414 GMX_MM_TRANSPOSE2_PD(G,H);
415 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
416 VV = _mm_macc_pd(vfeps,Fp,Y);
417 velec = _mm_mul_pd(qq21,VV);
418 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
419 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
421 /* Update potential sum for this i atom from the interaction with this j atom. */
422 velecsum = _mm_add_pd(velecsum,velec);
426 /* Update vectorial force */
427 fix2 = _mm_macc_pd(dx21,fscal,fix2);
428 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
429 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
431 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
432 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
433 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 r22 = _mm_mul_pd(rsq22,rinv22);
441 /* Calculate table index by multiplying r with table scale and truncate to integer */
442 rt = _mm_mul_pd(r22,vftabscale);
443 vfitab = _mm_cvttpd_epi32(rt);
445 vfeps = _mm_frcz_pd(rt);
447 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
449 twovfeps = _mm_add_pd(vfeps,vfeps);
450 vfitab = _mm_slli_epi32(vfitab,2);
452 /* CUBIC SPLINE TABLE ELECTROSTATICS */
453 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
454 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
455 GMX_MM_TRANSPOSE2_PD(Y,F);
456 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
457 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
458 GMX_MM_TRANSPOSE2_PD(G,H);
459 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
460 VV = _mm_macc_pd(vfeps,Fp,Y);
461 velec = _mm_mul_pd(qq22,VV);
462 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
463 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
465 /* Update potential sum for this i atom from the interaction with this j atom. */
466 velecsum = _mm_add_pd(velecsum,velec);
470 /* Update vectorial force */
471 fix2 = _mm_macc_pd(dx22,fscal,fix2);
472 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
473 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
475 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
476 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
477 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
479 /**************************
480 * CALCULATE INTERACTIONS *
481 **************************/
483 r23 = _mm_mul_pd(rsq23,rinv23);
485 /* Calculate table index by multiplying r with table scale and truncate to integer */
486 rt = _mm_mul_pd(r23,vftabscale);
487 vfitab = _mm_cvttpd_epi32(rt);
489 vfeps = _mm_frcz_pd(rt);
491 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
493 twovfeps = _mm_add_pd(vfeps,vfeps);
494 vfitab = _mm_slli_epi32(vfitab,2);
496 /* CUBIC SPLINE TABLE ELECTROSTATICS */
497 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
498 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
499 GMX_MM_TRANSPOSE2_PD(Y,F);
500 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
501 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
502 GMX_MM_TRANSPOSE2_PD(G,H);
503 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
504 VV = _mm_macc_pd(vfeps,Fp,Y);
505 velec = _mm_mul_pd(qq23,VV);
506 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
507 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velecsum = _mm_add_pd(velecsum,velec);
514 /* Update vectorial force */
515 fix2 = _mm_macc_pd(dx23,fscal,fix2);
516 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
517 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
519 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
520 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
521 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
523 /**************************
524 * CALCULATE INTERACTIONS *
525 **************************/
527 r31 = _mm_mul_pd(rsq31,rinv31);
529 /* Calculate table index by multiplying r with table scale and truncate to integer */
530 rt = _mm_mul_pd(r31,vftabscale);
531 vfitab = _mm_cvttpd_epi32(rt);
533 vfeps = _mm_frcz_pd(rt);
535 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
537 twovfeps = _mm_add_pd(vfeps,vfeps);
538 vfitab = _mm_slli_epi32(vfitab,2);
540 /* CUBIC SPLINE TABLE ELECTROSTATICS */
541 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
542 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
543 GMX_MM_TRANSPOSE2_PD(Y,F);
544 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
545 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
546 GMX_MM_TRANSPOSE2_PD(G,H);
547 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
548 VV = _mm_macc_pd(vfeps,Fp,Y);
549 velec = _mm_mul_pd(qq31,VV);
550 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
551 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
553 /* Update potential sum for this i atom from the interaction with this j atom. */
554 velecsum = _mm_add_pd(velecsum,velec);
558 /* Update vectorial force */
559 fix3 = _mm_macc_pd(dx31,fscal,fix3);
560 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
561 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
563 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
564 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
565 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
567 /**************************
568 * CALCULATE INTERACTIONS *
569 **************************/
571 r32 = _mm_mul_pd(rsq32,rinv32);
573 /* Calculate table index by multiplying r with table scale and truncate to integer */
574 rt = _mm_mul_pd(r32,vftabscale);
575 vfitab = _mm_cvttpd_epi32(rt);
577 vfeps = _mm_frcz_pd(rt);
579 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
581 twovfeps = _mm_add_pd(vfeps,vfeps);
582 vfitab = _mm_slli_epi32(vfitab,2);
584 /* CUBIC SPLINE TABLE ELECTROSTATICS */
585 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
586 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
587 GMX_MM_TRANSPOSE2_PD(Y,F);
588 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
589 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
590 GMX_MM_TRANSPOSE2_PD(G,H);
591 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
592 VV = _mm_macc_pd(vfeps,Fp,Y);
593 velec = _mm_mul_pd(qq32,VV);
594 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
595 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
597 /* Update potential sum for this i atom from the interaction with this j atom. */
598 velecsum = _mm_add_pd(velecsum,velec);
602 /* Update vectorial force */
603 fix3 = _mm_macc_pd(dx32,fscal,fix3);
604 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
605 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
607 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
608 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
609 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
611 /**************************
612 * CALCULATE INTERACTIONS *
613 **************************/
615 r33 = _mm_mul_pd(rsq33,rinv33);
617 /* Calculate table index by multiplying r with table scale and truncate to integer */
618 rt = _mm_mul_pd(r33,vftabscale);
619 vfitab = _mm_cvttpd_epi32(rt);
621 vfeps = _mm_frcz_pd(rt);
623 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
625 twovfeps = _mm_add_pd(vfeps,vfeps);
626 vfitab = _mm_slli_epi32(vfitab,2);
628 /* CUBIC SPLINE TABLE ELECTROSTATICS */
629 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
630 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
631 GMX_MM_TRANSPOSE2_PD(Y,F);
632 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
633 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
634 GMX_MM_TRANSPOSE2_PD(G,H);
635 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
636 VV = _mm_macc_pd(vfeps,Fp,Y);
637 velec = _mm_mul_pd(qq33,VV);
638 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
639 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
641 /* Update potential sum for this i atom from the interaction with this j atom. */
642 velecsum = _mm_add_pd(velecsum,velec);
646 /* Update vectorial force */
647 fix3 = _mm_macc_pd(dx33,fscal,fix3);
648 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
649 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
651 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
652 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
653 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
655 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
657 /* Inner loop uses 414 flops */
664 j_coord_offsetA = DIM*jnrA;
666 /* load j atom coordinates */
667 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
668 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
670 /* Calculate displacement vector */
671 dx11 = _mm_sub_pd(ix1,jx1);
672 dy11 = _mm_sub_pd(iy1,jy1);
673 dz11 = _mm_sub_pd(iz1,jz1);
674 dx12 = _mm_sub_pd(ix1,jx2);
675 dy12 = _mm_sub_pd(iy1,jy2);
676 dz12 = _mm_sub_pd(iz1,jz2);
677 dx13 = _mm_sub_pd(ix1,jx3);
678 dy13 = _mm_sub_pd(iy1,jy3);
679 dz13 = _mm_sub_pd(iz1,jz3);
680 dx21 = _mm_sub_pd(ix2,jx1);
681 dy21 = _mm_sub_pd(iy2,jy1);
682 dz21 = _mm_sub_pd(iz2,jz1);
683 dx22 = _mm_sub_pd(ix2,jx2);
684 dy22 = _mm_sub_pd(iy2,jy2);
685 dz22 = _mm_sub_pd(iz2,jz2);
686 dx23 = _mm_sub_pd(ix2,jx3);
687 dy23 = _mm_sub_pd(iy2,jy3);
688 dz23 = _mm_sub_pd(iz2,jz3);
689 dx31 = _mm_sub_pd(ix3,jx1);
690 dy31 = _mm_sub_pd(iy3,jy1);
691 dz31 = _mm_sub_pd(iz3,jz1);
692 dx32 = _mm_sub_pd(ix3,jx2);
693 dy32 = _mm_sub_pd(iy3,jy2);
694 dz32 = _mm_sub_pd(iz3,jz2);
695 dx33 = _mm_sub_pd(ix3,jx3);
696 dy33 = _mm_sub_pd(iy3,jy3);
697 dz33 = _mm_sub_pd(iz3,jz3);
699 /* Calculate squared distance and things based on it */
700 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
701 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
702 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
703 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
704 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
705 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
706 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
707 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
708 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
710 rinv11 = avx128fma_invsqrt_d(rsq11);
711 rinv12 = avx128fma_invsqrt_d(rsq12);
712 rinv13 = avx128fma_invsqrt_d(rsq13);
713 rinv21 = avx128fma_invsqrt_d(rsq21);
714 rinv22 = avx128fma_invsqrt_d(rsq22);
715 rinv23 = avx128fma_invsqrt_d(rsq23);
716 rinv31 = avx128fma_invsqrt_d(rsq31);
717 rinv32 = avx128fma_invsqrt_d(rsq32);
718 rinv33 = avx128fma_invsqrt_d(rsq33);
720 fjx1 = _mm_setzero_pd();
721 fjy1 = _mm_setzero_pd();
722 fjz1 = _mm_setzero_pd();
723 fjx2 = _mm_setzero_pd();
724 fjy2 = _mm_setzero_pd();
725 fjz2 = _mm_setzero_pd();
726 fjx3 = _mm_setzero_pd();
727 fjy3 = _mm_setzero_pd();
728 fjz3 = _mm_setzero_pd();
730 /**************************
731 * CALCULATE INTERACTIONS *
732 **************************/
734 r11 = _mm_mul_pd(rsq11,rinv11);
736 /* Calculate table index by multiplying r with table scale and truncate to integer */
737 rt = _mm_mul_pd(r11,vftabscale);
738 vfitab = _mm_cvttpd_epi32(rt);
740 vfeps = _mm_frcz_pd(rt);
742 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
744 twovfeps = _mm_add_pd(vfeps,vfeps);
745 vfitab = _mm_slli_epi32(vfitab,2);
747 /* CUBIC SPLINE TABLE ELECTROSTATICS */
748 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
749 F = _mm_setzero_pd();
750 GMX_MM_TRANSPOSE2_PD(Y,F);
751 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
752 H = _mm_setzero_pd();
753 GMX_MM_TRANSPOSE2_PD(G,H);
754 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
755 VV = _mm_macc_pd(vfeps,Fp,Y);
756 velec = _mm_mul_pd(qq11,VV);
757 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
758 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
760 /* Update potential sum for this i atom from the interaction with this j atom. */
761 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
762 velecsum = _mm_add_pd(velecsum,velec);
766 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
768 /* Update vectorial force */
769 fix1 = _mm_macc_pd(dx11,fscal,fix1);
770 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
771 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
773 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
774 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
775 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
777 /**************************
778 * CALCULATE INTERACTIONS *
779 **************************/
781 r12 = _mm_mul_pd(rsq12,rinv12);
783 /* Calculate table index by multiplying r with table scale and truncate to integer */
784 rt = _mm_mul_pd(r12,vftabscale);
785 vfitab = _mm_cvttpd_epi32(rt);
787 vfeps = _mm_frcz_pd(rt);
789 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
791 twovfeps = _mm_add_pd(vfeps,vfeps);
792 vfitab = _mm_slli_epi32(vfitab,2);
794 /* CUBIC SPLINE TABLE ELECTROSTATICS */
795 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
796 F = _mm_setzero_pd();
797 GMX_MM_TRANSPOSE2_PD(Y,F);
798 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
799 H = _mm_setzero_pd();
800 GMX_MM_TRANSPOSE2_PD(G,H);
801 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
802 VV = _mm_macc_pd(vfeps,Fp,Y);
803 velec = _mm_mul_pd(qq12,VV);
804 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
805 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
807 /* Update potential sum for this i atom from the interaction with this j atom. */
808 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
809 velecsum = _mm_add_pd(velecsum,velec);
813 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
815 /* Update vectorial force */
816 fix1 = _mm_macc_pd(dx12,fscal,fix1);
817 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
818 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
820 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
821 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
822 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
824 /**************************
825 * CALCULATE INTERACTIONS *
826 **************************/
828 r13 = _mm_mul_pd(rsq13,rinv13);
830 /* Calculate table index by multiplying r with table scale and truncate to integer */
831 rt = _mm_mul_pd(r13,vftabscale);
832 vfitab = _mm_cvttpd_epi32(rt);
834 vfeps = _mm_frcz_pd(rt);
836 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
838 twovfeps = _mm_add_pd(vfeps,vfeps);
839 vfitab = _mm_slli_epi32(vfitab,2);
841 /* CUBIC SPLINE TABLE ELECTROSTATICS */
842 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
843 F = _mm_setzero_pd();
844 GMX_MM_TRANSPOSE2_PD(Y,F);
845 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
846 H = _mm_setzero_pd();
847 GMX_MM_TRANSPOSE2_PD(G,H);
848 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
849 VV = _mm_macc_pd(vfeps,Fp,Y);
850 velec = _mm_mul_pd(qq13,VV);
851 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
852 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
854 /* Update potential sum for this i atom from the interaction with this j atom. */
855 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
856 velecsum = _mm_add_pd(velecsum,velec);
860 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
862 /* Update vectorial force */
863 fix1 = _mm_macc_pd(dx13,fscal,fix1);
864 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
865 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
867 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
868 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
869 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
871 /**************************
872 * CALCULATE INTERACTIONS *
873 **************************/
875 r21 = _mm_mul_pd(rsq21,rinv21);
877 /* Calculate table index by multiplying r with table scale and truncate to integer */
878 rt = _mm_mul_pd(r21,vftabscale);
879 vfitab = _mm_cvttpd_epi32(rt);
881 vfeps = _mm_frcz_pd(rt);
883 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
885 twovfeps = _mm_add_pd(vfeps,vfeps);
886 vfitab = _mm_slli_epi32(vfitab,2);
888 /* CUBIC SPLINE TABLE ELECTROSTATICS */
889 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
890 F = _mm_setzero_pd();
891 GMX_MM_TRANSPOSE2_PD(Y,F);
892 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
893 H = _mm_setzero_pd();
894 GMX_MM_TRANSPOSE2_PD(G,H);
895 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
896 VV = _mm_macc_pd(vfeps,Fp,Y);
897 velec = _mm_mul_pd(qq21,VV);
898 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
899 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
901 /* Update potential sum for this i atom from the interaction with this j atom. */
902 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
903 velecsum = _mm_add_pd(velecsum,velec);
907 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
909 /* Update vectorial force */
910 fix2 = _mm_macc_pd(dx21,fscal,fix2);
911 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
912 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
914 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
915 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
916 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 r22 = _mm_mul_pd(rsq22,rinv22);
924 /* Calculate table index by multiplying r with table scale and truncate to integer */
925 rt = _mm_mul_pd(r22,vftabscale);
926 vfitab = _mm_cvttpd_epi32(rt);
928 vfeps = _mm_frcz_pd(rt);
930 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
932 twovfeps = _mm_add_pd(vfeps,vfeps);
933 vfitab = _mm_slli_epi32(vfitab,2);
935 /* CUBIC SPLINE TABLE ELECTROSTATICS */
936 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
937 F = _mm_setzero_pd();
938 GMX_MM_TRANSPOSE2_PD(Y,F);
939 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
940 H = _mm_setzero_pd();
941 GMX_MM_TRANSPOSE2_PD(G,H);
942 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
943 VV = _mm_macc_pd(vfeps,Fp,Y);
944 velec = _mm_mul_pd(qq22,VV);
945 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
946 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
948 /* Update potential sum for this i atom from the interaction with this j atom. */
949 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
950 velecsum = _mm_add_pd(velecsum,velec);
954 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
956 /* Update vectorial force */
957 fix2 = _mm_macc_pd(dx22,fscal,fix2);
958 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
959 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
961 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
962 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
963 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
965 /**************************
966 * CALCULATE INTERACTIONS *
967 **************************/
969 r23 = _mm_mul_pd(rsq23,rinv23);
971 /* Calculate table index by multiplying r with table scale and truncate to integer */
972 rt = _mm_mul_pd(r23,vftabscale);
973 vfitab = _mm_cvttpd_epi32(rt);
975 vfeps = _mm_frcz_pd(rt);
977 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
979 twovfeps = _mm_add_pd(vfeps,vfeps);
980 vfitab = _mm_slli_epi32(vfitab,2);
982 /* CUBIC SPLINE TABLE ELECTROSTATICS */
983 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
984 F = _mm_setzero_pd();
985 GMX_MM_TRANSPOSE2_PD(Y,F);
986 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
987 H = _mm_setzero_pd();
988 GMX_MM_TRANSPOSE2_PD(G,H);
989 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
990 VV = _mm_macc_pd(vfeps,Fp,Y);
991 velec = _mm_mul_pd(qq23,VV);
992 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
993 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
995 /* Update potential sum for this i atom from the interaction with this j atom. */
996 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
997 velecsum = _mm_add_pd(velecsum,velec);
1001 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1003 /* Update vectorial force */
1004 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1005 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1006 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1008 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1009 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1010 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1012 /**************************
1013 * CALCULATE INTERACTIONS *
1014 **************************/
1016 r31 = _mm_mul_pd(rsq31,rinv31);
1018 /* Calculate table index by multiplying r with table scale and truncate to integer */
1019 rt = _mm_mul_pd(r31,vftabscale);
1020 vfitab = _mm_cvttpd_epi32(rt);
1022 vfeps = _mm_frcz_pd(rt);
1024 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1026 twovfeps = _mm_add_pd(vfeps,vfeps);
1027 vfitab = _mm_slli_epi32(vfitab,2);
1029 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1030 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1031 F = _mm_setzero_pd();
1032 GMX_MM_TRANSPOSE2_PD(Y,F);
1033 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1034 H = _mm_setzero_pd();
1035 GMX_MM_TRANSPOSE2_PD(G,H);
1036 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1037 VV = _mm_macc_pd(vfeps,Fp,Y);
1038 velec = _mm_mul_pd(qq31,VV);
1039 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1040 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1042 /* Update potential sum for this i atom from the interaction with this j atom. */
1043 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1044 velecsum = _mm_add_pd(velecsum,velec);
1048 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1050 /* Update vectorial force */
1051 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1052 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1053 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1055 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1056 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1057 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1059 /**************************
1060 * CALCULATE INTERACTIONS *
1061 **************************/
1063 r32 = _mm_mul_pd(rsq32,rinv32);
1065 /* Calculate table index by multiplying r with table scale and truncate to integer */
1066 rt = _mm_mul_pd(r32,vftabscale);
1067 vfitab = _mm_cvttpd_epi32(rt);
1069 vfeps = _mm_frcz_pd(rt);
1071 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1073 twovfeps = _mm_add_pd(vfeps,vfeps);
1074 vfitab = _mm_slli_epi32(vfitab,2);
1076 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1077 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1078 F = _mm_setzero_pd();
1079 GMX_MM_TRANSPOSE2_PD(Y,F);
1080 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1081 H = _mm_setzero_pd();
1082 GMX_MM_TRANSPOSE2_PD(G,H);
1083 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1084 VV = _mm_macc_pd(vfeps,Fp,Y);
1085 velec = _mm_mul_pd(qq32,VV);
1086 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1087 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1089 /* Update potential sum for this i atom from the interaction with this j atom. */
1090 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1091 velecsum = _mm_add_pd(velecsum,velec);
1095 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1097 /* Update vectorial force */
1098 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1099 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1100 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1102 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1103 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1104 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1106 /**************************
1107 * CALCULATE INTERACTIONS *
1108 **************************/
1110 r33 = _mm_mul_pd(rsq33,rinv33);
1112 /* Calculate table index by multiplying r with table scale and truncate to integer */
1113 rt = _mm_mul_pd(r33,vftabscale);
1114 vfitab = _mm_cvttpd_epi32(rt);
1116 vfeps = _mm_frcz_pd(rt);
1118 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1120 twovfeps = _mm_add_pd(vfeps,vfeps);
1121 vfitab = _mm_slli_epi32(vfitab,2);
1123 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1124 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1125 F = _mm_setzero_pd();
1126 GMX_MM_TRANSPOSE2_PD(Y,F);
1127 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1128 H = _mm_setzero_pd();
1129 GMX_MM_TRANSPOSE2_PD(G,H);
1130 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1131 VV = _mm_macc_pd(vfeps,Fp,Y);
1132 velec = _mm_mul_pd(qq33,VV);
1133 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1134 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1136 /* Update potential sum for this i atom from the interaction with this j atom. */
1137 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1138 velecsum = _mm_add_pd(velecsum,velec);
1142 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1144 /* Update vectorial force */
1145 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1146 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1147 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1149 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1150 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1151 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1153 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1155 /* Inner loop uses 414 flops */
1158 /* End of innermost loop */
1160 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1161 f+i_coord_offset+DIM,fshift+i_shift_offset);
1164 /* Update potential energies */
1165 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1167 /* Increment number of inner iterations */
1168 inneriter += j_index_end - j_index_start;
1170 /* Outer loop uses 19 flops */
1173 /* Increment number of outer iterations */
1176 /* Update outer/inner flops */
1178 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*414);
1181 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_double
1182 * Electrostatics interaction: CubicSplineTable
1183 * VdW interaction: None
1184 * Geometry: Water4-Water4
1185 * Calculate force/pot: Force
1188 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_double
1189 (t_nblist * gmx_restrict nlist,
1190 rvec * gmx_restrict xx,
1191 rvec * gmx_restrict ff,
1192 struct t_forcerec * gmx_restrict fr,
1193 t_mdatoms * gmx_restrict mdatoms,
1194 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1195 t_nrnb * gmx_restrict nrnb)
1197 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1198 * just 0 for non-waters.
1199 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1200 * jnr indices corresponding to data put in the two positions in the SIMD register.
1202 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1203 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1205 int j_coord_offsetA,j_coord_offsetB;
1206 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1207 real rcutoff_scalar;
1208 real *shiftvec,*fshift,*x,*f;
1209 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1211 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1213 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1215 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1216 int vdwjidx1A,vdwjidx1B;
1217 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1218 int vdwjidx2A,vdwjidx2B;
1219 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1220 int vdwjidx3A,vdwjidx3B;
1221 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1222 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1223 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1224 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1225 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1226 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1227 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1228 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1229 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1230 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1231 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1234 __m128i ifour = _mm_set1_epi32(4);
1235 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1237 __m128d dummy_mask,cutoff_mask;
1238 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1239 __m128d one = _mm_set1_pd(1.0);
1240 __m128d two = _mm_set1_pd(2.0);
1246 jindex = nlist->jindex;
1248 shiftidx = nlist->shift;
1250 shiftvec = fr->shift_vec[0];
1251 fshift = fr->fshift[0];
1252 facel = _mm_set1_pd(fr->ic->epsfac);
1253 charge = mdatoms->chargeA;
1255 vftab = kernel_data->table_elec->data;
1256 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1258 /* Setup water-specific parameters */
1259 inr = nlist->iinr[0];
1260 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1261 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1262 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1264 jq1 = _mm_set1_pd(charge[inr+1]);
1265 jq2 = _mm_set1_pd(charge[inr+2]);
1266 jq3 = _mm_set1_pd(charge[inr+3]);
1267 qq11 = _mm_mul_pd(iq1,jq1);
1268 qq12 = _mm_mul_pd(iq1,jq2);
1269 qq13 = _mm_mul_pd(iq1,jq3);
1270 qq21 = _mm_mul_pd(iq2,jq1);
1271 qq22 = _mm_mul_pd(iq2,jq2);
1272 qq23 = _mm_mul_pd(iq2,jq3);
1273 qq31 = _mm_mul_pd(iq3,jq1);
1274 qq32 = _mm_mul_pd(iq3,jq2);
1275 qq33 = _mm_mul_pd(iq3,jq3);
1277 /* Zero-initialize the variables below to avoid compiler warnings */
1279 j_coord_offsetA = 0;
1280 j_coord_offsetB = 0;
1285 /* Start outer loop over neighborlists */
1286 for(iidx=0; iidx<nri; iidx++)
1288 /* Load shift vector for this list */
1289 i_shift_offset = DIM*shiftidx[iidx];
1291 /* Load limits for loop over neighbors */
1292 j_index_start = jindex[iidx];
1293 j_index_end = jindex[iidx+1];
1295 /* Get outer coordinate index */
1297 i_coord_offset = DIM*inr;
1299 /* Load i particle coords and add shift vector */
1300 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1301 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1303 fix1 = _mm_setzero_pd();
1304 fiy1 = _mm_setzero_pd();
1305 fiz1 = _mm_setzero_pd();
1306 fix2 = _mm_setzero_pd();
1307 fiy2 = _mm_setzero_pd();
1308 fiz2 = _mm_setzero_pd();
1309 fix3 = _mm_setzero_pd();
1310 fiy3 = _mm_setzero_pd();
1311 fiz3 = _mm_setzero_pd();
1313 /* Start inner kernel loop */
1314 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1317 /* Get j neighbor index, and coordinate index */
1319 jnrB = jjnr[jidx+1];
1320 j_coord_offsetA = DIM*jnrA;
1321 j_coord_offsetB = DIM*jnrB;
1323 /* load j atom coordinates */
1324 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1325 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1327 /* Calculate displacement vector */
1328 dx11 = _mm_sub_pd(ix1,jx1);
1329 dy11 = _mm_sub_pd(iy1,jy1);
1330 dz11 = _mm_sub_pd(iz1,jz1);
1331 dx12 = _mm_sub_pd(ix1,jx2);
1332 dy12 = _mm_sub_pd(iy1,jy2);
1333 dz12 = _mm_sub_pd(iz1,jz2);
1334 dx13 = _mm_sub_pd(ix1,jx3);
1335 dy13 = _mm_sub_pd(iy1,jy3);
1336 dz13 = _mm_sub_pd(iz1,jz3);
1337 dx21 = _mm_sub_pd(ix2,jx1);
1338 dy21 = _mm_sub_pd(iy2,jy1);
1339 dz21 = _mm_sub_pd(iz2,jz1);
1340 dx22 = _mm_sub_pd(ix2,jx2);
1341 dy22 = _mm_sub_pd(iy2,jy2);
1342 dz22 = _mm_sub_pd(iz2,jz2);
1343 dx23 = _mm_sub_pd(ix2,jx3);
1344 dy23 = _mm_sub_pd(iy2,jy3);
1345 dz23 = _mm_sub_pd(iz2,jz3);
1346 dx31 = _mm_sub_pd(ix3,jx1);
1347 dy31 = _mm_sub_pd(iy3,jy1);
1348 dz31 = _mm_sub_pd(iz3,jz1);
1349 dx32 = _mm_sub_pd(ix3,jx2);
1350 dy32 = _mm_sub_pd(iy3,jy2);
1351 dz32 = _mm_sub_pd(iz3,jz2);
1352 dx33 = _mm_sub_pd(ix3,jx3);
1353 dy33 = _mm_sub_pd(iy3,jy3);
1354 dz33 = _mm_sub_pd(iz3,jz3);
1356 /* Calculate squared distance and things based on it */
1357 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1358 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1359 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1360 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1361 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1362 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1363 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1364 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1365 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1367 rinv11 = avx128fma_invsqrt_d(rsq11);
1368 rinv12 = avx128fma_invsqrt_d(rsq12);
1369 rinv13 = avx128fma_invsqrt_d(rsq13);
1370 rinv21 = avx128fma_invsqrt_d(rsq21);
1371 rinv22 = avx128fma_invsqrt_d(rsq22);
1372 rinv23 = avx128fma_invsqrt_d(rsq23);
1373 rinv31 = avx128fma_invsqrt_d(rsq31);
1374 rinv32 = avx128fma_invsqrt_d(rsq32);
1375 rinv33 = avx128fma_invsqrt_d(rsq33);
1377 fjx1 = _mm_setzero_pd();
1378 fjy1 = _mm_setzero_pd();
1379 fjz1 = _mm_setzero_pd();
1380 fjx2 = _mm_setzero_pd();
1381 fjy2 = _mm_setzero_pd();
1382 fjz2 = _mm_setzero_pd();
1383 fjx3 = _mm_setzero_pd();
1384 fjy3 = _mm_setzero_pd();
1385 fjz3 = _mm_setzero_pd();
1387 /**************************
1388 * CALCULATE INTERACTIONS *
1389 **************************/
1391 r11 = _mm_mul_pd(rsq11,rinv11);
1393 /* Calculate table index by multiplying r with table scale and truncate to integer */
1394 rt = _mm_mul_pd(r11,vftabscale);
1395 vfitab = _mm_cvttpd_epi32(rt);
1397 vfeps = _mm_frcz_pd(rt);
1399 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1401 twovfeps = _mm_add_pd(vfeps,vfeps);
1402 vfitab = _mm_slli_epi32(vfitab,2);
1404 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1405 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1406 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1407 GMX_MM_TRANSPOSE2_PD(Y,F);
1408 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1409 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1410 GMX_MM_TRANSPOSE2_PD(G,H);
1411 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1412 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1413 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1417 /* Update vectorial force */
1418 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1419 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1420 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1422 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1423 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1424 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1426 /**************************
1427 * CALCULATE INTERACTIONS *
1428 **************************/
1430 r12 = _mm_mul_pd(rsq12,rinv12);
1432 /* Calculate table index by multiplying r with table scale and truncate to integer */
1433 rt = _mm_mul_pd(r12,vftabscale);
1434 vfitab = _mm_cvttpd_epi32(rt);
1436 vfeps = _mm_frcz_pd(rt);
1438 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1440 twovfeps = _mm_add_pd(vfeps,vfeps);
1441 vfitab = _mm_slli_epi32(vfitab,2);
1443 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1444 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1445 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1446 GMX_MM_TRANSPOSE2_PD(Y,F);
1447 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1448 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1449 GMX_MM_TRANSPOSE2_PD(G,H);
1450 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1451 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1452 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1456 /* Update vectorial force */
1457 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1458 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1459 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1461 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1462 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1463 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1465 /**************************
1466 * CALCULATE INTERACTIONS *
1467 **************************/
1469 r13 = _mm_mul_pd(rsq13,rinv13);
1471 /* Calculate table index by multiplying r with table scale and truncate to integer */
1472 rt = _mm_mul_pd(r13,vftabscale);
1473 vfitab = _mm_cvttpd_epi32(rt);
1475 vfeps = _mm_frcz_pd(rt);
1477 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1479 twovfeps = _mm_add_pd(vfeps,vfeps);
1480 vfitab = _mm_slli_epi32(vfitab,2);
1482 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1483 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1484 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1485 GMX_MM_TRANSPOSE2_PD(Y,F);
1486 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1487 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1488 GMX_MM_TRANSPOSE2_PD(G,H);
1489 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1490 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1491 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1495 /* Update vectorial force */
1496 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1497 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1498 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1500 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1501 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1502 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 r21 = _mm_mul_pd(rsq21,rinv21);
1510 /* Calculate table index by multiplying r with table scale and truncate to integer */
1511 rt = _mm_mul_pd(r21,vftabscale);
1512 vfitab = _mm_cvttpd_epi32(rt);
1514 vfeps = _mm_frcz_pd(rt);
1516 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1518 twovfeps = _mm_add_pd(vfeps,vfeps);
1519 vfitab = _mm_slli_epi32(vfitab,2);
1521 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1522 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1523 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1524 GMX_MM_TRANSPOSE2_PD(Y,F);
1525 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1526 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1527 GMX_MM_TRANSPOSE2_PD(G,H);
1528 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1529 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1530 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1534 /* Update vectorial force */
1535 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1536 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1537 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1539 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1540 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1541 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1543 /**************************
1544 * CALCULATE INTERACTIONS *
1545 **************************/
1547 r22 = _mm_mul_pd(rsq22,rinv22);
1549 /* Calculate table index by multiplying r with table scale and truncate to integer */
1550 rt = _mm_mul_pd(r22,vftabscale);
1551 vfitab = _mm_cvttpd_epi32(rt);
1553 vfeps = _mm_frcz_pd(rt);
1555 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1557 twovfeps = _mm_add_pd(vfeps,vfeps);
1558 vfitab = _mm_slli_epi32(vfitab,2);
1560 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1561 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1562 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1563 GMX_MM_TRANSPOSE2_PD(Y,F);
1564 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1565 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1566 GMX_MM_TRANSPOSE2_PD(G,H);
1567 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1568 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1569 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1573 /* Update vectorial force */
1574 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1575 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1576 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1578 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1579 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1580 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1582 /**************************
1583 * CALCULATE INTERACTIONS *
1584 **************************/
1586 r23 = _mm_mul_pd(rsq23,rinv23);
1588 /* Calculate table index by multiplying r with table scale and truncate to integer */
1589 rt = _mm_mul_pd(r23,vftabscale);
1590 vfitab = _mm_cvttpd_epi32(rt);
1592 vfeps = _mm_frcz_pd(rt);
1594 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1596 twovfeps = _mm_add_pd(vfeps,vfeps);
1597 vfitab = _mm_slli_epi32(vfitab,2);
1599 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1600 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1601 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1602 GMX_MM_TRANSPOSE2_PD(Y,F);
1603 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1604 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1605 GMX_MM_TRANSPOSE2_PD(G,H);
1606 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1607 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1608 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1612 /* Update vectorial force */
1613 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1614 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1615 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1617 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1618 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1619 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 r31 = _mm_mul_pd(rsq31,rinv31);
1627 /* Calculate table index by multiplying r with table scale and truncate to integer */
1628 rt = _mm_mul_pd(r31,vftabscale);
1629 vfitab = _mm_cvttpd_epi32(rt);
1631 vfeps = _mm_frcz_pd(rt);
1633 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1635 twovfeps = _mm_add_pd(vfeps,vfeps);
1636 vfitab = _mm_slli_epi32(vfitab,2);
1638 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1639 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1640 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1641 GMX_MM_TRANSPOSE2_PD(Y,F);
1642 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1643 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1644 GMX_MM_TRANSPOSE2_PD(G,H);
1645 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1646 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1647 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1651 /* Update vectorial force */
1652 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1653 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1654 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1656 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1657 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1658 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
1664 r32 = _mm_mul_pd(rsq32,rinv32);
1666 /* Calculate table index by multiplying r with table scale and truncate to integer */
1667 rt = _mm_mul_pd(r32,vftabscale);
1668 vfitab = _mm_cvttpd_epi32(rt);
1670 vfeps = _mm_frcz_pd(rt);
1672 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1674 twovfeps = _mm_add_pd(vfeps,vfeps);
1675 vfitab = _mm_slli_epi32(vfitab,2);
1677 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1678 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1679 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1680 GMX_MM_TRANSPOSE2_PD(Y,F);
1681 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1682 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1683 GMX_MM_TRANSPOSE2_PD(G,H);
1684 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1685 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1686 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1690 /* Update vectorial force */
1691 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1692 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1693 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1695 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1696 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1697 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1699 /**************************
1700 * CALCULATE INTERACTIONS *
1701 **************************/
1703 r33 = _mm_mul_pd(rsq33,rinv33);
1705 /* Calculate table index by multiplying r with table scale and truncate to integer */
1706 rt = _mm_mul_pd(r33,vftabscale);
1707 vfitab = _mm_cvttpd_epi32(rt);
1709 vfeps = _mm_frcz_pd(rt);
1711 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1713 twovfeps = _mm_add_pd(vfeps,vfeps);
1714 vfitab = _mm_slli_epi32(vfitab,2);
1716 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1717 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1718 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1719 GMX_MM_TRANSPOSE2_PD(Y,F);
1720 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1721 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1722 GMX_MM_TRANSPOSE2_PD(G,H);
1723 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1724 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1725 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1729 /* Update vectorial force */
1730 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1731 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1732 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1734 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1735 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1736 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1738 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1740 /* Inner loop uses 378 flops */
1743 if(jidx<j_index_end)
1747 j_coord_offsetA = DIM*jnrA;
1749 /* load j atom coordinates */
1750 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
1751 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1753 /* Calculate displacement vector */
1754 dx11 = _mm_sub_pd(ix1,jx1);
1755 dy11 = _mm_sub_pd(iy1,jy1);
1756 dz11 = _mm_sub_pd(iz1,jz1);
1757 dx12 = _mm_sub_pd(ix1,jx2);
1758 dy12 = _mm_sub_pd(iy1,jy2);
1759 dz12 = _mm_sub_pd(iz1,jz2);
1760 dx13 = _mm_sub_pd(ix1,jx3);
1761 dy13 = _mm_sub_pd(iy1,jy3);
1762 dz13 = _mm_sub_pd(iz1,jz3);
1763 dx21 = _mm_sub_pd(ix2,jx1);
1764 dy21 = _mm_sub_pd(iy2,jy1);
1765 dz21 = _mm_sub_pd(iz2,jz1);
1766 dx22 = _mm_sub_pd(ix2,jx2);
1767 dy22 = _mm_sub_pd(iy2,jy2);
1768 dz22 = _mm_sub_pd(iz2,jz2);
1769 dx23 = _mm_sub_pd(ix2,jx3);
1770 dy23 = _mm_sub_pd(iy2,jy3);
1771 dz23 = _mm_sub_pd(iz2,jz3);
1772 dx31 = _mm_sub_pd(ix3,jx1);
1773 dy31 = _mm_sub_pd(iy3,jy1);
1774 dz31 = _mm_sub_pd(iz3,jz1);
1775 dx32 = _mm_sub_pd(ix3,jx2);
1776 dy32 = _mm_sub_pd(iy3,jy2);
1777 dz32 = _mm_sub_pd(iz3,jz2);
1778 dx33 = _mm_sub_pd(ix3,jx3);
1779 dy33 = _mm_sub_pd(iy3,jy3);
1780 dz33 = _mm_sub_pd(iz3,jz3);
1782 /* Calculate squared distance and things based on it */
1783 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1784 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1785 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1786 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1787 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1788 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1789 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1790 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1791 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1793 rinv11 = avx128fma_invsqrt_d(rsq11);
1794 rinv12 = avx128fma_invsqrt_d(rsq12);
1795 rinv13 = avx128fma_invsqrt_d(rsq13);
1796 rinv21 = avx128fma_invsqrt_d(rsq21);
1797 rinv22 = avx128fma_invsqrt_d(rsq22);
1798 rinv23 = avx128fma_invsqrt_d(rsq23);
1799 rinv31 = avx128fma_invsqrt_d(rsq31);
1800 rinv32 = avx128fma_invsqrt_d(rsq32);
1801 rinv33 = avx128fma_invsqrt_d(rsq33);
1803 fjx1 = _mm_setzero_pd();
1804 fjy1 = _mm_setzero_pd();
1805 fjz1 = _mm_setzero_pd();
1806 fjx2 = _mm_setzero_pd();
1807 fjy2 = _mm_setzero_pd();
1808 fjz2 = _mm_setzero_pd();
1809 fjx3 = _mm_setzero_pd();
1810 fjy3 = _mm_setzero_pd();
1811 fjz3 = _mm_setzero_pd();
1813 /**************************
1814 * CALCULATE INTERACTIONS *
1815 **************************/
1817 r11 = _mm_mul_pd(rsq11,rinv11);
1819 /* Calculate table index by multiplying r with table scale and truncate to integer */
1820 rt = _mm_mul_pd(r11,vftabscale);
1821 vfitab = _mm_cvttpd_epi32(rt);
1823 vfeps = _mm_frcz_pd(rt);
1825 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1827 twovfeps = _mm_add_pd(vfeps,vfeps);
1828 vfitab = _mm_slli_epi32(vfitab,2);
1830 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1831 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1832 F = _mm_setzero_pd();
1833 GMX_MM_TRANSPOSE2_PD(Y,F);
1834 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1835 H = _mm_setzero_pd();
1836 GMX_MM_TRANSPOSE2_PD(G,H);
1837 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1838 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1839 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1843 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1845 /* Update vectorial force */
1846 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1847 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1848 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1850 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1851 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1852 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1854 /**************************
1855 * CALCULATE INTERACTIONS *
1856 **************************/
1858 r12 = _mm_mul_pd(rsq12,rinv12);
1860 /* Calculate table index by multiplying r with table scale and truncate to integer */
1861 rt = _mm_mul_pd(r12,vftabscale);
1862 vfitab = _mm_cvttpd_epi32(rt);
1864 vfeps = _mm_frcz_pd(rt);
1866 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1868 twovfeps = _mm_add_pd(vfeps,vfeps);
1869 vfitab = _mm_slli_epi32(vfitab,2);
1871 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1872 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1873 F = _mm_setzero_pd();
1874 GMX_MM_TRANSPOSE2_PD(Y,F);
1875 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1876 H = _mm_setzero_pd();
1877 GMX_MM_TRANSPOSE2_PD(G,H);
1878 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1879 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1880 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1884 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1886 /* Update vectorial force */
1887 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1888 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1889 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1891 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1892 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1893 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1895 /**************************
1896 * CALCULATE INTERACTIONS *
1897 **************************/
1899 r13 = _mm_mul_pd(rsq13,rinv13);
1901 /* Calculate table index by multiplying r with table scale and truncate to integer */
1902 rt = _mm_mul_pd(r13,vftabscale);
1903 vfitab = _mm_cvttpd_epi32(rt);
1905 vfeps = _mm_frcz_pd(rt);
1907 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1909 twovfeps = _mm_add_pd(vfeps,vfeps);
1910 vfitab = _mm_slli_epi32(vfitab,2);
1912 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1913 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1914 F = _mm_setzero_pd();
1915 GMX_MM_TRANSPOSE2_PD(Y,F);
1916 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1917 H = _mm_setzero_pd();
1918 GMX_MM_TRANSPOSE2_PD(G,H);
1919 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1920 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1921 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1925 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1927 /* Update vectorial force */
1928 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1929 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1930 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1932 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1933 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1934 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1936 /**************************
1937 * CALCULATE INTERACTIONS *
1938 **************************/
1940 r21 = _mm_mul_pd(rsq21,rinv21);
1942 /* Calculate table index by multiplying r with table scale and truncate to integer */
1943 rt = _mm_mul_pd(r21,vftabscale);
1944 vfitab = _mm_cvttpd_epi32(rt);
1946 vfeps = _mm_frcz_pd(rt);
1948 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1950 twovfeps = _mm_add_pd(vfeps,vfeps);
1951 vfitab = _mm_slli_epi32(vfitab,2);
1953 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1954 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1955 F = _mm_setzero_pd();
1956 GMX_MM_TRANSPOSE2_PD(Y,F);
1957 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1958 H = _mm_setzero_pd();
1959 GMX_MM_TRANSPOSE2_PD(G,H);
1960 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1961 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1962 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1966 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1968 /* Update vectorial force */
1969 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1970 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1971 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1973 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1974 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1975 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1977 /**************************
1978 * CALCULATE INTERACTIONS *
1979 **************************/
1981 r22 = _mm_mul_pd(rsq22,rinv22);
1983 /* Calculate table index by multiplying r with table scale and truncate to integer */
1984 rt = _mm_mul_pd(r22,vftabscale);
1985 vfitab = _mm_cvttpd_epi32(rt);
1987 vfeps = _mm_frcz_pd(rt);
1989 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1991 twovfeps = _mm_add_pd(vfeps,vfeps);
1992 vfitab = _mm_slli_epi32(vfitab,2);
1994 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1995 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1996 F = _mm_setzero_pd();
1997 GMX_MM_TRANSPOSE2_PD(Y,F);
1998 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1999 H = _mm_setzero_pd();
2000 GMX_MM_TRANSPOSE2_PD(G,H);
2001 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2002 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2003 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2007 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2009 /* Update vectorial force */
2010 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2011 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2012 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2014 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2015 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2016 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2018 /**************************
2019 * CALCULATE INTERACTIONS *
2020 **************************/
2022 r23 = _mm_mul_pd(rsq23,rinv23);
2024 /* Calculate table index by multiplying r with table scale and truncate to integer */
2025 rt = _mm_mul_pd(r23,vftabscale);
2026 vfitab = _mm_cvttpd_epi32(rt);
2028 vfeps = _mm_frcz_pd(rt);
2030 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2032 twovfeps = _mm_add_pd(vfeps,vfeps);
2033 vfitab = _mm_slli_epi32(vfitab,2);
2035 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2036 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2037 F = _mm_setzero_pd();
2038 GMX_MM_TRANSPOSE2_PD(Y,F);
2039 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2040 H = _mm_setzero_pd();
2041 GMX_MM_TRANSPOSE2_PD(G,H);
2042 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2043 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2044 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2048 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2050 /* Update vectorial force */
2051 fix2 = _mm_macc_pd(dx23,fscal,fix2);
2052 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
2053 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
2055 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
2056 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
2057 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
2059 /**************************
2060 * CALCULATE INTERACTIONS *
2061 **************************/
2063 r31 = _mm_mul_pd(rsq31,rinv31);
2065 /* Calculate table index by multiplying r with table scale and truncate to integer */
2066 rt = _mm_mul_pd(r31,vftabscale);
2067 vfitab = _mm_cvttpd_epi32(rt);
2069 vfeps = _mm_frcz_pd(rt);
2071 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2073 twovfeps = _mm_add_pd(vfeps,vfeps);
2074 vfitab = _mm_slli_epi32(vfitab,2);
2076 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2077 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2078 F = _mm_setzero_pd();
2079 GMX_MM_TRANSPOSE2_PD(Y,F);
2080 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2081 H = _mm_setzero_pd();
2082 GMX_MM_TRANSPOSE2_PD(G,H);
2083 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2084 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2085 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2089 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2091 /* Update vectorial force */
2092 fix3 = _mm_macc_pd(dx31,fscal,fix3);
2093 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
2094 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
2096 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
2097 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
2098 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
2100 /**************************
2101 * CALCULATE INTERACTIONS *
2102 **************************/
2104 r32 = _mm_mul_pd(rsq32,rinv32);
2106 /* Calculate table index by multiplying r with table scale and truncate to integer */
2107 rt = _mm_mul_pd(r32,vftabscale);
2108 vfitab = _mm_cvttpd_epi32(rt);
2110 vfeps = _mm_frcz_pd(rt);
2112 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2114 twovfeps = _mm_add_pd(vfeps,vfeps);
2115 vfitab = _mm_slli_epi32(vfitab,2);
2117 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2118 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2119 F = _mm_setzero_pd();
2120 GMX_MM_TRANSPOSE2_PD(Y,F);
2121 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2122 H = _mm_setzero_pd();
2123 GMX_MM_TRANSPOSE2_PD(G,H);
2124 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2125 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2126 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2130 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2132 /* Update vectorial force */
2133 fix3 = _mm_macc_pd(dx32,fscal,fix3);
2134 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
2135 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
2137 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
2138 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
2139 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
2141 /**************************
2142 * CALCULATE INTERACTIONS *
2143 **************************/
2145 r33 = _mm_mul_pd(rsq33,rinv33);
2147 /* Calculate table index by multiplying r with table scale and truncate to integer */
2148 rt = _mm_mul_pd(r33,vftabscale);
2149 vfitab = _mm_cvttpd_epi32(rt);
2151 vfeps = _mm_frcz_pd(rt);
2153 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2155 twovfeps = _mm_add_pd(vfeps,vfeps);
2156 vfitab = _mm_slli_epi32(vfitab,2);
2158 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2159 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2160 F = _mm_setzero_pd();
2161 GMX_MM_TRANSPOSE2_PD(Y,F);
2162 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2163 H = _mm_setzero_pd();
2164 GMX_MM_TRANSPOSE2_PD(G,H);
2165 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2166 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2167 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2171 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2173 /* Update vectorial force */
2174 fix3 = _mm_macc_pd(dx33,fscal,fix3);
2175 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
2176 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
2178 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
2179 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
2180 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
2182 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2184 /* Inner loop uses 378 flops */
2187 /* End of innermost loop */
2189 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2190 f+i_coord_offset+DIM,fshift+i_shift_offset);
2192 /* Increment number of inner iterations */
2193 inneriter += j_index_end - j_index_start;
2195 /* Outer loop uses 18 flops */
2198 /* Increment number of outer iterations */
2201 /* Update outer/inner flops */
2203 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*378);