2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_double
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Particle
54 * Calculate force/pot: PotentialAndForce
/*
 * Force-and-potential kernel for a three-site water molecule (i sites 0,1,2)
 * interacting with single j particles.
 *
 * All three i sites interact with the j particle through cubic-spline
 * tabulated electrostatics; only i site 0 additionally gets a Lennard-Jones
 * term (only c6_00/c12_00 and rinvsq00 are ever computed).
 *
 * Inputs read in the visible code:
 *   nlist       - iinr, jindex and shift arrays of the neighbor list
 *   fr          - shift_vec, fshift, ic->epsfac and ntype
 *   mdatoms     - chargeA and typeA
 *   kernel_data - table_elec (data, scale) and the energygrp_elec/energygrp_vdw
 *                 accumulators (it is tagged gmx_unused although it is read)
 *   nrnb        - passed to inc_nrnb for flop accounting
 * xx and ff are not referenced directly in the visible lines; the code works
 * on x and f, which are presumably derived from them -- TODO confirm.
 *
 * NOTE(review): this view of the file appears to be missing lines (braces,
 * preprocessor guards, some declarations and assignments such as nri, jjnr,
 * jnrA/jnrB, vdwparam, ggid) -- confirm against the generator output before
 * relying on any statement order shown here.
 */
57 nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_avx_128_fma_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the two positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85 int vdwjidx0A,vdwjidx0B;
86 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
88 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
89 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
90 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
97 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
99 __m128i ifour = _mm_set1_epi32(4);
100 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
102 __m128d dummy_mask,cutoff_mask;
103 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
104 __m128d one = _mm_set1_pd(1.0);
105 __m128d two = _mm_set1_pd(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm_set1_pd(fr->ic->epsfac);
118 charge = mdatoms->chargeA;
119 nvdwtype = fr->ntype;
121 vdwtype = mdatoms->typeA;
123 vftab = kernel_data->table_elec->data;
124 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
126 /* Setup water-specific parameters */
/* The three i-site charges are taken from the first i entry of the list and pre-multiplied by facel once, outside the loops. */
127 inr = nlist->iinr[0];
128 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
129 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
130 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
/* Offset of i site 0 into vdwparam; no such offset exists for sites 1 and 2 because they get no Lennard-Jones term here. */
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
133 /* Avoid stupid compiler warnings */
141 /* Start outer loop over neighborlists */
142 for(iidx=0; iidx<nri; iidx++)
144 /* Load shift vector for this list */
145 i_shift_offset = DIM*shiftidx[iidx];
147 /* Load limits for loop over neighbors */
148 j_index_start = jindex[iidx];
149 j_index_end = jindex[iidx+1];
151 /* Get outer coordinate index */
153 i_coord_offset = DIM*inr;
155 /* Load i particle coords and add shift vector */
156 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
157 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Zero the per-molecule i-force accumulators for the three sites. */
159 fix0 = _mm_setzero_pd();
160 fiy0 = _mm_setzero_pd();
161 fiz0 = _mm_setzero_pd();
162 fix1 = _mm_setzero_pd();
163 fiy1 = _mm_setzero_pd();
164 fiz1 = _mm_setzero_pd();
165 fix2 = _mm_setzero_pd();
166 fiy2 = _mm_setzero_pd();
167 fiz2 = _mm_setzero_pd();
169 /* Reset potential sums */
170 velecsum = _mm_setzero_pd();
171 vvdwsum = _mm_setzero_pd();
173 /* Start inner kernel loop */
/* Two j entries (lanes A and B) per iteration; the bound j_index_end-1 leaves an odd final entry unprocessed by this loop. */
174 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
177 /* Get j neighbor index, and coordinate index */
180 j_coord_offsetA = DIM*jnrA;
181 j_coord_offsetB = DIM*jnrB;
183 /* load j atom coordinates */
184 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
187 /* Calculate displacement vector */
188 dx00 = _mm_sub_pd(ix0,jx0);
189 dy00 = _mm_sub_pd(iy0,jy0);
190 dz00 = _mm_sub_pd(iz0,jz0);
191 dx10 = _mm_sub_pd(ix1,jx0);
192 dy10 = _mm_sub_pd(iy1,jy0);
193 dz10 = _mm_sub_pd(iz1,jz0);
194 dx20 = _mm_sub_pd(ix2,jx0);
195 dy20 = _mm_sub_pd(iy2,jy0);
196 dz20 = _mm_sub_pd(iz2,jz0);
198 /* Calculate squared distance and things based on it */
199 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
200 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
201 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
203 rinv00 = avx128fma_invsqrt_d(rsq00);
204 rinv10 = avx128fma_invsqrt_d(rsq10);
205 rinv20 = avx128fma_invsqrt_d(rsq20);
/* Only the 0-0 pair needs the squared inverse distance; it feeds the Lennard-Jones terms below. */
207 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
209 /* Load parameters for j particles */
210 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
211 vdwjidx0A = 2*vdwtype[jnrA+0];
212 vdwjidx0B = 2*vdwtype[jnrB+0];
214 fjx0 = _mm_setzero_pd();
215 fjy0 = _mm_setzero_pd();
216 fjz0 = _mm_setzero_pd();
218 /**************************
219 * CALCULATE INTERACTIONS *
220 **************************/
/* Pair i site 0 - j: tabulated electrostatics plus Lennard-Jones. r is recovered as rsq * rinv. */
222 r00 = _mm_mul_pd(rsq00,rinv00);
224 /* Compute parameters for interactions between i and j atoms */
225 qq00 = _mm_mul_pd(iq0,jq0);
226 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
227 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
229 /* Calculate table index by multiplying r with table scale and truncate to integer */
230 rt = _mm_mul_pd(r00,vftabscale);
231 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): vfeps is assigned twice in a row; presumably these are the two branches of a preprocessor conditional that is not visible in this view -- confirm. */
233 vfeps = _mm_frcz_pd(rt);
235 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
237 twovfeps = _mm_add_pd(vfeps,vfeps);
/* Shift left by 2, i.e. multiply the table index by 4. */
238 vfitab = _mm_slli_epi32(vfitab,2);
240 /* CUBIC SPLINE TABLE ELECTROSTATICS */
/* Two doubles are loaded at each lane's table offset and two more at offset+2; presumably the transposes regroup them so Y,F,G,H each hold one quantity for both lanes -- confirm against GMX_MM_TRANSPOSE2_PD. */
241 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
242 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
243 GMX_MM_TRANSPOSE2_PD(Y,F);
244 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
245 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
246 GMX_MM_TRANSPOSE2_PD(G,H);
/* Fp = F + eps*(G + eps*H); VV = Y + eps*Fp; FF = Fp + eps*(G + 2*eps*H). */
247 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
248 VV = _mm_macc_pd(vfeps,Fp,Y);
249 velec = _mm_mul_pd(qq00,VV);
250 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
/* felec = -(qq * FF * tabscale * rinv); the xor with signbit performs the negation. */
251 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
253 /* LENNARD-JONES DISPERSION/REPULSION */
/* vvdw = vvdw12/12 - vvdw6/6; fvdw = (vvdw12 - vvdw6) * rinvsq. */
255 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
256 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
257 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
258 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
259 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
261 /* Update potential sum for this i atom from the interaction with this j atom. */
262 velecsum = _mm_add_pd(velecsum,velec);
263 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
265 fscal = _mm_add_pd(felec,fvdw);
267 /* Update vectorial force */
268 fix0 = _mm_macc_pd(dx00,fscal,fix0);
269 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
270 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
/* The same d*fscal contribution is accumulated for the j particle and handed to the decrement helper at the end of the iteration. */
272 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
273 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
274 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
276 /**************************
277 * CALCULATE INTERACTIONS *
278 **************************/
/* Pair i site 1 - j: tabulated electrostatics only, same spline evaluation as for the 0-0 pair. */
280 r10 = _mm_mul_pd(rsq10,rinv10);
282 /* Compute parameters for interactions between i and j atoms */
283 qq10 = _mm_mul_pd(iq1,jq0);
285 /* Calculate table index by multiplying r with table scale and truncate to integer */
286 rt = _mm_mul_pd(r10,vftabscale);
287 vfitab = _mm_cvttpd_epi32(rt);
289 vfeps = _mm_frcz_pd(rt);
291 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
293 twovfeps = _mm_add_pd(vfeps,vfeps);
294 vfitab = _mm_slli_epi32(vfitab,2);
296 /* CUBIC SPLINE TABLE ELECTROSTATICS */
297 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
298 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
299 GMX_MM_TRANSPOSE2_PD(Y,F);
300 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
301 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
302 GMX_MM_TRANSPOSE2_PD(G,H);
303 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
304 VV = _mm_macc_pd(vfeps,Fp,Y);
305 velec = _mm_mul_pd(qq10,VV);
306 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
307 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
309 /* Update potential sum for this i atom from the interaction with this j atom. */
310 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): no assignment of fscal from this pair's felec is visible before the force update below; a line may be missing from this view -- confirm. */
314 /* Update vectorial force */
315 fix1 = _mm_macc_pd(dx10,fscal,fix1);
316 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
317 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
319 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
320 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
321 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
323 /**************************
324 * CALCULATE INTERACTIONS *
325 **************************/
/* Pair i site 2 - j: tabulated electrostatics only. */
327 r20 = _mm_mul_pd(rsq20,rinv20);
329 /* Compute parameters for interactions between i and j atoms */
330 qq20 = _mm_mul_pd(iq2,jq0);
332 /* Calculate table index by multiplying r with table scale and truncate to integer */
333 rt = _mm_mul_pd(r20,vftabscale);
334 vfitab = _mm_cvttpd_epi32(rt);
336 vfeps = _mm_frcz_pd(rt);
338 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
340 twovfeps = _mm_add_pd(vfeps,vfeps);
341 vfitab = _mm_slli_epi32(vfitab,2);
343 /* CUBIC SPLINE TABLE ELECTROSTATICS */
344 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
345 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
346 GMX_MM_TRANSPOSE2_PD(Y,F);
347 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
348 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
349 GMX_MM_TRANSPOSE2_PD(G,H);
350 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
351 VV = _mm_macc_pd(vfeps,Fp,Y);
352 velec = _mm_mul_pd(qq20,VV);
353 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
354 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): as for the 1-0 pair, fscal is not visibly refreshed from felec before it is used here -- confirm. */
361 /* Update vectorial force */
362 fix2 = _mm_macc_pd(dx20,fscal,fix2);
363 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
364 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
366 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
367 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
368 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
/* Apply the accumulated j contribution to the force entries of both j particles. */
370 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
372 /* Inner loop uses 154 flops */
/* Single-entry variant of the pair code above: one j particle (A) whose data sits in the low lane. NOTE(review): the condition guarding this section and the assignment of jnrA are not visible in this view -- confirm. */
379 j_coord_offsetA = DIM*jnrA;
381 /* load j atom coordinates */
382 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
385 /* Calculate displacement vector */
386 dx00 = _mm_sub_pd(ix0,jx0);
387 dy00 = _mm_sub_pd(iy0,jy0);
388 dz00 = _mm_sub_pd(iz0,jz0);
389 dx10 = _mm_sub_pd(ix1,jx0);
390 dy10 = _mm_sub_pd(iy1,jy0);
391 dz10 = _mm_sub_pd(iz1,jz0);
392 dx20 = _mm_sub_pd(ix2,jx0);
393 dy20 = _mm_sub_pd(iy2,jy0);
394 dz20 = _mm_sub_pd(iz2,jz0);
396 /* Calculate squared distance and things based on it */
397 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
398 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
399 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
401 rinv00 = avx128fma_invsqrt_d(rsq00);
402 rinv10 = avx128fma_invsqrt_d(rsq10);
403 rinv20 = avx128fma_invsqrt_d(rsq20);
405 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
407 /* Load parameters for j particles */
408 jq0 = _mm_load_sd(charge+jnrA+0);
409 vdwjidx0A = 2*vdwtype[jnrA+0];
411 fjx0 = _mm_setzero_pd();
412 fjy0 = _mm_setzero_pd();
413 fjz0 = _mm_setzero_pd();
415 /**************************
416 * CALCULATE INTERACTIONS *
417 **************************/
419 r00 = _mm_mul_pd(rsq00,rinv00);
421 /* Compute parameters for interactions between i and j atoms */
422 qq00 = _mm_mul_pd(iq0,jq0);
423 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
425 /* Calculate table index by multiplying r with table scale and truncate to integer */
426 rt = _mm_mul_pd(r00,vftabscale);
427 vfitab = _mm_cvttpd_epi32(rt);
429 vfeps = _mm_frcz_pd(rt);
431 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
433 twovfeps = _mm_add_pd(vfeps,vfeps);
434 vfitab = _mm_slli_epi32(vfitab,2);
436 /* CUBIC SPLINE TABLE ELECTROSTATICS */
/* Only table index 0 is loaded; zeros stand in for the second lane's table data. */
437 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
438 F = _mm_setzero_pd();
439 GMX_MM_TRANSPOSE2_PD(Y,F);
440 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
441 H = _mm_setzero_pd();
442 GMX_MM_TRANSPOSE2_PD(G,H);
443 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
444 VV = _mm_macc_pd(vfeps,Fp,Y);
445 velec = _mm_mul_pd(qq00,VV);
446 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
447 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
449 /* LENNARD-JONES DISPERSION/REPULSION */
451 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
452 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
453 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
454 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
455 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
457 /* Update potential sum for this i atom from the interaction with this j atom. */
/* unpacklo with zero keeps the low lane and clears the upper lane, so the empty lane adds nothing to the sums. */
458 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
459 velecsum = _mm_add_pd(velecsum,velec);
460 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
461 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
463 fscal = _mm_add_pd(felec,fvdw);
465 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
467 /* Update vectorial force */
468 fix0 = _mm_macc_pd(dx00,fscal,fix0);
469 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
470 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
472 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
473 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
474 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 r10 = _mm_mul_pd(rsq10,rinv10);
482 /* Compute parameters for interactions between i and j atoms */
483 qq10 = _mm_mul_pd(iq1,jq0);
485 /* Calculate table index by multiplying r with table scale and truncate to integer */
486 rt = _mm_mul_pd(r10,vftabscale);
487 vfitab = _mm_cvttpd_epi32(rt);
489 vfeps = _mm_frcz_pd(rt);
491 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
493 twovfeps = _mm_add_pd(vfeps,vfeps);
494 vfitab = _mm_slli_epi32(vfitab,2);
496 /* CUBIC SPLINE TABLE ELECTROSTATICS */
497 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
498 F = _mm_setzero_pd();
499 GMX_MM_TRANSPOSE2_PD(Y,F);
500 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
501 H = _mm_setzero_pd();
502 GMX_MM_TRANSPOSE2_PD(G,H);
503 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
504 VV = _mm_macc_pd(vfeps,Fp,Y);
505 velec = _mm_mul_pd(qq10,VV);
506 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
507 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
511 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): fscal is masked here without a visible assignment from this pair's felec; a line may be missing from this view -- confirm. */
515 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
517 /* Update vectorial force */
518 fix1 = _mm_macc_pd(dx10,fscal,fix1);
519 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
520 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
522 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
523 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
524 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 r20 = _mm_mul_pd(rsq20,rinv20);
532 /* Compute parameters for interactions between i and j atoms */
533 qq20 = _mm_mul_pd(iq2,jq0);
535 /* Calculate table index by multiplying r with table scale and truncate to integer */
536 rt = _mm_mul_pd(r20,vftabscale);
537 vfitab = _mm_cvttpd_epi32(rt);
539 vfeps = _mm_frcz_pd(rt);
541 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
543 twovfeps = _mm_add_pd(vfeps,vfeps);
544 vfitab = _mm_slli_epi32(vfitab,2);
546 /* CUBIC SPLINE TABLE ELECTROSTATICS */
547 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
548 F = _mm_setzero_pd();
549 GMX_MM_TRANSPOSE2_PD(Y,F);
550 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
551 H = _mm_setzero_pd();
552 GMX_MM_TRANSPOSE2_PD(G,H);
553 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
554 VV = _mm_macc_pd(vfeps,Fp,Y);
555 velec = _mm_mul_pd(qq20,VV);
556 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
557 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
561 velecsum = _mm_add_pd(velecsum,velec);
565 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
567 /* Update vectorial force */
568 fix2 = _mm_macc_pd(dx20,fscal,fix2);
569 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
570 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
572 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
573 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
574 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
576 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
578 /* Inner loop uses 154 flops */
581 /* End of innermost loop */
/* Hand the accumulated i-site forces to the helper together with the force and shift-force destinations for this i entry. */
583 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
584 f+i_coord_offset,fshift+i_shift_offset);
587 /* Update potential energies */
588 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
589 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
591 /* Increment number of inner iterations */
592 inneriter += j_index_end - j_index_start;
594 /* Outer loop uses 20 flops */
597 /* Increment number of outer iterations */
600 /* Update outer/inner flops */
/* Flop accounting uses the per-iteration costs quoted above: 20 per outer and 154 per inner iteration. */
602 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
605 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_double
606 * Electrostatics interaction: CubicSplineTable
607 * VdW interaction: LennardJones
608 * Geometry: Water3-Particle
609 * Calculate force/pot: Force
/*
 * Force-only kernel for a three-site water molecule (i sites 0,1,2)
 * interacting with single j particles.
 *
 * All three i sites interact with the j particle through cubic-spline
 * tabulated electrostatics; only i site 0 additionally gets a Lennard-Jones
 * term. No potential energies are accumulated in the visible code.
 *
 * Inputs read in the visible code:
 *   nlist       - iinr, jindex and shift arrays of the neighbor list
 *   fr          - shift_vec, fshift, ic->epsfac and ntype
 *   mdatoms     - chargeA and typeA
 *   kernel_data - table_elec (data, scale); it is tagged gmx_unused although
 *                 it is read
 *   nrnb        - passed to inc_nrnb for flop accounting
 * xx and ff are not referenced directly in the visible lines; the code works
 * on x and f, which are presumably derived from them -- TODO confirm.
 *
 * NOTE(review): this view of the file appears to be missing lines (braces,
 * preprocessor guards, some declarations and assignments such as nri, jjnr,
 * jnrA/jnrB, vdwparam) -- confirm against the generator output before
 * relying on any statement order shown here.
 */
612 nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_avx_128_fma_double
613 (t_nblist * gmx_restrict nlist,
614 rvec * gmx_restrict xx,
615 rvec * gmx_restrict ff,
616 struct t_forcerec * gmx_restrict fr,
617 t_mdatoms * gmx_restrict mdatoms,
618 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
619 t_nrnb * gmx_restrict nrnb)
621 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
622 * just 0 for non-waters.
623 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
624 * jnr indices corresponding to data put in the two positions in the SIMD register.
626 int i_shift_offset,i_coord_offset,outeriter,inneriter;
627 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
629 int j_coord_offsetA,j_coord_offsetB;
630 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
632 real *shiftvec,*fshift,*x,*f;
633 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
635 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
637 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
639 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
640 int vdwjidx0A,vdwjidx0B;
641 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
642 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
643 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
644 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
645 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
648 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
651 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
652 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
654 __m128i ifour = _mm_set1_epi32(4);
655 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
657 __m128d dummy_mask,cutoff_mask;
658 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
659 __m128d one = _mm_set1_pd(1.0);
660 __m128d two = _mm_set1_pd(2.0);
666 jindex = nlist->jindex;
668 shiftidx = nlist->shift;
670 shiftvec = fr->shift_vec[0];
671 fshift = fr->fshift[0];
672 facel = _mm_set1_pd(fr->ic->epsfac);
673 charge = mdatoms->chargeA;
674 nvdwtype = fr->ntype;
676 vdwtype = mdatoms->typeA;
678 vftab = kernel_data->table_elec->data;
679 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
681 /* Setup water-specific parameters */
/* The three i-site charges are taken from the first i entry of the list and pre-multiplied by facel once, outside the loops. */
682 inr = nlist->iinr[0];
683 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
684 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
685 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
/* Offset of i site 0 into vdwparam; no such offset exists for sites 1 and 2 because they get no Lennard-Jones term here. */
686 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
688 /* Avoid stupid compiler warnings */
696 /* Start outer loop over neighborlists */
697 for(iidx=0; iidx<nri; iidx++)
699 /* Load shift vector for this list */
700 i_shift_offset = DIM*shiftidx[iidx];
702 /* Load limits for loop over neighbors */
703 j_index_start = jindex[iidx];
704 j_index_end = jindex[iidx+1];
706 /* Get outer coordinate index */
708 i_coord_offset = DIM*inr;
710 /* Load i particle coords and add shift vector */
711 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
712 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Zero the per-molecule i-force accumulators for the three sites. */
714 fix0 = _mm_setzero_pd();
715 fiy0 = _mm_setzero_pd();
716 fiz0 = _mm_setzero_pd();
717 fix1 = _mm_setzero_pd();
718 fiy1 = _mm_setzero_pd();
719 fiz1 = _mm_setzero_pd();
720 fix2 = _mm_setzero_pd();
721 fiy2 = _mm_setzero_pd();
722 fiz2 = _mm_setzero_pd();
724 /* Start inner kernel loop */
/* Two j entries (lanes A and B) per iteration; the bound j_index_end-1 leaves an odd final entry unprocessed by this loop. */
725 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
728 /* Get j neighbor index, and coordinate index */
731 j_coord_offsetA = DIM*jnrA;
732 j_coord_offsetB = DIM*jnrB;
734 /* load j atom coordinates */
735 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
738 /* Calculate displacement vector */
739 dx00 = _mm_sub_pd(ix0,jx0);
740 dy00 = _mm_sub_pd(iy0,jy0);
741 dz00 = _mm_sub_pd(iz0,jz0);
742 dx10 = _mm_sub_pd(ix1,jx0);
743 dy10 = _mm_sub_pd(iy1,jy0);
744 dz10 = _mm_sub_pd(iz1,jz0);
745 dx20 = _mm_sub_pd(ix2,jx0);
746 dy20 = _mm_sub_pd(iy2,jy0);
747 dz20 = _mm_sub_pd(iz2,jz0);
749 /* Calculate squared distance and things based on it */
750 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
751 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
752 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
754 rinv00 = avx128fma_invsqrt_d(rsq00);
755 rinv10 = avx128fma_invsqrt_d(rsq10);
756 rinv20 = avx128fma_invsqrt_d(rsq20);
/* Only the 0-0 pair needs the squared inverse distance; it feeds the Lennard-Jones force below. */
758 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
760 /* Load parameters for j particles */
761 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
762 vdwjidx0A = 2*vdwtype[jnrA+0];
763 vdwjidx0B = 2*vdwtype[jnrB+0];
765 fjx0 = _mm_setzero_pd();
766 fjy0 = _mm_setzero_pd();
767 fjz0 = _mm_setzero_pd();
769 /**************************
770 * CALCULATE INTERACTIONS *
771 **************************/
/* Pair i site 0 - j: tabulated electrostatics plus Lennard-Jones. r is recovered as rsq * rinv. */
773 r00 = _mm_mul_pd(rsq00,rinv00);
775 /* Compute parameters for interactions between i and j atoms */
776 qq00 = _mm_mul_pd(iq0,jq0);
777 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
778 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
780 /* Calculate table index by multiplying r with table scale and truncate to integer */
781 rt = _mm_mul_pd(r00,vftabscale);
782 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): vfeps is assigned twice in a row; presumably these are the two branches of a preprocessor conditional that is not visible in this view -- confirm. */
784 vfeps = _mm_frcz_pd(rt);
786 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
788 twovfeps = _mm_add_pd(vfeps,vfeps);
/* Shift left by 2, i.e. multiply the table index by 4. */
789 vfitab = _mm_slli_epi32(vfitab,2);
791 /* CUBIC SPLINE TABLE ELECTROSTATICS */
/* Two doubles are loaded at each lane's table offset and two more at offset+2; presumably the transposes regroup them so Y,F,G,H each hold one quantity for both lanes -- confirm against GMX_MM_TRANSPOSE2_PD. */
792 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
793 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
794 GMX_MM_TRANSPOSE2_PD(Y,F);
795 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
796 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
797 GMX_MM_TRANSPOSE2_PD(G,H);
/* Fp = F + eps*(G + eps*H); FF = Fp + eps*(G + 2*eps*H). The potential value is not evaluated in this force-only kernel. */
798 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
799 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
/* felec = -(qq * FF * tabscale * rinv); the xor with signbit performs the negation. */
800 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
802 /* LENNARD-JONES DISPERSION/REPULSION */
/* fvdw = (c12*rinvsix - c6) * rinvsix * rinvsq. */
804 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
805 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
807 fscal = _mm_add_pd(felec,fvdw);
809 /* Update vectorial force */
810 fix0 = _mm_macc_pd(dx00,fscal,fix0);
811 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
812 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
/* The same d*fscal contribution is accumulated for the j particle and handed to the decrement helper at the end of the iteration. */
814 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
815 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
816 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
818 /**************************
819 * CALCULATE INTERACTIONS *
820 **************************/
/* Pair i site 1 - j: tabulated electrostatics only, same spline evaluation as for the 0-0 pair. */
822 r10 = _mm_mul_pd(rsq10,rinv10);
824 /* Compute parameters for interactions between i and j atoms */
825 qq10 = _mm_mul_pd(iq1,jq0);
827 /* Calculate table index by multiplying r with table scale and truncate to integer */
828 rt = _mm_mul_pd(r10,vftabscale);
829 vfitab = _mm_cvttpd_epi32(rt);
831 vfeps = _mm_frcz_pd(rt);
833 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
835 twovfeps = _mm_add_pd(vfeps,vfeps);
836 vfitab = _mm_slli_epi32(vfitab,2);
838 /* CUBIC SPLINE TABLE ELECTROSTATICS */
839 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
840 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
841 GMX_MM_TRANSPOSE2_PD(Y,F);
842 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
843 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
844 GMX_MM_TRANSPOSE2_PD(G,H);
845 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
846 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
847 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
/* NOTE(review): no assignment of fscal from this pair's felec is visible before the force update below; a line may be missing from this view -- confirm. */
851 /* Update vectorial force */
852 fix1 = _mm_macc_pd(dx10,fscal,fix1);
853 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
854 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
856 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
857 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
858 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
860 /**************************
861 * CALCULATE INTERACTIONS *
862 **************************/
/* Pair i site 2 - j: tabulated electrostatics only. */
864 r20 = _mm_mul_pd(rsq20,rinv20);
866 /* Compute parameters for interactions between i and j atoms */
867 qq20 = _mm_mul_pd(iq2,jq0);
869 /* Calculate table index by multiplying r with table scale and truncate to integer */
870 rt = _mm_mul_pd(r20,vftabscale);
871 vfitab = _mm_cvttpd_epi32(rt);
873 vfeps = _mm_frcz_pd(rt);
875 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
877 twovfeps = _mm_add_pd(vfeps,vfeps);
878 vfitab = _mm_slli_epi32(vfitab,2);
880 /* CUBIC SPLINE TABLE ELECTROSTATICS */
881 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
882 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
883 GMX_MM_TRANSPOSE2_PD(Y,F);
884 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
885 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
886 GMX_MM_TRANSPOSE2_PD(G,H);
887 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
888 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
889 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
/* NOTE(review): as for the 1-0 pair, fscal is not visibly refreshed from felec before it is used here -- confirm. */
893 /* Update vectorial force */
894 fix2 = _mm_macc_pd(dx20,fscal,fix2);
895 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
896 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
898 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
899 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
900 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
/* Apply the accumulated j contribution to the force entries of both j particles. */
902 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
904 /* Inner loop uses 137 flops */
/* Single-entry variant of the pair code above: one j particle (A) whose data sits in the low lane. NOTE(review): the condition guarding this section and the assignment of jnrA are not visible in this view -- confirm. */
911 j_coord_offsetA = DIM*jnrA;
913 /* load j atom coordinates */
914 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
917 /* Calculate displacement vector */
918 dx00 = _mm_sub_pd(ix0,jx0);
919 dy00 = _mm_sub_pd(iy0,jy0);
920 dz00 = _mm_sub_pd(iz0,jz0);
921 dx10 = _mm_sub_pd(ix1,jx0);
922 dy10 = _mm_sub_pd(iy1,jy0);
923 dz10 = _mm_sub_pd(iz1,jz0);
924 dx20 = _mm_sub_pd(ix2,jx0);
925 dy20 = _mm_sub_pd(iy2,jy0);
926 dz20 = _mm_sub_pd(iz2,jz0);
928 /* Calculate squared distance and things based on it */
929 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
930 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
931 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
933 rinv00 = avx128fma_invsqrt_d(rsq00);
934 rinv10 = avx128fma_invsqrt_d(rsq10);
935 rinv20 = avx128fma_invsqrt_d(rsq20);
937 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
939 /* Load parameters for j particles */
940 jq0 = _mm_load_sd(charge+jnrA+0);
941 vdwjidx0A = 2*vdwtype[jnrA+0];
943 fjx0 = _mm_setzero_pd();
944 fjy0 = _mm_setzero_pd();
945 fjz0 = _mm_setzero_pd();
947 /**************************
948 * CALCULATE INTERACTIONS *
949 **************************/
951 r00 = _mm_mul_pd(rsq00,rinv00);
953 /* Compute parameters for interactions between i and j atoms */
954 qq00 = _mm_mul_pd(iq0,jq0);
955 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
957 /* Calculate table index by multiplying r with table scale and truncate to integer */
958 rt = _mm_mul_pd(r00,vftabscale);
959 vfitab = _mm_cvttpd_epi32(rt);
961 vfeps = _mm_frcz_pd(rt);
963 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
965 twovfeps = _mm_add_pd(vfeps,vfeps);
966 vfitab = _mm_slli_epi32(vfitab,2);
968 /* CUBIC SPLINE TABLE ELECTROSTATICS */
/* Only table index 0 is loaded; zeros stand in for the second lane's table data. */
969 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
970 F = _mm_setzero_pd();
971 GMX_MM_TRANSPOSE2_PD(Y,F);
972 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
973 H = _mm_setzero_pd();
974 GMX_MM_TRANSPOSE2_PD(G,H);
975 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
976 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
977 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
979 /* LENNARD-JONES DISPERSION/REPULSION */
981 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
982 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
984 fscal = _mm_add_pd(felec,fvdw);
/* unpacklo with zero keeps the low lane and clears the upper lane, so the empty lane contributes no force. */
986 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
988 /* Update vectorial force */
989 fix0 = _mm_macc_pd(dx00,fscal,fix0);
990 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
991 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
993 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
994 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
995 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
997 /**************************
998 * CALCULATE INTERACTIONS *
999 **************************/
1001 r10 = _mm_mul_pd(rsq10,rinv10);
1003 /* Compute parameters for interactions between i and j atoms */
1004 qq10 = _mm_mul_pd(iq1,jq0);
1006 /* Calculate table index by multiplying r with table scale and truncate to integer */
1007 rt = _mm_mul_pd(r10,vftabscale);
1008 vfitab = _mm_cvttpd_epi32(rt);
1010 vfeps = _mm_frcz_pd(rt);
1012 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1014 twovfeps = _mm_add_pd(vfeps,vfeps);
1015 vfitab = _mm_slli_epi32(vfitab,2);
1017 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1018 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1019 F = _mm_setzero_pd();
1020 GMX_MM_TRANSPOSE2_PD(Y,F);
1021 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1022 H = _mm_setzero_pd();
1023 GMX_MM_TRANSPOSE2_PD(G,H);
1024 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1025 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1026 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
/* NOTE(review): fscal is masked here without a visible assignment from this pair's felec; a line may be missing from this view -- confirm. */
1030 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1032 /* Update vectorial force */
1033 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1034 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1035 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1037 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1038 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1039 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1041 /**************************
1042 * CALCULATE INTERACTIONS *
1043 **************************/
1045 r20 = _mm_mul_pd(rsq20,rinv20);
1047 /* Compute parameters for interactions between i and j atoms */
1048 qq20 = _mm_mul_pd(iq2,jq0);
1050 /* Calculate table index by multiplying r with table scale and truncate to integer */
1051 rt = _mm_mul_pd(r20,vftabscale);
1052 vfitab = _mm_cvttpd_epi32(rt);
1054 vfeps = _mm_frcz_pd(rt);
1056 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1058 twovfeps = _mm_add_pd(vfeps,vfeps);
1059 vfitab = _mm_slli_epi32(vfitab,2);
1061 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1062 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1063 F = _mm_setzero_pd();
1064 GMX_MM_TRANSPOSE2_PD(Y,F);
1065 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1066 H = _mm_setzero_pd();
1067 GMX_MM_TRANSPOSE2_PD(G,H);
1068 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1069 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1070 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1074 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1076 /* Update vectorial force */
1077 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1078 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1079 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1081 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1082 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1083 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1085 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1087 /* Inner loop uses 137 flops */
1090 /* End of innermost loop */
/* Hand the accumulated i-site forces to the helper together with the force and shift-force destinations for this i entry. */
1092 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1093 f+i_coord_offset,fshift+i_shift_offset);
1095 /* Increment number of inner iterations */
1096 inneriter += j_index_end - j_index_start;
1098 /* Outer loop uses 18 flops */
1101 /* Increment number of outer iterations */
1104 /* Update outer/inner flops */
/* Flop accounting uses the per-iteration costs quoted above: 18 per outer and 137 per inner iteration. */
1106 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*137);