/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
 */
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
 * Electrostatics interaction: Ewald
 * VdW interaction:            CubicSplineTable
 * Geometry:                   Water3-Particle
 * Calculate force/pot:        PotentialAndForce
 */
57 nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
91 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
92 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
93 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
100 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
102 __m128i ifour = _mm_set1_epi32(4);
103 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
106 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
109 __m128 dummy_mask,cutoff_mask;
110 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111 __m128 one = _mm_set1_ps(1.0);
112 __m128 two = _mm_set1_ps(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm_set1_ps(fr->ic->epsfac);
125 charge = mdatoms->chargeA;
126 nvdwtype = fr->ntype;
128 vdwtype = mdatoms->typeA;
130 vftab = kernel_data->table_vdw->data;
131 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
133 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
134 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
135 beta2 = _mm_mul_ps(beta,beta);
136 beta3 = _mm_mul_ps(beta,beta2);
137 ewtab = fr->ic->tabq_coul_FDV0;
138 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
139 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
144 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
145 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
148 /* Avoid stupid compiler warnings */
149 jnrA = jnrB = jnrC = jnrD = 0;
158 for(iidx=0;iidx<4*DIM;iidx++)
163 /* Start outer loop over neighborlists */
164 for(iidx=0; iidx<nri; iidx++)
166 /* Load shift vector for this list */
167 i_shift_offset = DIM*shiftidx[iidx];
169 /* Load limits for loop over neighbors */
170 j_index_start = jindex[iidx];
171 j_index_end = jindex[iidx+1];
173 /* Get outer coordinate index */
175 i_coord_offset = DIM*inr;
177 /* Load i particle coords and add shift vector */
178 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
179 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
181 fix0 = _mm_setzero_ps();
182 fiy0 = _mm_setzero_ps();
183 fiz0 = _mm_setzero_ps();
184 fix1 = _mm_setzero_ps();
185 fiy1 = _mm_setzero_ps();
186 fiz1 = _mm_setzero_ps();
187 fix2 = _mm_setzero_ps();
188 fiy2 = _mm_setzero_ps();
189 fiz2 = _mm_setzero_ps();
191 /* Reset potential sums */
192 velecsum = _mm_setzero_ps();
193 vvdwsum = _mm_setzero_ps();
195 /* Start inner kernel loop */
196 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
199 /* Get j neighbor index, and coordinate index */
204 j_coord_offsetA = DIM*jnrA;
205 j_coord_offsetB = DIM*jnrB;
206 j_coord_offsetC = DIM*jnrC;
207 j_coord_offsetD = DIM*jnrD;
209 /* load j atom coordinates */
210 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
211 x+j_coord_offsetC,x+j_coord_offsetD,
214 /* Calculate displacement vector */
215 dx00 = _mm_sub_ps(ix0,jx0);
216 dy00 = _mm_sub_ps(iy0,jy0);
217 dz00 = _mm_sub_ps(iz0,jz0);
218 dx10 = _mm_sub_ps(ix1,jx0);
219 dy10 = _mm_sub_ps(iy1,jy0);
220 dz10 = _mm_sub_ps(iz1,jz0);
221 dx20 = _mm_sub_ps(ix2,jx0);
222 dy20 = _mm_sub_ps(iy2,jy0);
223 dz20 = _mm_sub_ps(iz2,jz0);
225 /* Calculate squared distance and things based on it */
226 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
227 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
228 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
230 rinv00 = avx128fma_invsqrt_f(rsq00);
231 rinv10 = avx128fma_invsqrt_f(rsq10);
232 rinv20 = avx128fma_invsqrt_f(rsq20);
234 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
235 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
236 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
238 /* Load parameters for j particles */
239 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
240 charge+jnrC+0,charge+jnrD+0);
241 vdwjidx0A = 2*vdwtype[jnrA+0];
242 vdwjidx0B = 2*vdwtype[jnrB+0];
243 vdwjidx0C = 2*vdwtype[jnrC+0];
244 vdwjidx0D = 2*vdwtype[jnrD+0];
246 fjx0 = _mm_setzero_ps();
247 fjy0 = _mm_setzero_ps();
248 fjz0 = _mm_setzero_ps();
250 /**************************
251 * CALCULATE INTERACTIONS *
252 **************************/
254 r00 = _mm_mul_ps(rsq00,rinv00);
256 /* Compute parameters for interactions between i and j atoms */
257 qq00 = _mm_mul_ps(iq0,jq0);
258 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
259 vdwparam+vdwioffset0+vdwjidx0B,
260 vdwparam+vdwioffset0+vdwjidx0C,
261 vdwparam+vdwioffset0+vdwjidx0D,
264 /* Calculate table index by multiplying r with table scale and truncate to integer */
265 rt = _mm_mul_ps(r00,vftabscale);
266 vfitab = _mm_cvttps_epi32(rt);
268 vfeps = _mm_frcz_ps(rt);
270 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
272 twovfeps = _mm_add_ps(vfeps,vfeps);
273 vfitab = _mm_slli_epi32(vfitab,3);
275 /* EWALD ELECTROSTATICS */
277 /* Analytical PME correction */
278 zeta2 = _mm_mul_ps(beta2,rsq00);
279 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
280 pmecorrF = avx128fma_pmecorrF_f(zeta2);
281 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
282 felec = _mm_mul_ps(qq00,felec);
283 pmecorrV = avx128fma_pmecorrV_f(zeta2);
284 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
285 velec = _mm_mul_ps(qq00,velec);
287 /* CUBIC SPLINE TABLE DISPERSION */
288 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
289 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
290 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
291 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
292 _MM_TRANSPOSE4_PS(Y,F,G,H);
293 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
294 VV = _mm_macc_ps(vfeps,Fp,Y);
295 vvdw6 = _mm_mul_ps(c6_00,VV);
296 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
297 fvdw6 = _mm_mul_ps(c6_00,FF);
299 /* CUBIC SPLINE TABLE REPULSION */
300 vfitab = _mm_add_epi32(vfitab,ifour);
301 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
302 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
303 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
304 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
305 _MM_TRANSPOSE4_PS(Y,F,G,H);
306 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
307 VV = _mm_macc_ps(vfeps,Fp,Y);
308 vvdw12 = _mm_mul_ps(c12_00,VV);
309 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
310 fvdw12 = _mm_mul_ps(c12_00,FF);
311 vvdw = _mm_add_ps(vvdw12,vvdw6);
312 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
314 /* Update potential sum for this i atom from the interaction with this j atom. */
315 velecsum = _mm_add_ps(velecsum,velec);
316 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
318 fscal = _mm_add_ps(felec,fvdw);
320 /* Update vectorial force */
321 fix0 = _mm_macc_ps(dx00,fscal,fix0);
322 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
323 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
325 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
326 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
327 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
329 /**************************
330 * CALCULATE INTERACTIONS *
331 **************************/
333 r10 = _mm_mul_ps(rsq10,rinv10);
335 /* Compute parameters for interactions between i and j atoms */
336 qq10 = _mm_mul_ps(iq1,jq0);
338 /* EWALD ELECTROSTATICS */
340 /* Analytical PME correction */
341 zeta2 = _mm_mul_ps(beta2,rsq10);
342 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
343 pmecorrF = avx128fma_pmecorrF_f(zeta2);
344 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
345 felec = _mm_mul_ps(qq10,felec);
346 pmecorrV = avx128fma_pmecorrV_f(zeta2);
347 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
348 velec = _mm_mul_ps(qq10,velec);
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 velecsum = _mm_add_ps(velecsum,velec);
355 /* Update vectorial force */
356 fix1 = _mm_macc_ps(dx10,fscal,fix1);
357 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
358 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
360 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
361 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
362 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
368 r20 = _mm_mul_ps(rsq20,rinv20);
370 /* Compute parameters for interactions between i and j atoms */
371 qq20 = _mm_mul_ps(iq2,jq0);
373 /* EWALD ELECTROSTATICS */
375 /* Analytical PME correction */
376 zeta2 = _mm_mul_ps(beta2,rsq20);
377 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
378 pmecorrF = avx128fma_pmecorrF_f(zeta2);
379 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
380 felec = _mm_mul_ps(qq20,felec);
381 pmecorrV = avx128fma_pmecorrV_f(zeta2);
382 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
383 velec = _mm_mul_ps(qq20,velec);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velecsum = _mm_add_ps(velecsum,velec);
390 /* Update vectorial force */
391 fix2 = _mm_macc_ps(dx20,fscal,fix2);
392 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
393 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
395 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
396 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
397 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
399 fjptrA = f+j_coord_offsetA;
400 fjptrB = f+j_coord_offsetB;
401 fjptrC = f+j_coord_offsetC;
402 fjptrD = f+j_coord_offsetD;
404 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
406 /* Inner loop uses 121 flops */
412 /* Get j neighbor index, and coordinate index */
413 jnrlistA = jjnr[jidx];
414 jnrlistB = jjnr[jidx+1];
415 jnrlistC = jjnr[jidx+2];
416 jnrlistD = jjnr[jidx+3];
417 /* Sign of each element will be negative for non-real atoms.
418 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
419 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
421 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
422 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
423 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
424 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
425 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
426 j_coord_offsetA = DIM*jnrA;
427 j_coord_offsetB = DIM*jnrB;
428 j_coord_offsetC = DIM*jnrC;
429 j_coord_offsetD = DIM*jnrD;
431 /* load j atom coordinates */
432 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
433 x+j_coord_offsetC,x+j_coord_offsetD,
436 /* Calculate displacement vector */
437 dx00 = _mm_sub_ps(ix0,jx0);
438 dy00 = _mm_sub_ps(iy0,jy0);
439 dz00 = _mm_sub_ps(iz0,jz0);
440 dx10 = _mm_sub_ps(ix1,jx0);
441 dy10 = _mm_sub_ps(iy1,jy0);
442 dz10 = _mm_sub_ps(iz1,jz0);
443 dx20 = _mm_sub_ps(ix2,jx0);
444 dy20 = _mm_sub_ps(iy2,jy0);
445 dz20 = _mm_sub_ps(iz2,jz0);
447 /* Calculate squared distance and things based on it */
448 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
449 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
450 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
452 rinv00 = avx128fma_invsqrt_f(rsq00);
453 rinv10 = avx128fma_invsqrt_f(rsq10);
454 rinv20 = avx128fma_invsqrt_f(rsq20);
456 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
457 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
458 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
460 /* Load parameters for j particles */
461 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
462 charge+jnrC+0,charge+jnrD+0);
463 vdwjidx0A = 2*vdwtype[jnrA+0];
464 vdwjidx0B = 2*vdwtype[jnrB+0];
465 vdwjidx0C = 2*vdwtype[jnrC+0];
466 vdwjidx0D = 2*vdwtype[jnrD+0];
468 fjx0 = _mm_setzero_ps();
469 fjy0 = _mm_setzero_ps();
470 fjz0 = _mm_setzero_ps();
472 /**************************
473 * CALCULATE INTERACTIONS *
474 **************************/
476 r00 = _mm_mul_ps(rsq00,rinv00);
477 r00 = _mm_andnot_ps(dummy_mask,r00);
479 /* Compute parameters for interactions between i and j atoms */
480 qq00 = _mm_mul_ps(iq0,jq0);
481 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
482 vdwparam+vdwioffset0+vdwjidx0B,
483 vdwparam+vdwioffset0+vdwjidx0C,
484 vdwparam+vdwioffset0+vdwjidx0D,
487 /* Calculate table index by multiplying r with table scale and truncate to integer */
488 rt = _mm_mul_ps(r00,vftabscale);
489 vfitab = _mm_cvttps_epi32(rt);
491 vfeps = _mm_frcz_ps(rt);
493 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
495 twovfeps = _mm_add_ps(vfeps,vfeps);
496 vfitab = _mm_slli_epi32(vfitab,3);
498 /* EWALD ELECTROSTATICS */
500 /* Analytical PME correction */
501 zeta2 = _mm_mul_ps(beta2,rsq00);
502 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
503 pmecorrF = avx128fma_pmecorrF_f(zeta2);
504 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
505 felec = _mm_mul_ps(qq00,felec);
506 pmecorrV = avx128fma_pmecorrV_f(zeta2);
507 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
508 velec = _mm_mul_ps(qq00,velec);
510 /* CUBIC SPLINE TABLE DISPERSION */
511 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
512 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
513 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
514 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
515 _MM_TRANSPOSE4_PS(Y,F,G,H);
516 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
517 VV = _mm_macc_ps(vfeps,Fp,Y);
518 vvdw6 = _mm_mul_ps(c6_00,VV);
519 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
520 fvdw6 = _mm_mul_ps(c6_00,FF);
522 /* CUBIC SPLINE TABLE REPULSION */
523 vfitab = _mm_add_epi32(vfitab,ifour);
524 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
525 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
526 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
527 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
528 _MM_TRANSPOSE4_PS(Y,F,G,H);
529 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
530 VV = _mm_macc_ps(vfeps,Fp,Y);
531 vvdw12 = _mm_mul_ps(c12_00,VV);
532 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
533 fvdw12 = _mm_mul_ps(c12_00,FF);
534 vvdw = _mm_add_ps(vvdw12,vvdw6);
535 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velec = _mm_andnot_ps(dummy_mask,velec);
539 velecsum = _mm_add_ps(velecsum,velec);
540 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
541 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
543 fscal = _mm_add_ps(felec,fvdw);
545 fscal = _mm_andnot_ps(dummy_mask,fscal);
547 /* Update vectorial force */
548 fix0 = _mm_macc_ps(dx00,fscal,fix0);
549 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
550 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
552 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
553 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
554 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 r10 = _mm_mul_ps(rsq10,rinv10);
561 r10 = _mm_andnot_ps(dummy_mask,r10);
563 /* Compute parameters for interactions between i and j atoms */
564 qq10 = _mm_mul_ps(iq1,jq0);
566 /* EWALD ELECTROSTATICS */
568 /* Analytical PME correction */
569 zeta2 = _mm_mul_ps(beta2,rsq10);
570 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
571 pmecorrF = avx128fma_pmecorrF_f(zeta2);
572 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
573 felec = _mm_mul_ps(qq10,felec);
574 pmecorrV = avx128fma_pmecorrV_f(zeta2);
575 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
576 velec = _mm_mul_ps(qq10,velec);
578 /* Update potential sum for this i atom from the interaction with this j atom. */
579 velec = _mm_andnot_ps(dummy_mask,velec);
580 velecsum = _mm_add_ps(velecsum,velec);
584 fscal = _mm_andnot_ps(dummy_mask,fscal);
586 /* Update vectorial force */
587 fix1 = _mm_macc_ps(dx10,fscal,fix1);
588 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
589 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
591 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
592 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
593 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
595 /**************************
596 * CALCULATE INTERACTIONS *
597 **************************/
599 r20 = _mm_mul_ps(rsq20,rinv20);
600 r20 = _mm_andnot_ps(dummy_mask,r20);
602 /* Compute parameters for interactions between i and j atoms */
603 qq20 = _mm_mul_ps(iq2,jq0);
605 /* EWALD ELECTROSTATICS */
607 /* Analytical PME correction */
608 zeta2 = _mm_mul_ps(beta2,rsq20);
609 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
610 pmecorrF = avx128fma_pmecorrF_f(zeta2);
611 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
612 felec = _mm_mul_ps(qq20,felec);
613 pmecorrV = avx128fma_pmecorrV_f(zeta2);
614 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
615 velec = _mm_mul_ps(qq20,velec);
617 /* Update potential sum for this i atom from the interaction with this j atom. */
618 velec = _mm_andnot_ps(dummy_mask,velec);
619 velecsum = _mm_add_ps(velecsum,velec);
623 fscal = _mm_andnot_ps(dummy_mask,fscal);
625 /* Update vectorial force */
626 fix2 = _mm_macc_ps(dx20,fscal,fix2);
627 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
628 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
630 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
631 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
632 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
634 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
635 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
636 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
637 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
639 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
641 /* Inner loop uses 124 flops */
644 /* End of innermost loop */
646 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
647 f+i_coord_offset,fshift+i_shift_offset);
650 /* Update potential energies */
651 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
652 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
654 /* Increment number of inner iterations */
655 inneriter += j_index_end - j_index_start;
657 /* Outer loop uses 20 flops */
660 /* Increment number of outer iterations */
663 /* Update outer/inner flops */
665 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*124);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
 * Electrostatics interaction: Ewald
 * VdW interaction:            CubicSplineTable
 * Geometry:                   Water3-Particle
 * Calculate force/pot:        Force
 */
675 nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_128_fma_single
676 (t_nblist * gmx_restrict nlist,
677 rvec * gmx_restrict xx,
678 rvec * gmx_restrict ff,
679 struct t_forcerec * gmx_restrict fr,
680 t_mdatoms * gmx_restrict mdatoms,
681 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
682 t_nrnb * gmx_restrict nrnb)
684 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
685 * just 0 for non-waters.
686 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
687 * jnr indices corresponding to data put in the four positions in the SIMD register.
689 int i_shift_offset,i_coord_offset,outeriter,inneriter;
690 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
691 int jnrA,jnrB,jnrC,jnrD;
692 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
693 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
694 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
696 real *shiftvec,*fshift,*x,*f;
697 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
699 __m128 fscal,rcutoff,rcutoff2,jidxall;
701 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
703 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
705 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
706 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
707 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
708 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
709 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
710 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
711 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
714 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
717 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
718 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
720 __m128i ifour = _mm_set1_epi32(4);
721 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
724 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
725 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
727 __m128 dummy_mask,cutoff_mask;
728 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
729 __m128 one = _mm_set1_ps(1.0);
730 __m128 two = _mm_set1_ps(2.0);
736 jindex = nlist->jindex;
738 shiftidx = nlist->shift;
740 shiftvec = fr->shift_vec[0];
741 fshift = fr->fshift[0];
742 facel = _mm_set1_ps(fr->ic->epsfac);
743 charge = mdatoms->chargeA;
744 nvdwtype = fr->ntype;
746 vdwtype = mdatoms->typeA;
748 vftab = kernel_data->table_vdw->data;
749 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
751 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
752 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
753 beta2 = _mm_mul_ps(beta,beta);
754 beta3 = _mm_mul_ps(beta,beta2);
755 ewtab = fr->ic->tabq_coul_F;
756 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
757 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
759 /* Setup water-specific parameters */
760 inr = nlist->iinr[0];
761 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
762 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
763 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
764 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
766 /* Avoid stupid compiler warnings */
767 jnrA = jnrB = jnrC = jnrD = 0;
776 for(iidx=0;iidx<4*DIM;iidx++)
781 /* Start outer loop over neighborlists */
782 for(iidx=0; iidx<nri; iidx++)
784 /* Load shift vector for this list */
785 i_shift_offset = DIM*shiftidx[iidx];
787 /* Load limits for loop over neighbors */
788 j_index_start = jindex[iidx];
789 j_index_end = jindex[iidx+1];
791 /* Get outer coordinate index */
793 i_coord_offset = DIM*inr;
795 /* Load i particle coords and add shift vector */
796 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
797 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
799 fix0 = _mm_setzero_ps();
800 fiy0 = _mm_setzero_ps();
801 fiz0 = _mm_setzero_ps();
802 fix1 = _mm_setzero_ps();
803 fiy1 = _mm_setzero_ps();
804 fiz1 = _mm_setzero_ps();
805 fix2 = _mm_setzero_ps();
806 fiy2 = _mm_setzero_ps();
807 fiz2 = _mm_setzero_ps();
809 /* Start inner kernel loop */
810 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
813 /* Get j neighbor index, and coordinate index */
818 j_coord_offsetA = DIM*jnrA;
819 j_coord_offsetB = DIM*jnrB;
820 j_coord_offsetC = DIM*jnrC;
821 j_coord_offsetD = DIM*jnrD;
823 /* load j atom coordinates */
824 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
825 x+j_coord_offsetC,x+j_coord_offsetD,
828 /* Calculate displacement vector */
829 dx00 = _mm_sub_ps(ix0,jx0);
830 dy00 = _mm_sub_ps(iy0,jy0);
831 dz00 = _mm_sub_ps(iz0,jz0);
832 dx10 = _mm_sub_ps(ix1,jx0);
833 dy10 = _mm_sub_ps(iy1,jy0);
834 dz10 = _mm_sub_ps(iz1,jz0);
835 dx20 = _mm_sub_ps(ix2,jx0);
836 dy20 = _mm_sub_ps(iy2,jy0);
837 dz20 = _mm_sub_ps(iz2,jz0);
839 /* Calculate squared distance and things based on it */
840 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
841 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
842 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
844 rinv00 = avx128fma_invsqrt_f(rsq00);
845 rinv10 = avx128fma_invsqrt_f(rsq10);
846 rinv20 = avx128fma_invsqrt_f(rsq20);
848 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
849 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
850 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
852 /* Load parameters for j particles */
853 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
854 charge+jnrC+0,charge+jnrD+0);
855 vdwjidx0A = 2*vdwtype[jnrA+0];
856 vdwjidx0B = 2*vdwtype[jnrB+0];
857 vdwjidx0C = 2*vdwtype[jnrC+0];
858 vdwjidx0D = 2*vdwtype[jnrD+0];
860 fjx0 = _mm_setzero_ps();
861 fjy0 = _mm_setzero_ps();
862 fjz0 = _mm_setzero_ps();
864 /**************************
865 * CALCULATE INTERACTIONS *
866 **************************/
868 r00 = _mm_mul_ps(rsq00,rinv00);
870 /* Compute parameters for interactions between i and j atoms */
871 qq00 = _mm_mul_ps(iq0,jq0);
872 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
873 vdwparam+vdwioffset0+vdwjidx0B,
874 vdwparam+vdwioffset0+vdwjidx0C,
875 vdwparam+vdwioffset0+vdwjidx0D,
878 /* Calculate table index by multiplying r with table scale and truncate to integer */
879 rt = _mm_mul_ps(r00,vftabscale);
880 vfitab = _mm_cvttps_epi32(rt);
882 vfeps = _mm_frcz_ps(rt);
884 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
886 twovfeps = _mm_add_ps(vfeps,vfeps);
887 vfitab = _mm_slli_epi32(vfitab,3);
889 /* EWALD ELECTROSTATICS */
891 /* Analytical PME correction */
892 zeta2 = _mm_mul_ps(beta2,rsq00);
893 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
894 pmecorrF = avx128fma_pmecorrF_f(zeta2);
895 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
896 felec = _mm_mul_ps(qq00,felec);
898 /* CUBIC SPLINE TABLE DISPERSION */
899 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
900 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
901 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
902 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
903 _MM_TRANSPOSE4_PS(Y,F,G,H);
904 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
905 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
906 fvdw6 = _mm_mul_ps(c6_00,FF);
908 /* CUBIC SPLINE TABLE REPULSION */
909 vfitab = _mm_add_epi32(vfitab,ifour);
910 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
911 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
912 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
913 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
914 _MM_TRANSPOSE4_PS(Y,F,G,H);
915 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
916 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
917 fvdw12 = _mm_mul_ps(c12_00,FF);
918 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
920 fscal = _mm_add_ps(felec,fvdw);
922 /* Update vectorial force */
923 fix0 = _mm_macc_ps(dx00,fscal,fix0);
924 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
925 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
927 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
928 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
929 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
931 /**************************
932 * CALCULATE INTERACTIONS *
933 **************************/
935 r10 = _mm_mul_ps(rsq10,rinv10);
937 /* Compute parameters for interactions between i and j atoms */
938 qq10 = _mm_mul_ps(iq1,jq0);
940 /* EWALD ELECTROSTATICS */
942 /* Analytical PME correction */
943 zeta2 = _mm_mul_ps(beta2,rsq10);
944 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
945 pmecorrF = avx128fma_pmecorrF_f(zeta2);
946 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
947 felec = _mm_mul_ps(qq10,felec);
951 /* Update vectorial force */
952 fix1 = _mm_macc_ps(dx10,fscal,fix1);
953 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
954 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
956 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
957 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
958 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
960 /**************************
961 * CALCULATE INTERACTIONS *
962 **************************/
964 r20 = _mm_mul_ps(rsq20,rinv20);
966 /* Compute parameters for interactions between i and j atoms */
967 qq20 = _mm_mul_ps(iq2,jq0);
969 /* EWALD ELECTROSTATICS */
971 /* Analytical PME correction */
972 zeta2 = _mm_mul_ps(beta2,rsq20);
973 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
974 pmecorrF = avx128fma_pmecorrF_f(zeta2);
975 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
976 felec = _mm_mul_ps(qq20,felec);
980 /* Update vectorial force */
981 fix2 = _mm_macc_ps(dx20,fscal,fix2);
982 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
983 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
985 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
986 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
987 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
989 fjptrA = f+j_coord_offsetA;
990 fjptrB = f+j_coord_offsetB;
991 fjptrC = f+j_coord_offsetC;
992 fjptrD = f+j_coord_offsetD;
994 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
996 /* Inner loop uses 110 flops */
1002 /* Get j neighbor index, and coordinate index */
1003 jnrlistA = jjnr[jidx];
1004 jnrlistB = jjnr[jidx+1];
1005 jnrlistC = jjnr[jidx+2];
1006 jnrlistD = jjnr[jidx+3];
1007 /* Sign of each element will be negative for non-real atoms.
1008 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1009 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1011 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1012 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1013 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1014 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1015 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1016 j_coord_offsetA = DIM*jnrA;
1017 j_coord_offsetB = DIM*jnrB;
1018 j_coord_offsetC = DIM*jnrC;
1019 j_coord_offsetD = DIM*jnrD;
1021 /* load j atom coordinates */
1022 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1023 x+j_coord_offsetC,x+j_coord_offsetD,
1026 /* Calculate displacement vector */
1027 dx00 = _mm_sub_ps(ix0,jx0);
1028 dy00 = _mm_sub_ps(iy0,jy0);
1029 dz00 = _mm_sub_ps(iz0,jz0);
1030 dx10 = _mm_sub_ps(ix1,jx0);
1031 dy10 = _mm_sub_ps(iy1,jy0);
1032 dz10 = _mm_sub_ps(iz1,jz0);
1033 dx20 = _mm_sub_ps(ix2,jx0);
1034 dy20 = _mm_sub_ps(iy2,jy0);
1035 dz20 = _mm_sub_ps(iz2,jz0);
1037 /* Calculate squared distance and things based on it */
1038 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1039 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1040 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1042 rinv00 = avx128fma_invsqrt_f(rsq00);
1043 rinv10 = avx128fma_invsqrt_f(rsq10);
1044 rinv20 = avx128fma_invsqrt_f(rsq20);
1046 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1047 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1048 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1050 /* Load parameters for j particles */
1051 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1052 charge+jnrC+0,charge+jnrD+0);
1053 vdwjidx0A = 2*vdwtype[jnrA+0];
1054 vdwjidx0B = 2*vdwtype[jnrB+0];
1055 vdwjidx0C = 2*vdwtype[jnrC+0];
1056 vdwjidx0D = 2*vdwtype[jnrD+0];
1058 fjx0 = _mm_setzero_ps();
1059 fjy0 = _mm_setzero_ps();
1060 fjz0 = _mm_setzero_ps();
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 r00 = _mm_mul_ps(rsq00,rinv00);
1067 r00 = _mm_andnot_ps(dummy_mask,r00);
1069 /* Compute parameters for interactions between i and j atoms */
1070 qq00 = _mm_mul_ps(iq0,jq0);
1071 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1072 vdwparam+vdwioffset0+vdwjidx0B,
1073 vdwparam+vdwioffset0+vdwjidx0C,
1074 vdwparam+vdwioffset0+vdwjidx0D,
1077 /* Calculate table index by multiplying r with table scale and truncate to integer */
1078 rt = _mm_mul_ps(r00,vftabscale);
1079 vfitab = _mm_cvttps_epi32(rt);
1081 vfeps = _mm_frcz_ps(rt);
1083 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1085 twovfeps = _mm_add_ps(vfeps,vfeps);
1086 vfitab = _mm_slli_epi32(vfitab,3);
1088 /* EWALD ELECTROSTATICS */
1090 /* Analytical PME correction */
1091 zeta2 = _mm_mul_ps(beta2,rsq00);
1092 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1093 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1094 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1095 felec = _mm_mul_ps(qq00,felec);
1097 /* CUBIC SPLINE TABLE DISPERSION */
1098 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1099 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1100 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1101 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1102 _MM_TRANSPOSE4_PS(Y,F,G,H);
1103 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1104 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1105 fvdw6 = _mm_mul_ps(c6_00,FF);
1107 /* CUBIC SPLINE TABLE REPULSION */
1108 vfitab = _mm_add_epi32(vfitab,ifour);
1109 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1110 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1111 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1112 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1113 _MM_TRANSPOSE4_PS(Y,F,G,H);
1114 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1115 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1116 fvdw12 = _mm_mul_ps(c12_00,FF);
1117 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1119 fscal = _mm_add_ps(felec,fvdw);
1121 fscal = _mm_andnot_ps(dummy_mask,fscal);
1123 /* Update vectorial force */
1124 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1125 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1126 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1128 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1129 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1130 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1132 /**************************
1133 * CALCULATE INTERACTIONS *
1134 **************************/
1136 r10 = _mm_mul_ps(rsq10,rinv10);
1137 r10 = _mm_andnot_ps(dummy_mask,r10);
1139 /* Compute parameters for interactions between i and j atoms */
1140 qq10 = _mm_mul_ps(iq1,jq0);
1142 /* EWALD ELECTROSTATICS */
1144 /* Analytical PME correction */
1145 zeta2 = _mm_mul_ps(beta2,rsq10);
1146 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1147 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1148 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1149 felec = _mm_mul_ps(qq10,felec);
1153 fscal = _mm_andnot_ps(dummy_mask,fscal);
1155 /* Update vectorial force */
1156 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1157 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1158 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1160 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1161 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1162 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1164 /**************************
1165 * CALCULATE INTERACTIONS *
1166 **************************/
1168 r20 = _mm_mul_ps(rsq20,rinv20);
1169 r20 = _mm_andnot_ps(dummy_mask,r20);
1171 /* Compute parameters for interactions between i and j atoms */
1172 qq20 = _mm_mul_ps(iq2,jq0);
1174 /* EWALD ELECTROSTATICS */
1176 /* Analytical PME correction */
1177 zeta2 = _mm_mul_ps(beta2,rsq20);
1178 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1179 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1180 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1181 felec = _mm_mul_ps(qq20,felec);
1185 fscal = _mm_andnot_ps(dummy_mask,fscal);
1187 /* Update vectorial force */
1188 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1189 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1190 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1192 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1193 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1194 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1196 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1197 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1198 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1199 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1201 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1203 /* Inner loop uses 113 flops */
1206 /* End of innermost loop */
1208 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1209 f+i_coord_offset,fshift+i_shift_offset);
1211 /* Increment number of inner iterations */
1212 inneriter += j_index_end - j_index_start;
1214 /* Outer loop uses 18 flops */
1217 /* Increment number of outer iterations */
1220 /* Update outer/inner flops */
1222 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*113);