2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water3-Particle
54 * Calculate force/pot: PotentialAndForce
/*
 * Potential-and-force kernel for a three-site water (i sites 0,1,2) interacting
 * with single j particles. Electrostatics are tabulated for all three i sites;
 * dispersion and repulsion are tabulated for the 0-0 site pair only. All three
 * interactions read the same combined table (kernel_data->table_elec_vdw).
 *
 * Inputs read in the visible code:
 *   nlist       - jindex, shift and iinr[0].
 *   fr          - shift_vec, fshift, ic->epsfac and ntype.
 *   mdatoms     - chargeA and typeA.
 *   kernel_data - table_elec_vdw (data, scale) and the energygrp_elec /
 *                 energygrp_vdw accumulators. It is annotated gmx_unused but is
 *                 read here.
 *   nrnb        - receives the flop estimate through inc_nrnb().
 *   xx, ff      - not referenced in the visible lines; presumably the sources of
 *                 the x and f pointers used below - TODO confirm.
 *
 * NOTE(review): in this view several statements the body relies on are not
 * visible (for example the declarations of charge, vdwtype, vftab, vfitab,
 * vdwparam and scratch, the assignments of nri, x, f, jjnr, ggid, outeriter and
 * inneriter, braces, and the tails of some multi-line calls). Check the complete
 * file before changing any code here.
 */
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_128_fma_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
91 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
92 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
93 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
100 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
102 __m128i ifour = _mm_set1_epi32(4);
103 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
105 __m128 dummy_mask,cutoff_mask;
106 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
107 __m128 one = _mm_set1_ps(1.0);
108 __m128 two = _mm_set1_ps(2.0);
114 jindex = nlist->jindex;
116 shiftidx = nlist->shift;
118 shiftvec = fr->shift_vec[0];
119 fshift = fr->fshift[0];
120 facel = _mm_set1_ps(fr->ic->epsfac);
121 charge = mdatoms->chargeA;
122 nvdwtype = fr->ntype;
124 vdwtype = mdatoms->typeA;
126 vftab = kernel_data->table_elec_vdw->data;
127 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
129 /* Setup water-specific parameters */
/* The charges and the type of site 0 are taken from the first list entry only,
 * so every i molecule in the list is assumed to share them - TODO confirm. */
130 inr = nlist->iinr[0];
131 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
132 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
133 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
134 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* vdwioffset0 is the i-site part of the vdwparam index; 2*vdwtype[j] is added
 * per j atom further down. */
136 /* Avoid stupid compiler warnings */
137 jnrA = jnrB = jnrC = jnrD = 0;
/* NOTE(review): the body of the following loop is not visible in this view. */
146 for(iidx=0;iidx<4*DIM;iidx++)
151 /* Start outer loop over neighborlists */
152 for(iidx=0; iidx<nri; iidx++)
154 /* Load shift vector for this list */
155 i_shift_offset = DIM*shiftidx[iidx];
157 /* Load limits for loop over neighbors */
158 j_index_start = jindex[iidx];
159 j_index_end = jindex[iidx+1];
161 /* Get outer coordinate index */
/* NOTE(review): no per-iteration assignment of inr is visible here. */
163 i_coord_offset = DIM*inr;
165 /* Load i particle coords and add shift vector */
166 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
167 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Per-list force accumulators for the three i sites start at zero. */
169 fix0 = _mm_setzero_ps();
170 fiy0 = _mm_setzero_ps();
171 fiz0 = _mm_setzero_ps();
172 fix1 = _mm_setzero_ps();
173 fiy1 = _mm_setzero_ps();
174 fiz1 = _mm_setzero_ps();
175 fix2 = _mm_setzero_ps();
176 fiy2 = _mm_setzero_ps();
177 fiz2 = _mm_setzero_ps();
179 /* Reset potential sums */
180 velecsum = _mm_setzero_ps();
181 vvdwsum = _mm_setzero_ps();
183 /* Start inner kernel loop */
/* Main pass: four j entries per iteration, continued only while the fourth
 * entry jjnr[jidx+3] is non-negative; no dummy masking is applied in this pass. */
184 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
187 /* Get j neighbor index, and coordinate index */
/* NOTE(review): the loads of jnrA..jnrD from jjnr are not visible in this view. */
192 j_coord_offsetA = DIM*jnrA;
193 j_coord_offsetB = DIM*jnrB;
194 j_coord_offsetC = DIM*jnrC;
195 j_coord_offsetD = DIM*jnrD;
197 /* load j atom coordinates */
198 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
199 x+j_coord_offsetC,x+j_coord_offsetD,
202 /* Calculate displacement vector */
203 dx00 = _mm_sub_ps(ix0,jx0);
204 dy00 = _mm_sub_ps(iy0,jy0);
205 dz00 = _mm_sub_ps(iz0,jz0);
206 dx10 = _mm_sub_ps(ix1,jx0);
207 dy10 = _mm_sub_ps(iy1,jy0);
208 dz10 = _mm_sub_ps(iz1,jz0);
209 dx20 = _mm_sub_ps(ix2,jx0);
210 dy20 = _mm_sub_ps(iy2,jy0);
211 dz20 = _mm_sub_ps(iz2,jz0);
213 /* Calculate squared distance and things based on it */
214 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
215 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
216 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
218 rinv00 = avx128fma_invsqrt_f(rsq00);
219 rinv10 = avx128fma_invsqrt_f(rsq10);
220 rinv20 = avx128fma_invsqrt_f(rsq20);
222 /* Load parameters for j particles */
223 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
224 charge+jnrC+0,charge+jnrD+0);
225 vdwjidx0A = 2*vdwtype[jnrA+0];
226 vdwjidx0B = 2*vdwtype[jnrB+0];
227 vdwjidx0C = 2*vdwtype[jnrC+0];
228 vdwjidx0D = 2*vdwtype[jnrD+0];
/* The j force is accumulated over all three i sites before it is written out. */
230 fjx0 = _mm_setzero_ps();
231 fjy0 = _mm_setzero_ps();
232 fjz0 = _mm_setzero_ps();
/* Site pair 0-0: electrostatics plus dispersion and repulsion. */
234 /**************************
235 * CALCULATE INTERACTIONS *
236 **************************/
/* r = rsq * (1/sqrt(rsq)) */
238 r00 = _mm_mul_ps(rsq00,rinv00);
240 /* Compute parameters for interactions between i and j atoms */
241 qq00 = _mm_mul_ps(iq0,jq0);
242 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
243 vdwparam+vdwioffset0+vdwjidx0B,
244 vdwparam+vdwioffset0+vdwjidx0C,
245 vdwparam+vdwioffset0+vdwjidx0D,
248 /* Calculate table index by multiplying r with table scale and truncate to integer */
249 rt = _mm_mul_ps(r00,vftabscale);
250 vfitab = _mm_cvttps_epi32(rt);
/* NOTE(review): vfeps is assigned twice in a row; these are presumably the two
 * branches of a preprocessor conditional whose directives are not visible. */
252 vfeps = _mm_frcz_ps(rt);
254 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
256 twovfeps = _mm_add_ps(vfeps,vfeps);
/* (v + 2v) << 2 = 12*v floats per table point; the three lookups below read at
 * offsets +0, +4 and +8 from that position. */
257 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
259 /* CUBIC SPLINE TABLE ELECTROSTATICS */
/* One aligned load of four floats per lane; the transpose regroups them so that
 * Y, F, G and H each hold one coefficient for all four lanes. */
260 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
261 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
262 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
263 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
264 _MM_TRANSPOSE4_PS(Y,F,G,H);
/* With _mm_macc_ps(a,b,c) = a*b+c:
 *   Fp = F + eps*(G + eps*H)
 *   VV = Y + eps*Fp
 *   FF = Fp + eps*(G + 2*eps*H)
 * felec = -(qq*FF*tabscale/r); the xor with signbit flips the sign. */
265 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
266 VV = _mm_macc_ps(vfeps,Fp,Y);
267 velec = _mm_mul_ps(qq00,VV);
268 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
269 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
271 /* CUBIC SPLINE TABLE DISPERSION */
272 vfitab = _mm_add_epi32(vfitab,ifour);
273 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
274 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
275 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
276 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
277 _MM_TRANSPOSE4_PS(Y,F,G,H);
278 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
279 VV = _mm_macc_ps(vfeps,Fp,Y);
280 vvdw6 = _mm_mul_ps(c6_00,VV);
281 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
282 fvdw6 = _mm_mul_ps(c6_00,FF);
284 /* CUBIC SPLINE TABLE REPULSION */
285 vfitab = _mm_add_epi32(vfitab,ifour);
286 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
287 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
288 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
289 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
290 _MM_TRANSPOSE4_PS(Y,F,G,H);
291 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
292 VV = _mm_macc_ps(vfeps,Fp,Y);
293 vvdw12 = _mm_mul_ps(c12_00,VV);
294 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
295 fvdw12 = _mm_mul_ps(c12_00,FF);
296 vvdw = _mm_add_ps(vvdw12,vvdw6);
297 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
299 /* Update potential sum for this i atom from the interaction with this j atom. */
300 velecsum = _mm_add_ps(velecsum,velec);
301 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
303 fscal = _mm_add_ps(felec,fvdw);
305 /* Update vectorial force */
/* The same dx*fscal term is added to both the i and the j accumulator; the j
 * accumulator is later passed to a "decrement" helper. */
306 fix0 = _mm_macc_ps(dx00,fscal,fix0);
307 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
308 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
310 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
311 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
312 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
/* Site pair 1-0: electrostatics only. */
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 r10 = _mm_mul_ps(rsq10,rinv10);
320 /* Compute parameters for interactions between i and j atoms */
321 qq10 = _mm_mul_ps(iq1,jq0);
323 /* Calculate table index by multiplying r with table scale and truncate to integer */
324 rt = _mm_mul_ps(r10,vftabscale);
325 vfitab = _mm_cvttps_epi32(rt);
327 vfeps = _mm_frcz_ps(rt);
329 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
331 twovfeps = _mm_add_ps(vfeps,vfeps);
332 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
334 /* CUBIC SPLINE TABLE ELECTROSTATICS */
335 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
336 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
337 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
338 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
339 _MM_TRANSPOSE4_PS(Y,F,G,H);
340 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
341 VV = _mm_macc_ps(vfeps,Fp,Y);
342 velec = _mm_mul_ps(qq10,VV);
343 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
344 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 velecsum = _mm_add_ps(velecsum,velec);
/* NOTE(review): no assignment of fscal from this pair's felec is visible before
 * the force update below; a line appears to be missing from this view. */
351 /* Update vectorial force */
352 fix1 = _mm_macc_ps(dx10,fscal,fix1);
353 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
354 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
356 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
357 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
358 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
/* Site pair 2-0: electrostatics only. */
360 /**************************
361 * CALCULATE INTERACTIONS *
362 **************************/
364 r20 = _mm_mul_ps(rsq20,rinv20);
366 /* Compute parameters for interactions between i and j atoms */
367 qq20 = _mm_mul_ps(iq2,jq0);
369 /* Calculate table index by multiplying r with table scale and truncate to integer */
370 rt = _mm_mul_ps(r20,vftabscale);
371 vfitab = _mm_cvttps_epi32(rt);
373 vfeps = _mm_frcz_ps(rt);
375 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
377 twovfeps = _mm_add_ps(vfeps,vfeps);
378 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
380 /* CUBIC SPLINE TABLE ELECTROSTATICS */
381 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
382 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
383 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
384 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
385 _MM_TRANSPOSE4_PS(Y,F,G,H);
386 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
387 VV = _mm_macc_ps(vfeps,Fp,Y);
388 velec = _mm_mul_ps(qq20,VV);
389 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
390 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
392 /* Update potential sum for this i atom from the interaction with this j atom. */
393 velecsum = _mm_add_ps(velecsum,velec);
/* NOTE(review): as for pair 1-0, no fscal assignment is visible here. */
397 /* Update vectorial force */
398 fix2 = _mm_macc_ps(dx20,fscal,fix2);
399 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
400 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
402 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
403 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
404 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
/* Hand the summed j force to the decrement helper for the four j atoms. */
406 fjptrA = f+j_coord_offsetA;
407 fjptrB = f+j_coord_offsetB;
408 fjptrC = f+j_coord_offsetC;
409 fjptrD = f+j_coord_offsetD;
411 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
413 /* Inner loop uses 168 flops */
/* Remainder pass: same arithmetic as above, but the j indices may be negative
 * (dummy) entries, which are masked out of the results.
 * NOTE(review): the condition that guards this pass is not visible in this view. */
419 /* Get j neighbor index, and coordinate index */
420 jnrlistA = jjnr[jidx];
421 jnrlistB = jjnr[jidx+1];
422 jnrlistC = jjnr[jidx+2];
423 jnrlistD = jjnr[jidx+3];
424 /* Sign of each element will be negative for non-real atoms.
425 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
426 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
428 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
429 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
430 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
431 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
432 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
433 j_coord_offsetA = DIM*jnrA;
434 j_coord_offsetB = DIM*jnrB;
435 j_coord_offsetC = DIM*jnrC;
436 j_coord_offsetD = DIM*jnrD;
438 /* load j atom coordinates */
439 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
440 x+j_coord_offsetC,x+j_coord_offsetD,
443 /* Calculate displacement vector */
444 dx00 = _mm_sub_ps(ix0,jx0);
445 dy00 = _mm_sub_ps(iy0,jy0);
446 dz00 = _mm_sub_ps(iz0,jz0);
447 dx10 = _mm_sub_ps(ix1,jx0);
448 dy10 = _mm_sub_ps(iy1,jy0);
449 dz10 = _mm_sub_ps(iz1,jz0);
450 dx20 = _mm_sub_ps(ix2,jx0);
451 dy20 = _mm_sub_ps(iy2,jy0);
452 dz20 = _mm_sub_ps(iz2,jz0);
454 /* Calculate squared distance and things based on it */
455 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
456 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
457 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
459 rinv00 = avx128fma_invsqrt_f(rsq00);
460 rinv10 = avx128fma_invsqrt_f(rsq10);
461 rinv20 = avx128fma_invsqrt_f(rsq20);
463 /* Load parameters for j particles */
464 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
465 charge+jnrC+0,charge+jnrD+0);
466 vdwjidx0A = 2*vdwtype[jnrA+0];
467 vdwjidx0B = 2*vdwtype[jnrB+0];
468 vdwjidx0C = 2*vdwtype[jnrC+0];
469 vdwjidx0D = 2*vdwtype[jnrD+0];
471 fjx0 = _mm_setzero_ps();
472 fjy0 = _mm_setzero_ps();
473 fjz0 = _mm_setzero_ps();
/* Site pair 0-0 (masked): electrostatics plus dispersion and repulsion. */
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 r00 = _mm_mul_ps(rsq00,rinv00);
/* Dummy lanes get r = 0, so their table index below is 0. */
480 r00 = _mm_andnot_ps(dummy_mask,r00);
482 /* Compute parameters for interactions between i and j atoms */
483 qq00 = _mm_mul_ps(iq0,jq0);
484 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
485 vdwparam+vdwioffset0+vdwjidx0B,
486 vdwparam+vdwioffset0+vdwjidx0C,
487 vdwparam+vdwioffset0+vdwjidx0D,
490 /* Calculate table index by multiplying r with table scale and truncate to integer */
491 rt = _mm_mul_ps(r00,vftabscale);
492 vfitab = _mm_cvttps_epi32(rt);
494 vfeps = _mm_frcz_ps(rt);
496 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
498 twovfeps = _mm_add_ps(vfeps,vfeps);
499 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
501 /* CUBIC SPLINE TABLE ELECTROSTATICS */
502 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
503 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
504 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
505 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
506 _MM_TRANSPOSE4_PS(Y,F,G,H);
507 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
508 VV = _mm_macc_ps(vfeps,Fp,Y);
509 velec = _mm_mul_ps(qq00,VV);
510 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
511 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
513 /* CUBIC SPLINE TABLE DISPERSION */
514 vfitab = _mm_add_epi32(vfitab,ifour);
515 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
516 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
517 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
518 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
519 _MM_TRANSPOSE4_PS(Y,F,G,H);
520 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
521 VV = _mm_macc_ps(vfeps,Fp,Y);
522 vvdw6 = _mm_mul_ps(c6_00,VV);
523 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
524 fvdw6 = _mm_mul_ps(c6_00,FF);
526 /* CUBIC SPLINE TABLE REPULSION */
527 vfitab = _mm_add_epi32(vfitab,ifour);
528 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
529 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
530 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
531 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
532 _MM_TRANSPOSE4_PS(Y,F,G,H);
533 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
534 VV = _mm_macc_ps(vfeps,Fp,Y);
535 vvdw12 = _mm_mul_ps(c12_00,VV);
536 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
537 fvdw12 = _mm_mul_ps(c12_00,FF);
538 vvdw = _mm_add_ps(vvdw12,vvdw6);
539 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
541 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Dummy lanes are cleared before they are added to the energy sums. */
542 velec = _mm_andnot_ps(dummy_mask,velec);
543 velecsum = _mm_add_ps(velecsum,velec);
544 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
545 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
547 fscal = _mm_add_ps(felec,fvdw);
/* Dummy lanes contribute no force. */
549 fscal = _mm_andnot_ps(dummy_mask,fscal);
551 /* Update vectorial force */
552 fix0 = _mm_macc_ps(dx00,fscal,fix0);
553 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
554 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
556 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
557 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
558 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
/* Site pair 1-0 (masked): electrostatics only. */
560 /**************************
561 * CALCULATE INTERACTIONS *
562 **************************/
564 r10 = _mm_mul_ps(rsq10,rinv10);
565 r10 = _mm_andnot_ps(dummy_mask,r10);
567 /* Compute parameters for interactions between i and j atoms */
568 qq10 = _mm_mul_ps(iq1,jq0);
570 /* Calculate table index by multiplying r with table scale and truncate to integer */
571 rt = _mm_mul_ps(r10,vftabscale);
572 vfitab = _mm_cvttps_epi32(rt);
574 vfeps = _mm_frcz_ps(rt);
576 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
578 twovfeps = _mm_add_ps(vfeps,vfeps);
579 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
581 /* CUBIC SPLINE TABLE ELECTROSTATICS */
582 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
583 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
584 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
585 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
586 _MM_TRANSPOSE4_PS(Y,F,G,H);
587 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
588 VV = _mm_macc_ps(vfeps,Fp,Y);
589 velec = _mm_mul_ps(qq10,VV);
590 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
591 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
593 /* Update potential sum for this i atom from the interaction with this j atom. */
594 velec = _mm_andnot_ps(dummy_mask,velec);
595 velecsum = _mm_add_ps(velecsum,velec);
/* NOTE(review): no assignment of fscal from this pair's felec is visible before
 * the masking below; a line appears to be missing from this view. */
599 fscal = _mm_andnot_ps(dummy_mask,fscal);
601 /* Update vectorial force */
602 fix1 = _mm_macc_ps(dx10,fscal,fix1);
603 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
604 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
606 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
607 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
608 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
/* Site pair 2-0 (masked): electrostatics only. */
610 /**************************
611 * CALCULATE INTERACTIONS *
612 **************************/
614 r20 = _mm_mul_ps(rsq20,rinv20);
615 r20 = _mm_andnot_ps(dummy_mask,r20);
617 /* Compute parameters for interactions between i and j atoms */
618 qq20 = _mm_mul_ps(iq2,jq0);
620 /* Calculate table index by multiplying r with table scale and truncate to integer */
621 rt = _mm_mul_ps(r20,vftabscale);
622 vfitab = _mm_cvttps_epi32(rt);
624 vfeps = _mm_frcz_ps(rt);
626 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
628 twovfeps = _mm_add_ps(vfeps,vfeps);
629 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
631 /* CUBIC SPLINE TABLE ELECTROSTATICS */
632 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
633 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
634 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
635 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
636 _MM_TRANSPOSE4_PS(Y,F,G,H);
637 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
638 VV = _mm_macc_ps(vfeps,Fp,Y);
639 velec = _mm_mul_ps(qq20,VV);
640 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
641 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
643 /* Update potential sum for this i atom from the interaction with this j atom. */
644 velec = _mm_andnot_ps(dummy_mask,velec);
645 velecsum = _mm_add_ps(velecsum,velec);
/* NOTE(review): as for pair 1-0, no fscal assignment is visible here. */
649 fscal = _mm_andnot_ps(dummy_mask,fscal);
651 /* Update vectorial force */
652 fix2 = _mm_macc_ps(dx20,fscal,fix2);
653 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
654 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
656 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
657 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
658 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
/* Dummy entries point at scratch, so their update never lands in f. */
660 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
661 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
662 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
663 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
665 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
667 /* Inner loop uses 171 flops */
670 /* End of innermost loop */
/* Pass the accumulated i-site forces to the update helper together with the
 * f and fshift destinations for this list. */
672 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
673 f+i_coord_offset,fshift+i_shift_offset);
676 /* Update potential energies */
677 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
678 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
680 /* Increment number of inner iterations */
681 inneriter += j_index_end - j_index_start;
683 /* Outer loop uses 20 flops */
686 /* Increment number of outer iterations */
689 /* Update outer/inner flops */
/* The flop estimate charges 20 per outer iteration and 171 per j entry. */
691 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*171);
694 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
695 * Electrostatics interaction: CubicSplineTable
696 * VdW interaction: CubicSplineTable
697 * Geometry: Water3-Particle
698 * Calculate force/pot: Force
701 nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_128_fma_single
702 (t_nblist * gmx_restrict nlist,
703 rvec * gmx_restrict xx,
704 rvec * gmx_restrict ff,
705 struct t_forcerec * gmx_restrict fr,
706 t_mdatoms * gmx_restrict mdatoms,
707 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
708 t_nrnb * gmx_restrict nrnb)
710 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
711 * just 0 for non-waters.
712 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
713 * jnr indices corresponding to data put in the four positions in the SIMD register.
715 int i_shift_offset,i_coord_offset,outeriter,inneriter;
716 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
717 int jnrA,jnrB,jnrC,jnrD;
718 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
719 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
720 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
722 real *shiftvec,*fshift,*x,*f;
723 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
725 __m128 fscal,rcutoff,rcutoff2,jidxall;
727 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
729 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
731 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
732 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
733 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
734 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
735 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
736 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
737 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
740 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
743 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
744 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
746 __m128i ifour = _mm_set1_epi32(4);
747 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
749 __m128 dummy_mask,cutoff_mask;
750 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
751 __m128 one = _mm_set1_ps(1.0);
752 __m128 two = _mm_set1_ps(2.0);
758 jindex = nlist->jindex;
760 shiftidx = nlist->shift;
762 shiftvec = fr->shift_vec[0];
763 fshift = fr->fshift[0];
764 facel = _mm_set1_ps(fr->ic->epsfac);
765 charge = mdatoms->chargeA;
766 nvdwtype = fr->ntype;
768 vdwtype = mdatoms->typeA;
770 vftab = kernel_data->table_elec_vdw->data;
771 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
773 /* Setup water-specific parameters */
774 inr = nlist->iinr[0];
775 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
776 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
777 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
778 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
780 /* Avoid stupid compiler warnings */
781 jnrA = jnrB = jnrC = jnrD = 0;
790 for(iidx=0;iidx<4*DIM;iidx++)
795 /* Start outer loop over neighborlists */
796 for(iidx=0; iidx<nri; iidx++)
798 /* Load shift vector for this list */
799 i_shift_offset = DIM*shiftidx[iidx];
801 /* Load limits for loop over neighbors */
802 j_index_start = jindex[iidx];
803 j_index_end = jindex[iidx+1];
805 /* Get outer coordinate index */
807 i_coord_offset = DIM*inr;
809 /* Load i particle coords and add shift vector */
810 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
811 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
813 fix0 = _mm_setzero_ps();
814 fiy0 = _mm_setzero_ps();
815 fiz0 = _mm_setzero_ps();
816 fix1 = _mm_setzero_ps();
817 fiy1 = _mm_setzero_ps();
818 fiz1 = _mm_setzero_ps();
819 fix2 = _mm_setzero_ps();
820 fiy2 = _mm_setzero_ps();
821 fiz2 = _mm_setzero_ps();
823 /* Start inner kernel loop */
824 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
827 /* Get j neighbor index, and coordinate index */
832 j_coord_offsetA = DIM*jnrA;
833 j_coord_offsetB = DIM*jnrB;
834 j_coord_offsetC = DIM*jnrC;
835 j_coord_offsetD = DIM*jnrD;
837 /* load j atom coordinates */
838 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
839 x+j_coord_offsetC,x+j_coord_offsetD,
842 /* Calculate displacement vector */
843 dx00 = _mm_sub_ps(ix0,jx0);
844 dy00 = _mm_sub_ps(iy0,jy0);
845 dz00 = _mm_sub_ps(iz0,jz0);
846 dx10 = _mm_sub_ps(ix1,jx0);
847 dy10 = _mm_sub_ps(iy1,jy0);
848 dz10 = _mm_sub_ps(iz1,jz0);
849 dx20 = _mm_sub_ps(ix2,jx0);
850 dy20 = _mm_sub_ps(iy2,jy0);
851 dz20 = _mm_sub_ps(iz2,jz0);
853 /* Calculate squared distance and things based on it */
854 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
855 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
856 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
858 rinv00 = avx128fma_invsqrt_f(rsq00);
859 rinv10 = avx128fma_invsqrt_f(rsq10);
860 rinv20 = avx128fma_invsqrt_f(rsq20);
862 /* Load parameters for j particles */
863 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
864 charge+jnrC+0,charge+jnrD+0);
865 vdwjidx0A = 2*vdwtype[jnrA+0];
866 vdwjidx0B = 2*vdwtype[jnrB+0];
867 vdwjidx0C = 2*vdwtype[jnrC+0];
868 vdwjidx0D = 2*vdwtype[jnrD+0];
870 fjx0 = _mm_setzero_ps();
871 fjy0 = _mm_setzero_ps();
872 fjz0 = _mm_setzero_ps();
874 /**************************
875 * CALCULATE INTERACTIONS *
876 **************************/
878 r00 = _mm_mul_ps(rsq00,rinv00);
880 /* Compute parameters for interactions between i and j atoms */
881 qq00 = _mm_mul_ps(iq0,jq0);
882 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
883 vdwparam+vdwioffset0+vdwjidx0B,
884 vdwparam+vdwioffset0+vdwjidx0C,
885 vdwparam+vdwioffset0+vdwjidx0D,
888 /* Calculate table index by multiplying r with table scale and truncate to integer */
889 rt = _mm_mul_ps(r00,vftabscale);
890 vfitab = _mm_cvttps_epi32(rt);
892 vfeps = _mm_frcz_ps(rt);
894 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
896 twovfeps = _mm_add_ps(vfeps,vfeps);
897 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
899 /* CUBIC SPLINE TABLE ELECTROSTATICS */
900 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
901 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
902 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
903 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
904 _MM_TRANSPOSE4_PS(Y,F,G,H);
905 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
906 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
907 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
909 /* CUBIC SPLINE TABLE DISPERSION */
910 vfitab = _mm_add_epi32(vfitab,ifour);
911 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
912 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
913 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
914 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
915 _MM_TRANSPOSE4_PS(Y,F,G,H);
916 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
917 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
918 fvdw6 = _mm_mul_ps(c6_00,FF);
920 /* CUBIC SPLINE TABLE REPULSION */
921 vfitab = _mm_add_epi32(vfitab,ifour);
922 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
923 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
924 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
925 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
926 _MM_TRANSPOSE4_PS(Y,F,G,H);
927 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
928 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
929 fvdw12 = _mm_mul_ps(c12_00,FF);
930 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
932 fscal = _mm_add_ps(felec,fvdw);
934 /* Update vectorial force */
935 fix0 = _mm_macc_ps(dx00,fscal,fix0);
936 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
937 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
939 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
940 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
941 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 r10 = _mm_mul_ps(rsq10,rinv10);
949 /* Compute parameters for interactions between i and j atoms */
950 qq10 = _mm_mul_ps(iq1,jq0);
952 /* Calculate table index by multiplying r with table scale and truncate to integer */
953 rt = _mm_mul_ps(r10,vftabscale);
954 vfitab = _mm_cvttps_epi32(rt);
956 vfeps = _mm_frcz_ps(rt);
958 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
960 twovfeps = _mm_add_ps(vfeps,vfeps);
961 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
963 /* CUBIC SPLINE TABLE ELECTROSTATICS */
964 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
965 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
966 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
967 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
968 _MM_TRANSPOSE4_PS(Y,F,G,H);
969 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
970 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
971 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
975 /* Update vectorial force */
976 fix1 = _mm_macc_ps(dx10,fscal,fix1);
977 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
978 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
980 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
981 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
982 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
984 /**************************
985 * CALCULATE INTERACTIONS *
986 **************************/
988 r20 = _mm_mul_ps(rsq20,rinv20);
990 /* Compute parameters for interactions between i and j atoms */
991 qq20 = _mm_mul_ps(iq2,jq0);
993 /* Calculate table index by multiplying r with table scale and truncate to integer */
994 rt = _mm_mul_ps(r20,vftabscale);
995 vfitab = _mm_cvttps_epi32(rt);
997 vfeps = _mm_frcz_ps(rt);
999 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1001 twovfeps = _mm_add_ps(vfeps,vfeps);
1002 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1004 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1005 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1006 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1007 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1008 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1009 _MM_TRANSPOSE4_PS(Y,F,G,H);
1010 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1011 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1012 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1016 /* Update vectorial force */
1017 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1018 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1019 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1021 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1022 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1023 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1025 fjptrA = f+j_coord_offsetA;
1026 fjptrB = f+j_coord_offsetB;
1027 fjptrC = f+j_coord_offsetC;
1028 fjptrD = f+j_coord_offsetD;
1030 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1032 /* Inner loop uses 148 flops */
1035 if(jidx<j_index_end)
1038 /* Get j neighbor index, and coordinate index */
1039 jnrlistA = jjnr[jidx];
1040 jnrlistB = jjnr[jidx+1];
1041 jnrlistC = jjnr[jidx+2];
1042 jnrlistD = jjnr[jidx+3];
1043 /* Sign of each element will be negative for non-real atoms.
1044 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1045 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1047 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1048 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1049 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1050 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1051 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1052 j_coord_offsetA = DIM*jnrA;
1053 j_coord_offsetB = DIM*jnrB;
1054 j_coord_offsetC = DIM*jnrC;
1055 j_coord_offsetD = DIM*jnrD;
1057 /* load j atom coordinates */
1058 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1059 x+j_coord_offsetC,x+j_coord_offsetD,
1062 /* Calculate displacement vector */
1063 dx00 = _mm_sub_ps(ix0,jx0);
1064 dy00 = _mm_sub_ps(iy0,jy0);
1065 dz00 = _mm_sub_ps(iz0,jz0);
1066 dx10 = _mm_sub_ps(ix1,jx0);
1067 dy10 = _mm_sub_ps(iy1,jy0);
1068 dz10 = _mm_sub_ps(iz1,jz0);
1069 dx20 = _mm_sub_ps(ix2,jx0);
1070 dy20 = _mm_sub_ps(iy2,jy0);
1071 dz20 = _mm_sub_ps(iz2,jz0);
1073 /* Calculate squared distance and things based on it */
1074 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1075 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1076 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1078 rinv00 = avx128fma_invsqrt_f(rsq00);
1079 rinv10 = avx128fma_invsqrt_f(rsq10);
1080 rinv20 = avx128fma_invsqrt_f(rsq20);
1082 /* Load parameters for j particles */
1083 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1084 charge+jnrC+0,charge+jnrD+0);
1085 vdwjidx0A = 2*vdwtype[jnrA+0];
1086 vdwjidx0B = 2*vdwtype[jnrB+0];
1087 vdwjidx0C = 2*vdwtype[jnrC+0];
1088 vdwjidx0D = 2*vdwtype[jnrD+0];
1090 fjx0 = _mm_setzero_ps();
1091 fjy0 = _mm_setzero_ps();
1092 fjz0 = _mm_setzero_ps();
1094 /**************************
1095 * CALCULATE INTERACTIONS *
1096 **************************/
1098 r00 = _mm_mul_ps(rsq00,rinv00);
1099 r00 = _mm_andnot_ps(dummy_mask,r00);
1101 /* Compute parameters for interactions between i and j atoms */
1102 qq00 = _mm_mul_ps(iq0,jq0);
1103 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1104 vdwparam+vdwioffset0+vdwjidx0B,
1105 vdwparam+vdwioffset0+vdwjidx0C,
1106 vdwparam+vdwioffset0+vdwjidx0D,
1109 /* Calculate table index by multiplying r with table scale and truncate to integer */
1110 rt = _mm_mul_ps(r00,vftabscale);
1111 vfitab = _mm_cvttps_epi32(rt);
1113 vfeps = _mm_frcz_ps(rt);
1115 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1117 twovfeps = _mm_add_ps(vfeps,vfeps);
1118 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1120 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1121 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1122 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1123 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1124 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1125 _MM_TRANSPOSE4_PS(Y,F,G,H);
1126 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1127 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1128 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1130 /* CUBIC SPLINE TABLE DISPERSION */
1131 vfitab = _mm_add_epi32(vfitab,ifour);
1132 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1133 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1134 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1135 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1136 _MM_TRANSPOSE4_PS(Y,F,G,H);
1137 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1138 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1139 fvdw6 = _mm_mul_ps(c6_00,FF);
1141 /* CUBIC SPLINE TABLE REPULSION */
1142 vfitab = _mm_add_epi32(vfitab,ifour);
1143 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1144 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1145 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1146 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1147 _MM_TRANSPOSE4_PS(Y,F,G,H);
1148 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1149 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1150 fvdw12 = _mm_mul_ps(c12_00,FF);
1151 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1153 fscal = _mm_add_ps(felec,fvdw);
1155 fscal = _mm_andnot_ps(dummy_mask,fscal);
1157 /* Update vectorial force */
1158 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1159 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1160 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1162 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1163 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1164 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1166 /**************************
1167 * CALCULATE INTERACTIONS *
1168 **************************/
1170 r10 = _mm_mul_ps(rsq10,rinv10);
1171 r10 = _mm_andnot_ps(dummy_mask,r10);
1173 /* Compute parameters for interactions between i and j atoms */
1174 qq10 = _mm_mul_ps(iq1,jq0);
1176 /* Calculate table index by multiplying r with table scale and truncate to integer */
1177 rt = _mm_mul_ps(r10,vftabscale);
1178 vfitab = _mm_cvttps_epi32(rt);
1180 vfeps = _mm_frcz_ps(rt);
1182 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1184 twovfeps = _mm_add_ps(vfeps,vfeps);
1185 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1187 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1188 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1189 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1190 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1191 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1192 _MM_TRANSPOSE4_PS(Y,F,G,H);
1193 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1194 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1195 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1199 fscal = _mm_andnot_ps(dummy_mask,fscal);
1201 /* Update vectorial force */
1202 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1203 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1204 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1206 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1207 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1208 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1210 /**************************
1211 * CALCULATE INTERACTIONS *
1212 **************************/
1214 r20 = _mm_mul_ps(rsq20,rinv20);
1215 r20 = _mm_andnot_ps(dummy_mask,r20);
1217 /* Compute parameters for interactions between i and j atoms */
1218 qq20 = _mm_mul_ps(iq2,jq0);
1220 /* Calculate table index by multiplying r with table scale and truncate to integer */
1221 rt = _mm_mul_ps(r20,vftabscale);
1222 vfitab = _mm_cvttps_epi32(rt);
1224 vfeps = _mm_frcz_ps(rt);
1226 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1228 twovfeps = _mm_add_ps(vfeps,vfeps);
1229 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1231 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1232 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1233 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1234 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1235 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1236 _MM_TRANSPOSE4_PS(Y,F,G,H);
1237 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1238 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1239 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1243 fscal = _mm_andnot_ps(dummy_mask,fscal);
1245 /* Update vectorial force */
1246 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1247 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1248 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1250 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1251 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1252 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1254 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1255 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1256 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1257 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1259 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1261 /* Inner loop uses 151 flops */
1264 /* End of innermost loop */
1266 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1267 f+i_coord_offset,fshift+i_shift_offset);
1269 /* Increment number of inner iterations */
1270 inneriter += j_index_end - j_index_start;
1272 /* Outer loop uses 18 flops */
1275 /* Increment number of outer iterations */
1278 /* Update outer/inner flops */
1280 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*151);