2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water3-Particle
41 * Calculate force/pot: PotentialAndForce
/*
 * Nonbonded kernel for a three-atom water i-group against single j particles:
 * Ewald electrostatics using the analytical PME correction with an explicit
 * cutoff, no VdW term. Accumulates forces and the electrostatic potential.
 *
 * nlist       - neighbor list; jindex, shift and iinr are read from it below.
 * xx, ff      - coordinate and force arrays. NOTE(review): their use is not
 *               visible in this listing; presumably x and f are derived from them.
 * fr          - source of shift_vec, fshift, epsfac, rcoulomb and the Ewald
 *               parameters held in fr->ic.
 * mdatoms     - source of the per-atom charges (chargeA).
 * kernel_data - receives the summed potential through energygrp_elec.
 * nrnb        - flop counter handed to inc_nrnb at the end.
 */
44 nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
78 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
79 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
80 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
84 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
86 __m128 dummy_mask,cutoff_mask;
87 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
88 __m128 one = _mm_set1_ps(1.0);
89 __m128 two = _mm_set1_ps(2.0);
95 jindex = nlist->jindex;
97 shiftidx = nlist->shift;
99 shiftvec = fr->shift_vec[0];
100 fshift = fr->fshift[0];
101 facel = _mm_set1_ps(fr->epsfac);
102 charge = mdatoms->chargeA;
104 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
105 beta = _mm_set1_ps(fr->ic->ewaldcoeff);
106 beta2 = _mm_mul_ps(beta,beta);
107 beta3 = _mm_mul_ps(beta,beta2);
108 ewtab = fr->ic->tabq_coul_FDV0;
109 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
110 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
112 /* Setup water-specific parameters */
/* The three i-atom charges are read once, from the first list entry, before the
 * outer loop, and are pre-scaled by facel (fr->epsfac broadcast to all lanes). */
113 inr = nlist->iinr[0];
114 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
115 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
116 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
118 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
119 rcutoff_scalar = fr->rcoulomb;
120 rcutoff = _mm_set1_ps(rcutoff_scalar);
121 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
123 /* Avoid stupid compiler warnings */
124 jnrA = jnrB = jnrC = jnrD = 0;
/* NOTE(review): the body of the loop below is not visible in this listing. */
133 for(iidx=0;iidx<4*DIM;iidx++)
138 /* Start outer loop over neighborlists */
139 for(iidx=0; iidx<nri; iidx++)
141 /* Load shift vector for this list */
142 i_shift_offset = DIM*shiftidx[iidx];
144 /* Load limits for loop over neighbors */
145 j_index_start = jindex[iidx];
146 j_index_end = jindex[iidx+1];
148 /* Get outer coordinate index */
150 i_coord_offset = DIM*inr;
152 /* Load i particle coords and add shift vector */
153 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
154 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
156 fix0 = _mm_setzero_ps();
157 fiy0 = _mm_setzero_ps();
158 fiz0 = _mm_setzero_ps();
159 fix1 = _mm_setzero_ps();
160 fiy1 = _mm_setzero_ps();
161 fiz1 = _mm_setzero_ps();
162 fix2 = _mm_setzero_ps();
163 fiy2 = _mm_setzero_ps();
164 fiz2 = _mm_setzero_ps();
166 /* Reset potential sums */
167 velecsum = _mm_setzero_ps();
169 /* Start inner kernel loop */
/* Consumes four j entries per iteration and stops at the first group whose last
 * entry is negative. The condition reads jjnr[jidx+3], so this assumes jjnr is
 * readable up to that index - TODO confirm the list padding. */
170 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
173 /* Get j neighbor index, and coordinate index */
178 j_coord_offsetA = DIM*jnrA;
179 j_coord_offsetB = DIM*jnrB;
180 j_coord_offsetC = DIM*jnrC;
181 j_coord_offsetD = DIM*jnrD;
183 /* load j atom coordinates */
184 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
185 x+j_coord_offsetC,x+j_coord_offsetD,
188 /* Calculate displacement vector */
189 dx00 = _mm_sub_ps(ix0,jx0);
190 dy00 = _mm_sub_ps(iy0,jy0);
191 dz00 = _mm_sub_ps(iz0,jz0);
192 dx10 = _mm_sub_ps(ix1,jx0);
193 dy10 = _mm_sub_ps(iy1,jy0);
194 dz10 = _mm_sub_ps(iz1,jz0);
195 dx20 = _mm_sub_ps(ix2,jx0);
196 dy20 = _mm_sub_ps(iy2,jy0);
197 dz20 = _mm_sub_ps(iz2,jz0);
199 /* Calculate squared distance and things based on it */
200 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
201 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
202 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
204 rinv00 = gmx_mm_invsqrt_ps(rsq00);
205 rinv10 = gmx_mm_invsqrt_ps(rsq10);
206 rinv20 = gmx_mm_invsqrt_ps(rsq20);
208 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
209 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
210 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
212 /* Load parameters for j particles */
213 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
214 charge+jnrC+0,charge+jnrD+0);
216 fjx0 = _mm_setzero_ps();
217 fjy0 = _mm_setzero_ps();
218 fjz0 = _mm_setzero_ps();
220 /**************************
221 * CALCULATE INTERACTIONS *
222 **************************/
/* i atom 0 against the four j atoms. NOTE(review): presumably the test below
 * skips the work when no lane lies inside the cutoff; the extent of the guarded
 * region is not visible in this listing. */
224 if (gmx_mm_any_lt(rsq00,rcutoff2))
227 r00 = _mm_mul_ps(rsq00,rinv00);
229 /* Compute parameters for interactions between i and j atoms */
230 qq00 = _mm_mul_ps(iq0,jq0);
232 /* EWALD ELECTROSTATICS */
234 /* Analytical PME correction */
/* felec = pmecorrF*beta3 + rinv3 and velec = (rinv00 - sh_ewald) - pmecorrV*beta,
 * each then multiplied by the charge product qq00. */
235 zeta2 = _mm_mul_ps(beta2,rsq00);
236 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
237 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
238 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
239 felec = _mm_mul_ps(qq00,felec);
240 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
241 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
242 velec = _mm_mul_ps(qq00,velec);
/* Lanes with rsq00 < rcutoff2 get an all-ones mask; the AND operations below
 * zero the contribution of every other lane. */
244 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
246 /* Update potential sum for this i atom from the interaction with this j atom. */
247 velec = _mm_and_ps(velec,cutoff_mask);
248 velecsum = _mm_add_ps(velecsum,velec);
/* NOTE(review): the statement that sets fscal before this mask is not visible
 * in this listing. */
252 fscal = _mm_and_ps(fscal,cutoff_mask);
254 /* Update vectorial force */
255 fix0 = _mm_macc_ps(dx00,fscal,fix0);
256 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
257 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
259 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
260 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
261 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
265 /**************************
266 * CALCULATE INTERACTIONS *
267 **************************/
/* i atom 1 against the four j atoms; same sequence as for i atom 0. */
269 if (gmx_mm_any_lt(rsq10,rcutoff2))
272 r10 = _mm_mul_ps(rsq10,rinv10);
274 /* Compute parameters for interactions between i and j atoms */
275 qq10 = _mm_mul_ps(iq1,jq0);
277 /* EWALD ELECTROSTATICS */
279 /* Analytical PME correction */
280 zeta2 = _mm_mul_ps(beta2,rsq10);
281 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
282 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
283 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
284 felec = _mm_mul_ps(qq10,felec);
285 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
286 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
287 velec = _mm_mul_ps(qq10,velec);
289 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
291 /* Update potential sum for this i atom from the interaction with this j atom. */
292 velec = _mm_and_ps(velec,cutoff_mask);
293 velecsum = _mm_add_ps(velecsum,velec);
297 fscal = _mm_and_ps(fscal,cutoff_mask);
299 /* Update vectorial force */
300 fix1 = _mm_macc_ps(dx10,fscal,fix1);
301 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
302 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
304 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
305 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
306 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
/* i atom 2 against the four j atoms; same sequence as for i atom 0. */
314 if (gmx_mm_any_lt(rsq20,rcutoff2))
317 r20 = _mm_mul_ps(rsq20,rinv20);
319 /* Compute parameters for interactions between i and j atoms */
320 qq20 = _mm_mul_ps(iq2,jq0);
322 /* EWALD ELECTROSTATICS */
324 /* Analytical PME correction */
325 zeta2 = _mm_mul_ps(beta2,rsq20);
326 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
327 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
328 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
329 felec = _mm_mul_ps(qq20,felec);
330 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
331 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
332 velec = _mm_mul_ps(qq20,velec);
334 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
336 /* Update potential sum for this i atom from the interaction with this j atom. */
337 velec = _mm_and_ps(velec,cutoff_mask);
338 velecsum = _mm_add_ps(velecsum,velec);
342 fscal = _mm_and_ps(fscal,cutoff_mask);
344 /* Update vectorial force */
345 fix2 = _mm_macc_ps(dx20,fscal,fix2);
346 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
347 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
349 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
350 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
351 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
/* One force destination per lane, then a single combined update of the four
 * j atoms (presumably a subtraction, going by the helper name). */
355 fjptrA = f+j_coord_offsetA;
356 fjptrB = f+j_coord_offsetB;
357 fjptrC = f+j_coord_offsetC;
358 fjptrD = f+j_coord_offsetD;
360 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
362 /* Inner loop uses 99 flops */
/* Variant of the sequence above for a group of four jjnr entries in which some
 * are negative (dummy): indices are clamped to 0 for the loads, results are
 * cleared with dummy_mask, and forces for dummy lanes are directed to scratch. */
368 /* Get j neighbor index, and coordinate index */
369 jnrlistA = jjnr[jidx];
370 jnrlistB = jjnr[jidx+1];
371 jnrlistC = jjnr[jidx+2];
372 jnrlistD = jjnr[jidx+3];
373 /* Sign of each element will be negative for non-real atoms.
374 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
375 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
377 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
378 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
379 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
380 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
381 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
382 j_coord_offsetA = DIM*jnrA;
383 j_coord_offsetB = DIM*jnrB;
384 j_coord_offsetC = DIM*jnrC;
385 j_coord_offsetD = DIM*jnrD;
387 /* load j atom coordinates */
388 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
389 x+j_coord_offsetC,x+j_coord_offsetD,
392 /* Calculate displacement vector */
393 dx00 = _mm_sub_ps(ix0,jx0);
394 dy00 = _mm_sub_ps(iy0,jy0);
395 dz00 = _mm_sub_ps(iz0,jz0);
396 dx10 = _mm_sub_ps(ix1,jx0);
397 dy10 = _mm_sub_ps(iy1,jy0);
398 dz10 = _mm_sub_ps(iz1,jz0);
399 dx20 = _mm_sub_ps(ix2,jx0);
400 dy20 = _mm_sub_ps(iy2,jy0);
401 dz20 = _mm_sub_ps(iz2,jz0);
403 /* Calculate squared distance and things based on it */
404 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
405 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
406 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
408 rinv00 = gmx_mm_invsqrt_ps(rsq00);
409 rinv10 = gmx_mm_invsqrt_ps(rsq10);
410 rinv20 = gmx_mm_invsqrt_ps(rsq20);
412 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
413 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
414 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
416 /* Load parameters for j particles */
417 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
418 charge+jnrC+0,charge+jnrD+0);
420 fjx0 = _mm_setzero_ps();
421 fjy0 = _mm_setzero_ps();
422 fjz0 = _mm_setzero_ps();
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 if (gmx_mm_any_lt(rsq00,rcutoff2))
431 r00 = _mm_mul_ps(rsq00,rinv00);
/* Clear r00 in the dummy lanes. */
432 r00 = _mm_andnot_ps(dummy_mask,r00);
434 /* Compute parameters for interactions between i and j atoms */
435 qq00 = _mm_mul_ps(iq0,jq0);
437 /* EWALD ELECTROSTATICS */
439 /* Analytical PME correction */
440 zeta2 = _mm_mul_ps(beta2,rsq00);
441 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
442 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
443 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
444 felec = _mm_mul_ps(qq00,felec);
445 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
446 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
447 velec = _mm_mul_ps(qq00,velec);
449 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
451 /* Update potential sum for this i atom from the interaction with this j atom. */
452 velec = _mm_and_ps(velec,cutoff_mask);
/* Dummy lanes must not contribute to the potential sum. */
453 velec = _mm_andnot_ps(dummy_mask,velec);
454 velecsum = _mm_add_ps(velecsum,velec);
458 fscal = _mm_and_ps(fscal,cutoff_mask);
/* Dummy lanes must not contribute to the forces either. */
460 fscal = _mm_andnot_ps(dummy_mask,fscal);
462 /* Update vectorial force */
463 fix0 = _mm_macc_ps(dx00,fscal,fix0);
464 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
465 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
467 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
468 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
469 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 if (gmx_mm_any_lt(rsq10,rcutoff2))
480 r10 = _mm_mul_ps(rsq10,rinv10);
481 r10 = _mm_andnot_ps(dummy_mask,r10);
483 /* Compute parameters for interactions between i and j atoms */
484 qq10 = _mm_mul_ps(iq1,jq0);
486 /* EWALD ELECTROSTATICS */
488 /* Analytical PME correction */
489 zeta2 = _mm_mul_ps(beta2,rsq10);
490 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
491 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
492 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
493 felec = _mm_mul_ps(qq10,felec);
494 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
495 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
496 velec = _mm_mul_ps(qq10,velec);
498 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
500 /* Update potential sum for this i atom from the interaction with this j atom. */
501 velec = _mm_and_ps(velec,cutoff_mask);
502 velec = _mm_andnot_ps(dummy_mask,velec);
503 velecsum = _mm_add_ps(velecsum,velec);
507 fscal = _mm_and_ps(fscal,cutoff_mask);
509 fscal = _mm_andnot_ps(dummy_mask,fscal);
511 /* Update vectorial force */
512 fix1 = _mm_macc_ps(dx10,fscal,fix1);
513 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
514 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
516 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
517 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
518 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
522 /**************************
523 * CALCULATE INTERACTIONS *
524 **************************/
526 if (gmx_mm_any_lt(rsq20,rcutoff2))
529 r20 = _mm_mul_ps(rsq20,rinv20);
530 r20 = _mm_andnot_ps(dummy_mask,r20);
532 /* Compute parameters for interactions between i and j atoms */
533 qq20 = _mm_mul_ps(iq2,jq0);
535 /* EWALD ELECTROSTATICS */
537 /* Analytical PME correction */
538 zeta2 = _mm_mul_ps(beta2,rsq20);
539 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
540 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
541 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
542 felec = _mm_mul_ps(qq20,felec);
543 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
544 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
545 velec = _mm_mul_ps(qq20,velec);
547 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
549 /* Update potential sum for this i atom from the interaction with this j atom. */
550 velec = _mm_and_ps(velec,cutoff_mask);
551 velec = _mm_andnot_ps(dummy_mask,velec);
552 velecsum = _mm_add_ps(velecsum,velec);
556 fscal = _mm_and_ps(fscal,cutoff_mask);
558 fscal = _mm_andnot_ps(dummy_mask,fscal);
560 /* Update vectorial force */
561 fix2 = _mm_macc_ps(dx20,fscal,fix2);
562 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
563 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
565 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
566 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
567 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
/* Lanes with a negative list entry are pointed at scratch instead of the
 * force array. */
571 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
572 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
573 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
574 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
576 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
578 /* Inner loop uses 102 flops */
581 /* End of innermost loop */
/* Hand the accumulated i-atom forces to the update helper together with the
 * f and fshift destinations for this list entry. */
583 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
584 f+i_coord_offset,fshift+i_shift_offset);
587 /* Update potential energies */
588 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
590 /* Increment number of inner iterations */
591 inneriter += j_index_end - j_index_start;
593 /* Outer loop uses 19 flops */
596 /* Increment number of outer iterations */
599 /* Update outer/inner flops */
/* Every inner entry is charged at the 102-flop figure of the masked variant,
 * although the unmasked loop is annotated with 99. */
601 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*102);
604 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
605 * Electrostatics interaction: Ewald
606 * VdW interaction: None
607 * Geometry: Water3-Particle
608 * Calculate force/pot: Force
/*
 * Force-only nonbonded kernel for a three-atom water i-group against single
 * j particles: Ewald electrostatics using the analytical PME correction with
 * an explicit cutoff, no VdW term. No potential is accumulated here.
 *
 * nlist       - neighbor list; jindex, shift and iinr are read from it below.
 * xx, ff      - coordinate and force arrays. NOTE(review): their use is not
 *               visible in this listing; presumably x and f are derived from them.
 * fr          - source of shift_vec, fshift, epsfac, rcoulomb and the Ewald
 *               parameters held in fr->ic.
 * mdatoms     - source of the per-atom charges (chargeA).
 * kernel_data - not referenced in the visible lines of this kernel.
 * nrnb        - flop counter handed to inc_nrnb at the end.
 */
611 nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_avx_128_fma_single
612 (t_nblist * gmx_restrict nlist,
613 rvec * gmx_restrict xx,
614 rvec * gmx_restrict ff,
615 t_forcerec * gmx_restrict fr,
616 t_mdatoms * gmx_restrict mdatoms,
617 nb_kernel_data_t * gmx_restrict kernel_data,
618 t_nrnb * gmx_restrict nrnb)
620 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
621 * just 0 for non-waters.
622 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
623 * jnr indices corresponding to data put in the four positions in the SIMD register.
625 int i_shift_offset,i_coord_offset,outeriter,inneriter;
626 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
627 int jnrA,jnrB,jnrC,jnrD;
628 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
629 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
630 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
632 real *shiftvec,*fshift,*x,*f;
633 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
635 __m128 fscal,rcutoff,rcutoff2,jidxall;
637 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
639 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
641 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
642 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
643 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
644 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
645 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
646 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
647 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
650 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
651 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
653 __m128 dummy_mask,cutoff_mask;
654 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
655 __m128 one = _mm_set1_ps(1.0);
656 __m128 two = _mm_set1_ps(2.0);
662 jindex = nlist->jindex;
664 shiftidx = nlist->shift;
666 shiftvec = fr->shift_vec[0];
667 fshift = fr->fshift[0];
668 facel = _mm_set1_ps(fr->epsfac);
669 charge = mdatoms->chargeA;
671 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
672 beta = _mm_set1_ps(fr->ic->ewaldcoeff);
673 beta2 = _mm_mul_ps(beta,beta);
674 beta3 = _mm_mul_ps(beta,beta2);
675 ewtab = fr->ic->tabq_coul_F;
676 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
677 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
679 /* Setup water-specific parameters */
/* The three i-atom charges are read once, from the first list entry, before the
 * outer loop, and are pre-scaled by facel (fr->epsfac broadcast to all lanes). */
680 inr = nlist->iinr[0];
681 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
682 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
683 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
685 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
686 rcutoff_scalar = fr->rcoulomb;
687 rcutoff = _mm_set1_ps(rcutoff_scalar);
688 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
690 /* Avoid stupid compiler warnings */
691 jnrA = jnrB = jnrC = jnrD = 0;
/* NOTE(review): the body of the loop below is not visible in this listing. */
700 for(iidx=0;iidx<4*DIM;iidx++)
705 /* Start outer loop over neighborlists */
706 for(iidx=0; iidx<nri; iidx++)
708 /* Load shift vector for this list */
709 i_shift_offset = DIM*shiftidx[iidx];
711 /* Load limits for loop over neighbors */
712 j_index_start = jindex[iidx];
713 j_index_end = jindex[iidx+1];
715 /* Get outer coordinate index */
717 i_coord_offset = DIM*inr;
719 /* Load i particle coords and add shift vector */
720 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
721 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
723 fix0 = _mm_setzero_ps();
724 fiy0 = _mm_setzero_ps();
725 fiz0 = _mm_setzero_ps();
726 fix1 = _mm_setzero_ps();
727 fiy1 = _mm_setzero_ps();
728 fiz1 = _mm_setzero_ps();
729 fix2 = _mm_setzero_ps();
730 fiy2 = _mm_setzero_ps();
731 fiz2 = _mm_setzero_ps();
733 /* Start inner kernel loop */
/* Consumes four j entries per iteration and stops at the first group whose last
 * entry is negative. The condition reads jjnr[jidx+3], so this assumes jjnr is
 * readable up to that index - TODO confirm the list padding. */
734 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
737 /* Get j neighbor index, and coordinate index */
742 j_coord_offsetA = DIM*jnrA;
743 j_coord_offsetB = DIM*jnrB;
744 j_coord_offsetC = DIM*jnrC;
745 j_coord_offsetD = DIM*jnrD;
747 /* load j atom coordinates */
748 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
749 x+j_coord_offsetC,x+j_coord_offsetD,
752 /* Calculate displacement vector */
753 dx00 = _mm_sub_ps(ix0,jx0);
754 dy00 = _mm_sub_ps(iy0,jy0);
755 dz00 = _mm_sub_ps(iz0,jz0);
756 dx10 = _mm_sub_ps(ix1,jx0);
757 dy10 = _mm_sub_ps(iy1,jy0);
758 dz10 = _mm_sub_ps(iz1,jz0);
759 dx20 = _mm_sub_ps(ix2,jx0);
760 dy20 = _mm_sub_ps(iy2,jy0);
761 dz20 = _mm_sub_ps(iz2,jz0);
763 /* Calculate squared distance and things based on it */
764 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
765 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
766 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
768 rinv00 = gmx_mm_invsqrt_ps(rsq00);
769 rinv10 = gmx_mm_invsqrt_ps(rsq10);
770 rinv20 = gmx_mm_invsqrt_ps(rsq20);
772 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
773 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
774 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
776 /* Load parameters for j particles */
777 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
778 charge+jnrC+0,charge+jnrD+0);
780 fjx0 = _mm_setzero_ps();
781 fjy0 = _mm_setzero_ps();
782 fjz0 = _mm_setzero_ps();
784 /**************************
785 * CALCULATE INTERACTIONS *
786 **************************/
/* i atom 0 against the four j atoms. NOTE(review): presumably the test below
 * skips the work when no lane lies inside the cutoff; the extent of the guarded
 * region is not visible in this listing. */
788 if (gmx_mm_any_lt(rsq00,rcutoff2))
791 r00 = _mm_mul_ps(rsq00,rinv00);
793 /* Compute parameters for interactions between i and j atoms */
794 qq00 = _mm_mul_ps(iq0,jq0);
796 /* EWALD ELECTROSTATICS */
798 /* Analytical PME correction */
/* felec = pmecorrF*beta3 + rinv3, then multiplied by the charge product qq00. */
799 zeta2 = _mm_mul_ps(beta2,rsq00);
800 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
801 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
802 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
803 felec = _mm_mul_ps(qq00,felec);
/* Lanes with rsq00 < rcutoff2 get an all-ones mask; the AND below zeroes the
 * force scale of every other lane. */
805 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
/* NOTE(review): the statement that sets fscal before this mask is not visible
 * in this listing. */
809 fscal = _mm_and_ps(fscal,cutoff_mask);
811 /* Update vectorial force */
812 fix0 = _mm_macc_ps(dx00,fscal,fix0);
813 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
814 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
816 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
817 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
818 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
822 /**************************
823 * CALCULATE INTERACTIONS *
824 **************************/
/* i atom 1 against the four j atoms; same sequence as for i atom 0. */
826 if (gmx_mm_any_lt(rsq10,rcutoff2))
829 r10 = _mm_mul_ps(rsq10,rinv10);
831 /* Compute parameters for interactions between i and j atoms */
832 qq10 = _mm_mul_ps(iq1,jq0);
834 /* EWALD ELECTROSTATICS */
836 /* Analytical PME correction */
837 zeta2 = _mm_mul_ps(beta2,rsq10);
838 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
839 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
840 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
841 felec = _mm_mul_ps(qq10,felec);
843 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
847 fscal = _mm_and_ps(fscal,cutoff_mask);
849 /* Update vectorial force */
850 fix1 = _mm_macc_ps(dx10,fscal,fix1);
851 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
852 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
854 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
855 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
856 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
860 /**************************
861 * CALCULATE INTERACTIONS *
862 **************************/
/* i atom 2 against the four j atoms; same sequence as for i atom 0. */
864 if (gmx_mm_any_lt(rsq20,rcutoff2))
867 r20 = _mm_mul_ps(rsq20,rinv20);
869 /* Compute parameters for interactions between i and j atoms */
870 qq20 = _mm_mul_ps(iq2,jq0);
872 /* EWALD ELECTROSTATICS */
874 /* Analytical PME correction */
875 zeta2 = _mm_mul_ps(beta2,rsq20);
876 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
877 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
878 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
879 felec = _mm_mul_ps(qq20,felec);
881 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
885 fscal = _mm_and_ps(fscal,cutoff_mask);
887 /* Update vectorial force */
888 fix2 = _mm_macc_ps(dx20,fscal,fix2);
889 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
890 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
892 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
893 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
894 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
/* One force destination per lane, then a single combined update of the four
 * j atoms (presumably a subtraction, going by the helper name). */
898 fjptrA = f+j_coord_offsetA;
899 fjptrB = f+j_coord_offsetB;
900 fjptrC = f+j_coord_offsetC;
901 fjptrD = f+j_coord_offsetD;
903 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
905 /* Inner loop uses 93 flops */
/* Variant of the sequence above for a group of four jjnr entries in which some
 * are negative (dummy): indices are clamped to 0 for the loads, the force scale
 * is cleared with dummy_mask, and forces for dummy lanes are directed to scratch. */
911 /* Get j neighbor index, and coordinate index */
912 jnrlistA = jjnr[jidx];
913 jnrlistB = jjnr[jidx+1];
914 jnrlistC = jjnr[jidx+2];
915 jnrlistD = jjnr[jidx+3];
916 /* Sign of each element will be negative for non-real atoms.
917 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
918 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
920 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
921 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
922 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
923 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
924 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
925 j_coord_offsetA = DIM*jnrA;
926 j_coord_offsetB = DIM*jnrB;
927 j_coord_offsetC = DIM*jnrC;
928 j_coord_offsetD = DIM*jnrD;
930 /* load j atom coordinates */
931 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
932 x+j_coord_offsetC,x+j_coord_offsetD,
935 /* Calculate displacement vector */
936 dx00 = _mm_sub_ps(ix0,jx0);
937 dy00 = _mm_sub_ps(iy0,jy0);
938 dz00 = _mm_sub_ps(iz0,jz0);
939 dx10 = _mm_sub_ps(ix1,jx0);
940 dy10 = _mm_sub_ps(iy1,jy0);
941 dz10 = _mm_sub_ps(iz1,jz0);
942 dx20 = _mm_sub_ps(ix2,jx0);
943 dy20 = _mm_sub_ps(iy2,jy0);
944 dz20 = _mm_sub_ps(iz2,jz0);
946 /* Calculate squared distance and things based on it */
947 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
948 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
949 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
951 rinv00 = gmx_mm_invsqrt_ps(rsq00);
952 rinv10 = gmx_mm_invsqrt_ps(rsq10);
953 rinv20 = gmx_mm_invsqrt_ps(rsq20);
955 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
956 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
957 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
959 /* Load parameters for j particles */
960 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
961 charge+jnrC+0,charge+jnrD+0);
963 fjx0 = _mm_setzero_ps();
964 fjy0 = _mm_setzero_ps();
965 fjz0 = _mm_setzero_ps();
967 /**************************
968 * CALCULATE INTERACTIONS *
969 **************************/
971 if (gmx_mm_any_lt(rsq00,rcutoff2))
974 r00 = _mm_mul_ps(rsq00,rinv00);
/* Clear r00 in the dummy lanes. */
975 r00 = _mm_andnot_ps(dummy_mask,r00);
977 /* Compute parameters for interactions between i and j atoms */
978 qq00 = _mm_mul_ps(iq0,jq0);
980 /* EWALD ELECTROSTATICS */
982 /* Analytical PME correction */
983 zeta2 = _mm_mul_ps(beta2,rsq00);
984 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
985 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
986 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
987 felec = _mm_mul_ps(qq00,felec);
989 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
993 fscal = _mm_and_ps(fscal,cutoff_mask);
/* Dummy lanes must not contribute to the forces. */
995 fscal = _mm_andnot_ps(dummy_mask,fscal);
997 /* Update vectorial force */
998 fix0 = _mm_macc_ps(dx00,fscal,fix0);
999 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1000 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1002 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1003 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1004 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1008 /**************************
1009 * CALCULATE INTERACTIONS *
1010 **************************/
1012 if (gmx_mm_any_lt(rsq10,rcutoff2))
1015 r10 = _mm_mul_ps(rsq10,rinv10);
1016 r10 = _mm_andnot_ps(dummy_mask,r10);
1018 /* Compute parameters for interactions between i and j atoms */
1019 qq10 = _mm_mul_ps(iq1,jq0);
1021 /* EWALD ELECTROSTATICS */
1023 /* Analytical PME correction */
1024 zeta2 = _mm_mul_ps(beta2,rsq10);
1025 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1026 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1027 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1028 felec = _mm_mul_ps(qq10,felec);
1030 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1034 fscal = _mm_and_ps(fscal,cutoff_mask);
1036 fscal = _mm_andnot_ps(dummy_mask,fscal);
1038 /* Update vectorial force */
1039 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1040 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1041 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1043 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1044 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1045 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1049 /**************************
1050 * CALCULATE INTERACTIONS *
1051 **************************/
1053 if (gmx_mm_any_lt(rsq20,rcutoff2))
1056 r20 = _mm_mul_ps(rsq20,rinv20);
1057 r20 = _mm_andnot_ps(dummy_mask,r20);
1059 /* Compute parameters for interactions between i and j atoms */
1060 qq20 = _mm_mul_ps(iq2,jq0);
1062 /* EWALD ELECTROSTATICS */
1064 /* Analytical PME correction */
1065 zeta2 = _mm_mul_ps(beta2,rsq20);
1066 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1067 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1068 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1069 felec = _mm_mul_ps(qq20,felec);
1071 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1075 fscal = _mm_and_ps(fscal,cutoff_mask);
1077 fscal = _mm_andnot_ps(dummy_mask,fscal);
1079 /* Update vectorial force */
1080 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1081 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1082 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1084 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1085 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1086 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
/* Lanes with a negative list entry are pointed at scratch instead of the
 * force array. */
1090 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1091 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1092 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1093 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1095 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1097 /* Inner loop uses 96 flops */
1100 /* End of innermost loop */
/* Hand the accumulated i-atom forces to the update helper together with the
 * f and fshift destinations for this list entry. */
1102 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1103 f+i_coord_offset,fshift+i_shift_offset);
1105 /* Increment number of inner iterations */
1106 inneriter += j_index_end - j_index_start;
1108 /* Outer loop uses 18 flops */
1111 /* Increment number of outer iterations */
1114 /* Update outer/inner flops */
/* Every inner entry is charged at the 96-flop figure of the masked variant,
 * although the unmasked loop is annotated with 93. */
1116 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*96);