2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_128_fma_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
109 __m128 dummy_mask,cutoff_mask;
110 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111 __m128 one = _mm_set1_ps(1.0);
112 __m128 two = _mm_set1_ps(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm_set1_ps(fr->ic->epsfac);
125 charge = mdatoms->chargeA;
127 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
128 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
129 beta2 = _mm_mul_ps(beta,beta);
130 beta3 = _mm_mul_ps(beta,beta2);
131 ewtab = fr->ic->tabq_coul_FDV0;
132 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
133 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
135 /* Setup water-specific parameters */
136 inr = nlist->iinr[0];
137 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
138 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
139 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
141 jq0 = _mm_set1_ps(charge[inr+0]);
142 jq1 = _mm_set1_ps(charge[inr+1]);
143 jq2 = _mm_set1_ps(charge[inr+2]);
144 qq00 = _mm_mul_ps(iq0,jq0);
145 qq01 = _mm_mul_ps(iq0,jq1);
146 qq02 = _mm_mul_ps(iq0,jq2);
147 qq10 = _mm_mul_ps(iq1,jq0);
148 qq11 = _mm_mul_ps(iq1,jq1);
149 qq12 = _mm_mul_ps(iq1,jq2);
150 qq20 = _mm_mul_ps(iq2,jq0);
151 qq21 = _mm_mul_ps(iq2,jq1);
152 qq22 = _mm_mul_ps(iq2,jq2);
154 /* Avoid stupid compiler warnings */
155 jnrA = jnrB = jnrC = jnrD = 0;
164 for(iidx=0;iidx<4*DIM;iidx++)
169 /* Start outer loop over neighborlists */
170 for(iidx=0; iidx<nri; iidx++)
172 /* Load shift vector for this list */
173 i_shift_offset = DIM*shiftidx[iidx];
175 /* Load limits for loop over neighbors */
176 j_index_start = jindex[iidx];
177 j_index_end = jindex[iidx+1];
179 /* Get outer coordinate index */
181 i_coord_offset = DIM*inr;
183 /* Load i particle coords and add shift vector */
184 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
185 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
187 fix0 = _mm_setzero_ps();
188 fiy0 = _mm_setzero_ps();
189 fiz0 = _mm_setzero_ps();
190 fix1 = _mm_setzero_ps();
191 fiy1 = _mm_setzero_ps();
192 fiz1 = _mm_setzero_ps();
193 fix2 = _mm_setzero_ps();
194 fiy2 = _mm_setzero_ps();
195 fiz2 = _mm_setzero_ps();
197 /* Reset potential sums */
198 velecsum = _mm_setzero_ps();
200 /* Start inner kernel loop */
201 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
204 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
211 j_coord_offsetC = DIM*jnrC;
212 j_coord_offsetD = DIM*jnrD;
214 /* load j atom coordinates */
215 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
216 x+j_coord_offsetC,x+j_coord_offsetD,
217 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
219 /* Calculate displacement vector */
220 dx00 = _mm_sub_ps(ix0,jx0);
221 dy00 = _mm_sub_ps(iy0,jy0);
222 dz00 = _mm_sub_ps(iz0,jz0);
223 dx01 = _mm_sub_ps(ix0,jx1);
224 dy01 = _mm_sub_ps(iy0,jy1);
225 dz01 = _mm_sub_ps(iz0,jz1);
226 dx02 = _mm_sub_ps(ix0,jx2);
227 dy02 = _mm_sub_ps(iy0,jy2);
228 dz02 = _mm_sub_ps(iz0,jz2);
229 dx10 = _mm_sub_ps(ix1,jx0);
230 dy10 = _mm_sub_ps(iy1,jy0);
231 dz10 = _mm_sub_ps(iz1,jz0);
232 dx11 = _mm_sub_ps(ix1,jx1);
233 dy11 = _mm_sub_ps(iy1,jy1);
234 dz11 = _mm_sub_ps(iz1,jz1);
235 dx12 = _mm_sub_ps(ix1,jx2);
236 dy12 = _mm_sub_ps(iy1,jy2);
237 dz12 = _mm_sub_ps(iz1,jz2);
238 dx20 = _mm_sub_ps(ix2,jx0);
239 dy20 = _mm_sub_ps(iy2,jy0);
240 dz20 = _mm_sub_ps(iz2,jz0);
241 dx21 = _mm_sub_ps(ix2,jx1);
242 dy21 = _mm_sub_ps(iy2,jy1);
243 dz21 = _mm_sub_ps(iz2,jz1);
244 dx22 = _mm_sub_ps(ix2,jx2);
245 dy22 = _mm_sub_ps(iy2,jy2);
246 dz22 = _mm_sub_ps(iz2,jz2);
248 /* Calculate squared distance and things based on it */
249 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
250 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
251 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
252 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
253 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
254 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
255 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
256 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
257 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
259 rinv00 = avx128fma_invsqrt_f(rsq00);
260 rinv01 = avx128fma_invsqrt_f(rsq01);
261 rinv02 = avx128fma_invsqrt_f(rsq02);
262 rinv10 = avx128fma_invsqrt_f(rsq10);
263 rinv11 = avx128fma_invsqrt_f(rsq11);
264 rinv12 = avx128fma_invsqrt_f(rsq12);
265 rinv20 = avx128fma_invsqrt_f(rsq20);
266 rinv21 = avx128fma_invsqrt_f(rsq21);
267 rinv22 = avx128fma_invsqrt_f(rsq22);
269 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
270 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
271 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
272 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
273 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
274 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
275 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
276 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
277 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
279 fjx0 = _mm_setzero_ps();
280 fjy0 = _mm_setzero_ps();
281 fjz0 = _mm_setzero_ps();
282 fjx1 = _mm_setzero_ps();
283 fjy1 = _mm_setzero_ps();
284 fjz1 = _mm_setzero_ps();
285 fjx2 = _mm_setzero_ps();
286 fjy2 = _mm_setzero_ps();
287 fjz2 = _mm_setzero_ps();
289 /**************************
290 * CALCULATE INTERACTIONS *
291 **************************/
293 r00 = _mm_mul_ps(rsq00,rinv00);
295 /* EWALD ELECTROSTATICS */
297 /* Analytical PME correction */
298 zeta2 = _mm_mul_ps(beta2,rsq00);
299 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
300 pmecorrF = avx128fma_pmecorrF_f(zeta2);
301 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
302 felec = _mm_mul_ps(qq00,felec);
303 pmecorrV = avx128fma_pmecorrV_f(zeta2);
304 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
305 velec = _mm_mul_ps(qq00,velec);
307 /* Update potential sum for this i atom from the interaction with this j atom. */
308 velecsum = _mm_add_ps(velecsum,velec);
312 /* Update vectorial force */
313 fix0 = _mm_macc_ps(dx00,fscal,fix0);
314 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
315 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
317 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
318 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
319 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
321 /**************************
322 * CALCULATE INTERACTIONS *
323 **************************/
325 r01 = _mm_mul_ps(rsq01,rinv01);
327 /* EWALD ELECTROSTATICS */
329 /* Analytical PME correction */
330 zeta2 = _mm_mul_ps(beta2,rsq01);
331 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
332 pmecorrF = avx128fma_pmecorrF_f(zeta2);
333 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
334 felec = _mm_mul_ps(qq01,felec);
335 pmecorrV = avx128fma_pmecorrV_f(zeta2);
336 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
337 velec = _mm_mul_ps(qq01,velec);
339 /* Update potential sum for this i atom from the interaction with this j atom. */
340 velecsum = _mm_add_ps(velecsum,velec);
344 /* Update vectorial force */
345 fix0 = _mm_macc_ps(dx01,fscal,fix0);
346 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
347 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
349 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
350 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
351 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
353 /**************************
354 * CALCULATE INTERACTIONS *
355 **************************/
357 r02 = _mm_mul_ps(rsq02,rinv02);
359 /* EWALD ELECTROSTATICS */
361 /* Analytical PME correction */
362 zeta2 = _mm_mul_ps(beta2,rsq02);
363 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
364 pmecorrF = avx128fma_pmecorrF_f(zeta2);
365 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
366 felec = _mm_mul_ps(qq02,felec);
367 pmecorrV = avx128fma_pmecorrV_f(zeta2);
368 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
369 velec = _mm_mul_ps(qq02,velec);
371 /* Update potential sum for this i atom from the interaction with this j atom. */
372 velecsum = _mm_add_ps(velecsum,velec);
376 /* Update vectorial force */
377 fix0 = _mm_macc_ps(dx02,fscal,fix0);
378 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
379 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
381 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
382 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
383 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
385 /**************************
386 * CALCULATE INTERACTIONS *
387 **************************/
389 r10 = _mm_mul_ps(rsq10,rinv10);
391 /* EWALD ELECTROSTATICS */
393 /* Analytical PME correction */
394 zeta2 = _mm_mul_ps(beta2,rsq10);
395 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
396 pmecorrF = avx128fma_pmecorrF_f(zeta2);
397 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
398 felec = _mm_mul_ps(qq10,felec);
399 pmecorrV = avx128fma_pmecorrV_f(zeta2);
400 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
401 velec = _mm_mul_ps(qq10,velec);
403 /* Update potential sum for this i atom from the interaction with this j atom. */
404 velecsum = _mm_add_ps(velecsum,velec);
408 /* Update vectorial force */
409 fix1 = _mm_macc_ps(dx10,fscal,fix1);
410 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
411 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
413 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
414 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
415 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 r11 = _mm_mul_ps(rsq11,rinv11);
423 /* EWALD ELECTROSTATICS */
425 /* Analytical PME correction */
426 zeta2 = _mm_mul_ps(beta2,rsq11);
427 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
428 pmecorrF = avx128fma_pmecorrF_f(zeta2);
429 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
430 felec = _mm_mul_ps(qq11,felec);
431 pmecorrV = avx128fma_pmecorrV_f(zeta2);
432 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
433 velec = _mm_mul_ps(qq11,velec);
435 /* Update potential sum for this i atom from the interaction with this j atom. */
436 velecsum = _mm_add_ps(velecsum,velec);
440 /* Update vectorial force */
441 fix1 = _mm_macc_ps(dx11,fscal,fix1);
442 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
443 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
445 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
446 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
447 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 r12 = _mm_mul_ps(rsq12,rinv12);
455 /* EWALD ELECTROSTATICS */
457 /* Analytical PME correction */
458 zeta2 = _mm_mul_ps(beta2,rsq12);
459 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
460 pmecorrF = avx128fma_pmecorrF_f(zeta2);
461 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
462 felec = _mm_mul_ps(qq12,felec);
463 pmecorrV = avx128fma_pmecorrV_f(zeta2);
464 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
465 velec = _mm_mul_ps(qq12,velec);
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velecsum = _mm_add_ps(velecsum,velec);
472 /* Update vectorial force */
473 fix1 = _mm_macc_ps(dx12,fscal,fix1);
474 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
475 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
477 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
478 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
479 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
481 /**************************
482 * CALCULATE INTERACTIONS *
483 **************************/
485 r20 = _mm_mul_ps(rsq20,rinv20);
487 /* EWALD ELECTROSTATICS */
489 /* Analytical PME correction */
490 zeta2 = _mm_mul_ps(beta2,rsq20);
491 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
492 pmecorrF = avx128fma_pmecorrF_f(zeta2);
493 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
494 felec = _mm_mul_ps(qq20,felec);
495 pmecorrV = avx128fma_pmecorrV_f(zeta2);
496 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
497 velec = _mm_mul_ps(qq20,velec);
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velecsum = _mm_add_ps(velecsum,velec);
504 /* Update vectorial force */
505 fix2 = _mm_macc_ps(dx20,fscal,fix2);
506 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
507 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
509 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
510 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
511 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
513 /**************************
514 * CALCULATE INTERACTIONS *
515 **************************/
517 r21 = _mm_mul_ps(rsq21,rinv21);
519 /* EWALD ELECTROSTATICS */
521 /* Analytical PME correction */
522 zeta2 = _mm_mul_ps(beta2,rsq21);
523 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
524 pmecorrF = avx128fma_pmecorrF_f(zeta2);
525 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
526 felec = _mm_mul_ps(qq21,felec);
527 pmecorrV = avx128fma_pmecorrV_f(zeta2);
528 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
529 velec = _mm_mul_ps(qq21,velec);
531 /* Update potential sum for this i atom from the interaction with this j atom. */
532 velecsum = _mm_add_ps(velecsum,velec);
536 /* Update vectorial force */
537 fix2 = _mm_macc_ps(dx21,fscal,fix2);
538 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
539 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
541 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
542 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
543 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
545 /**************************
546 * CALCULATE INTERACTIONS *
547 **************************/
549 r22 = _mm_mul_ps(rsq22,rinv22);
551 /* EWALD ELECTROSTATICS */
553 /* Analytical PME correction */
554 zeta2 = _mm_mul_ps(beta2,rsq22);
555 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
556 pmecorrF = avx128fma_pmecorrF_f(zeta2);
557 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
558 felec = _mm_mul_ps(qq22,felec);
559 pmecorrV = avx128fma_pmecorrV_f(zeta2);
560 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
561 velec = _mm_mul_ps(qq22,velec);
563 /* Update potential sum for this i atom from the interaction with this j atom. */
564 velecsum = _mm_add_ps(velecsum,velec);
568 /* Update vectorial force */
569 fix2 = _mm_macc_ps(dx22,fscal,fix2);
570 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
571 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
573 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
574 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
575 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
577 fjptrA = f+j_coord_offsetA;
578 fjptrB = f+j_coord_offsetB;
579 fjptrC = f+j_coord_offsetC;
580 fjptrD = f+j_coord_offsetD;
582 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
583 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
585 /* Inner loop uses 261 flops */
591 /* Get j neighbor index, and coordinate index */
592 jnrlistA = jjnr[jidx];
593 jnrlistB = jjnr[jidx+1];
594 jnrlistC = jjnr[jidx+2];
595 jnrlistD = jjnr[jidx+3];
596 /* Sign of each element will be negative for non-real atoms.
597 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
598 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
600 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
601 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
602 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
603 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
604 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
605 j_coord_offsetA = DIM*jnrA;
606 j_coord_offsetB = DIM*jnrB;
607 j_coord_offsetC = DIM*jnrC;
608 j_coord_offsetD = DIM*jnrD;
610 /* load j atom coordinates */
611 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
612 x+j_coord_offsetC,x+j_coord_offsetD,
613 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
615 /* Calculate displacement vector */
616 dx00 = _mm_sub_ps(ix0,jx0);
617 dy00 = _mm_sub_ps(iy0,jy0);
618 dz00 = _mm_sub_ps(iz0,jz0);
619 dx01 = _mm_sub_ps(ix0,jx1);
620 dy01 = _mm_sub_ps(iy0,jy1);
621 dz01 = _mm_sub_ps(iz0,jz1);
622 dx02 = _mm_sub_ps(ix0,jx2);
623 dy02 = _mm_sub_ps(iy0,jy2);
624 dz02 = _mm_sub_ps(iz0,jz2);
625 dx10 = _mm_sub_ps(ix1,jx0);
626 dy10 = _mm_sub_ps(iy1,jy0);
627 dz10 = _mm_sub_ps(iz1,jz0);
628 dx11 = _mm_sub_ps(ix1,jx1);
629 dy11 = _mm_sub_ps(iy1,jy1);
630 dz11 = _mm_sub_ps(iz1,jz1);
631 dx12 = _mm_sub_ps(ix1,jx2);
632 dy12 = _mm_sub_ps(iy1,jy2);
633 dz12 = _mm_sub_ps(iz1,jz2);
634 dx20 = _mm_sub_ps(ix2,jx0);
635 dy20 = _mm_sub_ps(iy2,jy0);
636 dz20 = _mm_sub_ps(iz2,jz0);
637 dx21 = _mm_sub_ps(ix2,jx1);
638 dy21 = _mm_sub_ps(iy2,jy1);
639 dz21 = _mm_sub_ps(iz2,jz1);
640 dx22 = _mm_sub_ps(ix2,jx2);
641 dy22 = _mm_sub_ps(iy2,jy2);
642 dz22 = _mm_sub_ps(iz2,jz2);
644 /* Calculate squared distance and things based on it */
645 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
646 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
647 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
648 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
649 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
650 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
651 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
652 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
653 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
655 rinv00 = avx128fma_invsqrt_f(rsq00);
656 rinv01 = avx128fma_invsqrt_f(rsq01);
657 rinv02 = avx128fma_invsqrt_f(rsq02);
658 rinv10 = avx128fma_invsqrt_f(rsq10);
659 rinv11 = avx128fma_invsqrt_f(rsq11);
660 rinv12 = avx128fma_invsqrt_f(rsq12);
661 rinv20 = avx128fma_invsqrt_f(rsq20);
662 rinv21 = avx128fma_invsqrt_f(rsq21);
663 rinv22 = avx128fma_invsqrt_f(rsq22);
665 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
666 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
667 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
668 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
669 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
670 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
671 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
672 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
673 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
675 fjx0 = _mm_setzero_ps();
676 fjy0 = _mm_setzero_ps();
677 fjz0 = _mm_setzero_ps();
678 fjx1 = _mm_setzero_ps();
679 fjy1 = _mm_setzero_ps();
680 fjz1 = _mm_setzero_ps();
681 fjx2 = _mm_setzero_ps();
682 fjy2 = _mm_setzero_ps();
683 fjz2 = _mm_setzero_ps();
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
689 r00 = _mm_mul_ps(rsq00,rinv00);
690 r00 = _mm_andnot_ps(dummy_mask,r00);
692 /* EWALD ELECTROSTATICS */
694 /* Analytical PME correction */
695 zeta2 = _mm_mul_ps(beta2,rsq00);
696 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
697 pmecorrF = avx128fma_pmecorrF_f(zeta2);
698 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
699 felec = _mm_mul_ps(qq00,felec);
700 pmecorrV = avx128fma_pmecorrV_f(zeta2);
701 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
702 velec = _mm_mul_ps(qq00,velec);
704 /* Update potential sum for this i atom from the interaction with this j atom. */
705 velec = _mm_andnot_ps(dummy_mask,velec);
706 velecsum = _mm_add_ps(velecsum,velec);
710 fscal = _mm_andnot_ps(dummy_mask,fscal);
712 /* Update vectorial force */
713 fix0 = _mm_macc_ps(dx00,fscal,fix0);
714 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
715 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
717 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
718 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
719 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
721 /**************************
722 * CALCULATE INTERACTIONS *
723 **************************/
725 r01 = _mm_mul_ps(rsq01,rinv01);
726 r01 = _mm_andnot_ps(dummy_mask,r01);
728 /* EWALD ELECTROSTATICS */
730 /* Analytical PME correction */
731 zeta2 = _mm_mul_ps(beta2,rsq01);
732 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
733 pmecorrF = avx128fma_pmecorrF_f(zeta2);
734 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
735 felec = _mm_mul_ps(qq01,felec);
736 pmecorrV = avx128fma_pmecorrV_f(zeta2);
737 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
738 velec = _mm_mul_ps(qq01,velec);
740 /* Update potential sum for this i atom from the interaction with this j atom. */
741 velec = _mm_andnot_ps(dummy_mask,velec);
742 velecsum = _mm_add_ps(velecsum,velec);
746 fscal = _mm_andnot_ps(dummy_mask,fscal);
748 /* Update vectorial force */
749 fix0 = _mm_macc_ps(dx01,fscal,fix0);
750 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
751 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
753 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
754 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
755 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
757 /**************************
758 * CALCULATE INTERACTIONS *
759 **************************/
761 r02 = _mm_mul_ps(rsq02,rinv02);
762 r02 = _mm_andnot_ps(dummy_mask,r02);
764 /* EWALD ELECTROSTATICS */
766 /* Analytical PME correction */
767 zeta2 = _mm_mul_ps(beta2,rsq02);
768 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
769 pmecorrF = avx128fma_pmecorrF_f(zeta2);
770 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
771 felec = _mm_mul_ps(qq02,felec);
772 pmecorrV = avx128fma_pmecorrV_f(zeta2);
773 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
774 velec = _mm_mul_ps(qq02,velec);
776 /* Update potential sum for this i atom from the interaction with this j atom. */
777 velec = _mm_andnot_ps(dummy_mask,velec);
778 velecsum = _mm_add_ps(velecsum,velec);
782 fscal = _mm_andnot_ps(dummy_mask,fscal);
784 /* Update vectorial force */
785 fix0 = _mm_macc_ps(dx02,fscal,fix0);
786 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
787 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
789 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
790 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
791 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
797 r10 = _mm_mul_ps(rsq10,rinv10);
798 r10 = _mm_andnot_ps(dummy_mask,r10);
800 /* EWALD ELECTROSTATICS */
802 /* Analytical PME correction */
803 zeta2 = _mm_mul_ps(beta2,rsq10);
804 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
805 pmecorrF = avx128fma_pmecorrF_f(zeta2);
806 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
807 felec = _mm_mul_ps(qq10,felec);
808 pmecorrV = avx128fma_pmecorrV_f(zeta2);
809 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
810 velec = _mm_mul_ps(qq10,velec);
812 /* Update potential sum for this i atom from the interaction with this j atom. */
813 velec = _mm_andnot_ps(dummy_mask,velec);
814 velecsum = _mm_add_ps(velecsum,velec);
818 fscal = _mm_andnot_ps(dummy_mask,fscal);
820 /* Update vectorial force */
821 fix1 = _mm_macc_ps(dx10,fscal,fix1);
822 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
823 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
825 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
826 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
827 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
829 /**************************
830 * CALCULATE INTERACTIONS *
831 **************************/
833 r11 = _mm_mul_ps(rsq11,rinv11);
834 r11 = _mm_andnot_ps(dummy_mask,r11);
836 /* EWALD ELECTROSTATICS */
838 /* Analytical PME correction */
839 zeta2 = _mm_mul_ps(beta2,rsq11);
840 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
841 pmecorrF = avx128fma_pmecorrF_f(zeta2);
842 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
843 felec = _mm_mul_ps(qq11,felec);
844 pmecorrV = avx128fma_pmecorrV_f(zeta2);
845 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
846 velec = _mm_mul_ps(qq11,velec);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_andnot_ps(dummy_mask,velec);
850 velecsum = _mm_add_ps(velecsum,velec);
854 fscal = _mm_andnot_ps(dummy_mask,fscal);
856 /* Update vectorial force */
857 fix1 = _mm_macc_ps(dx11,fscal,fix1);
858 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
859 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
861 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
862 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
863 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
865 /**************************
866 * CALCULATE INTERACTIONS *
867 **************************/
869 r12 = _mm_mul_ps(rsq12,rinv12);
870 r12 = _mm_andnot_ps(dummy_mask,r12);
872 /* EWALD ELECTROSTATICS */
874 /* Analytical PME correction */
875 zeta2 = _mm_mul_ps(beta2,rsq12);
876 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
877 pmecorrF = avx128fma_pmecorrF_f(zeta2);
878 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
879 felec = _mm_mul_ps(qq12,felec);
880 pmecorrV = avx128fma_pmecorrV_f(zeta2);
881 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
882 velec = _mm_mul_ps(qq12,velec);
884 /* Update potential sum for this i atom from the interaction with this j atom. */
885 velec = _mm_andnot_ps(dummy_mask,velec);
886 velecsum = _mm_add_ps(velecsum,velec);
890 fscal = _mm_andnot_ps(dummy_mask,fscal);
892 /* Update vectorial force */
893 fix1 = _mm_macc_ps(dx12,fscal,fix1);
894 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
895 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
897 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
898 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
899 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
901 /**************************
902 * CALCULATE INTERACTIONS *
903 **************************/
905 r20 = _mm_mul_ps(rsq20,rinv20);
906 r20 = _mm_andnot_ps(dummy_mask,r20);
908 /* EWALD ELECTROSTATICS */
910 /* Analytical PME correction */
911 zeta2 = _mm_mul_ps(beta2,rsq20);
912 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
913 pmecorrF = avx128fma_pmecorrF_f(zeta2);
914 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
915 felec = _mm_mul_ps(qq20,felec);
916 pmecorrV = avx128fma_pmecorrV_f(zeta2);
917 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
918 velec = _mm_mul_ps(qq20,velec);
920 /* Update potential sum for this i atom from the interaction with this j atom. */
921 velec = _mm_andnot_ps(dummy_mask,velec);
922 velecsum = _mm_add_ps(velecsum,velec);
926 fscal = _mm_andnot_ps(dummy_mask,fscal);
928 /* Update vectorial force */
929 fix2 = _mm_macc_ps(dx20,fscal,fix2);
930 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
931 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
933 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
934 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
935 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
937 /**************************
938 * CALCULATE INTERACTIONS *
939 **************************/
941 r21 = _mm_mul_ps(rsq21,rinv21);
942 r21 = _mm_andnot_ps(dummy_mask,r21);
944 /* EWALD ELECTROSTATICS */
946 /* Analytical PME correction */
947 zeta2 = _mm_mul_ps(beta2,rsq21);
948 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
949 pmecorrF = avx128fma_pmecorrF_f(zeta2);
950 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
951 felec = _mm_mul_ps(qq21,felec);
952 pmecorrV = avx128fma_pmecorrV_f(zeta2);
953 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
954 velec = _mm_mul_ps(qq21,velec);
956 /* Update potential sum for this i atom from the interaction with this j atom. */
957 velec = _mm_andnot_ps(dummy_mask,velec);
958 velecsum = _mm_add_ps(velecsum,velec);
962 fscal = _mm_andnot_ps(dummy_mask,fscal);
964 /* Update vectorial force */
965 fix2 = _mm_macc_ps(dx21,fscal,fix2);
966 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
967 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
969 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
970 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
971 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
973 /**************************
974 * CALCULATE INTERACTIONS *
975 **************************/
977 r22 = _mm_mul_ps(rsq22,rinv22);
978 r22 = _mm_andnot_ps(dummy_mask,r22);
980 /* EWALD ELECTROSTATICS */
982 /* Analytical PME correction */
983 zeta2 = _mm_mul_ps(beta2,rsq22);
984 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
985 pmecorrF = avx128fma_pmecorrF_f(zeta2);
986 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
987 felec = _mm_mul_ps(qq22,felec);
988 pmecorrV = avx128fma_pmecorrV_f(zeta2);
989 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
990 velec = _mm_mul_ps(qq22,velec);
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec = _mm_andnot_ps(dummy_mask,velec);
994 velecsum = _mm_add_ps(velecsum,velec);
998 fscal = _mm_andnot_ps(dummy_mask,fscal);
1000 /* Update vectorial force */
1001 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1002 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1003 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1005 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1006 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1007 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1009 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1010 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1011 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1012 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1014 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1015 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1017 /* Inner loop uses 270 flops */
1020 /* End of innermost loop */
1022 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1023 f+i_coord_offset,fshift+i_shift_offset);
1026 /* Update potential energies */
1027 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1029 /* Increment number of inner iterations */
1030 inneriter += j_index_end - j_index_start;
1032 /* Outer loop uses 19 flops */
1035 /* Increment number of outer iterations */
1038 /* Update outer/inner flops */
1040 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*270);
1043 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_single
1044 * Electrostatics interaction: Ewald
1045 * VdW interaction: None
1046 * Geometry: Water3-Water3
1047 * Calculate force/pot: Force
1050 nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_128_fma_single
1051 (t_nblist * gmx_restrict nlist,
1052 rvec * gmx_restrict xx,
1053 rvec * gmx_restrict ff,
1054 struct t_forcerec * gmx_restrict fr,
1055 t_mdatoms * gmx_restrict mdatoms,
1056 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1057 t_nrnb * gmx_restrict nrnb)
1059 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1060 * just 0 for non-waters.
1061 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1062 * jnr indices corresponding to data put in the four positions in the SIMD register.
1064 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1065 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1066 int jnrA,jnrB,jnrC,jnrD;
1067 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1068 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1069 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1070 real rcutoff_scalar;
1071 real *shiftvec,*fshift,*x,*f;
1072 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1073 real scratch[4*DIM];
1074 __m128 fscal,rcutoff,rcutoff2,jidxall;
1076 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1078 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1080 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1081 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1082 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1083 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1084 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1085 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1086 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1087 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1088 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1089 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1090 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1091 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1092 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1093 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1094 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1095 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1096 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1099 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1100 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1102 __m128 dummy_mask,cutoff_mask;
1103 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1104 __m128 one = _mm_set1_ps(1.0);
1105 __m128 two = _mm_set1_ps(2.0);
1111 jindex = nlist->jindex;
1113 shiftidx = nlist->shift;
1115 shiftvec = fr->shift_vec[0];
1116 fshift = fr->fshift[0];
1117 facel = _mm_set1_ps(fr->ic->epsfac);
1118 charge = mdatoms->chargeA;
1120 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1121 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1122 beta2 = _mm_mul_ps(beta,beta);
1123 beta3 = _mm_mul_ps(beta,beta2);
1124 ewtab = fr->ic->tabq_coul_F;
1125 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1126 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1128 /* Setup water-specific parameters */
1129 inr = nlist->iinr[0];
1130 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1131 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1132 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1134 jq0 = _mm_set1_ps(charge[inr+0]);
1135 jq1 = _mm_set1_ps(charge[inr+1]);
1136 jq2 = _mm_set1_ps(charge[inr+2]);
1137 qq00 = _mm_mul_ps(iq0,jq0);
1138 qq01 = _mm_mul_ps(iq0,jq1);
1139 qq02 = _mm_mul_ps(iq0,jq2);
1140 qq10 = _mm_mul_ps(iq1,jq0);
1141 qq11 = _mm_mul_ps(iq1,jq1);
1142 qq12 = _mm_mul_ps(iq1,jq2);
1143 qq20 = _mm_mul_ps(iq2,jq0);
1144 qq21 = _mm_mul_ps(iq2,jq1);
1145 qq22 = _mm_mul_ps(iq2,jq2);
1147 /* Avoid stupid compiler warnings */
1148 jnrA = jnrB = jnrC = jnrD = 0;
1149 j_coord_offsetA = 0;
1150 j_coord_offsetB = 0;
1151 j_coord_offsetC = 0;
1152 j_coord_offsetD = 0;
1157 for(iidx=0;iidx<4*DIM;iidx++)
1159 scratch[iidx] = 0.0;
1162 /* Start outer loop over neighborlists */
1163 for(iidx=0; iidx<nri; iidx++)
1165 /* Load shift vector for this list */
1166 i_shift_offset = DIM*shiftidx[iidx];
1168 /* Load limits for loop over neighbors */
1169 j_index_start = jindex[iidx];
1170 j_index_end = jindex[iidx+1];
1172 /* Get outer coordinate index */
1174 i_coord_offset = DIM*inr;
1176 /* Load i particle coords and add shift vector */
1177 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1178 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1180 fix0 = _mm_setzero_ps();
1181 fiy0 = _mm_setzero_ps();
1182 fiz0 = _mm_setzero_ps();
1183 fix1 = _mm_setzero_ps();
1184 fiy1 = _mm_setzero_ps();
1185 fiz1 = _mm_setzero_ps();
1186 fix2 = _mm_setzero_ps();
1187 fiy2 = _mm_setzero_ps();
1188 fiz2 = _mm_setzero_ps();
1190 /* Start inner kernel loop */
1191 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1194 /* Get j neighbor index, and coordinate index */
1196 jnrB = jjnr[jidx+1];
1197 jnrC = jjnr[jidx+2];
1198 jnrD = jjnr[jidx+3];
1199 j_coord_offsetA = DIM*jnrA;
1200 j_coord_offsetB = DIM*jnrB;
1201 j_coord_offsetC = DIM*jnrC;
1202 j_coord_offsetD = DIM*jnrD;
1204 /* load j atom coordinates */
1205 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1206 x+j_coord_offsetC,x+j_coord_offsetD,
1207 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1209 /* Calculate displacement vector */
1210 dx00 = _mm_sub_ps(ix0,jx0);
1211 dy00 = _mm_sub_ps(iy0,jy0);
1212 dz00 = _mm_sub_ps(iz0,jz0);
1213 dx01 = _mm_sub_ps(ix0,jx1);
1214 dy01 = _mm_sub_ps(iy0,jy1);
1215 dz01 = _mm_sub_ps(iz0,jz1);
1216 dx02 = _mm_sub_ps(ix0,jx2);
1217 dy02 = _mm_sub_ps(iy0,jy2);
1218 dz02 = _mm_sub_ps(iz0,jz2);
1219 dx10 = _mm_sub_ps(ix1,jx0);
1220 dy10 = _mm_sub_ps(iy1,jy0);
1221 dz10 = _mm_sub_ps(iz1,jz0);
1222 dx11 = _mm_sub_ps(ix1,jx1);
1223 dy11 = _mm_sub_ps(iy1,jy1);
1224 dz11 = _mm_sub_ps(iz1,jz1);
1225 dx12 = _mm_sub_ps(ix1,jx2);
1226 dy12 = _mm_sub_ps(iy1,jy2);
1227 dz12 = _mm_sub_ps(iz1,jz2);
1228 dx20 = _mm_sub_ps(ix2,jx0);
1229 dy20 = _mm_sub_ps(iy2,jy0);
1230 dz20 = _mm_sub_ps(iz2,jz0);
1231 dx21 = _mm_sub_ps(ix2,jx1);
1232 dy21 = _mm_sub_ps(iy2,jy1);
1233 dz21 = _mm_sub_ps(iz2,jz1);
1234 dx22 = _mm_sub_ps(ix2,jx2);
1235 dy22 = _mm_sub_ps(iy2,jy2);
1236 dz22 = _mm_sub_ps(iz2,jz2);
1238 /* Calculate squared distance and things based on it */
1239 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1240 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1241 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1242 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1243 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1244 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1245 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1246 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1247 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1249 rinv00 = avx128fma_invsqrt_f(rsq00);
1250 rinv01 = avx128fma_invsqrt_f(rsq01);
1251 rinv02 = avx128fma_invsqrt_f(rsq02);
1252 rinv10 = avx128fma_invsqrt_f(rsq10);
1253 rinv11 = avx128fma_invsqrt_f(rsq11);
1254 rinv12 = avx128fma_invsqrt_f(rsq12);
1255 rinv20 = avx128fma_invsqrt_f(rsq20);
1256 rinv21 = avx128fma_invsqrt_f(rsq21);
1257 rinv22 = avx128fma_invsqrt_f(rsq22);
1259 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1260 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1261 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1262 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1263 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1264 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1265 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1266 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1267 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1269 fjx0 = _mm_setzero_ps();
1270 fjy0 = _mm_setzero_ps();
1271 fjz0 = _mm_setzero_ps();
1272 fjx1 = _mm_setzero_ps();
1273 fjy1 = _mm_setzero_ps();
1274 fjz1 = _mm_setzero_ps();
1275 fjx2 = _mm_setzero_ps();
1276 fjy2 = _mm_setzero_ps();
1277 fjz2 = _mm_setzero_ps();
1279 /**************************
1280 * CALCULATE INTERACTIONS *
1281 **************************/
1283 r00 = _mm_mul_ps(rsq00,rinv00);
1285 /* EWALD ELECTROSTATICS */
1287 /* Analytical PME correction */
1288 zeta2 = _mm_mul_ps(beta2,rsq00);
1289 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1290 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1291 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1292 felec = _mm_mul_ps(qq00,felec);
1296 /* Update vectorial force */
1297 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1298 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1299 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1301 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1302 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1303 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1305 /**************************
1306 * CALCULATE INTERACTIONS *
1307 **************************/
1309 r01 = _mm_mul_ps(rsq01,rinv01);
1311 /* EWALD ELECTROSTATICS */
1313 /* Analytical PME correction */
1314 zeta2 = _mm_mul_ps(beta2,rsq01);
1315 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1316 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1317 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1318 felec = _mm_mul_ps(qq01,felec);
1322 /* Update vectorial force */
1323 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1324 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1325 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1327 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1328 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1329 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1331 /**************************
1332 * CALCULATE INTERACTIONS *
1333 **************************/
1335 r02 = _mm_mul_ps(rsq02,rinv02);
1337 /* EWALD ELECTROSTATICS */
1339 /* Analytical PME correction */
1340 zeta2 = _mm_mul_ps(beta2,rsq02);
1341 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1342 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1343 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1344 felec = _mm_mul_ps(qq02,felec);
1348 /* Update vectorial force */
1349 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1350 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1351 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1353 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1354 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1355 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1357 /**************************
1358 * CALCULATE INTERACTIONS *
1359 **************************/
1361 r10 = _mm_mul_ps(rsq10,rinv10);
1363 /* EWALD ELECTROSTATICS */
1365 /* Analytical PME correction */
1366 zeta2 = _mm_mul_ps(beta2,rsq10);
1367 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1368 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1369 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1370 felec = _mm_mul_ps(qq10,felec);
1374 /* Update vectorial force */
1375 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1376 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1377 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1379 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1380 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1381 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1383 /**************************
1384 * CALCULATE INTERACTIONS *
1385 **************************/
1387 r11 = _mm_mul_ps(rsq11,rinv11);
1389 /* EWALD ELECTROSTATICS */
1391 /* Analytical PME correction */
1392 zeta2 = _mm_mul_ps(beta2,rsq11);
1393 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1394 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1395 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1396 felec = _mm_mul_ps(qq11,felec);
1400 /* Update vectorial force */
1401 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1402 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1403 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1405 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1406 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1407 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1409 /**************************
1410 * CALCULATE INTERACTIONS *
1411 **************************/
1413 r12 = _mm_mul_ps(rsq12,rinv12);
1415 /* EWALD ELECTROSTATICS */
1417 /* Analytical PME correction */
1418 zeta2 = _mm_mul_ps(beta2,rsq12);
1419 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1420 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1421 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1422 felec = _mm_mul_ps(qq12,felec);
1426 /* Update vectorial force */
1427 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1428 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1429 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1431 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1432 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1433 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 r20 = _mm_mul_ps(rsq20,rinv20);
1441 /* EWALD ELECTROSTATICS */
1443 /* Analytical PME correction */
1444 zeta2 = _mm_mul_ps(beta2,rsq20);
1445 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1446 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1447 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1448 felec = _mm_mul_ps(qq20,felec);
1452 /* Update vectorial force */
1453 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1454 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1455 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1457 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1458 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1459 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1461 /**************************
1462 * CALCULATE INTERACTIONS *
1463 **************************/
1465 r21 = _mm_mul_ps(rsq21,rinv21);
1467 /* EWALD ELECTROSTATICS */
1469 /* Analytical PME correction */
1470 zeta2 = _mm_mul_ps(beta2,rsq21);
1471 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1472 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1473 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1474 felec = _mm_mul_ps(qq21,felec);
1478 /* Update vectorial force */
1479 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1480 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1481 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1483 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1484 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1485 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 r22 = _mm_mul_ps(rsq22,rinv22);
1493 /* EWALD ELECTROSTATICS */
1495 /* Analytical PME correction */
1496 zeta2 = _mm_mul_ps(beta2,rsq22);
1497 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1498 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1499 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1500 felec = _mm_mul_ps(qq22,felec);
1504 /* Update vectorial force */
1505 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1506 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1507 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1509 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1510 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1511 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1513 fjptrA = f+j_coord_offsetA;
1514 fjptrB = f+j_coord_offsetB;
1515 fjptrC = f+j_coord_offsetC;
1516 fjptrD = f+j_coord_offsetD;
1518 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1519 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1521 /* Inner loop uses 252 flops */
1524 if(jidx<j_index_end)
1527 /* Get j neighbor index, and coordinate index */
1528 jnrlistA = jjnr[jidx];
1529 jnrlistB = jjnr[jidx+1];
1530 jnrlistC = jjnr[jidx+2];
1531 jnrlistD = jjnr[jidx+3];
1532 /* Sign of each element will be negative for non-real atoms.
1533 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1534 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1536 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1537 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1538 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1539 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1540 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1541 j_coord_offsetA = DIM*jnrA;
1542 j_coord_offsetB = DIM*jnrB;
1543 j_coord_offsetC = DIM*jnrC;
1544 j_coord_offsetD = DIM*jnrD;
1546 /* load j atom coordinates */
1547 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1548 x+j_coord_offsetC,x+j_coord_offsetD,
1549 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1551 /* Calculate displacement vector */
1552 dx00 = _mm_sub_ps(ix0,jx0);
1553 dy00 = _mm_sub_ps(iy0,jy0);
1554 dz00 = _mm_sub_ps(iz0,jz0);
1555 dx01 = _mm_sub_ps(ix0,jx1);
1556 dy01 = _mm_sub_ps(iy0,jy1);
1557 dz01 = _mm_sub_ps(iz0,jz1);
1558 dx02 = _mm_sub_ps(ix0,jx2);
1559 dy02 = _mm_sub_ps(iy0,jy2);
1560 dz02 = _mm_sub_ps(iz0,jz2);
1561 dx10 = _mm_sub_ps(ix1,jx0);
1562 dy10 = _mm_sub_ps(iy1,jy0);
1563 dz10 = _mm_sub_ps(iz1,jz0);
1564 dx11 = _mm_sub_ps(ix1,jx1);
1565 dy11 = _mm_sub_ps(iy1,jy1);
1566 dz11 = _mm_sub_ps(iz1,jz1);
1567 dx12 = _mm_sub_ps(ix1,jx2);
1568 dy12 = _mm_sub_ps(iy1,jy2);
1569 dz12 = _mm_sub_ps(iz1,jz2);
1570 dx20 = _mm_sub_ps(ix2,jx0);
1571 dy20 = _mm_sub_ps(iy2,jy0);
1572 dz20 = _mm_sub_ps(iz2,jz0);
1573 dx21 = _mm_sub_ps(ix2,jx1);
1574 dy21 = _mm_sub_ps(iy2,jy1);
1575 dz21 = _mm_sub_ps(iz2,jz1);
1576 dx22 = _mm_sub_ps(ix2,jx2);
1577 dy22 = _mm_sub_ps(iy2,jy2);
1578 dz22 = _mm_sub_ps(iz2,jz2);
1580 /* Calculate squared distance and things based on it */
1581 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1582 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1583 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1584 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1585 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1586 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1587 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1588 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1589 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1591 rinv00 = avx128fma_invsqrt_f(rsq00);
1592 rinv01 = avx128fma_invsqrt_f(rsq01);
1593 rinv02 = avx128fma_invsqrt_f(rsq02);
1594 rinv10 = avx128fma_invsqrt_f(rsq10);
1595 rinv11 = avx128fma_invsqrt_f(rsq11);
1596 rinv12 = avx128fma_invsqrt_f(rsq12);
1597 rinv20 = avx128fma_invsqrt_f(rsq20);
1598 rinv21 = avx128fma_invsqrt_f(rsq21);
1599 rinv22 = avx128fma_invsqrt_f(rsq22);
1601 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1602 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1603 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1604 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1605 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1606 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1607 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1608 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1609 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1611 fjx0 = _mm_setzero_ps();
1612 fjy0 = _mm_setzero_ps();
1613 fjz0 = _mm_setzero_ps();
1614 fjx1 = _mm_setzero_ps();
1615 fjy1 = _mm_setzero_ps();
1616 fjz1 = _mm_setzero_ps();
1617 fjx2 = _mm_setzero_ps();
1618 fjy2 = _mm_setzero_ps();
1619 fjz2 = _mm_setzero_ps();
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 r00 = _mm_mul_ps(rsq00,rinv00);
1626 r00 = _mm_andnot_ps(dummy_mask,r00);
1628 /* EWALD ELECTROSTATICS */
1630 /* Analytical PME correction */
1631 zeta2 = _mm_mul_ps(beta2,rsq00);
1632 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1633 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1634 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1635 felec = _mm_mul_ps(qq00,felec);
1639 fscal = _mm_andnot_ps(dummy_mask,fscal);
1641 /* Update vectorial force */
1642 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1643 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1644 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1646 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1647 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1648 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1654 r01 = _mm_mul_ps(rsq01,rinv01);
1655 r01 = _mm_andnot_ps(dummy_mask,r01);
1657 /* EWALD ELECTROSTATICS */
1659 /* Analytical PME correction */
1660 zeta2 = _mm_mul_ps(beta2,rsq01);
1661 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1662 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1663 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1664 felec = _mm_mul_ps(qq01,felec);
1668 fscal = _mm_andnot_ps(dummy_mask,fscal);
1670 /* Update vectorial force */
1671 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1672 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1673 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1675 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1676 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1677 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1679 /**************************
1680 * CALCULATE INTERACTIONS *
1681 **************************/
1683 r02 = _mm_mul_ps(rsq02,rinv02);
1684 r02 = _mm_andnot_ps(dummy_mask,r02);
1686 /* EWALD ELECTROSTATICS */
1688 /* Analytical PME correction */
1689 zeta2 = _mm_mul_ps(beta2,rsq02);
1690 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1691 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1692 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1693 felec = _mm_mul_ps(qq02,felec);
1697 fscal = _mm_andnot_ps(dummy_mask,fscal);
1699 /* Update vectorial force */
1700 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1701 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1702 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1704 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1705 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1706 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1708 /**************************
1709 * CALCULATE INTERACTIONS *
1710 **************************/
1712 r10 = _mm_mul_ps(rsq10,rinv10);
1713 r10 = _mm_andnot_ps(dummy_mask,r10);
1715 /* EWALD ELECTROSTATICS */
1717 /* Analytical PME correction */
1718 zeta2 = _mm_mul_ps(beta2,rsq10);
1719 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1720 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1721 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1722 felec = _mm_mul_ps(qq10,felec);
1726 fscal = _mm_andnot_ps(dummy_mask,fscal);
1728 /* Update vectorial force */
1729 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1730 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1731 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1733 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1734 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1735 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1737 /**************************
1738 * CALCULATE INTERACTIONS *
1739 **************************/
1741 r11 = _mm_mul_ps(rsq11,rinv11);
1742 r11 = _mm_andnot_ps(dummy_mask,r11);
1744 /* EWALD ELECTROSTATICS */
1746 /* Analytical PME correction */
1747 zeta2 = _mm_mul_ps(beta2,rsq11);
1748 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1749 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1750 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1751 felec = _mm_mul_ps(qq11,felec);
1755 fscal = _mm_andnot_ps(dummy_mask,fscal);
1757 /* Update vectorial force */
1758 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1759 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1760 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1762 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1763 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1764 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1766 /**************************
1767 * CALCULATE INTERACTIONS *
1768 **************************/
1770 r12 = _mm_mul_ps(rsq12,rinv12);
1771 r12 = _mm_andnot_ps(dummy_mask,r12);
1773 /* EWALD ELECTROSTATICS */
1775 /* Analytical PME correction */
1776 zeta2 = _mm_mul_ps(beta2,rsq12);
1777 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1778 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1779 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1780 felec = _mm_mul_ps(qq12,felec);
1784 fscal = _mm_andnot_ps(dummy_mask,fscal);
1786 /* Update vectorial force */
1787 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1788 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1789 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1791 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1792 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1793 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1795 /**************************
1796 * CALCULATE INTERACTIONS *
1797 **************************/
1799 r20 = _mm_mul_ps(rsq20,rinv20);
1800 r20 = _mm_andnot_ps(dummy_mask,r20);
1802 /* EWALD ELECTROSTATICS */
1804 /* Analytical PME correction */
1805 zeta2 = _mm_mul_ps(beta2,rsq20);
1806 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1807 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1808 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1809 felec = _mm_mul_ps(qq20,felec);
1813 fscal = _mm_andnot_ps(dummy_mask,fscal);
1815 /* Update vectorial force */
1816 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1817 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1818 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1820 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1821 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1822 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1824 /**************************
1825 * CALCULATE INTERACTIONS *
1826 **************************/
1828 r21 = _mm_mul_ps(rsq21,rinv21);
1829 r21 = _mm_andnot_ps(dummy_mask,r21);
1831 /* EWALD ELECTROSTATICS */
1833 /* Analytical PME correction */
1834 zeta2 = _mm_mul_ps(beta2,rsq21);
1835 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1836 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1837 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1838 felec = _mm_mul_ps(qq21,felec);
1842 fscal = _mm_andnot_ps(dummy_mask,fscal);
1844 /* Update vectorial force */
1845 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1846 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1847 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1849 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1850 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1851 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 r22 = _mm_mul_ps(rsq22,rinv22);
1858 r22 = _mm_andnot_ps(dummy_mask,r22);
1860 /* EWALD ELECTROSTATICS */
1862 /* Analytical PME correction */
1863 zeta2 = _mm_mul_ps(beta2,rsq22);
1864 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1865 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1866 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1867 felec = _mm_mul_ps(qq22,felec);
1871 fscal = _mm_andnot_ps(dummy_mask,fscal);
1873 /* Update vectorial force */
1874 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1875 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1876 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1878 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1879 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1880 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1882 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1883 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1884 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1885 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1887 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1888 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1890 /* Inner loop uses 261 flops */
1893 /* End of innermost loop */
1895 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1896 f+i_coord_offset,fshift+i_shift_offset);
1898 /* Increment number of inner iterations */
1899 inneriter += j_index_end - j_index_start;
1901 /* Outer loop uses 18 flops */
1904 /* Increment number of outer iterations */
1907 /* Update outer/inner flops */
1909 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*261);