2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_128_fma_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_128_fma_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
109 __m128 dummy_mask,cutoff_mask;
110 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111 __m128 one = _mm_set1_ps(1.0);
112 __m128 two = _mm_set1_ps(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm_set1_ps(fr->ic->epsfac);
125 charge = mdatoms->chargeA;
127 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
128 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
129 beta2 = _mm_mul_ps(beta,beta);
130 beta3 = _mm_mul_ps(beta,beta2);
131 ewtab = fr->ic->tabq_coul_FDV0;
132 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
133 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
135 /* Setup water-specific parameters */
136 inr = nlist->iinr[0];
137 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
138 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
139 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
141 jq0 = _mm_set1_ps(charge[inr+0]);
142 jq1 = _mm_set1_ps(charge[inr+1]);
143 jq2 = _mm_set1_ps(charge[inr+2]);
144 qq00 = _mm_mul_ps(iq0,jq0);
145 qq01 = _mm_mul_ps(iq0,jq1);
146 qq02 = _mm_mul_ps(iq0,jq2);
147 qq10 = _mm_mul_ps(iq1,jq0);
148 qq11 = _mm_mul_ps(iq1,jq1);
149 qq12 = _mm_mul_ps(iq1,jq2);
150 qq20 = _mm_mul_ps(iq2,jq0);
151 qq21 = _mm_mul_ps(iq2,jq1);
152 qq22 = _mm_mul_ps(iq2,jq2);
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->ic->rcoulomb;
156 rcutoff = _mm_set1_ps(rcutoff_scalar);
157 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
159 /* Avoid stupid compiler warnings */
160 jnrA = jnrB = jnrC = jnrD = 0;
169 for(iidx=0;iidx<4*DIM;iidx++)
174 /* Start outer loop over neighborlists */
175 for(iidx=0; iidx<nri; iidx++)
177 /* Load shift vector for this list */
178 i_shift_offset = DIM*shiftidx[iidx];
180 /* Load limits for loop over neighbors */
181 j_index_start = jindex[iidx];
182 j_index_end = jindex[iidx+1];
184 /* Get outer coordinate index */
186 i_coord_offset = DIM*inr;
188 /* Load i particle coords and add shift vector */
189 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
190 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
192 fix0 = _mm_setzero_ps();
193 fiy0 = _mm_setzero_ps();
194 fiz0 = _mm_setzero_ps();
195 fix1 = _mm_setzero_ps();
196 fiy1 = _mm_setzero_ps();
197 fiz1 = _mm_setzero_ps();
198 fix2 = _mm_setzero_ps();
199 fiy2 = _mm_setzero_ps();
200 fiz2 = _mm_setzero_ps();
202 /* Reset potential sums */
203 velecsum = _mm_setzero_ps();
205 /* Start inner kernel loop */
206 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
209 /* Get j neighbor index, and coordinate index */
214 j_coord_offsetA = DIM*jnrA;
215 j_coord_offsetB = DIM*jnrB;
216 j_coord_offsetC = DIM*jnrC;
217 j_coord_offsetD = DIM*jnrD;
219 /* load j atom coordinates */
220 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
221 x+j_coord_offsetC,x+j_coord_offsetD,
222 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
224 /* Calculate displacement vector */
225 dx00 = _mm_sub_ps(ix0,jx0);
226 dy00 = _mm_sub_ps(iy0,jy0);
227 dz00 = _mm_sub_ps(iz0,jz0);
228 dx01 = _mm_sub_ps(ix0,jx1);
229 dy01 = _mm_sub_ps(iy0,jy1);
230 dz01 = _mm_sub_ps(iz0,jz1);
231 dx02 = _mm_sub_ps(ix0,jx2);
232 dy02 = _mm_sub_ps(iy0,jy2);
233 dz02 = _mm_sub_ps(iz0,jz2);
234 dx10 = _mm_sub_ps(ix1,jx0);
235 dy10 = _mm_sub_ps(iy1,jy0);
236 dz10 = _mm_sub_ps(iz1,jz0);
237 dx11 = _mm_sub_ps(ix1,jx1);
238 dy11 = _mm_sub_ps(iy1,jy1);
239 dz11 = _mm_sub_ps(iz1,jz1);
240 dx12 = _mm_sub_ps(ix1,jx2);
241 dy12 = _mm_sub_ps(iy1,jy2);
242 dz12 = _mm_sub_ps(iz1,jz2);
243 dx20 = _mm_sub_ps(ix2,jx0);
244 dy20 = _mm_sub_ps(iy2,jy0);
245 dz20 = _mm_sub_ps(iz2,jz0);
246 dx21 = _mm_sub_ps(ix2,jx1);
247 dy21 = _mm_sub_ps(iy2,jy1);
248 dz21 = _mm_sub_ps(iz2,jz1);
249 dx22 = _mm_sub_ps(ix2,jx2);
250 dy22 = _mm_sub_ps(iy2,jy2);
251 dz22 = _mm_sub_ps(iz2,jz2);
253 /* Calculate squared distance and things based on it */
254 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
255 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
256 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
257 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
258 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
259 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
260 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
261 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
262 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
264 rinv00 = avx128fma_invsqrt_f(rsq00);
265 rinv01 = avx128fma_invsqrt_f(rsq01);
266 rinv02 = avx128fma_invsqrt_f(rsq02);
267 rinv10 = avx128fma_invsqrt_f(rsq10);
268 rinv11 = avx128fma_invsqrt_f(rsq11);
269 rinv12 = avx128fma_invsqrt_f(rsq12);
270 rinv20 = avx128fma_invsqrt_f(rsq20);
271 rinv21 = avx128fma_invsqrt_f(rsq21);
272 rinv22 = avx128fma_invsqrt_f(rsq22);
274 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
275 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
276 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
277 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
278 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
279 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
280 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
281 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
282 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
284 fjx0 = _mm_setzero_ps();
285 fjy0 = _mm_setzero_ps();
286 fjz0 = _mm_setzero_ps();
287 fjx1 = _mm_setzero_ps();
288 fjy1 = _mm_setzero_ps();
289 fjz1 = _mm_setzero_ps();
290 fjx2 = _mm_setzero_ps();
291 fjy2 = _mm_setzero_ps();
292 fjz2 = _mm_setzero_ps();
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 if (gmx_mm_any_lt(rsq00,rcutoff2))
301 r00 = _mm_mul_ps(rsq00,rinv00);
303 /* EWALD ELECTROSTATICS */
305 /* Analytical PME correction */
306 zeta2 = _mm_mul_ps(beta2,rsq00);
307 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
308 pmecorrF = avx128fma_pmecorrF_f(zeta2);
309 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
310 felec = _mm_mul_ps(qq00,felec);
311 pmecorrV = avx128fma_pmecorrV_f(zeta2);
312 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
313 velec = _mm_mul_ps(qq00,velec);
315 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
317 /* Update potential sum for this i atom from the interaction with this j atom. */
318 velec = _mm_and_ps(velec,cutoff_mask);
319 velecsum = _mm_add_ps(velecsum,velec);
323 fscal = _mm_and_ps(fscal,cutoff_mask);
325 /* Update vectorial force */
326 fix0 = _mm_macc_ps(dx00,fscal,fix0);
327 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
328 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
330 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
331 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
332 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
336 /**************************
337 * CALCULATE INTERACTIONS *
338 **************************/
340 if (gmx_mm_any_lt(rsq01,rcutoff2))
343 r01 = _mm_mul_ps(rsq01,rinv01);
345 /* EWALD ELECTROSTATICS */
347 /* Analytical PME correction */
348 zeta2 = _mm_mul_ps(beta2,rsq01);
349 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
350 pmecorrF = avx128fma_pmecorrF_f(zeta2);
351 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
352 felec = _mm_mul_ps(qq01,felec);
353 pmecorrV = avx128fma_pmecorrV_f(zeta2);
354 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
355 velec = _mm_mul_ps(qq01,velec);
357 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
359 /* Update potential sum for this i atom from the interaction with this j atom. */
360 velec = _mm_and_ps(velec,cutoff_mask);
361 velecsum = _mm_add_ps(velecsum,velec);
365 fscal = _mm_and_ps(fscal,cutoff_mask);
367 /* Update vectorial force */
368 fix0 = _mm_macc_ps(dx01,fscal,fix0);
369 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
370 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
372 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
373 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
374 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 if (gmx_mm_any_lt(rsq02,rcutoff2))
385 r02 = _mm_mul_ps(rsq02,rinv02);
387 /* EWALD ELECTROSTATICS */
389 /* Analytical PME correction */
390 zeta2 = _mm_mul_ps(beta2,rsq02);
391 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
392 pmecorrF = avx128fma_pmecorrF_f(zeta2);
393 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
394 felec = _mm_mul_ps(qq02,felec);
395 pmecorrV = avx128fma_pmecorrV_f(zeta2);
396 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
397 velec = _mm_mul_ps(qq02,velec);
399 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velec = _mm_and_ps(velec,cutoff_mask);
403 velecsum = _mm_add_ps(velecsum,velec);
407 fscal = _mm_and_ps(fscal,cutoff_mask);
409 /* Update vectorial force */
410 fix0 = _mm_macc_ps(dx02,fscal,fix0);
411 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
412 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
414 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
415 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
416 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
420 /**************************
421 * CALCULATE INTERACTIONS *
422 **************************/
424 if (gmx_mm_any_lt(rsq10,rcutoff2))
427 r10 = _mm_mul_ps(rsq10,rinv10);
429 /* EWALD ELECTROSTATICS */
431 /* Analytical PME correction */
432 zeta2 = _mm_mul_ps(beta2,rsq10);
433 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
434 pmecorrF = avx128fma_pmecorrF_f(zeta2);
435 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
436 felec = _mm_mul_ps(qq10,felec);
437 pmecorrV = avx128fma_pmecorrV_f(zeta2);
438 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
439 velec = _mm_mul_ps(qq10,velec);
441 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velec = _mm_and_ps(velec,cutoff_mask);
445 velecsum = _mm_add_ps(velecsum,velec);
449 fscal = _mm_and_ps(fscal,cutoff_mask);
451 /* Update vectorial force */
452 fix1 = _mm_macc_ps(dx10,fscal,fix1);
453 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
454 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
456 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
457 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
458 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 if (gmx_mm_any_lt(rsq11,rcutoff2))
469 r11 = _mm_mul_ps(rsq11,rinv11);
471 /* EWALD ELECTROSTATICS */
473 /* Analytical PME correction */
474 zeta2 = _mm_mul_ps(beta2,rsq11);
475 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
476 pmecorrF = avx128fma_pmecorrF_f(zeta2);
477 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
478 felec = _mm_mul_ps(qq11,felec);
479 pmecorrV = avx128fma_pmecorrV_f(zeta2);
480 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
481 velec = _mm_mul_ps(qq11,velec);
483 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velec = _mm_and_ps(velec,cutoff_mask);
487 velecsum = _mm_add_ps(velecsum,velec);
491 fscal = _mm_and_ps(fscal,cutoff_mask);
493 /* Update vectorial force */
494 fix1 = _mm_macc_ps(dx11,fscal,fix1);
495 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
496 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
498 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
499 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
500 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 if (gmx_mm_any_lt(rsq12,rcutoff2))
511 r12 = _mm_mul_ps(rsq12,rinv12);
513 /* EWALD ELECTROSTATICS */
515 /* Analytical PME correction */
516 zeta2 = _mm_mul_ps(beta2,rsq12);
517 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
518 pmecorrF = avx128fma_pmecorrF_f(zeta2);
519 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
520 felec = _mm_mul_ps(qq12,felec);
521 pmecorrV = avx128fma_pmecorrV_f(zeta2);
522 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
523 velec = _mm_mul_ps(qq12,velec);
525 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velec = _mm_and_ps(velec,cutoff_mask);
529 velecsum = _mm_add_ps(velecsum,velec);
533 fscal = _mm_and_ps(fscal,cutoff_mask);
535 /* Update vectorial force */
536 fix1 = _mm_macc_ps(dx12,fscal,fix1);
537 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
538 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
540 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
541 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
542 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
550 if (gmx_mm_any_lt(rsq20,rcutoff2))
553 r20 = _mm_mul_ps(rsq20,rinv20);
555 /* EWALD ELECTROSTATICS */
557 /* Analytical PME correction */
558 zeta2 = _mm_mul_ps(beta2,rsq20);
559 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
560 pmecorrF = avx128fma_pmecorrF_f(zeta2);
561 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
562 felec = _mm_mul_ps(qq20,felec);
563 pmecorrV = avx128fma_pmecorrV_f(zeta2);
564 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
565 velec = _mm_mul_ps(qq20,velec);
567 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velec = _mm_and_ps(velec,cutoff_mask);
571 velecsum = _mm_add_ps(velecsum,velec);
575 fscal = _mm_and_ps(fscal,cutoff_mask);
577 /* Update vectorial force */
578 fix2 = _mm_macc_ps(dx20,fscal,fix2);
579 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
580 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
582 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
583 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
584 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
592 if (gmx_mm_any_lt(rsq21,rcutoff2))
595 r21 = _mm_mul_ps(rsq21,rinv21);
597 /* EWALD ELECTROSTATICS */
599 /* Analytical PME correction */
600 zeta2 = _mm_mul_ps(beta2,rsq21);
601 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
602 pmecorrF = avx128fma_pmecorrF_f(zeta2);
603 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
604 felec = _mm_mul_ps(qq21,felec);
605 pmecorrV = avx128fma_pmecorrV_f(zeta2);
606 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
607 velec = _mm_mul_ps(qq21,velec);
609 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velec = _mm_and_ps(velec,cutoff_mask);
613 velecsum = _mm_add_ps(velecsum,velec);
617 fscal = _mm_and_ps(fscal,cutoff_mask);
619 /* Update vectorial force */
620 fix2 = _mm_macc_ps(dx21,fscal,fix2);
621 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
622 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
624 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
625 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
626 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 if (gmx_mm_any_lt(rsq22,rcutoff2))
637 r22 = _mm_mul_ps(rsq22,rinv22);
639 /* EWALD ELECTROSTATICS */
641 /* Analytical PME correction */
642 zeta2 = _mm_mul_ps(beta2,rsq22);
643 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
644 pmecorrF = avx128fma_pmecorrF_f(zeta2);
645 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
646 felec = _mm_mul_ps(qq22,felec);
647 pmecorrV = avx128fma_pmecorrV_f(zeta2);
648 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
649 velec = _mm_mul_ps(qq22,velec);
651 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
653 /* Update potential sum for this i atom from the interaction with this j atom. */
654 velec = _mm_and_ps(velec,cutoff_mask);
655 velecsum = _mm_add_ps(velecsum,velec);
659 fscal = _mm_and_ps(fscal,cutoff_mask);
661 /* Update vectorial force */
662 fix2 = _mm_macc_ps(dx22,fscal,fix2);
663 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
664 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
666 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
667 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
668 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
672 fjptrA = f+j_coord_offsetA;
673 fjptrB = f+j_coord_offsetB;
674 fjptrC = f+j_coord_offsetC;
675 fjptrD = f+j_coord_offsetD;
677 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
678 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
680 /* Inner loop uses 297 flops */
686 /* Get j neighbor index, and coordinate index */
687 jnrlistA = jjnr[jidx];
688 jnrlistB = jjnr[jidx+1];
689 jnrlistC = jjnr[jidx+2];
690 jnrlistD = jjnr[jidx+3];
691 /* Sign of each element will be negative for non-real atoms.
692 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
693 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
695 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
696 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
697 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
698 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
699 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
700 j_coord_offsetA = DIM*jnrA;
701 j_coord_offsetB = DIM*jnrB;
702 j_coord_offsetC = DIM*jnrC;
703 j_coord_offsetD = DIM*jnrD;
705 /* load j atom coordinates */
706 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
707 x+j_coord_offsetC,x+j_coord_offsetD,
708 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
710 /* Calculate displacement vector */
711 dx00 = _mm_sub_ps(ix0,jx0);
712 dy00 = _mm_sub_ps(iy0,jy0);
713 dz00 = _mm_sub_ps(iz0,jz0);
714 dx01 = _mm_sub_ps(ix0,jx1);
715 dy01 = _mm_sub_ps(iy0,jy1);
716 dz01 = _mm_sub_ps(iz0,jz1);
717 dx02 = _mm_sub_ps(ix0,jx2);
718 dy02 = _mm_sub_ps(iy0,jy2);
719 dz02 = _mm_sub_ps(iz0,jz2);
720 dx10 = _mm_sub_ps(ix1,jx0);
721 dy10 = _mm_sub_ps(iy1,jy0);
722 dz10 = _mm_sub_ps(iz1,jz0);
723 dx11 = _mm_sub_ps(ix1,jx1);
724 dy11 = _mm_sub_ps(iy1,jy1);
725 dz11 = _mm_sub_ps(iz1,jz1);
726 dx12 = _mm_sub_ps(ix1,jx2);
727 dy12 = _mm_sub_ps(iy1,jy2);
728 dz12 = _mm_sub_ps(iz1,jz2);
729 dx20 = _mm_sub_ps(ix2,jx0);
730 dy20 = _mm_sub_ps(iy2,jy0);
731 dz20 = _mm_sub_ps(iz2,jz0);
732 dx21 = _mm_sub_ps(ix2,jx1);
733 dy21 = _mm_sub_ps(iy2,jy1);
734 dz21 = _mm_sub_ps(iz2,jz1);
735 dx22 = _mm_sub_ps(ix2,jx2);
736 dy22 = _mm_sub_ps(iy2,jy2);
737 dz22 = _mm_sub_ps(iz2,jz2);
739 /* Calculate squared distance and things based on it */
740 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
741 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
742 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
743 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
744 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
745 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
746 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
747 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
748 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
750 rinv00 = avx128fma_invsqrt_f(rsq00);
751 rinv01 = avx128fma_invsqrt_f(rsq01);
752 rinv02 = avx128fma_invsqrt_f(rsq02);
753 rinv10 = avx128fma_invsqrt_f(rsq10);
754 rinv11 = avx128fma_invsqrt_f(rsq11);
755 rinv12 = avx128fma_invsqrt_f(rsq12);
756 rinv20 = avx128fma_invsqrt_f(rsq20);
757 rinv21 = avx128fma_invsqrt_f(rsq21);
758 rinv22 = avx128fma_invsqrt_f(rsq22);
760 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
761 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
762 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
763 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
764 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
765 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
766 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
767 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
768 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
770 fjx0 = _mm_setzero_ps();
771 fjy0 = _mm_setzero_ps();
772 fjz0 = _mm_setzero_ps();
773 fjx1 = _mm_setzero_ps();
774 fjy1 = _mm_setzero_ps();
775 fjz1 = _mm_setzero_ps();
776 fjx2 = _mm_setzero_ps();
777 fjy2 = _mm_setzero_ps();
778 fjz2 = _mm_setzero_ps();
780 /**************************
781 * CALCULATE INTERACTIONS *
782 **************************/
784 if (gmx_mm_any_lt(rsq00,rcutoff2))
787 r00 = _mm_mul_ps(rsq00,rinv00);
788 r00 = _mm_andnot_ps(dummy_mask,r00);
790 /* EWALD ELECTROSTATICS */
792 /* Analytical PME correction */
793 zeta2 = _mm_mul_ps(beta2,rsq00);
794 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
795 pmecorrF = avx128fma_pmecorrF_f(zeta2);
796 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
797 felec = _mm_mul_ps(qq00,felec);
798 pmecorrV = avx128fma_pmecorrV_f(zeta2);
799 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
800 velec = _mm_mul_ps(qq00,velec);
802 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
804 /* Update potential sum for this i atom from the interaction with this j atom. */
805 velec = _mm_and_ps(velec,cutoff_mask);
806 velec = _mm_andnot_ps(dummy_mask,velec);
807 velecsum = _mm_add_ps(velecsum,velec);
811 fscal = _mm_and_ps(fscal,cutoff_mask);
813 fscal = _mm_andnot_ps(dummy_mask,fscal);
815 /* Update vectorial force */
816 fix0 = _mm_macc_ps(dx00,fscal,fix0);
817 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
818 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
820 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
821 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
822 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
826 /**************************
827 * CALCULATE INTERACTIONS *
828 **************************/
830 if (gmx_mm_any_lt(rsq01,rcutoff2))
833 r01 = _mm_mul_ps(rsq01,rinv01);
834 r01 = _mm_andnot_ps(dummy_mask,r01);
836 /* EWALD ELECTROSTATICS */
838 /* Analytical PME correction */
839 zeta2 = _mm_mul_ps(beta2,rsq01);
840 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
841 pmecorrF = avx128fma_pmecorrF_f(zeta2);
842 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
843 felec = _mm_mul_ps(qq01,felec);
844 pmecorrV = avx128fma_pmecorrV_f(zeta2);
845 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
846 velec = _mm_mul_ps(qq01,velec);
848 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
850 /* Update potential sum for this i atom from the interaction with this j atom. */
851 velec = _mm_and_ps(velec,cutoff_mask);
852 velec = _mm_andnot_ps(dummy_mask,velec);
853 velecsum = _mm_add_ps(velecsum,velec);
857 fscal = _mm_and_ps(fscal,cutoff_mask);
859 fscal = _mm_andnot_ps(dummy_mask,fscal);
861 /* Update vectorial force */
862 fix0 = _mm_macc_ps(dx01,fscal,fix0);
863 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
864 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
866 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
867 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
868 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
872 /**************************
873 * CALCULATE INTERACTIONS *
874 **************************/
876 if (gmx_mm_any_lt(rsq02,rcutoff2))
879 r02 = _mm_mul_ps(rsq02,rinv02);
880 r02 = _mm_andnot_ps(dummy_mask,r02);
882 /* EWALD ELECTROSTATICS */
884 /* Analytical PME correction */
885 zeta2 = _mm_mul_ps(beta2,rsq02);
886 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
887 pmecorrF = avx128fma_pmecorrF_f(zeta2);
888 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
889 felec = _mm_mul_ps(qq02,felec);
890 pmecorrV = avx128fma_pmecorrV_f(zeta2);
891 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
892 velec = _mm_mul_ps(qq02,velec);
894 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 velec = _mm_and_ps(velec,cutoff_mask);
898 velec = _mm_andnot_ps(dummy_mask,velec);
899 velecsum = _mm_add_ps(velecsum,velec);
903 fscal = _mm_and_ps(fscal,cutoff_mask);
905 fscal = _mm_andnot_ps(dummy_mask,fscal);
907 /* Update vectorial force */
908 fix0 = _mm_macc_ps(dx02,fscal,fix0);
909 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
910 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
912 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
913 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
914 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 if (gmx_mm_any_lt(rsq10,rcutoff2))
925 r10 = _mm_mul_ps(rsq10,rinv10);
926 r10 = _mm_andnot_ps(dummy_mask,r10);
928 /* EWALD ELECTROSTATICS */
930 /* Analytical PME correction */
931 zeta2 = _mm_mul_ps(beta2,rsq10);
932 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
933 pmecorrF = avx128fma_pmecorrF_f(zeta2);
934 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
935 felec = _mm_mul_ps(qq10,felec);
936 pmecorrV = avx128fma_pmecorrV_f(zeta2);
937 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
938 velec = _mm_mul_ps(qq10,velec);
940 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
942 /* Update potential sum for this i atom from the interaction with this j atom. */
943 velec = _mm_and_ps(velec,cutoff_mask);
944 velec = _mm_andnot_ps(dummy_mask,velec);
945 velecsum = _mm_add_ps(velecsum,velec);
949 fscal = _mm_and_ps(fscal,cutoff_mask);
951 fscal = _mm_andnot_ps(dummy_mask,fscal);
953 /* Update vectorial force */
954 fix1 = _mm_macc_ps(dx10,fscal,fix1);
955 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
956 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
958 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
959 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
960 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
964 /**************************
965 * CALCULATE INTERACTIONS *
966 **************************/
968 if (gmx_mm_any_lt(rsq11,rcutoff2))
971 r11 = _mm_mul_ps(rsq11,rinv11);
972 r11 = _mm_andnot_ps(dummy_mask,r11);
974 /* EWALD ELECTROSTATICS */
976 /* Analytical PME correction */
977 zeta2 = _mm_mul_ps(beta2,rsq11);
978 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
979 pmecorrF = avx128fma_pmecorrF_f(zeta2);
980 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
981 felec = _mm_mul_ps(qq11,felec);
982 pmecorrV = avx128fma_pmecorrV_f(zeta2);
983 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
984 velec = _mm_mul_ps(qq11,velec);
986 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
988 /* Update potential sum for this i atom from the interaction with this j atom. */
989 velec = _mm_and_ps(velec,cutoff_mask);
990 velec = _mm_andnot_ps(dummy_mask,velec);
991 velecsum = _mm_add_ps(velecsum,velec);
995 fscal = _mm_and_ps(fscal,cutoff_mask);
997 fscal = _mm_andnot_ps(dummy_mask,fscal);
999 /* Update vectorial force */
1000 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1001 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1002 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1004 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1005 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1006 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1010 /**************************
1011 * CALCULATE INTERACTIONS *
1012 **************************/
1014 if (gmx_mm_any_lt(rsq12,rcutoff2))
1017 r12 = _mm_mul_ps(rsq12,rinv12);
1018 r12 = _mm_andnot_ps(dummy_mask,r12);
1020 /* EWALD ELECTROSTATICS */
1022 /* Analytical PME correction */
1023 zeta2 = _mm_mul_ps(beta2,rsq12);
1024 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1025 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1026 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1027 felec = _mm_mul_ps(qq12,felec);
1028 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1029 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
1030 velec = _mm_mul_ps(qq12,velec);
1032 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1034 /* Update potential sum for this i atom from the interaction with this j atom. */
1035 velec = _mm_and_ps(velec,cutoff_mask);
1036 velec = _mm_andnot_ps(dummy_mask,velec);
1037 velecsum = _mm_add_ps(velecsum,velec);
1041 fscal = _mm_and_ps(fscal,cutoff_mask);
1043 fscal = _mm_andnot_ps(dummy_mask,fscal);
1045 /* Update vectorial force */
1046 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1047 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1048 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1050 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1051 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1052 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1060 if (gmx_mm_any_lt(rsq20,rcutoff2))
1063 r20 = _mm_mul_ps(rsq20,rinv20);
1064 r20 = _mm_andnot_ps(dummy_mask,r20);
1066 /* EWALD ELECTROSTATICS */
1068 /* Analytical PME correction */
1069 zeta2 = _mm_mul_ps(beta2,rsq20);
1070 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1071 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1072 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1073 felec = _mm_mul_ps(qq20,felec);
1074 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1075 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
1076 velec = _mm_mul_ps(qq20,velec);
1078 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1080 /* Update potential sum for this i atom from the interaction with this j atom. */
1081 velec = _mm_and_ps(velec,cutoff_mask);
1082 velec = _mm_andnot_ps(dummy_mask,velec);
1083 velecsum = _mm_add_ps(velecsum,velec);
1087 fscal = _mm_and_ps(fscal,cutoff_mask);
1089 fscal = _mm_andnot_ps(dummy_mask,fscal);
1091 /* Update vectorial force */
1092 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1093 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1094 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1096 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1097 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1098 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1102 /**************************
1103 * CALCULATE INTERACTIONS *
1104 **************************/
1106 if (gmx_mm_any_lt(rsq21,rcutoff2))
1109 r21 = _mm_mul_ps(rsq21,rinv21);
1110 r21 = _mm_andnot_ps(dummy_mask,r21);
1112 /* EWALD ELECTROSTATICS */
1114 /* Analytical PME correction */
1115 zeta2 = _mm_mul_ps(beta2,rsq21);
1116 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1117 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1118 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1119 felec = _mm_mul_ps(qq21,felec);
1120 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1121 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
1122 velec = _mm_mul_ps(qq21,velec);
1124 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1126 /* Update potential sum for this i atom from the interaction with this j atom. */
1127 velec = _mm_and_ps(velec,cutoff_mask);
1128 velec = _mm_andnot_ps(dummy_mask,velec);
1129 velecsum = _mm_add_ps(velecsum,velec);
1133 fscal = _mm_and_ps(fscal,cutoff_mask);
1135 fscal = _mm_andnot_ps(dummy_mask,fscal);
1137 /* Update vectorial force */
1138 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1139 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1140 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1142 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1143 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1144 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1148 /**************************
1149 * CALCULATE INTERACTIONS *
1150 **************************/
1152 if (gmx_mm_any_lt(rsq22,rcutoff2))
1155 r22 = _mm_mul_ps(rsq22,rinv22);
1156 r22 = _mm_andnot_ps(dummy_mask,r22);
1158 /* EWALD ELECTROSTATICS */
1160 /* Analytical PME correction */
1161 zeta2 = _mm_mul_ps(beta2,rsq22);
1162 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1163 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1164 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1165 felec = _mm_mul_ps(qq22,felec);
1166 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1167 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
1168 velec = _mm_mul_ps(qq22,velec);
1170 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1172 /* Update potential sum for this i atom from the interaction with this j atom. */
1173 velec = _mm_and_ps(velec,cutoff_mask);
1174 velec = _mm_andnot_ps(dummy_mask,velec);
1175 velecsum = _mm_add_ps(velecsum,velec);
1179 fscal = _mm_and_ps(fscal,cutoff_mask);
1181 fscal = _mm_andnot_ps(dummy_mask,fscal);
1183 /* Update vectorial force */
1184 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1185 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1186 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1188 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1189 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1190 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1194 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1195 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1196 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1197 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1199 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1200 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1202 /* Inner loop uses 306 flops */
1205 /* End of innermost loop */
1207 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1208 f+i_coord_offset,fshift+i_shift_offset);
1211 /* Update potential energies */
1212 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1214 /* Increment number of inner iterations */
1215 inneriter += j_index_end - j_index_start;
1217 /* Outer loop uses 19 flops */
1220 /* Increment number of outer iterations */
1223 /* Update outer/inner flops */
1225 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*306);
1228 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_128_fma_single
1229 * Electrostatics interaction: Ewald
1230 * VdW interaction: None
1231 * Geometry: Water3-Water3
1232 * Calculate force/pot: Force
/*
 * Force-only kernel: Ewald electrostatics with an explicit cutoff, no VdW,
 * water3-water3 geometry, 128-bit AVX/FMA single precision.
 *
 * For each i water of the neighbour list the nine site-site pairs with the
 * j waters are evaluated four j entries at a time (one per SIMD lane).
 * Per pair the code computes
 *     felec = qq * (pmecorrF(beta^2 * rsq) * beta^3 + rinv^3)
 * then masks fscal with (rsq < rcutoff^2) and accumulates dx,dy,dz times
 * fscal into the i-site (fix,fiy,fiz) and j-site (fjx,fjy,fjz) registers.
 *
 * nlist        neighbour list; jindex, shift and iinr[0] are read here.
 * xx, ff       NOTE(review): presumably the coordinate and force arrays
 *              behind x and f; the assignments are not visible here.
 * fr           source of the shift vectors, the shift-force array and the
 *              fr->ic constants (epsfac, sh_ewald, ewaldcoeff_q, tabq_*,
 *              rcoulomb).
 * mdatoms      chargeA supplies the site charges.
 * kernel_data  marked gmx_unused; not referenced in the visible body.
 * nrnb         flop counters, updated once at the end via inc_nrnb().
 *
 * NOTE(review): this listing seems to have lost lines. No statement visible
 * here assigns fscal, x, f, nri, jjnr or jnrA inside the loop, initialises
 * inneriter or outeriter, or declares charge and ewtab. Confirm against the
 * complete file before relying on the comments below.
 */
1235 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_128_fma_single
1236 (t_nblist * gmx_restrict nlist,
1237 rvec * gmx_restrict xx,
1238 rvec * gmx_restrict ff,
1239 struct t_forcerec * gmx_restrict fr,
1240 t_mdatoms * gmx_restrict mdatoms,
1241 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1242 t_nrnb * gmx_restrict nrnb)
1244 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1245 * just 0 for non-waters.
1246 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1247 * jnr indices corresponding to data put in the four positions in the SIMD register.
1249 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1250 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1251 int jnrA,jnrB,jnrC,jnrD;
1252 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1253 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1254 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1255 real rcutoff_scalar;
1256 real *shiftvec,*fshift,*x,*f;
1257 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1258 real scratch[4*DIM];
1259 __m128 fscal,rcutoff,rcutoff2,jidxall;
1261 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1263 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1265 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1266 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1267 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1268 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1269 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1270 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1271 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1272 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1273 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1274 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1275 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1276 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1277 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1278 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1279 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1280 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1281 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1284 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1285 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1287 __m128 dummy_mask,cutoff_mask;
1288 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1289 __m128 one = _mm_set1_ps(1.0);
1290 __m128 two = _mm_set1_ps(2.0);
1296 jindex = nlist->jindex;
1298 shiftidx = nlist->shift;
1300 shiftvec = fr->shift_vec[0];
1301 fshift = fr->fshift[0];
1302 facel = _mm_set1_ps(fr->ic->epsfac);
1303 charge = mdatoms->chargeA;
1305 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1306 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1307 beta2 = _mm_mul_ps(beta,beta);
1308 beta3 = _mm_mul_ps(beta,beta2);
1309 ewtab = fr->ic->tabq_coul_F;
1310 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1311 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1313 /* Setup water-specific parameters */
/* Site charges are read once, at nlist->iinr[0], and reused for both the i and the j water; the i charges are pre-scaled by facel. */
1314 inr = nlist->iinr[0];
1315 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1316 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1317 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1319 jq0 = _mm_set1_ps(charge[inr+0]);
1320 jq1 = _mm_set1_ps(charge[inr+1]);
1321 jq2 = _mm_set1_ps(charge[inr+2]);
/* Charge products for the nine site pairs; constant for the whole call. */
1322 qq00 = _mm_mul_ps(iq0,jq0);
1323 qq01 = _mm_mul_ps(iq0,jq1);
1324 qq02 = _mm_mul_ps(iq0,jq2);
1325 qq10 = _mm_mul_ps(iq1,jq0);
1326 qq11 = _mm_mul_ps(iq1,jq1);
1327 qq12 = _mm_mul_ps(iq1,jq2);
1328 qq20 = _mm_mul_ps(iq2,jq0);
1329 qq21 = _mm_mul_ps(iq2,jq1);
1330 qq22 = _mm_mul_ps(iq2,jq2);
1332 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1333 rcutoff_scalar = fr->ic->rcoulomb;
1334 rcutoff = _mm_set1_ps(rcutoff_scalar);
/* Squared cutoff, compared directly against rsq so no square root is needed for the test. */
1335 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1337 /* Avoid stupid compiler warnings */
1338 jnrA = jnrB = jnrC = jnrD = 0;
1339 j_coord_offsetA = 0;
1340 j_coord_offsetB = 0;
1341 j_coord_offsetC = 0;
1342 j_coord_offsetD = 0;
/* Clear the dummy force target that padded j entries are pointed at in the masked epilogue. */
1347 for(iidx=0;iidx<4*DIM;iidx++)
1349 scratch[iidx] = 0.0;
1352 /* Start outer loop over neighborlists */
1353 for(iidx=0; iidx<nri; iidx++)
1355 /* Load shift vector for this list */
1356 i_shift_offset = DIM*shiftidx[iidx];
1358 /* Load limits for loop over neighbors */
1359 j_index_start = jindex[iidx];
1360 j_index_end = jindex[iidx+1];
1362 /* Get outer coordinate index */
/* NOTE(review): no per-iteration update of inr is visible in this listing -- confirm it is refreshed for each list entry. */
1364 i_coord_offset = DIM*inr;
1366 /* Load i particle coords and add shift vector */
1367 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1368 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Zero the i-site force accumulators for this list entry. */
1370 fix0 = _mm_setzero_ps();
1371 fiy0 = _mm_setzero_ps();
1372 fiz0 = _mm_setzero_ps();
1373 fix1 = _mm_setzero_ps();
1374 fiy1 = _mm_setzero_ps();
1375 fiz1 = _mm_setzero_ps();
1376 fix2 = _mm_setzero_ps();
1377 fiy2 = _mm_setzero_ps();
1378 fiz2 = _mm_setzero_ps();
1380 /* Start inner kernel loop */
/* Unmasked path: keeps running while the fourth index of the current group of four is non-negative. */
1381 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1384 /* Get j neighbor index, and coordinate index */
1386 jnrB = jjnr[jidx+1];
1387 jnrC = jjnr[jidx+2];
1388 jnrD = jjnr[jidx+3];
1389 j_coord_offsetA = DIM*jnrA;
1390 j_coord_offsetB = DIM*jnrB;
1391 j_coord_offsetC = DIM*jnrC;
1392 j_coord_offsetD = DIM*jnrD;
1394 /* load j atom coordinates */
1395 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1396 x+j_coord_offsetC,x+j_coord_offsetD,
1397 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1399 /* Calculate displacement vector */
1400 dx00 = _mm_sub_ps(ix0,jx0);
1401 dy00 = _mm_sub_ps(iy0,jy0);
1402 dz00 = _mm_sub_ps(iz0,jz0);
1403 dx01 = _mm_sub_ps(ix0,jx1);
1404 dy01 = _mm_sub_ps(iy0,jy1);
1405 dz01 = _mm_sub_ps(iz0,jz1);
1406 dx02 = _mm_sub_ps(ix0,jx2);
1407 dy02 = _mm_sub_ps(iy0,jy2);
1408 dz02 = _mm_sub_ps(iz0,jz2);
1409 dx10 = _mm_sub_ps(ix1,jx0);
1410 dy10 = _mm_sub_ps(iy1,jy0);
1411 dz10 = _mm_sub_ps(iz1,jz0);
1412 dx11 = _mm_sub_ps(ix1,jx1);
1413 dy11 = _mm_sub_ps(iy1,jy1);
1414 dz11 = _mm_sub_ps(iz1,jz1);
1415 dx12 = _mm_sub_ps(ix1,jx2);
1416 dy12 = _mm_sub_ps(iy1,jy2);
1417 dz12 = _mm_sub_ps(iz1,jz2);
1418 dx20 = _mm_sub_ps(ix2,jx0);
1419 dy20 = _mm_sub_ps(iy2,jy0);
1420 dz20 = _mm_sub_ps(iz2,jz0);
1421 dx21 = _mm_sub_ps(ix2,jx1);
1422 dy21 = _mm_sub_ps(iy2,jy1);
1423 dz21 = _mm_sub_ps(iz2,jz1);
1424 dx22 = _mm_sub_ps(ix2,jx2);
1425 dy22 = _mm_sub_ps(iy2,jy2);
1426 dz22 = _mm_sub_ps(iz2,jz2);
1428 /* Calculate squared distance and things based on it */
1429 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1430 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1431 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1432 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1433 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1434 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1435 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1436 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1437 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1439 rinv00 = avx128fma_invsqrt_f(rsq00);
1440 rinv01 = avx128fma_invsqrt_f(rsq01);
1441 rinv02 = avx128fma_invsqrt_f(rsq02);
1442 rinv10 = avx128fma_invsqrt_f(rsq10);
1443 rinv11 = avx128fma_invsqrt_f(rsq11);
1444 rinv12 = avx128fma_invsqrt_f(rsq12);
1445 rinv20 = avx128fma_invsqrt_f(rsq20);
1446 rinv21 = avx128fma_invsqrt_f(rsq21);
1447 rinv22 = avx128fma_invsqrt_f(rsq22);
1449 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1450 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1451 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1452 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1453 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1454 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1455 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1456 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1457 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
/* Fresh j-site force accumulators for this group of four j waters. */
1459 fjx0 = _mm_setzero_ps();
1460 fjy0 = _mm_setzero_ps();
1461 fjz0 = _mm_setzero_ps();
1462 fjx1 = _mm_setzero_ps();
1463 fjy1 = _mm_setzero_ps();
1464 fjz1 = _mm_setzero_ps();
1465 fjx2 = _mm_setzero_ps();
1466 fjy2 = _mm_setzero_ps();
1467 fjz2 = _mm_setzero_ps();
1469 /**************************
1470 * CALCULATE INTERACTIONS *
1471 **************************/
/* Pair i-site 0 with j-site 0; the eight sections after this one repeat the same steps for the other site pairs. */
/* The section is entered when at least one lane is inside the cutoff; lanes outside are removed by cutoff_mask below. */
1473 if (gmx_mm_any_lt(rsq00,rcutoff2))
1476 r00 = _mm_mul_ps(rsq00,rinv00);
1478 /* EWALD ELECTROSTATICS */
1480 /* Analytical PME correction */
1481 zeta2 = _mm_mul_ps(beta2,rsq00);
1482 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1483 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1484 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1485 felec = _mm_mul_ps(qq00,felec);
1487 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
/* NOTE(review): fscal is masked and applied here, but no statement in this listing copies felec into it -- confirm against the full file. */
1491 fscal = _mm_and_ps(fscal,cutoff_mask);
1493 /* Update vectorial force */
1494 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1495 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1496 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1498 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1499 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1500 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 if (gmx_mm_any_lt(rsq01,rcutoff2))
1511 r01 = _mm_mul_ps(rsq01,rinv01);
1513 /* EWALD ELECTROSTATICS */
1515 /* Analytical PME correction */
1516 zeta2 = _mm_mul_ps(beta2,rsq01);
1517 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1518 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1519 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1520 felec = _mm_mul_ps(qq01,felec);
1522 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1526 fscal = _mm_and_ps(fscal,cutoff_mask);
1528 /* Update vectorial force */
1529 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1530 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1531 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1533 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1534 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1535 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1539 /**************************
1540 * CALCULATE INTERACTIONS *
1541 **************************/
1543 if (gmx_mm_any_lt(rsq02,rcutoff2))
1546 r02 = _mm_mul_ps(rsq02,rinv02);
1548 /* EWALD ELECTROSTATICS */
1550 /* Analytical PME correction */
1551 zeta2 = _mm_mul_ps(beta2,rsq02);
1552 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1553 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1554 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1555 felec = _mm_mul_ps(qq02,felec);
1557 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1561 fscal = _mm_and_ps(fscal,cutoff_mask);
1563 /* Update vectorial force */
1564 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1565 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1566 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1568 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1569 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1570 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1574 /**************************
1575 * CALCULATE INTERACTIONS *
1576 **************************/
1578 if (gmx_mm_any_lt(rsq10,rcutoff2))
1581 r10 = _mm_mul_ps(rsq10,rinv10);
1583 /* EWALD ELECTROSTATICS */
1585 /* Analytical PME correction */
1586 zeta2 = _mm_mul_ps(beta2,rsq10);
1587 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1588 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1589 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1590 felec = _mm_mul_ps(qq10,felec);
1592 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1596 fscal = _mm_and_ps(fscal,cutoff_mask);
1598 /* Update vectorial force */
1599 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1600 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1601 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1603 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1604 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1605 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1609 /**************************
1610 * CALCULATE INTERACTIONS *
1611 **************************/
1613 if (gmx_mm_any_lt(rsq11,rcutoff2))
1616 r11 = _mm_mul_ps(rsq11,rinv11);
1618 /* EWALD ELECTROSTATICS */
1620 /* Analytical PME correction */
1621 zeta2 = _mm_mul_ps(beta2,rsq11);
1622 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1623 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1624 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1625 felec = _mm_mul_ps(qq11,felec);
1627 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1631 fscal = _mm_and_ps(fscal,cutoff_mask);
1633 /* Update vectorial force */
1634 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1635 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1636 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1638 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1639 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1640 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1644 /**************************
1645 * CALCULATE INTERACTIONS *
1646 **************************/
1648 if (gmx_mm_any_lt(rsq12,rcutoff2))
1651 r12 = _mm_mul_ps(rsq12,rinv12);
1653 /* EWALD ELECTROSTATICS */
1655 /* Analytical PME correction */
1656 zeta2 = _mm_mul_ps(beta2,rsq12);
1657 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1658 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1659 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1660 felec = _mm_mul_ps(qq12,felec);
1662 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1666 fscal = _mm_and_ps(fscal,cutoff_mask);
1668 /* Update vectorial force */
1669 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1670 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1671 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1673 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1674 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1675 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1679 /**************************
1680 * CALCULATE INTERACTIONS *
1681 **************************/
1683 if (gmx_mm_any_lt(rsq20,rcutoff2))
1686 r20 = _mm_mul_ps(rsq20,rinv20);
1688 /* EWALD ELECTROSTATICS */
1690 /* Analytical PME correction */
1691 zeta2 = _mm_mul_ps(beta2,rsq20);
1692 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1693 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1694 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1695 felec = _mm_mul_ps(qq20,felec);
1697 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1701 fscal = _mm_and_ps(fscal,cutoff_mask);
1703 /* Update vectorial force */
1704 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1705 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1706 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1708 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1709 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1710 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1714 /**************************
1715 * CALCULATE INTERACTIONS *
1716 **************************/
1718 if (gmx_mm_any_lt(rsq21,rcutoff2))
1721 r21 = _mm_mul_ps(rsq21,rinv21);
1723 /* EWALD ELECTROSTATICS */
1725 /* Analytical PME correction */
1726 zeta2 = _mm_mul_ps(beta2,rsq21);
1727 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1728 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1729 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1730 felec = _mm_mul_ps(qq21,felec);
1732 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1736 fscal = _mm_and_ps(fscal,cutoff_mask);
1738 /* Update vectorial force */
1739 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1740 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1741 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1743 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1744 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1745 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1749 /**************************
1750 * CALCULATE INTERACTIONS *
1751 **************************/
1753 if (gmx_mm_any_lt(rsq22,rcutoff2))
1756 r22 = _mm_mul_ps(rsq22,rinv22);
1758 /* EWALD ELECTROSTATICS */
1760 /* Analytical PME correction */
1761 zeta2 = _mm_mul_ps(beta2,rsq22);
1762 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1763 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1764 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1765 felec = _mm_mul_ps(qq22,felec);
1767 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1771 fscal = _mm_and_ps(fscal,cutoff_mask);
1773 /* Update vectorial force */
1774 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1775 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1776 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1778 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1779 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1780 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
/* No dummy redirection in this path: the j-force pointers address f directly. */
1784 fjptrA = f+j_coord_offsetA;
1785 fjptrB = f+j_coord_offsetB;
1786 fjptrC = f+j_coord_offsetC;
1787 fjptrD = f+j_coord_offsetD;
1789 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1790 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1792 /* Inner loop uses 279 flops */
/* Masked epilogue: handles a last group of four whose list entries include negative (padding) indices. */
1795 if(jidx<j_index_end)
1798 /* Get j neighbor index, and coordinate index */
1799 jnrlistA = jjnr[jidx];
1800 jnrlistB = jjnr[jidx+1];
1801 jnrlistC = jjnr[jidx+2];
1802 jnrlistD = jjnr[jidx+3];
1803 /* Sign of each element will be negative for non-real atoms.
1804 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1805 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1807 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1808 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1809 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1810 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1811 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1812 j_coord_offsetA = DIM*jnrA;
1813 j_coord_offsetB = DIM*jnrB;
1814 j_coord_offsetC = DIM*jnrC;
1815 j_coord_offsetD = DIM*jnrD;
1817 /* load j atom coordinates */
/* Negative list entries were replaced by index 0 above, so padded lanes load the coordinates stored at offset 0. */
1818 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1819 x+j_coord_offsetC,x+j_coord_offsetD,
1820 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1822 /* Calculate displacement vector */
1823 dx00 = _mm_sub_ps(ix0,jx0);
1824 dy00 = _mm_sub_ps(iy0,jy0);
1825 dz00 = _mm_sub_ps(iz0,jz0);
1826 dx01 = _mm_sub_ps(ix0,jx1);
1827 dy01 = _mm_sub_ps(iy0,jy1);
1828 dz01 = _mm_sub_ps(iz0,jz1);
1829 dx02 = _mm_sub_ps(ix0,jx2);
1830 dy02 = _mm_sub_ps(iy0,jy2);
1831 dz02 = _mm_sub_ps(iz0,jz2);
1832 dx10 = _mm_sub_ps(ix1,jx0);
1833 dy10 = _mm_sub_ps(iy1,jy0);
1834 dz10 = _mm_sub_ps(iz1,jz0);
1835 dx11 = _mm_sub_ps(ix1,jx1);
1836 dy11 = _mm_sub_ps(iy1,jy1);
1837 dz11 = _mm_sub_ps(iz1,jz1);
1838 dx12 = _mm_sub_ps(ix1,jx2);
1839 dy12 = _mm_sub_ps(iy1,jy2);
1840 dz12 = _mm_sub_ps(iz1,jz2);
1841 dx20 = _mm_sub_ps(ix2,jx0);
1842 dy20 = _mm_sub_ps(iy2,jy0);
1843 dz20 = _mm_sub_ps(iz2,jz0);
1844 dx21 = _mm_sub_ps(ix2,jx1);
1845 dy21 = _mm_sub_ps(iy2,jy1);
1846 dz21 = _mm_sub_ps(iz2,jz1);
1847 dx22 = _mm_sub_ps(ix2,jx2);
1848 dy22 = _mm_sub_ps(iy2,jy2);
1849 dz22 = _mm_sub_ps(iz2,jz2);
1851 /* Calculate squared distance and things based on it */
1852 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1853 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1854 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1855 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1856 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1857 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1858 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1859 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1860 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1862 rinv00 = avx128fma_invsqrt_f(rsq00);
1863 rinv01 = avx128fma_invsqrt_f(rsq01);
1864 rinv02 = avx128fma_invsqrt_f(rsq02);
1865 rinv10 = avx128fma_invsqrt_f(rsq10);
1866 rinv11 = avx128fma_invsqrt_f(rsq11);
1867 rinv12 = avx128fma_invsqrt_f(rsq12);
1868 rinv20 = avx128fma_invsqrt_f(rsq20);
1869 rinv21 = avx128fma_invsqrt_f(rsq21);
1870 rinv22 = avx128fma_invsqrt_f(rsq22);
1872 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1873 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1874 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1875 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1876 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1877 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1878 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1879 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1880 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
/* Fresh j-site force accumulators for the final group. */
1882 fjx0 = _mm_setzero_ps();
1883 fjy0 = _mm_setzero_ps();
1884 fjz0 = _mm_setzero_ps();
1885 fjx1 = _mm_setzero_ps();
1886 fjy1 = _mm_setzero_ps();
1887 fjz1 = _mm_setzero_ps();
1888 fjx2 = _mm_setzero_ps();
1889 fjy2 = _mm_setzero_ps();
1890 fjz2 = _mm_setzero_ps();
1892 /**************************
1893 * CALCULATE INTERACTIONS *
1894 **************************/
/* Same nine pair sections as the unmasked loop, with dummy_mask additionally applied to r and fscal in each. */
1896 if (gmx_mm_any_lt(rsq00,rcutoff2))
1899 r00 = _mm_mul_ps(rsq00,rinv00);
1900 r00 = _mm_andnot_ps(dummy_mask,r00);
1902 /* EWALD ELECTROSTATICS */
1904 /* Analytical PME correction */
1905 zeta2 = _mm_mul_ps(beta2,rsq00);
1906 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1907 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1908 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1909 felec = _mm_mul_ps(qq00,felec);
1911 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1915 fscal = _mm_and_ps(fscal,cutoff_mask);
/* Zero the force in lanes that hold padded entries. */
1917 fscal = _mm_andnot_ps(dummy_mask,fscal);
1919 /* Update vectorial force */
1920 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1921 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1922 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1924 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1925 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1926 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1930 /**************************
1931 * CALCULATE INTERACTIONS *
1932 **************************/
1934 if (gmx_mm_any_lt(rsq01,rcutoff2))
1937 r01 = _mm_mul_ps(rsq01,rinv01);
1938 r01 = _mm_andnot_ps(dummy_mask,r01);
1940 /* EWALD ELECTROSTATICS */
1942 /* Analytical PME correction */
1943 zeta2 = _mm_mul_ps(beta2,rsq01);
1944 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1945 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1946 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1947 felec = _mm_mul_ps(qq01,felec);
1949 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1953 fscal = _mm_and_ps(fscal,cutoff_mask);
1955 fscal = _mm_andnot_ps(dummy_mask,fscal);
1957 /* Update vectorial force */
1958 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1959 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1960 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1962 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1963 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1964 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1968 /**************************
1969 * CALCULATE INTERACTIONS *
1970 **************************/
1972 if (gmx_mm_any_lt(rsq02,rcutoff2))
1975 r02 = _mm_mul_ps(rsq02,rinv02);
1976 r02 = _mm_andnot_ps(dummy_mask,r02);
1978 /* EWALD ELECTROSTATICS */
1980 /* Analytical PME correction */
1981 zeta2 = _mm_mul_ps(beta2,rsq02);
1982 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1983 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1984 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1985 felec = _mm_mul_ps(qq02,felec);
1987 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1991 fscal = _mm_and_ps(fscal,cutoff_mask);
1993 fscal = _mm_andnot_ps(dummy_mask,fscal);
1995 /* Update vectorial force */
1996 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1997 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1998 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
2000 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
2001 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
2002 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
2006 /**************************
2007 * CALCULATE INTERACTIONS *
2008 **************************/
2010 if (gmx_mm_any_lt(rsq10,rcutoff2))
2013 r10 = _mm_mul_ps(rsq10,rinv10);
2014 r10 = _mm_andnot_ps(dummy_mask,r10);
2016 /* EWALD ELECTROSTATICS */
2018 /* Analytical PME correction */
2019 zeta2 = _mm_mul_ps(beta2,rsq10);
2020 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
2021 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2022 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2023 felec = _mm_mul_ps(qq10,felec);
2025 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2029 fscal = _mm_and_ps(fscal,cutoff_mask);
2031 fscal = _mm_andnot_ps(dummy_mask,fscal);
2033 /* Update vectorial force */
2034 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2035 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2036 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2038 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2039 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2040 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2044 /**************************
2045 * CALCULATE INTERACTIONS *
2046 **************************/
2048 if (gmx_mm_any_lt(rsq11,rcutoff2))
2051 r11 = _mm_mul_ps(rsq11,rinv11);
2052 r11 = _mm_andnot_ps(dummy_mask,r11);
2054 /* EWALD ELECTROSTATICS */
2056 /* Analytical PME correction */
2057 zeta2 = _mm_mul_ps(beta2,rsq11);
2058 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2059 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2060 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2061 felec = _mm_mul_ps(qq11,felec);
2063 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2067 fscal = _mm_and_ps(fscal,cutoff_mask);
2069 fscal = _mm_andnot_ps(dummy_mask,fscal);
2071 /* Update vectorial force */
2072 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2073 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2074 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2076 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2077 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2078 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2082 /**************************
2083 * CALCULATE INTERACTIONS *
2084 **************************/
2086 if (gmx_mm_any_lt(rsq12,rcutoff2))
2089 r12 = _mm_mul_ps(rsq12,rinv12);
2090 r12 = _mm_andnot_ps(dummy_mask,r12);
2092 /* EWALD ELECTROSTATICS */
2094 /* Analytical PME correction */
2095 zeta2 = _mm_mul_ps(beta2,rsq12);
2096 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2097 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2098 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2099 felec = _mm_mul_ps(qq12,felec);
2101 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2105 fscal = _mm_and_ps(fscal,cutoff_mask);
2107 fscal = _mm_andnot_ps(dummy_mask,fscal);
2109 /* Update vectorial force */
2110 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2111 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2112 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2114 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2115 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2116 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2120 /**************************
2121 * CALCULATE INTERACTIONS *
2122 **************************/
2124 if (gmx_mm_any_lt(rsq20,rcutoff2))
2127 r20 = _mm_mul_ps(rsq20,rinv20);
2128 r20 = _mm_andnot_ps(dummy_mask,r20);
2130 /* EWALD ELECTROSTATICS */
2132 /* Analytical PME correction */
2133 zeta2 = _mm_mul_ps(beta2,rsq20);
2134 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
2135 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2136 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2137 felec = _mm_mul_ps(qq20,felec);
2139 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2143 fscal = _mm_and_ps(fscal,cutoff_mask);
2145 fscal = _mm_andnot_ps(dummy_mask,fscal);
2147 /* Update vectorial force */
2148 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2149 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2150 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2152 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2153 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2154 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2158 /**************************
2159 * CALCULATE INTERACTIONS *
2160 **************************/
2162 if (gmx_mm_any_lt(rsq21,rcutoff2))
2165 r21 = _mm_mul_ps(rsq21,rinv21);
2166 r21 = _mm_andnot_ps(dummy_mask,r21);
2168 /* EWALD ELECTROSTATICS */
2170 /* Analytical PME correction */
2171 zeta2 = _mm_mul_ps(beta2,rsq21);
2172 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2173 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2174 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2175 felec = _mm_mul_ps(qq21,felec);
2177 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2181 fscal = _mm_and_ps(fscal,cutoff_mask);
2183 fscal = _mm_andnot_ps(dummy_mask,fscal);
2185 /* Update vectorial force */
2186 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2187 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2188 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2190 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2191 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2192 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2196 /**************************
2197 * CALCULATE INTERACTIONS *
2198 **************************/
2200 if (gmx_mm_any_lt(rsq22,rcutoff2))
2203 r22 = _mm_mul_ps(rsq22,rinv22);
2204 r22 = _mm_andnot_ps(dummy_mask,r22);
2206 /* EWALD ELECTROSTATICS */
2208 /* Analytical PME correction */
2209 zeta2 = _mm_mul_ps(beta2,rsq22);
2210 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2211 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2212 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2213 felec = _mm_mul_ps(qq22,felec);
2215 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2219 fscal = _mm_and_ps(fscal,cutoff_mask);
2221 fscal = _mm_andnot_ps(dummy_mask,fscal);
2223 /* Update vectorial force */
2224 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2225 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2226 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2228 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2229 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2230 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
/* Padded lanes send their (already zeroed) j-force update to scratch instead of f. */
2234 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2235 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2236 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2237 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2239 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2240 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2242 /* Inner loop uses 288 flops */
2245 /* End of innermost loop */
/* Hand the i-site accumulators to the helper together with the f and fshift targets of this list entry (helper body not shown here). */
2247 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2248 f+i_coord_offset,fshift+i_shift_offset);
2250 /* Increment number of inner iterations */
/* Counts j list entries in [j_index_start, j_index_end), not groups of four. */
2251 inneriter += j_index_end - j_index_start;
2253 /* Outer loop uses 18 flops */
2256 /* Increment number of outer iterations */
2259 /* Update outer/inner flops */
/* Every counted j entry is charged at the 288-flop epilogue figure, although the unmasked loop is annotated with 279. */
2261 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*288);