2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
110 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
112 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
113 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
115 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
116 real rswitch_scalar,d_scalar;
117 __m128 dummy_mask,cutoff_mask;
118 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119 __m128 one = _mm_set1_ps(1.0);
120 __m128 two = _mm_set1_ps(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_ps(fr->ic->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
136 vdwtype = mdatoms->typeA;
138 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
139 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
140 beta2 = _mm_mul_ps(beta,beta);
141 beta3 = _mm_mul_ps(beta,beta2);
142 ewtab = fr->ic->tabq_coul_FDV0;
143 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
144 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
146 /* Setup water-specific parameters */
147 inr = nlist->iinr[0];
148 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
149 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
150 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
151 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
153 jq0 = _mm_set1_ps(charge[inr+0]);
154 jq1 = _mm_set1_ps(charge[inr+1]);
155 jq2 = _mm_set1_ps(charge[inr+2]);
156 vdwjidx0A = 2*vdwtype[inr+0];
157 qq00 = _mm_mul_ps(iq0,jq0);
158 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
159 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
160 qq01 = _mm_mul_ps(iq0,jq1);
161 qq02 = _mm_mul_ps(iq0,jq2);
162 qq10 = _mm_mul_ps(iq1,jq0);
163 qq11 = _mm_mul_ps(iq1,jq1);
164 qq12 = _mm_mul_ps(iq1,jq2);
165 qq20 = _mm_mul_ps(iq2,jq0);
166 qq21 = _mm_mul_ps(iq2,jq1);
167 qq22 = _mm_mul_ps(iq2,jq2);
169 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
170 rcutoff_scalar = fr->ic->rcoulomb;
171 rcutoff = _mm_set1_ps(rcutoff_scalar);
172 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
174 rswitch_scalar = fr->ic->rcoulomb_switch;
175 rswitch = _mm_set1_ps(rswitch_scalar);
176 /* Setup switch parameters */
177 d_scalar = rcutoff_scalar-rswitch_scalar;
178 d = _mm_set1_ps(d_scalar);
179 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
180 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
181 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
182 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
183 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
184 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
186 /* Avoid stupid compiler warnings */
187 jnrA = jnrB = jnrC = jnrD = 0;
196 for(iidx=0;iidx<4*DIM;iidx++)
201 /* Start outer loop over neighborlists */
202 for(iidx=0; iidx<nri; iidx++)
204 /* Load shift vector for this list */
205 i_shift_offset = DIM*shiftidx[iidx];
207 /* Load limits for loop over neighbors */
208 j_index_start = jindex[iidx];
209 j_index_end = jindex[iidx+1];
211 /* Get outer coordinate index */
213 i_coord_offset = DIM*inr;
215 /* Load i particle coords and add shift vector */
216 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
217 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
219 fix0 = _mm_setzero_ps();
220 fiy0 = _mm_setzero_ps();
221 fiz0 = _mm_setzero_ps();
222 fix1 = _mm_setzero_ps();
223 fiy1 = _mm_setzero_ps();
224 fiz1 = _mm_setzero_ps();
225 fix2 = _mm_setzero_ps();
226 fiy2 = _mm_setzero_ps();
227 fiz2 = _mm_setzero_ps();
229 /* Reset potential sums */
230 velecsum = _mm_setzero_ps();
231 vvdwsum = _mm_setzero_ps();
233 /* Start inner kernel loop */
234 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
237 /* Get j neighbor index, and coordinate index */
242 j_coord_offsetA = DIM*jnrA;
243 j_coord_offsetB = DIM*jnrB;
244 j_coord_offsetC = DIM*jnrC;
245 j_coord_offsetD = DIM*jnrD;
247 /* load j atom coordinates */
248 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
249 x+j_coord_offsetC,x+j_coord_offsetD,
250 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
252 /* Calculate displacement vector */
253 dx00 = _mm_sub_ps(ix0,jx0);
254 dy00 = _mm_sub_ps(iy0,jy0);
255 dz00 = _mm_sub_ps(iz0,jz0);
256 dx01 = _mm_sub_ps(ix0,jx1);
257 dy01 = _mm_sub_ps(iy0,jy1);
258 dz01 = _mm_sub_ps(iz0,jz1);
259 dx02 = _mm_sub_ps(ix0,jx2);
260 dy02 = _mm_sub_ps(iy0,jy2);
261 dz02 = _mm_sub_ps(iz0,jz2);
262 dx10 = _mm_sub_ps(ix1,jx0);
263 dy10 = _mm_sub_ps(iy1,jy0);
264 dz10 = _mm_sub_ps(iz1,jz0);
265 dx11 = _mm_sub_ps(ix1,jx1);
266 dy11 = _mm_sub_ps(iy1,jy1);
267 dz11 = _mm_sub_ps(iz1,jz1);
268 dx12 = _mm_sub_ps(ix1,jx2);
269 dy12 = _mm_sub_ps(iy1,jy2);
270 dz12 = _mm_sub_ps(iz1,jz2);
271 dx20 = _mm_sub_ps(ix2,jx0);
272 dy20 = _mm_sub_ps(iy2,jy0);
273 dz20 = _mm_sub_ps(iz2,jz0);
274 dx21 = _mm_sub_ps(ix2,jx1);
275 dy21 = _mm_sub_ps(iy2,jy1);
276 dz21 = _mm_sub_ps(iz2,jz1);
277 dx22 = _mm_sub_ps(ix2,jx2);
278 dy22 = _mm_sub_ps(iy2,jy2);
279 dz22 = _mm_sub_ps(iz2,jz2);
281 /* Calculate squared distance and things based on it */
282 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
283 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
284 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
285 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
286 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
287 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
288 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
289 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
290 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
292 rinv00 = avx128fma_invsqrt_f(rsq00);
293 rinv01 = avx128fma_invsqrt_f(rsq01);
294 rinv02 = avx128fma_invsqrt_f(rsq02);
295 rinv10 = avx128fma_invsqrt_f(rsq10);
296 rinv11 = avx128fma_invsqrt_f(rsq11);
297 rinv12 = avx128fma_invsqrt_f(rsq12);
298 rinv20 = avx128fma_invsqrt_f(rsq20);
299 rinv21 = avx128fma_invsqrt_f(rsq21);
300 rinv22 = avx128fma_invsqrt_f(rsq22);
302 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
303 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
304 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
305 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
306 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
307 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
308 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
309 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
310 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
312 fjx0 = _mm_setzero_ps();
313 fjy0 = _mm_setzero_ps();
314 fjz0 = _mm_setzero_ps();
315 fjx1 = _mm_setzero_ps();
316 fjy1 = _mm_setzero_ps();
317 fjz1 = _mm_setzero_ps();
318 fjx2 = _mm_setzero_ps();
319 fjy2 = _mm_setzero_ps();
320 fjz2 = _mm_setzero_ps();
322 /**************************
323 * CALCULATE INTERACTIONS *
324 **************************/
326 if (gmx_mm_any_lt(rsq00,rcutoff2))
329 r00 = _mm_mul_ps(rsq00,rinv00);
331 /* EWALD ELECTROSTATICS */
333 /* Analytical PME correction */
334 zeta2 = _mm_mul_ps(beta2,rsq00);
335 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
336 pmecorrF = avx128fma_pmecorrF_f(zeta2);
337 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
338 felec = _mm_mul_ps(qq00,felec);
339 pmecorrV = avx128fma_pmecorrV_f(zeta2);
340 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
341 velec = _mm_mul_ps(qq00,velec);
343 /* LENNARD-JONES DISPERSION/REPULSION */
345 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
346 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
347 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
348 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
349 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
351 d = _mm_sub_ps(r00,rswitch);
352 d = _mm_max_ps(d,_mm_setzero_ps());
353 d2 = _mm_mul_ps(d,d);
354 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
356 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
358 /* Evaluate switch function */
359 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
360 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
361 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
362 velec = _mm_mul_ps(velec,sw);
363 vvdw = _mm_mul_ps(vvdw,sw);
364 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 velec = _mm_and_ps(velec,cutoff_mask);
368 velecsum = _mm_add_ps(velecsum,velec);
369 vvdw = _mm_and_ps(vvdw,cutoff_mask);
370 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
372 fscal = _mm_add_ps(felec,fvdw);
374 fscal = _mm_and_ps(fscal,cutoff_mask);
376 /* Update vectorial force */
377 fix0 = _mm_macc_ps(dx00,fscal,fix0);
378 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
379 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
381 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
382 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
383 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
387 /**************************
388 * CALCULATE INTERACTIONS *
389 **************************/
391 if (gmx_mm_any_lt(rsq01,rcutoff2))
394 r01 = _mm_mul_ps(rsq01,rinv01);
396 /* EWALD ELECTROSTATICS */
398 /* Analytical PME correction */
399 zeta2 = _mm_mul_ps(beta2,rsq01);
400 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
401 pmecorrF = avx128fma_pmecorrF_f(zeta2);
402 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
403 felec = _mm_mul_ps(qq01,felec);
404 pmecorrV = avx128fma_pmecorrV_f(zeta2);
405 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
406 velec = _mm_mul_ps(qq01,velec);
408 d = _mm_sub_ps(r01,rswitch);
409 d = _mm_max_ps(d,_mm_setzero_ps());
410 d2 = _mm_mul_ps(d,d);
411 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
413 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
415 /* Evaluate switch function */
416 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
417 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
418 velec = _mm_mul_ps(velec,sw);
419 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
421 /* Update potential sum for this i atom from the interaction with this j atom. */
422 velec = _mm_and_ps(velec,cutoff_mask);
423 velecsum = _mm_add_ps(velecsum,velec);
427 fscal = _mm_and_ps(fscal,cutoff_mask);
429 /* Update vectorial force */
430 fix0 = _mm_macc_ps(dx01,fscal,fix0);
431 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
432 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
434 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
435 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
436 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
440 /**************************
441 * CALCULATE INTERACTIONS *
442 **************************/
444 if (gmx_mm_any_lt(rsq02,rcutoff2))
447 r02 = _mm_mul_ps(rsq02,rinv02);
449 /* EWALD ELECTROSTATICS */
451 /* Analytical PME correction */
452 zeta2 = _mm_mul_ps(beta2,rsq02);
453 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
454 pmecorrF = avx128fma_pmecorrF_f(zeta2);
455 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
456 felec = _mm_mul_ps(qq02,felec);
457 pmecorrV = avx128fma_pmecorrV_f(zeta2);
458 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
459 velec = _mm_mul_ps(qq02,velec);
461 d = _mm_sub_ps(r02,rswitch);
462 d = _mm_max_ps(d,_mm_setzero_ps());
463 d2 = _mm_mul_ps(d,d);
464 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
466 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
468 /* Evaluate switch function */
469 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
470 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
471 velec = _mm_mul_ps(velec,sw);
472 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
474 /* Update potential sum for this i atom from the interaction with this j atom. */
475 velec = _mm_and_ps(velec,cutoff_mask);
476 velecsum = _mm_add_ps(velecsum,velec);
480 fscal = _mm_and_ps(fscal,cutoff_mask);
482 /* Update vectorial force */
483 fix0 = _mm_macc_ps(dx02,fscal,fix0);
484 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
485 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
487 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
488 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
489 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 if (gmx_mm_any_lt(rsq10,rcutoff2))
500 r10 = _mm_mul_ps(rsq10,rinv10);
502 /* EWALD ELECTROSTATICS */
504 /* Analytical PME correction */
505 zeta2 = _mm_mul_ps(beta2,rsq10);
506 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
507 pmecorrF = avx128fma_pmecorrF_f(zeta2);
508 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
509 felec = _mm_mul_ps(qq10,felec);
510 pmecorrV = avx128fma_pmecorrV_f(zeta2);
511 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
512 velec = _mm_mul_ps(qq10,velec);
514 d = _mm_sub_ps(r10,rswitch);
515 d = _mm_max_ps(d,_mm_setzero_ps());
516 d2 = _mm_mul_ps(d,d);
517 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
519 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
521 /* Evaluate switch function */
522 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
523 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
524 velec = _mm_mul_ps(velec,sw);
525 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velec = _mm_and_ps(velec,cutoff_mask);
529 velecsum = _mm_add_ps(velecsum,velec);
533 fscal = _mm_and_ps(fscal,cutoff_mask);
535 /* Update vectorial force */
536 fix1 = _mm_macc_ps(dx10,fscal,fix1);
537 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
538 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
540 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
541 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
542 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
550 if (gmx_mm_any_lt(rsq11,rcutoff2))
553 r11 = _mm_mul_ps(rsq11,rinv11);
555 /* EWALD ELECTROSTATICS */
557 /* Analytical PME correction */
558 zeta2 = _mm_mul_ps(beta2,rsq11);
559 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
560 pmecorrF = avx128fma_pmecorrF_f(zeta2);
561 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
562 felec = _mm_mul_ps(qq11,felec);
563 pmecorrV = avx128fma_pmecorrV_f(zeta2);
564 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
565 velec = _mm_mul_ps(qq11,velec);
567 d = _mm_sub_ps(r11,rswitch);
568 d = _mm_max_ps(d,_mm_setzero_ps());
569 d2 = _mm_mul_ps(d,d);
570 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
572 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
574 /* Evaluate switch function */
575 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
576 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
577 velec = _mm_mul_ps(velec,sw);
578 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
580 /* Update potential sum for this i atom from the interaction with this j atom. */
581 velec = _mm_and_ps(velec,cutoff_mask);
582 velecsum = _mm_add_ps(velecsum,velec);
586 fscal = _mm_and_ps(fscal,cutoff_mask);
588 /* Update vectorial force */
589 fix1 = _mm_macc_ps(dx11,fscal,fix1);
590 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
591 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
593 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
594 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
595 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
599 /**************************
600 * CALCULATE INTERACTIONS *
601 **************************/
603 if (gmx_mm_any_lt(rsq12,rcutoff2))
606 r12 = _mm_mul_ps(rsq12,rinv12);
608 /* EWALD ELECTROSTATICS */
610 /* Analytical PME correction */
611 zeta2 = _mm_mul_ps(beta2,rsq12);
612 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
613 pmecorrF = avx128fma_pmecorrF_f(zeta2);
614 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
615 felec = _mm_mul_ps(qq12,felec);
616 pmecorrV = avx128fma_pmecorrV_f(zeta2);
617 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
618 velec = _mm_mul_ps(qq12,velec);
620 d = _mm_sub_ps(r12,rswitch);
621 d = _mm_max_ps(d,_mm_setzero_ps());
622 d2 = _mm_mul_ps(d,d);
623 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
625 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
627 /* Evaluate switch function */
628 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
629 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
630 velec = _mm_mul_ps(velec,sw);
631 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
633 /* Update potential sum for this i atom from the interaction with this j atom. */
634 velec = _mm_and_ps(velec,cutoff_mask);
635 velecsum = _mm_add_ps(velecsum,velec);
639 fscal = _mm_and_ps(fscal,cutoff_mask);
641 /* Update vectorial force */
642 fix1 = _mm_macc_ps(dx12,fscal,fix1);
643 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
644 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
646 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
647 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
648 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
652 /**************************
653 * CALCULATE INTERACTIONS *
654 **************************/
656 if (gmx_mm_any_lt(rsq20,rcutoff2))
659 r20 = _mm_mul_ps(rsq20,rinv20);
661 /* EWALD ELECTROSTATICS */
663 /* Analytical PME correction */
664 zeta2 = _mm_mul_ps(beta2,rsq20);
665 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
666 pmecorrF = avx128fma_pmecorrF_f(zeta2);
667 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
668 felec = _mm_mul_ps(qq20,felec);
669 pmecorrV = avx128fma_pmecorrV_f(zeta2);
670 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
671 velec = _mm_mul_ps(qq20,velec);
673 d = _mm_sub_ps(r20,rswitch);
674 d = _mm_max_ps(d,_mm_setzero_ps());
675 d2 = _mm_mul_ps(d,d);
676 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
678 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
680 /* Evaluate switch function */
681 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
682 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
683 velec = _mm_mul_ps(velec,sw);
684 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
686 /* Update potential sum for this i atom from the interaction with this j atom. */
687 velec = _mm_and_ps(velec,cutoff_mask);
688 velecsum = _mm_add_ps(velecsum,velec);
692 fscal = _mm_and_ps(fscal,cutoff_mask);
694 /* Update vectorial force */
695 fix2 = _mm_macc_ps(dx20,fscal,fix2);
696 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
697 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
699 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
700 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
701 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
705 /**************************
706 * CALCULATE INTERACTIONS *
707 **************************/
709 if (gmx_mm_any_lt(rsq21,rcutoff2))
712 r21 = _mm_mul_ps(rsq21,rinv21);
714 /* EWALD ELECTROSTATICS */
716 /* Analytical PME correction */
717 zeta2 = _mm_mul_ps(beta2,rsq21);
718 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
719 pmecorrF = avx128fma_pmecorrF_f(zeta2);
720 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
721 felec = _mm_mul_ps(qq21,felec);
722 pmecorrV = avx128fma_pmecorrV_f(zeta2);
723 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
724 velec = _mm_mul_ps(qq21,velec);
726 d = _mm_sub_ps(r21,rswitch);
727 d = _mm_max_ps(d,_mm_setzero_ps());
728 d2 = _mm_mul_ps(d,d);
729 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
731 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
733 /* Evaluate switch function */
734 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
735 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
736 velec = _mm_mul_ps(velec,sw);
737 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
739 /* Update potential sum for this i atom from the interaction with this j atom. */
740 velec = _mm_and_ps(velec,cutoff_mask);
741 velecsum = _mm_add_ps(velecsum,velec);
745 fscal = _mm_and_ps(fscal,cutoff_mask);
747 /* Update vectorial force */
748 fix2 = _mm_macc_ps(dx21,fscal,fix2);
749 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
750 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
752 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
753 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
754 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
758 /**************************
759 * CALCULATE INTERACTIONS *
760 **************************/
762 if (gmx_mm_any_lt(rsq22,rcutoff2))
765 r22 = _mm_mul_ps(rsq22,rinv22);
767 /* EWALD ELECTROSTATICS */
769 /* Analytical PME correction */
770 zeta2 = _mm_mul_ps(beta2,rsq22);
771 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
772 pmecorrF = avx128fma_pmecorrF_f(zeta2);
773 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
774 felec = _mm_mul_ps(qq22,felec);
775 pmecorrV = avx128fma_pmecorrV_f(zeta2);
776 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
777 velec = _mm_mul_ps(qq22,velec);
779 d = _mm_sub_ps(r22,rswitch);
780 d = _mm_max_ps(d,_mm_setzero_ps());
781 d2 = _mm_mul_ps(d,d);
782 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
784 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
786 /* Evaluate switch function */
787 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
788 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
789 velec = _mm_mul_ps(velec,sw);
790 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
792 /* Update potential sum for this i atom from the interaction with this j atom. */
793 velec = _mm_and_ps(velec,cutoff_mask);
794 velecsum = _mm_add_ps(velecsum,velec);
798 fscal = _mm_and_ps(fscal,cutoff_mask);
800 /* Update vectorial force */
801 fix2 = _mm_macc_ps(dx22,fscal,fix2);
802 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
803 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
805 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
806 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
807 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
811 fjptrA = f+j_coord_offsetA;
812 fjptrB = f+j_coord_offsetB;
813 fjptrC = f+j_coord_offsetC;
814 fjptrD = f+j_coord_offsetD;
816 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
817 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
819 /* Inner loop uses 495 flops */
825 /* Get j neighbor index, and coordinate index */
826 jnrlistA = jjnr[jidx];
827 jnrlistB = jjnr[jidx+1];
828 jnrlistC = jjnr[jidx+2];
829 jnrlistD = jjnr[jidx+3];
830 /* Sign of each element will be negative for non-real atoms.
831 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
832 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
834 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
835 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
836 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
837 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
838 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
839 j_coord_offsetA = DIM*jnrA;
840 j_coord_offsetB = DIM*jnrB;
841 j_coord_offsetC = DIM*jnrC;
842 j_coord_offsetD = DIM*jnrD;
844 /* load j atom coordinates */
845 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
846 x+j_coord_offsetC,x+j_coord_offsetD,
847 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
849 /* Calculate displacement vector */
850 dx00 = _mm_sub_ps(ix0,jx0);
851 dy00 = _mm_sub_ps(iy0,jy0);
852 dz00 = _mm_sub_ps(iz0,jz0);
853 dx01 = _mm_sub_ps(ix0,jx1);
854 dy01 = _mm_sub_ps(iy0,jy1);
855 dz01 = _mm_sub_ps(iz0,jz1);
856 dx02 = _mm_sub_ps(ix0,jx2);
857 dy02 = _mm_sub_ps(iy0,jy2);
858 dz02 = _mm_sub_ps(iz0,jz2);
859 dx10 = _mm_sub_ps(ix1,jx0);
860 dy10 = _mm_sub_ps(iy1,jy0);
861 dz10 = _mm_sub_ps(iz1,jz0);
862 dx11 = _mm_sub_ps(ix1,jx1);
863 dy11 = _mm_sub_ps(iy1,jy1);
864 dz11 = _mm_sub_ps(iz1,jz1);
865 dx12 = _mm_sub_ps(ix1,jx2);
866 dy12 = _mm_sub_ps(iy1,jy2);
867 dz12 = _mm_sub_ps(iz1,jz2);
868 dx20 = _mm_sub_ps(ix2,jx0);
869 dy20 = _mm_sub_ps(iy2,jy0);
870 dz20 = _mm_sub_ps(iz2,jz0);
871 dx21 = _mm_sub_ps(ix2,jx1);
872 dy21 = _mm_sub_ps(iy2,jy1);
873 dz21 = _mm_sub_ps(iz2,jz1);
874 dx22 = _mm_sub_ps(ix2,jx2);
875 dy22 = _mm_sub_ps(iy2,jy2);
876 dz22 = _mm_sub_ps(iz2,jz2);
878 /* Calculate squared distance and things based on it */
879 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
880 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
881 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
882 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
883 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
884 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
885 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
886 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
887 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
889 rinv00 = avx128fma_invsqrt_f(rsq00);
890 rinv01 = avx128fma_invsqrt_f(rsq01);
891 rinv02 = avx128fma_invsqrt_f(rsq02);
892 rinv10 = avx128fma_invsqrt_f(rsq10);
893 rinv11 = avx128fma_invsqrt_f(rsq11);
894 rinv12 = avx128fma_invsqrt_f(rsq12);
895 rinv20 = avx128fma_invsqrt_f(rsq20);
896 rinv21 = avx128fma_invsqrt_f(rsq21);
897 rinv22 = avx128fma_invsqrt_f(rsq22);
899 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
900 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
901 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
902 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
903 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
904 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
905 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
906 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
907 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
909 fjx0 = _mm_setzero_ps();
910 fjy0 = _mm_setzero_ps();
911 fjz0 = _mm_setzero_ps();
912 fjx1 = _mm_setzero_ps();
913 fjy1 = _mm_setzero_ps();
914 fjz1 = _mm_setzero_ps();
915 fjx2 = _mm_setzero_ps();
916 fjy2 = _mm_setzero_ps();
917 fjz2 = _mm_setzero_ps();
919 /**************************
920 * CALCULATE INTERACTIONS *
921 **************************/
923 if (gmx_mm_any_lt(rsq00,rcutoff2))
926 r00 = _mm_mul_ps(rsq00,rinv00);
927 r00 = _mm_andnot_ps(dummy_mask,r00);
929 /* EWALD ELECTROSTATICS */
931 /* Analytical PME correction */
932 zeta2 = _mm_mul_ps(beta2,rsq00);
933 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
934 pmecorrF = avx128fma_pmecorrF_f(zeta2);
935 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
936 felec = _mm_mul_ps(qq00,felec);
937 pmecorrV = avx128fma_pmecorrV_f(zeta2);
938 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
939 velec = _mm_mul_ps(qq00,velec);
941 /* LENNARD-JONES DISPERSION/REPULSION */
943 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
944 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
945 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
946 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
947 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
949 d = _mm_sub_ps(r00,rswitch);
950 d = _mm_max_ps(d,_mm_setzero_ps());
951 d2 = _mm_mul_ps(d,d);
952 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
954 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
956 /* Evaluate switch function */
957 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
958 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
959 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
960 velec = _mm_mul_ps(velec,sw);
961 vvdw = _mm_mul_ps(vvdw,sw);
962 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
964 /* Update potential sum for this i atom from the interaction with this j atom. */
965 velec = _mm_and_ps(velec,cutoff_mask);
966 velec = _mm_andnot_ps(dummy_mask,velec);
967 velecsum = _mm_add_ps(velecsum,velec);
968 vvdw = _mm_and_ps(vvdw,cutoff_mask);
969 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
970 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
972 fscal = _mm_add_ps(felec,fvdw);
974 fscal = _mm_and_ps(fscal,cutoff_mask);
976 fscal = _mm_andnot_ps(dummy_mask,fscal);
978 /* Update vectorial force */
979 fix0 = _mm_macc_ps(dx00,fscal,fix0);
980 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
981 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
983 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
984 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
985 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
989 /**************************
990 * CALCULATE INTERACTIONS *
991 **************************/
993 if (gmx_mm_any_lt(rsq01,rcutoff2))
996 r01 = _mm_mul_ps(rsq01,rinv01);
997 r01 = _mm_andnot_ps(dummy_mask,r01);
999 /* EWALD ELECTROSTATICS */
1001 /* Analytical PME correction */
1002 zeta2 = _mm_mul_ps(beta2,rsq01);
1003 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1004 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1005 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1006 felec = _mm_mul_ps(qq01,felec);
1007 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1008 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1009 velec = _mm_mul_ps(qq01,velec);
1011 d = _mm_sub_ps(r01,rswitch);
1012 d = _mm_max_ps(d,_mm_setzero_ps());
1013 d2 = _mm_mul_ps(d,d);
1014 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1016 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1018 /* Evaluate switch function */
1019 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1020 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1021 velec = _mm_mul_ps(velec,sw);
1022 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1024 /* Update potential sum for this i atom from the interaction with this j atom. */
1025 velec = _mm_and_ps(velec,cutoff_mask);
1026 velec = _mm_andnot_ps(dummy_mask,velec);
1027 velecsum = _mm_add_ps(velecsum,velec);
1031 fscal = _mm_and_ps(fscal,cutoff_mask);
1033 fscal = _mm_andnot_ps(dummy_mask,fscal);
1035 /* Update vectorial force */
1036 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1037 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1038 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1040 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1041 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1042 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 if (gmx_mm_any_lt(rsq02,rcutoff2))
1053 r02 = _mm_mul_ps(rsq02,rinv02);
1054 r02 = _mm_andnot_ps(dummy_mask,r02);
1056 /* EWALD ELECTROSTATICS */
1058 /* Analytical PME correction */
1059 zeta2 = _mm_mul_ps(beta2,rsq02);
1060 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1061 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1062 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1063 felec = _mm_mul_ps(qq02,felec);
1064 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1065 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1066 velec = _mm_mul_ps(qq02,velec);
1068 d = _mm_sub_ps(r02,rswitch);
1069 d = _mm_max_ps(d,_mm_setzero_ps());
1070 d2 = _mm_mul_ps(d,d);
1071 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1073 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1075 /* Evaluate switch function */
1076 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1077 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1078 velec = _mm_mul_ps(velec,sw);
1079 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1081 /* Update potential sum for this i atom from the interaction with this j atom. */
1082 velec = _mm_and_ps(velec,cutoff_mask);
1083 velec = _mm_andnot_ps(dummy_mask,velec);
1084 velecsum = _mm_add_ps(velecsum,velec);
1088 fscal = _mm_and_ps(fscal,cutoff_mask);
1090 fscal = _mm_andnot_ps(dummy_mask,fscal);
1092 /* Update vectorial force */
1093 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1094 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1095 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1097 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1098 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1099 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 if (gmx_mm_any_lt(rsq10,rcutoff2))
1110 r10 = _mm_mul_ps(rsq10,rinv10);
1111 r10 = _mm_andnot_ps(dummy_mask,r10);
1113 /* EWALD ELECTROSTATICS */
1115 /* Analytical PME correction */
1116 zeta2 = _mm_mul_ps(beta2,rsq10);
1117 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1118 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1119 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1120 felec = _mm_mul_ps(qq10,felec);
1121 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1122 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1123 velec = _mm_mul_ps(qq10,velec);
1125 d = _mm_sub_ps(r10,rswitch);
1126 d = _mm_max_ps(d,_mm_setzero_ps());
1127 d2 = _mm_mul_ps(d,d);
1128 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1130 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1132 /* Evaluate switch function */
1133 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1134 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1135 velec = _mm_mul_ps(velec,sw);
1136 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1138 /* Update potential sum for this i atom from the interaction with this j atom. */
1139 velec = _mm_and_ps(velec,cutoff_mask);
1140 velec = _mm_andnot_ps(dummy_mask,velec);
1141 velecsum = _mm_add_ps(velecsum,velec);
1145 fscal = _mm_and_ps(fscal,cutoff_mask);
1147 fscal = _mm_andnot_ps(dummy_mask,fscal);
1149 /* Update vectorial force */
1150 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1151 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1152 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1154 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1155 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1156 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1160 /**************************
1161 * CALCULATE INTERACTIONS *
1162 **************************/
1164 if (gmx_mm_any_lt(rsq11,rcutoff2))
1167 r11 = _mm_mul_ps(rsq11,rinv11);
1168 r11 = _mm_andnot_ps(dummy_mask,r11);
1170 /* EWALD ELECTROSTATICS */
1172 /* Analytical PME correction */
1173 zeta2 = _mm_mul_ps(beta2,rsq11);
1174 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1175 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1176 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1177 felec = _mm_mul_ps(qq11,felec);
1178 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1179 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1180 velec = _mm_mul_ps(qq11,velec);
1182 d = _mm_sub_ps(r11,rswitch);
1183 d = _mm_max_ps(d,_mm_setzero_ps());
1184 d2 = _mm_mul_ps(d,d);
1185 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1187 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1189 /* Evaluate switch function */
1190 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1191 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1192 velec = _mm_mul_ps(velec,sw);
1193 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1195 /* Update potential sum for this i atom from the interaction with this j atom. */
1196 velec = _mm_and_ps(velec,cutoff_mask);
1197 velec = _mm_andnot_ps(dummy_mask,velec);
1198 velecsum = _mm_add_ps(velecsum,velec);
1202 fscal = _mm_and_ps(fscal,cutoff_mask);
1204 fscal = _mm_andnot_ps(dummy_mask,fscal);
1206 /* Update vectorial force */
1207 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1208 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1209 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1211 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1212 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1213 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1217 /**************************
1218 * CALCULATE INTERACTIONS *
1219 **************************/
1221 if (gmx_mm_any_lt(rsq12,rcutoff2))
1224 r12 = _mm_mul_ps(rsq12,rinv12);
1225 r12 = _mm_andnot_ps(dummy_mask,r12);
1227 /* EWALD ELECTROSTATICS */
1229 /* Analytical PME correction */
1230 zeta2 = _mm_mul_ps(beta2,rsq12);
1231 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1232 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1233 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1234 felec = _mm_mul_ps(qq12,felec);
1235 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1236 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1237 velec = _mm_mul_ps(qq12,velec);
1239 d = _mm_sub_ps(r12,rswitch);
1240 d = _mm_max_ps(d,_mm_setzero_ps());
1241 d2 = _mm_mul_ps(d,d);
1242 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1244 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1246 /* Evaluate switch function */
1247 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1248 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1249 velec = _mm_mul_ps(velec,sw);
1250 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1252 /* Update potential sum for this i atom from the interaction with this j atom. */
1253 velec = _mm_and_ps(velec,cutoff_mask);
1254 velec = _mm_andnot_ps(dummy_mask,velec);
1255 velecsum = _mm_add_ps(velecsum,velec);
1259 fscal = _mm_and_ps(fscal,cutoff_mask);
1261 fscal = _mm_andnot_ps(dummy_mask,fscal);
1263 /* Update vectorial force */
1264 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1265 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1266 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1268 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1269 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1270 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1274 /**************************
1275 * CALCULATE INTERACTIONS *
1276 **************************/
1278 if (gmx_mm_any_lt(rsq20,rcutoff2))
1281 r20 = _mm_mul_ps(rsq20,rinv20);
1282 r20 = _mm_andnot_ps(dummy_mask,r20);
1284 /* EWALD ELECTROSTATICS */
1286 /* Analytical PME correction */
1287 zeta2 = _mm_mul_ps(beta2,rsq20);
1288 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1289 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1290 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1291 felec = _mm_mul_ps(qq20,felec);
1292 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1293 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1294 velec = _mm_mul_ps(qq20,velec);
1296 d = _mm_sub_ps(r20,rswitch);
1297 d = _mm_max_ps(d,_mm_setzero_ps());
1298 d2 = _mm_mul_ps(d,d);
1299 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1301 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1303 /* Evaluate switch function */
1304 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1305 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1306 velec = _mm_mul_ps(velec,sw);
1307 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1309 /* Update potential sum for this i atom from the interaction with this j atom. */
1310 velec = _mm_and_ps(velec,cutoff_mask);
1311 velec = _mm_andnot_ps(dummy_mask,velec);
1312 velecsum = _mm_add_ps(velecsum,velec);
1316 fscal = _mm_and_ps(fscal,cutoff_mask);
1318 fscal = _mm_andnot_ps(dummy_mask,fscal);
1320 /* Update vectorial force */
1321 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1322 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1323 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1325 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1326 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1327 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1331 /**************************
1332 * CALCULATE INTERACTIONS *
1333 **************************/
1335 if (gmx_mm_any_lt(rsq21,rcutoff2))
1338 r21 = _mm_mul_ps(rsq21,rinv21);
1339 r21 = _mm_andnot_ps(dummy_mask,r21);
1341 /* EWALD ELECTROSTATICS */
1343 /* Analytical PME correction */
1344 zeta2 = _mm_mul_ps(beta2,rsq21);
1345 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1346 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1347 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1348 felec = _mm_mul_ps(qq21,felec);
1349 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1350 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1351 velec = _mm_mul_ps(qq21,velec);
1353 d = _mm_sub_ps(r21,rswitch);
1354 d = _mm_max_ps(d,_mm_setzero_ps());
1355 d2 = _mm_mul_ps(d,d);
1356 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1358 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1360 /* Evaluate switch function */
1361 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1362 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1363 velec = _mm_mul_ps(velec,sw);
1364 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1366 /* Update potential sum for this i atom from the interaction with this j atom. */
1367 velec = _mm_and_ps(velec,cutoff_mask);
1368 velec = _mm_andnot_ps(dummy_mask,velec);
1369 velecsum = _mm_add_ps(velecsum,velec);
1373 fscal = _mm_and_ps(fscal,cutoff_mask);
1375 fscal = _mm_andnot_ps(dummy_mask,fscal);
1377 /* Update vectorial force */
1378 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1379 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1380 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1382 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1383 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1384 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1388 /**************************
1389 * CALCULATE INTERACTIONS *
1390 **************************/
1392 if (gmx_mm_any_lt(rsq22,rcutoff2))
1395 r22 = _mm_mul_ps(rsq22,rinv22);
1396 r22 = _mm_andnot_ps(dummy_mask,r22);
1398 /* EWALD ELECTROSTATICS */
1400 /* Analytical PME correction */
1401 zeta2 = _mm_mul_ps(beta2,rsq22);
1402 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1403 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1404 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1405 felec = _mm_mul_ps(qq22,felec);
1406 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1407 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1408 velec = _mm_mul_ps(qq22,velec);
1410 d = _mm_sub_ps(r22,rswitch);
1411 d = _mm_max_ps(d,_mm_setzero_ps());
1412 d2 = _mm_mul_ps(d,d);
1413 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1415 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1417 /* Evaluate switch function */
1418 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1419 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1420 velec = _mm_mul_ps(velec,sw);
1421 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1423 /* Update potential sum for this i atom from the interaction with this j atom. */
1424 velec = _mm_and_ps(velec,cutoff_mask);
1425 velec = _mm_andnot_ps(dummy_mask,velec);
1426 velecsum = _mm_add_ps(velecsum,velec);
1430 fscal = _mm_and_ps(fscal,cutoff_mask);
1432 fscal = _mm_andnot_ps(dummy_mask,fscal);
1434 /* Update vectorial force */
1435 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1436 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1437 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1439 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1440 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1441 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1445 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1446 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1447 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1448 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1450 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1451 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1453 /* Inner loop uses 504 flops */
1456 /* End of innermost loop */
1458 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1459 f+i_coord_offset,fshift+i_shift_offset);
1462 /* Update potential energies */
1463 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1464 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1466 /* Increment number of inner iterations */
1467 inneriter += j_index_end - j_index_start;
1469 /* Outer loop uses 20 flops */
1472 /* Increment number of outer iterations */
1475 /* Update outer/inner flops */
1477 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*504);
1480 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1481 * Electrostatics interaction: Ewald
1482 * VdW interaction: LennardJones
1483 * Geometry: Water3-Water3
1484 * Calculate force/pot: Force
1487 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1488 (t_nblist * gmx_restrict nlist,
1489 rvec * gmx_restrict xx,
1490 rvec * gmx_restrict ff,
1491 struct t_forcerec * gmx_restrict fr,
1492 t_mdatoms * gmx_restrict mdatoms,
1493 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1494 t_nrnb * gmx_restrict nrnb)
1496 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1497 * just 0 for non-waters.
1498 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1499 * jnr indices corresponding to data put in the four positions in the SIMD register.
1501 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1502 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1503 int jnrA,jnrB,jnrC,jnrD;
1504 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1505 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1506 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1507 real rcutoff_scalar;
1508 real *shiftvec,*fshift,*x,*f;
1509 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1510 real scratch[4*DIM];
1511 __m128 fscal,rcutoff,rcutoff2,jidxall;
1513 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1515 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1517 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1518 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1519 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1520 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1521 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1522 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1523 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1524 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1525 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1526 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1527 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1528 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1529 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1530 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1531 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1532 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1533 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1536 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1539 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1540 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1542 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1543 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1545 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1546 real rswitch_scalar,d_scalar;
1547 __m128 dummy_mask,cutoff_mask;
1548 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1549 __m128 one = _mm_set1_ps(1.0);
1550 __m128 two = _mm_set1_ps(2.0);
1556 jindex = nlist->jindex;
1558 shiftidx = nlist->shift;
1560 shiftvec = fr->shift_vec[0];
1561 fshift = fr->fshift[0];
1562 facel = _mm_set1_ps(fr->ic->epsfac);
1563 charge = mdatoms->chargeA;
1564 nvdwtype = fr->ntype;
1565 vdwparam = fr->nbfp;
1566 vdwtype = mdatoms->typeA;
1568 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1569 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1570 beta2 = _mm_mul_ps(beta,beta);
1571 beta3 = _mm_mul_ps(beta,beta2);
1572 ewtab = fr->ic->tabq_coul_FDV0;
1573 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1574 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1576 /* Setup water-specific parameters */
1577 inr = nlist->iinr[0];
1578 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1579 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1580 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1581 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1583 jq0 = _mm_set1_ps(charge[inr+0]);
1584 jq1 = _mm_set1_ps(charge[inr+1]);
1585 jq2 = _mm_set1_ps(charge[inr+2]);
1586 vdwjidx0A = 2*vdwtype[inr+0];
1587 qq00 = _mm_mul_ps(iq0,jq0);
1588 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1589 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1590 qq01 = _mm_mul_ps(iq0,jq1);
1591 qq02 = _mm_mul_ps(iq0,jq2);
1592 qq10 = _mm_mul_ps(iq1,jq0);
1593 qq11 = _mm_mul_ps(iq1,jq1);
1594 qq12 = _mm_mul_ps(iq1,jq2);
1595 qq20 = _mm_mul_ps(iq2,jq0);
1596 qq21 = _mm_mul_ps(iq2,jq1);
1597 qq22 = _mm_mul_ps(iq2,jq2);
1599 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1600 rcutoff_scalar = fr->ic->rcoulomb;
1601 rcutoff = _mm_set1_ps(rcutoff_scalar);
1602 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1604 rswitch_scalar = fr->ic->rcoulomb_switch;
1605 rswitch = _mm_set1_ps(rswitch_scalar);
1606 /* Setup switch parameters */
1607 d_scalar = rcutoff_scalar-rswitch_scalar;
1608 d = _mm_set1_ps(d_scalar);
1609 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1610 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1611 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1612 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1613 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1614 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1616 /* Avoid stupid compiler warnings */
1617 jnrA = jnrB = jnrC = jnrD = 0;
1618 j_coord_offsetA = 0;
1619 j_coord_offsetB = 0;
1620 j_coord_offsetC = 0;
1621 j_coord_offsetD = 0;
1626 for(iidx=0;iidx<4*DIM;iidx++)
1628 scratch[iidx] = 0.0;
1631 /* Start outer loop over neighborlists */
1632 for(iidx=0; iidx<nri; iidx++)
1634 /* Load shift vector for this list */
1635 i_shift_offset = DIM*shiftidx[iidx];
1637 /* Load limits for loop over neighbors */
1638 j_index_start = jindex[iidx];
1639 j_index_end = jindex[iidx+1];
1641 /* Get outer coordinate index */
1643 i_coord_offset = DIM*inr;
1645 /* Load i particle coords and add shift vector */
1646 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1647 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1649 fix0 = _mm_setzero_ps();
1650 fiy0 = _mm_setzero_ps();
1651 fiz0 = _mm_setzero_ps();
1652 fix1 = _mm_setzero_ps();
1653 fiy1 = _mm_setzero_ps();
1654 fiz1 = _mm_setzero_ps();
1655 fix2 = _mm_setzero_ps();
1656 fiy2 = _mm_setzero_ps();
1657 fiz2 = _mm_setzero_ps();
1659 /* Start inner kernel loop */
1660 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1663 /* Get j neighbor index, and coordinate index */
1665 jnrB = jjnr[jidx+1];
1666 jnrC = jjnr[jidx+2];
1667 jnrD = jjnr[jidx+3];
1668 j_coord_offsetA = DIM*jnrA;
1669 j_coord_offsetB = DIM*jnrB;
1670 j_coord_offsetC = DIM*jnrC;
1671 j_coord_offsetD = DIM*jnrD;
1673 /* load j atom coordinates */
1674 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1675 x+j_coord_offsetC,x+j_coord_offsetD,
1676 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1678 /* Calculate displacement vector */
1679 dx00 = _mm_sub_ps(ix0,jx0);
1680 dy00 = _mm_sub_ps(iy0,jy0);
1681 dz00 = _mm_sub_ps(iz0,jz0);
1682 dx01 = _mm_sub_ps(ix0,jx1);
1683 dy01 = _mm_sub_ps(iy0,jy1);
1684 dz01 = _mm_sub_ps(iz0,jz1);
1685 dx02 = _mm_sub_ps(ix0,jx2);
1686 dy02 = _mm_sub_ps(iy0,jy2);
1687 dz02 = _mm_sub_ps(iz0,jz2);
1688 dx10 = _mm_sub_ps(ix1,jx0);
1689 dy10 = _mm_sub_ps(iy1,jy0);
1690 dz10 = _mm_sub_ps(iz1,jz0);
1691 dx11 = _mm_sub_ps(ix1,jx1);
1692 dy11 = _mm_sub_ps(iy1,jy1);
1693 dz11 = _mm_sub_ps(iz1,jz1);
1694 dx12 = _mm_sub_ps(ix1,jx2);
1695 dy12 = _mm_sub_ps(iy1,jy2);
1696 dz12 = _mm_sub_ps(iz1,jz2);
1697 dx20 = _mm_sub_ps(ix2,jx0);
1698 dy20 = _mm_sub_ps(iy2,jy0);
1699 dz20 = _mm_sub_ps(iz2,jz0);
1700 dx21 = _mm_sub_ps(ix2,jx1);
1701 dy21 = _mm_sub_ps(iy2,jy1);
1702 dz21 = _mm_sub_ps(iz2,jz1);
1703 dx22 = _mm_sub_ps(ix2,jx2);
1704 dy22 = _mm_sub_ps(iy2,jy2);
1705 dz22 = _mm_sub_ps(iz2,jz2);
1707 /* Calculate squared distance and things based on it */
1708 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1709 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1710 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1711 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1712 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1713 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1714 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1715 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1716 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1718 rinv00 = avx128fma_invsqrt_f(rsq00);
1719 rinv01 = avx128fma_invsqrt_f(rsq01);
1720 rinv02 = avx128fma_invsqrt_f(rsq02);
1721 rinv10 = avx128fma_invsqrt_f(rsq10);
1722 rinv11 = avx128fma_invsqrt_f(rsq11);
1723 rinv12 = avx128fma_invsqrt_f(rsq12);
1724 rinv20 = avx128fma_invsqrt_f(rsq20);
1725 rinv21 = avx128fma_invsqrt_f(rsq21);
1726 rinv22 = avx128fma_invsqrt_f(rsq22);
1728 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1729 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1730 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1731 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1732 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1733 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1734 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1735 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1736 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1738 fjx0 = _mm_setzero_ps();
1739 fjy0 = _mm_setzero_ps();
1740 fjz0 = _mm_setzero_ps();
1741 fjx1 = _mm_setzero_ps();
1742 fjy1 = _mm_setzero_ps();
1743 fjz1 = _mm_setzero_ps();
1744 fjx2 = _mm_setzero_ps();
1745 fjy2 = _mm_setzero_ps();
1746 fjz2 = _mm_setzero_ps();
1748 /**************************
1749 * CALCULATE INTERACTIONS *
1750 **************************/
1752 if (gmx_mm_any_lt(rsq00,rcutoff2))
1755 r00 = _mm_mul_ps(rsq00,rinv00);
1757 /* EWALD ELECTROSTATICS */
1759 /* Analytical PME correction */
1760 zeta2 = _mm_mul_ps(beta2,rsq00);
1761 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1762 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1763 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1764 felec = _mm_mul_ps(qq00,felec);
1765 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1766 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
1767 velec = _mm_mul_ps(qq00,velec);
1769 /* LENNARD-JONES DISPERSION/REPULSION */
1771 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1772 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1773 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1774 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1775 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1777 d = _mm_sub_ps(r00,rswitch);
1778 d = _mm_max_ps(d,_mm_setzero_ps());
1779 d2 = _mm_mul_ps(d,d);
1780 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1782 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1784 /* Evaluate switch function */
1785 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1786 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1787 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1788 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1790 fscal = _mm_add_ps(felec,fvdw);
1792 fscal = _mm_and_ps(fscal,cutoff_mask);
1794 /* Update vectorial force */
1795 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1796 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1797 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1799 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1800 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1801 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1805 /**************************
1806 * CALCULATE INTERACTIONS *
1807 **************************/
1809 if (gmx_mm_any_lt(rsq01,rcutoff2))
1812 r01 = _mm_mul_ps(rsq01,rinv01);
1814 /* EWALD ELECTROSTATICS */
1816 /* Analytical PME correction */
1817 zeta2 = _mm_mul_ps(beta2,rsq01);
1818 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1819 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1820 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1821 felec = _mm_mul_ps(qq01,felec);
1822 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1823 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1824 velec = _mm_mul_ps(qq01,velec);
1826 d = _mm_sub_ps(r01,rswitch);
1827 d = _mm_max_ps(d,_mm_setzero_ps());
1828 d2 = _mm_mul_ps(d,d);
1829 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1831 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1833 /* Evaluate switch function */
1834 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1835 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1836 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1840 fscal = _mm_and_ps(fscal,cutoff_mask);
1842 /* Update vectorial force */
1843 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1844 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1845 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1847 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1848 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1849 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 if (gmx_mm_any_lt(rsq02,rcutoff2))
1860 r02 = _mm_mul_ps(rsq02,rinv02);
1862 /* EWALD ELECTROSTATICS */
1864 /* Analytical PME correction */
1865 zeta2 = _mm_mul_ps(beta2,rsq02);
1866 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1867 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1868 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1869 felec = _mm_mul_ps(qq02,felec);
1870 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1871 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1872 velec = _mm_mul_ps(qq02,velec);
1874 d = _mm_sub_ps(r02,rswitch);
1875 d = _mm_max_ps(d,_mm_setzero_ps());
1876 d2 = _mm_mul_ps(d,d);
1877 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1879 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1881 /* Evaluate switch function */
1882 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1883 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1884 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1888 fscal = _mm_and_ps(fscal,cutoff_mask);
1890 /* Update vectorial force */
1891 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1892 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1893 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1895 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1896 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1897 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1901 /**************************
1902 * CALCULATE INTERACTIONS *
1903 **************************/
1905 if (gmx_mm_any_lt(rsq10,rcutoff2))
1908 r10 = _mm_mul_ps(rsq10,rinv10);
1910 /* EWALD ELECTROSTATICS */
1912 /* Analytical PME correction */
1913 zeta2 = _mm_mul_ps(beta2,rsq10);
1914 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1915 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1916 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1917 felec = _mm_mul_ps(qq10,felec);
1918 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1919 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1920 velec = _mm_mul_ps(qq10,velec);
1922 d = _mm_sub_ps(r10,rswitch);
1923 d = _mm_max_ps(d,_mm_setzero_ps());
1924 d2 = _mm_mul_ps(d,d);
1925 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1927 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1929 /* Evaluate switch function */
1930 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1931 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1932 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1936 fscal = _mm_and_ps(fscal,cutoff_mask);
1938 /* Update vectorial force */
1939 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1940 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1941 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1943 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1944 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1945 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 if (gmx_mm_any_lt(rsq11,rcutoff2))
1956 r11 = _mm_mul_ps(rsq11,rinv11);
1958 /* EWALD ELECTROSTATICS */
1960 /* Analytical PME correction */
1961 zeta2 = _mm_mul_ps(beta2,rsq11);
1962 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1963 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1964 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1965 felec = _mm_mul_ps(qq11,felec);
1966 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1967 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1968 velec = _mm_mul_ps(qq11,velec);
1970 d = _mm_sub_ps(r11,rswitch);
1971 d = _mm_max_ps(d,_mm_setzero_ps());
1972 d2 = _mm_mul_ps(d,d);
1973 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1975 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1977 /* Evaluate switch function */
1978 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1979 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1980 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1984 fscal = _mm_and_ps(fscal,cutoff_mask);
1986 /* Update vectorial force */
1987 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1988 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1989 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1991 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1992 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1993 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1997 /**************************
1998 * CALCULATE INTERACTIONS *
1999 **************************/
2001 if (gmx_mm_any_lt(rsq12,rcutoff2))
2004 r12 = _mm_mul_ps(rsq12,rinv12);
2006 /* EWALD ELECTROSTATICS */
2008 /* Analytical PME correction */
2009 zeta2 = _mm_mul_ps(beta2,rsq12);
2010 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2011 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2012 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2013 felec = _mm_mul_ps(qq12,felec);
2014 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2015 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2016 velec = _mm_mul_ps(qq12,velec);
2018 d = _mm_sub_ps(r12,rswitch);
2019 d = _mm_max_ps(d,_mm_setzero_ps());
2020 d2 = _mm_mul_ps(d,d);
2021 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2023 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2025 /* Evaluate switch function */
2026 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2027 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2028 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2032 fscal = _mm_and_ps(fscal,cutoff_mask);
2034 /* Update vectorial force */
2035 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2036 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2037 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2039 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2040 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2041 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2045 /**************************
2046 * CALCULATE INTERACTIONS *
2047 **************************/
2049 if (gmx_mm_any_lt(rsq20,rcutoff2))
2052 r20 = _mm_mul_ps(rsq20,rinv20);
2054 /* EWALD ELECTROSTATICS */
2056 /* Analytical PME correction */
2057 zeta2 = _mm_mul_ps(beta2,rsq20);
2058 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
2059 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2060 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2061 felec = _mm_mul_ps(qq20,felec);
2062 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2063 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2064 velec = _mm_mul_ps(qq20,velec);
2066 d = _mm_sub_ps(r20,rswitch);
2067 d = _mm_max_ps(d,_mm_setzero_ps());
2068 d2 = _mm_mul_ps(d,d);
2069 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2071 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2073 /* Evaluate switch function */
2074 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2075 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2076 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2080 fscal = _mm_and_ps(fscal,cutoff_mask);
2082 /* Update vectorial force */
2083 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2084 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2085 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2087 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2088 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2089 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2093 /**************************
2094 * CALCULATE INTERACTIONS *
2095 **************************/
2097 if (gmx_mm_any_lt(rsq21,rcutoff2))
2100 r21 = _mm_mul_ps(rsq21,rinv21);
2102 /* EWALD ELECTROSTATICS */
2104 /* Analytical PME correction */
2105 zeta2 = _mm_mul_ps(beta2,rsq21);
2106 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2107 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2108 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2109 felec = _mm_mul_ps(qq21,felec);
2110 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2111 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2112 velec = _mm_mul_ps(qq21,velec);
2114 d = _mm_sub_ps(r21,rswitch);
2115 d = _mm_max_ps(d,_mm_setzero_ps());
2116 d2 = _mm_mul_ps(d,d);
2117 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2119 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2121 /* Evaluate switch function */
2122 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2123 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2124 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2128 fscal = _mm_and_ps(fscal,cutoff_mask);
2130 /* Update vectorial force */
2131 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2132 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2133 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2135 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2136 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2137 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2141 /**************************
2142 * CALCULATE INTERACTIONS *
2143 **************************/
2145 if (gmx_mm_any_lt(rsq22,rcutoff2))
2148 r22 = _mm_mul_ps(rsq22,rinv22);
2150 /* EWALD ELECTROSTATICS */
2152 /* Analytical PME correction */
2153 zeta2 = _mm_mul_ps(beta2,rsq22);
2154 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2155 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2156 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2157 felec = _mm_mul_ps(qq22,felec);
2158 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2159 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2160 velec = _mm_mul_ps(qq22,velec);
2162 d = _mm_sub_ps(r22,rswitch);
2163 d = _mm_max_ps(d,_mm_setzero_ps());
2164 d2 = _mm_mul_ps(d,d);
2165 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2167 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2169 /* Evaluate switch function */
2170 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2171 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2172 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2176 fscal = _mm_and_ps(fscal,cutoff_mask);
2178 /* Update vectorial force */
2179 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2180 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2181 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2183 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2184 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2185 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2189 fjptrA = f+j_coord_offsetA;
2190 fjptrB = f+j_coord_offsetB;
2191 fjptrC = f+j_coord_offsetC;
2192 fjptrD = f+j_coord_offsetD;
2194 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2195 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2197 /* Inner loop uses 465 flops */
2200 if(jidx<j_index_end)
2203 /* Get j neighbor index, and coordinate index */
2204 jnrlistA = jjnr[jidx];
2205 jnrlistB = jjnr[jidx+1];
2206 jnrlistC = jjnr[jidx+2];
2207 jnrlistD = jjnr[jidx+3];
2208 /* Sign of each element will be negative for non-real atoms.
2209 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2210 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2212 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2213 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2214 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2215 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2216 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2217 j_coord_offsetA = DIM*jnrA;
2218 j_coord_offsetB = DIM*jnrB;
2219 j_coord_offsetC = DIM*jnrC;
2220 j_coord_offsetD = DIM*jnrD;
2222 /* load j atom coordinates */
2223 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2224 x+j_coord_offsetC,x+j_coord_offsetD,
2225 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2227 /* Calculate displacement vector */
2228 dx00 = _mm_sub_ps(ix0,jx0);
2229 dy00 = _mm_sub_ps(iy0,jy0);
2230 dz00 = _mm_sub_ps(iz0,jz0);
2231 dx01 = _mm_sub_ps(ix0,jx1);
2232 dy01 = _mm_sub_ps(iy0,jy1);
2233 dz01 = _mm_sub_ps(iz0,jz1);
2234 dx02 = _mm_sub_ps(ix0,jx2);
2235 dy02 = _mm_sub_ps(iy0,jy2);
2236 dz02 = _mm_sub_ps(iz0,jz2);
2237 dx10 = _mm_sub_ps(ix1,jx0);
2238 dy10 = _mm_sub_ps(iy1,jy0);
2239 dz10 = _mm_sub_ps(iz1,jz0);
2240 dx11 = _mm_sub_ps(ix1,jx1);
2241 dy11 = _mm_sub_ps(iy1,jy1);
2242 dz11 = _mm_sub_ps(iz1,jz1);
2243 dx12 = _mm_sub_ps(ix1,jx2);
2244 dy12 = _mm_sub_ps(iy1,jy2);
2245 dz12 = _mm_sub_ps(iz1,jz2);
2246 dx20 = _mm_sub_ps(ix2,jx0);
2247 dy20 = _mm_sub_ps(iy2,jy0);
2248 dz20 = _mm_sub_ps(iz2,jz0);
2249 dx21 = _mm_sub_ps(ix2,jx1);
2250 dy21 = _mm_sub_ps(iy2,jy1);
2251 dz21 = _mm_sub_ps(iz2,jz1);
2252 dx22 = _mm_sub_ps(ix2,jx2);
2253 dy22 = _mm_sub_ps(iy2,jy2);
2254 dz22 = _mm_sub_ps(iz2,jz2);
2256 /* Calculate squared distance and things based on it */
2257 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2258 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2259 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2260 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2261 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2262 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2263 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2264 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2265 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2267 rinv00 = avx128fma_invsqrt_f(rsq00);
2268 rinv01 = avx128fma_invsqrt_f(rsq01);
2269 rinv02 = avx128fma_invsqrt_f(rsq02);
2270 rinv10 = avx128fma_invsqrt_f(rsq10);
2271 rinv11 = avx128fma_invsqrt_f(rsq11);
2272 rinv12 = avx128fma_invsqrt_f(rsq12);
2273 rinv20 = avx128fma_invsqrt_f(rsq20);
2274 rinv21 = avx128fma_invsqrt_f(rsq21);
2275 rinv22 = avx128fma_invsqrt_f(rsq22);
2277 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2278 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2279 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2280 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2281 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2282 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2283 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2284 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2285 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2287 fjx0 = _mm_setzero_ps();
2288 fjy0 = _mm_setzero_ps();
2289 fjz0 = _mm_setzero_ps();
2290 fjx1 = _mm_setzero_ps();
2291 fjy1 = _mm_setzero_ps();
2292 fjz1 = _mm_setzero_ps();
2293 fjx2 = _mm_setzero_ps();
2294 fjy2 = _mm_setzero_ps();
2295 fjz2 = _mm_setzero_ps();
2297 /**************************
2298 * CALCULATE INTERACTIONS *
2299 **************************/
2301 if (gmx_mm_any_lt(rsq00,rcutoff2))
2304 r00 = _mm_mul_ps(rsq00,rinv00);
2305 r00 = _mm_andnot_ps(dummy_mask,r00);
2307 /* EWALD ELECTROSTATICS */
2309 /* Analytical PME correction */
2310 zeta2 = _mm_mul_ps(beta2,rsq00);
2311 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
2312 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2313 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2314 felec = _mm_mul_ps(qq00,felec);
2315 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2316 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
2317 velec = _mm_mul_ps(qq00,velec);
2319 /* LENNARD-JONES DISPERSION/REPULSION */
2321 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2322 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2323 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2324 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
2325 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2327 d = _mm_sub_ps(r00,rswitch);
2328 d = _mm_max_ps(d,_mm_setzero_ps());
2329 d2 = _mm_mul_ps(d,d);
2330 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2332 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2334 /* Evaluate switch function */
2335 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2336 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2337 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2338 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2340 fscal = _mm_add_ps(felec,fvdw);
2342 fscal = _mm_and_ps(fscal,cutoff_mask);
2344 fscal = _mm_andnot_ps(dummy_mask,fscal);
2346 /* Update vectorial force */
2347 fix0 = _mm_macc_ps(dx00,fscal,fix0);
2348 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
2349 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
2351 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
2352 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
2353 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
2357 /**************************
2358 * CALCULATE INTERACTIONS *
2359 **************************/
2361 if (gmx_mm_any_lt(rsq01,rcutoff2))
2364 r01 = _mm_mul_ps(rsq01,rinv01);
2365 r01 = _mm_andnot_ps(dummy_mask,r01);
2367 /* EWALD ELECTROSTATICS */
2369 /* Analytical PME correction */
2370 zeta2 = _mm_mul_ps(beta2,rsq01);
2371 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
2372 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2373 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2374 felec = _mm_mul_ps(qq01,felec);
2375 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2376 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
2377 velec = _mm_mul_ps(qq01,velec);
2379 d = _mm_sub_ps(r01,rswitch);
2380 d = _mm_max_ps(d,_mm_setzero_ps());
2381 d2 = _mm_mul_ps(d,d);
2382 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2384 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2386 /* Evaluate switch function */
2387 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2388 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2389 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2393 fscal = _mm_and_ps(fscal,cutoff_mask);
2395 fscal = _mm_andnot_ps(dummy_mask,fscal);
2397 /* Update vectorial force */
2398 fix0 = _mm_macc_ps(dx01,fscal,fix0);
2399 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
2400 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
2402 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
2403 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
2404 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
2408 /**************************
2409 * CALCULATE INTERACTIONS *
2410 **************************/
2412 if (gmx_mm_any_lt(rsq02,rcutoff2))
2415 r02 = _mm_mul_ps(rsq02,rinv02);
2416 r02 = _mm_andnot_ps(dummy_mask,r02);
2418 /* EWALD ELECTROSTATICS */
2420 /* Analytical PME correction */
2421 zeta2 = _mm_mul_ps(beta2,rsq02);
2422 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
2423 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2424 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2425 felec = _mm_mul_ps(qq02,felec);
2426 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2427 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
2428 velec = _mm_mul_ps(qq02,velec);
2430 d = _mm_sub_ps(r02,rswitch);
2431 d = _mm_max_ps(d,_mm_setzero_ps());
2432 d2 = _mm_mul_ps(d,d);
2433 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2435 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2437 /* Evaluate switch function */
2438 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2439 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2440 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2444 fscal = _mm_and_ps(fscal,cutoff_mask);
2446 fscal = _mm_andnot_ps(dummy_mask,fscal);
2448 /* Update vectorial force */
2449 fix0 = _mm_macc_ps(dx02,fscal,fix0);
2450 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
2451 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
2453 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
2454 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
2455 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
2459 /**************************
2460 * CALCULATE INTERACTIONS *
2461 **************************/
2463 if (gmx_mm_any_lt(rsq10,rcutoff2))
2466 r10 = _mm_mul_ps(rsq10,rinv10);
2467 r10 = _mm_andnot_ps(dummy_mask,r10);
2469 /* EWALD ELECTROSTATICS */
2471 /* Analytical PME correction */
2472 zeta2 = _mm_mul_ps(beta2,rsq10);
2473 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
2474 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2475 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2476 felec = _mm_mul_ps(qq10,felec);
2477 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2478 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
2479 velec = _mm_mul_ps(qq10,velec);
2481 d = _mm_sub_ps(r10,rswitch);
2482 d = _mm_max_ps(d,_mm_setzero_ps());
2483 d2 = _mm_mul_ps(d,d);
2484 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2486 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2488 /* Evaluate switch function */
2489 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2490 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2491 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2495 fscal = _mm_and_ps(fscal,cutoff_mask);
2497 fscal = _mm_andnot_ps(dummy_mask,fscal);
2499 /* Update vectorial force */
2500 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2501 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2502 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2504 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2505 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2506 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2510 /**************************
2511 * CALCULATE INTERACTIONS *
2512 **************************/
2514 if (gmx_mm_any_lt(rsq11,rcutoff2))
2517 r11 = _mm_mul_ps(rsq11,rinv11);
2518 r11 = _mm_andnot_ps(dummy_mask,r11);
2520 /* EWALD ELECTROSTATICS */
2522 /* Analytical PME correction */
2523 zeta2 = _mm_mul_ps(beta2,rsq11);
2524 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2525 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2526 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2527 felec = _mm_mul_ps(qq11,felec);
2528 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2529 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2530 velec = _mm_mul_ps(qq11,velec);
2532 d = _mm_sub_ps(r11,rswitch);
2533 d = _mm_max_ps(d,_mm_setzero_ps());
2534 d2 = _mm_mul_ps(d,d);
2535 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2537 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2539 /* Evaluate switch function */
2540 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2541 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2542 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2546 fscal = _mm_and_ps(fscal,cutoff_mask);
2548 fscal = _mm_andnot_ps(dummy_mask,fscal);
2550 /* Update vectorial force */
2551 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2552 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2553 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2555 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2556 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2557 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2561 /**************************
2562 * CALCULATE INTERACTIONS *
2563 **************************/
2565 if (gmx_mm_any_lt(rsq12,rcutoff2))
2568 r12 = _mm_mul_ps(rsq12,rinv12);
2569 r12 = _mm_andnot_ps(dummy_mask,r12);
2571 /* EWALD ELECTROSTATICS */
2573 /* Analytical PME correction */
2574 zeta2 = _mm_mul_ps(beta2,rsq12);
2575 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2576 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2577 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2578 felec = _mm_mul_ps(qq12,felec);
2579 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2580 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2581 velec = _mm_mul_ps(qq12,velec);
2583 d = _mm_sub_ps(r12,rswitch);
2584 d = _mm_max_ps(d,_mm_setzero_ps());
2585 d2 = _mm_mul_ps(d,d);
2586 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2588 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2590 /* Evaluate switch function */
2591 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2592 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2593 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2597 fscal = _mm_and_ps(fscal,cutoff_mask);
2599 fscal = _mm_andnot_ps(dummy_mask,fscal);
2601 /* Update vectorial force */
2602 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2603 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2604 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2606 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2607 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2608 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2612 /**************************
2613 * CALCULATE INTERACTIONS *
2614 **************************/
2616 if (gmx_mm_any_lt(rsq20,rcutoff2))
2619 r20 = _mm_mul_ps(rsq20,rinv20);
2620 r20 = _mm_andnot_ps(dummy_mask,r20);
2622 /* EWALD ELECTROSTATICS */
2624 /* Analytical PME correction */
2625 zeta2 = _mm_mul_ps(beta2,rsq20);
2626 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
2627 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2628 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2629 felec = _mm_mul_ps(qq20,felec);
2630 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2631 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2632 velec = _mm_mul_ps(qq20,velec);
2634 d = _mm_sub_ps(r20,rswitch);
2635 d = _mm_max_ps(d,_mm_setzero_ps());
2636 d2 = _mm_mul_ps(d,d);
2637 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2639 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2641 /* Evaluate switch function */
2642 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2643 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2644 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2648 fscal = _mm_and_ps(fscal,cutoff_mask);
2650 fscal = _mm_andnot_ps(dummy_mask,fscal);
2652 /* Update vectorial force */
2653 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2654 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2655 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2657 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2658 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2659 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2663 /**************************
2664 * CALCULATE INTERACTIONS *
2665 **************************/
2667 if (gmx_mm_any_lt(rsq21,rcutoff2))
2670 r21 = _mm_mul_ps(rsq21,rinv21);
2671 r21 = _mm_andnot_ps(dummy_mask,r21);
2673 /* EWALD ELECTROSTATICS */
2675 /* Analytical PME correction */
2676 zeta2 = _mm_mul_ps(beta2,rsq21);
2677 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2678 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2679 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2680 felec = _mm_mul_ps(qq21,felec);
2681 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2682 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2683 velec = _mm_mul_ps(qq21,velec);
2685 d = _mm_sub_ps(r21,rswitch);
2686 d = _mm_max_ps(d,_mm_setzero_ps());
2687 d2 = _mm_mul_ps(d,d);
2688 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2690 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2692 /* Evaluate switch function */
2693 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2694 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2695 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2699 fscal = _mm_and_ps(fscal,cutoff_mask);
2701 fscal = _mm_andnot_ps(dummy_mask,fscal);
2703 /* Update vectorial force */
2704 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2705 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2706 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2708 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2709 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2710 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2714 /**************************
2715 * CALCULATE INTERACTIONS *
2716 **************************/
2718 if (gmx_mm_any_lt(rsq22,rcutoff2))
2721 r22 = _mm_mul_ps(rsq22,rinv22);
2722 r22 = _mm_andnot_ps(dummy_mask,r22);
2724 /* EWALD ELECTROSTATICS */
2726 /* Analytical PME correction */
2727 zeta2 = _mm_mul_ps(beta2,rsq22);
2728 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2729 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2730 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2731 felec = _mm_mul_ps(qq22,felec);
2732 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2733 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2734 velec = _mm_mul_ps(qq22,velec);
2736 d = _mm_sub_ps(r22,rswitch);
2737 d = _mm_max_ps(d,_mm_setzero_ps());
2738 d2 = _mm_mul_ps(d,d);
2739 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2741 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2743 /* Evaluate switch function */
2744 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2745 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2746 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2750 fscal = _mm_and_ps(fscal,cutoff_mask);
2752 fscal = _mm_andnot_ps(dummy_mask,fscal);
2754 /* Update vectorial force */
2755 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2756 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2757 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2759 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2760 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2761 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2765 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2766 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2767 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2768 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2770 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2771 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2773 /* Inner loop uses 474 flops */
2776 /* End of innermost loop */
2778 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2779 f+i_coord_offset,fshift+i_shift_offset);
2781 /* Increment number of inner iterations */
2782 inneriter += j_index_end - j_index_start;
2784 /* Outer loop uses 18 flops */
2787 /* Increment number of outer iterations */
2790 /* Update outer/inner flops */
2792 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*474);