2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
89 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
91 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
93 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
94 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
95 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
96 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
97 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
98 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
99 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
100 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
101 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
102 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
109 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
110 real rswitch_scalar,d_scalar;
111 __m128 dummy_mask,cutoff_mask;
112 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
113 __m128 one = _mm_set1_ps(1.0);
114 __m128 two = _mm_set1_ps(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm_set1_ps(fr->ic->epsfac);
127 charge = mdatoms->chargeA;
129 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
130 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
131 beta2 = _mm_mul_ps(beta,beta);
132 beta3 = _mm_mul_ps(beta,beta2);
133 ewtab = fr->ic->tabq_coul_FDV0;
134 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
135 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
137 /* Setup water-specific parameters */
138 inr = nlist->iinr[0];
139 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
140 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
141 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
143 jq1 = _mm_set1_ps(charge[inr+1]);
144 jq2 = _mm_set1_ps(charge[inr+2]);
145 jq3 = _mm_set1_ps(charge[inr+3]);
146 qq11 = _mm_mul_ps(iq1,jq1);
147 qq12 = _mm_mul_ps(iq1,jq2);
148 qq13 = _mm_mul_ps(iq1,jq3);
149 qq21 = _mm_mul_ps(iq2,jq1);
150 qq22 = _mm_mul_ps(iq2,jq2);
151 qq23 = _mm_mul_ps(iq2,jq3);
152 qq31 = _mm_mul_ps(iq3,jq1);
153 qq32 = _mm_mul_ps(iq3,jq2);
154 qq33 = _mm_mul_ps(iq3,jq3);
156 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
157 rcutoff_scalar = fr->ic->rcoulomb;
158 rcutoff = _mm_set1_ps(rcutoff_scalar);
159 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
161 rswitch_scalar = fr->ic->rcoulomb_switch;
162 rswitch = _mm_set1_ps(rswitch_scalar);
163 /* Setup switch parameters */
164 d_scalar = rcutoff_scalar-rswitch_scalar;
165 d = _mm_set1_ps(d_scalar);
166 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
167 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
168 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
169 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
170 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
171 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
173 /* Avoid stupid compiler warnings */
174 jnrA = jnrB = jnrC = jnrD = 0;
183 for(iidx=0;iidx<4*DIM;iidx++)
188 /* Start outer loop over neighborlists */
189 for(iidx=0; iidx<nri; iidx++)
191 /* Load shift vector for this list */
192 i_shift_offset = DIM*shiftidx[iidx];
194 /* Load limits for loop over neighbors */
195 j_index_start = jindex[iidx];
196 j_index_end = jindex[iidx+1];
198 /* Get outer coordinate index */
200 i_coord_offset = DIM*inr;
202 /* Load i particle coords and add shift vector */
203 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
204 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
206 fix1 = _mm_setzero_ps();
207 fiy1 = _mm_setzero_ps();
208 fiz1 = _mm_setzero_ps();
209 fix2 = _mm_setzero_ps();
210 fiy2 = _mm_setzero_ps();
211 fiz2 = _mm_setzero_ps();
212 fix3 = _mm_setzero_ps();
213 fiy3 = _mm_setzero_ps();
214 fiz3 = _mm_setzero_ps();
216 /* Reset potential sums */
217 velecsum = _mm_setzero_ps();
219 /* Start inner kernel loop */
220 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
223 /* Get j neighbor index, and coordinate index */
228 j_coord_offsetA = DIM*jnrA;
229 j_coord_offsetB = DIM*jnrB;
230 j_coord_offsetC = DIM*jnrC;
231 j_coord_offsetD = DIM*jnrD;
233 /* load j atom coordinates */
234 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
235 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
236 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
238 /* Calculate displacement vector */
239 dx11 = _mm_sub_ps(ix1,jx1);
240 dy11 = _mm_sub_ps(iy1,jy1);
241 dz11 = _mm_sub_ps(iz1,jz1);
242 dx12 = _mm_sub_ps(ix1,jx2);
243 dy12 = _mm_sub_ps(iy1,jy2);
244 dz12 = _mm_sub_ps(iz1,jz2);
245 dx13 = _mm_sub_ps(ix1,jx3);
246 dy13 = _mm_sub_ps(iy1,jy3);
247 dz13 = _mm_sub_ps(iz1,jz3);
248 dx21 = _mm_sub_ps(ix2,jx1);
249 dy21 = _mm_sub_ps(iy2,jy1);
250 dz21 = _mm_sub_ps(iz2,jz1);
251 dx22 = _mm_sub_ps(ix2,jx2);
252 dy22 = _mm_sub_ps(iy2,jy2);
253 dz22 = _mm_sub_ps(iz2,jz2);
254 dx23 = _mm_sub_ps(ix2,jx3);
255 dy23 = _mm_sub_ps(iy2,jy3);
256 dz23 = _mm_sub_ps(iz2,jz3);
257 dx31 = _mm_sub_ps(ix3,jx1);
258 dy31 = _mm_sub_ps(iy3,jy1);
259 dz31 = _mm_sub_ps(iz3,jz1);
260 dx32 = _mm_sub_ps(ix3,jx2);
261 dy32 = _mm_sub_ps(iy3,jy2);
262 dz32 = _mm_sub_ps(iz3,jz2);
263 dx33 = _mm_sub_ps(ix3,jx3);
264 dy33 = _mm_sub_ps(iy3,jy3);
265 dz33 = _mm_sub_ps(iz3,jz3);
267 /* Calculate squared distance and things based on it */
268 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
269 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
270 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
271 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
272 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
273 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
274 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
275 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
276 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
278 rinv11 = avx128fma_invsqrt_f(rsq11);
279 rinv12 = avx128fma_invsqrt_f(rsq12);
280 rinv13 = avx128fma_invsqrt_f(rsq13);
281 rinv21 = avx128fma_invsqrt_f(rsq21);
282 rinv22 = avx128fma_invsqrt_f(rsq22);
283 rinv23 = avx128fma_invsqrt_f(rsq23);
284 rinv31 = avx128fma_invsqrt_f(rsq31);
285 rinv32 = avx128fma_invsqrt_f(rsq32);
286 rinv33 = avx128fma_invsqrt_f(rsq33);
288 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
289 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
290 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
291 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
292 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
293 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
294 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
295 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
296 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
298 fjx1 = _mm_setzero_ps();
299 fjy1 = _mm_setzero_ps();
300 fjz1 = _mm_setzero_ps();
301 fjx2 = _mm_setzero_ps();
302 fjy2 = _mm_setzero_ps();
303 fjz2 = _mm_setzero_ps();
304 fjx3 = _mm_setzero_ps();
305 fjy3 = _mm_setzero_ps();
306 fjz3 = _mm_setzero_ps();
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_mm_any_lt(rsq11,rcutoff2))
315 r11 = _mm_mul_ps(rsq11,rinv11);
317 /* EWALD ELECTROSTATICS */
319 /* Analytical PME correction */
320 zeta2 = _mm_mul_ps(beta2,rsq11);
321 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
322 pmecorrF = avx128fma_pmecorrF_f(zeta2);
323 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
324 felec = _mm_mul_ps(qq11,felec);
325 pmecorrV = avx128fma_pmecorrV_f(zeta2);
326 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
327 velec = _mm_mul_ps(qq11,velec);
329 d = _mm_sub_ps(r11,rswitch);
330 d = _mm_max_ps(d,_mm_setzero_ps());
331 d2 = _mm_mul_ps(d,d);
332 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
334 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
336 /* Evaluate switch function */
337 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
338 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
339 velec = _mm_mul_ps(velec,sw);
340 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
342 /* Update potential sum for this i atom from the interaction with this j atom. */
343 velec = _mm_and_ps(velec,cutoff_mask);
344 velecsum = _mm_add_ps(velecsum,velec);
348 fscal = _mm_and_ps(fscal,cutoff_mask);
350 /* Update vectorial force */
351 fix1 = _mm_macc_ps(dx11,fscal,fix1);
352 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
353 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
355 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
356 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
357 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
361 /**************************
362 * CALCULATE INTERACTIONS *
363 **************************/
365 if (gmx_mm_any_lt(rsq12,rcutoff2))
368 r12 = _mm_mul_ps(rsq12,rinv12);
370 /* EWALD ELECTROSTATICS */
372 /* Analytical PME correction */
373 zeta2 = _mm_mul_ps(beta2,rsq12);
374 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
375 pmecorrF = avx128fma_pmecorrF_f(zeta2);
376 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
377 felec = _mm_mul_ps(qq12,felec);
378 pmecorrV = avx128fma_pmecorrV_f(zeta2);
379 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
380 velec = _mm_mul_ps(qq12,velec);
382 d = _mm_sub_ps(r12,rswitch);
383 d = _mm_max_ps(d,_mm_setzero_ps());
384 d2 = _mm_mul_ps(d,d);
385 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
387 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
389 /* Evaluate switch function */
390 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
391 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
392 velec = _mm_mul_ps(velec,sw);
393 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
395 /* Update potential sum for this i atom from the interaction with this j atom. */
396 velec = _mm_and_ps(velec,cutoff_mask);
397 velecsum = _mm_add_ps(velecsum,velec);
401 fscal = _mm_and_ps(fscal,cutoff_mask);
403 /* Update vectorial force */
404 fix1 = _mm_macc_ps(dx12,fscal,fix1);
405 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
406 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
408 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
409 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
410 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
414 /**************************
415 * CALCULATE INTERACTIONS *
416 **************************/
418 if (gmx_mm_any_lt(rsq13,rcutoff2))
421 r13 = _mm_mul_ps(rsq13,rinv13);
423 /* EWALD ELECTROSTATICS */
425 /* Analytical PME correction */
426 zeta2 = _mm_mul_ps(beta2,rsq13);
427 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
428 pmecorrF = avx128fma_pmecorrF_f(zeta2);
429 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
430 felec = _mm_mul_ps(qq13,felec);
431 pmecorrV = avx128fma_pmecorrV_f(zeta2);
432 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
433 velec = _mm_mul_ps(qq13,velec);
435 d = _mm_sub_ps(r13,rswitch);
436 d = _mm_max_ps(d,_mm_setzero_ps());
437 d2 = _mm_mul_ps(d,d);
438 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
440 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
442 /* Evaluate switch function */
443 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
444 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
445 velec = _mm_mul_ps(velec,sw);
446 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
448 /* Update potential sum for this i atom from the interaction with this j atom. */
449 velec = _mm_and_ps(velec,cutoff_mask);
450 velecsum = _mm_add_ps(velecsum,velec);
454 fscal = _mm_and_ps(fscal,cutoff_mask);
456 /* Update vectorial force */
457 fix1 = _mm_macc_ps(dx13,fscal,fix1);
458 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
459 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
461 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
462 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
463 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
467 /**************************
468 * CALCULATE INTERACTIONS *
469 **************************/
471 if (gmx_mm_any_lt(rsq21,rcutoff2))
474 r21 = _mm_mul_ps(rsq21,rinv21);
476 /* EWALD ELECTROSTATICS */
478 /* Analytical PME correction */
479 zeta2 = _mm_mul_ps(beta2,rsq21);
480 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
481 pmecorrF = avx128fma_pmecorrF_f(zeta2);
482 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
483 felec = _mm_mul_ps(qq21,felec);
484 pmecorrV = avx128fma_pmecorrV_f(zeta2);
485 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
486 velec = _mm_mul_ps(qq21,velec);
488 d = _mm_sub_ps(r21,rswitch);
489 d = _mm_max_ps(d,_mm_setzero_ps());
490 d2 = _mm_mul_ps(d,d);
491 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
493 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
495 /* Evaluate switch function */
496 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
497 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
498 velec = _mm_mul_ps(velec,sw);
499 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
501 /* Update potential sum for this i atom from the interaction with this j atom. */
502 velec = _mm_and_ps(velec,cutoff_mask);
503 velecsum = _mm_add_ps(velecsum,velec);
507 fscal = _mm_and_ps(fscal,cutoff_mask);
509 /* Update vectorial force */
510 fix2 = _mm_macc_ps(dx21,fscal,fix2);
511 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
512 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
514 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
515 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
516 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 if (gmx_mm_any_lt(rsq22,rcutoff2))
527 r22 = _mm_mul_ps(rsq22,rinv22);
529 /* EWALD ELECTROSTATICS */
531 /* Analytical PME correction */
532 zeta2 = _mm_mul_ps(beta2,rsq22);
533 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
534 pmecorrF = avx128fma_pmecorrF_f(zeta2);
535 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
536 felec = _mm_mul_ps(qq22,felec);
537 pmecorrV = avx128fma_pmecorrV_f(zeta2);
538 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
539 velec = _mm_mul_ps(qq22,velec);
541 d = _mm_sub_ps(r22,rswitch);
542 d = _mm_max_ps(d,_mm_setzero_ps());
543 d2 = _mm_mul_ps(d,d);
544 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
546 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
548 /* Evaluate switch function */
549 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
550 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
551 velec = _mm_mul_ps(velec,sw);
552 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
554 /* Update potential sum for this i atom from the interaction with this j atom. */
555 velec = _mm_and_ps(velec,cutoff_mask);
556 velecsum = _mm_add_ps(velecsum,velec);
560 fscal = _mm_and_ps(fscal,cutoff_mask);
562 /* Update vectorial force */
563 fix2 = _mm_macc_ps(dx22,fscal,fix2);
564 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
565 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
567 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
568 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
569 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
573 /**************************
574 * CALCULATE INTERACTIONS *
575 **************************/
577 if (gmx_mm_any_lt(rsq23,rcutoff2))
580 r23 = _mm_mul_ps(rsq23,rinv23);
582 /* EWALD ELECTROSTATICS */
584 /* Analytical PME correction */
585 zeta2 = _mm_mul_ps(beta2,rsq23);
586 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
587 pmecorrF = avx128fma_pmecorrF_f(zeta2);
588 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
589 felec = _mm_mul_ps(qq23,felec);
590 pmecorrV = avx128fma_pmecorrV_f(zeta2);
591 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
592 velec = _mm_mul_ps(qq23,velec);
594 d = _mm_sub_ps(r23,rswitch);
595 d = _mm_max_ps(d,_mm_setzero_ps());
596 d2 = _mm_mul_ps(d,d);
597 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
599 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
601 /* Evaluate switch function */
602 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
603 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
604 velec = _mm_mul_ps(velec,sw);
605 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
607 /* Update potential sum for this i atom from the interaction with this j atom. */
608 velec = _mm_and_ps(velec,cutoff_mask);
609 velecsum = _mm_add_ps(velecsum,velec);
613 fscal = _mm_and_ps(fscal,cutoff_mask);
615 /* Update vectorial force */
616 fix2 = _mm_macc_ps(dx23,fscal,fix2);
617 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
618 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
620 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
621 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
622 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
626 /**************************
627 * CALCULATE INTERACTIONS *
628 **************************/
630 if (gmx_mm_any_lt(rsq31,rcutoff2))
633 r31 = _mm_mul_ps(rsq31,rinv31);
635 /* EWALD ELECTROSTATICS */
637 /* Analytical PME correction */
638 zeta2 = _mm_mul_ps(beta2,rsq31);
639 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
640 pmecorrF = avx128fma_pmecorrF_f(zeta2);
641 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
642 felec = _mm_mul_ps(qq31,felec);
643 pmecorrV = avx128fma_pmecorrV_f(zeta2);
644 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
645 velec = _mm_mul_ps(qq31,velec);
647 d = _mm_sub_ps(r31,rswitch);
648 d = _mm_max_ps(d,_mm_setzero_ps());
649 d2 = _mm_mul_ps(d,d);
650 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
652 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
654 /* Evaluate switch function */
655 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
656 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
657 velec = _mm_mul_ps(velec,sw);
658 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
660 /* Update potential sum for this i atom from the interaction with this j atom. */
661 velec = _mm_and_ps(velec,cutoff_mask);
662 velecsum = _mm_add_ps(velecsum,velec);
666 fscal = _mm_and_ps(fscal,cutoff_mask);
668 /* Update vectorial force */
669 fix3 = _mm_macc_ps(dx31,fscal,fix3);
670 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
671 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
673 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
674 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
675 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
679 /**************************
680 * CALCULATE INTERACTIONS *
681 **************************/
683 if (gmx_mm_any_lt(rsq32,rcutoff2))
686 r32 = _mm_mul_ps(rsq32,rinv32);
688 /* EWALD ELECTROSTATICS */
690 /* Analytical PME correction */
691 zeta2 = _mm_mul_ps(beta2,rsq32);
692 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
693 pmecorrF = avx128fma_pmecorrF_f(zeta2);
694 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
695 felec = _mm_mul_ps(qq32,felec);
696 pmecorrV = avx128fma_pmecorrV_f(zeta2);
697 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
698 velec = _mm_mul_ps(qq32,velec);
700 d = _mm_sub_ps(r32,rswitch);
701 d = _mm_max_ps(d,_mm_setzero_ps());
702 d2 = _mm_mul_ps(d,d);
703 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
705 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
707 /* Evaluate switch function */
708 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
709 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
710 velec = _mm_mul_ps(velec,sw);
711 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
713 /* Update potential sum for this i atom from the interaction with this j atom. */
714 velec = _mm_and_ps(velec,cutoff_mask);
715 velecsum = _mm_add_ps(velecsum,velec);
719 fscal = _mm_and_ps(fscal,cutoff_mask);
721 /* Update vectorial force */
722 fix3 = _mm_macc_ps(dx32,fscal,fix3);
723 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
724 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
726 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
727 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
728 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
732 /**************************
733 * CALCULATE INTERACTIONS *
734 **************************/
736 if (gmx_mm_any_lt(rsq33,rcutoff2))
739 r33 = _mm_mul_ps(rsq33,rinv33);
741 /* EWALD ELECTROSTATICS */
743 /* Analytical PME correction */
744 zeta2 = _mm_mul_ps(beta2,rsq33);
745 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
746 pmecorrF = avx128fma_pmecorrF_f(zeta2);
747 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
748 felec = _mm_mul_ps(qq33,felec);
749 pmecorrV = avx128fma_pmecorrV_f(zeta2);
750 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
751 velec = _mm_mul_ps(qq33,velec);
753 d = _mm_sub_ps(r33,rswitch);
754 d = _mm_max_ps(d,_mm_setzero_ps());
755 d2 = _mm_mul_ps(d,d);
756 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
758 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
760 /* Evaluate switch function */
761 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
762 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
763 velec = _mm_mul_ps(velec,sw);
764 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
766 /* Update potential sum for this i atom from the interaction with this j atom. */
767 velec = _mm_and_ps(velec,cutoff_mask);
768 velecsum = _mm_add_ps(velecsum,velec);
772 fscal = _mm_and_ps(fscal,cutoff_mask);
774 /* Update vectorial force */
775 fix3 = _mm_macc_ps(dx33,fscal,fix3);
776 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
777 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
779 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
780 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
781 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
785 fjptrA = f+j_coord_offsetA;
786 fjptrB = f+j_coord_offsetB;
787 fjptrC = f+j_coord_offsetC;
788 fjptrD = f+j_coord_offsetD;
790 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
791 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
793 /* Inner loop uses 477 flops */
799 /* Get j neighbor index, and coordinate index */
800 jnrlistA = jjnr[jidx];
801 jnrlistB = jjnr[jidx+1];
802 jnrlistC = jjnr[jidx+2];
803 jnrlistD = jjnr[jidx+3];
804 /* Sign of each element will be negative for non-real atoms.
805 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
806 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
808 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
809 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
810 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
811 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
812 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
813 j_coord_offsetA = DIM*jnrA;
814 j_coord_offsetB = DIM*jnrB;
815 j_coord_offsetC = DIM*jnrC;
816 j_coord_offsetD = DIM*jnrD;
818 /* load j atom coordinates */
819 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
820 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
821 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
823 /* Calculate displacement vector */
824 dx11 = _mm_sub_ps(ix1,jx1);
825 dy11 = _mm_sub_ps(iy1,jy1);
826 dz11 = _mm_sub_ps(iz1,jz1);
827 dx12 = _mm_sub_ps(ix1,jx2);
828 dy12 = _mm_sub_ps(iy1,jy2);
829 dz12 = _mm_sub_ps(iz1,jz2);
830 dx13 = _mm_sub_ps(ix1,jx3);
831 dy13 = _mm_sub_ps(iy1,jy3);
832 dz13 = _mm_sub_ps(iz1,jz3);
833 dx21 = _mm_sub_ps(ix2,jx1);
834 dy21 = _mm_sub_ps(iy2,jy1);
835 dz21 = _mm_sub_ps(iz2,jz1);
836 dx22 = _mm_sub_ps(ix2,jx2);
837 dy22 = _mm_sub_ps(iy2,jy2);
838 dz22 = _mm_sub_ps(iz2,jz2);
839 dx23 = _mm_sub_ps(ix2,jx3);
840 dy23 = _mm_sub_ps(iy2,jy3);
841 dz23 = _mm_sub_ps(iz2,jz3);
842 dx31 = _mm_sub_ps(ix3,jx1);
843 dy31 = _mm_sub_ps(iy3,jy1);
844 dz31 = _mm_sub_ps(iz3,jz1);
845 dx32 = _mm_sub_ps(ix3,jx2);
846 dy32 = _mm_sub_ps(iy3,jy2);
847 dz32 = _mm_sub_ps(iz3,jz2);
848 dx33 = _mm_sub_ps(ix3,jx3);
849 dy33 = _mm_sub_ps(iy3,jy3);
850 dz33 = _mm_sub_ps(iz3,jz3);
852 /* Calculate squared distance and things based on it */
853 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
854 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
855 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
856 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
857 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
858 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
859 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
860 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
861 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
863 rinv11 = avx128fma_invsqrt_f(rsq11);
864 rinv12 = avx128fma_invsqrt_f(rsq12);
865 rinv13 = avx128fma_invsqrt_f(rsq13);
866 rinv21 = avx128fma_invsqrt_f(rsq21);
867 rinv22 = avx128fma_invsqrt_f(rsq22);
868 rinv23 = avx128fma_invsqrt_f(rsq23);
869 rinv31 = avx128fma_invsqrt_f(rsq31);
870 rinv32 = avx128fma_invsqrt_f(rsq32);
871 rinv33 = avx128fma_invsqrt_f(rsq33);
873 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
874 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
875 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
876 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
877 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
878 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
879 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
880 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
881 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
883 fjx1 = _mm_setzero_ps();
884 fjy1 = _mm_setzero_ps();
885 fjz1 = _mm_setzero_ps();
886 fjx2 = _mm_setzero_ps();
887 fjy2 = _mm_setzero_ps();
888 fjz2 = _mm_setzero_ps();
889 fjx3 = _mm_setzero_ps();
890 fjy3 = _mm_setzero_ps();
891 fjz3 = _mm_setzero_ps();
893 /**************************
894 * CALCULATE INTERACTIONS *
895 **************************/
897 if (gmx_mm_any_lt(rsq11,rcutoff2))
900 r11 = _mm_mul_ps(rsq11,rinv11);
901 r11 = _mm_andnot_ps(dummy_mask,r11);
903 /* EWALD ELECTROSTATICS */
905 /* Analytical PME correction */
906 zeta2 = _mm_mul_ps(beta2,rsq11);
907 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
908 pmecorrF = avx128fma_pmecorrF_f(zeta2);
909 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
910 felec = _mm_mul_ps(qq11,felec);
911 pmecorrV = avx128fma_pmecorrV_f(zeta2);
912 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
913 velec = _mm_mul_ps(qq11,velec);
915 d = _mm_sub_ps(r11,rswitch);
916 d = _mm_max_ps(d,_mm_setzero_ps());
917 d2 = _mm_mul_ps(d,d);
918 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
920 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
922 /* Evaluate switch function */
923 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
924 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
925 velec = _mm_mul_ps(velec,sw);
926 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
928 /* Update potential sum for this i atom from the interaction with this j atom. */
929 velec = _mm_and_ps(velec,cutoff_mask);
930 velec = _mm_andnot_ps(dummy_mask,velec);
931 velecsum = _mm_add_ps(velecsum,velec);
935 fscal = _mm_and_ps(fscal,cutoff_mask);
937 fscal = _mm_andnot_ps(dummy_mask,fscal);
939 /* Update vectorial force */
940 fix1 = _mm_macc_ps(dx11,fscal,fix1);
941 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
942 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
944 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
945 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
946 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
950 /**************************
951 * CALCULATE INTERACTIONS *
952 **************************/
954 if (gmx_mm_any_lt(rsq12,rcutoff2))
957 r12 = _mm_mul_ps(rsq12,rinv12);
958 r12 = _mm_andnot_ps(dummy_mask,r12);
960 /* EWALD ELECTROSTATICS */
962 /* Analytical PME correction */
963 zeta2 = _mm_mul_ps(beta2,rsq12);
964 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
965 pmecorrF = avx128fma_pmecorrF_f(zeta2);
966 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
967 felec = _mm_mul_ps(qq12,felec);
968 pmecorrV = avx128fma_pmecorrV_f(zeta2);
969 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
970 velec = _mm_mul_ps(qq12,velec);
972 d = _mm_sub_ps(r12,rswitch);
973 d = _mm_max_ps(d,_mm_setzero_ps());
974 d2 = _mm_mul_ps(d,d);
975 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
977 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
979 /* Evaluate switch function */
980 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
981 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
982 velec = _mm_mul_ps(velec,sw);
983 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
985 /* Update potential sum for this i atom from the interaction with this j atom. */
986 velec = _mm_and_ps(velec,cutoff_mask);
987 velec = _mm_andnot_ps(dummy_mask,velec);
988 velecsum = _mm_add_ps(velecsum,velec);
992 fscal = _mm_and_ps(fscal,cutoff_mask);
994 fscal = _mm_andnot_ps(dummy_mask,fscal);
996 /* Update vectorial force */
997 fix1 = _mm_macc_ps(dx12,fscal,fix1);
998 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
999 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1001 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1002 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1003 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1007 /**************************
1008 * CALCULATE INTERACTIONS *
1009 **************************/
1011 if (gmx_mm_any_lt(rsq13,rcutoff2))
1014 r13 = _mm_mul_ps(rsq13,rinv13);
1015 r13 = _mm_andnot_ps(dummy_mask,r13);
1017 /* EWALD ELECTROSTATICS */
1019 /* Analytical PME correction */
1020 zeta2 = _mm_mul_ps(beta2,rsq13);
1021 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1022 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1023 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1024 felec = _mm_mul_ps(qq13,felec);
1025 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1026 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
1027 velec = _mm_mul_ps(qq13,velec);
1029 d = _mm_sub_ps(r13,rswitch);
1030 d = _mm_max_ps(d,_mm_setzero_ps());
1031 d2 = _mm_mul_ps(d,d);
1032 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1034 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1036 /* Evaluate switch function */
1037 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1038 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1039 velec = _mm_mul_ps(velec,sw);
1040 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1042 /* Update potential sum for this i atom from the interaction with this j atom. */
1043 velec = _mm_and_ps(velec,cutoff_mask);
1044 velec = _mm_andnot_ps(dummy_mask,velec);
1045 velecsum = _mm_add_ps(velecsum,velec);
1049 fscal = _mm_and_ps(fscal,cutoff_mask);
1051 fscal = _mm_andnot_ps(dummy_mask,fscal);
1053 /* Update vectorial force */
1054 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1055 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1056 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1058 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1059 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1060 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1064 /**************************
1065 * CALCULATE INTERACTIONS *
1066 **************************/
1068 if (gmx_mm_any_lt(rsq21,rcutoff2))
1071 r21 = _mm_mul_ps(rsq21,rinv21);
1072 r21 = _mm_andnot_ps(dummy_mask,r21);
1074 /* EWALD ELECTROSTATICS */
1076 /* Analytical PME correction */
1077 zeta2 = _mm_mul_ps(beta2,rsq21);
1078 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1079 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1080 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1081 felec = _mm_mul_ps(qq21,felec);
1082 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1083 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1084 velec = _mm_mul_ps(qq21,velec);
1086 d = _mm_sub_ps(r21,rswitch);
1087 d = _mm_max_ps(d,_mm_setzero_ps());
1088 d2 = _mm_mul_ps(d,d);
1089 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1091 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1093 /* Evaluate switch function */
1094 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1095 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1096 velec = _mm_mul_ps(velec,sw);
1097 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1099 /* Update potential sum for this i atom from the interaction with this j atom. */
1100 velec = _mm_and_ps(velec,cutoff_mask);
1101 velec = _mm_andnot_ps(dummy_mask,velec);
1102 velecsum = _mm_add_ps(velecsum,velec);
1106 fscal = _mm_and_ps(fscal,cutoff_mask);
1108 fscal = _mm_andnot_ps(dummy_mask,fscal);
1110 /* Update vectorial force */
1111 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1112 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1113 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1115 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1116 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1117 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1121 /**************************
1122 * CALCULATE INTERACTIONS *
1123 **************************/
1125 if (gmx_mm_any_lt(rsq22,rcutoff2))
1128 r22 = _mm_mul_ps(rsq22,rinv22);
1129 r22 = _mm_andnot_ps(dummy_mask,r22);
1131 /* EWALD ELECTROSTATICS */
1133 /* Analytical PME correction */
1134 zeta2 = _mm_mul_ps(beta2,rsq22);
1135 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1136 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1137 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1138 felec = _mm_mul_ps(qq22,felec);
1139 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1140 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1141 velec = _mm_mul_ps(qq22,velec);
1143 d = _mm_sub_ps(r22,rswitch);
1144 d = _mm_max_ps(d,_mm_setzero_ps());
1145 d2 = _mm_mul_ps(d,d);
1146 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1148 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1150 /* Evaluate switch function */
1151 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1152 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1153 velec = _mm_mul_ps(velec,sw);
1154 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1156 /* Update potential sum for this i atom from the interaction with this j atom. */
1157 velec = _mm_and_ps(velec,cutoff_mask);
1158 velec = _mm_andnot_ps(dummy_mask,velec);
1159 velecsum = _mm_add_ps(velecsum,velec);
1163 fscal = _mm_and_ps(fscal,cutoff_mask);
1165 fscal = _mm_andnot_ps(dummy_mask,fscal);
1167 /* Update vectorial force */
1168 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1169 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1170 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1172 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1173 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1174 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1178 /**************************
1179 * CALCULATE INTERACTIONS *
1180 **************************/
1182 if (gmx_mm_any_lt(rsq23,rcutoff2))
1185 r23 = _mm_mul_ps(rsq23,rinv23);
1186 r23 = _mm_andnot_ps(dummy_mask,r23);
1188 /* EWALD ELECTROSTATICS */
1190 /* Analytical PME correction */
1191 zeta2 = _mm_mul_ps(beta2,rsq23);
1192 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1193 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1194 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1195 felec = _mm_mul_ps(qq23,felec);
1196 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1197 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1198 velec = _mm_mul_ps(qq23,velec);
1200 d = _mm_sub_ps(r23,rswitch);
1201 d = _mm_max_ps(d,_mm_setzero_ps());
1202 d2 = _mm_mul_ps(d,d);
1203 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1205 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1207 /* Evaluate switch function */
1208 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1209 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1210 velec = _mm_mul_ps(velec,sw);
1211 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1213 /* Update potential sum for this i atom from the interaction with this j atom. */
1214 velec = _mm_and_ps(velec,cutoff_mask);
1215 velec = _mm_andnot_ps(dummy_mask,velec);
1216 velecsum = _mm_add_ps(velecsum,velec);
1220 fscal = _mm_and_ps(fscal,cutoff_mask);
1222 fscal = _mm_andnot_ps(dummy_mask,fscal);
1224 /* Update vectorial force */
1225 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1226 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1227 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1229 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1230 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1231 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1235 /**************************
1236 * CALCULATE INTERACTIONS *
1237 **************************/
1239 if (gmx_mm_any_lt(rsq31,rcutoff2))
1242 r31 = _mm_mul_ps(rsq31,rinv31);
1243 r31 = _mm_andnot_ps(dummy_mask,r31);
1245 /* EWALD ELECTROSTATICS */
1247 /* Analytical PME correction */
1248 zeta2 = _mm_mul_ps(beta2,rsq31);
1249 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1250 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1251 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1252 felec = _mm_mul_ps(qq31,felec);
1253 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1254 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1255 velec = _mm_mul_ps(qq31,velec);
1257 d = _mm_sub_ps(r31,rswitch);
1258 d = _mm_max_ps(d,_mm_setzero_ps());
1259 d2 = _mm_mul_ps(d,d);
1260 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1262 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1264 /* Evaluate switch function */
1265 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1266 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1267 velec = _mm_mul_ps(velec,sw);
1268 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1270 /* Update potential sum for this i atom from the interaction with this j atom. */
1271 velec = _mm_and_ps(velec,cutoff_mask);
1272 velec = _mm_andnot_ps(dummy_mask,velec);
1273 velecsum = _mm_add_ps(velecsum,velec);
1277 fscal = _mm_and_ps(fscal,cutoff_mask);
1279 fscal = _mm_andnot_ps(dummy_mask,fscal);
1281 /* Update vectorial force */
1282 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1283 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1284 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1286 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1287 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1288 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1292 /**************************
1293 * CALCULATE INTERACTIONS *
1294 **************************/
1296 if (gmx_mm_any_lt(rsq32,rcutoff2))
1299 r32 = _mm_mul_ps(rsq32,rinv32);
1300 r32 = _mm_andnot_ps(dummy_mask,r32);
1302 /* EWALD ELECTROSTATICS */
1304 /* Analytical PME correction */
1305 zeta2 = _mm_mul_ps(beta2,rsq32);
1306 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
1307 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1308 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1309 felec = _mm_mul_ps(qq32,felec);
1310 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1311 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1312 velec = _mm_mul_ps(qq32,velec);
1314 d = _mm_sub_ps(r32,rswitch);
1315 d = _mm_max_ps(d,_mm_setzero_ps());
1316 d2 = _mm_mul_ps(d,d);
1317 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1319 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1321 /* Evaluate switch function */
1322 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1323 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1324 velec = _mm_mul_ps(velec,sw);
1325 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1327 /* Update potential sum for this i atom from the interaction with this j atom. */
1328 velec = _mm_and_ps(velec,cutoff_mask);
1329 velec = _mm_andnot_ps(dummy_mask,velec);
1330 velecsum = _mm_add_ps(velecsum,velec);
1334 fscal = _mm_and_ps(fscal,cutoff_mask);
1336 fscal = _mm_andnot_ps(dummy_mask,fscal);
1338 /* Update vectorial force */
1339 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1340 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1341 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1343 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1344 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1345 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1349 /**************************
1350 * CALCULATE INTERACTIONS *
1351 **************************/
1353 if (gmx_mm_any_lt(rsq33,rcutoff2))
1356 r33 = _mm_mul_ps(rsq33,rinv33);
1357 r33 = _mm_andnot_ps(dummy_mask,r33);
1359 /* EWALD ELECTROSTATICS */
1361 /* Analytical PME correction */
1362 zeta2 = _mm_mul_ps(beta2,rsq33);
1363 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
1364 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1365 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1366 felec = _mm_mul_ps(qq33,felec);
1367 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1368 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1369 velec = _mm_mul_ps(qq33,velec);
1371 d = _mm_sub_ps(r33,rswitch);
1372 d = _mm_max_ps(d,_mm_setzero_ps());
1373 d2 = _mm_mul_ps(d,d);
1374 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1376 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1378 /* Evaluate switch function */
1379 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1380 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1381 velec = _mm_mul_ps(velec,sw);
1382 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1384 /* Update potential sum for this i atom from the interaction with this j atom. */
1385 velec = _mm_and_ps(velec,cutoff_mask);
1386 velec = _mm_andnot_ps(dummy_mask,velec);
1387 velecsum = _mm_add_ps(velecsum,velec);
1391 fscal = _mm_and_ps(fscal,cutoff_mask);
1393 fscal = _mm_andnot_ps(dummy_mask,fscal);
1395 /* Update vectorial force */
1396 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1397 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1398 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1400 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1401 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1402 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1406 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1407 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1408 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1409 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1411 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1412 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1414 /* Inner loop uses 486 flops */
1417 /* End of innermost loop */
1419 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1420 f+i_coord_offset+DIM,fshift+i_shift_offset);
1423 /* Update potential energies */
1424 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1426 /* Increment number of inner iterations */
1427 inneriter += j_index_end - j_index_start;
1429 /* Outer loop uses 19 flops */
1432 /* Increment number of outer iterations */
1435 /* Update outer/inner flops */
1437 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*486);
1440 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_single
1441 * Electrostatics interaction: Ewald
1442 * VdW interaction: None
1443 * Geometry: Water4-Water4
1444 * Calculate force/pot: Force
1447 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_single
1448 (t_nblist * gmx_restrict nlist,
1449 rvec * gmx_restrict xx,
1450 rvec * gmx_restrict ff,
1451 struct t_forcerec * gmx_restrict fr,
1452 t_mdatoms * gmx_restrict mdatoms,
1453 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1454 t_nrnb * gmx_restrict nrnb)
1456 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1457 * just 0 for non-waters.
1458 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1459 * jnr indices corresponding to data put in the four positions in the SIMD register.
1461 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1462 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1463 int jnrA,jnrB,jnrC,jnrD;
1464 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1465 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1466 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1467 real rcutoff_scalar;
1468 real *shiftvec,*fshift,*x,*f;
1469 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1470 real scratch[4*DIM];
1471 __m128 fscal,rcutoff,rcutoff2,jidxall;
1473 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1475 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1477 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1478 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1479 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1480 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1481 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1482 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1483 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1484 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1485 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1486 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1487 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1488 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1489 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1490 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1491 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1492 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1493 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1496 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1497 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1499 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1500 real rswitch_scalar,d_scalar;
1501 __m128 dummy_mask,cutoff_mask;
1502 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1503 __m128 one = _mm_set1_ps(1.0);
1504 __m128 two = _mm_set1_ps(2.0);
1510 jindex = nlist->jindex;
1512 shiftidx = nlist->shift;
1514 shiftvec = fr->shift_vec[0];
1515 fshift = fr->fshift[0];
1516 facel = _mm_set1_ps(fr->ic->epsfac);
1517 charge = mdatoms->chargeA;
1519 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1520 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1521 beta2 = _mm_mul_ps(beta,beta);
1522 beta3 = _mm_mul_ps(beta,beta2);
1523 ewtab = fr->ic->tabq_coul_FDV0;
1524 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1525 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1527 /* Setup water-specific parameters */
1528 inr = nlist->iinr[0];
1529 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1530 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1531 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1533 jq1 = _mm_set1_ps(charge[inr+1]);
1534 jq2 = _mm_set1_ps(charge[inr+2]);
1535 jq3 = _mm_set1_ps(charge[inr+3]);
1536 qq11 = _mm_mul_ps(iq1,jq1);
1537 qq12 = _mm_mul_ps(iq1,jq2);
1538 qq13 = _mm_mul_ps(iq1,jq3);
1539 qq21 = _mm_mul_ps(iq2,jq1);
1540 qq22 = _mm_mul_ps(iq2,jq2);
1541 qq23 = _mm_mul_ps(iq2,jq3);
1542 qq31 = _mm_mul_ps(iq3,jq1);
1543 qq32 = _mm_mul_ps(iq3,jq2);
1544 qq33 = _mm_mul_ps(iq3,jq3);
1546 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1547 rcutoff_scalar = fr->ic->rcoulomb;
1548 rcutoff = _mm_set1_ps(rcutoff_scalar);
1549 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1551 rswitch_scalar = fr->ic->rcoulomb_switch;
1552 rswitch = _mm_set1_ps(rswitch_scalar);
1553 /* Setup switch parameters */
1554 d_scalar = rcutoff_scalar-rswitch_scalar;
1555 d = _mm_set1_ps(d_scalar);
1556 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1557 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1558 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1559 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1560 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1561 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1563 /* Avoid stupid compiler warnings */
1564 jnrA = jnrB = jnrC = jnrD = 0;
1565 j_coord_offsetA = 0;
1566 j_coord_offsetB = 0;
1567 j_coord_offsetC = 0;
1568 j_coord_offsetD = 0;
1573 for(iidx=0;iidx<4*DIM;iidx++)
1575 scratch[iidx] = 0.0;
1578 /* Start outer loop over neighborlists */
1579 for(iidx=0; iidx<nri; iidx++)
1581 /* Load shift vector for this list */
1582 i_shift_offset = DIM*shiftidx[iidx];
1584 /* Load limits for loop over neighbors */
1585 j_index_start = jindex[iidx];
1586 j_index_end = jindex[iidx+1];
1588 /* Get outer coordinate index */
1590 i_coord_offset = DIM*inr;
1592 /* Load i particle coords and add shift vector */
1593 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1594 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1596 fix1 = _mm_setzero_ps();
1597 fiy1 = _mm_setzero_ps();
1598 fiz1 = _mm_setzero_ps();
1599 fix2 = _mm_setzero_ps();
1600 fiy2 = _mm_setzero_ps();
1601 fiz2 = _mm_setzero_ps();
1602 fix3 = _mm_setzero_ps();
1603 fiy3 = _mm_setzero_ps();
1604 fiz3 = _mm_setzero_ps();
1606 /* Start inner kernel loop */
1607 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1610 /* Get j neighbor index, and coordinate index */
1612 jnrB = jjnr[jidx+1];
1613 jnrC = jjnr[jidx+2];
1614 jnrD = jjnr[jidx+3];
1615 j_coord_offsetA = DIM*jnrA;
1616 j_coord_offsetB = DIM*jnrB;
1617 j_coord_offsetC = DIM*jnrC;
1618 j_coord_offsetD = DIM*jnrD;
1620 /* load j atom coordinates */
1621 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1622 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1623 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1625 /* Calculate displacement vector */
1626 dx11 = _mm_sub_ps(ix1,jx1);
1627 dy11 = _mm_sub_ps(iy1,jy1);
1628 dz11 = _mm_sub_ps(iz1,jz1);
1629 dx12 = _mm_sub_ps(ix1,jx2);
1630 dy12 = _mm_sub_ps(iy1,jy2);
1631 dz12 = _mm_sub_ps(iz1,jz2);
1632 dx13 = _mm_sub_ps(ix1,jx3);
1633 dy13 = _mm_sub_ps(iy1,jy3);
1634 dz13 = _mm_sub_ps(iz1,jz3);
1635 dx21 = _mm_sub_ps(ix2,jx1);
1636 dy21 = _mm_sub_ps(iy2,jy1);
1637 dz21 = _mm_sub_ps(iz2,jz1);
1638 dx22 = _mm_sub_ps(ix2,jx2);
1639 dy22 = _mm_sub_ps(iy2,jy2);
1640 dz22 = _mm_sub_ps(iz2,jz2);
1641 dx23 = _mm_sub_ps(ix2,jx3);
1642 dy23 = _mm_sub_ps(iy2,jy3);
1643 dz23 = _mm_sub_ps(iz2,jz3);
1644 dx31 = _mm_sub_ps(ix3,jx1);
1645 dy31 = _mm_sub_ps(iy3,jy1);
1646 dz31 = _mm_sub_ps(iz3,jz1);
1647 dx32 = _mm_sub_ps(ix3,jx2);
1648 dy32 = _mm_sub_ps(iy3,jy2);
1649 dz32 = _mm_sub_ps(iz3,jz2);
1650 dx33 = _mm_sub_ps(ix3,jx3);
1651 dy33 = _mm_sub_ps(iy3,jy3);
1652 dz33 = _mm_sub_ps(iz3,jz3);
1654 /* Calculate squared distance and things based on it */
1655 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1656 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1657 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1658 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1659 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1660 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1661 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1662 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1663 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1665 rinv11 = avx128fma_invsqrt_f(rsq11);
1666 rinv12 = avx128fma_invsqrt_f(rsq12);
1667 rinv13 = avx128fma_invsqrt_f(rsq13);
1668 rinv21 = avx128fma_invsqrt_f(rsq21);
1669 rinv22 = avx128fma_invsqrt_f(rsq22);
1670 rinv23 = avx128fma_invsqrt_f(rsq23);
1671 rinv31 = avx128fma_invsqrt_f(rsq31);
1672 rinv32 = avx128fma_invsqrt_f(rsq32);
1673 rinv33 = avx128fma_invsqrt_f(rsq33);
1675 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1676 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1677 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1678 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1679 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1680 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1681 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1682 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1683 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1685 fjx1 = _mm_setzero_ps();
1686 fjy1 = _mm_setzero_ps();
1687 fjz1 = _mm_setzero_ps();
1688 fjx2 = _mm_setzero_ps();
1689 fjy2 = _mm_setzero_ps();
1690 fjz2 = _mm_setzero_ps();
1691 fjx3 = _mm_setzero_ps();
1692 fjy3 = _mm_setzero_ps();
1693 fjz3 = _mm_setzero_ps();
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 if (gmx_mm_any_lt(rsq11,rcutoff2))
1702 r11 = _mm_mul_ps(rsq11,rinv11);
1704 /* EWALD ELECTROSTATICS */
1706 /* Analytical PME correction */
1707 zeta2 = _mm_mul_ps(beta2,rsq11);
1708 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1709 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1710 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1711 felec = _mm_mul_ps(qq11,felec);
1712 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1713 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1714 velec = _mm_mul_ps(qq11,velec);
1716 d = _mm_sub_ps(r11,rswitch);
1717 d = _mm_max_ps(d,_mm_setzero_ps());
1718 d2 = _mm_mul_ps(d,d);
1719 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1721 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1723 /* Evaluate switch function */
1724 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1725 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1726 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1730 fscal = _mm_and_ps(fscal,cutoff_mask);
1732 /* Update vectorial force */
1733 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1734 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1735 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1737 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1738 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1739 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1743 /**************************
1744 * CALCULATE INTERACTIONS *
1745 **************************/
1747 if (gmx_mm_any_lt(rsq12,rcutoff2))
1750 r12 = _mm_mul_ps(rsq12,rinv12);
1752 /* EWALD ELECTROSTATICS */
1754 /* Analytical PME correction */
1755 zeta2 = _mm_mul_ps(beta2,rsq12);
1756 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1757 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1758 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1759 felec = _mm_mul_ps(qq12,felec);
1760 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1761 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1762 velec = _mm_mul_ps(qq12,velec);
1764 d = _mm_sub_ps(r12,rswitch);
1765 d = _mm_max_ps(d,_mm_setzero_ps());
1766 d2 = _mm_mul_ps(d,d);
1767 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1769 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1771 /* Evaluate switch function */
1772 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1773 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1774 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1778 fscal = _mm_and_ps(fscal,cutoff_mask);
1780 /* Update vectorial force */
1781 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1782 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1783 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1785 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1786 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1787 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1791 /**************************
1792 * CALCULATE INTERACTIONS *
1793 **************************/
1795 if (gmx_mm_any_lt(rsq13,rcutoff2))
1798 r13 = _mm_mul_ps(rsq13,rinv13);
1800 /* EWALD ELECTROSTATICS */
1802 /* Analytical PME correction */
1803 zeta2 = _mm_mul_ps(beta2,rsq13);
1804 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1805 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1806 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1807 felec = _mm_mul_ps(qq13,felec);
1808 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1809 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
1810 velec = _mm_mul_ps(qq13,velec);
1812 d = _mm_sub_ps(r13,rswitch);
1813 d = _mm_max_ps(d,_mm_setzero_ps());
1814 d2 = _mm_mul_ps(d,d);
1815 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1817 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1819 /* Evaluate switch function */
1820 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1821 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1822 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1826 fscal = _mm_and_ps(fscal,cutoff_mask);
1828 /* Update vectorial force */
1829 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1830 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1831 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1833 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1834 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1835 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1839 /**************************
1840 * CALCULATE INTERACTIONS *
1841 **************************/
1843 if (gmx_mm_any_lt(rsq21,rcutoff2))
1846 r21 = _mm_mul_ps(rsq21,rinv21);
1848 /* EWALD ELECTROSTATICS */
1850 /* Analytical PME correction */
1851 zeta2 = _mm_mul_ps(beta2,rsq21);
1852 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1853 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1854 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1855 felec = _mm_mul_ps(qq21,felec);
1856 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1857 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1858 velec = _mm_mul_ps(qq21,velec);
1860 d = _mm_sub_ps(r21,rswitch);
1861 d = _mm_max_ps(d,_mm_setzero_ps());
1862 d2 = _mm_mul_ps(d,d);
1863 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1865 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1867 /* Evaluate switch function */
1868 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1869 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1870 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1874 fscal = _mm_and_ps(fscal,cutoff_mask);
1876 /* Update vectorial force */
1877 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1878 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1879 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1881 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1882 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1883 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 if (gmx_mm_any_lt(rsq22,rcutoff2))
1894 r22 = _mm_mul_ps(rsq22,rinv22);
1896 /* EWALD ELECTROSTATICS */
1898 /* Analytical PME correction */
1899 zeta2 = _mm_mul_ps(beta2,rsq22);
1900 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1901 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1902 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1903 felec = _mm_mul_ps(qq22,felec);
1904 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1905 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1906 velec = _mm_mul_ps(qq22,velec);
1908 d = _mm_sub_ps(r22,rswitch);
1909 d = _mm_max_ps(d,_mm_setzero_ps());
1910 d2 = _mm_mul_ps(d,d);
1911 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1913 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1915 /* Evaluate switch function */
1916 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1917 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1918 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1922 fscal = _mm_and_ps(fscal,cutoff_mask);
1924 /* Update vectorial force */
1925 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1926 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1927 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1929 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1930 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1931 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1935 /**************************
1936 * CALCULATE INTERACTIONS *
1937 **************************/
1939 if (gmx_mm_any_lt(rsq23,rcutoff2))
1942 r23 = _mm_mul_ps(rsq23,rinv23);
1944 /* EWALD ELECTROSTATICS */
1946 /* Analytical PME correction */
1947 zeta2 = _mm_mul_ps(beta2,rsq23);
1948 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1949 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1950 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1951 felec = _mm_mul_ps(qq23,felec);
1952 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1953 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1954 velec = _mm_mul_ps(qq23,velec);
1956 d = _mm_sub_ps(r23,rswitch);
1957 d = _mm_max_ps(d,_mm_setzero_ps());
1958 d2 = _mm_mul_ps(d,d);
1959 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1961 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1963 /* Evaluate switch function */
1964 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1965 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1966 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1970 fscal = _mm_and_ps(fscal,cutoff_mask);
1972 /* Update vectorial force */
1973 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1974 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1975 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1977 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1978 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1979 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1983 /**************************
1984 * CALCULATE INTERACTIONS *
1985 **************************/
1987 if (gmx_mm_any_lt(rsq31,rcutoff2))
1990 r31 = _mm_mul_ps(rsq31,rinv31);
1992 /* EWALD ELECTROSTATICS */
1994 /* Analytical PME correction */
1995 zeta2 = _mm_mul_ps(beta2,rsq31);
1996 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1997 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1998 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1999 felec = _mm_mul_ps(qq31,felec);
2000 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2001 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
2002 velec = _mm_mul_ps(qq31,velec);
2004 d = _mm_sub_ps(r31,rswitch);
2005 d = _mm_max_ps(d,_mm_setzero_ps());
2006 d2 = _mm_mul_ps(d,d);
2007 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2009 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2011 /* Evaluate switch function */
2012 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2013 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2014 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2018 fscal = _mm_and_ps(fscal,cutoff_mask);
2020 /* Update vectorial force */
2021 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2022 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2023 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2025 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2026 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2027 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2031 /**************************
2032 * CALCULATE INTERACTIONS *
2033 **************************/
2035 if (gmx_mm_any_lt(rsq32,rcutoff2))
2038 r32 = _mm_mul_ps(rsq32,rinv32);
2040 /* EWALD ELECTROSTATICS */
2042 /* Analytical PME correction */
2043 zeta2 = _mm_mul_ps(beta2,rsq32);
2044 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2045 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2046 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2047 felec = _mm_mul_ps(qq32,felec);
2048 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2049 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
2050 velec = _mm_mul_ps(qq32,velec);
2052 d = _mm_sub_ps(r32,rswitch);
2053 d = _mm_max_ps(d,_mm_setzero_ps());
2054 d2 = _mm_mul_ps(d,d);
2055 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2057 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2059 /* Evaluate switch function */
2060 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2061 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2062 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2066 fscal = _mm_and_ps(fscal,cutoff_mask);
2068 /* Update vectorial force */
2069 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2070 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2071 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2073 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2074 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2075 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2079 /**************************
2080 * CALCULATE INTERACTIONS *
2081 **************************/
2083 if (gmx_mm_any_lt(rsq33,rcutoff2))
2086 r33 = _mm_mul_ps(rsq33,rinv33);
2088 /* EWALD ELECTROSTATICS */
2090 /* Analytical PME correction */
2091 zeta2 = _mm_mul_ps(beta2,rsq33);
2092 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2093 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2094 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2095 felec = _mm_mul_ps(qq33,felec);
2096 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2097 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
2098 velec = _mm_mul_ps(qq33,velec);
2100 d = _mm_sub_ps(r33,rswitch);
2101 d = _mm_max_ps(d,_mm_setzero_ps());
2102 d2 = _mm_mul_ps(d,d);
2103 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2105 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2107 /* Evaluate switch function */
2108 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2109 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2110 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2114 fscal = _mm_and_ps(fscal,cutoff_mask);
2116 /* Update vectorial force */
2117 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2118 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2119 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2121 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2122 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2123 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2127 fjptrA = f+j_coord_offsetA;
2128 fjptrB = f+j_coord_offsetB;
2129 fjptrC = f+j_coord_offsetC;
2130 fjptrD = f+j_coord_offsetD;
2132 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2133 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2135 /* Inner loop uses 450 flops */
2138 if(jidx<j_index_end)
2141 /* Get j neighbor index, and coordinate index */
2142 jnrlistA = jjnr[jidx];
2143 jnrlistB = jjnr[jidx+1];
2144 jnrlistC = jjnr[jidx+2];
2145 jnrlistD = jjnr[jidx+3];
2146 /* Sign of each element will be negative for non-real atoms.
2147 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2148 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2150 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2151 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2152 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2153 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2154 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2155 j_coord_offsetA = DIM*jnrA;
2156 j_coord_offsetB = DIM*jnrB;
2157 j_coord_offsetC = DIM*jnrC;
2158 j_coord_offsetD = DIM*jnrD;
2160 /* load j atom coordinates */
2161 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2162 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2163 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2165 /* Calculate displacement vector */
2166 dx11 = _mm_sub_ps(ix1,jx1);
2167 dy11 = _mm_sub_ps(iy1,jy1);
2168 dz11 = _mm_sub_ps(iz1,jz1);
2169 dx12 = _mm_sub_ps(ix1,jx2);
2170 dy12 = _mm_sub_ps(iy1,jy2);
2171 dz12 = _mm_sub_ps(iz1,jz2);
2172 dx13 = _mm_sub_ps(ix1,jx3);
2173 dy13 = _mm_sub_ps(iy1,jy3);
2174 dz13 = _mm_sub_ps(iz1,jz3);
2175 dx21 = _mm_sub_ps(ix2,jx1);
2176 dy21 = _mm_sub_ps(iy2,jy1);
2177 dz21 = _mm_sub_ps(iz2,jz1);
2178 dx22 = _mm_sub_ps(ix2,jx2);
2179 dy22 = _mm_sub_ps(iy2,jy2);
2180 dz22 = _mm_sub_ps(iz2,jz2);
2181 dx23 = _mm_sub_ps(ix2,jx3);
2182 dy23 = _mm_sub_ps(iy2,jy3);
2183 dz23 = _mm_sub_ps(iz2,jz3);
2184 dx31 = _mm_sub_ps(ix3,jx1);
2185 dy31 = _mm_sub_ps(iy3,jy1);
2186 dz31 = _mm_sub_ps(iz3,jz1);
2187 dx32 = _mm_sub_ps(ix3,jx2);
2188 dy32 = _mm_sub_ps(iy3,jy2);
2189 dz32 = _mm_sub_ps(iz3,jz2);
2190 dx33 = _mm_sub_ps(ix3,jx3);
2191 dy33 = _mm_sub_ps(iy3,jy3);
2192 dz33 = _mm_sub_ps(iz3,jz3);
2194 /* Calculate squared distance and things based on it */
2195 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2196 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2197 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2198 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2199 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2200 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2201 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2202 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2203 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2205 rinv11 = avx128fma_invsqrt_f(rsq11);
2206 rinv12 = avx128fma_invsqrt_f(rsq12);
2207 rinv13 = avx128fma_invsqrt_f(rsq13);
2208 rinv21 = avx128fma_invsqrt_f(rsq21);
2209 rinv22 = avx128fma_invsqrt_f(rsq22);
2210 rinv23 = avx128fma_invsqrt_f(rsq23);
2211 rinv31 = avx128fma_invsqrt_f(rsq31);
2212 rinv32 = avx128fma_invsqrt_f(rsq32);
2213 rinv33 = avx128fma_invsqrt_f(rsq33);
2215 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2216 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2217 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2218 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2219 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2220 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2221 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2222 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2223 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2225 fjx1 = _mm_setzero_ps();
2226 fjy1 = _mm_setzero_ps();
2227 fjz1 = _mm_setzero_ps();
2228 fjx2 = _mm_setzero_ps();
2229 fjy2 = _mm_setzero_ps();
2230 fjz2 = _mm_setzero_ps();
2231 fjx3 = _mm_setzero_ps();
2232 fjy3 = _mm_setzero_ps();
2233 fjz3 = _mm_setzero_ps();
2235 /**************************
2236 * CALCULATE INTERACTIONS *
2237 **************************/
2239 if (gmx_mm_any_lt(rsq11,rcutoff2))
2242 r11 = _mm_mul_ps(rsq11,rinv11);
2243 r11 = _mm_andnot_ps(dummy_mask,r11);
2245 /* EWALD ELECTROSTATICS */
2247 /* Analytical PME correction */
2248 zeta2 = _mm_mul_ps(beta2,rsq11);
2249 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2250 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2251 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2252 felec = _mm_mul_ps(qq11,felec);
2253 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2254 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2255 velec = _mm_mul_ps(qq11,velec);
2257 d = _mm_sub_ps(r11,rswitch);
2258 d = _mm_max_ps(d,_mm_setzero_ps());
2259 d2 = _mm_mul_ps(d,d);
2260 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2262 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2264 /* Evaluate switch function */
2265 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2266 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2267 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2271 fscal = _mm_and_ps(fscal,cutoff_mask);
2273 fscal = _mm_andnot_ps(dummy_mask,fscal);
2275 /* Update vectorial force */
2276 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2277 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2278 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2280 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2281 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2282 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2286 /**************************
2287 * CALCULATE INTERACTIONS *
2288 **************************/
2290 if (gmx_mm_any_lt(rsq12,rcutoff2))
2293 r12 = _mm_mul_ps(rsq12,rinv12);
2294 r12 = _mm_andnot_ps(dummy_mask,r12);
2296 /* EWALD ELECTROSTATICS */
2298 /* Analytical PME correction */
2299 zeta2 = _mm_mul_ps(beta2,rsq12);
2300 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2301 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2302 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2303 felec = _mm_mul_ps(qq12,felec);
2304 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2305 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2306 velec = _mm_mul_ps(qq12,velec);
2308 d = _mm_sub_ps(r12,rswitch);
2309 d = _mm_max_ps(d,_mm_setzero_ps());
2310 d2 = _mm_mul_ps(d,d);
2311 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2313 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2315 /* Evaluate switch function */
2316 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2317 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2318 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2322 fscal = _mm_and_ps(fscal,cutoff_mask);
2324 fscal = _mm_andnot_ps(dummy_mask,fscal);
2326 /* Update vectorial force */
2327 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2328 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2329 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2331 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2332 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2333 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2337 /**************************
2338 * CALCULATE INTERACTIONS *
2339 **************************/
2341 if (gmx_mm_any_lt(rsq13,rcutoff2))
2344 r13 = _mm_mul_ps(rsq13,rinv13);
2345 r13 = _mm_andnot_ps(dummy_mask,r13);
2347 /* EWALD ELECTROSTATICS */
2349 /* Analytical PME correction */
2350 zeta2 = _mm_mul_ps(beta2,rsq13);
2351 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
2352 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2353 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2354 felec = _mm_mul_ps(qq13,felec);
2355 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2356 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
2357 velec = _mm_mul_ps(qq13,velec);
2359 d = _mm_sub_ps(r13,rswitch);
2360 d = _mm_max_ps(d,_mm_setzero_ps());
2361 d2 = _mm_mul_ps(d,d);
2362 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2364 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2366 /* Evaluate switch function */
2367 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2368 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2369 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2373 fscal = _mm_and_ps(fscal,cutoff_mask);
2375 fscal = _mm_andnot_ps(dummy_mask,fscal);
2377 /* Update vectorial force */
2378 fix1 = _mm_macc_ps(dx13,fscal,fix1);
2379 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
2380 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
2382 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
2383 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
2384 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
2388 /**************************
2389 * CALCULATE INTERACTIONS *
2390 **************************/
2392 if (gmx_mm_any_lt(rsq21,rcutoff2))
2395 r21 = _mm_mul_ps(rsq21,rinv21);
2396 r21 = _mm_andnot_ps(dummy_mask,r21);
2398 /* EWALD ELECTROSTATICS */
2400 /* Analytical PME correction */
2401 zeta2 = _mm_mul_ps(beta2,rsq21);
2402 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2403 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2404 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2405 felec = _mm_mul_ps(qq21,felec);
2406 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2407 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2408 velec = _mm_mul_ps(qq21,velec);
2410 d = _mm_sub_ps(r21,rswitch);
2411 d = _mm_max_ps(d,_mm_setzero_ps());
2412 d2 = _mm_mul_ps(d,d);
2413 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2415 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2417 /* Evaluate switch function */
2418 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2419 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2420 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2424 fscal = _mm_and_ps(fscal,cutoff_mask);
2426 fscal = _mm_andnot_ps(dummy_mask,fscal);
2428 /* Update vectorial force */
2429 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2430 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2431 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2433 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2434 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2435 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2439 /**************************
2440 * CALCULATE INTERACTIONS *
2441 **************************/
2443 if (gmx_mm_any_lt(rsq22,rcutoff2))
2446 r22 = _mm_mul_ps(rsq22,rinv22);
2447 r22 = _mm_andnot_ps(dummy_mask,r22);
2449 /* EWALD ELECTROSTATICS */
2451 /* Analytical PME correction */
2452 zeta2 = _mm_mul_ps(beta2,rsq22);
2453 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2454 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2455 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2456 felec = _mm_mul_ps(qq22,felec);
2457 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2458 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2459 velec = _mm_mul_ps(qq22,velec);
2461 d = _mm_sub_ps(r22,rswitch);
2462 d = _mm_max_ps(d,_mm_setzero_ps());
2463 d2 = _mm_mul_ps(d,d);
2464 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2466 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2468 /* Evaluate switch function */
2469 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2470 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2471 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2475 fscal = _mm_and_ps(fscal,cutoff_mask);
2477 fscal = _mm_andnot_ps(dummy_mask,fscal);
2479 /* Update vectorial force */
2480 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2481 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2482 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2484 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2485 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2486 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2490 /**************************
2491 * CALCULATE INTERACTIONS *
2492 **************************/
2494 if (gmx_mm_any_lt(rsq23,rcutoff2))
2497 r23 = _mm_mul_ps(rsq23,rinv23);
2498 r23 = _mm_andnot_ps(dummy_mask,r23);
2500 /* EWALD ELECTROSTATICS */
2502 /* Analytical PME correction */
2503 zeta2 = _mm_mul_ps(beta2,rsq23);
2504 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
2505 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2506 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2507 felec = _mm_mul_ps(qq23,felec);
2508 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2509 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
2510 velec = _mm_mul_ps(qq23,velec);
2512 d = _mm_sub_ps(r23,rswitch);
2513 d = _mm_max_ps(d,_mm_setzero_ps());
2514 d2 = _mm_mul_ps(d,d);
2515 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2517 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2519 /* Evaluate switch function */
2520 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2521 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2522 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2526 fscal = _mm_and_ps(fscal,cutoff_mask);
2528 fscal = _mm_andnot_ps(dummy_mask,fscal);
2530 /* Update vectorial force */
2531 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2532 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2533 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2535 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2536 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2537 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2541 /**************************
2542 * CALCULATE INTERACTIONS *
2543 **************************/
2545 if (gmx_mm_any_lt(rsq31,rcutoff2))
2548 r31 = _mm_mul_ps(rsq31,rinv31);
2549 r31 = _mm_andnot_ps(dummy_mask,r31);
2551 /* EWALD ELECTROSTATICS */
2553 /* Analytical PME correction */
2554 zeta2 = _mm_mul_ps(beta2,rsq31);
2555 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
2556 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2557 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2558 felec = _mm_mul_ps(qq31,felec);
2559 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2560 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
2561 velec = _mm_mul_ps(qq31,velec);
2563 d = _mm_sub_ps(r31,rswitch);
2564 d = _mm_max_ps(d,_mm_setzero_ps());
2565 d2 = _mm_mul_ps(d,d);
2566 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2568 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2570 /* Evaluate switch function */
2571 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2572 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2573 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2577 fscal = _mm_and_ps(fscal,cutoff_mask);
2579 fscal = _mm_andnot_ps(dummy_mask,fscal);
2581 /* Update vectorial force */
2582 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2583 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2584 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2586 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2587 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2588 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2592 /**************************
2593 * CALCULATE INTERACTIONS *
2594 **************************/
2596 if (gmx_mm_any_lt(rsq32,rcutoff2))
2599 r32 = _mm_mul_ps(rsq32,rinv32);
2600 r32 = _mm_andnot_ps(dummy_mask,r32);
2602 /* EWALD ELECTROSTATICS */
2604 /* Analytical PME correction */
2605 zeta2 = _mm_mul_ps(beta2,rsq32);
2606 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2607 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2608 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2609 felec = _mm_mul_ps(qq32,felec);
2610 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2611 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
2612 velec = _mm_mul_ps(qq32,velec);
2614 d = _mm_sub_ps(r32,rswitch);
2615 d = _mm_max_ps(d,_mm_setzero_ps());
2616 d2 = _mm_mul_ps(d,d);
2617 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2619 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2621 /* Evaluate switch function */
2622 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2623 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2624 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2628 fscal = _mm_and_ps(fscal,cutoff_mask);
2630 fscal = _mm_andnot_ps(dummy_mask,fscal);
2632 /* Update vectorial force */
2633 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2634 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2635 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2637 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2638 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2639 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2643 /**************************
2644 * CALCULATE INTERACTIONS *
2645 **************************/
2647 if (gmx_mm_any_lt(rsq33,rcutoff2))
2650 r33 = _mm_mul_ps(rsq33,rinv33);
2651 r33 = _mm_andnot_ps(dummy_mask,r33);
2653 /* EWALD ELECTROSTATICS */
2655 /* Analytical PME correction */
2656 zeta2 = _mm_mul_ps(beta2,rsq33);
2657 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2658 pmecorrF = avx128fma_pmecorrF_f(zeta2);
2659 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2660 felec = _mm_mul_ps(qq33,felec);
2661 pmecorrV = avx128fma_pmecorrV_f(zeta2);
2662 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
2663 velec = _mm_mul_ps(qq33,velec);
2665 d = _mm_sub_ps(r33,rswitch);
2666 d = _mm_max_ps(d,_mm_setzero_ps());
2667 d2 = _mm_mul_ps(d,d);
2668 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2670 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2672 /* Evaluate switch function */
2673 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2674 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2675 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2679 fscal = _mm_and_ps(fscal,cutoff_mask);
2681 fscal = _mm_andnot_ps(dummy_mask,fscal);
2683 /* Update vectorial force */
2684 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2685 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2686 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2688 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2689 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2690 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2694 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2695 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2696 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2697 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2699 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2700 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2702 /* Inner loop uses 459 flops */
2705 /* End of innermost loop */
2707 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2708 f+i_coord_offset+DIM,fshift+i_shift_offset);
2710 /* Increment number of inner iterations */
2711 inneriter += j_index_end - j_index_start;
2713 /* Outer loop uses 18 flops */
2716 /* Increment number of outer iterations */
2719 /* Update outer/inner flops */
2721 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*459);