2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrE,jnrF,jnrG,jnrH;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 real * vdwioffsetptr1;
86 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 real * vdwioffsetptr3;
90 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
92 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
94 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
96 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128i ewitab_lo,ewitab_hi;
110 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
111 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
113 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
114 real rswitch_scalar,d_scalar;
115 __m256 dummy_mask,cutoff_mask;
116 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
117 __m256 one = _mm256_set1_ps(1.0);
118 __m256 two = _mm256_set1_ps(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm256_set1_ps(fr->ic->epsfac);
131 charge = mdatoms->chargeA;
133 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
134 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
135 beta2 = _mm256_mul_ps(beta,beta);
136 beta3 = _mm256_mul_ps(beta,beta2);
138 ewtab = fr->ic->tabq_coul_FDV0;
139 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
140 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
142 /* Setup water-specific parameters */
143 inr = nlist->iinr[0];
144 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
145 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
146 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
148 jq1 = _mm256_set1_ps(charge[inr+1]);
149 jq2 = _mm256_set1_ps(charge[inr+2]);
150 jq3 = _mm256_set1_ps(charge[inr+3]);
151 qq11 = _mm256_mul_ps(iq1,jq1);
152 qq12 = _mm256_mul_ps(iq1,jq2);
153 qq13 = _mm256_mul_ps(iq1,jq3);
154 qq21 = _mm256_mul_ps(iq2,jq1);
155 qq22 = _mm256_mul_ps(iq2,jq2);
156 qq23 = _mm256_mul_ps(iq2,jq3);
157 qq31 = _mm256_mul_ps(iq3,jq1);
158 qq32 = _mm256_mul_ps(iq3,jq2);
159 qq33 = _mm256_mul_ps(iq3,jq3);
161 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
162 rcutoff_scalar = fr->ic->rcoulomb;
163 rcutoff = _mm256_set1_ps(rcutoff_scalar);
164 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
166 rswitch_scalar = fr->ic->rcoulomb_switch;
167 rswitch = _mm256_set1_ps(rswitch_scalar);
168 /* Setup switch parameters */
169 d_scalar = rcutoff_scalar-rswitch_scalar;
170 d = _mm256_set1_ps(d_scalar);
171 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
172 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
173 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
174 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
175 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
176 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
178 /* Avoid stupid compiler warnings */
179 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
192 for(iidx=0;iidx<4*DIM;iidx++)
197 /* Start outer loop over neighborlists */
198 for(iidx=0; iidx<nri; iidx++)
200 /* Load shift vector for this list */
201 i_shift_offset = DIM*shiftidx[iidx];
203 /* Load limits for loop over neighbors */
204 j_index_start = jindex[iidx];
205 j_index_end = jindex[iidx+1];
207 /* Get outer coordinate index */
209 i_coord_offset = DIM*inr;
211 /* Load i particle coords and add shift vector */
212 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
213 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
215 fix1 = _mm256_setzero_ps();
216 fiy1 = _mm256_setzero_ps();
217 fiz1 = _mm256_setzero_ps();
218 fix2 = _mm256_setzero_ps();
219 fiy2 = _mm256_setzero_ps();
220 fiz2 = _mm256_setzero_ps();
221 fix3 = _mm256_setzero_ps();
222 fiy3 = _mm256_setzero_ps();
223 fiz3 = _mm256_setzero_ps();
225 /* Reset potential sums */
226 velecsum = _mm256_setzero_ps();
228 /* Start inner kernel loop */
229 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
232 /* Get j neighbor index, and coordinate index */
241 j_coord_offsetA = DIM*jnrA;
242 j_coord_offsetB = DIM*jnrB;
243 j_coord_offsetC = DIM*jnrC;
244 j_coord_offsetD = DIM*jnrD;
245 j_coord_offsetE = DIM*jnrE;
246 j_coord_offsetF = DIM*jnrF;
247 j_coord_offsetG = DIM*jnrG;
248 j_coord_offsetH = DIM*jnrH;
250 /* load j atom coordinates */
251 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
252 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
253 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
254 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
255 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
257 /* Calculate displacement vector */
258 dx11 = _mm256_sub_ps(ix1,jx1);
259 dy11 = _mm256_sub_ps(iy1,jy1);
260 dz11 = _mm256_sub_ps(iz1,jz1);
261 dx12 = _mm256_sub_ps(ix1,jx2);
262 dy12 = _mm256_sub_ps(iy1,jy2);
263 dz12 = _mm256_sub_ps(iz1,jz2);
264 dx13 = _mm256_sub_ps(ix1,jx3);
265 dy13 = _mm256_sub_ps(iy1,jy3);
266 dz13 = _mm256_sub_ps(iz1,jz3);
267 dx21 = _mm256_sub_ps(ix2,jx1);
268 dy21 = _mm256_sub_ps(iy2,jy1);
269 dz21 = _mm256_sub_ps(iz2,jz1);
270 dx22 = _mm256_sub_ps(ix2,jx2);
271 dy22 = _mm256_sub_ps(iy2,jy2);
272 dz22 = _mm256_sub_ps(iz2,jz2);
273 dx23 = _mm256_sub_ps(ix2,jx3);
274 dy23 = _mm256_sub_ps(iy2,jy3);
275 dz23 = _mm256_sub_ps(iz2,jz3);
276 dx31 = _mm256_sub_ps(ix3,jx1);
277 dy31 = _mm256_sub_ps(iy3,jy1);
278 dz31 = _mm256_sub_ps(iz3,jz1);
279 dx32 = _mm256_sub_ps(ix3,jx2);
280 dy32 = _mm256_sub_ps(iy3,jy2);
281 dz32 = _mm256_sub_ps(iz3,jz2);
282 dx33 = _mm256_sub_ps(ix3,jx3);
283 dy33 = _mm256_sub_ps(iy3,jy3);
284 dz33 = _mm256_sub_ps(iz3,jz3);
286 /* Calculate squared distance and things based on it */
287 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
288 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
289 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
290 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
291 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
292 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
293 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
294 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
295 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
297 rinv11 = avx256_invsqrt_f(rsq11);
298 rinv12 = avx256_invsqrt_f(rsq12);
299 rinv13 = avx256_invsqrt_f(rsq13);
300 rinv21 = avx256_invsqrt_f(rsq21);
301 rinv22 = avx256_invsqrt_f(rsq22);
302 rinv23 = avx256_invsqrt_f(rsq23);
303 rinv31 = avx256_invsqrt_f(rsq31);
304 rinv32 = avx256_invsqrt_f(rsq32);
305 rinv33 = avx256_invsqrt_f(rsq33);
307 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
308 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
309 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
310 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
311 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
312 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
313 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
314 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
315 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
317 fjx1 = _mm256_setzero_ps();
318 fjy1 = _mm256_setzero_ps();
319 fjz1 = _mm256_setzero_ps();
320 fjx2 = _mm256_setzero_ps();
321 fjy2 = _mm256_setzero_ps();
322 fjz2 = _mm256_setzero_ps();
323 fjx3 = _mm256_setzero_ps();
324 fjy3 = _mm256_setzero_ps();
325 fjz3 = _mm256_setzero_ps();
327 /**************************
328 * CALCULATE INTERACTIONS *
329 **************************/
331 if (gmx_mm256_any_lt(rsq11,rcutoff2))
334 r11 = _mm256_mul_ps(rsq11,rinv11);
336 /* EWALD ELECTROSTATICS */
338 /* Analytical PME correction */
339 zeta2 = _mm256_mul_ps(beta2,rsq11);
340 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
341 pmecorrF = avx256_pmecorrF_f(zeta2);
342 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
343 felec = _mm256_mul_ps(qq11,felec);
344 pmecorrV = avx256_pmecorrV_f(zeta2);
345 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
346 velec = _mm256_sub_ps(rinv11,pmecorrV);
347 velec = _mm256_mul_ps(qq11,velec);
349 d = _mm256_sub_ps(r11,rswitch);
350 d = _mm256_max_ps(d,_mm256_setzero_ps());
351 d2 = _mm256_mul_ps(d,d);
352 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
354 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
356 /* Evaluate switch function */
357 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
358 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
359 velec = _mm256_mul_ps(velec,sw);
360 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velec = _mm256_and_ps(velec,cutoff_mask);
364 velecsum = _mm256_add_ps(velecsum,velec);
368 fscal = _mm256_and_ps(fscal,cutoff_mask);
370 /* Calculate temporary vectorial force */
371 tx = _mm256_mul_ps(fscal,dx11);
372 ty = _mm256_mul_ps(fscal,dy11);
373 tz = _mm256_mul_ps(fscal,dz11);
375 /* Update vectorial force */
376 fix1 = _mm256_add_ps(fix1,tx);
377 fiy1 = _mm256_add_ps(fiy1,ty);
378 fiz1 = _mm256_add_ps(fiz1,tz);
380 fjx1 = _mm256_add_ps(fjx1,tx);
381 fjy1 = _mm256_add_ps(fjy1,ty);
382 fjz1 = _mm256_add_ps(fjz1,tz);
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 if (gmx_mm256_any_lt(rsq12,rcutoff2))
393 r12 = _mm256_mul_ps(rsq12,rinv12);
395 /* EWALD ELECTROSTATICS */
397 /* Analytical PME correction */
398 zeta2 = _mm256_mul_ps(beta2,rsq12);
399 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
400 pmecorrF = avx256_pmecorrF_f(zeta2);
401 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
402 felec = _mm256_mul_ps(qq12,felec);
403 pmecorrV = avx256_pmecorrV_f(zeta2);
404 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
405 velec = _mm256_sub_ps(rinv12,pmecorrV);
406 velec = _mm256_mul_ps(qq12,velec);
408 d = _mm256_sub_ps(r12,rswitch);
409 d = _mm256_max_ps(d,_mm256_setzero_ps());
410 d2 = _mm256_mul_ps(d,d);
411 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
413 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
415 /* Evaluate switch function */
416 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
417 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
418 velec = _mm256_mul_ps(velec,sw);
419 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
421 /* Update potential sum for this i atom from the interaction with this j atom. */
422 velec = _mm256_and_ps(velec,cutoff_mask);
423 velecsum = _mm256_add_ps(velecsum,velec);
427 fscal = _mm256_and_ps(fscal,cutoff_mask);
429 /* Calculate temporary vectorial force */
430 tx = _mm256_mul_ps(fscal,dx12);
431 ty = _mm256_mul_ps(fscal,dy12);
432 tz = _mm256_mul_ps(fscal,dz12);
434 /* Update vectorial force */
435 fix1 = _mm256_add_ps(fix1,tx);
436 fiy1 = _mm256_add_ps(fiy1,ty);
437 fiz1 = _mm256_add_ps(fiz1,tz);
439 fjx2 = _mm256_add_ps(fjx2,tx);
440 fjy2 = _mm256_add_ps(fjy2,ty);
441 fjz2 = _mm256_add_ps(fjz2,tz);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
449 if (gmx_mm256_any_lt(rsq13,rcutoff2))
452 r13 = _mm256_mul_ps(rsq13,rinv13);
454 /* EWALD ELECTROSTATICS */
456 /* Analytical PME correction */
457 zeta2 = _mm256_mul_ps(beta2,rsq13);
458 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
459 pmecorrF = avx256_pmecorrF_f(zeta2);
460 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
461 felec = _mm256_mul_ps(qq13,felec);
462 pmecorrV = avx256_pmecorrV_f(zeta2);
463 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
464 velec = _mm256_sub_ps(rinv13,pmecorrV);
465 velec = _mm256_mul_ps(qq13,velec);
467 d = _mm256_sub_ps(r13,rswitch);
468 d = _mm256_max_ps(d,_mm256_setzero_ps());
469 d2 = _mm256_mul_ps(d,d);
470 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
472 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
474 /* Evaluate switch function */
475 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
476 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
477 velec = _mm256_mul_ps(velec,sw);
478 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
480 /* Update potential sum for this i atom from the interaction with this j atom. */
481 velec = _mm256_and_ps(velec,cutoff_mask);
482 velecsum = _mm256_add_ps(velecsum,velec);
486 fscal = _mm256_and_ps(fscal,cutoff_mask);
488 /* Calculate temporary vectorial force */
489 tx = _mm256_mul_ps(fscal,dx13);
490 ty = _mm256_mul_ps(fscal,dy13);
491 tz = _mm256_mul_ps(fscal,dz13);
493 /* Update vectorial force */
494 fix1 = _mm256_add_ps(fix1,tx);
495 fiy1 = _mm256_add_ps(fiy1,ty);
496 fiz1 = _mm256_add_ps(fiz1,tz);
498 fjx3 = _mm256_add_ps(fjx3,tx);
499 fjy3 = _mm256_add_ps(fjy3,ty);
500 fjz3 = _mm256_add_ps(fjz3,tz);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 if (gmx_mm256_any_lt(rsq21,rcutoff2))
511 r21 = _mm256_mul_ps(rsq21,rinv21);
513 /* EWALD ELECTROSTATICS */
515 /* Analytical PME correction */
516 zeta2 = _mm256_mul_ps(beta2,rsq21);
517 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
518 pmecorrF = avx256_pmecorrF_f(zeta2);
519 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
520 felec = _mm256_mul_ps(qq21,felec);
521 pmecorrV = avx256_pmecorrV_f(zeta2);
522 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
523 velec = _mm256_sub_ps(rinv21,pmecorrV);
524 velec = _mm256_mul_ps(qq21,velec);
526 d = _mm256_sub_ps(r21,rswitch);
527 d = _mm256_max_ps(d,_mm256_setzero_ps());
528 d2 = _mm256_mul_ps(d,d);
529 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
531 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
533 /* Evaluate switch function */
534 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
535 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
536 velec = _mm256_mul_ps(velec,sw);
537 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velec = _mm256_and_ps(velec,cutoff_mask);
541 velecsum = _mm256_add_ps(velecsum,velec);
545 fscal = _mm256_and_ps(fscal,cutoff_mask);
547 /* Calculate temporary vectorial force */
548 tx = _mm256_mul_ps(fscal,dx21);
549 ty = _mm256_mul_ps(fscal,dy21);
550 tz = _mm256_mul_ps(fscal,dz21);
552 /* Update vectorial force */
553 fix2 = _mm256_add_ps(fix2,tx);
554 fiy2 = _mm256_add_ps(fiy2,ty);
555 fiz2 = _mm256_add_ps(fiz2,tz);
557 fjx1 = _mm256_add_ps(fjx1,tx);
558 fjy1 = _mm256_add_ps(fjy1,ty);
559 fjz1 = _mm256_add_ps(fjz1,tz);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 if (gmx_mm256_any_lt(rsq22,rcutoff2))
570 r22 = _mm256_mul_ps(rsq22,rinv22);
572 /* EWALD ELECTROSTATICS */
574 /* Analytical PME correction */
575 zeta2 = _mm256_mul_ps(beta2,rsq22);
576 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
577 pmecorrF = avx256_pmecorrF_f(zeta2);
578 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
579 felec = _mm256_mul_ps(qq22,felec);
580 pmecorrV = avx256_pmecorrV_f(zeta2);
581 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
582 velec = _mm256_sub_ps(rinv22,pmecorrV);
583 velec = _mm256_mul_ps(qq22,velec);
585 d = _mm256_sub_ps(r22,rswitch);
586 d = _mm256_max_ps(d,_mm256_setzero_ps());
587 d2 = _mm256_mul_ps(d,d);
588 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
590 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
592 /* Evaluate switch function */
593 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
594 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
595 velec = _mm256_mul_ps(velec,sw);
596 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velec = _mm256_and_ps(velec,cutoff_mask);
600 velecsum = _mm256_add_ps(velecsum,velec);
604 fscal = _mm256_and_ps(fscal,cutoff_mask);
606 /* Calculate temporary vectorial force */
607 tx = _mm256_mul_ps(fscal,dx22);
608 ty = _mm256_mul_ps(fscal,dy22);
609 tz = _mm256_mul_ps(fscal,dz22);
611 /* Update vectorial force */
612 fix2 = _mm256_add_ps(fix2,tx);
613 fiy2 = _mm256_add_ps(fiy2,ty);
614 fiz2 = _mm256_add_ps(fiz2,tz);
616 fjx2 = _mm256_add_ps(fjx2,tx);
617 fjy2 = _mm256_add_ps(fjy2,ty);
618 fjz2 = _mm256_add_ps(fjz2,tz);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 if (gmx_mm256_any_lt(rsq23,rcutoff2))
629 r23 = _mm256_mul_ps(rsq23,rinv23);
631 /* EWALD ELECTROSTATICS */
633 /* Analytical PME correction */
634 zeta2 = _mm256_mul_ps(beta2,rsq23);
635 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
636 pmecorrF = avx256_pmecorrF_f(zeta2);
637 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
638 felec = _mm256_mul_ps(qq23,felec);
639 pmecorrV = avx256_pmecorrV_f(zeta2);
640 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
641 velec = _mm256_sub_ps(rinv23,pmecorrV);
642 velec = _mm256_mul_ps(qq23,velec);
644 d = _mm256_sub_ps(r23,rswitch);
645 d = _mm256_max_ps(d,_mm256_setzero_ps());
646 d2 = _mm256_mul_ps(d,d);
647 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
649 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
651 /* Evaluate switch function */
652 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
653 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
654 velec = _mm256_mul_ps(velec,sw);
655 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velec = _mm256_and_ps(velec,cutoff_mask);
659 velecsum = _mm256_add_ps(velecsum,velec);
663 fscal = _mm256_and_ps(fscal,cutoff_mask);
665 /* Calculate temporary vectorial force */
666 tx = _mm256_mul_ps(fscal,dx23);
667 ty = _mm256_mul_ps(fscal,dy23);
668 tz = _mm256_mul_ps(fscal,dz23);
670 /* Update vectorial force */
671 fix2 = _mm256_add_ps(fix2,tx);
672 fiy2 = _mm256_add_ps(fiy2,ty);
673 fiz2 = _mm256_add_ps(fiz2,tz);
675 fjx3 = _mm256_add_ps(fjx3,tx);
676 fjy3 = _mm256_add_ps(fjy3,ty);
677 fjz3 = _mm256_add_ps(fjz3,tz);
681 /**************************
682 * CALCULATE INTERACTIONS *
683 **************************/
685 if (gmx_mm256_any_lt(rsq31,rcutoff2))
688 r31 = _mm256_mul_ps(rsq31,rinv31);
690 /* EWALD ELECTROSTATICS */
692 /* Analytical PME correction */
693 zeta2 = _mm256_mul_ps(beta2,rsq31);
694 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
695 pmecorrF = avx256_pmecorrF_f(zeta2);
696 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
697 felec = _mm256_mul_ps(qq31,felec);
698 pmecorrV = avx256_pmecorrV_f(zeta2);
699 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
700 velec = _mm256_sub_ps(rinv31,pmecorrV);
701 velec = _mm256_mul_ps(qq31,velec);
703 d = _mm256_sub_ps(r31,rswitch);
704 d = _mm256_max_ps(d,_mm256_setzero_ps());
705 d2 = _mm256_mul_ps(d,d);
706 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
708 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
710 /* Evaluate switch function */
711 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
712 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
713 velec = _mm256_mul_ps(velec,sw);
714 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
716 /* Update potential sum for this i atom from the interaction with this j atom. */
717 velec = _mm256_and_ps(velec,cutoff_mask);
718 velecsum = _mm256_add_ps(velecsum,velec);
722 fscal = _mm256_and_ps(fscal,cutoff_mask);
724 /* Calculate temporary vectorial force */
725 tx = _mm256_mul_ps(fscal,dx31);
726 ty = _mm256_mul_ps(fscal,dy31);
727 tz = _mm256_mul_ps(fscal,dz31);
729 /* Update vectorial force */
730 fix3 = _mm256_add_ps(fix3,tx);
731 fiy3 = _mm256_add_ps(fiy3,ty);
732 fiz3 = _mm256_add_ps(fiz3,tz);
734 fjx1 = _mm256_add_ps(fjx1,tx);
735 fjy1 = _mm256_add_ps(fjy1,ty);
736 fjz1 = _mm256_add_ps(fjz1,tz);
740 /**************************
741 * CALCULATE INTERACTIONS *
742 **************************/
744 if (gmx_mm256_any_lt(rsq32,rcutoff2))
747 r32 = _mm256_mul_ps(rsq32,rinv32);
749 /* EWALD ELECTROSTATICS */
751 /* Analytical PME correction */
752 zeta2 = _mm256_mul_ps(beta2,rsq32);
753 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
754 pmecorrF = avx256_pmecorrF_f(zeta2);
755 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
756 felec = _mm256_mul_ps(qq32,felec);
757 pmecorrV = avx256_pmecorrV_f(zeta2);
758 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
759 velec = _mm256_sub_ps(rinv32,pmecorrV);
760 velec = _mm256_mul_ps(qq32,velec);
762 d = _mm256_sub_ps(r32,rswitch);
763 d = _mm256_max_ps(d,_mm256_setzero_ps());
764 d2 = _mm256_mul_ps(d,d);
765 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
767 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
769 /* Evaluate switch function */
770 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
771 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
772 velec = _mm256_mul_ps(velec,sw);
773 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
775 /* Update potential sum for this i atom from the interaction with this j atom. */
776 velec = _mm256_and_ps(velec,cutoff_mask);
777 velecsum = _mm256_add_ps(velecsum,velec);
781 fscal = _mm256_and_ps(fscal,cutoff_mask);
783 /* Calculate temporary vectorial force */
784 tx = _mm256_mul_ps(fscal,dx32);
785 ty = _mm256_mul_ps(fscal,dy32);
786 tz = _mm256_mul_ps(fscal,dz32);
788 /* Update vectorial force */
789 fix3 = _mm256_add_ps(fix3,tx);
790 fiy3 = _mm256_add_ps(fiy3,ty);
791 fiz3 = _mm256_add_ps(fiz3,tz);
793 fjx2 = _mm256_add_ps(fjx2,tx);
794 fjy2 = _mm256_add_ps(fjy2,ty);
795 fjz2 = _mm256_add_ps(fjz2,tz);
799 /**************************
800 * CALCULATE INTERACTIONS *
801 **************************/
803 if (gmx_mm256_any_lt(rsq33,rcutoff2))
806 r33 = _mm256_mul_ps(rsq33,rinv33);
808 /* EWALD ELECTROSTATICS */
810 /* Analytical PME correction */
811 zeta2 = _mm256_mul_ps(beta2,rsq33);
812 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
813 pmecorrF = avx256_pmecorrF_f(zeta2);
814 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
815 felec = _mm256_mul_ps(qq33,felec);
816 pmecorrV = avx256_pmecorrV_f(zeta2);
817 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
818 velec = _mm256_sub_ps(rinv33,pmecorrV);
819 velec = _mm256_mul_ps(qq33,velec);
821 d = _mm256_sub_ps(r33,rswitch);
822 d = _mm256_max_ps(d,_mm256_setzero_ps());
823 d2 = _mm256_mul_ps(d,d);
824 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
826 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
828 /* Evaluate switch function */
829 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
830 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
831 velec = _mm256_mul_ps(velec,sw);
832 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
834 /* Update potential sum for this i atom from the interaction with this j atom. */
835 velec = _mm256_and_ps(velec,cutoff_mask);
836 velecsum = _mm256_add_ps(velecsum,velec);
840 fscal = _mm256_and_ps(fscal,cutoff_mask);
842 /* Calculate temporary vectorial force */
843 tx = _mm256_mul_ps(fscal,dx33);
844 ty = _mm256_mul_ps(fscal,dy33);
845 tz = _mm256_mul_ps(fscal,dz33);
847 /* Update vectorial force */
848 fix3 = _mm256_add_ps(fix3,tx);
849 fiy3 = _mm256_add_ps(fiy3,ty);
850 fiz3 = _mm256_add_ps(fiz3,tz);
852 fjx3 = _mm256_add_ps(fjx3,tx);
853 fjy3 = _mm256_add_ps(fjy3,ty);
854 fjz3 = _mm256_add_ps(fjz3,tz);
858 fjptrA = f+j_coord_offsetA;
859 fjptrB = f+j_coord_offsetB;
860 fjptrC = f+j_coord_offsetC;
861 fjptrD = f+j_coord_offsetD;
862 fjptrE = f+j_coord_offsetE;
863 fjptrF = f+j_coord_offsetF;
864 fjptrG = f+j_coord_offsetG;
865 fjptrH = f+j_coord_offsetH;
867 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
868 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
869 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
871 /* Inner loop uses 972 flops */
877 /* Get j neighbor index, and coordinate index */
878 jnrlistA = jjnr[jidx];
879 jnrlistB = jjnr[jidx+1];
880 jnrlistC = jjnr[jidx+2];
881 jnrlistD = jjnr[jidx+3];
882 jnrlistE = jjnr[jidx+4];
883 jnrlistF = jjnr[jidx+5];
884 jnrlistG = jjnr[jidx+6];
885 jnrlistH = jjnr[jidx+7];
886 /* Sign of each element will be negative for non-real atoms.
887 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
888 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
890 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
891 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
893 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
894 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
895 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
896 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
897 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
898 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
899 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
900 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
901 j_coord_offsetA = DIM*jnrA;
902 j_coord_offsetB = DIM*jnrB;
903 j_coord_offsetC = DIM*jnrC;
904 j_coord_offsetD = DIM*jnrD;
905 j_coord_offsetE = DIM*jnrE;
906 j_coord_offsetF = DIM*jnrF;
907 j_coord_offsetG = DIM*jnrG;
908 j_coord_offsetH = DIM*jnrH;
910 /* load j atom coordinates */
911 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
912 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
913 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
914 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
915 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
917 /* Calculate displacement vector */
918 dx11 = _mm256_sub_ps(ix1,jx1);
919 dy11 = _mm256_sub_ps(iy1,jy1);
920 dz11 = _mm256_sub_ps(iz1,jz1);
921 dx12 = _mm256_sub_ps(ix1,jx2);
922 dy12 = _mm256_sub_ps(iy1,jy2);
923 dz12 = _mm256_sub_ps(iz1,jz2);
924 dx13 = _mm256_sub_ps(ix1,jx3);
925 dy13 = _mm256_sub_ps(iy1,jy3);
926 dz13 = _mm256_sub_ps(iz1,jz3);
927 dx21 = _mm256_sub_ps(ix2,jx1);
928 dy21 = _mm256_sub_ps(iy2,jy1);
929 dz21 = _mm256_sub_ps(iz2,jz1);
930 dx22 = _mm256_sub_ps(ix2,jx2);
931 dy22 = _mm256_sub_ps(iy2,jy2);
932 dz22 = _mm256_sub_ps(iz2,jz2);
933 dx23 = _mm256_sub_ps(ix2,jx3);
934 dy23 = _mm256_sub_ps(iy2,jy3);
935 dz23 = _mm256_sub_ps(iz2,jz3);
936 dx31 = _mm256_sub_ps(ix3,jx1);
937 dy31 = _mm256_sub_ps(iy3,jy1);
938 dz31 = _mm256_sub_ps(iz3,jz1);
939 dx32 = _mm256_sub_ps(ix3,jx2);
940 dy32 = _mm256_sub_ps(iy3,jy2);
941 dz32 = _mm256_sub_ps(iz3,jz2);
942 dx33 = _mm256_sub_ps(ix3,jx3);
943 dy33 = _mm256_sub_ps(iy3,jy3);
944 dz33 = _mm256_sub_ps(iz3,jz3);
946 /* Calculate squared distance and things based on it */
947 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
948 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
949 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
950 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
951 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
952 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
953 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
954 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
955 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
957 rinv11 = avx256_invsqrt_f(rsq11);
958 rinv12 = avx256_invsqrt_f(rsq12);
959 rinv13 = avx256_invsqrt_f(rsq13);
960 rinv21 = avx256_invsqrt_f(rsq21);
961 rinv22 = avx256_invsqrt_f(rsq22);
962 rinv23 = avx256_invsqrt_f(rsq23);
963 rinv31 = avx256_invsqrt_f(rsq31);
964 rinv32 = avx256_invsqrt_f(rsq32);
965 rinv33 = avx256_invsqrt_f(rsq33);
967 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
968 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
969 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
970 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
971 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
972 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
973 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
974 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
975 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
977 fjx1 = _mm256_setzero_ps();
978 fjy1 = _mm256_setzero_ps();
979 fjz1 = _mm256_setzero_ps();
980 fjx2 = _mm256_setzero_ps();
981 fjy2 = _mm256_setzero_ps();
982 fjz2 = _mm256_setzero_ps();
983 fjx3 = _mm256_setzero_ps();
984 fjy3 = _mm256_setzero_ps();
985 fjz3 = _mm256_setzero_ps();
987 /**************************
988 * CALCULATE INTERACTIONS *
989 **************************/
991 if (gmx_mm256_any_lt(rsq11,rcutoff2))
994 r11 = _mm256_mul_ps(rsq11,rinv11);
995 r11 = _mm256_andnot_ps(dummy_mask,r11);
997 /* EWALD ELECTROSTATICS */
999 /* Analytical PME correction */
1000 zeta2 = _mm256_mul_ps(beta2,rsq11);
1001 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1002 pmecorrF = avx256_pmecorrF_f(zeta2);
1003 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1004 felec = _mm256_mul_ps(qq11,felec);
1005 pmecorrV = avx256_pmecorrV_f(zeta2);
1006 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1007 velec = _mm256_sub_ps(rinv11,pmecorrV);
1008 velec = _mm256_mul_ps(qq11,velec);
1010 d = _mm256_sub_ps(r11,rswitch);
1011 d = _mm256_max_ps(d,_mm256_setzero_ps());
1012 d2 = _mm256_mul_ps(d,d);
1013 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1015 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1017 /* Evaluate switch function */
1018 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1019 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1020 velec = _mm256_mul_ps(velec,sw);
1021 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1023 /* Update potential sum for this i atom from the interaction with this j atom. */
1024 velec = _mm256_and_ps(velec,cutoff_mask);
1025 velec = _mm256_andnot_ps(dummy_mask,velec);
1026 velecsum = _mm256_add_ps(velecsum,velec);
1030 fscal = _mm256_and_ps(fscal,cutoff_mask);
1032 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1034 /* Calculate temporary vectorial force */
1035 tx = _mm256_mul_ps(fscal,dx11);
1036 ty = _mm256_mul_ps(fscal,dy11);
1037 tz = _mm256_mul_ps(fscal,dz11);
1039 /* Update vectorial force */
1040 fix1 = _mm256_add_ps(fix1,tx);
1041 fiy1 = _mm256_add_ps(fiy1,ty);
1042 fiz1 = _mm256_add_ps(fiz1,tz);
1044 fjx1 = _mm256_add_ps(fjx1,tx);
1045 fjy1 = _mm256_add_ps(fjy1,ty);
1046 fjz1 = _mm256_add_ps(fjz1,tz);
1050 /**************************
1051 * CALCULATE INTERACTIONS *
1052 **************************/
1054 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1057 r12 = _mm256_mul_ps(rsq12,rinv12);
1058 r12 = _mm256_andnot_ps(dummy_mask,r12);
1060 /* EWALD ELECTROSTATICS */
1062 /* Analytical PME correction */
1063 zeta2 = _mm256_mul_ps(beta2,rsq12);
1064 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1065 pmecorrF = avx256_pmecorrF_f(zeta2);
1066 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1067 felec = _mm256_mul_ps(qq12,felec);
1068 pmecorrV = avx256_pmecorrV_f(zeta2);
1069 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1070 velec = _mm256_sub_ps(rinv12,pmecorrV);
1071 velec = _mm256_mul_ps(qq12,velec);
1073 d = _mm256_sub_ps(r12,rswitch);
1074 d = _mm256_max_ps(d,_mm256_setzero_ps());
1075 d2 = _mm256_mul_ps(d,d);
1076 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1078 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1080 /* Evaluate switch function */
1081 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1082 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1083 velec = _mm256_mul_ps(velec,sw);
1084 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1086 /* Update potential sum for this i atom from the interaction with this j atom. */
1087 velec = _mm256_and_ps(velec,cutoff_mask);
1088 velec = _mm256_andnot_ps(dummy_mask,velec);
1089 velecsum = _mm256_add_ps(velecsum,velec);
1093 fscal = _mm256_and_ps(fscal,cutoff_mask);
1095 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1097 /* Calculate temporary vectorial force */
1098 tx = _mm256_mul_ps(fscal,dx12);
1099 ty = _mm256_mul_ps(fscal,dy12);
1100 tz = _mm256_mul_ps(fscal,dz12);
1102 /* Update vectorial force */
1103 fix1 = _mm256_add_ps(fix1,tx);
1104 fiy1 = _mm256_add_ps(fiy1,ty);
1105 fiz1 = _mm256_add_ps(fiz1,tz);
1107 fjx2 = _mm256_add_ps(fjx2,tx);
1108 fjy2 = _mm256_add_ps(fjy2,ty);
1109 fjz2 = _mm256_add_ps(fjz2,tz);
1113 /**************************
1114 * CALCULATE INTERACTIONS *
1115 **************************/
1117 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1120 r13 = _mm256_mul_ps(rsq13,rinv13);
1121 r13 = _mm256_andnot_ps(dummy_mask,r13);
1123 /* EWALD ELECTROSTATICS */
1125 /* Analytical PME correction */
1126 zeta2 = _mm256_mul_ps(beta2,rsq13);
1127 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1128 pmecorrF = avx256_pmecorrF_f(zeta2);
1129 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1130 felec = _mm256_mul_ps(qq13,felec);
1131 pmecorrV = avx256_pmecorrV_f(zeta2);
1132 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1133 velec = _mm256_sub_ps(rinv13,pmecorrV);
1134 velec = _mm256_mul_ps(qq13,velec);
1136 d = _mm256_sub_ps(r13,rswitch);
1137 d = _mm256_max_ps(d,_mm256_setzero_ps());
1138 d2 = _mm256_mul_ps(d,d);
1139 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1141 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1143 /* Evaluate switch function */
1144 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1145 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
1146 velec = _mm256_mul_ps(velec,sw);
1147 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1149 /* Update potential sum for this i atom from the interaction with this j atom. */
1150 velec = _mm256_and_ps(velec,cutoff_mask);
1151 velec = _mm256_andnot_ps(dummy_mask,velec);
1152 velecsum = _mm256_add_ps(velecsum,velec);
1156 fscal = _mm256_and_ps(fscal,cutoff_mask);
1158 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1160 /* Calculate temporary vectorial force */
1161 tx = _mm256_mul_ps(fscal,dx13);
1162 ty = _mm256_mul_ps(fscal,dy13);
1163 tz = _mm256_mul_ps(fscal,dz13);
1165 /* Update vectorial force */
1166 fix1 = _mm256_add_ps(fix1,tx);
1167 fiy1 = _mm256_add_ps(fiy1,ty);
1168 fiz1 = _mm256_add_ps(fiz1,tz);
1170 fjx3 = _mm256_add_ps(fjx3,tx);
1171 fjy3 = _mm256_add_ps(fjy3,ty);
1172 fjz3 = _mm256_add_ps(fjz3,tz);
1176 /**************************
1177 * CALCULATE INTERACTIONS *
1178 **************************/
1180 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1183 r21 = _mm256_mul_ps(rsq21,rinv21);
1184 r21 = _mm256_andnot_ps(dummy_mask,r21);
1186 /* EWALD ELECTROSTATICS */
1188 /* Analytical PME correction */
1189 zeta2 = _mm256_mul_ps(beta2,rsq21);
1190 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1191 pmecorrF = avx256_pmecorrF_f(zeta2);
1192 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1193 felec = _mm256_mul_ps(qq21,felec);
1194 pmecorrV = avx256_pmecorrV_f(zeta2);
1195 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1196 velec = _mm256_sub_ps(rinv21,pmecorrV);
1197 velec = _mm256_mul_ps(qq21,velec);
1199 d = _mm256_sub_ps(r21,rswitch);
1200 d = _mm256_max_ps(d,_mm256_setzero_ps());
1201 d2 = _mm256_mul_ps(d,d);
1202 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1204 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1206 /* Evaluate switch function */
1207 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1208 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1209 velec = _mm256_mul_ps(velec,sw);
1210 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1212 /* Update potential sum for this i atom from the interaction with this j atom. */
1213 velec = _mm256_and_ps(velec,cutoff_mask);
1214 velec = _mm256_andnot_ps(dummy_mask,velec);
1215 velecsum = _mm256_add_ps(velecsum,velec);
1219 fscal = _mm256_and_ps(fscal,cutoff_mask);
1221 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1223 /* Calculate temporary vectorial force */
1224 tx = _mm256_mul_ps(fscal,dx21);
1225 ty = _mm256_mul_ps(fscal,dy21);
1226 tz = _mm256_mul_ps(fscal,dz21);
1228 /* Update vectorial force */
1229 fix2 = _mm256_add_ps(fix2,tx);
1230 fiy2 = _mm256_add_ps(fiy2,ty);
1231 fiz2 = _mm256_add_ps(fiz2,tz);
1233 fjx1 = _mm256_add_ps(fjx1,tx);
1234 fjy1 = _mm256_add_ps(fjy1,ty);
1235 fjz1 = _mm256_add_ps(fjz1,tz);
1239 /**************************
1240 * CALCULATE INTERACTIONS *
1241 **************************/
1243 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1246 r22 = _mm256_mul_ps(rsq22,rinv22);
1247 r22 = _mm256_andnot_ps(dummy_mask,r22);
1249 /* EWALD ELECTROSTATICS */
1251 /* Analytical PME correction */
1252 zeta2 = _mm256_mul_ps(beta2,rsq22);
1253 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1254 pmecorrF = avx256_pmecorrF_f(zeta2);
1255 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1256 felec = _mm256_mul_ps(qq22,felec);
1257 pmecorrV = avx256_pmecorrV_f(zeta2);
1258 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1259 velec = _mm256_sub_ps(rinv22,pmecorrV);
1260 velec = _mm256_mul_ps(qq22,velec);
1262 d = _mm256_sub_ps(r22,rswitch);
1263 d = _mm256_max_ps(d,_mm256_setzero_ps());
1264 d2 = _mm256_mul_ps(d,d);
1265 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1267 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1269 /* Evaluate switch function */
1270 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1271 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1272 velec = _mm256_mul_ps(velec,sw);
1273 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1275 /* Update potential sum for this i atom from the interaction with this j atom. */
1276 velec = _mm256_and_ps(velec,cutoff_mask);
1277 velec = _mm256_andnot_ps(dummy_mask,velec);
1278 velecsum = _mm256_add_ps(velecsum,velec);
1282 fscal = _mm256_and_ps(fscal,cutoff_mask);
1284 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1286 /* Calculate temporary vectorial force */
1287 tx = _mm256_mul_ps(fscal,dx22);
1288 ty = _mm256_mul_ps(fscal,dy22);
1289 tz = _mm256_mul_ps(fscal,dz22);
1291 /* Update vectorial force */
1292 fix2 = _mm256_add_ps(fix2,tx);
1293 fiy2 = _mm256_add_ps(fiy2,ty);
1294 fiz2 = _mm256_add_ps(fiz2,tz);
1296 fjx2 = _mm256_add_ps(fjx2,tx);
1297 fjy2 = _mm256_add_ps(fjy2,ty);
1298 fjz2 = _mm256_add_ps(fjz2,tz);
1302 /**************************
1303 * CALCULATE INTERACTIONS *
1304 **************************/
1306 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1309 r23 = _mm256_mul_ps(rsq23,rinv23);
1310 r23 = _mm256_andnot_ps(dummy_mask,r23);
1312 /* EWALD ELECTROSTATICS */
1314 /* Analytical PME correction */
1315 zeta2 = _mm256_mul_ps(beta2,rsq23);
1316 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1317 pmecorrF = avx256_pmecorrF_f(zeta2);
1318 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1319 felec = _mm256_mul_ps(qq23,felec);
1320 pmecorrV = avx256_pmecorrV_f(zeta2);
1321 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1322 velec = _mm256_sub_ps(rinv23,pmecorrV);
1323 velec = _mm256_mul_ps(qq23,velec);
1325 d = _mm256_sub_ps(r23,rswitch);
1326 d = _mm256_max_ps(d,_mm256_setzero_ps());
1327 d2 = _mm256_mul_ps(d,d);
1328 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1330 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1332 /* Evaluate switch function */
1333 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1334 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
1335 velec = _mm256_mul_ps(velec,sw);
1336 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1338 /* Update potential sum for this i atom from the interaction with this j atom. */
1339 velec = _mm256_and_ps(velec,cutoff_mask);
1340 velec = _mm256_andnot_ps(dummy_mask,velec);
1341 velecsum = _mm256_add_ps(velecsum,velec);
1345 fscal = _mm256_and_ps(fscal,cutoff_mask);
1347 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1349 /* Calculate temporary vectorial force */
1350 tx = _mm256_mul_ps(fscal,dx23);
1351 ty = _mm256_mul_ps(fscal,dy23);
1352 tz = _mm256_mul_ps(fscal,dz23);
1354 /* Update vectorial force */
1355 fix2 = _mm256_add_ps(fix2,tx);
1356 fiy2 = _mm256_add_ps(fiy2,ty);
1357 fiz2 = _mm256_add_ps(fiz2,tz);
1359 fjx3 = _mm256_add_ps(fjx3,tx);
1360 fjy3 = _mm256_add_ps(fjy3,ty);
1361 fjz3 = _mm256_add_ps(fjz3,tz);
1365 /**************************
1366 * CALCULATE INTERACTIONS *
1367 **************************/
1369 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1372 r31 = _mm256_mul_ps(rsq31,rinv31);
1373 r31 = _mm256_andnot_ps(dummy_mask,r31);
1375 /* EWALD ELECTROSTATICS */
1377 /* Analytical PME correction */
1378 zeta2 = _mm256_mul_ps(beta2,rsq31);
1379 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1380 pmecorrF = avx256_pmecorrF_f(zeta2);
1381 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1382 felec = _mm256_mul_ps(qq31,felec);
1383 pmecorrV = avx256_pmecorrV_f(zeta2);
1384 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1385 velec = _mm256_sub_ps(rinv31,pmecorrV);
1386 velec = _mm256_mul_ps(qq31,velec);
1388 d = _mm256_sub_ps(r31,rswitch);
1389 d = _mm256_max_ps(d,_mm256_setzero_ps());
1390 d2 = _mm256_mul_ps(d,d);
1391 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1393 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1395 /* Evaluate switch function */
1396 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1397 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
1398 velec = _mm256_mul_ps(velec,sw);
1399 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1401 /* Update potential sum for this i atom from the interaction with this j atom. */
1402 velec = _mm256_and_ps(velec,cutoff_mask);
1403 velec = _mm256_andnot_ps(dummy_mask,velec);
1404 velecsum = _mm256_add_ps(velecsum,velec);
1408 fscal = _mm256_and_ps(fscal,cutoff_mask);
1410 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1412 /* Calculate temporary vectorial force */
1413 tx = _mm256_mul_ps(fscal,dx31);
1414 ty = _mm256_mul_ps(fscal,dy31);
1415 tz = _mm256_mul_ps(fscal,dz31);
1417 /* Update vectorial force */
1418 fix3 = _mm256_add_ps(fix3,tx);
1419 fiy3 = _mm256_add_ps(fiy3,ty);
1420 fiz3 = _mm256_add_ps(fiz3,tz);
1422 fjx1 = _mm256_add_ps(fjx1,tx);
1423 fjy1 = _mm256_add_ps(fjy1,ty);
1424 fjz1 = _mm256_add_ps(fjz1,tz);
1428 /**************************
1429 * CALCULATE INTERACTIONS *
1430 **************************/
1432 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1435 r32 = _mm256_mul_ps(rsq32,rinv32);
1436 r32 = _mm256_andnot_ps(dummy_mask,r32);
1438 /* EWALD ELECTROSTATICS */
1440 /* Analytical PME correction */
1441 zeta2 = _mm256_mul_ps(beta2,rsq32);
1442 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1443 pmecorrF = avx256_pmecorrF_f(zeta2);
1444 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1445 felec = _mm256_mul_ps(qq32,felec);
1446 pmecorrV = avx256_pmecorrV_f(zeta2);
1447 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1448 velec = _mm256_sub_ps(rinv32,pmecorrV);
1449 velec = _mm256_mul_ps(qq32,velec);
1451 d = _mm256_sub_ps(r32,rswitch);
1452 d = _mm256_max_ps(d,_mm256_setzero_ps());
1453 d2 = _mm256_mul_ps(d,d);
1454 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1456 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1458 /* Evaluate switch function */
1459 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1460 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
1461 velec = _mm256_mul_ps(velec,sw);
1462 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1464 /* Update potential sum for this i atom from the interaction with this j atom. */
1465 velec = _mm256_and_ps(velec,cutoff_mask);
1466 velec = _mm256_andnot_ps(dummy_mask,velec);
1467 velecsum = _mm256_add_ps(velecsum,velec);
1471 fscal = _mm256_and_ps(fscal,cutoff_mask);
1473 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1475 /* Calculate temporary vectorial force */
1476 tx = _mm256_mul_ps(fscal,dx32);
1477 ty = _mm256_mul_ps(fscal,dy32);
1478 tz = _mm256_mul_ps(fscal,dz32);
1480 /* Update vectorial force */
1481 fix3 = _mm256_add_ps(fix3,tx);
1482 fiy3 = _mm256_add_ps(fiy3,ty);
1483 fiz3 = _mm256_add_ps(fiz3,tz);
1485 fjx2 = _mm256_add_ps(fjx2,tx);
1486 fjy2 = _mm256_add_ps(fjy2,ty);
1487 fjz2 = _mm256_add_ps(fjz2,tz);
1491 /**************************
1492 * CALCULATE INTERACTIONS *
1493 **************************/
1495 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1498 r33 = _mm256_mul_ps(rsq33,rinv33);
1499 r33 = _mm256_andnot_ps(dummy_mask,r33);
1501 /* EWALD ELECTROSTATICS */
1503 /* Analytical PME correction */
1504 zeta2 = _mm256_mul_ps(beta2,rsq33);
1505 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1506 pmecorrF = avx256_pmecorrF_f(zeta2);
1507 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1508 felec = _mm256_mul_ps(qq33,felec);
1509 pmecorrV = avx256_pmecorrV_f(zeta2);
1510 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1511 velec = _mm256_sub_ps(rinv33,pmecorrV);
1512 velec = _mm256_mul_ps(qq33,velec);
1514 d = _mm256_sub_ps(r33,rswitch);
1515 d = _mm256_max_ps(d,_mm256_setzero_ps());
1516 d2 = _mm256_mul_ps(d,d);
1517 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1519 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1521 /* Evaluate switch function */
1522 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1523 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
1524 velec = _mm256_mul_ps(velec,sw);
1525 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1527 /* Update potential sum for this i atom from the interaction with this j atom. */
1528 velec = _mm256_and_ps(velec,cutoff_mask);
1529 velec = _mm256_andnot_ps(dummy_mask,velec);
1530 velecsum = _mm256_add_ps(velecsum,velec);
1534 fscal = _mm256_and_ps(fscal,cutoff_mask);
1536 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1538 /* Calculate temporary vectorial force */
1539 tx = _mm256_mul_ps(fscal,dx33);
1540 ty = _mm256_mul_ps(fscal,dy33);
1541 tz = _mm256_mul_ps(fscal,dz33);
1543 /* Update vectorial force */
1544 fix3 = _mm256_add_ps(fix3,tx);
1545 fiy3 = _mm256_add_ps(fiy3,ty);
1546 fiz3 = _mm256_add_ps(fiz3,tz);
1548 fjx3 = _mm256_add_ps(fjx3,tx);
1549 fjy3 = _mm256_add_ps(fjy3,ty);
1550 fjz3 = _mm256_add_ps(fjz3,tz);
1554 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1555 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1556 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1557 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1558 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1559 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1560 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1561 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1563 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1564 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1565 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1567 /* Inner loop uses 981 flops */
1570 /* End of innermost loop */
1572 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1573 f+i_coord_offset+DIM,fshift+i_shift_offset);
1576 /* Update potential energies */
1577 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1579 /* Increment number of inner iterations */
1580 inneriter += j_index_end - j_index_start;
1582 /* Outer loop uses 19 flops */
1585 /* Increment number of outer iterations */
1588 /* Update outer/inner flops */
1590 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*981);
1593 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1594 * Electrostatics interaction: Ewald
1595 * VdW interaction: None
1596 * Geometry: Water4-Water4
1597 * Calculate force/pot: Force
1600 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1601 (t_nblist * gmx_restrict nlist,
1602 rvec * gmx_restrict xx,
1603 rvec * gmx_restrict ff,
1604 struct t_forcerec * gmx_restrict fr,
1605 t_mdatoms * gmx_restrict mdatoms,
1606 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1607 t_nrnb * gmx_restrict nrnb)
1609 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1610 * just 0 for non-waters.
1611 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1612 * jnr indices corresponding to data put in the four positions in the SIMD register.
1614 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1615 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1616 int jnrA,jnrB,jnrC,jnrD;
1617 int jnrE,jnrF,jnrG,jnrH;
1618 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1619 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1620 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1621 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1622 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1623 real rcutoff_scalar;
1624 real *shiftvec,*fshift,*x,*f;
1625 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1626 real scratch[4*DIM];
1627 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1628 real * vdwioffsetptr1;
1629 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1630 real * vdwioffsetptr2;
1631 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1632 real * vdwioffsetptr3;
1633 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1634 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1635 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1636 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1637 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1638 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1639 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1640 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1641 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1642 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1643 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1644 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1645 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1646 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1647 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1648 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1649 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1652 __m128i ewitab_lo,ewitab_hi;
1653 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1654 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1656 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1657 real rswitch_scalar,d_scalar;
1658 __m256 dummy_mask,cutoff_mask;
1659 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1660 __m256 one = _mm256_set1_ps(1.0);
1661 __m256 two = _mm256_set1_ps(2.0);
1667 jindex = nlist->jindex;
1669 shiftidx = nlist->shift;
1671 shiftvec = fr->shift_vec[0];
1672 fshift = fr->fshift[0];
1673 facel = _mm256_set1_ps(fr->ic->epsfac);
1674 charge = mdatoms->chargeA;
1676 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1677 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1678 beta2 = _mm256_mul_ps(beta,beta);
1679 beta3 = _mm256_mul_ps(beta,beta2);
1681 ewtab = fr->ic->tabq_coul_FDV0;
1682 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1683 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1685 /* Setup water-specific parameters */
1686 inr = nlist->iinr[0];
1687 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1688 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1689 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1691 jq1 = _mm256_set1_ps(charge[inr+1]);
1692 jq2 = _mm256_set1_ps(charge[inr+2]);
1693 jq3 = _mm256_set1_ps(charge[inr+3]);
1694 qq11 = _mm256_mul_ps(iq1,jq1);
1695 qq12 = _mm256_mul_ps(iq1,jq2);
1696 qq13 = _mm256_mul_ps(iq1,jq3);
1697 qq21 = _mm256_mul_ps(iq2,jq1);
1698 qq22 = _mm256_mul_ps(iq2,jq2);
1699 qq23 = _mm256_mul_ps(iq2,jq3);
1700 qq31 = _mm256_mul_ps(iq3,jq1);
1701 qq32 = _mm256_mul_ps(iq3,jq2);
1702 qq33 = _mm256_mul_ps(iq3,jq3);
1704 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1705 rcutoff_scalar = fr->ic->rcoulomb;
1706 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1707 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1709 rswitch_scalar = fr->ic->rcoulomb_switch;
1710 rswitch = _mm256_set1_ps(rswitch_scalar);
1711 /* Setup switch parameters */
1712 d_scalar = rcutoff_scalar-rswitch_scalar;
1713 d = _mm256_set1_ps(d_scalar);
1714 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1715 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1716 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1717 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1718 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1719 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1721 /* Avoid stupid compiler warnings */
1722 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1723 j_coord_offsetA = 0;
1724 j_coord_offsetB = 0;
1725 j_coord_offsetC = 0;
1726 j_coord_offsetD = 0;
1727 j_coord_offsetE = 0;
1728 j_coord_offsetF = 0;
1729 j_coord_offsetG = 0;
1730 j_coord_offsetH = 0;
1735 for(iidx=0;iidx<4*DIM;iidx++)
1737 scratch[iidx] = 0.0;
1740 /* Start outer loop over neighborlists */
1741 for(iidx=0; iidx<nri; iidx++)
1743 /* Load shift vector for this list */
1744 i_shift_offset = DIM*shiftidx[iidx];
1746 /* Load limits for loop over neighbors */
1747 j_index_start = jindex[iidx];
1748 j_index_end = jindex[iidx+1];
1750 /* Get outer coordinate index */
1752 i_coord_offset = DIM*inr;
1754 /* Load i particle coords and add shift vector */
1755 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1756 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1758 fix1 = _mm256_setzero_ps();
1759 fiy1 = _mm256_setzero_ps();
1760 fiz1 = _mm256_setzero_ps();
1761 fix2 = _mm256_setzero_ps();
1762 fiy2 = _mm256_setzero_ps();
1763 fiz2 = _mm256_setzero_ps();
1764 fix3 = _mm256_setzero_ps();
1765 fiy3 = _mm256_setzero_ps();
1766 fiz3 = _mm256_setzero_ps();
1768 /* Start inner kernel loop */
1769 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1772 /* Get j neighbor index, and coordinate index */
1774 jnrB = jjnr[jidx+1];
1775 jnrC = jjnr[jidx+2];
1776 jnrD = jjnr[jidx+3];
1777 jnrE = jjnr[jidx+4];
1778 jnrF = jjnr[jidx+5];
1779 jnrG = jjnr[jidx+6];
1780 jnrH = jjnr[jidx+7];
1781 j_coord_offsetA = DIM*jnrA;
1782 j_coord_offsetB = DIM*jnrB;
1783 j_coord_offsetC = DIM*jnrC;
1784 j_coord_offsetD = DIM*jnrD;
1785 j_coord_offsetE = DIM*jnrE;
1786 j_coord_offsetF = DIM*jnrF;
1787 j_coord_offsetG = DIM*jnrG;
1788 j_coord_offsetH = DIM*jnrH;
1790 /* load j atom coordinates */
1791 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1792 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1793 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1794 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1795 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1797 /* Calculate displacement vector */
1798 dx11 = _mm256_sub_ps(ix1,jx1);
1799 dy11 = _mm256_sub_ps(iy1,jy1);
1800 dz11 = _mm256_sub_ps(iz1,jz1);
1801 dx12 = _mm256_sub_ps(ix1,jx2);
1802 dy12 = _mm256_sub_ps(iy1,jy2);
1803 dz12 = _mm256_sub_ps(iz1,jz2);
1804 dx13 = _mm256_sub_ps(ix1,jx3);
1805 dy13 = _mm256_sub_ps(iy1,jy3);
1806 dz13 = _mm256_sub_ps(iz1,jz3);
1807 dx21 = _mm256_sub_ps(ix2,jx1);
1808 dy21 = _mm256_sub_ps(iy2,jy1);
1809 dz21 = _mm256_sub_ps(iz2,jz1);
1810 dx22 = _mm256_sub_ps(ix2,jx2);
1811 dy22 = _mm256_sub_ps(iy2,jy2);
1812 dz22 = _mm256_sub_ps(iz2,jz2);
1813 dx23 = _mm256_sub_ps(ix2,jx3);
1814 dy23 = _mm256_sub_ps(iy2,jy3);
1815 dz23 = _mm256_sub_ps(iz2,jz3);
1816 dx31 = _mm256_sub_ps(ix3,jx1);
1817 dy31 = _mm256_sub_ps(iy3,jy1);
1818 dz31 = _mm256_sub_ps(iz3,jz1);
1819 dx32 = _mm256_sub_ps(ix3,jx2);
1820 dy32 = _mm256_sub_ps(iy3,jy2);
1821 dz32 = _mm256_sub_ps(iz3,jz2);
1822 dx33 = _mm256_sub_ps(ix3,jx3);
1823 dy33 = _mm256_sub_ps(iy3,jy3);
1824 dz33 = _mm256_sub_ps(iz3,jz3);
1826 /* Calculate squared distance and things based on it */
1827 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1828 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1829 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1830 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1831 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1832 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1833 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1834 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1835 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1837 rinv11 = avx256_invsqrt_f(rsq11);
1838 rinv12 = avx256_invsqrt_f(rsq12);
1839 rinv13 = avx256_invsqrt_f(rsq13);
1840 rinv21 = avx256_invsqrt_f(rsq21);
1841 rinv22 = avx256_invsqrt_f(rsq22);
1842 rinv23 = avx256_invsqrt_f(rsq23);
1843 rinv31 = avx256_invsqrt_f(rsq31);
1844 rinv32 = avx256_invsqrt_f(rsq32);
1845 rinv33 = avx256_invsqrt_f(rsq33);
1847 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1848 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1849 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1850 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1851 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1852 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1853 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1854 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1855 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1857 fjx1 = _mm256_setzero_ps();
1858 fjy1 = _mm256_setzero_ps();
1859 fjz1 = _mm256_setzero_ps();
1860 fjx2 = _mm256_setzero_ps();
1861 fjy2 = _mm256_setzero_ps();
1862 fjz2 = _mm256_setzero_ps();
1863 fjx3 = _mm256_setzero_ps();
1864 fjy3 = _mm256_setzero_ps();
1865 fjz3 = _mm256_setzero_ps();
1867 /**************************
1868 * CALCULATE INTERACTIONS *
1869 **************************/
1871 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1874 r11 = _mm256_mul_ps(rsq11,rinv11);
1876 /* EWALD ELECTROSTATICS */
1878 /* Analytical PME correction */
1879 zeta2 = _mm256_mul_ps(beta2,rsq11);
1880 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1881 pmecorrF = avx256_pmecorrF_f(zeta2);
1882 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1883 felec = _mm256_mul_ps(qq11,felec);
1884 pmecorrV = avx256_pmecorrV_f(zeta2);
1885 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1886 velec = _mm256_sub_ps(rinv11,pmecorrV);
1887 velec = _mm256_mul_ps(qq11,velec);
1889 d = _mm256_sub_ps(r11,rswitch);
1890 d = _mm256_max_ps(d,_mm256_setzero_ps());
1891 d2 = _mm256_mul_ps(d,d);
1892 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1894 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1896 /* Evaluate switch function */
1897 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1898 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1899 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1903 fscal = _mm256_and_ps(fscal,cutoff_mask);
1905 /* Calculate temporary vectorial force */
1906 tx = _mm256_mul_ps(fscal,dx11);
1907 ty = _mm256_mul_ps(fscal,dy11);
1908 tz = _mm256_mul_ps(fscal,dz11);
1910 /* Update vectorial force */
1911 fix1 = _mm256_add_ps(fix1,tx);
1912 fiy1 = _mm256_add_ps(fiy1,ty);
1913 fiz1 = _mm256_add_ps(fiz1,tz);
1915 fjx1 = _mm256_add_ps(fjx1,tx);
1916 fjy1 = _mm256_add_ps(fjy1,ty);
1917 fjz1 = _mm256_add_ps(fjz1,tz);
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1925 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1928 r12 = _mm256_mul_ps(rsq12,rinv12);
1930 /* EWALD ELECTROSTATICS */
1932 /* Analytical PME correction */
1933 zeta2 = _mm256_mul_ps(beta2,rsq12);
1934 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1935 pmecorrF = avx256_pmecorrF_f(zeta2);
1936 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1937 felec = _mm256_mul_ps(qq12,felec);
1938 pmecorrV = avx256_pmecorrV_f(zeta2);
1939 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1940 velec = _mm256_sub_ps(rinv12,pmecorrV);
1941 velec = _mm256_mul_ps(qq12,velec);
1943 d = _mm256_sub_ps(r12,rswitch);
1944 d = _mm256_max_ps(d,_mm256_setzero_ps());
1945 d2 = _mm256_mul_ps(d,d);
1946 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1948 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1950 /* Evaluate switch function */
1951 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1952 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1953 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1957 fscal = _mm256_and_ps(fscal,cutoff_mask);
1959 /* Calculate temporary vectorial force */
1960 tx = _mm256_mul_ps(fscal,dx12);
1961 ty = _mm256_mul_ps(fscal,dy12);
1962 tz = _mm256_mul_ps(fscal,dz12);
1964 /* Update vectorial force */
1965 fix1 = _mm256_add_ps(fix1,tx);
1966 fiy1 = _mm256_add_ps(fiy1,ty);
1967 fiz1 = _mm256_add_ps(fiz1,tz);
1969 fjx2 = _mm256_add_ps(fjx2,tx);
1970 fjy2 = _mm256_add_ps(fjy2,ty);
1971 fjz2 = _mm256_add_ps(fjz2,tz);
1975 /**************************
1976 * CALCULATE INTERACTIONS *
1977 **************************/
1979 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1982 r13 = _mm256_mul_ps(rsq13,rinv13);
1984 /* EWALD ELECTROSTATICS */
1986 /* Analytical PME correction */
1987 zeta2 = _mm256_mul_ps(beta2,rsq13);
1988 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1989 pmecorrF = avx256_pmecorrF_f(zeta2);
1990 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1991 felec = _mm256_mul_ps(qq13,felec);
1992 pmecorrV = avx256_pmecorrV_f(zeta2);
1993 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1994 velec = _mm256_sub_ps(rinv13,pmecorrV);
1995 velec = _mm256_mul_ps(qq13,velec);
1997 d = _mm256_sub_ps(r13,rswitch);
1998 d = _mm256_max_ps(d,_mm256_setzero_ps());
1999 d2 = _mm256_mul_ps(d,d);
2000 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2002 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2004 /* Evaluate switch function */
2005 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2006 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2007 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2011 fscal = _mm256_and_ps(fscal,cutoff_mask);
2013 /* Calculate temporary vectorial force */
2014 tx = _mm256_mul_ps(fscal,dx13);
2015 ty = _mm256_mul_ps(fscal,dy13);
2016 tz = _mm256_mul_ps(fscal,dz13);
2018 /* Update vectorial force */
2019 fix1 = _mm256_add_ps(fix1,tx);
2020 fiy1 = _mm256_add_ps(fiy1,ty);
2021 fiz1 = _mm256_add_ps(fiz1,tz);
2023 fjx3 = _mm256_add_ps(fjx3,tx);
2024 fjy3 = _mm256_add_ps(fjy3,ty);
2025 fjz3 = _mm256_add_ps(fjz3,tz);
2029 /**************************
2030 * CALCULATE INTERACTIONS *
2031 **************************/
2033 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2036 r21 = _mm256_mul_ps(rsq21,rinv21);
2038 /* EWALD ELECTROSTATICS */
2040 /* Analytical PME correction */
2041 zeta2 = _mm256_mul_ps(beta2,rsq21);
2042 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2043 pmecorrF = avx256_pmecorrF_f(zeta2);
2044 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2045 felec = _mm256_mul_ps(qq21,felec);
2046 pmecorrV = avx256_pmecorrV_f(zeta2);
2047 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2048 velec = _mm256_sub_ps(rinv21,pmecorrV);
2049 velec = _mm256_mul_ps(qq21,velec);
2051 d = _mm256_sub_ps(r21,rswitch);
2052 d = _mm256_max_ps(d,_mm256_setzero_ps());
2053 d2 = _mm256_mul_ps(d,d);
2054 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2056 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2058 /* Evaluate switch function */
2059 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2060 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2061 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2065 fscal = _mm256_and_ps(fscal,cutoff_mask);
2067 /* Calculate temporary vectorial force */
2068 tx = _mm256_mul_ps(fscal,dx21);
2069 ty = _mm256_mul_ps(fscal,dy21);
2070 tz = _mm256_mul_ps(fscal,dz21);
2072 /* Update vectorial force */
2073 fix2 = _mm256_add_ps(fix2,tx);
2074 fiy2 = _mm256_add_ps(fiy2,ty);
2075 fiz2 = _mm256_add_ps(fiz2,tz);
2077 fjx1 = _mm256_add_ps(fjx1,tx);
2078 fjy1 = _mm256_add_ps(fjy1,ty);
2079 fjz1 = _mm256_add_ps(fjz1,tz);
2083 /**************************
2084 * CALCULATE INTERACTIONS *
2085 **************************/
2087 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2090 r22 = _mm256_mul_ps(rsq22,rinv22);
2092 /* EWALD ELECTROSTATICS */
2094 /* Analytical PME correction */
2095 zeta2 = _mm256_mul_ps(beta2,rsq22);
2096 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2097 pmecorrF = avx256_pmecorrF_f(zeta2);
2098 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2099 felec = _mm256_mul_ps(qq22,felec);
2100 pmecorrV = avx256_pmecorrV_f(zeta2);
2101 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2102 velec = _mm256_sub_ps(rinv22,pmecorrV);
2103 velec = _mm256_mul_ps(qq22,velec);
2105 d = _mm256_sub_ps(r22,rswitch);
2106 d = _mm256_max_ps(d,_mm256_setzero_ps());
2107 d2 = _mm256_mul_ps(d,d);
2108 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2110 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2112 /* Evaluate switch function */
2113 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2114 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2115 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2119 fscal = _mm256_and_ps(fscal,cutoff_mask);
2121 /* Calculate temporary vectorial force */
2122 tx = _mm256_mul_ps(fscal,dx22);
2123 ty = _mm256_mul_ps(fscal,dy22);
2124 tz = _mm256_mul_ps(fscal,dz22);
2126 /* Update vectorial force */
2127 fix2 = _mm256_add_ps(fix2,tx);
2128 fiy2 = _mm256_add_ps(fiy2,ty);
2129 fiz2 = _mm256_add_ps(fiz2,tz);
2131 fjx2 = _mm256_add_ps(fjx2,tx);
2132 fjy2 = _mm256_add_ps(fjy2,ty);
2133 fjz2 = _mm256_add_ps(fjz2,tz);
2137 /**************************
2138 * CALCULATE INTERACTIONS *
2139 **************************/
2141 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2144 r23 = _mm256_mul_ps(rsq23,rinv23);
2146 /* EWALD ELECTROSTATICS */
2148 /* Analytical PME correction */
2149 zeta2 = _mm256_mul_ps(beta2,rsq23);
2150 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2151 pmecorrF = avx256_pmecorrF_f(zeta2);
2152 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2153 felec = _mm256_mul_ps(qq23,felec);
2154 pmecorrV = avx256_pmecorrV_f(zeta2);
2155 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2156 velec = _mm256_sub_ps(rinv23,pmecorrV);
2157 velec = _mm256_mul_ps(qq23,velec);
2159 d = _mm256_sub_ps(r23,rswitch);
2160 d = _mm256_max_ps(d,_mm256_setzero_ps());
2161 d2 = _mm256_mul_ps(d,d);
2162 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2164 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2166 /* Evaluate switch function */
2167 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2168 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2169 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2173 fscal = _mm256_and_ps(fscal,cutoff_mask);
2175 /* Calculate temporary vectorial force */
2176 tx = _mm256_mul_ps(fscal,dx23);
2177 ty = _mm256_mul_ps(fscal,dy23);
2178 tz = _mm256_mul_ps(fscal,dz23);
2180 /* Update vectorial force */
2181 fix2 = _mm256_add_ps(fix2,tx);
2182 fiy2 = _mm256_add_ps(fiy2,ty);
2183 fiz2 = _mm256_add_ps(fiz2,tz);
2185 fjx3 = _mm256_add_ps(fjx3,tx);
2186 fjy3 = _mm256_add_ps(fjy3,ty);
2187 fjz3 = _mm256_add_ps(fjz3,tz);
2191 /**************************
2192 * CALCULATE INTERACTIONS *
2193 **************************/
2195 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2198 r31 = _mm256_mul_ps(rsq31,rinv31);
2200 /* EWALD ELECTROSTATICS */
2202 /* Analytical PME correction */
2203 zeta2 = _mm256_mul_ps(beta2,rsq31);
2204 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2205 pmecorrF = avx256_pmecorrF_f(zeta2);
2206 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2207 felec = _mm256_mul_ps(qq31,felec);
2208 pmecorrV = avx256_pmecorrV_f(zeta2);
2209 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2210 velec = _mm256_sub_ps(rinv31,pmecorrV);
2211 velec = _mm256_mul_ps(qq31,velec);
2213 d = _mm256_sub_ps(r31,rswitch);
2214 d = _mm256_max_ps(d,_mm256_setzero_ps());
2215 d2 = _mm256_mul_ps(d,d);
2216 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2218 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2220 /* Evaluate switch function */
2221 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2222 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2223 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2227 fscal = _mm256_and_ps(fscal,cutoff_mask);
2229 /* Calculate temporary vectorial force */
2230 tx = _mm256_mul_ps(fscal,dx31);
2231 ty = _mm256_mul_ps(fscal,dy31);
2232 tz = _mm256_mul_ps(fscal,dz31);
2234 /* Update vectorial force */
2235 fix3 = _mm256_add_ps(fix3,tx);
2236 fiy3 = _mm256_add_ps(fiy3,ty);
2237 fiz3 = _mm256_add_ps(fiz3,tz);
2239 fjx1 = _mm256_add_ps(fjx1,tx);
2240 fjy1 = _mm256_add_ps(fjy1,ty);
2241 fjz1 = _mm256_add_ps(fjz1,tz);
2245 /**************************
2246 * CALCULATE INTERACTIONS *
2247 **************************/
2249 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2252 r32 = _mm256_mul_ps(rsq32,rinv32);
2254 /* EWALD ELECTROSTATICS */
2256 /* Analytical PME correction */
2257 zeta2 = _mm256_mul_ps(beta2,rsq32);
2258 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2259 pmecorrF = avx256_pmecorrF_f(zeta2);
2260 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2261 felec = _mm256_mul_ps(qq32,felec);
2262 pmecorrV = avx256_pmecorrV_f(zeta2);
2263 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2264 velec = _mm256_sub_ps(rinv32,pmecorrV);
2265 velec = _mm256_mul_ps(qq32,velec);
2267 d = _mm256_sub_ps(r32,rswitch);
2268 d = _mm256_max_ps(d,_mm256_setzero_ps());
2269 d2 = _mm256_mul_ps(d,d);
2270 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2272 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2274 /* Evaluate switch function */
2275 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2276 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2277 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2281 fscal = _mm256_and_ps(fscal,cutoff_mask);
2283 /* Calculate temporary vectorial force */
2284 tx = _mm256_mul_ps(fscal,dx32);
2285 ty = _mm256_mul_ps(fscal,dy32);
2286 tz = _mm256_mul_ps(fscal,dz32);
2288 /* Update vectorial force */
2289 fix3 = _mm256_add_ps(fix3,tx);
2290 fiy3 = _mm256_add_ps(fiy3,ty);
2291 fiz3 = _mm256_add_ps(fiz3,tz);
2293 fjx2 = _mm256_add_ps(fjx2,tx);
2294 fjy2 = _mm256_add_ps(fjy2,ty);
2295 fjz2 = _mm256_add_ps(fjz2,tz);
2299 /**************************
2300 * CALCULATE INTERACTIONS *
2301 **************************/
2303 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2306 r33 = _mm256_mul_ps(rsq33,rinv33);
2308 /* EWALD ELECTROSTATICS */
2310 /* Analytical PME correction */
2311 zeta2 = _mm256_mul_ps(beta2,rsq33);
2312 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2313 pmecorrF = avx256_pmecorrF_f(zeta2);
2314 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2315 felec = _mm256_mul_ps(qq33,felec);
2316 pmecorrV = avx256_pmecorrV_f(zeta2);
2317 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2318 velec = _mm256_sub_ps(rinv33,pmecorrV);
2319 velec = _mm256_mul_ps(qq33,velec);
2321 d = _mm256_sub_ps(r33,rswitch);
2322 d = _mm256_max_ps(d,_mm256_setzero_ps());
2323 d2 = _mm256_mul_ps(d,d);
2324 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2326 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2328 /* Evaluate switch function */
2329 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2330 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2331 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2335 fscal = _mm256_and_ps(fscal,cutoff_mask);
2337 /* Calculate temporary vectorial force */
2338 tx = _mm256_mul_ps(fscal,dx33);
2339 ty = _mm256_mul_ps(fscal,dy33);
2340 tz = _mm256_mul_ps(fscal,dz33);
2342 /* Update vectorial force */
2343 fix3 = _mm256_add_ps(fix3,tx);
2344 fiy3 = _mm256_add_ps(fiy3,ty);
2345 fiz3 = _mm256_add_ps(fiz3,tz);
2347 fjx3 = _mm256_add_ps(fjx3,tx);
2348 fjy3 = _mm256_add_ps(fjy3,ty);
2349 fjz3 = _mm256_add_ps(fjz3,tz);
2353 fjptrA = f+j_coord_offsetA;
2354 fjptrB = f+j_coord_offsetB;
2355 fjptrC = f+j_coord_offsetC;
2356 fjptrD = f+j_coord_offsetD;
2357 fjptrE = f+j_coord_offsetE;
2358 fjptrF = f+j_coord_offsetF;
2359 fjptrG = f+j_coord_offsetG;
2360 fjptrH = f+j_coord_offsetH;
2362 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2363 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2364 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2366 /* Inner loop uses 945 flops */
2369 if(jidx<j_index_end)
2372 /* Get j neighbor index, and coordinate index */
2373 jnrlistA = jjnr[jidx];
2374 jnrlistB = jjnr[jidx+1];
2375 jnrlistC = jjnr[jidx+2];
2376 jnrlistD = jjnr[jidx+3];
2377 jnrlistE = jjnr[jidx+4];
2378 jnrlistF = jjnr[jidx+5];
2379 jnrlistG = jjnr[jidx+6];
2380 jnrlistH = jjnr[jidx+7];
2381 /* Sign of each element will be negative for non-real atoms.
2382 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2383 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2385 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2386 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2388 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2389 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2390 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2391 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2392 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2393 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2394 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2395 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2396 j_coord_offsetA = DIM*jnrA;
2397 j_coord_offsetB = DIM*jnrB;
2398 j_coord_offsetC = DIM*jnrC;
2399 j_coord_offsetD = DIM*jnrD;
2400 j_coord_offsetE = DIM*jnrE;
2401 j_coord_offsetF = DIM*jnrF;
2402 j_coord_offsetG = DIM*jnrG;
2403 j_coord_offsetH = DIM*jnrH;
2405 /* load j atom coordinates */
2406 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2407 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2408 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
2409 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
2410 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2412 /* Calculate displacement vector */
2413 dx11 = _mm256_sub_ps(ix1,jx1);
2414 dy11 = _mm256_sub_ps(iy1,jy1);
2415 dz11 = _mm256_sub_ps(iz1,jz1);
2416 dx12 = _mm256_sub_ps(ix1,jx2);
2417 dy12 = _mm256_sub_ps(iy1,jy2);
2418 dz12 = _mm256_sub_ps(iz1,jz2);
2419 dx13 = _mm256_sub_ps(ix1,jx3);
2420 dy13 = _mm256_sub_ps(iy1,jy3);
2421 dz13 = _mm256_sub_ps(iz1,jz3);
2422 dx21 = _mm256_sub_ps(ix2,jx1);
2423 dy21 = _mm256_sub_ps(iy2,jy1);
2424 dz21 = _mm256_sub_ps(iz2,jz1);
2425 dx22 = _mm256_sub_ps(ix2,jx2);
2426 dy22 = _mm256_sub_ps(iy2,jy2);
2427 dz22 = _mm256_sub_ps(iz2,jz2);
2428 dx23 = _mm256_sub_ps(ix2,jx3);
2429 dy23 = _mm256_sub_ps(iy2,jy3);
2430 dz23 = _mm256_sub_ps(iz2,jz3);
2431 dx31 = _mm256_sub_ps(ix3,jx1);
2432 dy31 = _mm256_sub_ps(iy3,jy1);
2433 dz31 = _mm256_sub_ps(iz3,jz1);
2434 dx32 = _mm256_sub_ps(ix3,jx2);
2435 dy32 = _mm256_sub_ps(iy3,jy2);
2436 dz32 = _mm256_sub_ps(iz3,jz2);
2437 dx33 = _mm256_sub_ps(ix3,jx3);
2438 dy33 = _mm256_sub_ps(iy3,jy3);
2439 dz33 = _mm256_sub_ps(iz3,jz3);
2441 /* Calculate squared distance and things based on it */
2442 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2443 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2444 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2445 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2446 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2447 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2448 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2449 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2450 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2452 rinv11 = avx256_invsqrt_f(rsq11);
2453 rinv12 = avx256_invsqrt_f(rsq12);
2454 rinv13 = avx256_invsqrt_f(rsq13);
2455 rinv21 = avx256_invsqrt_f(rsq21);
2456 rinv22 = avx256_invsqrt_f(rsq22);
2457 rinv23 = avx256_invsqrt_f(rsq23);
2458 rinv31 = avx256_invsqrt_f(rsq31);
2459 rinv32 = avx256_invsqrt_f(rsq32);
2460 rinv33 = avx256_invsqrt_f(rsq33);
2462 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2463 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2464 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2465 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2466 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2467 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2468 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2469 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2470 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2472 fjx1 = _mm256_setzero_ps();
2473 fjy1 = _mm256_setzero_ps();
2474 fjz1 = _mm256_setzero_ps();
2475 fjx2 = _mm256_setzero_ps();
2476 fjy2 = _mm256_setzero_ps();
2477 fjz2 = _mm256_setzero_ps();
2478 fjx3 = _mm256_setzero_ps();
2479 fjy3 = _mm256_setzero_ps();
2480 fjz3 = _mm256_setzero_ps();
2482 /**************************
2483 * CALCULATE INTERACTIONS *
2484 **************************/
2486 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2489 r11 = _mm256_mul_ps(rsq11,rinv11);
2490 r11 = _mm256_andnot_ps(dummy_mask,r11);
2492 /* EWALD ELECTROSTATICS */
2494 /* Analytical PME correction */
2495 zeta2 = _mm256_mul_ps(beta2,rsq11);
2496 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2497 pmecorrF = avx256_pmecorrF_f(zeta2);
2498 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2499 felec = _mm256_mul_ps(qq11,felec);
2500 pmecorrV = avx256_pmecorrV_f(zeta2);
2501 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2502 velec = _mm256_sub_ps(rinv11,pmecorrV);
2503 velec = _mm256_mul_ps(qq11,velec);
2505 d = _mm256_sub_ps(r11,rswitch);
2506 d = _mm256_max_ps(d,_mm256_setzero_ps());
2507 d2 = _mm256_mul_ps(d,d);
2508 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2510 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2512 /* Evaluate switch function */
2513 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2514 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2515 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2519 fscal = _mm256_and_ps(fscal,cutoff_mask);
2521 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2523 /* Calculate temporary vectorial force */
2524 tx = _mm256_mul_ps(fscal,dx11);
2525 ty = _mm256_mul_ps(fscal,dy11);
2526 tz = _mm256_mul_ps(fscal,dz11);
2528 /* Update vectorial force */
2529 fix1 = _mm256_add_ps(fix1,tx);
2530 fiy1 = _mm256_add_ps(fiy1,ty);
2531 fiz1 = _mm256_add_ps(fiz1,tz);
2533 fjx1 = _mm256_add_ps(fjx1,tx);
2534 fjy1 = _mm256_add_ps(fjy1,ty);
2535 fjz1 = _mm256_add_ps(fjz1,tz);
2539 /**************************
2540 * CALCULATE INTERACTIONS *
2541 **************************/
2543 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2546 r12 = _mm256_mul_ps(rsq12,rinv12);
2547 r12 = _mm256_andnot_ps(dummy_mask,r12);
2549 /* EWALD ELECTROSTATICS */
2551 /* Analytical PME correction */
2552 zeta2 = _mm256_mul_ps(beta2,rsq12);
2553 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2554 pmecorrF = avx256_pmecorrF_f(zeta2);
2555 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2556 felec = _mm256_mul_ps(qq12,felec);
2557 pmecorrV = avx256_pmecorrV_f(zeta2);
2558 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2559 velec = _mm256_sub_ps(rinv12,pmecorrV);
2560 velec = _mm256_mul_ps(qq12,velec);
2562 d = _mm256_sub_ps(r12,rswitch);
2563 d = _mm256_max_ps(d,_mm256_setzero_ps());
2564 d2 = _mm256_mul_ps(d,d);
2565 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2567 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2569 /* Evaluate switch function */
2570 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2571 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2572 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2576 fscal = _mm256_and_ps(fscal,cutoff_mask);
2578 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2580 /* Calculate temporary vectorial force */
2581 tx = _mm256_mul_ps(fscal,dx12);
2582 ty = _mm256_mul_ps(fscal,dy12);
2583 tz = _mm256_mul_ps(fscal,dz12);
2585 /* Update vectorial force */
2586 fix1 = _mm256_add_ps(fix1,tx);
2587 fiy1 = _mm256_add_ps(fiy1,ty);
2588 fiz1 = _mm256_add_ps(fiz1,tz);
2590 fjx2 = _mm256_add_ps(fjx2,tx);
2591 fjy2 = _mm256_add_ps(fjy2,ty);
2592 fjz2 = _mm256_add_ps(fjz2,tz);
2596 /**************************
2597 * CALCULATE INTERACTIONS *
2598 **************************/
2600 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2603 r13 = _mm256_mul_ps(rsq13,rinv13);
2604 r13 = _mm256_andnot_ps(dummy_mask,r13);
2606 /* EWALD ELECTROSTATICS */
2608 /* Analytical PME correction */
2609 zeta2 = _mm256_mul_ps(beta2,rsq13);
2610 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2611 pmecorrF = avx256_pmecorrF_f(zeta2);
2612 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2613 felec = _mm256_mul_ps(qq13,felec);
2614 pmecorrV = avx256_pmecorrV_f(zeta2);
2615 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2616 velec = _mm256_sub_ps(rinv13,pmecorrV);
2617 velec = _mm256_mul_ps(qq13,velec);
2619 d = _mm256_sub_ps(r13,rswitch);
2620 d = _mm256_max_ps(d,_mm256_setzero_ps());
2621 d2 = _mm256_mul_ps(d,d);
2622 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2624 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2626 /* Evaluate switch function */
2627 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2628 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2629 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2633 fscal = _mm256_and_ps(fscal,cutoff_mask);
2635 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2637 /* Calculate temporary vectorial force */
2638 tx = _mm256_mul_ps(fscal,dx13);
2639 ty = _mm256_mul_ps(fscal,dy13);
2640 tz = _mm256_mul_ps(fscal,dz13);
2642 /* Update vectorial force */
2643 fix1 = _mm256_add_ps(fix1,tx);
2644 fiy1 = _mm256_add_ps(fiy1,ty);
2645 fiz1 = _mm256_add_ps(fiz1,tz);
2647 fjx3 = _mm256_add_ps(fjx3,tx);
2648 fjy3 = _mm256_add_ps(fjy3,ty);
2649 fjz3 = _mm256_add_ps(fjz3,tz);
2653 /**************************
2654 * CALCULATE INTERACTIONS *
2655 **************************/
2657 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2660 r21 = _mm256_mul_ps(rsq21,rinv21);
2661 r21 = _mm256_andnot_ps(dummy_mask,r21);
2663 /* EWALD ELECTROSTATICS */
2665 /* Analytical PME correction */
2666 zeta2 = _mm256_mul_ps(beta2,rsq21);
2667 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2668 pmecorrF = avx256_pmecorrF_f(zeta2);
2669 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2670 felec = _mm256_mul_ps(qq21,felec);
2671 pmecorrV = avx256_pmecorrV_f(zeta2);
2672 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2673 velec = _mm256_sub_ps(rinv21,pmecorrV);
2674 velec = _mm256_mul_ps(qq21,velec);
2676 d = _mm256_sub_ps(r21,rswitch);
2677 d = _mm256_max_ps(d,_mm256_setzero_ps());
2678 d2 = _mm256_mul_ps(d,d);
2679 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2681 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2683 /* Evaluate switch function */
2684 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2685 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2686 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2690 fscal = _mm256_and_ps(fscal,cutoff_mask);
2692 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2694 /* Calculate temporary vectorial force */
2695 tx = _mm256_mul_ps(fscal,dx21);
2696 ty = _mm256_mul_ps(fscal,dy21);
2697 tz = _mm256_mul_ps(fscal,dz21);
2699 /* Update vectorial force */
2700 fix2 = _mm256_add_ps(fix2,tx);
2701 fiy2 = _mm256_add_ps(fiy2,ty);
2702 fiz2 = _mm256_add_ps(fiz2,tz);
2704 fjx1 = _mm256_add_ps(fjx1,tx);
2705 fjy1 = _mm256_add_ps(fjy1,ty);
2706 fjz1 = _mm256_add_ps(fjz1,tz);
2710 /**************************
2711 * CALCULATE INTERACTIONS *
2712 **************************/
2714 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2717 r22 = _mm256_mul_ps(rsq22,rinv22);
2718 r22 = _mm256_andnot_ps(dummy_mask,r22);
2720 /* EWALD ELECTROSTATICS */
2722 /* Analytical PME correction */
2723 zeta2 = _mm256_mul_ps(beta2,rsq22);
2724 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2725 pmecorrF = avx256_pmecorrF_f(zeta2);
2726 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2727 felec = _mm256_mul_ps(qq22,felec);
2728 pmecorrV = avx256_pmecorrV_f(zeta2);
2729 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2730 velec = _mm256_sub_ps(rinv22,pmecorrV);
2731 velec = _mm256_mul_ps(qq22,velec);
2733 d = _mm256_sub_ps(r22,rswitch);
2734 d = _mm256_max_ps(d,_mm256_setzero_ps());
2735 d2 = _mm256_mul_ps(d,d);
2736 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2738 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2740 /* Evaluate switch function */
2741 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2742 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2743 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2747 fscal = _mm256_and_ps(fscal,cutoff_mask);
2749 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2751 /* Calculate temporary vectorial force */
2752 tx = _mm256_mul_ps(fscal,dx22);
2753 ty = _mm256_mul_ps(fscal,dy22);
2754 tz = _mm256_mul_ps(fscal,dz22);
2756 /* Update vectorial force */
2757 fix2 = _mm256_add_ps(fix2,tx);
2758 fiy2 = _mm256_add_ps(fiy2,ty);
2759 fiz2 = _mm256_add_ps(fiz2,tz);
2761 fjx2 = _mm256_add_ps(fjx2,tx);
2762 fjy2 = _mm256_add_ps(fjy2,ty);
2763 fjz2 = _mm256_add_ps(fjz2,tz);
2767 /**************************
2768 * CALCULATE INTERACTIONS *
2769 **************************/
2771 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2774 r23 = _mm256_mul_ps(rsq23,rinv23);
2775 r23 = _mm256_andnot_ps(dummy_mask,r23);
2777 /* EWALD ELECTROSTATICS */
2779 /* Analytical PME correction */
2780 zeta2 = _mm256_mul_ps(beta2,rsq23);
2781 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2782 pmecorrF = avx256_pmecorrF_f(zeta2);
2783 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2784 felec = _mm256_mul_ps(qq23,felec);
2785 pmecorrV = avx256_pmecorrV_f(zeta2);
2786 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2787 velec = _mm256_sub_ps(rinv23,pmecorrV);
2788 velec = _mm256_mul_ps(qq23,velec);
2790 d = _mm256_sub_ps(r23,rswitch);
2791 d = _mm256_max_ps(d,_mm256_setzero_ps());
2792 d2 = _mm256_mul_ps(d,d);
2793 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2795 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2797 /* Evaluate switch function */
2798 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2799 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2800 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2804 fscal = _mm256_and_ps(fscal,cutoff_mask);
2806 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2808 /* Calculate temporary vectorial force */
2809 tx = _mm256_mul_ps(fscal,dx23);
2810 ty = _mm256_mul_ps(fscal,dy23);
2811 tz = _mm256_mul_ps(fscal,dz23);
2813 /* Update vectorial force */
2814 fix2 = _mm256_add_ps(fix2,tx);
2815 fiy2 = _mm256_add_ps(fiy2,ty);
2816 fiz2 = _mm256_add_ps(fiz2,tz);
2818 fjx3 = _mm256_add_ps(fjx3,tx);
2819 fjy3 = _mm256_add_ps(fjy3,ty);
2820 fjz3 = _mm256_add_ps(fjz3,tz);
2824 /**************************
2825 * CALCULATE INTERACTIONS *
2826 **************************/
2828 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2831 r31 = _mm256_mul_ps(rsq31,rinv31);
2832 r31 = _mm256_andnot_ps(dummy_mask,r31);
2834 /* EWALD ELECTROSTATICS */
2836 /* Analytical PME correction */
2837 zeta2 = _mm256_mul_ps(beta2,rsq31);
2838 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2839 pmecorrF = avx256_pmecorrF_f(zeta2);
2840 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2841 felec = _mm256_mul_ps(qq31,felec);
2842 pmecorrV = avx256_pmecorrV_f(zeta2);
2843 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2844 velec = _mm256_sub_ps(rinv31,pmecorrV);
2845 velec = _mm256_mul_ps(qq31,velec);
2847 d = _mm256_sub_ps(r31,rswitch);
2848 d = _mm256_max_ps(d,_mm256_setzero_ps());
2849 d2 = _mm256_mul_ps(d,d);
2850 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2852 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2854 /* Evaluate switch function */
2855 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2856 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2857 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2861 fscal = _mm256_and_ps(fscal,cutoff_mask);
2863 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2865 /* Calculate temporary vectorial force */
2866 tx = _mm256_mul_ps(fscal,dx31);
2867 ty = _mm256_mul_ps(fscal,dy31);
2868 tz = _mm256_mul_ps(fscal,dz31);
2870 /* Update vectorial force */
2871 fix3 = _mm256_add_ps(fix3,tx);
2872 fiy3 = _mm256_add_ps(fiy3,ty);
2873 fiz3 = _mm256_add_ps(fiz3,tz);
2875 fjx1 = _mm256_add_ps(fjx1,tx);
2876 fjy1 = _mm256_add_ps(fjy1,ty);
2877 fjz1 = _mm256_add_ps(fjz1,tz);
2881 /**************************
2882 * CALCULATE INTERACTIONS *
2883 **************************/
2885 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2888 r32 = _mm256_mul_ps(rsq32,rinv32);
2889 r32 = _mm256_andnot_ps(dummy_mask,r32);
2891 /* EWALD ELECTROSTATICS */
2893 /* Analytical PME correction */
2894 zeta2 = _mm256_mul_ps(beta2,rsq32);
2895 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2896 pmecorrF = avx256_pmecorrF_f(zeta2);
2897 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2898 felec = _mm256_mul_ps(qq32,felec);
2899 pmecorrV = avx256_pmecorrV_f(zeta2);
2900 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2901 velec = _mm256_sub_ps(rinv32,pmecorrV);
2902 velec = _mm256_mul_ps(qq32,velec);
2904 d = _mm256_sub_ps(r32,rswitch);
2905 d = _mm256_max_ps(d,_mm256_setzero_ps());
2906 d2 = _mm256_mul_ps(d,d);
2907 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2909 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2911 /* Evaluate switch function */
2912 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2913 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2914 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2918 fscal = _mm256_and_ps(fscal,cutoff_mask);
2920 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2922 /* Calculate temporary vectorial force */
2923 tx = _mm256_mul_ps(fscal,dx32);
2924 ty = _mm256_mul_ps(fscal,dy32);
2925 tz = _mm256_mul_ps(fscal,dz32);
2927 /* Update vectorial force */
2928 fix3 = _mm256_add_ps(fix3,tx);
2929 fiy3 = _mm256_add_ps(fiy3,ty);
2930 fiz3 = _mm256_add_ps(fiz3,tz);
2932 fjx2 = _mm256_add_ps(fjx2,tx);
2933 fjy2 = _mm256_add_ps(fjy2,ty);
2934 fjz2 = _mm256_add_ps(fjz2,tz);
2938 /**************************
2939 * CALCULATE INTERACTIONS *
2940 **************************/
2942 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2945 r33 = _mm256_mul_ps(rsq33,rinv33);
2946 r33 = _mm256_andnot_ps(dummy_mask,r33);
2948 /* EWALD ELECTROSTATICS */
2950 /* Analytical PME correction */
2951 zeta2 = _mm256_mul_ps(beta2,rsq33);
2952 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2953 pmecorrF = avx256_pmecorrF_f(zeta2);
2954 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2955 felec = _mm256_mul_ps(qq33,felec);
2956 pmecorrV = avx256_pmecorrV_f(zeta2);
2957 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2958 velec = _mm256_sub_ps(rinv33,pmecorrV);
2959 velec = _mm256_mul_ps(qq33,velec);
2961 d = _mm256_sub_ps(r33,rswitch);
2962 d = _mm256_max_ps(d,_mm256_setzero_ps());
2963 d2 = _mm256_mul_ps(d,d);
2964 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2966 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2968 /* Evaluate switch function */
2969 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2970 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2971 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2975 fscal = _mm256_and_ps(fscal,cutoff_mask);
2977 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2979 /* Calculate temporary vectorial force */
2980 tx = _mm256_mul_ps(fscal,dx33);
2981 ty = _mm256_mul_ps(fscal,dy33);
2982 tz = _mm256_mul_ps(fscal,dz33);
2984 /* Update vectorial force */
2985 fix3 = _mm256_add_ps(fix3,tx);
2986 fiy3 = _mm256_add_ps(fiy3,ty);
2987 fiz3 = _mm256_add_ps(fiz3,tz);
2989 fjx3 = _mm256_add_ps(fjx3,tx);
2990 fjy3 = _mm256_add_ps(fjy3,ty);
2991 fjz3 = _mm256_add_ps(fjz3,tz);
2995 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2996 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2997 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2998 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2999 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3000 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3001 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3002 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3004 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3005 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
3006 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3008 /* Inner loop uses 954 flops */
3011 /* End of innermost loop */
3013 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3014 f+i_coord_offset+DIM,fshift+i_shift_offset);
3016 /* Increment number of inner iterations */
3017 inneriter += j_index_end - j_index_start;
3019 /* Outer loop uses 18 flops */
3022 /* Increment number of outer iterations */
3025 /* Update outer/inner flops */
3027 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*954);