2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68  * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69  * jnr indices corresponding to data put in the eight positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrE,jnrF,jnrG,jnrH;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 real * vdwioffsetptr0;
86 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 real * vdwioffsetptr1;
88 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 real * vdwioffsetptr2;
90 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
92 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
94 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
96 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128i ewitab_lo,ewitab_hi;
110 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
111 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
113 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
114 real rswitch_scalar,d_scalar;
115 __m256 dummy_mask,cutoff_mask;
116 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
117 __m256 one = _mm256_set1_ps(1.0);
118 __m256 two = _mm256_set1_ps(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm256_set1_ps(fr->ic->epsfac);
131 charge = mdatoms->chargeA;
133 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
134 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
135 beta2 = _mm256_mul_ps(beta,beta);
136 beta3 = _mm256_mul_ps(beta,beta2);
138 ewtab = fr->ic->tabq_coul_FDV0;
139 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
140 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
142 /* Setup water-specific parameters */
143 inr = nlist->iinr[0];
144 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
145 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
146 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
148 jq0 = _mm256_set1_ps(charge[inr+0]);
149 jq1 = _mm256_set1_ps(charge[inr+1]);
150 jq2 = _mm256_set1_ps(charge[inr+2]);
151 qq00 = _mm256_mul_ps(iq0,jq0);
152 qq01 = _mm256_mul_ps(iq0,jq1);
153 qq02 = _mm256_mul_ps(iq0,jq2);
154 qq10 = _mm256_mul_ps(iq1,jq0);
155 qq11 = _mm256_mul_ps(iq1,jq1);
156 qq12 = _mm256_mul_ps(iq1,jq2);
157 qq20 = _mm256_mul_ps(iq2,jq0);
158 qq21 = _mm256_mul_ps(iq2,jq1);
159 qq22 = _mm256_mul_ps(iq2,jq2);
161 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
162 rcutoff_scalar = fr->ic->rcoulomb;
163 rcutoff = _mm256_set1_ps(rcutoff_scalar);
164 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
166 rswitch_scalar = fr->ic->rcoulomb_switch;
167 rswitch = _mm256_set1_ps(rswitch_scalar);
168 /* Setup switch parameters */
169 d_scalar = rcutoff_scalar-rswitch_scalar;
170 d = _mm256_set1_ps(d_scalar);
171 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
172 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
173 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
174 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
175 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
176 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
178 /* Avoid stupid compiler warnings */
179 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
192 for(iidx=0;iidx<4*DIM;iidx++)
197 /* Start outer loop over neighborlists */
198 for(iidx=0; iidx<nri; iidx++)
200 /* Load shift vector for this list */
201 i_shift_offset = DIM*shiftidx[iidx];
203 /* Load limits for loop over neighbors */
204 j_index_start = jindex[iidx];
205 j_index_end = jindex[iidx+1];
207 /* Get outer coordinate index */
209 i_coord_offset = DIM*inr;
211 /* Load i particle coords and add shift vector */
212 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
213 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
215 fix0 = _mm256_setzero_ps();
216 fiy0 = _mm256_setzero_ps();
217 fiz0 = _mm256_setzero_ps();
218 fix1 = _mm256_setzero_ps();
219 fiy1 = _mm256_setzero_ps();
220 fiz1 = _mm256_setzero_ps();
221 fix2 = _mm256_setzero_ps();
222 fiy2 = _mm256_setzero_ps();
223 fiz2 = _mm256_setzero_ps();
225 /* Reset potential sums */
226 velecsum = _mm256_setzero_ps();
228 /* Start inner kernel loop */
229 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
232 /* Get j neighbor index, and coordinate index */
241 j_coord_offsetA = DIM*jnrA;
242 j_coord_offsetB = DIM*jnrB;
243 j_coord_offsetC = DIM*jnrC;
244 j_coord_offsetD = DIM*jnrD;
245 j_coord_offsetE = DIM*jnrE;
246 j_coord_offsetF = DIM*jnrF;
247 j_coord_offsetG = DIM*jnrG;
248 j_coord_offsetH = DIM*jnrH;
250 /* load j atom coordinates */
251 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
252 x+j_coord_offsetC,x+j_coord_offsetD,
253 x+j_coord_offsetE,x+j_coord_offsetF,
254 x+j_coord_offsetG,x+j_coord_offsetH,
255 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
257 /* Calculate displacement vector */
258 dx00 = _mm256_sub_ps(ix0,jx0);
259 dy00 = _mm256_sub_ps(iy0,jy0);
260 dz00 = _mm256_sub_ps(iz0,jz0);
261 dx01 = _mm256_sub_ps(ix0,jx1);
262 dy01 = _mm256_sub_ps(iy0,jy1);
263 dz01 = _mm256_sub_ps(iz0,jz1);
264 dx02 = _mm256_sub_ps(ix0,jx2);
265 dy02 = _mm256_sub_ps(iy0,jy2);
266 dz02 = _mm256_sub_ps(iz0,jz2);
267 dx10 = _mm256_sub_ps(ix1,jx0);
268 dy10 = _mm256_sub_ps(iy1,jy0);
269 dz10 = _mm256_sub_ps(iz1,jz0);
270 dx11 = _mm256_sub_ps(ix1,jx1);
271 dy11 = _mm256_sub_ps(iy1,jy1);
272 dz11 = _mm256_sub_ps(iz1,jz1);
273 dx12 = _mm256_sub_ps(ix1,jx2);
274 dy12 = _mm256_sub_ps(iy1,jy2);
275 dz12 = _mm256_sub_ps(iz1,jz2);
276 dx20 = _mm256_sub_ps(ix2,jx0);
277 dy20 = _mm256_sub_ps(iy2,jy0);
278 dz20 = _mm256_sub_ps(iz2,jz0);
279 dx21 = _mm256_sub_ps(ix2,jx1);
280 dy21 = _mm256_sub_ps(iy2,jy1);
281 dz21 = _mm256_sub_ps(iz2,jz1);
282 dx22 = _mm256_sub_ps(ix2,jx2);
283 dy22 = _mm256_sub_ps(iy2,jy2);
284 dz22 = _mm256_sub_ps(iz2,jz2);
286 /* Calculate squared distance and things based on it */
287 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
288 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
289 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
290 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
291 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
292 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
293 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
294 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
295 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
297 rinv00 = avx256_invsqrt_f(rsq00);
298 rinv01 = avx256_invsqrt_f(rsq01);
299 rinv02 = avx256_invsqrt_f(rsq02);
300 rinv10 = avx256_invsqrt_f(rsq10);
301 rinv11 = avx256_invsqrt_f(rsq11);
302 rinv12 = avx256_invsqrt_f(rsq12);
303 rinv20 = avx256_invsqrt_f(rsq20);
304 rinv21 = avx256_invsqrt_f(rsq21);
305 rinv22 = avx256_invsqrt_f(rsq22);
307 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
308 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
309 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
310 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
311 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
312 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
313 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
314 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
315 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
317 fjx0 = _mm256_setzero_ps();
318 fjy0 = _mm256_setzero_ps();
319 fjz0 = _mm256_setzero_ps();
320 fjx1 = _mm256_setzero_ps();
321 fjy1 = _mm256_setzero_ps();
322 fjz1 = _mm256_setzero_ps();
323 fjx2 = _mm256_setzero_ps();
324 fjy2 = _mm256_setzero_ps();
325 fjz2 = _mm256_setzero_ps();
327 /**************************
328 * CALCULATE INTERACTIONS *
329 **************************/
331 if (gmx_mm256_any_lt(rsq00,rcutoff2))
334 r00 = _mm256_mul_ps(rsq00,rinv00);
336 /* EWALD ELECTROSTATICS */
338 /* Analytical PME correction */
339 zeta2 = _mm256_mul_ps(beta2,rsq00);
340 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
341 pmecorrF = avx256_pmecorrF_f(zeta2);
342 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
343 felec = _mm256_mul_ps(qq00,felec);
344 pmecorrV = avx256_pmecorrV_f(zeta2);
345 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
346 velec = _mm256_sub_ps(rinv00,pmecorrV);
347 velec = _mm256_mul_ps(qq00,velec);
349 d = _mm256_sub_ps(r00,rswitch);
350 d = _mm256_max_ps(d,_mm256_setzero_ps());
351 d2 = _mm256_mul_ps(d,d);
352 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
354 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
356 /* Evaluate switch function */
357 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
358 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
359 velec = _mm256_mul_ps(velec,sw);
360 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velec = _mm256_and_ps(velec,cutoff_mask);
364 velecsum = _mm256_add_ps(velecsum,velec);
368 fscal = _mm256_and_ps(fscal,cutoff_mask);
370 /* Calculate temporary vectorial force */
371 tx = _mm256_mul_ps(fscal,dx00);
372 ty = _mm256_mul_ps(fscal,dy00);
373 tz = _mm256_mul_ps(fscal,dz00);
375 /* Update vectorial force */
376 fix0 = _mm256_add_ps(fix0,tx);
377 fiy0 = _mm256_add_ps(fiy0,ty);
378 fiz0 = _mm256_add_ps(fiz0,tz);
380 fjx0 = _mm256_add_ps(fjx0,tx);
381 fjy0 = _mm256_add_ps(fjy0,ty);
382 fjz0 = _mm256_add_ps(fjz0,tz);
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 if (gmx_mm256_any_lt(rsq01,rcutoff2))
393 r01 = _mm256_mul_ps(rsq01,rinv01);
395 /* EWALD ELECTROSTATICS */
397 /* Analytical PME correction */
398 zeta2 = _mm256_mul_ps(beta2,rsq01);
399 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
400 pmecorrF = avx256_pmecorrF_f(zeta2);
401 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
402 felec = _mm256_mul_ps(qq01,felec);
403 pmecorrV = avx256_pmecorrV_f(zeta2);
404 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
405 velec = _mm256_sub_ps(rinv01,pmecorrV);
406 velec = _mm256_mul_ps(qq01,velec);
408 d = _mm256_sub_ps(r01,rswitch);
409 d = _mm256_max_ps(d,_mm256_setzero_ps());
410 d2 = _mm256_mul_ps(d,d);
411 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
413 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
415 /* Evaluate switch function */
416 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
417 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
418 velec = _mm256_mul_ps(velec,sw);
419 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
421 /* Update potential sum for this i atom from the interaction with this j atom. */
422 velec = _mm256_and_ps(velec,cutoff_mask);
423 velecsum = _mm256_add_ps(velecsum,velec);
427 fscal = _mm256_and_ps(fscal,cutoff_mask);
429 /* Calculate temporary vectorial force */
430 tx = _mm256_mul_ps(fscal,dx01);
431 ty = _mm256_mul_ps(fscal,dy01);
432 tz = _mm256_mul_ps(fscal,dz01);
434 /* Update vectorial force */
435 fix0 = _mm256_add_ps(fix0,tx);
436 fiy0 = _mm256_add_ps(fiy0,ty);
437 fiz0 = _mm256_add_ps(fiz0,tz);
439 fjx1 = _mm256_add_ps(fjx1,tx);
440 fjy1 = _mm256_add_ps(fjy1,ty);
441 fjz1 = _mm256_add_ps(fjz1,tz);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
449 if (gmx_mm256_any_lt(rsq02,rcutoff2))
452 r02 = _mm256_mul_ps(rsq02,rinv02);
454 /* EWALD ELECTROSTATICS */
456 /* Analytical PME correction */
457 zeta2 = _mm256_mul_ps(beta2,rsq02);
458 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
459 pmecorrF = avx256_pmecorrF_f(zeta2);
460 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
461 felec = _mm256_mul_ps(qq02,felec);
462 pmecorrV = avx256_pmecorrV_f(zeta2);
463 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
464 velec = _mm256_sub_ps(rinv02,pmecorrV);
465 velec = _mm256_mul_ps(qq02,velec);
467 d = _mm256_sub_ps(r02,rswitch);
468 d = _mm256_max_ps(d,_mm256_setzero_ps());
469 d2 = _mm256_mul_ps(d,d);
470 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
472 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
474 /* Evaluate switch function */
475 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
476 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
477 velec = _mm256_mul_ps(velec,sw);
478 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
480 /* Update potential sum for this i atom from the interaction with this j atom. */
481 velec = _mm256_and_ps(velec,cutoff_mask);
482 velecsum = _mm256_add_ps(velecsum,velec);
486 fscal = _mm256_and_ps(fscal,cutoff_mask);
488 /* Calculate temporary vectorial force */
489 tx = _mm256_mul_ps(fscal,dx02);
490 ty = _mm256_mul_ps(fscal,dy02);
491 tz = _mm256_mul_ps(fscal,dz02);
493 /* Update vectorial force */
494 fix0 = _mm256_add_ps(fix0,tx);
495 fiy0 = _mm256_add_ps(fiy0,ty);
496 fiz0 = _mm256_add_ps(fiz0,tz);
498 fjx2 = _mm256_add_ps(fjx2,tx);
499 fjy2 = _mm256_add_ps(fjy2,ty);
500 fjz2 = _mm256_add_ps(fjz2,tz);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 if (gmx_mm256_any_lt(rsq10,rcutoff2))
511 r10 = _mm256_mul_ps(rsq10,rinv10);
513 /* EWALD ELECTROSTATICS */
515 /* Analytical PME correction */
516 zeta2 = _mm256_mul_ps(beta2,rsq10);
517 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
518 pmecorrF = avx256_pmecorrF_f(zeta2);
519 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
520 felec = _mm256_mul_ps(qq10,felec);
521 pmecorrV = avx256_pmecorrV_f(zeta2);
522 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
523 velec = _mm256_sub_ps(rinv10,pmecorrV);
524 velec = _mm256_mul_ps(qq10,velec);
526 d = _mm256_sub_ps(r10,rswitch);
527 d = _mm256_max_ps(d,_mm256_setzero_ps());
528 d2 = _mm256_mul_ps(d,d);
529 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
531 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
533 /* Evaluate switch function */
534 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
535 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
536 velec = _mm256_mul_ps(velec,sw);
537 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velec = _mm256_and_ps(velec,cutoff_mask);
541 velecsum = _mm256_add_ps(velecsum,velec);
545 fscal = _mm256_and_ps(fscal,cutoff_mask);
547 /* Calculate temporary vectorial force */
548 tx = _mm256_mul_ps(fscal,dx10);
549 ty = _mm256_mul_ps(fscal,dy10);
550 tz = _mm256_mul_ps(fscal,dz10);
552 /* Update vectorial force */
553 fix1 = _mm256_add_ps(fix1,tx);
554 fiy1 = _mm256_add_ps(fiy1,ty);
555 fiz1 = _mm256_add_ps(fiz1,tz);
557 fjx0 = _mm256_add_ps(fjx0,tx);
558 fjy0 = _mm256_add_ps(fjy0,ty);
559 fjz0 = _mm256_add_ps(fjz0,tz);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 if (gmx_mm256_any_lt(rsq11,rcutoff2))
570 r11 = _mm256_mul_ps(rsq11,rinv11);
572 /* EWALD ELECTROSTATICS */
574 /* Analytical PME correction */
575 zeta2 = _mm256_mul_ps(beta2,rsq11);
576 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
577 pmecorrF = avx256_pmecorrF_f(zeta2);
578 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
579 felec = _mm256_mul_ps(qq11,felec);
580 pmecorrV = avx256_pmecorrV_f(zeta2);
581 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
582 velec = _mm256_sub_ps(rinv11,pmecorrV);
583 velec = _mm256_mul_ps(qq11,velec);
585 d = _mm256_sub_ps(r11,rswitch);
586 d = _mm256_max_ps(d,_mm256_setzero_ps());
587 d2 = _mm256_mul_ps(d,d);
588 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
590 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
592 /* Evaluate switch function */
593 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
594 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
595 velec = _mm256_mul_ps(velec,sw);
596 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velec = _mm256_and_ps(velec,cutoff_mask);
600 velecsum = _mm256_add_ps(velecsum,velec);
604 fscal = _mm256_and_ps(fscal,cutoff_mask);
606 /* Calculate temporary vectorial force */
607 tx = _mm256_mul_ps(fscal,dx11);
608 ty = _mm256_mul_ps(fscal,dy11);
609 tz = _mm256_mul_ps(fscal,dz11);
611 /* Update vectorial force */
612 fix1 = _mm256_add_ps(fix1,tx);
613 fiy1 = _mm256_add_ps(fiy1,ty);
614 fiz1 = _mm256_add_ps(fiz1,tz);
616 fjx1 = _mm256_add_ps(fjx1,tx);
617 fjy1 = _mm256_add_ps(fjy1,ty);
618 fjz1 = _mm256_add_ps(fjz1,tz);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 if (gmx_mm256_any_lt(rsq12,rcutoff2))
629 r12 = _mm256_mul_ps(rsq12,rinv12);
631 /* EWALD ELECTROSTATICS */
633 /* Analytical PME correction */
634 zeta2 = _mm256_mul_ps(beta2,rsq12);
635 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
636 pmecorrF = avx256_pmecorrF_f(zeta2);
637 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
638 felec = _mm256_mul_ps(qq12,felec);
639 pmecorrV = avx256_pmecorrV_f(zeta2);
640 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
641 velec = _mm256_sub_ps(rinv12,pmecorrV);
642 velec = _mm256_mul_ps(qq12,velec);
644 d = _mm256_sub_ps(r12,rswitch);
645 d = _mm256_max_ps(d,_mm256_setzero_ps());
646 d2 = _mm256_mul_ps(d,d);
647 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
649 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
651 /* Evaluate switch function */
652 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
653 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
654 velec = _mm256_mul_ps(velec,sw);
655 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velec = _mm256_and_ps(velec,cutoff_mask);
659 velecsum = _mm256_add_ps(velecsum,velec);
663 fscal = _mm256_and_ps(fscal,cutoff_mask);
665 /* Calculate temporary vectorial force */
666 tx = _mm256_mul_ps(fscal,dx12);
667 ty = _mm256_mul_ps(fscal,dy12);
668 tz = _mm256_mul_ps(fscal,dz12);
670 /* Update vectorial force */
671 fix1 = _mm256_add_ps(fix1,tx);
672 fiy1 = _mm256_add_ps(fiy1,ty);
673 fiz1 = _mm256_add_ps(fiz1,tz);
675 fjx2 = _mm256_add_ps(fjx2,tx);
676 fjy2 = _mm256_add_ps(fjy2,ty);
677 fjz2 = _mm256_add_ps(fjz2,tz);
681 /**************************
682 * CALCULATE INTERACTIONS *
683 **************************/
685 if (gmx_mm256_any_lt(rsq20,rcutoff2))
688 r20 = _mm256_mul_ps(rsq20,rinv20);
690 /* EWALD ELECTROSTATICS */
692 /* Analytical PME correction */
693 zeta2 = _mm256_mul_ps(beta2,rsq20);
694 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
695 pmecorrF = avx256_pmecorrF_f(zeta2);
696 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
697 felec = _mm256_mul_ps(qq20,felec);
698 pmecorrV = avx256_pmecorrV_f(zeta2);
699 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
700 velec = _mm256_sub_ps(rinv20,pmecorrV);
701 velec = _mm256_mul_ps(qq20,velec);
703 d = _mm256_sub_ps(r20,rswitch);
704 d = _mm256_max_ps(d,_mm256_setzero_ps());
705 d2 = _mm256_mul_ps(d,d);
706 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
708 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
710 /* Evaluate switch function */
711 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
712 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
713 velec = _mm256_mul_ps(velec,sw);
714 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
716 /* Update potential sum for this i atom from the interaction with this j atom. */
717 velec = _mm256_and_ps(velec,cutoff_mask);
718 velecsum = _mm256_add_ps(velecsum,velec);
722 fscal = _mm256_and_ps(fscal,cutoff_mask);
724 /* Calculate temporary vectorial force */
725 tx = _mm256_mul_ps(fscal,dx20);
726 ty = _mm256_mul_ps(fscal,dy20);
727 tz = _mm256_mul_ps(fscal,dz20);
729 /* Update vectorial force */
730 fix2 = _mm256_add_ps(fix2,tx);
731 fiy2 = _mm256_add_ps(fiy2,ty);
732 fiz2 = _mm256_add_ps(fiz2,tz);
734 fjx0 = _mm256_add_ps(fjx0,tx);
735 fjy0 = _mm256_add_ps(fjy0,ty);
736 fjz0 = _mm256_add_ps(fjz0,tz);
740 /**************************
741 * CALCULATE INTERACTIONS *
742 **************************/
744 if (gmx_mm256_any_lt(rsq21,rcutoff2))
747 r21 = _mm256_mul_ps(rsq21,rinv21);
749 /* EWALD ELECTROSTATICS */
751 /* Analytical PME correction */
752 zeta2 = _mm256_mul_ps(beta2,rsq21);
753 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
754 pmecorrF = avx256_pmecorrF_f(zeta2);
755 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
756 felec = _mm256_mul_ps(qq21,felec);
757 pmecorrV = avx256_pmecorrV_f(zeta2);
758 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
759 velec = _mm256_sub_ps(rinv21,pmecorrV);
760 velec = _mm256_mul_ps(qq21,velec);
762 d = _mm256_sub_ps(r21,rswitch);
763 d = _mm256_max_ps(d,_mm256_setzero_ps());
764 d2 = _mm256_mul_ps(d,d);
765 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
767 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
769 /* Evaluate switch function */
770 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
771 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
772 velec = _mm256_mul_ps(velec,sw);
773 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
775 /* Update potential sum for this i atom from the interaction with this j atom. */
776 velec = _mm256_and_ps(velec,cutoff_mask);
777 velecsum = _mm256_add_ps(velecsum,velec);
781 fscal = _mm256_and_ps(fscal,cutoff_mask);
783 /* Calculate temporary vectorial force */
784 tx = _mm256_mul_ps(fscal,dx21);
785 ty = _mm256_mul_ps(fscal,dy21);
786 tz = _mm256_mul_ps(fscal,dz21);
788 /* Update vectorial force */
789 fix2 = _mm256_add_ps(fix2,tx);
790 fiy2 = _mm256_add_ps(fiy2,ty);
791 fiz2 = _mm256_add_ps(fiz2,tz);
793 fjx1 = _mm256_add_ps(fjx1,tx);
794 fjy1 = _mm256_add_ps(fjy1,ty);
795 fjz1 = _mm256_add_ps(fjz1,tz);
799 /**************************
800 * CALCULATE INTERACTIONS *
801 **************************/
803 if (gmx_mm256_any_lt(rsq22,rcutoff2))
806 r22 = _mm256_mul_ps(rsq22,rinv22);
808 /* EWALD ELECTROSTATICS */
810 /* Analytical PME correction */
811 zeta2 = _mm256_mul_ps(beta2,rsq22);
812 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
813 pmecorrF = avx256_pmecorrF_f(zeta2);
814 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
815 felec = _mm256_mul_ps(qq22,felec);
816 pmecorrV = avx256_pmecorrV_f(zeta2);
817 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
818 velec = _mm256_sub_ps(rinv22,pmecorrV);
819 velec = _mm256_mul_ps(qq22,velec);
821 d = _mm256_sub_ps(r22,rswitch);
822 d = _mm256_max_ps(d,_mm256_setzero_ps());
823 d2 = _mm256_mul_ps(d,d);
824 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
826 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
828 /* Evaluate switch function */
829 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
830 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
831 velec = _mm256_mul_ps(velec,sw);
832 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
834 /* Update potential sum for this i atom from the interaction with this j atom. */
835 velec = _mm256_and_ps(velec,cutoff_mask);
836 velecsum = _mm256_add_ps(velecsum,velec);
840 fscal = _mm256_and_ps(fscal,cutoff_mask);
842 /* Calculate temporary vectorial force */
843 tx = _mm256_mul_ps(fscal,dx22);
844 ty = _mm256_mul_ps(fscal,dy22);
845 tz = _mm256_mul_ps(fscal,dz22);
847 /* Update vectorial force */
848 fix2 = _mm256_add_ps(fix2,tx);
849 fiy2 = _mm256_add_ps(fiy2,ty);
850 fiz2 = _mm256_add_ps(fiz2,tz);
852 fjx2 = _mm256_add_ps(fjx2,tx);
853 fjy2 = _mm256_add_ps(fjy2,ty);
854 fjz2 = _mm256_add_ps(fjz2,tz);
858 fjptrA = f+j_coord_offsetA;
859 fjptrB = f+j_coord_offsetB;
860 fjptrC = f+j_coord_offsetC;
861 fjptrD = f+j_coord_offsetD;
862 fjptrE = f+j_coord_offsetE;
863 fjptrF = f+j_coord_offsetF;
864 fjptrG = f+j_coord_offsetG;
865 fjptrH = f+j_coord_offsetH;
867 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
868 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
870 /* Inner loop uses 972 flops */
876 /* Get j neighbor index, and coordinate index */
877 jnrlistA = jjnr[jidx];
878 jnrlistB = jjnr[jidx+1];
879 jnrlistC = jjnr[jidx+2];
880 jnrlistD = jjnr[jidx+3];
881 jnrlistE = jjnr[jidx+4];
882 jnrlistF = jjnr[jidx+5];
883 jnrlistG = jjnr[jidx+6];
884 jnrlistH = jjnr[jidx+7];
885             /* Sign of each element will be negative for non-real atoms.
886              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
887              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
889 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
890 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
892 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
893 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
894 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
895 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
896 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
897 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
898 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
899 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
900 j_coord_offsetA = DIM*jnrA;
901 j_coord_offsetB = DIM*jnrB;
902 j_coord_offsetC = DIM*jnrC;
903 j_coord_offsetD = DIM*jnrD;
904 j_coord_offsetE = DIM*jnrE;
905 j_coord_offsetF = DIM*jnrF;
906 j_coord_offsetG = DIM*jnrG;
907 j_coord_offsetH = DIM*jnrH;
909 /* load j atom coordinates */
910 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
911 x+j_coord_offsetC,x+j_coord_offsetD,
912 x+j_coord_offsetE,x+j_coord_offsetF,
913 x+j_coord_offsetG,x+j_coord_offsetH,
914 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
916 /* Calculate displacement vector */
917 dx00 = _mm256_sub_ps(ix0,jx0);
918 dy00 = _mm256_sub_ps(iy0,jy0);
919 dz00 = _mm256_sub_ps(iz0,jz0);
920 dx01 = _mm256_sub_ps(ix0,jx1);
921 dy01 = _mm256_sub_ps(iy0,jy1);
922 dz01 = _mm256_sub_ps(iz0,jz1);
923 dx02 = _mm256_sub_ps(ix0,jx2);
924 dy02 = _mm256_sub_ps(iy0,jy2);
925 dz02 = _mm256_sub_ps(iz0,jz2);
926 dx10 = _mm256_sub_ps(ix1,jx0);
927 dy10 = _mm256_sub_ps(iy1,jy0);
928 dz10 = _mm256_sub_ps(iz1,jz0);
929 dx11 = _mm256_sub_ps(ix1,jx1);
930 dy11 = _mm256_sub_ps(iy1,jy1);
931 dz11 = _mm256_sub_ps(iz1,jz1);
932 dx12 = _mm256_sub_ps(ix1,jx2);
933 dy12 = _mm256_sub_ps(iy1,jy2);
934 dz12 = _mm256_sub_ps(iz1,jz2);
935 dx20 = _mm256_sub_ps(ix2,jx0);
936 dy20 = _mm256_sub_ps(iy2,jy0);
937 dz20 = _mm256_sub_ps(iz2,jz0);
938 dx21 = _mm256_sub_ps(ix2,jx1);
939 dy21 = _mm256_sub_ps(iy2,jy1);
940 dz21 = _mm256_sub_ps(iz2,jz1);
941 dx22 = _mm256_sub_ps(ix2,jx2);
942 dy22 = _mm256_sub_ps(iy2,jy2);
943 dz22 = _mm256_sub_ps(iz2,jz2);
945 /* Calculate squared distance and things based on it */
946 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
947 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
948 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
949 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
950 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
951 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
952 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
953 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
954 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
956 rinv00 = avx256_invsqrt_f(rsq00);
957 rinv01 = avx256_invsqrt_f(rsq01);
958 rinv02 = avx256_invsqrt_f(rsq02);
959 rinv10 = avx256_invsqrt_f(rsq10);
960 rinv11 = avx256_invsqrt_f(rsq11);
961 rinv12 = avx256_invsqrt_f(rsq12);
962 rinv20 = avx256_invsqrt_f(rsq20);
963 rinv21 = avx256_invsqrt_f(rsq21);
964 rinv22 = avx256_invsqrt_f(rsq22);
966 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
967 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
968 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
969 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
970 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
971 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
972 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
973 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
974 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
976 fjx0 = _mm256_setzero_ps();
977 fjy0 = _mm256_setzero_ps();
978 fjz0 = _mm256_setzero_ps();
979 fjx1 = _mm256_setzero_ps();
980 fjy1 = _mm256_setzero_ps();
981 fjz1 = _mm256_setzero_ps();
982 fjx2 = _mm256_setzero_ps();
983 fjy2 = _mm256_setzero_ps();
984 fjz2 = _mm256_setzero_ps();
986 /**************************
987 * CALCULATE INTERACTIONS *
988 **************************/
990 if (gmx_mm256_any_lt(rsq00,rcutoff2))
993 r00 = _mm256_mul_ps(rsq00,rinv00);
994 r00 = _mm256_andnot_ps(dummy_mask,r00);
996 /* EWALD ELECTROSTATICS */
998 /* Analytical PME correction */
999 zeta2 = _mm256_mul_ps(beta2,rsq00);
1000 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
1001 pmecorrF = avx256_pmecorrF_f(zeta2);
1002 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1003 felec = _mm256_mul_ps(qq00,felec);
1004 pmecorrV = avx256_pmecorrV_f(zeta2);
1005 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1006 velec = _mm256_sub_ps(rinv00,pmecorrV);
1007 velec = _mm256_mul_ps(qq00,velec);
1009 d = _mm256_sub_ps(r00,rswitch);
1010 d = _mm256_max_ps(d,_mm256_setzero_ps());
1011 d2 = _mm256_mul_ps(d,d);
1012 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1014 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1016 /* Evaluate switch function */
1017 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1018 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1019 velec = _mm256_mul_ps(velec,sw);
1020 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1022 /* Update potential sum for this i atom from the interaction with this j atom. */
1023 velec = _mm256_and_ps(velec,cutoff_mask);
1024 velec = _mm256_andnot_ps(dummy_mask,velec);
1025 velecsum = _mm256_add_ps(velecsum,velec);
1029 fscal = _mm256_and_ps(fscal,cutoff_mask);
1031 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1033 /* Calculate temporary vectorial force */
1034 tx = _mm256_mul_ps(fscal,dx00);
1035 ty = _mm256_mul_ps(fscal,dy00);
1036 tz = _mm256_mul_ps(fscal,dz00);
1038 /* Update vectorial force */
1039 fix0 = _mm256_add_ps(fix0,tx);
1040 fiy0 = _mm256_add_ps(fiy0,ty);
1041 fiz0 = _mm256_add_ps(fiz0,tz);
1043 fjx0 = _mm256_add_ps(fjx0,tx);
1044 fjy0 = _mm256_add_ps(fjy0,ty);
1045 fjz0 = _mm256_add_ps(fjz0,tz);
1049 /**************************
1050 * CALCULATE INTERACTIONS *
1051 **************************/
1053 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1056 r01 = _mm256_mul_ps(rsq01,rinv01);
1057 r01 = _mm256_andnot_ps(dummy_mask,r01);
1059 /* EWALD ELECTROSTATICS */
1061 /* Analytical PME correction */
1062 zeta2 = _mm256_mul_ps(beta2,rsq01);
1063 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
1064 pmecorrF = avx256_pmecorrF_f(zeta2);
1065 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1066 felec = _mm256_mul_ps(qq01,felec);
1067 pmecorrV = avx256_pmecorrV_f(zeta2);
1068 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1069 velec = _mm256_sub_ps(rinv01,pmecorrV);
1070 velec = _mm256_mul_ps(qq01,velec);
1072 d = _mm256_sub_ps(r01,rswitch);
1073 d = _mm256_max_ps(d,_mm256_setzero_ps());
1074 d2 = _mm256_mul_ps(d,d);
1075 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1077 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1079 /* Evaluate switch function */
1080 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1081 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
1082 velec = _mm256_mul_ps(velec,sw);
1083 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1085 /* Update potential sum for this i atom from the interaction with this j atom. */
1086 velec = _mm256_and_ps(velec,cutoff_mask);
1087 velec = _mm256_andnot_ps(dummy_mask,velec);
1088 velecsum = _mm256_add_ps(velecsum,velec);
1092 fscal = _mm256_and_ps(fscal,cutoff_mask);
1094 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1096 /* Calculate temporary vectorial force */
1097 tx = _mm256_mul_ps(fscal,dx01);
1098 ty = _mm256_mul_ps(fscal,dy01);
1099 tz = _mm256_mul_ps(fscal,dz01);
1101 /* Update vectorial force */
1102 fix0 = _mm256_add_ps(fix0,tx);
1103 fiy0 = _mm256_add_ps(fiy0,ty);
1104 fiz0 = _mm256_add_ps(fiz0,tz);
1106 fjx1 = _mm256_add_ps(fjx1,tx);
1107 fjy1 = _mm256_add_ps(fjy1,ty);
1108 fjz1 = _mm256_add_ps(fjz1,tz);
1112 /**************************
1113 * CALCULATE INTERACTIONS *
1114 **************************/
1116 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1119 r02 = _mm256_mul_ps(rsq02,rinv02);
1120 r02 = _mm256_andnot_ps(dummy_mask,r02);
1122 /* EWALD ELECTROSTATICS */
1124 /* Analytical PME correction */
1125 zeta2 = _mm256_mul_ps(beta2,rsq02);
1126 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
1127 pmecorrF = avx256_pmecorrF_f(zeta2);
1128 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1129 felec = _mm256_mul_ps(qq02,felec);
1130 pmecorrV = avx256_pmecorrV_f(zeta2);
1131 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1132 velec = _mm256_sub_ps(rinv02,pmecorrV);
1133 velec = _mm256_mul_ps(qq02,velec);
1135 d = _mm256_sub_ps(r02,rswitch);
1136 d = _mm256_max_ps(d,_mm256_setzero_ps());
1137 d2 = _mm256_mul_ps(d,d);
1138 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1140 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1142 /* Evaluate switch function */
1143 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1144 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
1145 velec = _mm256_mul_ps(velec,sw);
1146 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1148 /* Update potential sum for this i atom from the interaction with this j atom. */
1149 velec = _mm256_and_ps(velec,cutoff_mask);
1150 velec = _mm256_andnot_ps(dummy_mask,velec);
1151 velecsum = _mm256_add_ps(velecsum,velec);
1155 fscal = _mm256_and_ps(fscal,cutoff_mask);
1157 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1159 /* Calculate temporary vectorial force */
1160 tx = _mm256_mul_ps(fscal,dx02);
1161 ty = _mm256_mul_ps(fscal,dy02);
1162 tz = _mm256_mul_ps(fscal,dz02);
1164 /* Update vectorial force */
1165 fix0 = _mm256_add_ps(fix0,tx);
1166 fiy0 = _mm256_add_ps(fiy0,ty);
1167 fiz0 = _mm256_add_ps(fiz0,tz);
1169 fjx2 = _mm256_add_ps(fjx2,tx);
1170 fjy2 = _mm256_add_ps(fjy2,ty);
1171 fjz2 = _mm256_add_ps(fjz2,tz);
1175 /**************************
1176 * CALCULATE INTERACTIONS *
1177 **************************/
1179 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1182 r10 = _mm256_mul_ps(rsq10,rinv10);
1183 r10 = _mm256_andnot_ps(dummy_mask,r10);
1185 /* EWALD ELECTROSTATICS */
1187 /* Analytical PME correction */
1188 zeta2 = _mm256_mul_ps(beta2,rsq10);
1189 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
1190 pmecorrF = avx256_pmecorrF_f(zeta2);
1191 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1192 felec = _mm256_mul_ps(qq10,felec);
1193 pmecorrV = avx256_pmecorrV_f(zeta2);
1194 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1195 velec = _mm256_sub_ps(rinv10,pmecorrV);
1196 velec = _mm256_mul_ps(qq10,velec);
1198 d = _mm256_sub_ps(r10,rswitch);
1199 d = _mm256_max_ps(d,_mm256_setzero_ps());
1200 d2 = _mm256_mul_ps(d,d);
1201 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1203 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1205 /* Evaluate switch function */
1206 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1207 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1208 velec = _mm256_mul_ps(velec,sw);
1209 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1211 /* Update potential sum for this i atom from the interaction with this j atom. */
1212 velec = _mm256_and_ps(velec,cutoff_mask);
1213 velec = _mm256_andnot_ps(dummy_mask,velec);
1214 velecsum = _mm256_add_ps(velecsum,velec);
1218 fscal = _mm256_and_ps(fscal,cutoff_mask);
1220 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1222 /* Calculate temporary vectorial force */
1223 tx = _mm256_mul_ps(fscal,dx10);
1224 ty = _mm256_mul_ps(fscal,dy10);
1225 tz = _mm256_mul_ps(fscal,dz10);
1227 /* Update vectorial force */
1228 fix1 = _mm256_add_ps(fix1,tx);
1229 fiy1 = _mm256_add_ps(fiy1,ty);
1230 fiz1 = _mm256_add_ps(fiz1,tz);
1232 fjx0 = _mm256_add_ps(fjx0,tx);
1233 fjy0 = _mm256_add_ps(fjy0,ty);
1234 fjz0 = _mm256_add_ps(fjz0,tz);
1238 /**************************
1239 * CALCULATE INTERACTIONS *
1240 **************************/
1242 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1245 r11 = _mm256_mul_ps(rsq11,rinv11);
1246 r11 = _mm256_andnot_ps(dummy_mask,r11);
1248 /* EWALD ELECTROSTATICS */
1250 /* Analytical PME correction */
1251 zeta2 = _mm256_mul_ps(beta2,rsq11);
1252 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1253 pmecorrF = avx256_pmecorrF_f(zeta2);
1254 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1255 felec = _mm256_mul_ps(qq11,felec);
1256 pmecorrV = avx256_pmecorrV_f(zeta2);
1257 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1258 velec = _mm256_sub_ps(rinv11,pmecorrV);
1259 velec = _mm256_mul_ps(qq11,velec);
1261 d = _mm256_sub_ps(r11,rswitch);
1262 d = _mm256_max_ps(d,_mm256_setzero_ps());
1263 d2 = _mm256_mul_ps(d,d);
1264 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1266 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1268 /* Evaluate switch function */
1269 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1270 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1271 velec = _mm256_mul_ps(velec,sw);
1272 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1274 /* Update potential sum for this i atom from the interaction with this j atom. */
1275 velec = _mm256_and_ps(velec,cutoff_mask);
1276 velec = _mm256_andnot_ps(dummy_mask,velec);
1277 velecsum = _mm256_add_ps(velecsum,velec);
1281 fscal = _mm256_and_ps(fscal,cutoff_mask);
1283 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1285 /* Calculate temporary vectorial force */
1286 tx = _mm256_mul_ps(fscal,dx11);
1287 ty = _mm256_mul_ps(fscal,dy11);
1288 tz = _mm256_mul_ps(fscal,dz11);
1290 /* Update vectorial force */
1291 fix1 = _mm256_add_ps(fix1,tx);
1292 fiy1 = _mm256_add_ps(fiy1,ty);
1293 fiz1 = _mm256_add_ps(fiz1,tz);
1295 fjx1 = _mm256_add_ps(fjx1,tx);
1296 fjy1 = _mm256_add_ps(fjy1,ty);
1297 fjz1 = _mm256_add_ps(fjz1,tz);
1301 /**************************
1302 * CALCULATE INTERACTIONS *
1303 **************************/
1305 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1308 r12 = _mm256_mul_ps(rsq12,rinv12);
1309 r12 = _mm256_andnot_ps(dummy_mask,r12);
1311 /* EWALD ELECTROSTATICS */
1313 /* Analytical PME correction */
1314 zeta2 = _mm256_mul_ps(beta2,rsq12);
1315 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1316 pmecorrF = avx256_pmecorrF_f(zeta2);
1317 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1318 felec = _mm256_mul_ps(qq12,felec);
1319 pmecorrV = avx256_pmecorrV_f(zeta2);
1320 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1321 velec = _mm256_sub_ps(rinv12,pmecorrV);
1322 velec = _mm256_mul_ps(qq12,velec);
1324 d = _mm256_sub_ps(r12,rswitch);
1325 d = _mm256_max_ps(d,_mm256_setzero_ps());
1326 d2 = _mm256_mul_ps(d,d);
1327 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1329 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1331 /* Evaluate switch function */
1332 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1333 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1334 velec = _mm256_mul_ps(velec,sw);
1335 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1337 /* Update potential sum for this i atom from the interaction with this j atom. */
1338 velec = _mm256_and_ps(velec,cutoff_mask);
1339 velec = _mm256_andnot_ps(dummy_mask,velec);
1340 velecsum = _mm256_add_ps(velecsum,velec);
1344 fscal = _mm256_and_ps(fscal,cutoff_mask);
1346 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1348 /* Calculate temporary vectorial force */
1349 tx = _mm256_mul_ps(fscal,dx12);
1350 ty = _mm256_mul_ps(fscal,dy12);
1351 tz = _mm256_mul_ps(fscal,dz12);
1353 /* Update vectorial force */
1354 fix1 = _mm256_add_ps(fix1,tx);
1355 fiy1 = _mm256_add_ps(fiy1,ty);
1356 fiz1 = _mm256_add_ps(fiz1,tz);
1358 fjx2 = _mm256_add_ps(fjx2,tx);
1359 fjy2 = _mm256_add_ps(fjy2,ty);
1360 fjz2 = _mm256_add_ps(fjz2,tz);
1364 /**************************
1365 * CALCULATE INTERACTIONS *
1366 **************************/
1368 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1371 r20 = _mm256_mul_ps(rsq20,rinv20);
1372 r20 = _mm256_andnot_ps(dummy_mask,r20);
1374 /* EWALD ELECTROSTATICS */
1376 /* Analytical PME correction */
1377 zeta2 = _mm256_mul_ps(beta2,rsq20);
1378 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1379 pmecorrF = avx256_pmecorrF_f(zeta2);
1380 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1381 felec = _mm256_mul_ps(qq20,felec);
1382 pmecorrV = avx256_pmecorrV_f(zeta2);
1383 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1384 velec = _mm256_sub_ps(rinv20,pmecorrV);
1385 velec = _mm256_mul_ps(qq20,velec);
1387 d = _mm256_sub_ps(r20,rswitch);
1388 d = _mm256_max_ps(d,_mm256_setzero_ps());
1389 d2 = _mm256_mul_ps(d,d);
1390 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1392 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1394 /* Evaluate switch function */
1395 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1396 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1397 velec = _mm256_mul_ps(velec,sw);
1398 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1400 /* Update potential sum for this i atom from the interaction with this j atom. */
1401 velec = _mm256_and_ps(velec,cutoff_mask);
1402 velec = _mm256_andnot_ps(dummy_mask,velec);
1403 velecsum = _mm256_add_ps(velecsum,velec);
1407 fscal = _mm256_and_ps(fscal,cutoff_mask);
1409 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1411 /* Calculate temporary vectorial force */
1412 tx = _mm256_mul_ps(fscal,dx20);
1413 ty = _mm256_mul_ps(fscal,dy20);
1414 tz = _mm256_mul_ps(fscal,dz20);
1416 /* Update vectorial force */
1417 fix2 = _mm256_add_ps(fix2,tx);
1418 fiy2 = _mm256_add_ps(fiy2,ty);
1419 fiz2 = _mm256_add_ps(fiz2,tz);
1421 fjx0 = _mm256_add_ps(fjx0,tx);
1422 fjy0 = _mm256_add_ps(fjy0,ty);
1423 fjz0 = _mm256_add_ps(fjz0,tz);
1427 /**************************
1428 * CALCULATE INTERACTIONS *
1429 **************************/
1431 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1434 r21 = _mm256_mul_ps(rsq21,rinv21);
1435 r21 = _mm256_andnot_ps(dummy_mask,r21);
1437 /* EWALD ELECTROSTATICS */
1439 /* Analytical PME correction */
1440 zeta2 = _mm256_mul_ps(beta2,rsq21);
1441 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1442 pmecorrF = avx256_pmecorrF_f(zeta2);
1443 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1444 felec = _mm256_mul_ps(qq21,felec);
1445 pmecorrV = avx256_pmecorrV_f(zeta2);
1446 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1447 velec = _mm256_sub_ps(rinv21,pmecorrV);
1448 velec = _mm256_mul_ps(qq21,velec);
1450 d = _mm256_sub_ps(r21,rswitch);
1451 d = _mm256_max_ps(d,_mm256_setzero_ps());
1452 d2 = _mm256_mul_ps(d,d);
1453 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1455 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1457 /* Evaluate switch function */
1458 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1459 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1460 velec = _mm256_mul_ps(velec,sw);
1461 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1463 /* Update potential sum for this i atom from the interaction with this j atom. */
1464 velec = _mm256_and_ps(velec,cutoff_mask);
1465 velec = _mm256_andnot_ps(dummy_mask,velec);
1466 velecsum = _mm256_add_ps(velecsum,velec);
1470 fscal = _mm256_and_ps(fscal,cutoff_mask);
1472 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1474 /* Calculate temporary vectorial force */
1475 tx = _mm256_mul_ps(fscal,dx21);
1476 ty = _mm256_mul_ps(fscal,dy21);
1477 tz = _mm256_mul_ps(fscal,dz21);
1479 /* Update vectorial force */
1480 fix2 = _mm256_add_ps(fix2,tx);
1481 fiy2 = _mm256_add_ps(fiy2,ty);
1482 fiz2 = _mm256_add_ps(fiz2,tz);
1484 fjx1 = _mm256_add_ps(fjx1,tx);
1485 fjy1 = _mm256_add_ps(fjy1,ty);
1486 fjz1 = _mm256_add_ps(fjz1,tz);
1490 /**************************
1491 * CALCULATE INTERACTIONS *
1492 **************************/
1494 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1497 r22 = _mm256_mul_ps(rsq22,rinv22);
1498 r22 = _mm256_andnot_ps(dummy_mask,r22);
1500 /* EWALD ELECTROSTATICS */
1502 /* Analytical PME correction */
1503 zeta2 = _mm256_mul_ps(beta2,rsq22);
1504 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1505 pmecorrF = avx256_pmecorrF_f(zeta2);
1506 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1507 felec = _mm256_mul_ps(qq22,felec);
1508 pmecorrV = avx256_pmecorrV_f(zeta2);
1509 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1510 velec = _mm256_sub_ps(rinv22,pmecorrV);
1511 velec = _mm256_mul_ps(qq22,velec);
1513 d = _mm256_sub_ps(r22,rswitch);
1514 d = _mm256_max_ps(d,_mm256_setzero_ps());
1515 d2 = _mm256_mul_ps(d,d);
1516 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1518 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1520 /* Evaluate switch function */
1521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1522 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1523 velec = _mm256_mul_ps(velec,sw);
1524 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1526 /* Update potential sum for this i atom from the interaction with this j atom. */
1527 velec = _mm256_and_ps(velec,cutoff_mask);
1528 velec = _mm256_andnot_ps(dummy_mask,velec);
1529 velecsum = _mm256_add_ps(velecsum,velec);
1533 fscal = _mm256_and_ps(fscal,cutoff_mask);
1535 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1537 /* Calculate temporary vectorial force */
1538 tx = _mm256_mul_ps(fscal,dx22);
1539 ty = _mm256_mul_ps(fscal,dy22);
1540 tz = _mm256_mul_ps(fscal,dz22);
1542 /* Update vectorial force */
1543 fix2 = _mm256_add_ps(fix2,tx);
1544 fiy2 = _mm256_add_ps(fiy2,ty);
1545 fiz2 = _mm256_add_ps(fiz2,tz);
1547 fjx2 = _mm256_add_ps(fjx2,tx);
1548 fjy2 = _mm256_add_ps(fjy2,ty);
1549 fjz2 = _mm256_add_ps(fjz2,tz);
1553 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1554 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1555 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1556 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1557 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1558 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1559 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1560 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1562 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1563 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1565 /* Inner loop uses 981 flops */
1568 /* End of innermost loop */
1570 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1571 f+i_coord_offset,fshift+i_shift_offset);
1574 /* Update potential energies */
1575 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1577 /* Increment number of inner iterations */
1578 inneriter += j_index_end - j_index_start;
1580 /* Outer loop uses 19 flops */
1583 /* Increment number of outer iterations */
1586 /* Update outer/inner flops */
1588 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*981);
1591 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single
1592 * Electrostatics interaction: Ewald
1593 * VdW interaction: None
1594 * Geometry: Water3-Water3
1595 * Calculate force/pot: Force
1598 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single
1599 (t_nblist * gmx_restrict nlist,
1600 rvec * gmx_restrict xx,
1601 rvec * gmx_restrict ff,
1602 struct t_forcerec * gmx_restrict fr,
1603 t_mdatoms * gmx_restrict mdatoms,
1604 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1605 t_nrnb * gmx_restrict nrnb)
1607 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1608 * just 0 for non-waters.
1609 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1610 * jnr indices corresponding to data put in the four positions in the SIMD register.
1612 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1613 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1614 int jnrA,jnrB,jnrC,jnrD;
1615 int jnrE,jnrF,jnrG,jnrH;
1616 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1617 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1618 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1619 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1620 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1621 real rcutoff_scalar;
1622 real *shiftvec,*fshift,*x,*f;
1623 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1624 real scratch[4*DIM];
1625 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1626 real * vdwioffsetptr0;
1627 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1628 real * vdwioffsetptr1;
1629 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1630 real * vdwioffsetptr2;
1631 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1632 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1633 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1634 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1635 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1636 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1637 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1638 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1639 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1640 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1641 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1642 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1643 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1644 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1645 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1646 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1647 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1650 __m128i ewitab_lo,ewitab_hi;
1651 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1652 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1654 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1655 real rswitch_scalar,d_scalar;
1656 __m256 dummy_mask,cutoff_mask;
1657 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1658 __m256 one = _mm256_set1_ps(1.0);
1659 __m256 two = _mm256_set1_ps(2.0);
1665 jindex = nlist->jindex;
1667 shiftidx = nlist->shift;
1669 shiftvec = fr->shift_vec[0];
1670 fshift = fr->fshift[0];
1671 facel = _mm256_set1_ps(fr->ic->epsfac);
1672 charge = mdatoms->chargeA;
1674 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1675 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1676 beta2 = _mm256_mul_ps(beta,beta);
1677 beta3 = _mm256_mul_ps(beta,beta2);
1679 ewtab = fr->ic->tabq_coul_FDV0;
1680 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1681 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1683 /* Setup water-specific parameters */
1684 inr = nlist->iinr[0];
1685 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1686 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1687 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1689 jq0 = _mm256_set1_ps(charge[inr+0]);
1690 jq1 = _mm256_set1_ps(charge[inr+1]);
1691 jq2 = _mm256_set1_ps(charge[inr+2]);
1692 qq00 = _mm256_mul_ps(iq0,jq0);
1693 qq01 = _mm256_mul_ps(iq0,jq1);
1694 qq02 = _mm256_mul_ps(iq0,jq2);
1695 qq10 = _mm256_mul_ps(iq1,jq0);
1696 qq11 = _mm256_mul_ps(iq1,jq1);
1697 qq12 = _mm256_mul_ps(iq1,jq2);
1698 qq20 = _mm256_mul_ps(iq2,jq0);
1699 qq21 = _mm256_mul_ps(iq2,jq1);
1700 qq22 = _mm256_mul_ps(iq2,jq2);
1702 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1703 rcutoff_scalar = fr->ic->rcoulomb;
1704 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1705 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1707 rswitch_scalar = fr->ic->rcoulomb_switch;
1708 rswitch = _mm256_set1_ps(rswitch_scalar);
1709 /* Setup switch parameters */
1710 d_scalar = rcutoff_scalar-rswitch_scalar;
1711 d = _mm256_set1_ps(d_scalar);
1712 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1713 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1714 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1715 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1716 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1717 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1719 /* Avoid stupid compiler warnings */
1720 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1721 j_coord_offsetA = 0;
1722 j_coord_offsetB = 0;
1723 j_coord_offsetC = 0;
1724 j_coord_offsetD = 0;
1725 j_coord_offsetE = 0;
1726 j_coord_offsetF = 0;
1727 j_coord_offsetG = 0;
1728 j_coord_offsetH = 0;
1733 for(iidx=0;iidx<4*DIM;iidx++)
1735 scratch[iidx] = 0.0;
1738 /* Start outer loop over neighborlists */
1739 for(iidx=0; iidx<nri; iidx++)
1741 /* Load shift vector for this list */
1742 i_shift_offset = DIM*shiftidx[iidx];
1744 /* Load limits for loop over neighbors */
1745 j_index_start = jindex[iidx];
1746 j_index_end = jindex[iidx+1];
1748 /* Get outer coordinate index */
1750 i_coord_offset = DIM*inr;
1752 /* Load i particle coords and add shift vector */
1753 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1754 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1756 fix0 = _mm256_setzero_ps();
1757 fiy0 = _mm256_setzero_ps();
1758 fiz0 = _mm256_setzero_ps();
1759 fix1 = _mm256_setzero_ps();
1760 fiy1 = _mm256_setzero_ps();
1761 fiz1 = _mm256_setzero_ps();
1762 fix2 = _mm256_setzero_ps();
1763 fiy2 = _mm256_setzero_ps();
1764 fiz2 = _mm256_setzero_ps();
1766 /* Start inner kernel loop */
1767 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1770 /* Get j neighbor index, and coordinate index */
1772 jnrB = jjnr[jidx+1];
1773 jnrC = jjnr[jidx+2];
1774 jnrD = jjnr[jidx+3];
1775 jnrE = jjnr[jidx+4];
1776 jnrF = jjnr[jidx+5];
1777 jnrG = jjnr[jidx+6];
1778 jnrH = jjnr[jidx+7];
1779 j_coord_offsetA = DIM*jnrA;
1780 j_coord_offsetB = DIM*jnrB;
1781 j_coord_offsetC = DIM*jnrC;
1782 j_coord_offsetD = DIM*jnrD;
1783 j_coord_offsetE = DIM*jnrE;
1784 j_coord_offsetF = DIM*jnrF;
1785 j_coord_offsetG = DIM*jnrG;
1786 j_coord_offsetH = DIM*jnrH;
1788 /* load j atom coordinates */
1789 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1790 x+j_coord_offsetC,x+j_coord_offsetD,
1791 x+j_coord_offsetE,x+j_coord_offsetF,
1792 x+j_coord_offsetG,x+j_coord_offsetH,
1793 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1795 /* Calculate displacement vector */
1796 dx00 = _mm256_sub_ps(ix0,jx0);
1797 dy00 = _mm256_sub_ps(iy0,jy0);
1798 dz00 = _mm256_sub_ps(iz0,jz0);
1799 dx01 = _mm256_sub_ps(ix0,jx1);
1800 dy01 = _mm256_sub_ps(iy0,jy1);
1801 dz01 = _mm256_sub_ps(iz0,jz1);
1802 dx02 = _mm256_sub_ps(ix0,jx2);
1803 dy02 = _mm256_sub_ps(iy0,jy2);
1804 dz02 = _mm256_sub_ps(iz0,jz2);
1805 dx10 = _mm256_sub_ps(ix1,jx0);
1806 dy10 = _mm256_sub_ps(iy1,jy0);
1807 dz10 = _mm256_sub_ps(iz1,jz0);
1808 dx11 = _mm256_sub_ps(ix1,jx1);
1809 dy11 = _mm256_sub_ps(iy1,jy1);
1810 dz11 = _mm256_sub_ps(iz1,jz1);
1811 dx12 = _mm256_sub_ps(ix1,jx2);
1812 dy12 = _mm256_sub_ps(iy1,jy2);
1813 dz12 = _mm256_sub_ps(iz1,jz2);
1814 dx20 = _mm256_sub_ps(ix2,jx0);
1815 dy20 = _mm256_sub_ps(iy2,jy0);
1816 dz20 = _mm256_sub_ps(iz2,jz0);
1817 dx21 = _mm256_sub_ps(ix2,jx1);
1818 dy21 = _mm256_sub_ps(iy2,jy1);
1819 dz21 = _mm256_sub_ps(iz2,jz1);
1820 dx22 = _mm256_sub_ps(ix2,jx2);
1821 dy22 = _mm256_sub_ps(iy2,jy2);
1822 dz22 = _mm256_sub_ps(iz2,jz2);
1824 /* Calculate squared distance and things based on it */
1825 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1826 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1827 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1828 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1829 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1830 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1831 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1832 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1833 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1835 rinv00 = avx256_invsqrt_f(rsq00);
1836 rinv01 = avx256_invsqrt_f(rsq01);
1837 rinv02 = avx256_invsqrt_f(rsq02);
1838 rinv10 = avx256_invsqrt_f(rsq10);
1839 rinv11 = avx256_invsqrt_f(rsq11);
1840 rinv12 = avx256_invsqrt_f(rsq12);
1841 rinv20 = avx256_invsqrt_f(rsq20);
1842 rinv21 = avx256_invsqrt_f(rsq21);
1843 rinv22 = avx256_invsqrt_f(rsq22);
1845 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1846 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1847 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1848 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1849 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1850 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1851 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1852 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1853 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1855 fjx0 = _mm256_setzero_ps();
1856 fjy0 = _mm256_setzero_ps();
1857 fjz0 = _mm256_setzero_ps();
1858 fjx1 = _mm256_setzero_ps();
1859 fjy1 = _mm256_setzero_ps();
1860 fjz1 = _mm256_setzero_ps();
1861 fjx2 = _mm256_setzero_ps();
1862 fjy2 = _mm256_setzero_ps();
1863 fjz2 = _mm256_setzero_ps();
1865 /**************************
1866 * CALCULATE INTERACTIONS *
1867 **************************/
1869 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1872 r00 = _mm256_mul_ps(rsq00,rinv00);
1874 /* EWALD ELECTROSTATICS */
1876 /* Analytical PME correction */
1877 zeta2 = _mm256_mul_ps(beta2,rsq00);
1878 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
1879 pmecorrF = avx256_pmecorrF_f(zeta2);
1880 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1881 felec = _mm256_mul_ps(qq00,felec);
1882 pmecorrV = avx256_pmecorrV_f(zeta2);
1883 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1884 velec = _mm256_sub_ps(rinv00,pmecorrV);
1885 velec = _mm256_mul_ps(qq00,velec);
1887 d = _mm256_sub_ps(r00,rswitch);
1888 d = _mm256_max_ps(d,_mm256_setzero_ps());
1889 d2 = _mm256_mul_ps(d,d);
1890 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1892 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1894 /* Evaluate switch function */
1895 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1896 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1897 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1901 fscal = _mm256_and_ps(fscal,cutoff_mask);
1903 /* Calculate temporary vectorial force */
1904 tx = _mm256_mul_ps(fscal,dx00);
1905 ty = _mm256_mul_ps(fscal,dy00);
1906 tz = _mm256_mul_ps(fscal,dz00);
1908 /* Update vectorial force */
1909 fix0 = _mm256_add_ps(fix0,tx);
1910 fiy0 = _mm256_add_ps(fiy0,ty);
1911 fiz0 = _mm256_add_ps(fiz0,tz);
1913 fjx0 = _mm256_add_ps(fjx0,tx);
1914 fjy0 = _mm256_add_ps(fjy0,ty);
1915 fjz0 = _mm256_add_ps(fjz0,tz);
1919 /**************************
1920 * CALCULATE INTERACTIONS *
1921 **************************/
1923 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1926 r01 = _mm256_mul_ps(rsq01,rinv01);
1928 /* EWALD ELECTROSTATICS */
1930 /* Analytical PME correction */
1931 zeta2 = _mm256_mul_ps(beta2,rsq01);
1932 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
1933 pmecorrF = avx256_pmecorrF_f(zeta2);
1934 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1935 felec = _mm256_mul_ps(qq01,felec);
1936 pmecorrV = avx256_pmecorrV_f(zeta2);
1937 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1938 velec = _mm256_sub_ps(rinv01,pmecorrV);
1939 velec = _mm256_mul_ps(qq01,velec);
1941 d = _mm256_sub_ps(r01,rswitch);
1942 d = _mm256_max_ps(d,_mm256_setzero_ps());
1943 d2 = _mm256_mul_ps(d,d);
1944 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1946 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1948 /* Evaluate switch function */
1949 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1950 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
1951 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1955 fscal = _mm256_and_ps(fscal,cutoff_mask);
1957 /* Calculate temporary vectorial force */
1958 tx = _mm256_mul_ps(fscal,dx01);
1959 ty = _mm256_mul_ps(fscal,dy01);
1960 tz = _mm256_mul_ps(fscal,dz01);
1962 /* Update vectorial force */
1963 fix0 = _mm256_add_ps(fix0,tx);
1964 fiy0 = _mm256_add_ps(fiy0,ty);
1965 fiz0 = _mm256_add_ps(fiz0,tz);
1967 fjx1 = _mm256_add_ps(fjx1,tx);
1968 fjy1 = _mm256_add_ps(fjy1,ty);
1969 fjz1 = _mm256_add_ps(fjz1,tz);
1973 /**************************
1974 * CALCULATE INTERACTIONS *
1975 **************************/
1977 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1980 r02 = _mm256_mul_ps(rsq02,rinv02);
1982 /* EWALD ELECTROSTATICS */
1984 /* Analytical PME correction */
1985 zeta2 = _mm256_mul_ps(beta2,rsq02);
1986 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
1987 pmecorrF = avx256_pmecorrF_f(zeta2);
1988 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1989 felec = _mm256_mul_ps(qq02,felec);
1990 pmecorrV = avx256_pmecorrV_f(zeta2);
1991 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1992 velec = _mm256_sub_ps(rinv02,pmecorrV);
1993 velec = _mm256_mul_ps(qq02,velec);
1995 d = _mm256_sub_ps(r02,rswitch);
1996 d = _mm256_max_ps(d,_mm256_setzero_ps());
1997 d2 = _mm256_mul_ps(d,d);
1998 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2000 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2002 /* Evaluate switch function */
2003 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2004 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
2005 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2009 fscal = _mm256_and_ps(fscal,cutoff_mask);
2011 /* Calculate temporary vectorial force */
2012 tx = _mm256_mul_ps(fscal,dx02);
2013 ty = _mm256_mul_ps(fscal,dy02);
2014 tz = _mm256_mul_ps(fscal,dz02);
2016 /* Update vectorial force */
2017 fix0 = _mm256_add_ps(fix0,tx);
2018 fiy0 = _mm256_add_ps(fiy0,ty);
2019 fiz0 = _mm256_add_ps(fiz0,tz);
2021 fjx2 = _mm256_add_ps(fjx2,tx);
2022 fjy2 = _mm256_add_ps(fjy2,ty);
2023 fjz2 = _mm256_add_ps(fjz2,tz);
2027 /**************************
2028 * CALCULATE INTERACTIONS *
2029 **************************/
2031 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2034 r10 = _mm256_mul_ps(rsq10,rinv10);
2036 /* EWALD ELECTROSTATICS */
2038 /* Analytical PME correction */
2039 zeta2 = _mm256_mul_ps(beta2,rsq10);
2040 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
2041 pmecorrF = avx256_pmecorrF_f(zeta2);
2042 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2043 felec = _mm256_mul_ps(qq10,felec);
2044 pmecorrV = avx256_pmecorrV_f(zeta2);
2045 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2046 velec = _mm256_sub_ps(rinv10,pmecorrV);
2047 velec = _mm256_mul_ps(qq10,velec);
2049 d = _mm256_sub_ps(r10,rswitch);
2050 d = _mm256_max_ps(d,_mm256_setzero_ps());
2051 d2 = _mm256_mul_ps(d,d);
2052 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2054 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2056 /* Evaluate switch function */
2057 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2058 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2059 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2063 fscal = _mm256_and_ps(fscal,cutoff_mask);
2065 /* Calculate temporary vectorial force */
2066 tx = _mm256_mul_ps(fscal,dx10);
2067 ty = _mm256_mul_ps(fscal,dy10);
2068 tz = _mm256_mul_ps(fscal,dz10);
2070 /* Update vectorial force */
2071 fix1 = _mm256_add_ps(fix1,tx);
2072 fiy1 = _mm256_add_ps(fiy1,ty);
2073 fiz1 = _mm256_add_ps(fiz1,tz);
2075 fjx0 = _mm256_add_ps(fjx0,tx);
2076 fjy0 = _mm256_add_ps(fjy0,ty);
2077 fjz0 = _mm256_add_ps(fjz0,tz);
2081 /**************************
2082 * CALCULATE INTERACTIONS *
2083 **************************/
2085 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2088 r11 = _mm256_mul_ps(rsq11,rinv11);
2090 /* EWALD ELECTROSTATICS */
2092 /* Analytical PME correction */
2093 zeta2 = _mm256_mul_ps(beta2,rsq11);
2094 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2095 pmecorrF = avx256_pmecorrF_f(zeta2);
2096 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2097 felec = _mm256_mul_ps(qq11,felec);
2098 pmecorrV = avx256_pmecorrV_f(zeta2);
2099 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2100 velec = _mm256_sub_ps(rinv11,pmecorrV);
2101 velec = _mm256_mul_ps(qq11,velec);
2103 d = _mm256_sub_ps(r11,rswitch);
2104 d = _mm256_max_ps(d,_mm256_setzero_ps());
2105 d2 = _mm256_mul_ps(d,d);
2106 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2108 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2110 /* Evaluate switch function */
2111 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2112 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2113 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2117 fscal = _mm256_and_ps(fscal,cutoff_mask);
2119 /* Calculate temporary vectorial force */
2120 tx = _mm256_mul_ps(fscal,dx11);
2121 ty = _mm256_mul_ps(fscal,dy11);
2122 tz = _mm256_mul_ps(fscal,dz11);
2124 /* Update vectorial force */
2125 fix1 = _mm256_add_ps(fix1,tx);
2126 fiy1 = _mm256_add_ps(fiy1,ty);
2127 fiz1 = _mm256_add_ps(fiz1,tz);
2129 fjx1 = _mm256_add_ps(fjx1,tx);
2130 fjy1 = _mm256_add_ps(fjy1,ty);
2131 fjz1 = _mm256_add_ps(fjz1,tz);
2135 /**************************
2136 * CALCULATE INTERACTIONS *
2137 **************************/
2139 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2142 r12 = _mm256_mul_ps(rsq12,rinv12);
2144 /* EWALD ELECTROSTATICS */
2146 /* Analytical PME correction */
2147 zeta2 = _mm256_mul_ps(beta2,rsq12);
2148 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2149 pmecorrF = avx256_pmecorrF_f(zeta2);
2150 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2151 felec = _mm256_mul_ps(qq12,felec);
2152 pmecorrV = avx256_pmecorrV_f(zeta2);
2153 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2154 velec = _mm256_sub_ps(rinv12,pmecorrV);
2155 velec = _mm256_mul_ps(qq12,velec);
2157 d = _mm256_sub_ps(r12,rswitch);
2158 d = _mm256_max_ps(d,_mm256_setzero_ps());
2159 d2 = _mm256_mul_ps(d,d);
2160 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2162 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2164 /* Evaluate switch function */
2165 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2166 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2167 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2171 fscal = _mm256_and_ps(fscal,cutoff_mask);
2173 /* Calculate temporary vectorial force */
2174 tx = _mm256_mul_ps(fscal,dx12);
2175 ty = _mm256_mul_ps(fscal,dy12);
2176 tz = _mm256_mul_ps(fscal,dz12);
2178 /* Update vectorial force */
2179 fix1 = _mm256_add_ps(fix1,tx);
2180 fiy1 = _mm256_add_ps(fiy1,ty);
2181 fiz1 = _mm256_add_ps(fiz1,tz);
2183 fjx2 = _mm256_add_ps(fjx2,tx);
2184 fjy2 = _mm256_add_ps(fjy2,ty);
2185 fjz2 = _mm256_add_ps(fjz2,tz);
2189 /**************************
2190 * CALCULATE INTERACTIONS *
2191 **************************/
2193 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2196 r20 = _mm256_mul_ps(rsq20,rinv20);
2198 /* EWALD ELECTROSTATICS */
2200 /* Analytical PME correction */
2201 zeta2 = _mm256_mul_ps(beta2,rsq20);
2202 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
2203 pmecorrF = avx256_pmecorrF_f(zeta2);
2204 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2205 felec = _mm256_mul_ps(qq20,felec);
2206 pmecorrV = avx256_pmecorrV_f(zeta2);
2207 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2208 velec = _mm256_sub_ps(rinv20,pmecorrV);
2209 velec = _mm256_mul_ps(qq20,velec);
2211 d = _mm256_sub_ps(r20,rswitch);
2212 d = _mm256_max_ps(d,_mm256_setzero_ps());
2213 d2 = _mm256_mul_ps(d,d);
2214 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2216 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2218 /* Evaluate switch function */
2219 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2220 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
2221 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2225 fscal = _mm256_and_ps(fscal,cutoff_mask);
2227 /* Calculate temporary vectorial force */
2228 tx = _mm256_mul_ps(fscal,dx20);
2229 ty = _mm256_mul_ps(fscal,dy20);
2230 tz = _mm256_mul_ps(fscal,dz20);
2232 /* Update vectorial force */
2233 fix2 = _mm256_add_ps(fix2,tx);
2234 fiy2 = _mm256_add_ps(fiy2,ty);
2235 fiz2 = _mm256_add_ps(fiz2,tz);
2237 fjx0 = _mm256_add_ps(fjx0,tx);
2238 fjy0 = _mm256_add_ps(fjy0,ty);
2239 fjz0 = _mm256_add_ps(fjz0,tz);
2243 /**************************
2244 * CALCULATE INTERACTIONS *
2245 **************************/
2247 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2250 r21 = _mm256_mul_ps(rsq21,rinv21);
2252 /* EWALD ELECTROSTATICS */
2254 /* Analytical PME correction */
2255 zeta2 = _mm256_mul_ps(beta2,rsq21);
2256 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2257 pmecorrF = avx256_pmecorrF_f(zeta2);
2258 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2259 felec = _mm256_mul_ps(qq21,felec);
2260 pmecorrV = avx256_pmecorrV_f(zeta2);
2261 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2262 velec = _mm256_sub_ps(rinv21,pmecorrV);
2263 velec = _mm256_mul_ps(qq21,velec);
2265 d = _mm256_sub_ps(r21,rswitch);
2266 d = _mm256_max_ps(d,_mm256_setzero_ps());
2267 d2 = _mm256_mul_ps(d,d);
2268 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2270 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2272 /* Evaluate switch function */
2273 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2274 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2275 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2279 fscal = _mm256_and_ps(fscal,cutoff_mask);
2281 /* Calculate temporary vectorial force */
2282 tx = _mm256_mul_ps(fscal,dx21);
2283 ty = _mm256_mul_ps(fscal,dy21);
2284 tz = _mm256_mul_ps(fscal,dz21);
2286 /* Update vectorial force */
2287 fix2 = _mm256_add_ps(fix2,tx);
2288 fiy2 = _mm256_add_ps(fiy2,ty);
2289 fiz2 = _mm256_add_ps(fiz2,tz);
2291 fjx1 = _mm256_add_ps(fjx1,tx);
2292 fjy1 = _mm256_add_ps(fjy1,ty);
2293 fjz1 = _mm256_add_ps(fjz1,tz);
2297 /**************************
2298 * CALCULATE INTERACTIONS *
2299 **************************/
2301 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2304 r22 = _mm256_mul_ps(rsq22,rinv22);
2306 /* EWALD ELECTROSTATICS */
2308 /* Analytical PME correction */
2309 zeta2 = _mm256_mul_ps(beta2,rsq22);
2310 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2311 pmecorrF = avx256_pmecorrF_f(zeta2);
2312 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2313 felec = _mm256_mul_ps(qq22,felec);
2314 pmecorrV = avx256_pmecorrV_f(zeta2);
2315 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2316 velec = _mm256_sub_ps(rinv22,pmecorrV);
2317 velec = _mm256_mul_ps(qq22,velec);
2319 d = _mm256_sub_ps(r22,rswitch);
2320 d = _mm256_max_ps(d,_mm256_setzero_ps());
2321 d2 = _mm256_mul_ps(d,d);
2322 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2324 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2326 /* Evaluate switch function */
2327 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2328 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2329 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2333 fscal = _mm256_and_ps(fscal,cutoff_mask);
2335 /* Calculate temporary vectorial force */
2336 tx = _mm256_mul_ps(fscal,dx22);
2337 ty = _mm256_mul_ps(fscal,dy22);
2338 tz = _mm256_mul_ps(fscal,dz22);
2340 /* Update vectorial force */
2341 fix2 = _mm256_add_ps(fix2,tx);
2342 fiy2 = _mm256_add_ps(fiy2,ty);
2343 fiz2 = _mm256_add_ps(fiz2,tz);
2345 fjx2 = _mm256_add_ps(fjx2,tx);
2346 fjy2 = _mm256_add_ps(fjy2,ty);
2347 fjz2 = _mm256_add_ps(fjz2,tz);
2351 fjptrA = f+j_coord_offsetA;
2352 fjptrB = f+j_coord_offsetB;
2353 fjptrC = f+j_coord_offsetC;
2354 fjptrD = f+j_coord_offsetD;
2355 fjptrE = f+j_coord_offsetE;
2356 fjptrF = f+j_coord_offsetF;
2357 fjptrG = f+j_coord_offsetG;
2358 fjptrH = f+j_coord_offsetH;
2360 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2361 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2363 /* Inner loop uses 945 flops */
2366 if(jidx<j_index_end)
2369 /* Get j neighbor index, and coordinate index */
2370 jnrlistA = jjnr[jidx];
2371 jnrlistB = jjnr[jidx+1];
2372 jnrlistC = jjnr[jidx+2];
2373 jnrlistD = jjnr[jidx+3];
2374 jnrlistE = jjnr[jidx+4];
2375 jnrlistF = jjnr[jidx+5];
2376 jnrlistG = jjnr[jidx+6];
2377 jnrlistH = jjnr[jidx+7];
2378 /* Sign of each element will be negative for non-real atoms.
2379 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2380 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2382 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2383 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2385 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2386 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2387 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2388 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2389 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2390 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2391 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2392 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2393 j_coord_offsetA = DIM*jnrA;
2394 j_coord_offsetB = DIM*jnrB;
2395 j_coord_offsetC = DIM*jnrC;
2396 j_coord_offsetD = DIM*jnrD;
2397 j_coord_offsetE = DIM*jnrE;
2398 j_coord_offsetF = DIM*jnrF;
2399 j_coord_offsetG = DIM*jnrG;
2400 j_coord_offsetH = DIM*jnrH;
2402 /* load j atom coordinates */
2403 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2404 x+j_coord_offsetC,x+j_coord_offsetD,
2405 x+j_coord_offsetE,x+j_coord_offsetF,
2406 x+j_coord_offsetG,x+j_coord_offsetH,
2407 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2409 /* Calculate displacement vector */
2410 dx00 = _mm256_sub_ps(ix0,jx0);
2411 dy00 = _mm256_sub_ps(iy0,jy0);
2412 dz00 = _mm256_sub_ps(iz0,jz0);
2413 dx01 = _mm256_sub_ps(ix0,jx1);
2414 dy01 = _mm256_sub_ps(iy0,jy1);
2415 dz01 = _mm256_sub_ps(iz0,jz1);
2416 dx02 = _mm256_sub_ps(ix0,jx2);
2417 dy02 = _mm256_sub_ps(iy0,jy2);
2418 dz02 = _mm256_sub_ps(iz0,jz2);
2419 dx10 = _mm256_sub_ps(ix1,jx0);
2420 dy10 = _mm256_sub_ps(iy1,jy0);
2421 dz10 = _mm256_sub_ps(iz1,jz0);
2422 dx11 = _mm256_sub_ps(ix1,jx1);
2423 dy11 = _mm256_sub_ps(iy1,jy1);
2424 dz11 = _mm256_sub_ps(iz1,jz1);
2425 dx12 = _mm256_sub_ps(ix1,jx2);
2426 dy12 = _mm256_sub_ps(iy1,jy2);
2427 dz12 = _mm256_sub_ps(iz1,jz2);
2428 dx20 = _mm256_sub_ps(ix2,jx0);
2429 dy20 = _mm256_sub_ps(iy2,jy0);
2430 dz20 = _mm256_sub_ps(iz2,jz0);
2431 dx21 = _mm256_sub_ps(ix2,jx1);
2432 dy21 = _mm256_sub_ps(iy2,jy1);
2433 dz21 = _mm256_sub_ps(iz2,jz1);
2434 dx22 = _mm256_sub_ps(ix2,jx2);
2435 dy22 = _mm256_sub_ps(iy2,jy2);
2436 dz22 = _mm256_sub_ps(iz2,jz2);
2438 /* Calculate squared distance and things based on it */
2439 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2440 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2441 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2442 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2443 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2444 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2445 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2446 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2447 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2449 rinv00 = avx256_invsqrt_f(rsq00);
2450 rinv01 = avx256_invsqrt_f(rsq01);
2451 rinv02 = avx256_invsqrt_f(rsq02);
2452 rinv10 = avx256_invsqrt_f(rsq10);
2453 rinv11 = avx256_invsqrt_f(rsq11);
2454 rinv12 = avx256_invsqrt_f(rsq12);
2455 rinv20 = avx256_invsqrt_f(rsq20);
2456 rinv21 = avx256_invsqrt_f(rsq21);
2457 rinv22 = avx256_invsqrt_f(rsq22);
2459 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2460 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
2461 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
2462 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
2463 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2464 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2465 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
2466 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2467 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2469 fjx0 = _mm256_setzero_ps();
2470 fjy0 = _mm256_setzero_ps();
2471 fjz0 = _mm256_setzero_ps();
2472 fjx1 = _mm256_setzero_ps();
2473 fjy1 = _mm256_setzero_ps();
2474 fjz1 = _mm256_setzero_ps();
2475 fjx2 = _mm256_setzero_ps();
2476 fjy2 = _mm256_setzero_ps();
2477 fjz2 = _mm256_setzero_ps();
2479 /**************************
2480 * CALCULATE INTERACTIONS *
2481 **************************/
2483 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2486 r00 = _mm256_mul_ps(rsq00,rinv00);
2487 r00 = _mm256_andnot_ps(dummy_mask,r00);
2489 /* EWALD ELECTROSTATICS */
2491 /* Analytical PME correction */
2492 zeta2 = _mm256_mul_ps(beta2,rsq00);
2493 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
2494 pmecorrF = avx256_pmecorrF_f(zeta2);
2495 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2496 felec = _mm256_mul_ps(qq00,felec);
2497 pmecorrV = avx256_pmecorrV_f(zeta2);
2498 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2499 velec = _mm256_sub_ps(rinv00,pmecorrV);
2500 velec = _mm256_mul_ps(qq00,velec);
2502 d = _mm256_sub_ps(r00,rswitch);
2503 d = _mm256_max_ps(d,_mm256_setzero_ps());
2504 d2 = _mm256_mul_ps(d,d);
2505 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2507 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2509 /* Evaluate switch function */
2510 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2511 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
2512 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2516 fscal = _mm256_and_ps(fscal,cutoff_mask);
2518 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2520 /* Calculate temporary vectorial force */
2521 tx = _mm256_mul_ps(fscal,dx00);
2522 ty = _mm256_mul_ps(fscal,dy00);
2523 tz = _mm256_mul_ps(fscal,dz00);
2525 /* Update vectorial force */
2526 fix0 = _mm256_add_ps(fix0,tx);
2527 fiy0 = _mm256_add_ps(fiy0,ty);
2528 fiz0 = _mm256_add_ps(fiz0,tz);
2530 fjx0 = _mm256_add_ps(fjx0,tx);
2531 fjy0 = _mm256_add_ps(fjy0,ty);
2532 fjz0 = _mm256_add_ps(fjz0,tz);
2536 /**************************
2537 * CALCULATE INTERACTIONS *
2538 **************************/
2540 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2543 r01 = _mm256_mul_ps(rsq01,rinv01);
2544 r01 = _mm256_andnot_ps(dummy_mask,r01);
2546 /* EWALD ELECTROSTATICS */
2548 /* Analytical PME correction */
2549 zeta2 = _mm256_mul_ps(beta2,rsq01);
2550 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
2551 pmecorrF = avx256_pmecorrF_f(zeta2);
2552 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2553 felec = _mm256_mul_ps(qq01,felec);
2554 pmecorrV = avx256_pmecorrV_f(zeta2);
2555 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2556 velec = _mm256_sub_ps(rinv01,pmecorrV);
2557 velec = _mm256_mul_ps(qq01,velec);
2559 d = _mm256_sub_ps(r01,rswitch);
2560 d = _mm256_max_ps(d,_mm256_setzero_ps());
2561 d2 = _mm256_mul_ps(d,d);
2562 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2564 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2566 /* Evaluate switch function */
2567 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2568 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
2569 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2573 fscal = _mm256_and_ps(fscal,cutoff_mask);
2575 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2577 /* Calculate temporary vectorial force */
2578 tx = _mm256_mul_ps(fscal,dx01);
2579 ty = _mm256_mul_ps(fscal,dy01);
2580 tz = _mm256_mul_ps(fscal,dz01);
2582 /* Update vectorial force */
2583 fix0 = _mm256_add_ps(fix0,tx);
2584 fiy0 = _mm256_add_ps(fiy0,ty);
2585 fiz0 = _mm256_add_ps(fiz0,tz);
2587 fjx1 = _mm256_add_ps(fjx1,tx);
2588 fjy1 = _mm256_add_ps(fjy1,ty);
2589 fjz1 = _mm256_add_ps(fjz1,tz);
2593 /**************************
2594 * CALCULATE INTERACTIONS *
2595 **************************/
2597 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2600 r02 = _mm256_mul_ps(rsq02,rinv02);
2601 r02 = _mm256_andnot_ps(dummy_mask,r02);
2603 /* EWALD ELECTROSTATICS */
2605 /* Analytical PME correction */
2606 zeta2 = _mm256_mul_ps(beta2,rsq02);
2607 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
2608 pmecorrF = avx256_pmecorrF_f(zeta2);
2609 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2610 felec = _mm256_mul_ps(qq02,felec);
2611 pmecorrV = avx256_pmecorrV_f(zeta2);
2612 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2613 velec = _mm256_sub_ps(rinv02,pmecorrV);
2614 velec = _mm256_mul_ps(qq02,velec);
2616 d = _mm256_sub_ps(r02,rswitch);
2617 d = _mm256_max_ps(d,_mm256_setzero_ps());
2618 d2 = _mm256_mul_ps(d,d);
2619 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2621 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2623 /* Evaluate switch function */
2624 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2625 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
2626 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2630 fscal = _mm256_and_ps(fscal,cutoff_mask);
2632 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2634 /* Calculate temporary vectorial force */
2635 tx = _mm256_mul_ps(fscal,dx02);
2636 ty = _mm256_mul_ps(fscal,dy02);
2637 tz = _mm256_mul_ps(fscal,dz02);
2639 /* Update vectorial force */
2640 fix0 = _mm256_add_ps(fix0,tx);
2641 fiy0 = _mm256_add_ps(fiy0,ty);
2642 fiz0 = _mm256_add_ps(fiz0,tz);
2644 fjx2 = _mm256_add_ps(fjx2,tx);
2645 fjy2 = _mm256_add_ps(fjy2,ty);
2646 fjz2 = _mm256_add_ps(fjz2,tz);
2650 /**************************
2651 * CALCULATE INTERACTIONS *
2652 **************************/
2654 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2657 r10 = _mm256_mul_ps(rsq10,rinv10);
2658 r10 = _mm256_andnot_ps(dummy_mask,r10);
2660 /* EWALD ELECTROSTATICS */
2662 /* Analytical PME correction */
2663 zeta2 = _mm256_mul_ps(beta2,rsq10);
2664 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
2665 pmecorrF = avx256_pmecorrF_f(zeta2);
2666 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2667 felec = _mm256_mul_ps(qq10,felec);
2668 pmecorrV = avx256_pmecorrV_f(zeta2);
2669 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2670 velec = _mm256_sub_ps(rinv10,pmecorrV);
2671 velec = _mm256_mul_ps(qq10,velec);
2673 d = _mm256_sub_ps(r10,rswitch);
2674 d = _mm256_max_ps(d,_mm256_setzero_ps());
2675 d2 = _mm256_mul_ps(d,d);
2676 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2678 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2680 /* Evaluate switch function */
2681 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2682 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2683 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2687 fscal = _mm256_and_ps(fscal,cutoff_mask);
2689 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2691 /* Calculate temporary vectorial force */
2692 tx = _mm256_mul_ps(fscal,dx10);
2693 ty = _mm256_mul_ps(fscal,dy10);
2694 tz = _mm256_mul_ps(fscal,dz10);
2696 /* Update vectorial force */
2697 fix1 = _mm256_add_ps(fix1,tx);
2698 fiy1 = _mm256_add_ps(fiy1,ty);
2699 fiz1 = _mm256_add_ps(fiz1,tz);
2701 fjx0 = _mm256_add_ps(fjx0,tx);
2702 fjy0 = _mm256_add_ps(fjy0,ty);
2703 fjz0 = _mm256_add_ps(fjz0,tz);
2707 /**************************
2708 * CALCULATE INTERACTIONS *
2709 **************************/
2711 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2714 r11 = _mm256_mul_ps(rsq11,rinv11);
2715 r11 = _mm256_andnot_ps(dummy_mask,r11);
2717 /* EWALD ELECTROSTATICS */
2719 /* Analytical PME correction */
2720 zeta2 = _mm256_mul_ps(beta2,rsq11);
2721 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2722 pmecorrF = avx256_pmecorrF_f(zeta2);
2723 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2724 felec = _mm256_mul_ps(qq11,felec);
2725 pmecorrV = avx256_pmecorrV_f(zeta2);
2726 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2727 velec = _mm256_sub_ps(rinv11,pmecorrV);
2728 velec = _mm256_mul_ps(qq11,velec);
2730 d = _mm256_sub_ps(r11,rswitch);
2731 d = _mm256_max_ps(d,_mm256_setzero_ps());
2732 d2 = _mm256_mul_ps(d,d);
2733 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2735 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2737 /* Evaluate switch function */
2738 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2739 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2740 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2744 fscal = _mm256_and_ps(fscal,cutoff_mask);
2746 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2748 /* Calculate temporary vectorial force */
2749 tx = _mm256_mul_ps(fscal,dx11);
2750 ty = _mm256_mul_ps(fscal,dy11);
2751 tz = _mm256_mul_ps(fscal,dz11);
2753 /* Update vectorial force */
2754 fix1 = _mm256_add_ps(fix1,tx);
2755 fiy1 = _mm256_add_ps(fiy1,ty);
2756 fiz1 = _mm256_add_ps(fiz1,tz);
2758 fjx1 = _mm256_add_ps(fjx1,tx);
2759 fjy1 = _mm256_add_ps(fjy1,ty);
2760 fjz1 = _mm256_add_ps(fjz1,tz);
2764 /**************************
2765 * CALCULATE INTERACTIONS *
2766 **************************/
2768 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2771 r12 = _mm256_mul_ps(rsq12,rinv12);
2772 r12 = _mm256_andnot_ps(dummy_mask,r12);
2774 /* EWALD ELECTROSTATICS */
2776 /* Analytical PME correction */
2777 zeta2 = _mm256_mul_ps(beta2,rsq12);
2778 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2779 pmecorrF = avx256_pmecorrF_f(zeta2);
2780 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2781 felec = _mm256_mul_ps(qq12,felec);
2782 pmecorrV = avx256_pmecorrV_f(zeta2);
2783 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2784 velec = _mm256_sub_ps(rinv12,pmecorrV);
2785 velec = _mm256_mul_ps(qq12,velec);
2787 d = _mm256_sub_ps(r12,rswitch);
2788 d = _mm256_max_ps(d,_mm256_setzero_ps());
2789 d2 = _mm256_mul_ps(d,d);
2790 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2792 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2794 /* Evaluate switch function */
2795 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2796 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2797 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2801 fscal = _mm256_and_ps(fscal,cutoff_mask);
2803 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2805 /* Calculate temporary vectorial force */
2806 tx = _mm256_mul_ps(fscal,dx12);
2807 ty = _mm256_mul_ps(fscal,dy12);
2808 tz = _mm256_mul_ps(fscal,dz12);
2810 /* Update vectorial force */
2811 fix1 = _mm256_add_ps(fix1,tx);
2812 fiy1 = _mm256_add_ps(fiy1,ty);
2813 fiz1 = _mm256_add_ps(fiz1,tz);
2815 fjx2 = _mm256_add_ps(fjx2,tx);
2816 fjy2 = _mm256_add_ps(fjy2,ty);
2817 fjz2 = _mm256_add_ps(fjz2,tz);
2821 /**************************
2822 * CALCULATE INTERACTIONS *
2823 **************************/
2825 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2828 r20 = _mm256_mul_ps(rsq20,rinv20);
2829 r20 = _mm256_andnot_ps(dummy_mask,r20);
2831 /* EWALD ELECTROSTATICS */
2833 /* Analytical PME correction */
2834 zeta2 = _mm256_mul_ps(beta2,rsq20);
2835 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
2836 pmecorrF = avx256_pmecorrF_f(zeta2);
2837 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2838 felec = _mm256_mul_ps(qq20,felec);
2839 pmecorrV = avx256_pmecorrV_f(zeta2);
2840 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2841 velec = _mm256_sub_ps(rinv20,pmecorrV);
2842 velec = _mm256_mul_ps(qq20,velec);
2844 d = _mm256_sub_ps(r20,rswitch);
2845 d = _mm256_max_ps(d,_mm256_setzero_ps());
2846 d2 = _mm256_mul_ps(d,d);
2847 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2849 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2851 /* Evaluate switch function */
2852 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2853 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
2854 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2858 fscal = _mm256_and_ps(fscal,cutoff_mask);
2860 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2862 /* Calculate temporary vectorial force */
2863 tx = _mm256_mul_ps(fscal,dx20);
2864 ty = _mm256_mul_ps(fscal,dy20);
2865 tz = _mm256_mul_ps(fscal,dz20);
2867 /* Update vectorial force */
2868 fix2 = _mm256_add_ps(fix2,tx);
2869 fiy2 = _mm256_add_ps(fiy2,ty);
2870 fiz2 = _mm256_add_ps(fiz2,tz);
2872 fjx0 = _mm256_add_ps(fjx0,tx);
2873 fjy0 = _mm256_add_ps(fjy0,ty);
2874 fjz0 = _mm256_add_ps(fjz0,tz);
2878 /**************************
2879 * CALCULATE INTERACTIONS *
2880 **************************/
2882 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2885 r21 = _mm256_mul_ps(rsq21,rinv21);
2886 r21 = _mm256_andnot_ps(dummy_mask,r21);
2888 /* EWALD ELECTROSTATICS */
2890 /* Analytical PME correction */
2891 zeta2 = _mm256_mul_ps(beta2,rsq21);
2892 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2893 pmecorrF = avx256_pmecorrF_f(zeta2);
2894 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2895 felec = _mm256_mul_ps(qq21,felec);
2896 pmecorrV = avx256_pmecorrV_f(zeta2);
2897 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2898 velec = _mm256_sub_ps(rinv21,pmecorrV);
2899 velec = _mm256_mul_ps(qq21,velec);
2901 d = _mm256_sub_ps(r21,rswitch);
2902 d = _mm256_max_ps(d,_mm256_setzero_ps());
2903 d2 = _mm256_mul_ps(d,d);
2904 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2906 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2908 /* Evaluate switch function */
2909 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2910 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2911 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2915 fscal = _mm256_and_ps(fscal,cutoff_mask);
2917 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2919 /* Calculate temporary vectorial force */
2920 tx = _mm256_mul_ps(fscal,dx21);
2921 ty = _mm256_mul_ps(fscal,dy21);
2922 tz = _mm256_mul_ps(fscal,dz21);
2924 /* Update vectorial force */
2925 fix2 = _mm256_add_ps(fix2,tx);
2926 fiy2 = _mm256_add_ps(fiy2,ty);
2927 fiz2 = _mm256_add_ps(fiz2,tz);
2929 fjx1 = _mm256_add_ps(fjx1,tx);
2930 fjy1 = _mm256_add_ps(fjy1,ty);
2931 fjz1 = _mm256_add_ps(fjz1,tz);
2935 /**************************
2936 * CALCULATE INTERACTIONS *
2937 **************************/
2939 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2942 r22 = _mm256_mul_ps(rsq22,rinv22);
2943 r22 = _mm256_andnot_ps(dummy_mask,r22);
2945 /* EWALD ELECTROSTATICS */
2947 /* Analytical PME correction */
2948 zeta2 = _mm256_mul_ps(beta2,rsq22);
2949 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2950 pmecorrF = avx256_pmecorrF_f(zeta2);
2951 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2952 felec = _mm256_mul_ps(qq22,felec);
2953 pmecorrV = avx256_pmecorrV_f(zeta2);
2954 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2955 velec = _mm256_sub_ps(rinv22,pmecorrV);
2956 velec = _mm256_mul_ps(qq22,velec);
2958 d = _mm256_sub_ps(r22,rswitch);
2959 d = _mm256_max_ps(d,_mm256_setzero_ps());
2960 d2 = _mm256_mul_ps(d,d);
2961 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2963 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2965 /* Evaluate switch function */
2966 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2967 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2968 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2972 fscal = _mm256_and_ps(fscal,cutoff_mask);
2974 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2976 /* Calculate temporary vectorial force */
2977 tx = _mm256_mul_ps(fscal,dx22);
2978 ty = _mm256_mul_ps(fscal,dy22);
2979 tz = _mm256_mul_ps(fscal,dz22);
2981 /* Update vectorial force */
2982 fix2 = _mm256_add_ps(fix2,tx);
2983 fiy2 = _mm256_add_ps(fiy2,ty);
2984 fiz2 = _mm256_add_ps(fiz2,tz);
2986 fjx2 = _mm256_add_ps(fjx2,tx);
2987 fjy2 = _mm256_add_ps(fjy2,ty);
2988 fjz2 = _mm256_add_ps(fjz2,tz);
2992 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2993 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2994 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2995 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2996 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2997 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2998 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2999 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3001 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
3002 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3004 /* Inner loop uses 954 flops */
3007 /* End of innermost loop */
3009 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3010 f+i_coord_offset,fshift+i_shift_offset);
3012 /* Increment number of inner iterations */
3013 inneriter += j_index_end - j_index_start;
3015 /* Outer loop uses 18 flops */
3018 /* Increment number of outer iterations */
3021 /* Update outer/inner flops */
3023 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*954);