2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128i ewitab_lo,ewitab_hi;
97 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
100 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
101 real rswitch_scalar,d_scalar;
102 __m256 dummy_mask,cutoff_mask;
103 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
104 __m256 one = _mm256_set1_ps(1.0);
105 __m256 two = _mm256_set1_ps(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm256_set1_ps(fr->epsfac);
118 charge = mdatoms->chargeA;
120 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
121 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
122 beta2 = _mm256_mul_ps(beta,beta);
123 beta3 = _mm256_mul_ps(beta,beta2);
125 ewtab = fr->ic->tabq_coul_FDV0;
126 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
127 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
129 /* Setup water-specific parameters */
130 inr = nlist->iinr[0];
131 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
132 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
133 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
135 jq0 = _mm256_set1_ps(charge[inr+0]);
136 jq1 = _mm256_set1_ps(charge[inr+1]);
137 jq2 = _mm256_set1_ps(charge[inr+2]);
138 qq00 = _mm256_mul_ps(iq0,jq0);
139 qq01 = _mm256_mul_ps(iq0,jq1);
140 qq02 = _mm256_mul_ps(iq0,jq2);
141 qq10 = _mm256_mul_ps(iq1,jq0);
142 qq11 = _mm256_mul_ps(iq1,jq1);
143 qq12 = _mm256_mul_ps(iq1,jq2);
144 qq20 = _mm256_mul_ps(iq2,jq0);
145 qq21 = _mm256_mul_ps(iq2,jq1);
146 qq22 = _mm256_mul_ps(iq2,jq2);
148 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
149 rcutoff_scalar = fr->rcoulomb;
150 rcutoff = _mm256_set1_ps(rcutoff_scalar);
151 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
153 rswitch_scalar = fr->rcoulomb_switch;
154 rswitch = _mm256_set1_ps(rswitch_scalar);
155 /* Setup switch parameters */
156 d_scalar = rcutoff_scalar-rswitch_scalar;
157 d = _mm256_set1_ps(d_scalar);
158 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
159 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
160 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
161 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
162 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
163 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165 /* Avoid stupid compiler warnings */
166 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
179 for(iidx=0;iidx<4*DIM;iidx++)
184 /* Start outer loop over neighborlists */
185 for(iidx=0; iidx<nri; iidx++)
187 /* Load shift vector for this list */
188 i_shift_offset = DIM*shiftidx[iidx];
190 /* Load limits for loop over neighbors */
191 j_index_start = jindex[iidx];
192 j_index_end = jindex[iidx+1];
194 /* Get outer coordinate index */
196 i_coord_offset = DIM*inr;
198 /* Load i particle coords and add shift vector */
199 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
200 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
202 fix0 = _mm256_setzero_ps();
203 fiy0 = _mm256_setzero_ps();
204 fiz0 = _mm256_setzero_ps();
205 fix1 = _mm256_setzero_ps();
206 fiy1 = _mm256_setzero_ps();
207 fiz1 = _mm256_setzero_ps();
208 fix2 = _mm256_setzero_ps();
209 fiy2 = _mm256_setzero_ps();
210 fiz2 = _mm256_setzero_ps();
212 /* Reset potential sums */
213 velecsum = _mm256_setzero_ps();
215 /* Start inner kernel loop */
216 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
219 /* Get j neighbor index, and coordinate index */
228 j_coord_offsetA = DIM*jnrA;
229 j_coord_offsetB = DIM*jnrB;
230 j_coord_offsetC = DIM*jnrC;
231 j_coord_offsetD = DIM*jnrD;
232 j_coord_offsetE = DIM*jnrE;
233 j_coord_offsetF = DIM*jnrF;
234 j_coord_offsetG = DIM*jnrG;
235 j_coord_offsetH = DIM*jnrH;
237 /* load j atom coordinates */
238 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
239 x+j_coord_offsetC,x+j_coord_offsetD,
240 x+j_coord_offsetE,x+j_coord_offsetF,
241 x+j_coord_offsetG,x+j_coord_offsetH,
242 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
244 /* Calculate displacement vector */
245 dx00 = _mm256_sub_ps(ix0,jx0);
246 dy00 = _mm256_sub_ps(iy0,jy0);
247 dz00 = _mm256_sub_ps(iz0,jz0);
248 dx01 = _mm256_sub_ps(ix0,jx1);
249 dy01 = _mm256_sub_ps(iy0,jy1);
250 dz01 = _mm256_sub_ps(iz0,jz1);
251 dx02 = _mm256_sub_ps(ix0,jx2);
252 dy02 = _mm256_sub_ps(iy0,jy2);
253 dz02 = _mm256_sub_ps(iz0,jz2);
254 dx10 = _mm256_sub_ps(ix1,jx0);
255 dy10 = _mm256_sub_ps(iy1,jy0);
256 dz10 = _mm256_sub_ps(iz1,jz0);
257 dx11 = _mm256_sub_ps(ix1,jx1);
258 dy11 = _mm256_sub_ps(iy1,jy1);
259 dz11 = _mm256_sub_ps(iz1,jz1);
260 dx12 = _mm256_sub_ps(ix1,jx2);
261 dy12 = _mm256_sub_ps(iy1,jy2);
262 dz12 = _mm256_sub_ps(iz1,jz2);
263 dx20 = _mm256_sub_ps(ix2,jx0);
264 dy20 = _mm256_sub_ps(iy2,jy0);
265 dz20 = _mm256_sub_ps(iz2,jz0);
266 dx21 = _mm256_sub_ps(ix2,jx1);
267 dy21 = _mm256_sub_ps(iy2,jy1);
268 dz21 = _mm256_sub_ps(iz2,jz1);
269 dx22 = _mm256_sub_ps(ix2,jx2);
270 dy22 = _mm256_sub_ps(iy2,jy2);
271 dz22 = _mm256_sub_ps(iz2,jz2);
273 /* Calculate squared distance and things based on it */
274 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
275 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
276 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
277 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
278 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
279 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
280 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
281 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
282 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
284 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
285 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
286 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
287 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
288 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
289 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
290 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
291 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
292 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
294 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
295 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
296 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
297 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
298 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
299 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
300 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
301 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
302 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
304 fjx0 = _mm256_setzero_ps();
305 fjy0 = _mm256_setzero_ps();
306 fjz0 = _mm256_setzero_ps();
307 fjx1 = _mm256_setzero_ps();
308 fjy1 = _mm256_setzero_ps();
309 fjz1 = _mm256_setzero_ps();
310 fjx2 = _mm256_setzero_ps();
311 fjy2 = _mm256_setzero_ps();
312 fjz2 = _mm256_setzero_ps();
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 if (gmx_mm256_any_lt(rsq00,rcutoff2))
321 r00 = _mm256_mul_ps(rsq00,rinv00);
323 /* EWALD ELECTROSTATICS */
325 /* Analytical PME correction */
326 zeta2 = _mm256_mul_ps(beta2,rsq00);
327 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
328 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
329 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
330 felec = _mm256_mul_ps(qq00,felec);
331 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
332 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
333 velec = _mm256_sub_ps(rinv00,pmecorrV);
334 velec = _mm256_mul_ps(qq00,velec);
336 d = _mm256_sub_ps(r00,rswitch);
337 d = _mm256_max_ps(d,_mm256_setzero_ps());
338 d2 = _mm256_mul_ps(d,d);
339 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
341 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
343 /* Evaluate switch function */
344 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
345 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
346 velec = _mm256_mul_ps(velec,sw);
347 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velec = _mm256_and_ps(velec,cutoff_mask);
351 velecsum = _mm256_add_ps(velecsum,velec);
355 fscal = _mm256_and_ps(fscal,cutoff_mask);
357 /* Calculate temporary vectorial force */
358 tx = _mm256_mul_ps(fscal,dx00);
359 ty = _mm256_mul_ps(fscal,dy00);
360 tz = _mm256_mul_ps(fscal,dz00);
362 /* Update vectorial force */
363 fix0 = _mm256_add_ps(fix0,tx);
364 fiy0 = _mm256_add_ps(fiy0,ty);
365 fiz0 = _mm256_add_ps(fiz0,tz);
367 fjx0 = _mm256_add_ps(fjx0,tx);
368 fjy0 = _mm256_add_ps(fjy0,ty);
369 fjz0 = _mm256_add_ps(fjz0,tz);
373 /**************************
374 * CALCULATE INTERACTIONS *
375 **************************/
377 if (gmx_mm256_any_lt(rsq01,rcutoff2))
380 r01 = _mm256_mul_ps(rsq01,rinv01);
382 /* EWALD ELECTROSTATICS */
384 /* Analytical PME correction */
385 zeta2 = _mm256_mul_ps(beta2,rsq01);
386 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
387 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
388 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
389 felec = _mm256_mul_ps(qq01,felec);
390 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
391 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
392 velec = _mm256_sub_ps(rinv01,pmecorrV);
393 velec = _mm256_mul_ps(qq01,velec);
395 d = _mm256_sub_ps(r01,rswitch);
396 d = _mm256_max_ps(d,_mm256_setzero_ps());
397 d2 = _mm256_mul_ps(d,d);
398 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
400 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
402 /* Evaluate switch function */
403 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
404 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
405 velec = _mm256_mul_ps(velec,sw);
406 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
408 /* Update potential sum for this i atom from the interaction with this j atom. */
409 velec = _mm256_and_ps(velec,cutoff_mask);
410 velecsum = _mm256_add_ps(velecsum,velec);
414 fscal = _mm256_and_ps(fscal,cutoff_mask);
416 /* Calculate temporary vectorial force */
417 tx = _mm256_mul_ps(fscal,dx01);
418 ty = _mm256_mul_ps(fscal,dy01);
419 tz = _mm256_mul_ps(fscal,dz01);
421 /* Update vectorial force */
422 fix0 = _mm256_add_ps(fix0,tx);
423 fiy0 = _mm256_add_ps(fiy0,ty);
424 fiz0 = _mm256_add_ps(fiz0,tz);
426 fjx1 = _mm256_add_ps(fjx1,tx);
427 fjy1 = _mm256_add_ps(fjy1,ty);
428 fjz1 = _mm256_add_ps(fjz1,tz);
432 /**************************
433 * CALCULATE INTERACTIONS *
434 **************************/
436 if (gmx_mm256_any_lt(rsq02,rcutoff2))
439 r02 = _mm256_mul_ps(rsq02,rinv02);
441 /* EWALD ELECTROSTATICS */
443 /* Analytical PME correction */
444 zeta2 = _mm256_mul_ps(beta2,rsq02);
445 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
446 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
447 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
448 felec = _mm256_mul_ps(qq02,felec);
449 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
450 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
451 velec = _mm256_sub_ps(rinv02,pmecorrV);
452 velec = _mm256_mul_ps(qq02,velec);
454 d = _mm256_sub_ps(r02,rswitch);
455 d = _mm256_max_ps(d,_mm256_setzero_ps());
456 d2 = _mm256_mul_ps(d,d);
457 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
459 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
461 /* Evaluate switch function */
462 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
463 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
464 velec = _mm256_mul_ps(velec,sw);
465 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velec = _mm256_and_ps(velec,cutoff_mask);
469 velecsum = _mm256_add_ps(velecsum,velec);
473 fscal = _mm256_and_ps(fscal,cutoff_mask);
475 /* Calculate temporary vectorial force */
476 tx = _mm256_mul_ps(fscal,dx02);
477 ty = _mm256_mul_ps(fscal,dy02);
478 tz = _mm256_mul_ps(fscal,dz02);
480 /* Update vectorial force */
481 fix0 = _mm256_add_ps(fix0,tx);
482 fiy0 = _mm256_add_ps(fiy0,ty);
483 fiz0 = _mm256_add_ps(fiz0,tz);
485 fjx2 = _mm256_add_ps(fjx2,tx);
486 fjy2 = _mm256_add_ps(fjy2,ty);
487 fjz2 = _mm256_add_ps(fjz2,tz);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 if (gmx_mm256_any_lt(rsq10,rcutoff2))
498 r10 = _mm256_mul_ps(rsq10,rinv10);
500 /* EWALD ELECTROSTATICS */
502 /* Analytical PME correction */
503 zeta2 = _mm256_mul_ps(beta2,rsq10);
504 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
505 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
506 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
507 felec = _mm256_mul_ps(qq10,felec);
508 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
509 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
510 velec = _mm256_sub_ps(rinv10,pmecorrV);
511 velec = _mm256_mul_ps(qq10,velec);
513 d = _mm256_sub_ps(r10,rswitch);
514 d = _mm256_max_ps(d,_mm256_setzero_ps());
515 d2 = _mm256_mul_ps(d,d);
516 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
518 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
520 /* Evaluate switch function */
521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
522 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
523 velec = _mm256_mul_ps(velec,sw);
524 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
526 /* Update potential sum for this i atom from the interaction with this j atom. */
527 velec = _mm256_and_ps(velec,cutoff_mask);
528 velecsum = _mm256_add_ps(velecsum,velec);
532 fscal = _mm256_and_ps(fscal,cutoff_mask);
534 /* Calculate temporary vectorial force */
535 tx = _mm256_mul_ps(fscal,dx10);
536 ty = _mm256_mul_ps(fscal,dy10);
537 tz = _mm256_mul_ps(fscal,dz10);
539 /* Update vectorial force */
540 fix1 = _mm256_add_ps(fix1,tx);
541 fiy1 = _mm256_add_ps(fiy1,ty);
542 fiz1 = _mm256_add_ps(fiz1,tz);
544 fjx0 = _mm256_add_ps(fjx0,tx);
545 fjy0 = _mm256_add_ps(fjy0,ty);
546 fjz0 = _mm256_add_ps(fjz0,tz);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 if (gmx_mm256_any_lt(rsq11,rcutoff2))
557 r11 = _mm256_mul_ps(rsq11,rinv11);
559 /* EWALD ELECTROSTATICS */
561 /* Analytical PME correction */
562 zeta2 = _mm256_mul_ps(beta2,rsq11);
563 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
564 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
565 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
566 felec = _mm256_mul_ps(qq11,felec);
567 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
568 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
569 velec = _mm256_sub_ps(rinv11,pmecorrV);
570 velec = _mm256_mul_ps(qq11,velec);
572 d = _mm256_sub_ps(r11,rswitch);
573 d = _mm256_max_ps(d,_mm256_setzero_ps());
574 d2 = _mm256_mul_ps(d,d);
575 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
577 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
579 /* Evaluate switch function */
580 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
581 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
582 velec = _mm256_mul_ps(velec,sw);
583 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velec = _mm256_and_ps(velec,cutoff_mask);
587 velecsum = _mm256_add_ps(velecsum,velec);
591 fscal = _mm256_and_ps(fscal,cutoff_mask);
593 /* Calculate temporary vectorial force */
594 tx = _mm256_mul_ps(fscal,dx11);
595 ty = _mm256_mul_ps(fscal,dy11);
596 tz = _mm256_mul_ps(fscal,dz11);
598 /* Update vectorial force */
599 fix1 = _mm256_add_ps(fix1,tx);
600 fiy1 = _mm256_add_ps(fiy1,ty);
601 fiz1 = _mm256_add_ps(fiz1,tz);
603 fjx1 = _mm256_add_ps(fjx1,tx);
604 fjy1 = _mm256_add_ps(fjy1,ty);
605 fjz1 = _mm256_add_ps(fjz1,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 if (gmx_mm256_any_lt(rsq12,rcutoff2))
616 r12 = _mm256_mul_ps(rsq12,rinv12);
618 /* EWALD ELECTROSTATICS */
620 /* Analytical PME correction */
621 zeta2 = _mm256_mul_ps(beta2,rsq12);
622 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
623 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
624 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
625 felec = _mm256_mul_ps(qq12,felec);
626 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
627 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
628 velec = _mm256_sub_ps(rinv12,pmecorrV);
629 velec = _mm256_mul_ps(qq12,velec);
631 d = _mm256_sub_ps(r12,rswitch);
632 d = _mm256_max_ps(d,_mm256_setzero_ps());
633 d2 = _mm256_mul_ps(d,d);
634 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
636 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
638 /* Evaluate switch function */
639 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
640 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
641 velec = _mm256_mul_ps(velec,sw);
642 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velec = _mm256_and_ps(velec,cutoff_mask);
646 velecsum = _mm256_add_ps(velecsum,velec);
650 fscal = _mm256_and_ps(fscal,cutoff_mask);
652 /* Calculate temporary vectorial force */
653 tx = _mm256_mul_ps(fscal,dx12);
654 ty = _mm256_mul_ps(fscal,dy12);
655 tz = _mm256_mul_ps(fscal,dz12);
657 /* Update vectorial force */
658 fix1 = _mm256_add_ps(fix1,tx);
659 fiy1 = _mm256_add_ps(fiy1,ty);
660 fiz1 = _mm256_add_ps(fiz1,tz);
662 fjx2 = _mm256_add_ps(fjx2,tx);
663 fjy2 = _mm256_add_ps(fjy2,ty);
664 fjz2 = _mm256_add_ps(fjz2,tz);
668 /**************************
669 * CALCULATE INTERACTIONS *
670 **************************/
672 if (gmx_mm256_any_lt(rsq20,rcutoff2))
675 r20 = _mm256_mul_ps(rsq20,rinv20);
677 /* EWALD ELECTROSTATICS */
679 /* Analytical PME correction */
680 zeta2 = _mm256_mul_ps(beta2,rsq20);
681 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
682 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
683 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
684 felec = _mm256_mul_ps(qq20,felec);
685 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
686 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
687 velec = _mm256_sub_ps(rinv20,pmecorrV);
688 velec = _mm256_mul_ps(qq20,velec);
690 d = _mm256_sub_ps(r20,rswitch);
691 d = _mm256_max_ps(d,_mm256_setzero_ps());
692 d2 = _mm256_mul_ps(d,d);
693 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
695 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
697 /* Evaluate switch function */
698 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
699 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
700 velec = _mm256_mul_ps(velec,sw);
701 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
703 /* Update potential sum for this i atom from the interaction with this j atom. */
704 velec = _mm256_and_ps(velec,cutoff_mask);
705 velecsum = _mm256_add_ps(velecsum,velec);
709 fscal = _mm256_and_ps(fscal,cutoff_mask);
711 /* Calculate temporary vectorial force */
712 tx = _mm256_mul_ps(fscal,dx20);
713 ty = _mm256_mul_ps(fscal,dy20);
714 tz = _mm256_mul_ps(fscal,dz20);
716 /* Update vectorial force */
717 fix2 = _mm256_add_ps(fix2,tx);
718 fiy2 = _mm256_add_ps(fiy2,ty);
719 fiz2 = _mm256_add_ps(fiz2,tz);
721 fjx0 = _mm256_add_ps(fjx0,tx);
722 fjy0 = _mm256_add_ps(fjy0,ty);
723 fjz0 = _mm256_add_ps(fjz0,tz);
727 /**************************
728 * CALCULATE INTERACTIONS *
729 **************************/
731 if (gmx_mm256_any_lt(rsq21,rcutoff2))
734 r21 = _mm256_mul_ps(rsq21,rinv21);
736 /* EWALD ELECTROSTATICS */
738 /* Analytical PME correction */
739 zeta2 = _mm256_mul_ps(beta2,rsq21);
740 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
741 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
742 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
743 felec = _mm256_mul_ps(qq21,felec);
744 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
745 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
746 velec = _mm256_sub_ps(rinv21,pmecorrV);
747 velec = _mm256_mul_ps(qq21,velec);
749 d = _mm256_sub_ps(r21,rswitch);
750 d = _mm256_max_ps(d,_mm256_setzero_ps());
751 d2 = _mm256_mul_ps(d,d);
752 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
754 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
756 /* Evaluate switch function */
757 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
758 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
759 velec = _mm256_mul_ps(velec,sw);
760 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
762 /* Update potential sum for this i atom from the interaction with this j atom. */
763 velec = _mm256_and_ps(velec,cutoff_mask);
764 velecsum = _mm256_add_ps(velecsum,velec);
768 fscal = _mm256_and_ps(fscal,cutoff_mask);
770 /* Calculate temporary vectorial force */
771 tx = _mm256_mul_ps(fscal,dx21);
772 ty = _mm256_mul_ps(fscal,dy21);
773 tz = _mm256_mul_ps(fscal,dz21);
775 /* Update vectorial force */
776 fix2 = _mm256_add_ps(fix2,tx);
777 fiy2 = _mm256_add_ps(fiy2,ty);
778 fiz2 = _mm256_add_ps(fiz2,tz);
780 fjx1 = _mm256_add_ps(fjx1,tx);
781 fjy1 = _mm256_add_ps(fjy1,ty);
782 fjz1 = _mm256_add_ps(fjz1,tz);
786 /**************************
787 * CALCULATE INTERACTIONS *
788 **************************/
790 if (gmx_mm256_any_lt(rsq22,rcutoff2))
793 r22 = _mm256_mul_ps(rsq22,rinv22);
795 /* EWALD ELECTROSTATICS */
797 /* Analytical PME correction */
798 zeta2 = _mm256_mul_ps(beta2,rsq22);
799 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
800 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
801 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
802 felec = _mm256_mul_ps(qq22,felec);
803 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
804 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
805 velec = _mm256_sub_ps(rinv22,pmecorrV);
806 velec = _mm256_mul_ps(qq22,velec);
808 d = _mm256_sub_ps(r22,rswitch);
809 d = _mm256_max_ps(d,_mm256_setzero_ps());
810 d2 = _mm256_mul_ps(d,d);
811 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
813 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
815 /* Evaluate switch function */
816 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
817 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
818 velec = _mm256_mul_ps(velec,sw);
819 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
821 /* Update potential sum for this i atom from the interaction with this j atom. */
822 velec = _mm256_and_ps(velec,cutoff_mask);
823 velecsum = _mm256_add_ps(velecsum,velec);
827 fscal = _mm256_and_ps(fscal,cutoff_mask);
829 /* Calculate temporary vectorial force */
830 tx = _mm256_mul_ps(fscal,dx22);
831 ty = _mm256_mul_ps(fscal,dy22);
832 tz = _mm256_mul_ps(fscal,dz22);
834 /* Update vectorial force */
835 fix2 = _mm256_add_ps(fix2,tx);
836 fiy2 = _mm256_add_ps(fiy2,ty);
837 fiz2 = _mm256_add_ps(fiz2,tz);
839 fjx2 = _mm256_add_ps(fjx2,tx);
840 fjy2 = _mm256_add_ps(fjy2,ty);
841 fjz2 = _mm256_add_ps(fjz2,tz);
845 fjptrA = f+j_coord_offsetA;
846 fjptrB = f+j_coord_offsetB;
847 fjptrC = f+j_coord_offsetC;
848 fjptrD = f+j_coord_offsetD;
849 fjptrE = f+j_coord_offsetE;
850 fjptrF = f+j_coord_offsetF;
851 fjptrG = f+j_coord_offsetG;
852 fjptrH = f+j_coord_offsetH;
854 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
855 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
857 /* Inner loop uses 972 flops */
863 /* Get j neighbor index, and coordinate index */
864 jnrlistA = jjnr[jidx];
865 jnrlistB = jjnr[jidx+1];
866 jnrlistC = jjnr[jidx+2];
867 jnrlistD = jjnr[jidx+3];
868 jnrlistE = jjnr[jidx+4];
869 jnrlistF = jjnr[jidx+5];
870 jnrlistG = jjnr[jidx+6];
871 jnrlistH = jjnr[jidx+7];
872 /* Sign of each element will be negative for non-real atoms.
873 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
874 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
876 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
877 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
879 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
880 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
881 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
882 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
883 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
884 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
885 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
886 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
887 j_coord_offsetA = DIM*jnrA;
888 j_coord_offsetB = DIM*jnrB;
889 j_coord_offsetC = DIM*jnrC;
890 j_coord_offsetD = DIM*jnrD;
891 j_coord_offsetE = DIM*jnrE;
892 j_coord_offsetF = DIM*jnrF;
893 j_coord_offsetG = DIM*jnrG;
894 j_coord_offsetH = DIM*jnrH;
896 /* load j atom coordinates */
897 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
898 x+j_coord_offsetC,x+j_coord_offsetD,
899 x+j_coord_offsetE,x+j_coord_offsetF,
900 x+j_coord_offsetG,x+j_coord_offsetH,
901 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
903 /* Calculate displacement vector */
904 dx00 = _mm256_sub_ps(ix0,jx0);
905 dy00 = _mm256_sub_ps(iy0,jy0);
906 dz00 = _mm256_sub_ps(iz0,jz0);
907 dx01 = _mm256_sub_ps(ix0,jx1);
908 dy01 = _mm256_sub_ps(iy0,jy1);
909 dz01 = _mm256_sub_ps(iz0,jz1);
910 dx02 = _mm256_sub_ps(ix0,jx2);
911 dy02 = _mm256_sub_ps(iy0,jy2);
912 dz02 = _mm256_sub_ps(iz0,jz2);
913 dx10 = _mm256_sub_ps(ix1,jx0);
914 dy10 = _mm256_sub_ps(iy1,jy0);
915 dz10 = _mm256_sub_ps(iz1,jz0);
916 dx11 = _mm256_sub_ps(ix1,jx1);
917 dy11 = _mm256_sub_ps(iy1,jy1);
918 dz11 = _mm256_sub_ps(iz1,jz1);
919 dx12 = _mm256_sub_ps(ix1,jx2);
920 dy12 = _mm256_sub_ps(iy1,jy2);
921 dz12 = _mm256_sub_ps(iz1,jz2);
922 dx20 = _mm256_sub_ps(ix2,jx0);
923 dy20 = _mm256_sub_ps(iy2,jy0);
924 dz20 = _mm256_sub_ps(iz2,jz0);
925 dx21 = _mm256_sub_ps(ix2,jx1);
926 dy21 = _mm256_sub_ps(iy2,jy1);
927 dz21 = _mm256_sub_ps(iz2,jz1);
928 dx22 = _mm256_sub_ps(ix2,jx2);
929 dy22 = _mm256_sub_ps(iy2,jy2);
930 dz22 = _mm256_sub_ps(iz2,jz2);
932 /* Calculate squared distance and things based on it */
933 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
934 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
935 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
936 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
937 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
938 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
939 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
940 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
941 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
943 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
944 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
945 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
946 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
947 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
948 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
949 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
950 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
951 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
953 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
954 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
955 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
956 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
957 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
958 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
959 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
960 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
961 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
963 fjx0 = _mm256_setzero_ps();
964 fjy0 = _mm256_setzero_ps();
965 fjz0 = _mm256_setzero_ps();
966 fjx1 = _mm256_setzero_ps();
967 fjy1 = _mm256_setzero_ps();
968 fjz1 = _mm256_setzero_ps();
969 fjx2 = _mm256_setzero_ps();
970 fjy2 = _mm256_setzero_ps();
971 fjz2 = _mm256_setzero_ps();
973 /**************************
974 * CALCULATE INTERACTIONS *
975 **************************/
977 if (gmx_mm256_any_lt(rsq00,rcutoff2))
980 r00 = _mm256_mul_ps(rsq00,rinv00);
981 r00 = _mm256_andnot_ps(dummy_mask,r00);
983 /* EWALD ELECTROSTATICS */
985 /* Analytical PME correction */
986 zeta2 = _mm256_mul_ps(beta2,rsq00);
987 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
988 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
989 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
990 felec = _mm256_mul_ps(qq00,felec);
991 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
992 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
993 velec = _mm256_sub_ps(rinv00,pmecorrV);
994 velec = _mm256_mul_ps(qq00,velec);
996 d = _mm256_sub_ps(r00,rswitch);
997 d = _mm256_max_ps(d,_mm256_setzero_ps());
998 d2 = _mm256_mul_ps(d,d);
999 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1001 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1003 /* Evaluate switch function */
1004 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1005 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1006 velec = _mm256_mul_ps(velec,sw);
1007 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1009 /* Update potential sum for this i atom from the interaction with this j atom. */
1010 velec = _mm256_and_ps(velec,cutoff_mask);
1011 velec = _mm256_andnot_ps(dummy_mask,velec);
1012 velecsum = _mm256_add_ps(velecsum,velec);
1016 fscal = _mm256_and_ps(fscal,cutoff_mask);
1018 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1020 /* Calculate temporary vectorial force */
1021 tx = _mm256_mul_ps(fscal,dx00);
1022 ty = _mm256_mul_ps(fscal,dy00);
1023 tz = _mm256_mul_ps(fscal,dz00);
1025 /* Update vectorial force */
1026 fix0 = _mm256_add_ps(fix0,tx);
1027 fiy0 = _mm256_add_ps(fiy0,ty);
1028 fiz0 = _mm256_add_ps(fiz0,tz);
1030 fjx0 = _mm256_add_ps(fjx0,tx);
1031 fjy0 = _mm256_add_ps(fjy0,ty);
1032 fjz0 = _mm256_add_ps(fjz0,tz);
1036 /**************************
1037 * CALCULATE INTERACTIONS *
1038 **************************/
1040 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1043 r01 = _mm256_mul_ps(rsq01,rinv01);
1044 r01 = _mm256_andnot_ps(dummy_mask,r01);
1046 /* EWALD ELECTROSTATICS */
1048 /* Analytical PME correction */
1049 zeta2 = _mm256_mul_ps(beta2,rsq01);
1050 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
1051 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1052 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1053 felec = _mm256_mul_ps(qq01,felec);
1054 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1055 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1056 velec = _mm256_sub_ps(rinv01,pmecorrV);
1057 velec = _mm256_mul_ps(qq01,velec);
1059 d = _mm256_sub_ps(r01,rswitch);
1060 d = _mm256_max_ps(d,_mm256_setzero_ps());
1061 d2 = _mm256_mul_ps(d,d);
1062 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1064 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1066 /* Evaluate switch function */
1067 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1068 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
1069 velec = _mm256_mul_ps(velec,sw);
1070 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1072 /* Update potential sum for this i atom from the interaction with this j atom. */
1073 velec = _mm256_and_ps(velec,cutoff_mask);
1074 velec = _mm256_andnot_ps(dummy_mask,velec);
1075 velecsum = _mm256_add_ps(velecsum,velec);
1079 fscal = _mm256_and_ps(fscal,cutoff_mask);
1081 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1083 /* Calculate temporary vectorial force */
1084 tx = _mm256_mul_ps(fscal,dx01);
1085 ty = _mm256_mul_ps(fscal,dy01);
1086 tz = _mm256_mul_ps(fscal,dz01);
1088 /* Update vectorial force */
1089 fix0 = _mm256_add_ps(fix0,tx);
1090 fiy0 = _mm256_add_ps(fiy0,ty);
1091 fiz0 = _mm256_add_ps(fiz0,tz);
1093 fjx1 = _mm256_add_ps(fjx1,tx);
1094 fjy1 = _mm256_add_ps(fjy1,ty);
1095 fjz1 = _mm256_add_ps(fjz1,tz);
1099 /**************************
1100 * CALCULATE INTERACTIONS *
1101 **************************/
1103 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1106 r02 = _mm256_mul_ps(rsq02,rinv02);
1107 r02 = _mm256_andnot_ps(dummy_mask,r02);
1109 /* EWALD ELECTROSTATICS */
1111 /* Analytical PME correction */
1112 zeta2 = _mm256_mul_ps(beta2,rsq02);
1113 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
1114 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1115 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1116 felec = _mm256_mul_ps(qq02,felec);
1117 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1118 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1119 velec = _mm256_sub_ps(rinv02,pmecorrV);
1120 velec = _mm256_mul_ps(qq02,velec);
1122 d = _mm256_sub_ps(r02,rswitch);
1123 d = _mm256_max_ps(d,_mm256_setzero_ps());
1124 d2 = _mm256_mul_ps(d,d);
1125 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1127 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1129 /* Evaluate switch function */
1130 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1131 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
1132 velec = _mm256_mul_ps(velec,sw);
1133 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1135 /* Update potential sum for this i atom from the interaction with this j atom. */
1136 velec = _mm256_and_ps(velec,cutoff_mask);
1137 velec = _mm256_andnot_ps(dummy_mask,velec);
1138 velecsum = _mm256_add_ps(velecsum,velec);
1142 fscal = _mm256_and_ps(fscal,cutoff_mask);
1144 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1146 /* Calculate temporary vectorial force */
1147 tx = _mm256_mul_ps(fscal,dx02);
1148 ty = _mm256_mul_ps(fscal,dy02);
1149 tz = _mm256_mul_ps(fscal,dz02);
1151 /* Update vectorial force */
1152 fix0 = _mm256_add_ps(fix0,tx);
1153 fiy0 = _mm256_add_ps(fiy0,ty);
1154 fiz0 = _mm256_add_ps(fiz0,tz);
1156 fjx2 = _mm256_add_ps(fjx2,tx);
1157 fjy2 = _mm256_add_ps(fjy2,ty);
1158 fjz2 = _mm256_add_ps(fjz2,tz);
1162 /**************************
1163 * CALCULATE INTERACTIONS *
1164 **************************/
1166 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1169 r10 = _mm256_mul_ps(rsq10,rinv10);
1170 r10 = _mm256_andnot_ps(dummy_mask,r10);
1172 /* EWALD ELECTROSTATICS */
1174 /* Analytical PME correction */
1175 zeta2 = _mm256_mul_ps(beta2,rsq10);
1176 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
1177 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1178 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1179 felec = _mm256_mul_ps(qq10,felec);
1180 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1181 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1182 velec = _mm256_sub_ps(rinv10,pmecorrV);
1183 velec = _mm256_mul_ps(qq10,velec);
1185 d = _mm256_sub_ps(r10,rswitch);
1186 d = _mm256_max_ps(d,_mm256_setzero_ps());
1187 d2 = _mm256_mul_ps(d,d);
1188 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1190 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1192 /* Evaluate switch function */
1193 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1194 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1195 velec = _mm256_mul_ps(velec,sw);
1196 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1198 /* Update potential sum for this i atom from the interaction with this j atom. */
1199 velec = _mm256_and_ps(velec,cutoff_mask);
1200 velec = _mm256_andnot_ps(dummy_mask,velec);
1201 velecsum = _mm256_add_ps(velecsum,velec);
1205 fscal = _mm256_and_ps(fscal,cutoff_mask);
1207 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1209 /* Calculate temporary vectorial force */
1210 tx = _mm256_mul_ps(fscal,dx10);
1211 ty = _mm256_mul_ps(fscal,dy10);
1212 tz = _mm256_mul_ps(fscal,dz10);
1214 /* Update vectorial force */
1215 fix1 = _mm256_add_ps(fix1,tx);
1216 fiy1 = _mm256_add_ps(fiy1,ty);
1217 fiz1 = _mm256_add_ps(fiz1,tz);
1219 fjx0 = _mm256_add_ps(fjx0,tx);
1220 fjy0 = _mm256_add_ps(fjy0,ty);
1221 fjz0 = _mm256_add_ps(fjz0,tz);
1225 /**************************
1226 * CALCULATE INTERACTIONS *
1227 **************************/
1229 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1232 r11 = _mm256_mul_ps(rsq11,rinv11);
1233 r11 = _mm256_andnot_ps(dummy_mask,r11);
1235 /* EWALD ELECTROSTATICS */
1237 /* Analytical PME correction */
1238 zeta2 = _mm256_mul_ps(beta2,rsq11);
1239 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1240 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1241 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1242 felec = _mm256_mul_ps(qq11,felec);
1243 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1244 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1245 velec = _mm256_sub_ps(rinv11,pmecorrV);
1246 velec = _mm256_mul_ps(qq11,velec);
1248 d = _mm256_sub_ps(r11,rswitch);
1249 d = _mm256_max_ps(d,_mm256_setzero_ps());
1250 d2 = _mm256_mul_ps(d,d);
1251 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1253 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1255 /* Evaluate switch function */
1256 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1257 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1258 velec = _mm256_mul_ps(velec,sw);
1259 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1261 /* Update potential sum for this i atom from the interaction with this j atom. */
1262 velec = _mm256_and_ps(velec,cutoff_mask);
1263 velec = _mm256_andnot_ps(dummy_mask,velec);
1264 velecsum = _mm256_add_ps(velecsum,velec);
1268 fscal = _mm256_and_ps(fscal,cutoff_mask);
1270 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1272 /* Calculate temporary vectorial force */
1273 tx = _mm256_mul_ps(fscal,dx11);
1274 ty = _mm256_mul_ps(fscal,dy11);
1275 tz = _mm256_mul_ps(fscal,dz11);
1277 /* Update vectorial force */
1278 fix1 = _mm256_add_ps(fix1,tx);
1279 fiy1 = _mm256_add_ps(fiy1,ty);
1280 fiz1 = _mm256_add_ps(fiz1,tz);
1282 fjx1 = _mm256_add_ps(fjx1,tx);
1283 fjy1 = _mm256_add_ps(fjy1,ty);
1284 fjz1 = _mm256_add_ps(fjz1,tz);
1288 /**************************
1289 * CALCULATE INTERACTIONS *
1290 **************************/
1292 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1295 r12 = _mm256_mul_ps(rsq12,rinv12);
1296 r12 = _mm256_andnot_ps(dummy_mask,r12);
1298 /* EWALD ELECTROSTATICS */
1300 /* Analytical PME correction */
1301 zeta2 = _mm256_mul_ps(beta2,rsq12);
1302 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1303 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1304 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1305 felec = _mm256_mul_ps(qq12,felec);
1306 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1307 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1308 velec = _mm256_sub_ps(rinv12,pmecorrV);
1309 velec = _mm256_mul_ps(qq12,velec);
1311 d = _mm256_sub_ps(r12,rswitch);
1312 d = _mm256_max_ps(d,_mm256_setzero_ps());
1313 d2 = _mm256_mul_ps(d,d);
1314 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1316 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1318 /* Evaluate switch function */
1319 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1320 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1321 velec = _mm256_mul_ps(velec,sw);
1322 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1324 /* Update potential sum for this i atom from the interaction with this j atom. */
1325 velec = _mm256_and_ps(velec,cutoff_mask);
1326 velec = _mm256_andnot_ps(dummy_mask,velec);
1327 velecsum = _mm256_add_ps(velecsum,velec);
1331 fscal = _mm256_and_ps(fscal,cutoff_mask);
1333 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1335 /* Calculate temporary vectorial force */
1336 tx = _mm256_mul_ps(fscal,dx12);
1337 ty = _mm256_mul_ps(fscal,dy12);
1338 tz = _mm256_mul_ps(fscal,dz12);
1340 /* Update vectorial force */
1341 fix1 = _mm256_add_ps(fix1,tx);
1342 fiy1 = _mm256_add_ps(fiy1,ty);
1343 fiz1 = _mm256_add_ps(fiz1,tz);
1345 fjx2 = _mm256_add_ps(fjx2,tx);
1346 fjy2 = _mm256_add_ps(fjy2,ty);
1347 fjz2 = _mm256_add_ps(fjz2,tz);
1351 /**************************
1352 * CALCULATE INTERACTIONS *
1353 **************************/
1355 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1358 r20 = _mm256_mul_ps(rsq20,rinv20);
1359 r20 = _mm256_andnot_ps(dummy_mask,r20);
1361 /* EWALD ELECTROSTATICS */
1363 /* Analytical PME correction */
1364 zeta2 = _mm256_mul_ps(beta2,rsq20);
1365 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1366 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1367 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1368 felec = _mm256_mul_ps(qq20,felec);
1369 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1370 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1371 velec = _mm256_sub_ps(rinv20,pmecorrV);
1372 velec = _mm256_mul_ps(qq20,velec);
1374 d = _mm256_sub_ps(r20,rswitch);
1375 d = _mm256_max_ps(d,_mm256_setzero_ps());
1376 d2 = _mm256_mul_ps(d,d);
1377 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1379 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1381 /* Evaluate switch function */
1382 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1383 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1384 velec = _mm256_mul_ps(velec,sw);
1385 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1387 /* Update potential sum for this i atom from the interaction with this j atom. */
1388 velec = _mm256_and_ps(velec,cutoff_mask);
1389 velec = _mm256_andnot_ps(dummy_mask,velec);
1390 velecsum = _mm256_add_ps(velecsum,velec);
1394 fscal = _mm256_and_ps(fscal,cutoff_mask);
1396 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1398 /* Calculate temporary vectorial force */
1399 tx = _mm256_mul_ps(fscal,dx20);
1400 ty = _mm256_mul_ps(fscal,dy20);
1401 tz = _mm256_mul_ps(fscal,dz20);
1403 /* Update vectorial force */
1404 fix2 = _mm256_add_ps(fix2,tx);
1405 fiy2 = _mm256_add_ps(fiy2,ty);
1406 fiz2 = _mm256_add_ps(fiz2,tz);
1408 fjx0 = _mm256_add_ps(fjx0,tx);
1409 fjy0 = _mm256_add_ps(fjy0,ty);
1410 fjz0 = _mm256_add_ps(fjz0,tz);
1414 /**************************
1415 * CALCULATE INTERACTIONS *
1416 **************************/
1418 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1421 r21 = _mm256_mul_ps(rsq21,rinv21);
1422 r21 = _mm256_andnot_ps(dummy_mask,r21);
1424 /* EWALD ELECTROSTATICS */
1426 /* Analytical PME correction */
1427 zeta2 = _mm256_mul_ps(beta2,rsq21);
1428 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1429 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1430 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1431 felec = _mm256_mul_ps(qq21,felec);
1432 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1433 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1434 velec = _mm256_sub_ps(rinv21,pmecorrV);
1435 velec = _mm256_mul_ps(qq21,velec);
1437 d = _mm256_sub_ps(r21,rswitch);
1438 d = _mm256_max_ps(d,_mm256_setzero_ps());
1439 d2 = _mm256_mul_ps(d,d);
1440 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1442 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1444 /* Evaluate switch function */
1445 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1446 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1447 velec = _mm256_mul_ps(velec,sw);
1448 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1450 /* Update potential sum for this i atom from the interaction with this j atom. */
1451 velec = _mm256_and_ps(velec,cutoff_mask);
1452 velec = _mm256_andnot_ps(dummy_mask,velec);
1453 velecsum = _mm256_add_ps(velecsum,velec);
1457 fscal = _mm256_and_ps(fscal,cutoff_mask);
1459 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1461 /* Calculate temporary vectorial force */
1462 tx = _mm256_mul_ps(fscal,dx21);
1463 ty = _mm256_mul_ps(fscal,dy21);
1464 tz = _mm256_mul_ps(fscal,dz21);
1466 /* Update vectorial force */
1467 fix2 = _mm256_add_ps(fix2,tx);
1468 fiy2 = _mm256_add_ps(fiy2,ty);
1469 fiz2 = _mm256_add_ps(fiz2,tz);
1471 fjx1 = _mm256_add_ps(fjx1,tx);
1472 fjy1 = _mm256_add_ps(fjy1,ty);
1473 fjz1 = _mm256_add_ps(fjz1,tz);
1477 /**************************
1478 * CALCULATE INTERACTIONS *
1479 **************************/
1481 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1484 r22 = _mm256_mul_ps(rsq22,rinv22);
1485 r22 = _mm256_andnot_ps(dummy_mask,r22);
1487 /* EWALD ELECTROSTATICS */
1489 /* Analytical PME correction */
1490 zeta2 = _mm256_mul_ps(beta2,rsq22);
1491 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1492 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1493 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1494 felec = _mm256_mul_ps(qq22,felec);
1495 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1496 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1497 velec = _mm256_sub_ps(rinv22,pmecorrV);
1498 velec = _mm256_mul_ps(qq22,velec);
1500 d = _mm256_sub_ps(r22,rswitch);
1501 d = _mm256_max_ps(d,_mm256_setzero_ps());
1502 d2 = _mm256_mul_ps(d,d);
1503 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1505 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1507 /* Evaluate switch function */
1508 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1509 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1510 velec = _mm256_mul_ps(velec,sw);
1511 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1513 /* Update potential sum for this i atom from the interaction with this j atom. */
1514 velec = _mm256_and_ps(velec,cutoff_mask);
1515 velec = _mm256_andnot_ps(dummy_mask,velec);
1516 velecsum = _mm256_add_ps(velecsum,velec);
1520 fscal = _mm256_and_ps(fscal,cutoff_mask);
1522 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1524 /* Calculate temporary vectorial force */
1525 tx = _mm256_mul_ps(fscal,dx22);
1526 ty = _mm256_mul_ps(fscal,dy22);
1527 tz = _mm256_mul_ps(fscal,dz22);
1529 /* Update vectorial force */
1530 fix2 = _mm256_add_ps(fix2,tx);
1531 fiy2 = _mm256_add_ps(fiy2,ty);
1532 fiz2 = _mm256_add_ps(fiz2,tz);
1534 fjx2 = _mm256_add_ps(fjx2,tx);
1535 fjy2 = _mm256_add_ps(fjy2,ty);
1536 fjz2 = _mm256_add_ps(fjz2,tz);
1540 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1541 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1542 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1543 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1544 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1545 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1546 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1547 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1549 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1550 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1552 /* Inner loop uses 981 flops */
1555 /* End of innermost loop */
1557 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1558 f+i_coord_offset,fshift+i_shift_offset);
1561 /* Update potential energies */
1562 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1564 /* Increment number of inner iterations */
1565 inneriter += j_index_end - j_index_start;
1567 /* Outer loop uses 19 flops */
1570 /* Increment number of outer iterations */
1573 /* Update outer/inner flops */
1575 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*981);
1578 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single
1579 * Electrostatics interaction: Ewald
1580 * VdW interaction: None
1581 * Geometry: Water3-Water3
1582 * Calculate force/pot: Force
1585 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single
1586 (t_nblist * gmx_restrict nlist,
1587 rvec * gmx_restrict xx,
1588 rvec * gmx_restrict ff,
1589 t_forcerec * gmx_restrict fr,
1590 t_mdatoms * gmx_restrict mdatoms,
1591 nb_kernel_data_t * gmx_restrict kernel_data,
1592 t_nrnb * gmx_restrict nrnb)
1594 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1595 * just 0 for non-waters.
1596 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1597 * jnr indices corresponding to data put in the four positions in the SIMD register.
1599 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1600 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1601 int jnrA,jnrB,jnrC,jnrD;
1602 int jnrE,jnrF,jnrG,jnrH;
1603 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1604 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1605 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1606 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1607 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1608 real rcutoff_scalar;
1609 real *shiftvec,*fshift,*x,*f;
1610 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1611 real scratch[4*DIM];
1612 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1613 real * vdwioffsetptr0;
1614 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1615 real * vdwioffsetptr1;
1616 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1617 real * vdwioffsetptr2;
1618 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1619 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1620 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1621 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1622 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1623 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1624 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1625 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1626 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1627 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1628 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1629 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1630 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1631 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1632 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1633 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1634 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1637 __m128i ewitab_lo,ewitab_hi;
1638 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1639 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1641 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1642 real rswitch_scalar,d_scalar;
1643 __m256 dummy_mask,cutoff_mask;
1644 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1645 __m256 one = _mm256_set1_ps(1.0);
1646 __m256 two = _mm256_set1_ps(2.0);
1652 jindex = nlist->jindex;
1654 shiftidx = nlist->shift;
1656 shiftvec = fr->shift_vec[0];
1657 fshift = fr->fshift[0];
1658 facel = _mm256_set1_ps(fr->epsfac);
1659 charge = mdatoms->chargeA;
1661 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1662 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
1663 beta2 = _mm256_mul_ps(beta,beta);
1664 beta3 = _mm256_mul_ps(beta,beta2);
1666 ewtab = fr->ic->tabq_coul_FDV0;
1667 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1668 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1670 /* Setup water-specific parameters */
1671 inr = nlist->iinr[0];
1672 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1673 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1674 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1676 jq0 = _mm256_set1_ps(charge[inr+0]);
1677 jq1 = _mm256_set1_ps(charge[inr+1]);
1678 jq2 = _mm256_set1_ps(charge[inr+2]);
1679 qq00 = _mm256_mul_ps(iq0,jq0);
1680 qq01 = _mm256_mul_ps(iq0,jq1);
1681 qq02 = _mm256_mul_ps(iq0,jq2);
1682 qq10 = _mm256_mul_ps(iq1,jq0);
1683 qq11 = _mm256_mul_ps(iq1,jq1);
1684 qq12 = _mm256_mul_ps(iq1,jq2);
1685 qq20 = _mm256_mul_ps(iq2,jq0);
1686 qq21 = _mm256_mul_ps(iq2,jq1);
1687 qq22 = _mm256_mul_ps(iq2,jq2);
1689 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1690 rcutoff_scalar = fr->rcoulomb;
1691 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1692 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1694 rswitch_scalar = fr->rcoulomb_switch;
1695 rswitch = _mm256_set1_ps(rswitch_scalar);
1696 /* Setup switch parameters */
1697 d_scalar = rcutoff_scalar-rswitch_scalar;
1698 d = _mm256_set1_ps(d_scalar);
1699 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1700 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1701 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1702 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1703 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1704 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1706 /* Avoid stupid compiler warnings */
1707 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1708 j_coord_offsetA = 0;
1709 j_coord_offsetB = 0;
1710 j_coord_offsetC = 0;
1711 j_coord_offsetD = 0;
1712 j_coord_offsetE = 0;
1713 j_coord_offsetF = 0;
1714 j_coord_offsetG = 0;
1715 j_coord_offsetH = 0;
1720 for(iidx=0;iidx<4*DIM;iidx++)
1722 scratch[iidx] = 0.0;
1725 /* Start outer loop over neighborlists */
1726 for(iidx=0; iidx<nri; iidx++)
1728 /* Load shift vector for this list */
1729 i_shift_offset = DIM*shiftidx[iidx];
1731 /* Load limits for loop over neighbors */
1732 j_index_start = jindex[iidx];
1733 j_index_end = jindex[iidx+1];
1735 /* Get outer coordinate index */
1737 i_coord_offset = DIM*inr;
1739 /* Load i particle coords and add shift vector */
1740 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1741 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1743 fix0 = _mm256_setzero_ps();
1744 fiy0 = _mm256_setzero_ps();
1745 fiz0 = _mm256_setzero_ps();
1746 fix1 = _mm256_setzero_ps();
1747 fiy1 = _mm256_setzero_ps();
1748 fiz1 = _mm256_setzero_ps();
1749 fix2 = _mm256_setzero_ps();
1750 fiy2 = _mm256_setzero_ps();
1751 fiz2 = _mm256_setzero_ps();
1753 /* Start inner kernel loop */
1754 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1757 /* Get j neighbor index, and coordinate index */
1759 jnrB = jjnr[jidx+1];
1760 jnrC = jjnr[jidx+2];
1761 jnrD = jjnr[jidx+3];
1762 jnrE = jjnr[jidx+4];
1763 jnrF = jjnr[jidx+5];
1764 jnrG = jjnr[jidx+6];
1765 jnrH = jjnr[jidx+7];
1766 j_coord_offsetA = DIM*jnrA;
1767 j_coord_offsetB = DIM*jnrB;
1768 j_coord_offsetC = DIM*jnrC;
1769 j_coord_offsetD = DIM*jnrD;
1770 j_coord_offsetE = DIM*jnrE;
1771 j_coord_offsetF = DIM*jnrF;
1772 j_coord_offsetG = DIM*jnrG;
1773 j_coord_offsetH = DIM*jnrH;
1775 /* load j atom coordinates */
1776 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1777 x+j_coord_offsetC,x+j_coord_offsetD,
1778 x+j_coord_offsetE,x+j_coord_offsetF,
1779 x+j_coord_offsetG,x+j_coord_offsetH,
1780 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1782 /* Calculate displacement vector */
1783 dx00 = _mm256_sub_ps(ix0,jx0);
1784 dy00 = _mm256_sub_ps(iy0,jy0);
1785 dz00 = _mm256_sub_ps(iz0,jz0);
1786 dx01 = _mm256_sub_ps(ix0,jx1);
1787 dy01 = _mm256_sub_ps(iy0,jy1);
1788 dz01 = _mm256_sub_ps(iz0,jz1);
1789 dx02 = _mm256_sub_ps(ix0,jx2);
1790 dy02 = _mm256_sub_ps(iy0,jy2);
1791 dz02 = _mm256_sub_ps(iz0,jz2);
1792 dx10 = _mm256_sub_ps(ix1,jx0);
1793 dy10 = _mm256_sub_ps(iy1,jy0);
1794 dz10 = _mm256_sub_ps(iz1,jz0);
1795 dx11 = _mm256_sub_ps(ix1,jx1);
1796 dy11 = _mm256_sub_ps(iy1,jy1);
1797 dz11 = _mm256_sub_ps(iz1,jz1);
1798 dx12 = _mm256_sub_ps(ix1,jx2);
1799 dy12 = _mm256_sub_ps(iy1,jy2);
1800 dz12 = _mm256_sub_ps(iz1,jz2);
1801 dx20 = _mm256_sub_ps(ix2,jx0);
1802 dy20 = _mm256_sub_ps(iy2,jy0);
1803 dz20 = _mm256_sub_ps(iz2,jz0);
1804 dx21 = _mm256_sub_ps(ix2,jx1);
1805 dy21 = _mm256_sub_ps(iy2,jy1);
1806 dz21 = _mm256_sub_ps(iz2,jz1);
1807 dx22 = _mm256_sub_ps(ix2,jx2);
1808 dy22 = _mm256_sub_ps(iy2,jy2);
1809 dz22 = _mm256_sub_ps(iz2,jz2);
1811 /* Calculate squared distance and things based on it */
1812 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1813 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1814 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1815 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1816 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1817 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1818 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1819 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1820 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1822 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1823 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1824 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1825 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1826 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1827 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1828 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1829 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1830 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1832 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1833 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1834 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1835 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1836 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1837 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1838 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1839 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1840 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1842 fjx0 = _mm256_setzero_ps();
1843 fjy0 = _mm256_setzero_ps();
1844 fjz0 = _mm256_setzero_ps();
1845 fjx1 = _mm256_setzero_ps();
1846 fjy1 = _mm256_setzero_ps();
1847 fjz1 = _mm256_setzero_ps();
1848 fjx2 = _mm256_setzero_ps();
1849 fjy2 = _mm256_setzero_ps();
1850 fjz2 = _mm256_setzero_ps();
1852 /**************************
1853 * CALCULATE INTERACTIONS *
1854 **************************/
1856 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1859 r00 = _mm256_mul_ps(rsq00,rinv00);
1861 /* EWALD ELECTROSTATICS */
1863 /* Analytical PME correction */
1864 zeta2 = _mm256_mul_ps(beta2,rsq00);
1865 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
1866 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1867 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1868 felec = _mm256_mul_ps(qq00,felec);
1869 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1870 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1871 velec = _mm256_sub_ps(rinv00,pmecorrV);
1872 velec = _mm256_mul_ps(qq00,velec);
1874 d = _mm256_sub_ps(r00,rswitch);
1875 d = _mm256_max_ps(d,_mm256_setzero_ps());
1876 d2 = _mm256_mul_ps(d,d);
1877 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1879 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1881 /* Evaluate switch function */
1882 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1883 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1884 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1888 fscal = _mm256_and_ps(fscal,cutoff_mask);
1890 /* Calculate temporary vectorial force */
1891 tx = _mm256_mul_ps(fscal,dx00);
1892 ty = _mm256_mul_ps(fscal,dy00);
1893 tz = _mm256_mul_ps(fscal,dz00);
1895 /* Update vectorial force */
1896 fix0 = _mm256_add_ps(fix0,tx);
1897 fiy0 = _mm256_add_ps(fiy0,ty);
1898 fiz0 = _mm256_add_ps(fiz0,tz);
1900 fjx0 = _mm256_add_ps(fjx0,tx);
1901 fjy0 = _mm256_add_ps(fjy0,ty);
1902 fjz0 = _mm256_add_ps(fjz0,tz);
1906 /**************************
1907 * CALCULATE INTERACTIONS *
1908 **************************/
1910 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1913 r01 = _mm256_mul_ps(rsq01,rinv01);
1915 /* EWALD ELECTROSTATICS */
1917 /* Analytical PME correction */
1918 zeta2 = _mm256_mul_ps(beta2,rsq01);
1919 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
1920 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1921 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1922 felec = _mm256_mul_ps(qq01,felec);
1923 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1924 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1925 velec = _mm256_sub_ps(rinv01,pmecorrV);
1926 velec = _mm256_mul_ps(qq01,velec);
1928 d = _mm256_sub_ps(r01,rswitch);
1929 d = _mm256_max_ps(d,_mm256_setzero_ps());
1930 d2 = _mm256_mul_ps(d,d);
1931 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1933 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1935 /* Evaluate switch function */
1936 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1937 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
1938 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1942 fscal = _mm256_and_ps(fscal,cutoff_mask);
1944 /* Calculate temporary vectorial force */
1945 tx = _mm256_mul_ps(fscal,dx01);
1946 ty = _mm256_mul_ps(fscal,dy01);
1947 tz = _mm256_mul_ps(fscal,dz01);
1949 /* Update vectorial force */
1950 fix0 = _mm256_add_ps(fix0,tx);
1951 fiy0 = _mm256_add_ps(fiy0,ty);
1952 fiz0 = _mm256_add_ps(fiz0,tz);
1954 fjx1 = _mm256_add_ps(fjx1,tx);
1955 fjy1 = _mm256_add_ps(fjy1,ty);
1956 fjz1 = _mm256_add_ps(fjz1,tz);
1960 /**************************
1961 * CALCULATE INTERACTIONS *
1962 **************************/
1964 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1967 r02 = _mm256_mul_ps(rsq02,rinv02);
1969 /* EWALD ELECTROSTATICS */
1971 /* Analytical PME correction */
1972 zeta2 = _mm256_mul_ps(beta2,rsq02);
1973 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
1974 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1975 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1976 felec = _mm256_mul_ps(qq02,felec);
1977 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1978 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1979 velec = _mm256_sub_ps(rinv02,pmecorrV);
1980 velec = _mm256_mul_ps(qq02,velec);
1982 d = _mm256_sub_ps(r02,rswitch);
1983 d = _mm256_max_ps(d,_mm256_setzero_ps());
1984 d2 = _mm256_mul_ps(d,d);
1985 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1987 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1989 /* Evaluate switch function */
1990 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1991 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
1992 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1996 fscal = _mm256_and_ps(fscal,cutoff_mask);
1998 /* Calculate temporary vectorial force */
1999 tx = _mm256_mul_ps(fscal,dx02);
2000 ty = _mm256_mul_ps(fscal,dy02);
2001 tz = _mm256_mul_ps(fscal,dz02);
2003 /* Update vectorial force */
2004 fix0 = _mm256_add_ps(fix0,tx);
2005 fiy0 = _mm256_add_ps(fiy0,ty);
2006 fiz0 = _mm256_add_ps(fiz0,tz);
2008 fjx2 = _mm256_add_ps(fjx2,tx);
2009 fjy2 = _mm256_add_ps(fjy2,ty);
2010 fjz2 = _mm256_add_ps(fjz2,tz);
2014 /**************************
2015 * CALCULATE INTERACTIONS *
2016 **************************/
2018 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2021 r10 = _mm256_mul_ps(rsq10,rinv10);
2023 /* EWALD ELECTROSTATICS */
2025 /* Analytical PME correction */
2026 zeta2 = _mm256_mul_ps(beta2,rsq10);
2027 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
2028 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2029 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2030 felec = _mm256_mul_ps(qq10,felec);
2031 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2032 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2033 velec = _mm256_sub_ps(rinv10,pmecorrV);
2034 velec = _mm256_mul_ps(qq10,velec);
2036 d = _mm256_sub_ps(r10,rswitch);
2037 d = _mm256_max_ps(d,_mm256_setzero_ps());
2038 d2 = _mm256_mul_ps(d,d);
2039 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2041 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2043 /* Evaluate switch function */
2044 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2045 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2046 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2050 fscal = _mm256_and_ps(fscal,cutoff_mask);
2052 /* Calculate temporary vectorial force */
2053 tx = _mm256_mul_ps(fscal,dx10);
2054 ty = _mm256_mul_ps(fscal,dy10);
2055 tz = _mm256_mul_ps(fscal,dz10);
2057 /* Update vectorial force */
2058 fix1 = _mm256_add_ps(fix1,tx);
2059 fiy1 = _mm256_add_ps(fiy1,ty);
2060 fiz1 = _mm256_add_ps(fiz1,tz);
2062 fjx0 = _mm256_add_ps(fjx0,tx);
2063 fjy0 = _mm256_add_ps(fjy0,ty);
2064 fjz0 = _mm256_add_ps(fjz0,tz);
2068 /**************************
2069 * CALCULATE INTERACTIONS *
2070 **************************/
2072 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2075 r11 = _mm256_mul_ps(rsq11,rinv11);
2077 /* EWALD ELECTROSTATICS */
2079 /* Analytical PME correction */
2080 zeta2 = _mm256_mul_ps(beta2,rsq11);
2081 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2082 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2083 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2084 felec = _mm256_mul_ps(qq11,felec);
2085 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2086 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2087 velec = _mm256_sub_ps(rinv11,pmecorrV);
2088 velec = _mm256_mul_ps(qq11,velec);
2090 d = _mm256_sub_ps(r11,rswitch);
2091 d = _mm256_max_ps(d,_mm256_setzero_ps());
2092 d2 = _mm256_mul_ps(d,d);
2093 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2095 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2097 /* Evaluate switch function */
2098 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2099 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2100 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2104 fscal = _mm256_and_ps(fscal,cutoff_mask);
2106 /* Calculate temporary vectorial force */
2107 tx = _mm256_mul_ps(fscal,dx11);
2108 ty = _mm256_mul_ps(fscal,dy11);
2109 tz = _mm256_mul_ps(fscal,dz11);
2111 /* Update vectorial force */
2112 fix1 = _mm256_add_ps(fix1,tx);
2113 fiy1 = _mm256_add_ps(fiy1,ty);
2114 fiz1 = _mm256_add_ps(fiz1,tz);
2116 fjx1 = _mm256_add_ps(fjx1,tx);
2117 fjy1 = _mm256_add_ps(fjy1,ty);
2118 fjz1 = _mm256_add_ps(fjz1,tz);
2122 /**************************
2123 * CALCULATE INTERACTIONS *
2124 **************************/
2126 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2129 r12 = _mm256_mul_ps(rsq12,rinv12);
2131 /* EWALD ELECTROSTATICS */
2133 /* Analytical PME correction */
2134 zeta2 = _mm256_mul_ps(beta2,rsq12);
2135 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2136 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2137 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2138 felec = _mm256_mul_ps(qq12,felec);
2139 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2140 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2141 velec = _mm256_sub_ps(rinv12,pmecorrV);
2142 velec = _mm256_mul_ps(qq12,velec);
2144 d = _mm256_sub_ps(r12,rswitch);
2145 d = _mm256_max_ps(d,_mm256_setzero_ps());
2146 d2 = _mm256_mul_ps(d,d);
2147 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2149 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2151 /* Evaluate switch function */
2152 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2153 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2154 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2158 fscal = _mm256_and_ps(fscal,cutoff_mask);
2160 /* Calculate temporary vectorial force */
2161 tx = _mm256_mul_ps(fscal,dx12);
2162 ty = _mm256_mul_ps(fscal,dy12);
2163 tz = _mm256_mul_ps(fscal,dz12);
2165 /* Update vectorial force */
2166 fix1 = _mm256_add_ps(fix1,tx);
2167 fiy1 = _mm256_add_ps(fiy1,ty);
2168 fiz1 = _mm256_add_ps(fiz1,tz);
2170 fjx2 = _mm256_add_ps(fjx2,tx);
2171 fjy2 = _mm256_add_ps(fjy2,ty);
2172 fjz2 = _mm256_add_ps(fjz2,tz);
2176 /**************************
2177 * CALCULATE INTERACTIONS *
2178 **************************/
2180 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2183 r20 = _mm256_mul_ps(rsq20,rinv20);
2185 /* EWALD ELECTROSTATICS */
2187 /* Analytical PME correction */
2188 zeta2 = _mm256_mul_ps(beta2,rsq20);
2189 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
2190 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2191 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2192 felec = _mm256_mul_ps(qq20,felec);
2193 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2194 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2195 velec = _mm256_sub_ps(rinv20,pmecorrV);
2196 velec = _mm256_mul_ps(qq20,velec);
2198 d = _mm256_sub_ps(r20,rswitch);
2199 d = _mm256_max_ps(d,_mm256_setzero_ps());
2200 d2 = _mm256_mul_ps(d,d);
2201 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2203 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2205 /* Evaluate switch function */
2206 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2207 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
2208 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2212 fscal = _mm256_and_ps(fscal,cutoff_mask);
2214 /* Calculate temporary vectorial force */
2215 tx = _mm256_mul_ps(fscal,dx20);
2216 ty = _mm256_mul_ps(fscal,dy20);
2217 tz = _mm256_mul_ps(fscal,dz20);
2219 /* Update vectorial force */
2220 fix2 = _mm256_add_ps(fix2,tx);
2221 fiy2 = _mm256_add_ps(fiy2,ty);
2222 fiz2 = _mm256_add_ps(fiz2,tz);
2224 fjx0 = _mm256_add_ps(fjx0,tx);
2225 fjy0 = _mm256_add_ps(fjy0,ty);
2226 fjz0 = _mm256_add_ps(fjz0,tz);
2230 /**************************
2231 * CALCULATE INTERACTIONS *
2232 **************************/
2234 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2237 r21 = _mm256_mul_ps(rsq21,rinv21);
2239 /* EWALD ELECTROSTATICS */
2241 /* Analytical PME correction */
2242 zeta2 = _mm256_mul_ps(beta2,rsq21);
2243 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2244 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2245 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2246 felec = _mm256_mul_ps(qq21,felec);
2247 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2248 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2249 velec = _mm256_sub_ps(rinv21,pmecorrV);
2250 velec = _mm256_mul_ps(qq21,velec);
2252 d = _mm256_sub_ps(r21,rswitch);
2253 d = _mm256_max_ps(d,_mm256_setzero_ps());
2254 d2 = _mm256_mul_ps(d,d);
2255 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2257 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2259 /* Evaluate switch function */
2260 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2261 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2262 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2266 fscal = _mm256_and_ps(fscal,cutoff_mask);
2268 /* Calculate temporary vectorial force */
2269 tx = _mm256_mul_ps(fscal,dx21);
2270 ty = _mm256_mul_ps(fscal,dy21);
2271 tz = _mm256_mul_ps(fscal,dz21);
2273 /* Update vectorial force */
2274 fix2 = _mm256_add_ps(fix2,tx);
2275 fiy2 = _mm256_add_ps(fiy2,ty);
2276 fiz2 = _mm256_add_ps(fiz2,tz);
2278 fjx1 = _mm256_add_ps(fjx1,tx);
2279 fjy1 = _mm256_add_ps(fjy1,ty);
2280 fjz1 = _mm256_add_ps(fjz1,tz);
2284 /**************************
2285 * CALCULATE INTERACTIONS *
2286 **************************/
2288 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2291 r22 = _mm256_mul_ps(rsq22,rinv22);
2293 /* EWALD ELECTROSTATICS */
2295 /* Analytical PME correction */
2296 zeta2 = _mm256_mul_ps(beta2,rsq22);
2297 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2298 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2299 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2300 felec = _mm256_mul_ps(qq22,felec);
2301 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2302 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2303 velec = _mm256_sub_ps(rinv22,pmecorrV);
2304 velec = _mm256_mul_ps(qq22,velec);
2306 d = _mm256_sub_ps(r22,rswitch);
2307 d = _mm256_max_ps(d,_mm256_setzero_ps());
2308 d2 = _mm256_mul_ps(d,d);
2309 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2311 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2313 /* Evaluate switch function */
2314 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2315 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2316 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2320 fscal = _mm256_and_ps(fscal,cutoff_mask);
2322 /* Calculate temporary vectorial force */
2323 tx = _mm256_mul_ps(fscal,dx22);
2324 ty = _mm256_mul_ps(fscal,dy22);
2325 tz = _mm256_mul_ps(fscal,dz22);
2327 /* Update vectorial force */
2328 fix2 = _mm256_add_ps(fix2,tx);
2329 fiy2 = _mm256_add_ps(fiy2,ty);
2330 fiz2 = _mm256_add_ps(fiz2,tz);
2332 fjx2 = _mm256_add_ps(fjx2,tx);
2333 fjy2 = _mm256_add_ps(fjy2,ty);
2334 fjz2 = _mm256_add_ps(fjz2,tz);
2338 fjptrA = f+j_coord_offsetA;
2339 fjptrB = f+j_coord_offsetB;
2340 fjptrC = f+j_coord_offsetC;
2341 fjptrD = f+j_coord_offsetD;
2342 fjptrE = f+j_coord_offsetE;
2343 fjptrF = f+j_coord_offsetF;
2344 fjptrG = f+j_coord_offsetG;
2345 fjptrH = f+j_coord_offsetH;
2347 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2348 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2350 /* Inner loop uses 945 flops */
2353 if(jidx<j_index_end)
2356 /* Get j neighbor index, and coordinate index */
2357 jnrlistA = jjnr[jidx];
2358 jnrlistB = jjnr[jidx+1];
2359 jnrlistC = jjnr[jidx+2];
2360 jnrlistD = jjnr[jidx+3];
2361 jnrlistE = jjnr[jidx+4];
2362 jnrlistF = jjnr[jidx+5];
2363 jnrlistG = jjnr[jidx+6];
2364 jnrlistH = jjnr[jidx+7];
2365 /* Sign of each element will be negative for non-real atoms.
2366 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2367 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2369 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2370 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2372 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2373 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2374 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2375 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2376 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2377 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2378 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2379 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2380 j_coord_offsetA = DIM*jnrA;
2381 j_coord_offsetB = DIM*jnrB;
2382 j_coord_offsetC = DIM*jnrC;
2383 j_coord_offsetD = DIM*jnrD;
2384 j_coord_offsetE = DIM*jnrE;
2385 j_coord_offsetF = DIM*jnrF;
2386 j_coord_offsetG = DIM*jnrG;
2387 j_coord_offsetH = DIM*jnrH;
2389 /* load j atom coordinates */
2390 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2391 x+j_coord_offsetC,x+j_coord_offsetD,
2392 x+j_coord_offsetE,x+j_coord_offsetF,
2393 x+j_coord_offsetG,x+j_coord_offsetH,
2394 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2396 /* Calculate displacement vector */
2397 dx00 = _mm256_sub_ps(ix0,jx0);
2398 dy00 = _mm256_sub_ps(iy0,jy0);
2399 dz00 = _mm256_sub_ps(iz0,jz0);
2400 dx01 = _mm256_sub_ps(ix0,jx1);
2401 dy01 = _mm256_sub_ps(iy0,jy1);
2402 dz01 = _mm256_sub_ps(iz0,jz1);
2403 dx02 = _mm256_sub_ps(ix0,jx2);
2404 dy02 = _mm256_sub_ps(iy0,jy2);
2405 dz02 = _mm256_sub_ps(iz0,jz2);
2406 dx10 = _mm256_sub_ps(ix1,jx0);
2407 dy10 = _mm256_sub_ps(iy1,jy0);
2408 dz10 = _mm256_sub_ps(iz1,jz0);
2409 dx11 = _mm256_sub_ps(ix1,jx1);
2410 dy11 = _mm256_sub_ps(iy1,jy1);
2411 dz11 = _mm256_sub_ps(iz1,jz1);
2412 dx12 = _mm256_sub_ps(ix1,jx2);
2413 dy12 = _mm256_sub_ps(iy1,jy2);
2414 dz12 = _mm256_sub_ps(iz1,jz2);
2415 dx20 = _mm256_sub_ps(ix2,jx0);
2416 dy20 = _mm256_sub_ps(iy2,jy0);
2417 dz20 = _mm256_sub_ps(iz2,jz0);
2418 dx21 = _mm256_sub_ps(ix2,jx1);
2419 dy21 = _mm256_sub_ps(iy2,jy1);
2420 dz21 = _mm256_sub_ps(iz2,jz1);
2421 dx22 = _mm256_sub_ps(ix2,jx2);
2422 dy22 = _mm256_sub_ps(iy2,jy2);
2423 dz22 = _mm256_sub_ps(iz2,jz2);
2425 /* Calculate squared distance and things based on it */
2426 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2427 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2428 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2429 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2430 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2431 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2432 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2433 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2434 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2436 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2437 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2438 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2439 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2440 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2441 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2442 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2443 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2444 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2446 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2447 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
2448 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
2449 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
2450 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2451 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2452 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
2453 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2454 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2456 fjx0 = _mm256_setzero_ps();
2457 fjy0 = _mm256_setzero_ps();
2458 fjz0 = _mm256_setzero_ps();
2459 fjx1 = _mm256_setzero_ps();
2460 fjy1 = _mm256_setzero_ps();
2461 fjz1 = _mm256_setzero_ps();
2462 fjx2 = _mm256_setzero_ps();
2463 fjy2 = _mm256_setzero_ps();
2464 fjz2 = _mm256_setzero_ps();
2466 /**************************
2467 * CALCULATE INTERACTIONS *
2468 **************************/
2470 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2473 r00 = _mm256_mul_ps(rsq00,rinv00);
2474 r00 = _mm256_andnot_ps(dummy_mask,r00);
2476 /* EWALD ELECTROSTATICS */
2478 /* Analytical PME correction */
2479 zeta2 = _mm256_mul_ps(beta2,rsq00);
2480 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
2481 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2482 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2483 felec = _mm256_mul_ps(qq00,felec);
2484 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2485 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2486 velec = _mm256_sub_ps(rinv00,pmecorrV);
2487 velec = _mm256_mul_ps(qq00,velec);
2489 d = _mm256_sub_ps(r00,rswitch);
2490 d = _mm256_max_ps(d,_mm256_setzero_ps());
2491 d2 = _mm256_mul_ps(d,d);
2492 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2494 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2496 /* Evaluate switch function */
2497 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2498 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
2499 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2503 fscal = _mm256_and_ps(fscal,cutoff_mask);
2505 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2507 /* Calculate temporary vectorial force */
2508 tx = _mm256_mul_ps(fscal,dx00);
2509 ty = _mm256_mul_ps(fscal,dy00);
2510 tz = _mm256_mul_ps(fscal,dz00);
2512 /* Update vectorial force */
2513 fix0 = _mm256_add_ps(fix0,tx);
2514 fiy0 = _mm256_add_ps(fiy0,ty);
2515 fiz0 = _mm256_add_ps(fiz0,tz);
2517 fjx0 = _mm256_add_ps(fjx0,tx);
2518 fjy0 = _mm256_add_ps(fjy0,ty);
2519 fjz0 = _mm256_add_ps(fjz0,tz);
2523 /**************************
2524 * CALCULATE INTERACTIONS *
2525 **************************/
2527 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2530 r01 = _mm256_mul_ps(rsq01,rinv01);
2531 r01 = _mm256_andnot_ps(dummy_mask,r01);
2533 /* EWALD ELECTROSTATICS */
2535 /* Analytical PME correction */
2536 zeta2 = _mm256_mul_ps(beta2,rsq01);
2537 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
2538 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2539 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2540 felec = _mm256_mul_ps(qq01,felec);
2541 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2542 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2543 velec = _mm256_sub_ps(rinv01,pmecorrV);
2544 velec = _mm256_mul_ps(qq01,velec);
2546 d = _mm256_sub_ps(r01,rswitch);
2547 d = _mm256_max_ps(d,_mm256_setzero_ps());
2548 d2 = _mm256_mul_ps(d,d);
2549 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2551 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2553 /* Evaluate switch function */
2554 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2555 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
2556 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2560 fscal = _mm256_and_ps(fscal,cutoff_mask);
2562 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2564 /* Calculate temporary vectorial force */
2565 tx = _mm256_mul_ps(fscal,dx01);
2566 ty = _mm256_mul_ps(fscal,dy01);
2567 tz = _mm256_mul_ps(fscal,dz01);
2569 /* Update vectorial force */
2570 fix0 = _mm256_add_ps(fix0,tx);
2571 fiy0 = _mm256_add_ps(fiy0,ty);
2572 fiz0 = _mm256_add_ps(fiz0,tz);
2574 fjx1 = _mm256_add_ps(fjx1,tx);
2575 fjy1 = _mm256_add_ps(fjy1,ty);
2576 fjz1 = _mm256_add_ps(fjz1,tz);
2580 /**************************
2581 * CALCULATE INTERACTIONS *
2582 **************************/
2584 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2587 r02 = _mm256_mul_ps(rsq02,rinv02);
2588 r02 = _mm256_andnot_ps(dummy_mask,r02);
2590 /* EWALD ELECTROSTATICS */
2592 /* Analytical PME correction */
2593 zeta2 = _mm256_mul_ps(beta2,rsq02);
2594 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
2595 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2596 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2597 felec = _mm256_mul_ps(qq02,felec);
2598 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2599 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2600 velec = _mm256_sub_ps(rinv02,pmecorrV);
2601 velec = _mm256_mul_ps(qq02,velec);
2603 d = _mm256_sub_ps(r02,rswitch);
2604 d = _mm256_max_ps(d,_mm256_setzero_ps());
2605 d2 = _mm256_mul_ps(d,d);
2606 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2608 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2610 /* Evaluate switch function */
2611 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2612 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
2613 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2617 fscal = _mm256_and_ps(fscal,cutoff_mask);
2619 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2621 /* Calculate temporary vectorial force */
2622 tx = _mm256_mul_ps(fscal,dx02);
2623 ty = _mm256_mul_ps(fscal,dy02);
2624 tz = _mm256_mul_ps(fscal,dz02);
2626 /* Update vectorial force */
2627 fix0 = _mm256_add_ps(fix0,tx);
2628 fiy0 = _mm256_add_ps(fiy0,ty);
2629 fiz0 = _mm256_add_ps(fiz0,tz);
2631 fjx2 = _mm256_add_ps(fjx2,tx);
2632 fjy2 = _mm256_add_ps(fjy2,ty);
2633 fjz2 = _mm256_add_ps(fjz2,tz);
2637 /**************************
2638 * CALCULATE INTERACTIONS *
2639 **************************/
2641 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2644 r10 = _mm256_mul_ps(rsq10,rinv10);
2645 r10 = _mm256_andnot_ps(dummy_mask,r10);
2647 /* EWALD ELECTROSTATICS */
2649 /* Analytical PME correction */
2650 zeta2 = _mm256_mul_ps(beta2,rsq10);
2651 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
2652 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2653 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2654 felec = _mm256_mul_ps(qq10,felec);
2655 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2656 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2657 velec = _mm256_sub_ps(rinv10,pmecorrV);
2658 velec = _mm256_mul_ps(qq10,velec);
2660 d = _mm256_sub_ps(r10,rswitch);
2661 d = _mm256_max_ps(d,_mm256_setzero_ps());
2662 d2 = _mm256_mul_ps(d,d);
2663 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2665 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2667 /* Evaluate switch function */
2668 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2669 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2670 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2674 fscal = _mm256_and_ps(fscal,cutoff_mask);
2676 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2678 /* Calculate temporary vectorial force */
2679 tx = _mm256_mul_ps(fscal,dx10);
2680 ty = _mm256_mul_ps(fscal,dy10);
2681 tz = _mm256_mul_ps(fscal,dz10);
2683 /* Update vectorial force */
2684 fix1 = _mm256_add_ps(fix1,tx);
2685 fiy1 = _mm256_add_ps(fiy1,ty);
2686 fiz1 = _mm256_add_ps(fiz1,tz);
2688 fjx0 = _mm256_add_ps(fjx0,tx);
2689 fjy0 = _mm256_add_ps(fjy0,ty);
2690 fjz0 = _mm256_add_ps(fjz0,tz);
2694 /**************************
2695 * CALCULATE INTERACTIONS *
2696 **************************/
2698 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2701 r11 = _mm256_mul_ps(rsq11,rinv11);
2702 r11 = _mm256_andnot_ps(dummy_mask,r11);
2704 /* EWALD ELECTROSTATICS */
2706 /* Analytical PME correction */
2707 zeta2 = _mm256_mul_ps(beta2,rsq11);
2708 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2709 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2710 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2711 felec = _mm256_mul_ps(qq11,felec);
2712 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2713 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2714 velec = _mm256_sub_ps(rinv11,pmecorrV);
2715 velec = _mm256_mul_ps(qq11,velec);
2717 d = _mm256_sub_ps(r11,rswitch);
2718 d = _mm256_max_ps(d,_mm256_setzero_ps());
2719 d2 = _mm256_mul_ps(d,d);
2720 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2722 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2724 /* Evaluate switch function */
2725 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2726 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2727 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2731 fscal = _mm256_and_ps(fscal,cutoff_mask);
2733 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2735 /* Calculate temporary vectorial force */
2736 tx = _mm256_mul_ps(fscal,dx11);
2737 ty = _mm256_mul_ps(fscal,dy11);
2738 tz = _mm256_mul_ps(fscal,dz11);
2740 /* Update vectorial force */
2741 fix1 = _mm256_add_ps(fix1,tx);
2742 fiy1 = _mm256_add_ps(fiy1,ty);
2743 fiz1 = _mm256_add_ps(fiz1,tz);
2745 fjx1 = _mm256_add_ps(fjx1,tx);
2746 fjy1 = _mm256_add_ps(fjy1,ty);
2747 fjz1 = _mm256_add_ps(fjz1,tz);
2751 /**************************
2752 * CALCULATE INTERACTIONS *
2753 **************************/
2755 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2758 r12 = _mm256_mul_ps(rsq12,rinv12);
2759 r12 = _mm256_andnot_ps(dummy_mask,r12);
2761 /* EWALD ELECTROSTATICS */
2763 /* Analytical PME correction */
2764 zeta2 = _mm256_mul_ps(beta2,rsq12);
2765 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2766 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2767 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2768 felec = _mm256_mul_ps(qq12,felec);
2769 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2770 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2771 velec = _mm256_sub_ps(rinv12,pmecorrV);
2772 velec = _mm256_mul_ps(qq12,velec);
2774 d = _mm256_sub_ps(r12,rswitch);
2775 d = _mm256_max_ps(d,_mm256_setzero_ps());
2776 d2 = _mm256_mul_ps(d,d);
2777 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2779 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2781 /* Evaluate switch function */
2782 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2783 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2784 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2788 fscal = _mm256_and_ps(fscal,cutoff_mask);
2790 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2792 /* Calculate temporary vectorial force */
2793 tx = _mm256_mul_ps(fscal,dx12);
2794 ty = _mm256_mul_ps(fscal,dy12);
2795 tz = _mm256_mul_ps(fscal,dz12);
2797 /* Update vectorial force */
2798 fix1 = _mm256_add_ps(fix1,tx);
2799 fiy1 = _mm256_add_ps(fiy1,ty);
2800 fiz1 = _mm256_add_ps(fiz1,tz);
2802 fjx2 = _mm256_add_ps(fjx2,tx);
2803 fjy2 = _mm256_add_ps(fjy2,ty);
2804 fjz2 = _mm256_add_ps(fjz2,tz);
2808 /**************************
2809 * CALCULATE INTERACTIONS *
2810 **************************/
2812 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2815 r20 = _mm256_mul_ps(rsq20,rinv20);
2816 r20 = _mm256_andnot_ps(dummy_mask,r20);
2818 /* EWALD ELECTROSTATICS */
2820 /* Analytical PME correction */
2821 zeta2 = _mm256_mul_ps(beta2,rsq20);
2822 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
2823 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2824 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2825 felec = _mm256_mul_ps(qq20,felec);
2826 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2827 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2828 velec = _mm256_sub_ps(rinv20,pmecorrV);
2829 velec = _mm256_mul_ps(qq20,velec);
2831 d = _mm256_sub_ps(r20,rswitch);
2832 d = _mm256_max_ps(d,_mm256_setzero_ps());
2833 d2 = _mm256_mul_ps(d,d);
2834 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2836 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2838 /* Evaluate switch function */
2839 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2840 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
2841 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2845 fscal = _mm256_and_ps(fscal,cutoff_mask);
2847 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2849 /* Calculate temporary vectorial force */
2850 tx = _mm256_mul_ps(fscal,dx20);
2851 ty = _mm256_mul_ps(fscal,dy20);
2852 tz = _mm256_mul_ps(fscal,dz20);
2854 /* Update vectorial force */
2855 fix2 = _mm256_add_ps(fix2,tx);
2856 fiy2 = _mm256_add_ps(fiy2,ty);
2857 fiz2 = _mm256_add_ps(fiz2,tz);
2859 fjx0 = _mm256_add_ps(fjx0,tx);
2860 fjy0 = _mm256_add_ps(fjy0,ty);
2861 fjz0 = _mm256_add_ps(fjz0,tz);
2865 /**************************
2866 * CALCULATE INTERACTIONS *
2867 **************************/
2869 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2872 r21 = _mm256_mul_ps(rsq21,rinv21);
2873 r21 = _mm256_andnot_ps(dummy_mask,r21);
2875 /* EWALD ELECTROSTATICS */
2877 /* Analytical PME correction */
2878 zeta2 = _mm256_mul_ps(beta2,rsq21);
2879 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2880 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2881 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2882 felec = _mm256_mul_ps(qq21,felec);
2883 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2884 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2885 velec = _mm256_sub_ps(rinv21,pmecorrV);
2886 velec = _mm256_mul_ps(qq21,velec);
2888 d = _mm256_sub_ps(r21,rswitch);
2889 d = _mm256_max_ps(d,_mm256_setzero_ps());
2890 d2 = _mm256_mul_ps(d,d);
2891 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2893 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2895 /* Evaluate switch function */
2896 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2897 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2898 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2902 fscal = _mm256_and_ps(fscal,cutoff_mask);
2904 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2906 /* Calculate temporary vectorial force */
2907 tx = _mm256_mul_ps(fscal,dx21);
2908 ty = _mm256_mul_ps(fscal,dy21);
2909 tz = _mm256_mul_ps(fscal,dz21);
2911 /* Update vectorial force */
2912 fix2 = _mm256_add_ps(fix2,tx);
2913 fiy2 = _mm256_add_ps(fiy2,ty);
2914 fiz2 = _mm256_add_ps(fiz2,tz);
2916 fjx1 = _mm256_add_ps(fjx1,tx);
2917 fjy1 = _mm256_add_ps(fjy1,ty);
2918 fjz1 = _mm256_add_ps(fjz1,tz);
2922 /**************************
2923 * CALCULATE INTERACTIONS *
2924 **************************/
2926 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2929 r22 = _mm256_mul_ps(rsq22,rinv22);
2930 r22 = _mm256_andnot_ps(dummy_mask,r22);
2932 /* EWALD ELECTROSTATICS */
2934 /* Analytical PME correction */
2935 zeta2 = _mm256_mul_ps(beta2,rsq22);
2936 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2937 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2938 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2939 felec = _mm256_mul_ps(qq22,felec);
2940 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2941 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2942 velec = _mm256_sub_ps(rinv22,pmecorrV);
2943 velec = _mm256_mul_ps(qq22,velec);
2945 d = _mm256_sub_ps(r22,rswitch);
2946 d = _mm256_max_ps(d,_mm256_setzero_ps());
2947 d2 = _mm256_mul_ps(d,d);
2948 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2950 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2952 /* Evaluate switch function */
2953 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2954 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2955 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2959 fscal = _mm256_and_ps(fscal,cutoff_mask);
2961 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2963 /* Calculate temporary vectorial force */
2964 tx = _mm256_mul_ps(fscal,dx22);
2965 ty = _mm256_mul_ps(fscal,dy22);
2966 tz = _mm256_mul_ps(fscal,dz22);
2968 /* Update vectorial force */
2969 fix2 = _mm256_add_ps(fix2,tx);
2970 fiy2 = _mm256_add_ps(fiy2,ty);
2971 fiz2 = _mm256_add_ps(fiz2,tz);
2973 fjx2 = _mm256_add_ps(fjx2,tx);
2974 fjy2 = _mm256_add_ps(fjy2,ty);
2975 fjz2 = _mm256_add_ps(fjz2,tz);
2979 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2980 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2981 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2982 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2983 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2984 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2985 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2986 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2988 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2989 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2991 /* Inner loop uses 954 flops */
2994 /* End of innermost loop */
2996 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2997 f+i_coord_offset,fshift+i_shift_offset);
2999 /* Increment number of inner iterations */
3000 inneriter += j_index_end - j_index_start;
3002 /* Outer loop uses 18 flops */
3005 /* Increment number of outer iterations */
3008 /* Update outer/inner flops */
3010 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*954);