2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr1;
73 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
79 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
81 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
83 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
84 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
85 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
86 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
87 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
88 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
89 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
90 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
91 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
92 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128i ewitab_lo,ewitab_hi;
97 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
100 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
101 real rswitch_scalar,d_scalar;
102 __m256 dummy_mask,cutoff_mask;
103 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
104 __m256 one = _mm256_set1_ps(1.0);
105 __m256 two = _mm256_set1_ps(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm256_set1_ps(fr->epsfac);
118 charge = mdatoms->chargeA;
120 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
121 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
122 beta2 = _mm256_mul_ps(beta,beta);
123 beta3 = _mm256_mul_ps(beta,beta2);
125 ewtab = fr->ic->tabq_coul_FDV0;
126 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
127 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
129 /* Setup water-specific parameters */
130 inr = nlist->iinr[0];
131 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
132 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
133 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
135 jq1 = _mm256_set1_ps(charge[inr+1]);
136 jq2 = _mm256_set1_ps(charge[inr+2]);
137 jq3 = _mm256_set1_ps(charge[inr+3]);
138 qq11 = _mm256_mul_ps(iq1,jq1);
139 qq12 = _mm256_mul_ps(iq1,jq2);
140 qq13 = _mm256_mul_ps(iq1,jq3);
141 qq21 = _mm256_mul_ps(iq2,jq1);
142 qq22 = _mm256_mul_ps(iq2,jq2);
143 qq23 = _mm256_mul_ps(iq2,jq3);
144 qq31 = _mm256_mul_ps(iq3,jq1);
145 qq32 = _mm256_mul_ps(iq3,jq2);
146 qq33 = _mm256_mul_ps(iq3,jq3);
148 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
149 rcutoff_scalar = fr->rcoulomb;
150 rcutoff = _mm256_set1_ps(rcutoff_scalar);
151 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
153 rswitch_scalar = fr->rcoulomb_switch;
154 rswitch = _mm256_set1_ps(rswitch_scalar);
155 /* Setup switch parameters */
156 d_scalar = rcutoff_scalar-rswitch_scalar;
157 d = _mm256_set1_ps(d_scalar);
158 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
159 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
160 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
161 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
162 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
163 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165 /* Avoid stupid compiler warnings */
166 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
179 for(iidx=0;iidx<4*DIM;iidx++)
184 /* Start outer loop over neighborlists */
185 for(iidx=0; iidx<nri; iidx++)
187 /* Load shift vector for this list */
188 i_shift_offset = DIM*shiftidx[iidx];
190 /* Load limits for loop over neighbors */
191 j_index_start = jindex[iidx];
192 j_index_end = jindex[iidx+1];
194 /* Get outer coordinate index */
196 i_coord_offset = DIM*inr;
198 /* Load i particle coords and add shift vector */
199 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
200 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
202 fix1 = _mm256_setzero_ps();
203 fiy1 = _mm256_setzero_ps();
204 fiz1 = _mm256_setzero_ps();
205 fix2 = _mm256_setzero_ps();
206 fiy2 = _mm256_setzero_ps();
207 fiz2 = _mm256_setzero_ps();
208 fix3 = _mm256_setzero_ps();
209 fiy3 = _mm256_setzero_ps();
210 fiz3 = _mm256_setzero_ps();
212 /* Reset potential sums */
213 velecsum = _mm256_setzero_ps();
215 /* Start inner kernel loop */
216 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
219 /* Get j neighbor index, and coordinate index */
228 j_coord_offsetA = DIM*jnrA;
229 j_coord_offsetB = DIM*jnrB;
230 j_coord_offsetC = DIM*jnrC;
231 j_coord_offsetD = DIM*jnrD;
232 j_coord_offsetE = DIM*jnrE;
233 j_coord_offsetF = DIM*jnrF;
234 j_coord_offsetG = DIM*jnrG;
235 j_coord_offsetH = DIM*jnrH;
237 /* load j atom coordinates */
238 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
239 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
240 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
241 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
242 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
244 /* Calculate displacement vector */
245 dx11 = _mm256_sub_ps(ix1,jx1);
246 dy11 = _mm256_sub_ps(iy1,jy1);
247 dz11 = _mm256_sub_ps(iz1,jz1);
248 dx12 = _mm256_sub_ps(ix1,jx2);
249 dy12 = _mm256_sub_ps(iy1,jy2);
250 dz12 = _mm256_sub_ps(iz1,jz2);
251 dx13 = _mm256_sub_ps(ix1,jx3);
252 dy13 = _mm256_sub_ps(iy1,jy3);
253 dz13 = _mm256_sub_ps(iz1,jz3);
254 dx21 = _mm256_sub_ps(ix2,jx1);
255 dy21 = _mm256_sub_ps(iy2,jy1);
256 dz21 = _mm256_sub_ps(iz2,jz1);
257 dx22 = _mm256_sub_ps(ix2,jx2);
258 dy22 = _mm256_sub_ps(iy2,jy2);
259 dz22 = _mm256_sub_ps(iz2,jz2);
260 dx23 = _mm256_sub_ps(ix2,jx3);
261 dy23 = _mm256_sub_ps(iy2,jy3);
262 dz23 = _mm256_sub_ps(iz2,jz3);
263 dx31 = _mm256_sub_ps(ix3,jx1);
264 dy31 = _mm256_sub_ps(iy3,jy1);
265 dz31 = _mm256_sub_ps(iz3,jz1);
266 dx32 = _mm256_sub_ps(ix3,jx2);
267 dy32 = _mm256_sub_ps(iy3,jy2);
268 dz32 = _mm256_sub_ps(iz3,jz2);
269 dx33 = _mm256_sub_ps(ix3,jx3);
270 dy33 = _mm256_sub_ps(iy3,jy3);
271 dz33 = _mm256_sub_ps(iz3,jz3);
273 /* Calculate squared distance and things based on it */
274 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
275 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
276 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
277 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
278 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
279 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
280 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
281 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
282 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
284 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
285 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
286 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
287 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
288 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
289 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
290 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
291 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
292 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
294 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
295 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
296 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
297 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
298 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
299 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
300 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
301 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
302 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
304 fjx1 = _mm256_setzero_ps();
305 fjy1 = _mm256_setzero_ps();
306 fjz1 = _mm256_setzero_ps();
307 fjx2 = _mm256_setzero_ps();
308 fjy2 = _mm256_setzero_ps();
309 fjz2 = _mm256_setzero_ps();
310 fjx3 = _mm256_setzero_ps();
311 fjy3 = _mm256_setzero_ps();
312 fjz3 = _mm256_setzero_ps();
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 if (gmx_mm256_any_lt(rsq11,rcutoff2))
321 r11 = _mm256_mul_ps(rsq11,rinv11);
323 /* EWALD ELECTROSTATICS */
325 /* Analytical PME correction */
326 zeta2 = _mm256_mul_ps(beta2,rsq11);
327 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
328 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
329 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
330 felec = _mm256_mul_ps(qq11,felec);
331 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
332 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
333 velec = _mm256_sub_ps(rinv11,pmecorrV);
334 velec = _mm256_mul_ps(qq11,velec);
336 d = _mm256_sub_ps(r11,rswitch);
337 d = _mm256_max_ps(d,_mm256_setzero_ps());
338 d2 = _mm256_mul_ps(d,d);
339 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
341 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
343 /* Evaluate switch function */
344 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
345 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
346 velec = _mm256_mul_ps(velec,sw);
347 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velec = _mm256_and_ps(velec,cutoff_mask);
351 velecsum = _mm256_add_ps(velecsum,velec);
355 fscal = _mm256_and_ps(fscal,cutoff_mask);
357 /* Calculate temporary vectorial force */
358 tx = _mm256_mul_ps(fscal,dx11);
359 ty = _mm256_mul_ps(fscal,dy11);
360 tz = _mm256_mul_ps(fscal,dz11);
362 /* Update vectorial force */
363 fix1 = _mm256_add_ps(fix1,tx);
364 fiy1 = _mm256_add_ps(fiy1,ty);
365 fiz1 = _mm256_add_ps(fiz1,tz);
367 fjx1 = _mm256_add_ps(fjx1,tx);
368 fjy1 = _mm256_add_ps(fjy1,ty);
369 fjz1 = _mm256_add_ps(fjz1,tz);
373 /**************************
374 * CALCULATE INTERACTIONS *
375 **************************/
377 if (gmx_mm256_any_lt(rsq12,rcutoff2))
380 r12 = _mm256_mul_ps(rsq12,rinv12);
382 /* EWALD ELECTROSTATICS */
384 /* Analytical PME correction */
385 zeta2 = _mm256_mul_ps(beta2,rsq12);
386 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
387 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
388 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
389 felec = _mm256_mul_ps(qq12,felec);
390 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
391 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
392 velec = _mm256_sub_ps(rinv12,pmecorrV);
393 velec = _mm256_mul_ps(qq12,velec);
395 d = _mm256_sub_ps(r12,rswitch);
396 d = _mm256_max_ps(d,_mm256_setzero_ps());
397 d2 = _mm256_mul_ps(d,d);
398 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
400 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
402 /* Evaluate switch function */
403 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
404 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
405 velec = _mm256_mul_ps(velec,sw);
406 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
408 /* Update potential sum for this i atom from the interaction with this j atom. */
409 velec = _mm256_and_ps(velec,cutoff_mask);
410 velecsum = _mm256_add_ps(velecsum,velec);
414 fscal = _mm256_and_ps(fscal,cutoff_mask);
416 /* Calculate temporary vectorial force */
417 tx = _mm256_mul_ps(fscal,dx12);
418 ty = _mm256_mul_ps(fscal,dy12);
419 tz = _mm256_mul_ps(fscal,dz12);
421 /* Update vectorial force */
422 fix1 = _mm256_add_ps(fix1,tx);
423 fiy1 = _mm256_add_ps(fiy1,ty);
424 fiz1 = _mm256_add_ps(fiz1,tz);
426 fjx2 = _mm256_add_ps(fjx2,tx);
427 fjy2 = _mm256_add_ps(fjy2,ty);
428 fjz2 = _mm256_add_ps(fjz2,tz);
432 /**************************
433 * CALCULATE INTERACTIONS *
434 **************************/
436 if (gmx_mm256_any_lt(rsq13,rcutoff2))
439 r13 = _mm256_mul_ps(rsq13,rinv13);
441 /* EWALD ELECTROSTATICS */
443 /* Analytical PME correction */
444 zeta2 = _mm256_mul_ps(beta2,rsq13);
445 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
446 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
447 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
448 felec = _mm256_mul_ps(qq13,felec);
449 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
450 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
451 velec = _mm256_sub_ps(rinv13,pmecorrV);
452 velec = _mm256_mul_ps(qq13,velec);
454 d = _mm256_sub_ps(r13,rswitch);
455 d = _mm256_max_ps(d,_mm256_setzero_ps());
456 d2 = _mm256_mul_ps(d,d);
457 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
459 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
461 /* Evaluate switch function */
462 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
463 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
464 velec = _mm256_mul_ps(velec,sw);
465 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velec = _mm256_and_ps(velec,cutoff_mask);
469 velecsum = _mm256_add_ps(velecsum,velec);
473 fscal = _mm256_and_ps(fscal,cutoff_mask);
475 /* Calculate temporary vectorial force */
476 tx = _mm256_mul_ps(fscal,dx13);
477 ty = _mm256_mul_ps(fscal,dy13);
478 tz = _mm256_mul_ps(fscal,dz13);
480 /* Update vectorial force */
481 fix1 = _mm256_add_ps(fix1,tx);
482 fiy1 = _mm256_add_ps(fiy1,ty);
483 fiz1 = _mm256_add_ps(fiz1,tz);
485 fjx3 = _mm256_add_ps(fjx3,tx);
486 fjy3 = _mm256_add_ps(fjy3,ty);
487 fjz3 = _mm256_add_ps(fjz3,tz);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 if (gmx_mm256_any_lt(rsq21,rcutoff2))
498 r21 = _mm256_mul_ps(rsq21,rinv21);
500 /* EWALD ELECTROSTATICS */
502 /* Analytical PME correction */
503 zeta2 = _mm256_mul_ps(beta2,rsq21);
504 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
505 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
506 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
507 felec = _mm256_mul_ps(qq21,felec);
508 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
509 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
510 velec = _mm256_sub_ps(rinv21,pmecorrV);
511 velec = _mm256_mul_ps(qq21,velec);
513 d = _mm256_sub_ps(r21,rswitch);
514 d = _mm256_max_ps(d,_mm256_setzero_ps());
515 d2 = _mm256_mul_ps(d,d);
516 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
518 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
520 /* Evaluate switch function */
521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
522 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
523 velec = _mm256_mul_ps(velec,sw);
524 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
526 /* Update potential sum for this i atom from the interaction with this j atom. */
527 velec = _mm256_and_ps(velec,cutoff_mask);
528 velecsum = _mm256_add_ps(velecsum,velec);
532 fscal = _mm256_and_ps(fscal,cutoff_mask);
534 /* Calculate temporary vectorial force */
535 tx = _mm256_mul_ps(fscal,dx21);
536 ty = _mm256_mul_ps(fscal,dy21);
537 tz = _mm256_mul_ps(fscal,dz21);
539 /* Update vectorial force */
540 fix2 = _mm256_add_ps(fix2,tx);
541 fiy2 = _mm256_add_ps(fiy2,ty);
542 fiz2 = _mm256_add_ps(fiz2,tz);
544 fjx1 = _mm256_add_ps(fjx1,tx);
545 fjy1 = _mm256_add_ps(fjy1,ty);
546 fjz1 = _mm256_add_ps(fjz1,tz);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 if (gmx_mm256_any_lt(rsq22,rcutoff2))
557 r22 = _mm256_mul_ps(rsq22,rinv22);
559 /* EWALD ELECTROSTATICS */
561 /* Analytical PME correction */
562 zeta2 = _mm256_mul_ps(beta2,rsq22);
563 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
564 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
565 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
566 felec = _mm256_mul_ps(qq22,felec);
567 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
568 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
569 velec = _mm256_sub_ps(rinv22,pmecorrV);
570 velec = _mm256_mul_ps(qq22,velec);
572 d = _mm256_sub_ps(r22,rswitch);
573 d = _mm256_max_ps(d,_mm256_setzero_ps());
574 d2 = _mm256_mul_ps(d,d);
575 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
577 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
579 /* Evaluate switch function */
580 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
581 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
582 velec = _mm256_mul_ps(velec,sw);
583 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velec = _mm256_and_ps(velec,cutoff_mask);
587 velecsum = _mm256_add_ps(velecsum,velec);
591 fscal = _mm256_and_ps(fscal,cutoff_mask);
593 /* Calculate temporary vectorial force */
594 tx = _mm256_mul_ps(fscal,dx22);
595 ty = _mm256_mul_ps(fscal,dy22);
596 tz = _mm256_mul_ps(fscal,dz22);
598 /* Update vectorial force */
599 fix2 = _mm256_add_ps(fix2,tx);
600 fiy2 = _mm256_add_ps(fiy2,ty);
601 fiz2 = _mm256_add_ps(fiz2,tz);
603 fjx2 = _mm256_add_ps(fjx2,tx);
604 fjy2 = _mm256_add_ps(fjy2,ty);
605 fjz2 = _mm256_add_ps(fjz2,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 if (gmx_mm256_any_lt(rsq23,rcutoff2))
616 r23 = _mm256_mul_ps(rsq23,rinv23);
618 /* EWALD ELECTROSTATICS */
620 /* Analytical PME correction */
621 zeta2 = _mm256_mul_ps(beta2,rsq23);
622 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
623 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
624 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
625 felec = _mm256_mul_ps(qq23,felec);
626 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
627 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
628 velec = _mm256_sub_ps(rinv23,pmecorrV);
629 velec = _mm256_mul_ps(qq23,velec);
631 d = _mm256_sub_ps(r23,rswitch);
632 d = _mm256_max_ps(d,_mm256_setzero_ps());
633 d2 = _mm256_mul_ps(d,d);
634 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
636 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
638 /* Evaluate switch function */
639 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
640 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
641 velec = _mm256_mul_ps(velec,sw);
642 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velec = _mm256_and_ps(velec,cutoff_mask);
646 velecsum = _mm256_add_ps(velecsum,velec);
650 fscal = _mm256_and_ps(fscal,cutoff_mask);
652 /* Calculate temporary vectorial force */
653 tx = _mm256_mul_ps(fscal,dx23);
654 ty = _mm256_mul_ps(fscal,dy23);
655 tz = _mm256_mul_ps(fscal,dz23);
657 /* Update vectorial force */
658 fix2 = _mm256_add_ps(fix2,tx);
659 fiy2 = _mm256_add_ps(fiy2,ty);
660 fiz2 = _mm256_add_ps(fiz2,tz);
662 fjx3 = _mm256_add_ps(fjx3,tx);
663 fjy3 = _mm256_add_ps(fjy3,ty);
664 fjz3 = _mm256_add_ps(fjz3,tz);
668 /**************************
669 * CALCULATE INTERACTIONS *
670 **************************/
672 if (gmx_mm256_any_lt(rsq31,rcutoff2))
675 r31 = _mm256_mul_ps(rsq31,rinv31);
677 /* EWALD ELECTROSTATICS */
679 /* Analytical PME correction */
680 zeta2 = _mm256_mul_ps(beta2,rsq31);
681 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
682 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
683 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
684 felec = _mm256_mul_ps(qq31,felec);
685 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
686 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
687 velec = _mm256_sub_ps(rinv31,pmecorrV);
688 velec = _mm256_mul_ps(qq31,velec);
690 d = _mm256_sub_ps(r31,rswitch);
691 d = _mm256_max_ps(d,_mm256_setzero_ps());
692 d2 = _mm256_mul_ps(d,d);
693 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
695 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
697 /* Evaluate switch function */
698 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
699 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
700 velec = _mm256_mul_ps(velec,sw);
701 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
703 /* Update potential sum for this i atom from the interaction with this j atom. */
704 velec = _mm256_and_ps(velec,cutoff_mask);
705 velecsum = _mm256_add_ps(velecsum,velec);
709 fscal = _mm256_and_ps(fscal,cutoff_mask);
711 /* Calculate temporary vectorial force */
712 tx = _mm256_mul_ps(fscal,dx31);
713 ty = _mm256_mul_ps(fscal,dy31);
714 tz = _mm256_mul_ps(fscal,dz31);
716 /* Update vectorial force */
717 fix3 = _mm256_add_ps(fix3,tx);
718 fiy3 = _mm256_add_ps(fiy3,ty);
719 fiz3 = _mm256_add_ps(fiz3,tz);
721 fjx1 = _mm256_add_ps(fjx1,tx);
722 fjy1 = _mm256_add_ps(fjy1,ty);
723 fjz1 = _mm256_add_ps(fjz1,tz);
727 /**************************
728 * CALCULATE INTERACTIONS *
729 **************************/
731 if (gmx_mm256_any_lt(rsq32,rcutoff2))
734 r32 = _mm256_mul_ps(rsq32,rinv32);
736 /* EWALD ELECTROSTATICS */
738 /* Analytical PME correction */
739 zeta2 = _mm256_mul_ps(beta2,rsq32);
740 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
741 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
742 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
743 felec = _mm256_mul_ps(qq32,felec);
744 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
745 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
746 velec = _mm256_sub_ps(rinv32,pmecorrV);
747 velec = _mm256_mul_ps(qq32,velec);
749 d = _mm256_sub_ps(r32,rswitch);
750 d = _mm256_max_ps(d,_mm256_setzero_ps());
751 d2 = _mm256_mul_ps(d,d);
752 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
754 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
756 /* Evaluate switch function */
757 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
758 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
759 velec = _mm256_mul_ps(velec,sw);
760 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
762 /* Update potential sum for this i atom from the interaction with this j atom. */
763 velec = _mm256_and_ps(velec,cutoff_mask);
764 velecsum = _mm256_add_ps(velecsum,velec);
768 fscal = _mm256_and_ps(fscal,cutoff_mask);
770 /* Calculate temporary vectorial force */
771 tx = _mm256_mul_ps(fscal,dx32);
772 ty = _mm256_mul_ps(fscal,dy32);
773 tz = _mm256_mul_ps(fscal,dz32);
775 /* Update vectorial force */
776 fix3 = _mm256_add_ps(fix3,tx);
777 fiy3 = _mm256_add_ps(fiy3,ty);
778 fiz3 = _mm256_add_ps(fiz3,tz);
780 fjx2 = _mm256_add_ps(fjx2,tx);
781 fjy2 = _mm256_add_ps(fjy2,ty);
782 fjz2 = _mm256_add_ps(fjz2,tz);
786 /**************************
787 * CALCULATE INTERACTIONS *
788 **************************/
790 if (gmx_mm256_any_lt(rsq33,rcutoff2))
793 r33 = _mm256_mul_ps(rsq33,rinv33);
795 /* EWALD ELECTROSTATICS */
797 /* Analytical PME correction */
798 zeta2 = _mm256_mul_ps(beta2,rsq33);
799 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
800 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
801 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
802 felec = _mm256_mul_ps(qq33,felec);
803 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
804 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
805 velec = _mm256_sub_ps(rinv33,pmecorrV);
806 velec = _mm256_mul_ps(qq33,velec);
808 d = _mm256_sub_ps(r33,rswitch);
809 d = _mm256_max_ps(d,_mm256_setzero_ps());
810 d2 = _mm256_mul_ps(d,d);
811 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
813 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
815 /* Evaluate switch function */
816 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
817 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
818 velec = _mm256_mul_ps(velec,sw);
819 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
821 /* Update potential sum for this i atom from the interaction with this j atom. */
822 velec = _mm256_and_ps(velec,cutoff_mask);
823 velecsum = _mm256_add_ps(velecsum,velec);
827 fscal = _mm256_and_ps(fscal,cutoff_mask);
829 /* Calculate temporary vectorial force */
830 tx = _mm256_mul_ps(fscal,dx33);
831 ty = _mm256_mul_ps(fscal,dy33);
832 tz = _mm256_mul_ps(fscal,dz33);
834 /* Update vectorial force */
835 fix3 = _mm256_add_ps(fix3,tx);
836 fiy3 = _mm256_add_ps(fiy3,ty);
837 fiz3 = _mm256_add_ps(fiz3,tz);
839 fjx3 = _mm256_add_ps(fjx3,tx);
840 fjy3 = _mm256_add_ps(fjy3,ty);
841 fjz3 = _mm256_add_ps(fjz3,tz);
845 fjptrA = f+j_coord_offsetA;
846 fjptrB = f+j_coord_offsetB;
847 fjptrC = f+j_coord_offsetC;
848 fjptrD = f+j_coord_offsetD;
849 fjptrE = f+j_coord_offsetE;
850 fjptrF = f+j_coord_offsetF;
851 fjptrG = f+j_coord_offsetG;
852 fjptrH = f+j_coord_offsetH;
854 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
855 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
856 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
858 /* Inner loop uses 972 flops */
864 /* Get j neighbor index, and coordinate index */
865 jnrlistA = jjnr[jidx];
866 jnrlistB = jjnr[jidx+1];
867 jnrlistC = jjnr[jidx+2];
868 jnrlistD = jjnr[jidx+3];
869 jnrlistE = jjnr[jidx+4];
870 jnrlistF = jjnr[jidx+5];
871 jnrlistG = jjnr[jidx+6];
872 jnrlistH = jjnr[jidx+7];
873 /* Sign of each element will be negative for non-real atoms.
874 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
875 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
877 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
878 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
880 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
881 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
882 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
883 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
884 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
885 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
886 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
887 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
888 j_coord_offsetA = DIM*jnrA;
889 j_coord_offsetB = DIM*jnrB;
890 j_coord_offsetC = DIM*jnrC;
891 j_coord_offsetD = DIM*jnrD;
892 j_coord_offsetE = DIM*jnrE;
893 j_coord_offsetF = DIM*jnrF;
894 j_coord_offsetG = DIM*jnrG;
895 j_coord_offsetH = DIM*jnrH;
897 /* load j atom coordinates */
898 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
899 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
900 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
901 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
902 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
904 /* Calculate displacement vector */
905 dx11 = _mm256_sub_ps(ix1,jx1);
906 dy11 = _mm256_sub_ps(iy1,jy1);
907 dz11 = _mm256_sub_ps(iz1,jz1);
908 dx12 = _mm256_sub_ps(ix1,jx2);
909 dy12 = _mm256_sub_ps(iy1,jy2);
910 dz12 = _mm256_sub_ps(iz1,jz2);
911 dx13 = _mm256_sub_ps(ix1,jx3);
912 dy13 = _mm256_sub_ps(iy1,jy3);
913 dz13 = _mm256_sub_ps(iz1,jz3);
914 dx21 = _mm256_sub_ps(ix2,jx1);
915 dy21 = _mm256_sub_ps(iy2,jy1);
916 dz21 = _mm256_sub_ps(iz2,jz1);
917 dx22 = _mm256_sub_ps(ix2,jx2);
918 dy22 = _mm256_sub_ps(iy2,jy2);
919 dz22 = _mm256_sub_ps(iz2,jz2);
920 dx23 = _mm256_sub_ps(ix2,jx3);
921 dy23 = _mm256_sub_ps(iy2,jy3);
922 dz23 = _mm256_sub_ps(iz2,jz3);
923 dx31 = _mm256_sub_ps(ix3,jx1);
924 dy31 = _mm256_sub_ps(iy3,jy1);
925 dz31 = _mm256_sub_ps(iz3,jz1);
926 dx32 = _mm256_sub_ps(ix3,jx2);
927 dy32 = _mm256_sub_ps(iy3,jy2);
928 dz32 = _mm256_sub_ps(iz3,jz2);
929 dx33 = _mm256_sub_ps(ix3,jx3);
930 dy33 = _mm256_sub_ps(iy3,jy3);
931 dz33 = _mm256_sub_ps(iz3,jz3);
933 /* Calculate squared distance and things based on it */
934 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
935 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
936 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
937 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
938 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
939 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
940 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
941 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
942 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
944 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
945 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
946 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
947 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
948 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
949 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
950 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
951 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
952 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
954 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
955 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
956 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
957 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
958 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
959 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
960 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
961 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
962 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
964 fjx1 = _mm256_setzero_ps();
965 fjy1 = _mm256_setzero_ps();
966 fjz1 = _mm256_setzero_ps();
967 fjx2 = _mm256_setzero_ps();
968 fjy2 = _mm256_setzero_ps();
969 fjz2 = _mm256_setzero_ps();
970 fjx3 = _mm256_setzero_ps();
971 fjy3 = _mm256_setzero_ps();
972 fjz3 = _mm256_setzero_ps();
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 if (gmx_mm256_any_lt(rsq11,rcutoff2))
981 r11 = _mm256_mul_ps(rsq11,rinv11);
982 r11 = _mm256_andnot_ps(dummy_mask,r11);
984 /* EWALD ELECTROSTATICS */
986 /* Analytical PME correction */
987 zeta2 = _mm256_mul_ps(beta2,rsq11);
988 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
989 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
990 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
991 felec = _mm256_mul_ps(qq11,felec);
992 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
993 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
994 velec = _mm256_sub_ps(rinv11,pmecorrV);
995 velec = _mm256_mul_ps(qq11,velec);
997 d = _mm256_sub_ps(r11,rswitch);
998 d = _mm256_max_ps(d,_mm256_setzero_ps());
999 d2 = _mm256_mul_ps(d,d);
1000 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1002 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1004 /* Evaluate switch function */
1005 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1006 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1007 velec = _mm256_mul_ps(velec,sw);
1008 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1010 /* Update potential sum for this i atom from the interaction with this j atom. */
1011 velec = _mm256_and_ps(velec,cutoff_mask);
1012 velec = _mm256_andnot_ps(dummy_mask,velec);
1013 velecsum = _mm256_add_ps(velecsum,velec);
1017 fscal = _mm256_and_ps(fscal,cutoff_mask);
1019 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1021 /* Calculate temporary vectorial force */
1022 tx = _mm256_mul_ps(fscal,dx11);
1023 ty = _mm256_mul_ps(fscal,dy11);
1024 tz = _mm256_mul_ps(fscal,dz11);
1026 /* Update vectorial force */
1027 fix1 = _mm256_add_ps(fix1,tx);
1028 fiy1 = _mm256_add_ps(fiy1,ty);
1029 fiz1 = _mm256_add_ps(fiz1,tz);
1031 fjx1 = _mm256_add_ps(fjx1,tx);
1032 fjy1 = _mm256_add_ps(fjy1,ty);
1033 fjz1 = _mm256_add_ps(fjz1,tz);
1037 /**************************
1038 * CALCULATE INTERACTIONS *
1039 **************************/
1041 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1044 r12 = _mm256_mul_ps(rsq12,rinv12);
1045 r12 = _mm256_andnot_ps(dummy_mask,r12);
1047 /* EWALD ELECTROSTATICS */
1049 /* Analytical PME correction */
1050 zeta2 = _mm256_mul_ps(beta2,rsq12);
1051 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1052 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1053 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1054 felec = _mm256_mul_ps(qq12,felec);
1055 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1056 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1057 velec = _mm256_sub_ps(rinv12,pmecorrV);
1058 velec = _mm256_mul_ps(qq12,velec);
1060 d = _mm256_sub_ps(r12,rswitch);
1061 d = _mm256_max_ps(d,_mm256_setzero_ps());
1062 d2 = _mm256_mul_ps(d,d);
1063 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1065 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1067 /* Evaluate switch function */
1068 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1069 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1070 velec = _mm256_mul_ps(velec,sw);
1071 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1073 /* Update potential sum for this i atom from the interaction with this j atom. */
1074 velec = _mm256_and_ps(velec,cutoff_mask);
1075 velec = _mm256_andnot_ps(dummy_mask,velec);
1076 velecsum = _mm256_add_ps(velecsum,velec);
1080 fscal = _mm256_and_ps(fscal,cutoff_mask);
1082 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1084 /* Calculate temporary vectorial force */
1085 tx = _mm256_mul_ps(fscal,dx12);
1086 ty = _mm256_mul_ps(fscal,dy12);
1087 tz = _mm256_mul_ps(fscal,dz12);
1089 /* Update vectorial force */
1090 fix1 = _mm256_add_ps(fix1,tx);
1091 fiy1 = _mm256_add_ps(fiy1,ty);
1092 fiz1 = _mm256_add_ps(fiz1,tz);
1094 fjx2 = _mm256_add_ps(fjx2,tx);
1095 fjy2 = _mm256_add_ps(fjy2,ty);
1096 fjz2 = _mm256_add_ps(fjz2,tz);
1100 /**************************
1101 * CALCULATE INTERACTIONS *
1102 **************************/
1104 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1107 r13 = _mm256_mul_ps(rsq13,rinv13);
1108 r13 = _mm256_andnot_ps(dummy_mask,r13);
1110 /* EWALD ELECTROSTATICS */
1112 /* Analytical PME correction */
1113 zeta2 = _mm256_mul_ps(beta2,rsq13);
1114 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1115 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1116 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1117 felec = _mm256_mul_ps(qq13,felec);
1118 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1119 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1120 velec = _mm256_sub_ps(rinv13,pmecorrV);
1121 velec = _mm256_mul_ps(qq13,velec);
1123 d = _mm256_sub_ps(r13,rswitch);
1124 d = _mm256_max_ps(d,_mm256_setzero_ps());
1125 d2 = _mm256_mul_ps(d,d);
1126 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1128 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1130 /* Evaluate switch function */
1131 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1132 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
1133 velec = _mm256_mul_ps(velec,sw);
1134 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1136 /* Update potential sum for this i atom from the interaction with this j atom. */
1137 velec = _mm256_and_ps(velec,cutoff_mask);
1138 velec = _mm256_andnot_ps(dummy_mask,velec);
1139 velecsum = _mm256_add_ps(velecsum,velec);
1143 fscal = _mm256_and_ps(fscal,cutoff_mask);
1145 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1147 /* Calculate temporary vectorial force */
1148 tx = _mm256_mul_ps(fscal,dx13);
1149 ty = _mm256_mul_ps(fscal,dy13);
1150 tz = _mm256_mul_ps(fscal,dz13);
1152 /* Update vectorial force */
1153 fix1 = _mm256_add_ps(fix1,tx);
1154 fiy1 = _mm256_add_ps(fiy1,ty);
1155 fiz1 = _mm256_add_ps(fiz1,tz);
1157 fjx3 = _mm256_add_ps(fjx3,tx);
1158 fjy3 = _mm256_add_ps(fjy3,ty);
1159 fjz3 = _mm256_add_ps(fjz3,tz);
1163 /**************************
1164 * CALCULATE INTERACTIONS *
1165 **************************/
1167 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1170 r21 = _mm256_mul_ps(rsq21,rinv21);
1171 r21 = _mm256_andnot_ps(dummy_mask,r21);
1173 /* EWALD ELECTROSTATICS */
1175 /* Analytical PME correction */
1176 zeta2 = _mm256_mul_ps(beta2,rsq21);
1177 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1178 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1179 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1180 felec = _mm256_mul_ps(qq21,felec);
1181 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1182 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1183 velec = _mm256_sub_ps(rinv21,pmecorrV);
1184 velec = _mm256_mul_ps(qq21,velec);
1186 d = _mm256_sub_ps(r21,rswitch);
1187 d = _mm256_max_ps(d,_mm256_setzero_ps());
1188 d2 = _mm256_mul_ps(d,d);
1189 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1191 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1193 /* Evaluate switch function */
1194 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1195 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1196 velec = _mm256_mul_ps(velec,sw);
1197 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1199 /* Update potential sum for this i atom from the interaction with this j atom. */
1200 velec = _mm256_and_ps(velec,cutoff_mask);
1201 velec = _mm256_andnot_ps(dummy_mask,velec);
1202 velecsum = _mm256_add_ps(velecsum,velec);
1206 fscal = _mm256_and_ps(fscal,cutoff_mask);
1208 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1210 /* Calculate temporary vectorial force */
1211 tx = _mm256_mul_ps(fscal,dx21);
1212 ty = _mm256_mul_ps(fscal,dy21);
1213 tz = _mm256_mul_ps(fscal,dz21);
1215 /* Update vectorial force */
1216 fix2 = _mm256_add_ps(fix2,tx);
1217 fiy2 = _mm256_add_ps(fiy2,ty);
1218 fiz2 = _mm256_add_ps(fiz2,tz);
1220 fjx1 = _mm256_add_ps(fjx1,tx);
1221 fjy1 = _mm256_add_ps(fjy1,ty);
1222 fjz1 = _mm256_add_ps(fjz1,tz);
1226 /**************************
1227 * CALCULATE INTERACTIONS *
1228 **************************/
1230 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1233 r22 = _mm256_mul_ps(rsq22,rinv22);
1234 r22 = _mm256_andnot_ps(dummy_mask,r22);
1236 /* EWALD ELECTROSTATICS */
1238 /* Analytical PME correction */
1239 zeta2 = _mm256_mul_ps(beta2,rsq22);
1240 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1241 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1242 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1243 felec = _mm256_mul_ps(qq22,felec);
1244 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1245 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1246 velec = _mm256_sub_ps(rinv22,pmecorrV);
1247 velec = _mm256_mul_ps(qq22,velec);
1249 d = _mm256_sub_ps(r22,rswitch);
1250 d = _mm256_max_ps(d,_mm256_setzero_ps());
1251 d2 = _mm256_mul_ps(d,d);
1252 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1254 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1256 /* Evaluate switch function */
1257 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1258 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1259 velec = _mm256_mul_ps(velec,sw);
1260 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1262 /* Update potential sum for this i atom from the interaction with this j atom. */
1263 velec = _mm256_and_ps(velec,cutoff_mask);
1264 velec = _mm256_andnot_ps(dummy_mask,velec);
1265 velecsum = _mm256_add_ps(velecsum,velec);
1269 fscal = _mm256_and_ps(fscal,cutoff_mask);
1271 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1273 /* Calculate temporary vectorial force */
1274 tx = _mm256_mul_ps(fscal,dx22);
1275 ty = _mm256_mul_ps(fscal,dy22);
1276 tz = _mm256_mul_ps(fscal,dz22);
1278 /* Update vectorial force */
1279 fix2 = _mm256_add_ps(fix2,tx);
1280 fiy2 = _mm256_add_ps(fiy2,ty);
1281 fiz2 = _mm256_add_ps(fiz2,tz);
1283 fjx2 = _mm256_add_ps(fjx2,tx);
1284 fjy2 = _mm256_add_ps(fjy2,ty);
1285 fjz2 = _mm256_add_ps(fjz2,tz);
1289 /**************************
1290 * CALCULATE INTERACTIONS *
1291 **************************/
1293 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1296 r23 = _mm256_mul_ps(rsq23,rinv23);
1297 r23 = _mm256_andnot_ps(dummy_mask,r23);
1299 /* EWALD ELECTROSTATICS */
1301 /* Analytical PME correction */
1302 zeta2 = _mm256_mul_ps(beta2,rsq23);
1303 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1304 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1305 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1306 felec = _mm256_mul_ps(qq23,felec);
1307 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1308 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1309 velec = _mm256_sub_ps(rinv23,pmecorrV);
1310 velec = _mm256_mul_ps(qq23,velec);
1312 d = _mm256_sub_ps(r23,rswitch);
1313 d = _mm256_max_ps(d,_mm256_setzero_ps());
1314 d2 = _mm256_mul_ps(d,d);
1315 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1317 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1319 /* Evaluate switch function */
1320 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1321 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
1322 velec = _mm256_mul_ps(velec,sw);
1323 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1325 /* Update potential sum for this i atom from the interaction with this j atom. */
1326 velec = _mm256_and_ps(velec,cutoff_mask);
1327 velec = _mm256_andnot_ps(dummy_mask,velec);
1328 velecsum = _mm256_add_ps(velecsum,velec);
1332 fscal = _mm256_and_ps(fscal,cutoff_mask);
1334 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1336 /* Calculate temporary vectorial force */
1337 tx = _mm256_mul_ps(fscal,dx23);
1338 ty = _mm256_mul_ps(fscal,dy23);
1339 tz = _mm256_mul_ps(fscal,dz23);
1341 /* Update vectorial force */
1342 fix2 = _mm256_add_ps(fix2,tx);
1343 fiy2 = _mm256_add_ps(fiy2,ty);
1344 fiz2 = _mm256_add_ps(fiz2,tz);
1346 fjx3 = _mm256_add_ps(fjx3,tx);
1347 fjy3 = _mm256_add_ps(fjy3,ty);
1348 fjz3 = _mm256_add_ps(fjz3,tz);
1352 /**************************
1353 * CALCULATE INTERACTIONS *
1354 **************************/
1356 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1359 r31 = _mm256_mul_ps(rsq31,rinv31);
1360 r31 = _mm256_andnot_ps(dummy_mask,r31);
1362 /* EWALD ELECTROSTATICS */
1364 /* Analytical PME correction */
1365 zeta2 = _mm256_mul_ps(beta2,rsq31);
1366 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1367 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1368 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1369 felec = _mm256_mul_ps(qq31,felec);
1370 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1371 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1372 velec = _mm256_sub_ps(rinv31,pmecorrV);
1373 velec = _mm256_mul_ps(qq31,velec);
1375 d = _mm256_sub_ps(r31,rswitch);
1376 d = _mm256_max_ps(d,_mm256_setzero_ps());
1377 d2 = _mm256_mul_ps(d,d);
1378 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1380 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1382 /* Evaluate switch function */
1383 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1384 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
1385 velec = _mm256_mul_ps(velec,sw);
1386 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1388 /* Update potential sum for this i atom from the interaction with this j atom. */
1389 velec = _mm256_and_ps(velec,cutoff_mask);
1390 velec = _mm256_andnot_ps(dummy_mask,velec);
1391 velecsum = _mm256_add_ps(velecsum,velec);
1395 fscal = _mm256_and_ps(fscal,cutoff_mask);
1397 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1399 /* Calculate temporary vectorial force */
1400 tx = _mm256_mul_ps(fscal,dx31);
1401 ty = _mm256_mul_ps(fscal,dy31);
1402 tz = _mm256_mul_ps(fscal,dz31);
1404 /* Update vectorial force */
1405 fix3 = _mm256_add_ps(fix3,tx);
1406 fiy3 = _mm256_add_ps(fiy3,ty);
1407 fiz3 = _mm256_add_ps(fiz3,tz);
1409 fjx1 = _mm256_add_ps(fjx1,tx);
1410 fjy1 = _mm256_add_ps(fjy1,ty);
1411 fjz1 = _mm256_add_ps(fjz1,tz);
1415 /**************************
1416 * CALCULATE INTERACTIONS *
1417 **************************/
1419 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1422 r32 = _mm256_mul_ps(rsq32,rinv32);
1423 r32 = _mm256_andnot_ps(dummy_mask,r32);
1425 /* EWALD ELECTROSTATICS */
1427 /* Analytical PME correction */
1428 zeta2 = _mm256_mul_ps(beta2,rsq32);
1429 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1430 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1431 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1432 felec = _mm256_mul_ps(qq32,felec);
1433 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1434 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1435 velec = _mm256_sub_ps(rinv32,pmecorrV);
1436 velec = _mm256_mul_ps(qq32,velec);
1438 d = _mm256_sub_ps(r32,rswitch);
1439 d = _mm256_max_ps(d,_mm256_setzero_ps());
1440 d2 = _mm256_mul_ps(d,d);
1441 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1443 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1445 /* Evaluate switch function */
1446 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1447 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
1448 velec = _mm256_mul_ps(velec,sw);
1449 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1451 /* Update potential sum for this i atom from the interaction with this j atom. */
1452 velec = _mm256_and_ps(velec,cutoff_mask);
1453 velec = _mm256_andnot_ps(dummy_mask,velec);
1454 velecsum = _mm256_add_ps(velecsum,velec);
1458 fscal = _mm256_and_ps(fscal,cutoff_mask);
1460 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1462 /* Calculate temporary vectorial force */
1463 tx = _mm256_mul_ps(fscal,dx32);
1464 ty = _mm256_mul_ps(fscal,dy32);
1465 tz = _mm256_mul_ps(fscal,dz32);
1467 /* Update vectorial force */
1468 fix3 = _mm256_add_ps(fix3,tx);
1469 fiy3 = _mm256_add_ps(fiy3,ty);
1470 fiz3 = _mm256_add_ps(fiz3,tz);
1472 fjx2 = _mm256_add_ps(fjx2,tx);
1473 fjy2 = _mm256_add_ps(fjy2,ty);
1474 fjz2 = _mm256_add_ps(fjz2,tz);
1478 /**************************
1479 * CALCULATE INTERACTIONS *
1480 **************************/
1482 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1485 r33 = _mm256_mul_ps(rsq33,rinv33);
1486 r33 = _mm256_andnot_ps(dummy_mask,r33);
1488 /* EWALD ELECTROSTATICS */
1490 /* Analytical PME correction */
1491 zeta2 = _mm256_mul_ps(beta2,rsq33);
1492 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1493 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1494 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1495 felec = _mm256_mul_ps(qq33,felec);
1496 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1497 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1498 velec = _mm256_sub_ps(rinv33,pmecorrV);
1499 velec = _mm256_mul_ps(qq33,velec);
1501 d = _mm256_sub_ps(r33,rswitch);
1502 d = _mm256_max_ps(d,_mm256_setzero_ps());
1503 d2 = _mm256_mul_ps(d,d);
1504 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1506 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1508 /* Evaluate switch function */
1509 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1510 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
1511 velec = _mm256_mul_ps(velec,sw);
1512 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1514 /* Update potential sum for this i atom from the interaction with this j atom. */
1515 velec = _mm256_and_ps(velec,cutoff_mask);
1516 velec = _mm256_andnot_ps(dummy_mask,velec);
1517 velecsum = _mm256_add_ps(velecsum,velec);
1521 fscal = _mm256_and_ps(fscal,cutoff_mask);
1523 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1525 /* Calculate temporary vectorial force */
1526 tx = _mm256_mul_ps(fscal,dx33);
1527 ty = _mm256_mul_ps(fscal,dy33);
1528 tz = _mm256_mul_ps(fscal,dz33);
1530 /* Update vectorial force */
1531 fix3 = _mm256_add_ps(fix3,tx);
1532 fiy3 = _mm256_add_ps(fiy3,ty);
1533 fiz3 = _mm256_add_ps(fiz3,tz);
1535 fjx3 = _mm256_add_ps(fjx3,tx);
1536 fjy3 = _mm256_add_ps(fjy3,ty);
1537 fjz3 = _mm256_add_ps(fjz3,tz);
1541 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1542 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1543 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1544 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1545 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1546 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1547 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1548 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1550 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1551 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1552 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1554 /* Inner loop uses 981 flops */
1557 /* End of innermost loop */
1559 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1560 f+i_coord_offset+DIM,fshift+i_shift_offset);
1563 /* Update potential energies */
1564 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1566 /* Increment number of inner iterations */
1567 inneriter += j_index_end - j_index_start;
1569 /* Outer loop uses 19 flops */
1572 /* Increment number of outer iterations */
1575 /* Update outer/inner flops */
1577 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*981);
1580 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1581 * Electrostatics interaction: Ewald
1582 * VdW interaction: None
1583 * Geometry: Water4-Water4
1584 * Calculate force/pot: Force
1587 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1588 (t_nblist * gmx_restrict nlist,
1589 rvec * gmx_restrict xx,
1590 rvec * gmx_restrict ff,
1591 t_forcerec * gmx_restrict fr,
1592 t_mdatoms * gmx_restrict mdatoms,
1593 nb_kernel_data_t * gmx_restrict kernel_data,
1594 t_nrnb * gmx_restrict nrnb)
1596 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1597 * just 0 for non-waters.
1598 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1599 * jnr indices corresponding to data put in the four positions in the SIMD register.
1601 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1602 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1603 int jnrA,jnrB,jnrC,jnrD;
1604 int jnrE,jnrF,jnrG,jnrH;
1605 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1606 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1607 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1608 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1609 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1610 real rcutoff_scalar;
1611 real *shiftvec,*fshift,*x,*f;
1612 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1613 real scratch[4*DIM];
1614 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1615 real * vdwioffsetptr1;
1616 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1617 real * vdwioffsetptr2;
1618 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1619 real * vdwioffsetptr3;
1620 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1621 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1622 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1623 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1624 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1625 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1626 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1627 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1628 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1629 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1630 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1631 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1632 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1633 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1634 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1635 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1636 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1639 __m128i ewitab_lo,ewitab_hi;
1640 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1641 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1643 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1644 real rswitch_scalar,d_scalar;
1645 __m256 dummy_mask,cutoff_mask;
1646 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1647 __m256 one = _mm256_set1_ps(1.0);
1648 __m256 two = _mm256_set1_ps(2.0);
1654 jindex = nlist->jindex;
1656 shiftidx = nlist->shift;
1658 shiftvec = fr->shift_vec[0];
1659 fshift = fr->fshift[0];
1660 facel = _mm256_set1_ps(fr->epsfac);
1661 charge = mdatoms->chargeA;
1663 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1664 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
1665 beta2 = _mm256_mul_ps(beta,beta);
1666 beta3 = _mm256_mul_ps(beta,beta2);
1668 ewtab = fr->ic->tabq_coul_FDV0;
1669 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1670 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1672 /* Setup water-specific parameters */
1673 inr = nlist->iinr[0];
1674 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1675 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1676 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1678 jq1 = _mm256_set1_ps(charge[inr+1]);
1679 jq2 = _mm256_set1_ps(charge[inr+2]);
1680 jq3 = _mm256_set1_ps(charge[inr+3]);
1681 qq11 = _mm256_mul_ps(iq1,jq1);
1682 qq12 = _mm256_mul_ps(iq1,jq2);
1683 qq13 = _mm256_mul_ps(iq1,jq3);
1684 qq21 = _mm256_mul_ps(iq2,jq1);
1685 qq22 = _mm256_mul_ps(iq2,jq2);
1686 qq23 = _mm256_mul_ps(iq2,jq3);
1687 qq31 = _mm256_mul_ps(iq3,jq1);
1688 qq32 = _mm256_mul_ps(iq3,jq2);
1689 qq33 = _mm256_mul_ps(iq3,jq3);
1691 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1692 rcutoff_scalar = fr->rcoulomb;
1693 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1694 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1696 rswitch_scalar = fr->rcoulomb_switch;
1697 rswitch = _mm256_set1_ps(rswitch_scalar);
1698 /* Setup switch parameters */
1699 d_scalar = rcutoff_scalar-rswitch_scalar;
1700 d = _mm256_set1_ps(d_scalar);
1701 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1702 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1703 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1704 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1705 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1706 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1708 /* Avoid stupid compiler warnings */
1709 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1710 j_coord_offsetA = 0;
1711 j_coord_offsetB = 0;
1712 j_coord_offsetC = 0;
1713 j_coord_offsetD = 0;
1714 j_coord_offsetE = 0;
1715 j_coord_offsetF = 0;
1716 j_coord_offsetG = 0;
1717 j_coord_offsetH = 0;
1722 for(iidx=0;iidx<4*DIM;iidx++)
1724 scratch[iidx] = 0.0;
1727 /* Start outer loop over neighborlists */
1728 for(iidx=0; iidx<nri; iidx++)
1730 /* Load shift vector for this list */
1731 i_shift_offset = DIM*shiftidx[iidx];
1733 /* Load limits for loop over neighbors */
1734 j_index_start = jindex[iidx];
1735 j_index_end = jindex[iidx+1];
1737 /* Get outer coordinate index */
1739 i_coord_offset = DIM*inr;
1741 /* Load i particle coords and add shift vector */
1742 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1743 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1745 fix1 = _mm256_setzero_ps();
1746 fiy1 = _mm256_setzero_ps();
1747 fiz1 = _mm256_setzero_ps();
1748 fix2 = _mm256_setzero_ps();
1749 fiy2 = _mm256_setzero_ps();
1750 fiz2 = _mm256_setzero_ps();
1751 fix3 = _mm256_setzero_ps();
1752 fiy3 = _mm256_setzero_ps();
1753 fiz3 = _mm256_setzero_ps();
1755 /* Start inner kernel loop */
1756 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1759 /* Get j neighbor index, and coordinate index */
1761 jnrB = jjnr[jidx+1];
1762 jnrC = jjnr[jidx+2];
1763 jnrD = jjnr[jidx+3];
1764 jnrE = jjnr[jidx+4];
1765 jnrF = jjnr[jidx+5];
1766 jnrG = jjnr[jidx+6];
1767 jnrH = jjnr[jidx+7];
1768 j_coord_offsetA = DIM*jnrA;
1769 j_coord_offsetB = DIM*jnrB;
1770 j_coord_offsetC = DIM*jnrC;
1771 j_coord_offsetD = DIM*jnrD;
1772 j_coord_offsetE = DIM*jnrE;
1773 j_coord_offsetF = DIM*jnrF;
1774 j_coord_offsetG = DIM*jnrG;
1775 j_coord_offsetH = DIM*jnrH;
1777 /* load j atom coordinates */
1778 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1779 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1780 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1781 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1782 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1784 /* Calculate displacement vector */
1785 dx11 = _mm256_sub_ps(ix1,jx1);
1786 dy11 = _mm256_sub_ps(iy1,jy1);
1787 dz11 = _mm256_sub_ps(iz1,jz1);
1788 dx12 = _mm256_sub_ps(ix1,jx2);
1789 dy12 = _mm256_sub_ps(iy1,jy2);
1790 dz12 = _mm256_sub_ps(iz1,jz2);
1791 dx13 = _mm256_sub_ps(ix1,jx3);
1792 dy13 = _mm256_sub_ps(iy1,jy3);
1793 dz13 = _mm256_sub_ps(iz1,jz3);
1794 dx21 = _mm256_sub_ps(ix2,jx1);
1795 dy21 = _mm256_sub_ps(iy2,jy1);
1796 dz21 = _mm256_sub_ps(iz2,jz1);
1797 dx22 = _mm256_sub_ps(ix2,jx2);
1798 dy22 = _mm256_sub_ps(iy2,jy2);
1799 dz22 = _mm256_sub_ps(iz2,jz2);
1800 dx23 = _mm256_sub_ps(ix2,jx3);
1801 dy23 = _mm256_sub_ps(iy2,jy3);
1802 dz23 = _mm256_sub_ps(iz2,jz3);
1803 dx31 = _mm256_sub_ps(ix3,jx1);
1804 dy31 = _mm256_sub_ps(iy3,jy1);
1805 dz31 = _mm256_sub_ps(iz3,jz1);
1806 dx32 = _mm256_sub_ps(ix3,jx2);
1807 dy32 = _mm256_sub_ps(iy3,jy2);
1808 dz32 = _mm256_sub_ps(iz3,jz2);
1809 dx33 = _mm256_sub_ps(ix3,jx3);
1810 dy33 = _mm256_sub_ps(iy3,jy3);
1811 dz33 = _mm256_sub_ps(iz3,jz3);
1813 /* Calculate squared distance and things based on it */
1814 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1815 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1816 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1817 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1818 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1819 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1820 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1821 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1822 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1824 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1825 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1826 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1827 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1828 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1829 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1830 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1831 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1832 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1834 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1835 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1836 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1837 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1838 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1839 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1840 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1841 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1842 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1844 fjx1 = _mm256_setzero_ps();
1845 fjy1 = _mm256_setzero_ps();
1846 fjz1 = _mm256_setzero_ps();
1847 fjx2 = _mm256_setzero_ps();
1848 fjy2 = _mm256_setzero_ps();
1849 fjz2 = _mm256_setzero_ps();
1850 fjx3 = _mm256_setzero_ps();
1851 fjy3 = _mm256_setzero_ps();
1852 fjz3 = _mm256_setzero_ps();
1854 /**************************
1855 * CALCULATE INTERACTIONS *
1856 **************************/
1858 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1861 r11 = _mm256_mul_ps(rsq11,rinv11);
1863 /* EWALD ELECTROSTATICS */
1865 /* Analytical PME correction */
1866 zeta2 = _mm256_mul_ps(beta2,rsq11);
1867 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1868 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1869 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1870 felec = _mm256_mul_ps(qq11,felec);
1871 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1872 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1873 velec = _mm256_sub_ps(rinv11,pmecorrV);
1874 velec = _mm256_mul_ps(qq11,velec);
1876 d = _mm256_sub_ps(r11,rswitch);
1877 d = _mm256_max_ps(d,_mm256_setzero_ps());
1878 d2 = _mm256_mul_ps(d,d);
1879 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1881 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1883 /* Evaluate switch function */
1884 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1885 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1886 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1890 fscal = _mm256_and_ps(fscal,cutoff_mask);
1892 /* Calculate temporary vectorial force */
1893 tx = _mm256_mul_ps(fscal,dx11);
1894 ty = _mm256_mul_ps(fscal,dy11);
1895 tz = _mm256_mul_ps(fscal,dz11);
1897 /* Update vectorial force */
1898 fix1 = _mm256_add_ps(fix1,tx);
1899 fiy1 = _mm256_add_ps(fiy1,ty);
1900 fiz1 = _mm256_add_ps(fiz1,tz);
1902 fjx1 = _mm256_add_ps(fjx1,tx);
1903 fjy1 = _mm256_add_ps(fjy1,ty);
1904 fjz1 = _mm256_add_ps(fjz1,tz);
1908 /**************************
1909 * CALCULATE INTERACTIONS *
1910 **************************/
1912 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1915 r12 = _mm256_mul_ps(rsq12,rinv12);
1917 /* EWALD ELECTROSTATICS */
1919 /* Analytical PME correction */
1920 zeta2 = _mm256_mul_ps(beta2,rsq12);
1921 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1922 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1923 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1924 felec = _mm256_mul_ps(qq12,felec);
1925 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1926 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1927 velec = _mm256_sub_ps(rinv12,pmecorrV);
1928 velec = _mm256_mul_ps(qq12,velec);
1930 d = _mm256_sub_ps(r12,rswitch);
1931 d = _mm256_max_ps(d,_mm256_setzero_ps());
1932 d2 = _mm256_mul_ps(d,d);
1933 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1935 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1937 /* Evaluate switch function */
1938 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1939 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1940 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1944 fscal = _mm256_and_ps(fscal,cutoff_mask);
1946 /* Calculate temporary vectorial force */
1947 tx = _mm256_mul_ps(fscal,dx12);
1948 ty = _mm256_mul_ps(fscal,dy12);
1949 tz = _mm256_mul_ps(fscal,dz12);
1951 /* Update vectorial force */
1952 fix1 = _mm256_add_ps(fix1,tx);
1953 fiy1 = _mm256_add_ps(fiy1,ty);
1954 fiz1 = _mm256_add_ps(fiz1,tz);
1956 fjx2 = _mm256_add_ps(fjx2,tx);
1957 fjy2 = _mm256_add_ps(fjy2,ty);
1958 fjz2 = _mm256_add_ps(fjz2,tz);
1962 /**************************
1963 * CALCULATE INTERACTIONS *
1964 **************************/
1966 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1969 r13 = _mm256_mul_ps(rsq13,rinv13);
1971 /* EWALD ELECTROSTATICS */
1973 /* Analytical PME correction */
1974 zeta2 = _mm256_mul_ps(beta2,rsq13);
1975 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1976 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1977 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1978 felec = _mm256_mul_ps(qq13,felec);
1979 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1980 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1981 velec = _mm256_sub_ps(rinv13,pmecorrV);
1982 velec = _mm256_mul_ps(qq13,velec);
1984 d = _mm256_sub_ps(r13,rswitch);
1985 d = _mm256_max_ps(d,_mm256_setzero_ps());
1986 d2 = _mm256_mul_ps(d,d);
1987 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1989 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1991 /* Evaluate switch function */
1992 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1993 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
1994 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1998 fscal = _mm256_and_ps(fscal,cutoff_mask);
2000 /* Calculate temporary vectorial force */
2001 tx = _mm256_mul_ps(fscal,dx13);
2002 ty = _mm256_mul_ps(fscal,dy13);
2003 tz = _mm256_mul_ps(fscal,dz13);
2005 /* Update vectorial force */
2006 fix1 = _mm256_add_ps(fix1,tx);
2007 fiy1 = _mm256_add_ps(fiy1,ty);
2008 fiz1 = _mm256_add_ps(fiz1,tz);
2010 fjx3 = _mm256_add_ps(fjx3,tx);
2011 fjy3 = _mm256_add_ps(fjy3,ty);
2012 fjz3 = _mm256_add_ps(fjz3,tz);
2016 /**************************
2017 * CALCULATE INTERACTIONS *
2018 **************************/
2020 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2023 r21 = _mm256_mul_ps(rsq21,rinv21);
2025 /* EWALD ELECTROSTATICS */
2027 /* Analytical PME correction */
2028 zeta2 = _mm256_mul_ps(beta2,rsq21);
2029 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2030 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2031 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2032 felec = _mm256_mul_ps(qq21,felec);
2033 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2034 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2035 velec = _mm256_sub_ps(rinv21,pmecorrV);
2036 velec = _mm256_mul_ps(qq21,velec);
2038 d = _mm256_sub_ps(r21,rswitch);
2039 d = _mm256_max_ps(d,_mm256_setzero_ps());
2040 d2 = _mm256_mul_ps(d,d);
2041 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2043 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2045 /* Evaluate switch function */
2046 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2047 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2048 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2052 fscal = _mm256_and_ps(fscal,cutoff_mask);
2054 /* Calculate temporary vectorial force */
2055 tx = _mm256_mul_ps(fscal,dx21);
2056 ty = _mm256_mul_ps(fscal,dy21);
2057 tz = _mm256_mul_ps(fscal,dz21);
2059 /* Update vectorial force */
2060 fix2 = _mm256_add_ps(fix2,tx);
2061 fiy2 = _mm256_add_ps(fiy2,ty);
2062 fiz2 = _mm256_add_ps(fiz2,tz);
2064 fjx1 = _mm256_add_ps(fjx1,tx);
2065 fjy1 = _mm256_add_ps(fjy1,ty);
2066 fjz1 = _mm256_add_ps(fjz1,tz);
2070 /**************************
2071 * CALCULATE INTERACTIONS *
2072 **************************/
2074 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2077 r22 = _mm256_mul_ps(rsq22,rinv22);
2079 /* EWALD ELECTROSTATICS */
2081 /* Analytical PME correction */
2082 zeta2 = _mm256_mul_ps(beta2,rsq22);
2083 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2084 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2085 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2086 felec = _mm256_mul_ps(qq22,felec);
2087 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2088 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2089 velec = _mm256_sub_ps(rinv22,pmecorrV);
2090 velec = _mm256_mul_ps(qq22,velec);
2092 d = _mm256_sub_ps(r22,rswitch);
2093 d = _mm256_max_ps(d,_mm256_setzero_ps());
2094 d2 = _mm256_mul_ps(d,d);
2095 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2097 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2099 /* Evaluate switch function */
2100 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2101 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2102 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2106 fscal = _mm256_and_ps(fscal,cutoff_mask);
2108 /* Calculate temporary vectorial force */
2109 tx = _mm256_mul_ps(fscal,dx22);
2110 ty = _mm256_mul_ps(fscal,dy22);
2111 tz = _mm256_mul_ps(fscal,dz22);
2113 /* Update vectorial force */
2114 fix2 = _mm256_add_ps(fix2,tx);
2115 fiy2 = _mm256_add_ps(fiy2,ty);
2116 fiz2 = _mm256_add_ps(fiz2,tz);
2118 fjx2 = _mm256_add_ps(fjx2,tx);
2119 fjy2 = _mm256_add_ps(fjy2,ty);
2120 fjz2 = _mm256_add_ps(fjz2,tz);
2124 /**************************
2125 * CALCULATE INTERACTIONS *
2126 **************************/
2128 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2131 r23 = _mm256_mul_ps(rsq23,rinv23);
2133 /* EWALD ELECTROSTATICS */
2135 /* Analytical PME correction */
2136 zeta2 = _mm256_mul_ps(beta2,rsq23);
2137 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2138 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2139 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2140 felec = _mm256_mul_ps(qq23,felec);
2141 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2142 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2143 velec = _mm256_sub_ps(rinv23,pmecorrV);
2144 velec = _mm256_mul_ps(qq23,velec);
2146 d = _mm256_sub_ps(r23,rswitch);
2147 d = _mm256_max_ps(d,_mm256_setzero_ps());
2148 d2 = _mm256_mul_ps(d,d);
2149 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2151 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2153 /* Evaluate switch function */
2154 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2155 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2156 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2160 fscal = _mm256_and_ps(fscal,cutoff_mask);
2162 /* Calculate temporary vectorial force */
2163 tx = _mm256_mul_ps(fscal,dx23);
2164 ty = _mm256_mul_ps(fscal,dy23);
2165 tz = _mm256_mul_ps(fscal,dz23);
2167 /* Update vectorial force */
2168 fix2 = _mm256_add_ps(fix2,tx);
2169 fiy2 = _mm256_add_ps(fiy2,ty);
2170 fiz2 = _mm256_add_ps(fiz2,tz);
2172 fjx3 = _mm256_add_ps(fjx3,tx);
2173 fjy3 = _mm256_add_ps(fjy3,ty);
2174 fjz3 = _mm256_add_ps(fjz3,tz);
2178 /**************************
2179 * CALCULATE INTERACTIONS *
2180 **************************/
2182 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2185 r31 = _mm256_mul_ps(rsq31,rinv31);
2187 /* EWALD ELECTROSTATICS */
2189 /* Analytical PME correction */
2190 zeta2 = _mm256_mul_ps(beta2,rsq31);
2191 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2192 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2193 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2194 felec = _mm256_mul_ps(qq31,felec);
2195 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2196 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2197 velec = _mm256_sub_ps(rinv31,pmecorrV);
2198 velec = _mm256_mul_ps(qq31,velec);
2200 d = _mm256_sub_ps(r31,rswitch);
2201 d = _mm256_max_ps(d,_mm256_setzero_ps());
2202 d2 = _mm256_mul_ps(d,d);
2203 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2205 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2207 /* Evaluate switch function */
2208 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2209 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2210 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2214 fscal = _mm256_and_ps(fscal,cutoff_mask);
2216 /* Calculate temporary vectorial force */
2217 tx = _mm256_mul_ps(fscal,dx31);
2218 ty = _mm256_mul_ps(fscal,dy31);
2219 tz = _mm256_mul_ps(fscal,dz31);
2221 /* Update vectorial force */
2222 fix3 = _mm256_add_ps(fix3,tx);
2223 fiy3 = _mm256_add_ps(fiy3,ty);
2224 fiz3 = _mm256_add_ps(fiz3,tz);
2226 fjx1 = _mm256_add_ps(fjx1,tx);
2227 fjy1 = _mm256_add_ps(fjy1,ty);
2228 fjz1 = _mm256_add_ps(fjz1,tz);
2232 /**************************
2233 * CALCULATE INTERACTIONS *
2234 **************************/
2236 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2239 r32 = _mm256_mul_ps(rsq32,rinv32);
2241 /* EWALD ELECTROSTATICS */
2243 /* Analytical PME correction */
2244 zeta2 = _mm256_mul_ps(beta2,rsq32);
2245 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2246 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2247 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2248 felec = _mm256_mul_ps(qq32,felec);
2249 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2250 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2251 velec = _mm256_sub_ps(rinv32,pmecorrV);
2252 velec = _mm256_mul_ps(qq32,velec);
2254 d = _mm256_sub_ps(r32,rswitch);
2255 d = _mm256_max_ps(d,_mm256_setzero_ps());
2256 d2 = _mm256_mul_ps(d,d);
2257 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2259 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2261 /* Evaluate switch function */
2262 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2263 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2264 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2268 fscal = _mm256_and_ps(fscal,cutoff_mask);
2270 /* Calculate temporary vectorial force */
2271 tx = _mm256_mul_ps(fscal,dx32);
2272 ty = _mm256_mul_ps(fscal,dy32);
2273 tz = _mm256_mul_ps(fscal,dz32);
2275 /* Update vectorial force */
2276 fix3 = _mm256_add_ps(fix3,tx);
2277 fiy3 = _mm256_add_ps(fiy3,ty);
2278 fiz3 = _mm256_add_ps(fiz3,tz);
2280 fjx2 = _mm256_add_ps(fjx2,tx);
2281 fjy2 = _mm256_add_ps(fjy2,ty);
2282 fjz2 = _mm256_add_ps(fjz2,tz);
2286 /**************************
2287 * CALCULATE INTERACTIONS *
2288 **************************/
2290 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2293 r33 = _mm256_mul_ps(rsq33,rinv33);
2295 /* EWALD ELECTROSTATICS */
2297 /* Analytical PME correction */
2298 zeta2 = _mm256_mul_ps(beta2,rsq33);
2299 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2300 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2301 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2302 felec = _mm256_mul_ps(qq33,felec);
2303 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2304 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2305 velec = _mm256_sub_ps(rinv33,pmecorrV);
2306 velec = _mm256_mul_ps(qq33,velec);
2308 d = _mm256_sub_ps(r33,rswitch);
2309 d = _mm256_max_ps(d,_mm256_setzero_ps());
2310 d2 = _mm256_mul_ps(d,d);
2311 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2313 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2315 /* Evaluate switch function */
2316 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2317 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2318 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2322 fscal = _mm256_and_ps(fscal,cutoff_mask);
2324 /* Calculate temporary vectorial force */
2325 tx = _mm256_mul_ps(fscal,dx33);
2326 ty = _mm256_mul_ps(fscal,dy33);
2327 tz = _mm256_mul_ps(fscal,dz33);
2329 /* Update vectorial force */
2330 fix3 = _mm256_add_ps(fix3,tx);
2331 fiy3 = _mm256_add_ps(fiy3,ty);
2332 fiz3 = _mm256_add_ps(fiz3,tz);
2334 fjx3 = _mm256_add_ps(fjx3,tx);
2335 fjy3 = _mm256_add_ps(fjy3,ty);
2336 fjz3 = _mm256_add_ps(fjz3,tz);
2340 fjptrA = f+j_coord_offsetA;
2341 fjptrB = f+j_coord_offsetB;
2342 fjptrC = f+j_coord_offsetC;
2343 fjptrD = f+j_coord_offsetD;
2344 fjptrE = f+j_coord_offsetE;
2345 fjptrF = f+j_coord_offsetF;
2346 fjptrG = f+j_coord_offsetG;
2347 fjptrH = f+j_coord_offsetH;
2349 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2350 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2351 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2353 /* Inner loop uses 945 flops */
2356 if(jidx<j_index_end)
2359 /* Get j neighbor index, and coordinate index */
2360 jnrlistA = jjnr[jidx];
2361 jnrlistB = jjnr[jidx+1];
2362 jnrlistC = jjnr[jidx+2];
2363 jnrlistD = jjnr[jidx+3];
2364 jnrlistE = jjnr[jidx+4];
2365 jnrlistF = jjnr[jidx+5];
2366 jnrlistG = jjnr[jidx+6];
2367 jnrlistH = jjnr[jidx+7];
2368 /* Sign of each element will be negative for non-real atoms.
2369 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2370 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2372 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2373 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2375 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2376 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2377 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2378 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2379 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2380 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2381 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2382 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2383 j_coord_offsetA = DIM*jnrA;
2384 j_coord_offsetB = DIM*jnrB;
2385 j_coord_offsetC = DIM*jnrC;
2386 j_coord_offsetD = DIM*jnrD;
2387 j_coord_offsetE = DIM*jnrE;
2388 j_coord_offsetF = DIM*jnrF;
2389 j_coord_offsetG = DIM*jnrG;
2390 j_coord_offsetH = DIM*jnrH;
2392 /* load j atom coordinates */
2393 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2394 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2395 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
2396 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
2397 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2399 /* Calculate displacement vector */
2400 dx11 = _mm256_sub_ps(ix1,jx1);
2401 dy11 = _mm256_sub_ps(iy1,jy1);
2402 dz11 = _mm256_sub_ps(iz1,jz1);
2403 dx12 = _mm256_sub_ps(ix1,jx2);
2404 dy12 = _mm256_sub_ps(iy1,jy2);
2405 dz12 = _mm256_sub_ps(iz1,jz2);
2406 dx13 = _mm256_sub_ps(ix1,jx3);
2407 dy13 = _mm256_sub_ps(iy1,jy3);
2408 dz13 = _mm256_sub_ps(iz1,jz3);
2409 dx21 = _mm256_sub_ps(ix2,jx1);
2410 dy21 = _mm256_sub_ps(iy2,jy1);
2411 dz21 = _mm256_sub_ps(iz2,jz1);
2412 dx22 = _mm256_sub_ps(ix2,jx2);
2413 dy22 = _mm256_sub_ps(iy2,jy2);
2414 dz22 = _mm256_sub_ps(iz2,jz2);
2415 dx23 = _mm256_sub_ps(ix2,jx3);
2416 dy23 = _mm256_sub_ps(iy2,jy3);
2417 dz23 = _mm256_sub_ps(iz2,jz3);
2418 dx31 = _mm256_sub_ps(ix3,jx1);
2419 dy31 = _mm256_sub_ps(iy3,jy1);
2420 dz31 = _mm256_sub_ps(iz3,jz1);
2421 dx32 = _mm256_sub_ps(ix3,jx2);
2422 dy32 = _mm256_sub_ps(iy3,jy2);
2423 dz32 = _mm256_sub_ps(iz3,jz2);
2424 dx33 = _mm256_sub_ps(ix3,jx3);
2425 dy33 = _mm256_sub_ps(iy3,jy3);
2426 dz33 = _mm256_sub_ps(iz3,jz3);
2428 /* Calculate squared distance and things based on it */
2429 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2430 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2431 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2432 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2433 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2434 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2435 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2436 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2437 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2439 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2440 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2441 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2442 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2443 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2444 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2445 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2446 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2447 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2449 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2450 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2451 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2452 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2453 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2454 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2455 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2456 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2457 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2459 fjx1 = _mm256_setzero_ps();
2460 fjy1 = _mm256_setzero_ps();
2461 fjz1 = _mm256_setzero_ps();
2462 fjx2 = _mm256_setzero_ps();
2463 fjy2 = _mm256_setzero_ps();
2464 fjz2 = _mm256_setzero_ps();
2465 fjx3 = _mm256_setzero_ps();
2466 fjy3 = _mm256_setzero_ps();
2467 fjz3 = _mm256_setzero_ps();
2469 /**************************
2470 * CALCULATE INTERACTIONS *
2471 **************************/
2473 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2476 r11 = _mm256_mul_ps(rsq11,rinv11);
2477 r11 = _mm256_andnot_ps(dummy_mask,r11);
2479 /* EWALD ELECTROSTATICS */
2481 /* Analytical PME correction */
2482 zeta2 = _mm256_mul_ps(beta2,rsq11);
2483 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2484 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2485 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2486 felec = _mm256_mul_ps(qq11,felec);
2487 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2488 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2489 velec = _mm256_sub_ps(rinv11,pmecorrV);
2490 velec = _mm256_mul_ps(qq11,velec);
2492 d = _mm256_sub_ps(r11,rswitch);
2493 d = _mm256_max_ps(d,_mm256_setzero_ps());
2494 d2 = _mm256_mul_ps(d,d);
2495 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2497 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2499 /* Evaluate switch function */
2500 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2501 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2502 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2506 fscal = _mm256_and_ps(fscal,cutoff_mask);
2508 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2510 /* Calculate temporary vectorial force */
2511 tx = _mm256_mul_ps(fscal,dx11);
2512 ty = _mm256_mul_ps(fscal,dy11);
2513 tz = _mm256_mul_ps(fscal,dz11);
2515 /* Update vectorial force */
2516 fix1 = _mm256_add_ps(fix1,tx);
2517 fiy1 = _mm256_add_ps(fiy1,ty);
2518 fiz1 = _mm256_add_ps(fiz1,tz);
2520 fjx1 = _mm256_add_ps(fjx1,tx);
2521 fjy1 = _mm256_add_ps(fjy1,ty);
2522 fjz1 = _mm256_add_ps(fjz1,tz);
2526 /**************************
2527 * CALCULATE INTERACTIONS *
2528 **************************/
2530 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2533 r12 = _mm256_mul_ps(rsq12,rinv12);
2534 r12 = _mm256_andnot_ps(dummy_mask,r12);
2536 /* EWALD ELECTROSTATICS */
2538 /* Analytical PME correction */
2539 zeta2 = _mm256_mul_ps(beta2,rsq12);
2540 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2541 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2542 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2543 felec = _mm256_mul_ps(qq12,felec);
2544 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2545 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2546 velec = _mm256_sub_ps(rinv12,pmecorrV);
2547 velec = _mm256_mul_ps(qq12,velec);
2549 d = _mm256_sub_ps(r12,rswitch);
2550 d = _mm256_max_ps(d,_mm256_setzero_ps());
2551 d2 = _mm256_mul_ps(d,d);
2552 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2554 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2556 /* Evaluate switch function */
2557 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2558 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2559 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2563 fscal = _mm256_and_ps(fscal,cutoff_mask);
2565 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2567 /* Calculate temporary vectorial force */
2568 tx = _mm256_mul_ps(fscal,dx12);
2569 ty = _mm256_mul_ps(fscal,dy12);
2570 tz = _mm256_mul_ps(fscal,dz12);
2572 /* Update vectorial force */
2573 fix1 = _mm256_add_ps(fix1,tx);
2574 fiy1 = _mm256_add_ps(fiy1,ty);
2575 fiz1 = _mm256_add_ps(fiz1,tz);
2577 fjx2 = _mm256_add_ps(fjx2,tx);
2578 fjy2 = _mm256_add_ps(fjy2,ty);
2579 fjz2 = _mm256_add_ps(fjz2,tz);
2583 /**************************
2584 * CALCULATE INTERACTIONS *
2585 **************************/
2587 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2590 r13 = _mm256_mul_ps(rsq13,rinv13);
2591 r13 = _mm256_andnot_ps(dummy_mask,r13);
2593 /* EWALD ELECTROSTATICS */
2595 /* Analytical PME correction */
2596 zeta2 = _mm256_mul_ps(beta2,rsq13);
2597 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2598 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2599 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2600 felec = _mm256_mul_ps(qq13,felec);
2601 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2602 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2603 velec = _mm256_sub_ps(rinv13,pmecorrV);
2604 velec = _mm256_mul_ps(qq13,velec);
2606 d = _mm256_sub_ps(r13,rswitch);
2607 d = _mm256_max_ps(d,_mm256_setzero_ps());
2608 d2 = _mm256_mul_ps(d,d);
2609 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2611 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2613 /* Evaluate switch function */
2614 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2615 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2616 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2620 fscal = _mm256_and_ps(fscal,cutoff_mask);
2622 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2624 /* Calculate temporary vectorial force */
2625 tx = _mm256_mul_ps(fscal,dx13);
2626 ty = _mm256_mul_ps(fscal,dy13);
2627 tz = _mm256_mul_ps(fscal,dz13);
2629 /* Update vectorial force */
2630 fix1 = _mm256_add_ps(fix1,tx);
2631 fiy1 = _mm256_add_ps(fiy1,ty);
2632 fiz1 = _mm256_add_ps(fiz1,tz);
2634 fjx3 = _mm256_add_ps(fjx3,tx);
2635 fjy3 = _mm256_add_ps(fjy3,ty);
2636 fjz3 = _mm256_add_ps(fjz3,tz);
2640 /**************************
2641 * CALCULATE INTERACTIONS *
2642 **************************/
2644 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2647 r21 = _mm256_mul_ps(rsq21,rinv21);
2648 r21 = _mm256_andnot_ps(dummy_mask,r21);
2650 /* EWALD ELECTROSTATICS */
2652 /* Analytical PME correction */
2653 zeta2 = _mm256_mul_ps(beta2,rsq21);
2654 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2655 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2656 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2657 felec = _mm256_mul_ps(qq21,felec);
2658 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2659 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2660 velec = _mm256_sub_ps(rinv21,pmecorrV);
2661 velec = _mm256_mul_ps(qq21,velec);
2663 d = _mm256_sub_ps(r21,rswitch);
2664 d = _mm256_max_ps(d,_mm256_setzero_ps());
2665 d2 = _mm256_mul_ps(d,d);
2666 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2668 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2670 /* Evaluate switch function */
2671 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2672 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2673 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2677 fscal = _mm256_and_ps(fscal,cutoff_mask);
2679 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2681 /* Calculate temporary vectorial force */
2682 tx = _mm256_mul_ps(fscal,dx21);
2683 ty = _mm256_mul_ps(fscal,dy21);
2684 tz = _mm256_mul_ps(fscal,dz21);
2686 /* Update vectorial force */
2687 fix2 = _mm256_add_ps(fix2,tx);
2688 fiy2 = _mm256_add_ps(fiy2,ty);
2689 fiz2 = _mm256_add_ps(fiz2,tz);
2691 fjx1 = _mm256_add_ps(fjx1,tx);
2692 fjy1 = _mm256_add_ps(fjy1,ty);
2693 fjz1 = _mm256_add_ps(fjz1,tz);
2697 /**************************
2698 * CALCULATE INTERACTIONS *
2699 **************************/
2701 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2704 r22 = _mm256_mul_ps(rsq22,rinv22);
2705 r22 = _mm256_andnot_ps(dummy_mask,r22);
2707 /* EWALD ELECTROSTATICS */
2709 /* Analytical PME correction */
2710 zeta2 = _mm256_mul_ps(beta2,rsq22);
2711 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2712 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2713 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2714 felec = _mm256_mul_ps(qq22,felec);
2715 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2716 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2717 velec = _mm256_sub_ps(rinv22,pmecorrV);
2718 velec = _mm256_mul_ps(qq22,velec);
2720 d = _mm256_sub_ps(r22,rswitch);
2721 d = _mm256_max_ps(d,_mm256_setzero_ps());
2722 d2 = _mm256_mul_ps(d,d);
2723 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2725 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2727 /* Evaluate switch function */
2728 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2729 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2730 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2734 fscal = _mm256_and_ps(fscal,cutoff_mask);
2736 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2738 /* Calculate temporary vectorial force */
2739 tx = _mm256_mul_ps(fscal,dx22);
2740 ty = _mm256_mul_ps(fscal,dy22);
2741 tz = _mm256_mul_ps(fscal,dz22);
2743 /* Update vectorial force */
2744 fix2 = _mm256_add_ps(fix2,tx);
2745 fiy2 = _mm256_add_ps(fiy2,ty);
2746 fiz2 = _mm256_add_ps(fiz2,tz);
2748 fjx2 = _mm256_add_ps(fjx2,tx);
2749 fjy2 = _mm256_add_ps(fjy2,ty);
2750 fjz2 = _mm256_add_ps(fjz2,tz);
2754 /**************************
2755 * CALCULATE INTERACTIONS *
2756 **************************/
2758 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2761 r23 = _mm256_mul_ps(rsq23,rinv23);
2762 r23 = _mm256_andnot_ps(dummy_mask,r23);
2764 /* EWALD ELECTROSTATICS */
2766 /* Analytical PME correction */
2767 zeta2 = _mm256_mul_ps(beta2,rsq23);
2768 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2769 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2770 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2771 felec = _mm256_mul_ps(qq23,felec);
2772 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2773 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2774 velec = _mm256_sub_ps(rinv23,pmecorrV);
2775 velec = _mm256_mul_ps(qq23,velec);
2777 d = _mm256_sub_ps(r23,rswitch);
2778 d = _mm256_max_ps(d,_mm256_setzero_ps());
2779 d2 = _mm256_mul_ps(d,d);
2780 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2782 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2784 /* Evaluate switch function */
2785 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2786 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2787 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2791 fscal = _mm256_and_ps(fscal,cutoff_mask);
2793 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2795 /* Calculate temporary vectorial force */
2796 tx = _mm256_mul_ps(fscal,dx23);
2797 ty = _mm256_mul_ps(fscal,dy23);
2798 tz = _mm256_mul_ps(fscal,dz23);
2800 /* Update vectorial force */
2801 fix2 = _mm256_add_ps(fix2,tx);
2802 fiy2 = _mm256_add_ps(fiy2,ty);
2803 fiz2 = _mm256_add_ps(fiz2,tz);
2805 fjx3 = _mm256_add_ps(fjx3,tx);
2806 fjy3 = _mm256_add_ps(fjy3,ty);
2807 fjz3 = _mm256_add_ps(fjz3,tz);
2811 /**************************
2812 * CALCULATE INTERACTIONS *
2813 **************************/
2815 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2818 r31 = _mm256_mul_ps(rsq31,rinv31);
2819 r31 = _mm256_andnot_ps(dummy_mask,r31);
2821 /* EWALD ELECTROSTATICS */
2823 /* Analytical PME correction */
2824 zeta2 = _mm256_mul_ps(beta2,rsq31);
2825 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2826 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2827 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2828 felec = _mm256_mul_ps(qq31,felec);
2829 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2830 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2831 velec = _mm256_sub_ps(rinv31,pmecorrV);
2832 velec = _mm256_mul_ps(qq31,velec);
2834 d = _mm256_sub_ps(r31,rswitch);
2835 d = _mm256_max_ps(d,_mm256_setzero_ps());
2836 d2 = _mm256_mul_ps(d,d);
2837 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2839 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2841 /* Evaluate switch function */
2842 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2843 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2844 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2848 fscal = _mm256_and_ps(fscal,cutoff_mask);
2850 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2852 /* Calculate temporary vectorial force */
2853 tx = _mm256_mul_ps(fscal,dx31);
2854 ty = _mm256_mul_ps(fscal,dy31);
2855 tz = _mm256_mul_ps(fscal,dz31);
2857 /* Update vectorial force */
2858 fix3 = _mm256_add_ps(fix3,tx);
2859 fiy3 = _mm256_add_ps(fiy3,ty);
2860 fiz3 = _mm256_add_ps(fiz3,tz);
2862 fjx1 = _mm256_add_ps(fjx1,tx);
2863 fjy1 = _mm256_add_ps(fjy1,ty);
2864 fjz1 = _mm256_add_ps(fjz1,tz);
2868 /**************************
2869 * CALCULATE INTERACTIONS *
2870 **************************/
2872 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2875 r32 = _mm256_mul_ps(rsq32,rinv32);
2876 r32 = _mm256_andnot_ps(dummy_mask,r32);
2878 /* EWALD ELECTROSTATICS */
2880 /* Analytical PME correction */
2881 zeta2 = _mm256_mul_ps(beta2,rsq32);
2882 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2883 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2884 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2885 felec = _mm256_mul_ps(qq32,felec);
2886 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2887 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2888 velec = _mm256_sub_ps(rinv32,pmecorrV);
2889 velec = _mm256_mul_ps(qq32,velec);
2891 d = _mm256_sub_ps(r32,rswitch);
2892 d = _mm256_max_ps(d,_mm256_setzero_ps());
2893 d2 = _mm256_mul_ps(d,d);
2894 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2896 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2898 /* Evaluate switch function */
2899 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2900 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2901 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2905 fscal = _mm256_and_ps(fscal,cutoff_mask);
2907 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2909 /* Calculate temporary vectorial force */
2910 tx = _mm256_mul_ps(fscal,dx32);
2911 ty = _mm256_mul_ps(fscal,dy32);
2912 tz = _mm256_mul_ps(fscal,dz32);
2914 /* Update vectorial force */
2915 fix3 = _mm256_add_ps(fix3,tx);
2916 fiy3 = _mm256_add_ps(fiy3,ty);
2917 fiz3 = _mm256_add_ps(fiz3,tz);
2919 fjx2 = _mm256_add_ps(fjx2,tx);
2920 fjy2 = _mm256_add_ps(fjy2,ty);
2921 fjz2 = _mm256_add_ps(fjz2,tz);
2925 /**************************
2926 * CALCULATE INTERACTIONS *
2927 **************************/
2929 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2932 r33 = _mm256_mul_ps(rsq33,rinv33);
2933 r33 = _mm256_andnot_ps(dummy_mask,r33);
2935 /* EWALD ELECTROSTATICS */
2937 /* Analytical PME correction */
2938 zeta2 = _mm256_mul_ps(beta2,rsq33);
2939 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2940 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2941 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2942 felec = _mm256_mul_ps(qq33,felec);
2943 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2944 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2945 velec = _mm256_sub_ps(rinv33,pmecorrV);
2946 velec = _mm256_mul_ps(qq33,velec);
2948 d = _mm256_sub_ps(r33,rswitch);
2949 d = _mm256_max_ps(d,_mm256_setzero_ps());
2950 d2 = _mm256_mul_ps(d,d);
2951 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2953 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2955 /* Evaluate switch function */
2956 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2957 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2958 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2962 fscal = _mm256_and_ps(fscal,cutoff_mask);
2964 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2966 /* Calculate temporary vectorial force */
2967 tx = _mm256_mul_ps(fscal,dx33);
2968 ty = _mm256_mul_ps(fscal,dy33);
2969 tz = _mm256_mul_ps(fscal,dz33);
2971 /* Update vectorial force */
2972 fix3 = _mm256_add_ps(fix3,tx);
2973 fiy3 = _mm256_add_ps(fiy3,ty);
2974 fiz3 = _mm256_add_ps(fiz3,tz);
2976 fjx3 = _mm256_add_ps(fjx3,tx);
2977 fjy3 = _mm256_add_ps(fjy3,ty);
2978 fjz3 = _mm256_add_ps(fjz3,tz);
2982 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2983 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2984 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2985 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2986 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2987 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2988 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2989 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2991 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2992 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2993 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2995 /* Inner loop uses 954 flops */
2998 /* End of innermost loop */
3000 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3001 f+i_coord_offset+DIM,fshift+i_shift_offset);
3003 /* Increment number of inner iterations */
3004 inneriter += j_index_end - j_index_start;
3006 /* Outer loop uses 18 flops */
3009 /* Increment number of outer iterations */
3012 /* Update outer/inner flops */
3014 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*954);