2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 real * vdwioffsetptr3;
79 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
83 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
84 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
85 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
86 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
87 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
88 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
89 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
90 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
91 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
92 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
93 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
94 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
95 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
96 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
97 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
98 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
101 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
105 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
107 __m128i ewitab_lo,ewitab_hi;
108 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
109 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
111 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
112 real rswitch_scalar,d_scalar;
113 __m256 dummy_mask,cutoff_mask;
114 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
115 __m256 one = _mm256_set1_ps(1.0);
116 __m256 two = _mm256_set1_ps(2.0);
122 jindex = nlist->jindex;
124 shiftidx = nlist->shift;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm256_set1_ps(fr->epsfac);
129 charge = mdatoms->chargeA;
130 nvdwtype = fr->ntype;
132 vdwtype = mdatoms->typeA;
134 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
135 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
136 beta2 = _mm256_mul_ps(beta,beta);
137 beta3 = _mm256_mul_ps(beta,beta2);
139 ewtab = fr->ic->tabq_coul_FDV0;
140 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
141 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
146 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
147 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
148 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
150 jq1 = _mm256_set1_ps(charge[inr+1]);
151 jq2 = _mm256_set1_ps(charge[inr+2]);
152 jq3 = _mm256_set1_ps(charge[inr+3]);
153 vdwjidx0A = 2*vdwtype[inr+0];
154 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
155 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
156 qq11 = _mm256_mul_ps(iq1,jq1);
157 qq12 = _mm256_mul_ps(iq1,jq2);
158 qq13 = _mm256_mul_ps(iq1,jq3);
159 qq21 = _mm256_mul_ps(iq2,jq1);
160 qq22 = _mm256_mul_ps(iq2,jq2);
161 qq23 = _mm256_mul_ps(iq2,jq3);
162 qq31 = _mm256_mul_ps(iq3,jq1);
163 qq32 = _mm256_mul_ps(iq3,jq2);
164 qq33 = _mm256_mul_ps(iq3,jq3);
166 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
167 rcutoff_scalar = fr->rcoulomb;
168 rcutoff = _mm256_set1_ps(rcutoff_scalar);
169 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
171 rswitch_scalar = fr->rcoulomb_switch;
172 rswitch = _mm256_set1_ps(rswitch_scalar);
173 /* Setup switch parameters */
174 d_scalar = rcutoff_scalar-rswitch_scalar;
175 d = _mm256_set1_ps(d_scalar);
176 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
177 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
180 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
181 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183 /* Avoid stupid compiler warnings */
184 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
197 for(iidx=0;iidx<4*DIM;iidx++)
202 /* Start outer loop over neighborlists */
203 for(iidx=0; iidx<nri; iidx++)
205 /* Load shift vector for this list */
206 i_shift_offset = DIM*shiftidx[iidx];
208 /* Load limits for loop over neighbors */
209 j_index_start = jindex[iidx];
210 j_index_end = jindex[iidx+1];
212 /* Get outer coordinate index */
214 i_coord_offset = DIM*inr;
216 /* Load i particle coords and add shift vector */
217 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
218 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
220 fix0 = _mm256_setzero_ps();
221 fiy0 = _mm256_setzero_ps();
222 fiz0 = _mm256_setzero_ps();
223 fix1 = _mm256_setzero_ps();
224 fiy1 = _mm256_setzero_ps();
225 fiz1 = _mm256_setzero_ps();
226 fix2 = _mm256_setzero_ps();
227 fiy2 = _mm256_setzero_ps();
228 fiz2 = _mm256_setzero_ps();
229 fix3 = _mm256_setzero_ps();
230 fiy3 = _mm256_setzero_ps();
231 fiz3 = _mm256_setzero_ps();
233 /* Reset potential sums */
234 velecsum = _mm256_setzero_ps();
235 vvdwsum = _mm256_setzero_ps();
237 /* Start inner kernel loop */
238 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
241 /* Get j neighbor index, and coordinate index */
250 j_coord_offsetA = DIM*jnrA;
251 j_coord_offsetB = DIM*jnrB;
252 j_coord_offsetC = DIM*jnrC;
253 j_coord_offsetD = DIM*jnrD;
254 j_coord_offsetE = DIM*jnrE;
255 j_coord_offsetF = DIM*jnrF;
256 j_coord_offsetG = DIM*jnrG;
257 j_coord_offsetH = DIM*jnrH;
259 /* load j atom coordinates */
260 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
261 x+j_coord_offsetC,x+j_coord_offsetD,
262 x+j_coord_offsetE,x+j_coord_offsetF,
263 x+j_coord_offsetG,x+j_coord_offsetH,
264 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
265 &jy2,&jz2,&jx3,&jy3,&jz3);
267 /* Calculate displacement vector */
268 dx00 = _mm256_sub_ps(ix0,jx0);
269 dy00 = _mm256_sub_ps(iy0,jy0);
270 dz00 = _mm256_sub_ps(iz0,jz0);
271 dx11 = _mm256_sub_ps(ix1,jx1);
272 dy11 = _mm256_sub_ps(iy1,jy1);
273 dz11 = _mm256_sub_ps(iz1,jz1);
274 dx12 = _mm256_sub_ps(ix1,jx2);
275 dy12 = _mm256_sub_ps(iy1,jy2);
276 dz12 = _mm256_sub_ps(iz1,jz2);
277 dx13 = _mm256_sub_ps(ix1,jx3);
278 dy13 = _mm256_sub_ps(iy1,jy3);
279 dz13 = _mm256_sub_ps(iz1,jz3);
280 dx21 = _mm256_sub_ps(ix2,jx1);
281 dy21 = _mm256_sub_ps(iy2,jy1);
282 dz21 = _mm256_sub_ps(iz2,jz1);
283 dx22 = _mm256_sub_ps(ix2,jx2);
284 dy22 = _mm256_sub_ps(iy2,jy2);
285 dz22 = _mm256_sub_ps(iz2,jz2);
286 dx23 = _mm256_sub_ps(ix2,jx3);
287 dy23 = _mm256_sub_ps(iy2,jy3);
288 dz23 = _mm256_sub_ps(iz2,jz3);
289 dx31 = _mm256_sub_ps(ix3,jx1);
290 dy31 = _mm256_sub_ps(iy3,jy1);
291 dz31 = _mm256_sub_ps(iz3,jz1);
292 dx32 = _mm256_sub_ps(ix3,jx2);
293 dy32 = _mm256_sub_ps(iy3,jy2);
294 dz32 = _mm256_sub_ps(iz3,jz2);
295 dx33 = _mm256_sub_ps(ix3,jx3);
296 dy33 = _mm256_sub_ps(iy3,jy3);
297 dz33 = _mm256_sub_ps(iz3,jz3);
299 /* Calculate squared distance and things based on it */
300 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
301 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
302 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
303 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
304 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
305 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
306 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
307 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
308 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
309 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
311 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
312 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
313 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
314 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
315 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
316 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
317 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
318 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
319 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
320 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
322 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
323 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
324 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
325 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
326 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
327 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
328 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
329 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
330 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
331 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
333 fjx0 = _mm256_setzero_ps();
334 fjy0 = _mm256_setzero_ps();
335 fjz0 = _mm256_setzero_ps();
336 fjx1 = _mm256_setzero_ps();
337 fjy1 = _mm256_setzero_ps();
338 fjz1 = _mm256_setzero_ps();
339 fjx2 = _mm256_setzero_ps();
340 fjy2 = _mm256_setzero_ps();
341 fjz2 = _mm256_setzero_ps();
342 fjx3 = _mm256_setzero_ps();
343 fjy3 = _mm256_setzero_ps();
344 fjz3 = _mm256_setzero_ps();
346 /**************************
347 * CALCULATE INTERACTIONS *
348 **************************/
350 if (gmx_mm256_any_lt(rsq00,rcutoff2))
353 r00 = _mm256_mul_ps(rsq00,rinv00);
355 /* LENNARD-JONES DISPERSION/REPULSION */
357 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
358 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
359 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
360 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
361 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
363 d = _mm256_sub_ps(r00,rswitch);
364 d = _mm256_max_ps(d,_mm256_setzero_ps());
365 d2 = _mm256_mul_ps(d,d);
366 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
368 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
370 /* Evaluate switch function */
371 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
372 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
373 vvdw = _mm256_mul_ps(vvdw,sw);
374 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
378 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
382 fscal = _mm256_and_ps(fscal,cutoff_mask);
384 /* Calculate temporary vectorial force */
385 tx = _mm256_mul_ps(fscal,dx00);
386 ty = _mm256_mul_ps(fscal,dy00);
387 tz = _mm256_mul_ps(fscal,dz00);
389 /* Update vectorial force */
390 fix0 = _mm256_add_ps(fix0,tx);
391 fiy0 = _mm256_add_ps(fiy0,ty);
392 fiz0 = _mm256_add_ps(fiz0,tz);
394 fjx0 = _mm256_add_ps(fjx0,tx);
395 fjy0 = _mm256_add_ps(fjy0,ty);
396 fjz0 = _mm256_add_ps(fjz0,tz);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 if (gmx_mm256_any_lt(rsq11,rcutoff2))
407 r11 = _mm256_mul_ps(rsq11,rinv11);
409 /* EWALD ELECTROSTATICS */
411 /* Analytical PME correction */
412 zeta2 = _mm256_mul_ps(beta2,rsq11);
413 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
414 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
415 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
416 felec = _mm256_mul_ps(qq11,felec);
417 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
418 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
419 velec = _mm256_sub_ps(rinv11,pmecorrV);
420 velec = _mm256_mul_ps(qq11,velec);
422 d = _mm256_sub_ps(r11,rswitch);
423 d = _mm256_max_ps(d,_mm256_setzero_ps());
424 d2 = _mm256_mul_ps(d,d);
425 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
427 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
429 /* Evaluate switch function */
430 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
431 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
432 velec = _mm256_mul_ps(velec,sw);
433 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
435 /* Update potential sum for this i atom from the interaction with this j atom. */
436 velec = _mm256_and_ps(velec,cutoff_mask);
437 velecsum = _mm256_add_ps(velecsum,velec);
441 fscal = _mm256_and_ps(fscal,cutoff_mask);
443 /* Calculate temporary vectorial force */
444 tx = _mm256_mul_ps(fscal,dx11);
445 ty = _mm256_mul_ps(fscal,dy11);
446 tz = _mm256_mul_ps(fscal,dz11);
448 /* Update vectorial force */
449 fix1 = _mm256_add_ps(fix1,tx);
450 fiy1 = _mm256_add_ps(fiy1,ty);
451 fiz1 = _mm256_add_ps(fiz1,tz);
453 fjx1 = _mm256_add_ps(fjx1,tx);
454 fjy1 = _mm256_add_ps(fjy1,ty);
455 fjz1 = _mm256_add_ps(fjz1,tz);
459 /**************************
460 * CALCULATE INTERACTIONS *
461 **************************/
463 if (gmx_mm256_any_lt(rsq12,rcutoff2))
466 r12 = _mm256_mul_ps(rsq12,rinv12);
468 /* EWALD ELECTROSTATICS */
470 /* Analytical PME correction */
471 zeta2 = _mm256_mul_ps(beta2,rsq12);
472 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
473 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
474 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
475 felec = _mm256_mul_ps(qq12,felec);
476 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
477 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
478 velec = _mm256_sub_ps(rinv12,pmecorrV);
479 velec = _mm256_mul_ps(qq12,velec);
481 d = _mm256_sub_ps(r12,rswitch);
482 d = _mm256_max_ps(d,_mm256_setzero_ps());
483 d2 = _mm256_mul_ps(d,d);
484 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
486 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
488 /* Evaluate switch function */
489 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
490 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
491 velec = _mm256_mul_ps(velec,sw);
492 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
494 /* Update potential sum for this i atom from the interaction with this j atom. */
495 velec = _mm256_and_ps(velec,cutoff_mask);
496 velecsum = _mm256_add_ps(velecsum,velec);
500 fscal = _mm256_and_ps(fscal,cutoff_mask);
502 /* Calculate temporary vectorial force */
503 tx = _mm256_mul_ps(fscal,dx12);
504 ty = _mm256_mul_ps(fscal,dy12);
505 tz = _mm256_mul_ps(fscal,dz12);
507 /* Update vectorial force */
508 fix1 = _mm256_add_ps(fix1,tx);
509 fiy1 = _mm256_add_ps(fiy1,ty);
510 fiz1 = _mm256_add_ps(fiz1,tz);
512 fjx2 = _mm256_add_ps(fjx2,tx);
513 fjy2 = _mm256_add_ps(fjy2,ty);
514 fjz2 = _mm256_add_ps(fjz2,tz);
518 /**************************
519 * CALCULATE INTERACTIONS *
520 **************************/
522 if (gmx_mm256_any_lt(rsq13,rcutoff2))
525 r13 = _mm256_mul_ps(rsq13,rinv13);
527 /* EWALD ELECTROSTATICS */
529 /* Analytical PME correction */
530 zeta2 = _mm256_mul_ps(beta2,rsq13);
531 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
532 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
533 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
534 felec = _mm256_mul_ps(qq13,felec);
535 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
536 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
537 velec = _mm256_sub_ps(rinv13,pmecorrV);
538 velec = _mm256_mul_ps(qq13,velec);
540 d = _mm256_sub_ps(r13,rswitch);
541 d = _mm256_max_ps(d,_mm256_setzero_ps());
542 d2 = _mm256_mul_ps(d,d);
543 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
545 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
547 /* Evaluate switch function */
548 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
549 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
550 velec = _mm256_mul_ps(velec,sw);
551 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
553 /* Update potential sum for this i atom from the interaction with this j atom. */
554 velec = _mm256_and_ps(velec,cutoff_mask);
555 velecsum = _mm256_add_ps(velecsum,velec);
559 fscal = _mm256_and_ps(fscal,cutoff_mask);
561 /* Calculate temporary vectorial force */
562 tx = _mm256_mul_ps(fscal,dx13);
563 ty = _mm256_mul_ps(fscal,dy13);
564 tz = _mm256_mul_ps(fscal,dz13);
566 /* Update vectorial force */
567 fix1 = _mm256_add_ps(fix1,tx);
568 fiy1 = _mm256_add_ps(fiy1,ty);
569 fiz1 = _mm256_add_ps(fiz1,tz);
571 fjx3 = _mm256_add_ps(fjx3,tx);
572 fjy3 = _mm256_add_ps(fjy3,ty);
573 fjz3 = _mm256_add_ps(fjz3,tz);
577 /**************************
578 * CALCULATE INTERACTIONS *
579 **************************/
581 if (gmx_mm256_any_lt(rsq21,rcutoff2))
584 r21 = _mm256_mul_ps(rsq21,rinv21);
586 /* EWALD ELECTROSTATICS */
588 /* Analytical PME correction */
589 zeta2 = _mm256_mul_ps(beta2,rsq21);
590 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
591 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
592 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
593 felec = _mm256_mul_ps(qq21,felec);
594 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
595 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
596 velec = _mm256_sub_ps(rinv21,pmecorrV);
597 velec = _mm256_mul_ps(qq21,velec);
599 d = _mm256_sub_ps(r21,rswitch);
600 d = _mm256_max_ps(d,_mm256_setzero_ps());
601 d2 = _mm256_mul_ps(d,d);
602 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
604 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
606 /* Evaluate switch function */
607 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
608 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
609 velec = _mm256_mul_ps(velec,sw);
610 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
612 /* Update potential sum for this i atom from the interaction with this j atom. */
613 velec = _mm256_and_ps(velec,cutoff_mask);
614 velecsum = _mm256_add_ps(velecsum,velec);
618 fscal = _mm256_and_ps(fscal,cutoff_mask);
620 /* Calculate temporary vectorial force */
621 tx = _mm256_mul_ps(fscal,dx21);
622 ty = _mm256_mul_ps(fscal,dy21);
623 tz = _mm256_mul_ps(fscal,dz21);
625 /* Update vectorial force */
626 fix2 = _mm256_add_ps(fix2,tx);
627 fiy2 = _mm256_add_ps(fiy2,ty);
628 fiz2 = _mm256_add_ps(fiz2,tz);
630 fjx1 = _mm256_add_ps(fjx1,tx);
631 fjy1 = _mm256_add_ps(fjy1,ty);
632 fjz1 = _mm256_add_ps(fjz1,tz);
636 /**************************
637 * CALCULATE INTERACTIONS *
638 **************************/
640 if (gmx_mm256_any_lt(rsq22,rcutoff2))
643 r22 = _mm256_mul_ps(rsq22,rinv22);
645 /* EWALD ELECTROSTATICS */
647 /* Analytical PME correction */
648 zeta2 = _mm256_mul_ps(beta2,rsq22);
649 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
650 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
651 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
652 felec = _mm256_mul_ps(qq22,felec);
653 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
654 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
655 velec = _mm256_sub_ps(rinv22,pmecorrV);
656 velec = _mm256_mul_ps(qq22,velec);
658 d = _mm256_sub_ps(r22,rswitch);
659 d = _mm256_max_ps(d,_mm256_setzero_ps());
660 d2 = _mm256_mul_ps(d,d);
661 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
663 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
665 /* Evaluate switch function */
666 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
667 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
668 velec = _mm256_mul_ps(velec,sw);
669 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
671 /* Update potential sum for this i atom from the interaction with this j atom. */
672 velec = _mm256_and_ps(velec,cutoff_mask);
673 velecsum = _mm256_add_ps(velecsum,velec);
677 fscal = _mm256_and_ps(fscal,cutoff_mask);
679 /* Calculate temporary vectorial force */
680 tx = _mm256_mul_ps(fscal,dx22);
681 ty = _mm256_mul_ps(fscal,dy22);
682 tz = _mm256_mul_ps(fscal,dz22);
684 /* Update vectorial force */
685 fix2 = _mm256_add_ps(fix2,tx);
686 fiy2 = _mm256_add_ps(fiy2,ty);
687 fiz2 = _mm256_add_ps(fiz2,tz);
689 fjx2 = _mm256_add_ps(fjx2,tx);
690 fjy2 = _mm256_add_ps(fjy2,ty);
691 fjz2 = _mm256_add_ps(fjz2,tz);
695 /**************************
696 * CALCULATE INTERACTIONS *
697 **************************/
699 if (gmx_mm256_any_lt(rsq23,rcutoff2))
702 r23 = _mm256_mul_ps(rsq23,rinv23);
704 /* EWALD ELECTROSTATICS */
706 /* Analytical PME correction */
707 zeta2 = _mm256_mul_ps(beta2,rsq23);
708 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
709 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
710 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
711 felec = _mm256_mul_ps(qq23,felec);
712 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
713 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
714 velec = _mm256_sub_ps(rinv23,pmecorrV);
715 velec = _mm256_mul_ps(qq23,velec);
717 d = _mm256_sub_ps(r23,rswitch);
718 d = _mm256_max_ps(d,_mm256_setzero_ps());
719 d2 = _mm256_mul_ps(d,d);
720 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
722 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
724 /* Evaluate switch function */
725 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
726 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
727 velec = _mm256_mul_ps(velec,sw);
728 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
730 /* Update potential sum for this i atom from the interaction with this j atom. */
731 velec = _mm256_and_ps(velec,cutoff_mask);
732 velecsum = _mm256_add_ps(velecsum,velec);
736 fscal = _mm256_and_ps(fscal,cutoff_mask);
738 /* Calculate temporary vectorial force */
739 tx = _mm256_mul_ps(fscal,dx23);
740 ty = _mm256_mul_ps(fscal,dy23);
741 tz = _mm256_mul_ps(fscal,dz23);
743 /* Update vectorial force */
744 fix2 = _mm256_add_ps(fix2,tx);
745 fiy2 = _mm256_add_ps(fiy2,ty);
746 fiz2 = _mm256_add_ps(fiz2,tz);
748 fjx3 = _mm256_add_ps(fjx3,tx);
749 fjy3 = _mm256_add_ps(fjy3,ty);
750 fjz3 = _mm256_add_ps(fjz3,tz);
754 /**************************
755 * CALCULATE INTERACTIONS *
756 **************************/
758 if (gmx_mm256_any_lt(rsq31,rcutoff2))
761 r31 = _mm256_mul_ps(rsq31,rinv31);
763 /* EWALD ELECTROSTATICS */
765 /* Analytical PME correction */
766 zeta2 = _mm256_mul_ps(beta2,rsq31);
767 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
768 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
769 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
770 felec = _mm256_mul_ps(qq31,felec);
771 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
772 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
773 velec = _mm256_sub_ps(rinv31,pmecorrV);
774 velec = _mm256_mul_ps(qq31,velec);
776 d = _mm256_sub_ps(r31,rswitch);
777 d = _mm256_max_ps(d,_mm256_setzero_ps());
778 d2 = _mm256_mul_ps(d,d);
779 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
781 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
783 /* Evaluate switch function */
784 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
785 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
786 velec = _mm256_mul_ps(velec,sw);
787 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
789 /* Update potential sum for this i atom from the interaction with this j atom. */
790 velec = _mm256_and_ps(velec,cutoff_mask);
791 velecsum = _mm256_add_ps(velecsum,velec);
795 fscal = _mm256_and_ps(fscal,cutoff_mask);
797 /* Calculate temporary vectorial force */
798 tx = _mm256_mul_ps(fscal,dx31);
799 ty = _mm256_mul_ps(fscal,dy31);
800 tz = _mm256_mul_ps(fscal,dz31);
802 /* Update vectorial force */
803 fix3 = _mm256_add_ps(fix3,tx);
804 fiy3 = _mm256_add_ps(fiy3,ty);
805 fiz3 = _mm256_add_ps(fiz3,tz);
807 fjx1 = _mm256_add_ps(fjx1,tx);
808 fjy1 = _mm256_add_ps(fjy1,ty);
809 fjz1 = _mm256_add_ps(fjz1,tz);
813 /**************************
814 * CALCULATE INTERACTIONS *
815 **************************/
817 if (gmx_mm256_any_lt(rsq32,rcutoff2))
820 r32 = _mm256_mul_ps(rsq32,rinv32);
822 /* EWALD ELECTROSTATICS */
824 /* Analytical PME correction */
825 zeta2 = _mm256_mul_ps(beta2,rsq32);
826 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
827 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
828 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
829 felec = _mm256_mul_ps(qq32,felec);
830 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
831 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
832 velec = _mm256_sub_ps(rinv32,pmecorrV);
833 velec = _mm256_mul_ps(qq32,velec);
835 d = _mm256_sub_ps(r32,rswitch);
836 d = _mm256_max_ps(d,_mm256_setzero_ps());
837 d2 = _mm256_mul_ps(d,d);
838 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
840 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
842 /* Evaluate switch function */
843 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
844 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
845 velec = _mm256_mul_ps(velec,sw);
846 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm256_and_ps(velec,cutoff_mask);
850 velecsum = _mm256_add_ps(velecsum,velec);
854 fscal = _mm256_and_ps(fscal,cutoff_mask);
856 /* Calculate temporary vectorial force */
857 tx = _mm256_mul_ps(fscal,dx32);
858 ty = _mm256_mul_ps(fscal,dy32);
859 tz = _mm256_mul_ps(fscal,dz32);
861 /* Update vectorial force */
862 fix3 = _mm256_add_ps(fix3,tx);
863 fiy3 = _mm256_add_ps(fiy3,ty);
864 fiz3 = _mm256_add_ps(fiz3,tz);
866 fjx2 = _mm256_add_ps(fjx2,tx);
867 fjy2 = _mm256_add_ps(fjy2,ty);
868 fjz2 = _mm256_add_ps(fjz2,tz);
872 /**************************
873 * CALCULATE INTERACTIONS *
874 **************************/
876 if (gmx_mm256_any_lt(rsq33,rcutoff2))
879 r33 = _mm256_mul_ps(rsq33,rinv33);
881 /* EWALD ELECTROSTATICS */
883 /* Analytical PME correction */
884 zeta2 = _mm256_mul_ps(beta2,rsq33);
885 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
886 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
887 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
888 felec = _mm256_mul_ps(qq33,felec);
889 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
890 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
891 velec = _mm256_sub_ps(rinv33,pmecorrV);
892 velec = _mm256_mul_ps(qq33,velec);
894 d = _mm256_sub_ps(r33,rswitch);
895 d = _mm256_max_ps(d,_mm256_setzero_ps());
896 d2 = _mm256_mul_ps(d,d);
897 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
899 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
901 /* Evaluate switch function */
902 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
903 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
904 velec = _mm256_mul_ps(velec,sw);
905 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
907 /* Update potential sum for this i atom from the interaction with this j atom. */
908 velec = _mm256_and_ps(velec,cutoff_mask);
909 velecsum = _mm256_add_ps(velecsum,velec);
913 fscal = _mm256_and_ps(fscal,cutoff_mask);
915 /* Calculate temporary vectorial force */
916 tx = _mm256_mul_ps(fscal,dx33);
917 ty = _mm256_mul_ps(fscal,dy33);
918 tz = _mm256_mul_ps(fscal,dz33);
920 /* Update vectorial force */
921 fix3 = _mm256_add_ps(fix3,tx);
922 fiy3 = _mm256_add_ps(fiy3,ty);
923 fiz3 = _mm256_add_ps(fiz3,tz);
925 fjx3 = _mm256_add_ps(fjx3,tx);
926 fjy3 = _mm256_add_ps(fjy3,ty);
927 fjz3 = _mm256_add_ps(fjz3,tz);
931 fjptrA = f+j_coord_offsetA;
932 fjptrB = f+j_coord_offsetB;
933 fjptrC = f+j_coord_offsetC;
934 fjptrD = f+j_coord_offsetD;
935 fjptrE = f+j_coord_offsetE;
936 fjptrF = f+j_coord_offsetF;
937 fjptrG = f+j_coord_offsetG;
938 fjptrH = f+j_coord_offsetH;
940 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
941 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
942 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
944 /* Inner loop uses 1034 flops */
950 /* Get j neighbor index, and coordinate index */
951 jnrlistA = jjnr[jidx];
952 jnrlistB = jjnr[jidx+1];
953 jnrlistC = jjnr[jidx+2];
954 jnrlistD = jjnr[jidx+3];
955 jnrlistE = jjnr[jidx+4];
956 jnrlistF = jjnr[jidx+5];
957 jnrlistG = jjnr[jidx+6];
958 jnrlistH = jjnr[jidx+7];
959 /* Sign of each element will be negative for non-real atoms.
960 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
961 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
963 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
964 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
966 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
967 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
968 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
969 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
970 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
971 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
972 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
973 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
974 j_coord_offsetA = DIM*jnrA;
975 j_coord_offsetB = DIM*jnrB;
976 j_coord_offsetC = DIM*jnrC;
977 j_coord_offsetD = DIM*jnrD;
978 j_coord_offsetE = DIM*jnrE;
979 j_coord_offsetF = DIM*jnrF;
980 j_coord_offsetG = DIM*jnrG;
981 j_coord_offsetH = DIM*jnrH;
983 /* load j atom coordinates */
984 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
985 x+j_coord_offsetC,x+j_coord_offsetD,
986 x+j_coord_offsetE,x+j_coord_offsetF,
987 x+j_coord_offsetG,x+j_coord_offsetH,
988 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
989 &jy2,&jz2,&jx3,&jy3,&jz3);
991 /* Calculate displacement vector */
992 dx00 = _mm256_sub_ps(ix0,jx0);
993 dy00 = _mm256_sub_ps(iy0,jy0);
994 dz00 = _mm256_sub_ps(iz0,jz0);
995 dx11 = _mm256_sub_ps(ix1,jx1);
996 dy11 = _mm256_sub_ps(iy1,jy1);
997 dz11 = _mm256_sub_ps(iz1,jz1);
998 dx12 = _mm256_sub_ps(ix1,jx2);
999 dy12 = _mm256_sub_ps(iy1,jy2);
1000 dz12 = _mm256_sub_ps(iz1,jz2);
1001 dx13 = _mm256_sub_ps(ix1,jx3);
1002 dy13 = _mm256_sub_ps(iy1,jy3);
1003 dz13 = _mm256_sub_ps(iz1,jz3);
1004 dx21 = _mm256_sub_ps(ix2,jx1);
1005 dy21 = _mm256_sub_ps(iy2,jy1);
1006 dz21 = _mm256_sub_ps(iz2,jz1);
1007 dx22 = _mm256_sub_ps(ix2,jx2);
1008 dy22 = _mm256_sub_ps(iy2,jy2);
1009 dz22 = _mm256_sub_ps(iz2,jz2);
1010 dx23 = _mm256_sub_ps(ix2,jx3);
1011 dy23 = _mm256_sub_ps(iy2,jy3);
1012 dz23 = _mm256_sub_ps(iz2,jz3);
1013 dx31 = _mm256_sub_ps(ix3,jx1);
1014 dy31 = _mm256_sub_ps(iy3,jy1);
1015 dz31 = _mm256_sub_ps(iz3,jz1);
1016 dx32 = _mm256_sub_ps(ix3,jx2);
1017 dy32 = _mm256_sub_ps(iy3,jy2);
1018 dz32 = _mm256_sub_ps(iz3,jz2);
1019 dx33 = _mm256_sub_ps(ix3,jx3);
1020 dy33 = _mm256_sub_ps(iy3,jy3);
1021 dz33 = _mm256_sub_ps(iz3,jz3);
1023 /* Calculate squared distance and things based on it */
1024 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1025 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1026 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1027 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1028 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1029 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1030 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1031 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1032 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1033 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1035 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1036 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1037 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1038 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1039 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1040 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1041 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1042 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1043 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1044 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1046 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1047 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1048 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1049 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1050 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1051 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1052 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1053 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1054 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1055 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1057 fjx0 = _mm256_setzero_ps();
1058 fjy0 = _mm256_setzero_ps();
1059 fjz0 = _mm256_setzero_ps();
1060 fjx1 = _mm256_setzero_ps();
1061 fjy1 = _mm256_setzero_ps();
1062 fjz1 = _mm256_setzero_ps();
1063 fjx2 = _mm256_setzero_ps();
1064 fjy2 = _mm256_setzero_ps();
1065 fjz2 = _mm256_setzero_ps();
1066 fjx3 = _mm256_setzero_ps();
1067 fjy3 = _mm256_setzero_ps();
1068 fjz3 = _mm256_setzero_ps();
1070 /**************************
1071 * CALCULATE INTERACTIONS *
1072 **************************/
1074 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1077 r00 = _mm256_mul_ps(rsq00,rinv00);
1078 r00 = _mm256_andnot_ps(dummy_mask,r00);
1080 /* LENNARD-JONES DISPERSION/REPULSION */
1082 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1083 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
1084 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1085 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1086 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1088 d = _mm256_sub_ps(r00,rswitch);
1089 d = _mm256_max_ps(d,_mm256_setzero_ps());
1090 d2 = _mm256_mul_ps(d,d);
1091 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1093 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1095 /* Evaluate switch function */
1096 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1097 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1098 vvdw = _mm256_mul_ps(vvdw,sw);
1099 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1101 /* Update potential sum for this i atom from the interaction with this j atom. */
1102 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
1103 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
1104 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1108 fscal = _mm256_and_ps(fscal,cutoff_mask);
1110 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1112 /* Calculate temporary vectorial force */
1113 tx = _mm256_mul_ps(fscal,dx00);
1114 ty = _mm256_mul_ps(fscal,dy00);
1115 tz = _mm256_mul_ps(fscal,dz00);
1117 /* Update vectorial force */
1118 fix0 = _mm256_add_ps(fix0,tx);
1119 fiy0 = _mm256_add_ps(fiy0,ty);
1120 fiz0 = _mm256_add_ps(fiz0,tz);
1122 fjx0 = _mm256_add_ps(fjx0,tx);
1123 fjy0 = _mm256_add_ps(fjy0,ty);
1124 fjz0 = _mm256_add_ps(fjz0,tz);
1128 /**************************
1129 * CALCULATE INTERACTIONS *
1130 **************************/
1132 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1135 r11 = _mm256_mul_ps(rsq11,rinv11);
1136 r11 = _mm256_andnot_ps(dummy_mask,r11);
1138 /* EWALD ELECTROSTATICS */
1140 /* Analytical PME correction */
1141 zeta2 = _mm256_mul_ps(beta2,rsq11);
1142 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1143 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1144 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1145 felec = _mm256_mul_ps(qq11,felec);
1146 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1147 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1148 velec = _mm256_sub_ps(rinv11,pmecorrV);
1149 velec = _mm256_mul_ps(qq11,velec);
1151 d = _mm256_sub_ps(r11,rswitch);
1152 d = _mm256_max_ps(d,_mm256_setzero_ps());
1153 d2 = _mm256_mul_ps(d,d);
1154 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1156 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1158 /* Evaluate switch function */
1159 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1160 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1161 velec = _mm256_mul_ps(velec,sw);
1162 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1164 /* Update potential sum for this i atom from the interaction with this j atom. */
1165 velec = _mm256_and_ps(velec,cutoff_mask);
1166 velec = _mm256_andnot_ps(dummy_mask,velec);
1167 velecsum = _mm256_add_ps(velecsum,velec);
1171 fscal = _mm256_and_ps(fscal,cutoff_mask);
1173 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1175 /* Calculate temporary vectorial force */
1176 tx = _mm256_mul_ps(fscal,dx11);
1177 ty = _mm256_mul_ps(fscal,dy11);
1178 tz = _mm256_mul_ps(fscal,dz11);
1180 /* Update vectorial force */
1181 fix1 = _mm256_add_ps(fix1,tx);
1182 fiy1 = _mm256_add_ps(fiy1,ty);
1183 fiz1 = _mm256_add_ps(fiz1,tz);
1185 fjx1 = _mm256_add_ps(fjx1,tx);
1186 fjy1 = _mm256_add_ps(fjy1,ty);
1187 fjz1 = _mm256_add_ps(fjz1,tz);
1191 /**************************
1192 * CALCULATE INTERACTIONS *
1193 **************************/
1195 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1198 r12 = _mm256_mul_ps(rsq12,rinv12);
1199 r12 = _mm256_andnot_ps(dummy_mask,r12);
1201 /* EWALD ELECTROSTATICS */
1203 /* Analytical PME correction */
1204 zeta2 = _mm256_mul_ps(beta2,rsq12);
1205 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1206 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1207 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1208 felec = _mm256_mul_ps(qq12,felec);
1209 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1210 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1211 velec = _mm256_sub_ps(rinv12,pmecorrV);
1212 velec = _mm256_mul_ps(qq12,velec);
1214 d = _mm256_sub_ps(r12,rswitch);
1215 d = _mm256_max_ps(d,_mm256_setzero_ps());
1216 d2 = _mm256_mul_ps(d,d);
1217 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1219 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1221 /* Evaluate switch function */
1222 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1223 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1224 velec = _mm256_mul_ps(velec,sw);
1225 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1227 /* Update potential sum for this i atom from the interaction with this j atom. */
1228 velec = _mm256_and_ps(velec,cutoff_mask);
1229 velec = _mm256_andnot_ps(dummy_mask,velec);
1230 velecsum = _mm256_add_ps(velecsum,velec);
1234 fscal = _mm256_and_ps(fscal,cutoff_mask);
1236 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1238 /* Calculate temporary vectorial force */
1239 tx = _mm256_mul_ps(fscal,dx12);
1240 ty = _mm256_mul_ps(fscal,dy12);
1241 tz = _mm256_mul_ps(fscal,dz12);
1243 /* Update vectorial force */
1244 fix1 = _mm256_add_ps(fix1,tx);
1245 fiy1 = _mm256_add_ps(fiy1,ty);
1246 fiz1 = _mm256_add_ps(fiz1,tz);
1248 fjx2 = _mm256_add_ps(fjx2,tx);
1249 fjy2 = _mm256_add_ps(fjy2,ty);
1250 fjz2 = _mm256_add_ps(fjz2,tz);
1254 /**************************
1255 * CALCULATE INTERACTIONS *
1256 **************************/
1258 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1261 r13 = _mm256_mul_ps(rsq13,rinv13);
1262 r13 = _mm256_andnot_ps(dummy_mask,r13);
1264 /* EWALD ELECTROSTATICS */
1266 /* Analytical PME correction */
1267 zeta2 = _mm256_mul_ps(beta2,rsq13);
1268 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1269 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1270 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1271 felec = _mm256_mul_ps(qq13,felec);
1272 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1273 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1274 velec = _mm256_sub_ps(rinv13,pmecorrV);
1275 velec = _mm256_mul_ps(qq13,velec);
1277 d = _mm256_sub_ps(r13,rswitch);
1278 d = _mm256_max_ps(d,_mm256_setzero_ps());
1279 d2 = _mm256_mul_ps(d,d);
1280 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1282 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1284 /* Evaluate switch function */
1285 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1286 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
1287 velec = _mm256_mul_ps(velec,sw);
1288 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1290 /* Update potential sum for this i atom from the interaction with this j atom. */
1291 velec = _mm256_and_ps(velec,cutoff_mask);
1292 velec = _mm256_andnot_ps(dummy_mask,velec);
1293 velecsum = _mm256_add_ps(velecsum,velec);
1297 fscal = _mm256_and_ps(fscal,cutoff_mask);
1299 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1301 /* Calculate temporary vectorial force */
1302 tx = _mm256_mul_ps(fscal,dx13);
1303 ty = _mm256_mul_ps(fscal,dy13);
1304 tz = _mm256_mul_ps(fscal,dz13);
1306 /* Update vectorial force */
1307 fix1 = _mm256_add_ps(fix1,tx);
1308 fiy1 = _mm256_add_ps(fiy1,ty);
1309 fiz1 = _mm256_add_ps(fiz1,tz);
1311 fjx3 = _mm256_add_ps(fjx3,tx);
1312 fjy3 = _mm256_add_ps(fjy3,ty);
1313 fjz3 = _mm256_add_ps(fjz3,tz);
1317 /**************************
1318 * CALCULATE INTERACTIONS *
1319 **************************/
1321 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1324 r21 = _mm256_mul_ps(rsq21,rinv21);
1325 r21 = _mm256_andnot_ps(dummy_mask,r21);
1327 /* EWALD ELECTROSTATICS */
1329 /* Analytical PME correction */
1330 zeta2 = _mm256_mul_ps(beta2,rsq21);
1331 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1332 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1333 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1334 felec = _mm256_mul_ps(qq21,felec);
1335 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1336 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1337 velec = _mm256_sub_ps(rinv21,pmecorrV);
1338 velec = _mm256_mul_ps(qq21,velec);
1340 d = _mm256_sub_ps(r21,rswitch);
1341 d = _mm256_max_ps(d,_mm256_setzero_ps());
1342 d2 = _mm256_mul_ps(d,d);
1343 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1345 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1347 /* Evaluate switch function */
1348 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1349 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1350 velec = _mm256_mul_ps(velec,sw);
1351 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1353 /* Update potential sum for this i atom from the interaction with this j atom. */
1354 velec = _mm256_and_ps(velec,cutoff_mask);
1355 velec = _mm256_andnot_ps(dummy_mask,velec);
1356 velecsum = _mm256_add_ps(velecsum,velec);
1360 fscal = _mm256_and_ps(fscal,cutoff_mask);
1362 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1364 /* Calculate temporary vectorial force */
1365 tx = _mm256_mul_ps(fscal,dx21);
1366 ty = _mm256_mul_ps(fscal,dy21);
1367 tz = _mm256_mul_ps(fscal,dz21);
1369 /* Update vectorial force */
1370 fix2 = _mm256_add_ps(fix2,tx);
1371 fiy2 = _mm256_add_ps(fiy2,ty);
1372 fiz2 = _mm256_add_ps(fiz2,tz);
1374 fjx1 = _mm256_add_ps(fjx1,tx);
1375 fjy1 = _mm256_add_ps(fjy1,ty);
1376 fjz1 = _mm256_add_ps(fjz1,tz);
1380 /**************************
1381 * CALCULATE INTERACTIONS *
1382 **************************/
1384 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1387 r22 = _mm256_mul_ps(rsq22,rinv22);
1388 r22 = _mm256_andnot_ps(dummy_mask,r22);
1390 /* EWALD ELECTROSTATICS */
1392 /* Analytical PME correction */
1393 zeta2 = _mm256_mul_ps(beta2,rsq22);
1394 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1395 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1396 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1397 felec = _mm256_mul_ps(qq22,felec);
1398 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1399 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1400 velec = _mm256_sub_ps(rinv22,pmecorrV);
1401 velec = _mm256_mul_ps(qq22,velec);
1403 d = _mm256_sub_ps(r22,rswitch);
1404 d = _mm256_max_ps(d,_mm256_setzero_ps());
1405 d2 = _mm256_mul_ps(d,d);
1406 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1408 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1410 /* Evaluate switch function */
1411 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1412 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1413 velec = _mm256_mul_ps(velec,sw);
1414 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1416 /* Update potential sum for this i atom from the interaction with this j atom. */
1417 velec = _mm256_and_ps(velec,cutoff_mask);
1418 velec = _mm256_andnot_ps(dummy_mask,velec);
1419 velecsum = _mm256_add_ps(velecsum,velec);
1423 fscal = _mm256_and_ps(fscal,cutoff_mask);
1425 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1427 /* Calculate temporary vectorial force */
1428 tx = _mm256_mul_ps(fscal,dx22);
1429 ty = _mm256_mul_ps(fscal,dy22);
1430 tz = _mm256_mul_ps(fscal,dz22);
1432 /* Update vectorial force */
1433 fix2 = _mm256_add_ps(fix2,tx);
1434 fiy2 = _mm256_add_ps(fiy2,ty);
1435 fiz2 = _mm256_add_ps(fiz2,tz);
1437 fjx2 = _mm256_add_ps(fjx2,tx);
1438 fjy2 = _mm256_add_ps(fjy2,ty);
1439 fjz2 = _mm256_add_ps(fjz2,tz);
1443 /**************************
1444 * CALCULATE INTERACTIONS *
1445 **************************/
1447 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1450 r23 = _mm256_mul_ps(rsq23,rinv23);
1451 r23 = _mm256_andnot_ps(dummy_mask,r23);
1453 /* EWALD ELECTROSTATICS */
1455 /* Analytical PME correction */
1456 zeta2 = _mm256_mul_ps(beta2,rsq23);
1457 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1458 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1459 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1460 felec = _mm256_mul_ps(qq23,felec);
1461 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1462 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1463 velec = _mm256_sub_ps(rinv23,pmecorrV);
1464 velec = _mm256_mul_ps(qq23,velec);
1466 d = _mm256_sub_ps(r23,rswitch);
1467 d = _mm256_max_ps(d,_mm256_setzero_ps());
1468 d2 = _mm256_mul_ps(d,d);
1469 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1471 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1473 /* Evaluate switch function */
1474 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1475 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
1476 velec = _mm256_mul_ps(velec,sw);
1477 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1479 /* Update potential sum for this i atom from the interaction with this j atom. */
1480 velec = _mm256_and_ps(velec,cutoff_mask);
1481 velec = _mm256_andnot_ps(dummy_mask,velec);
1482 velecsum = _mm256_add_ps(velecsum,velec);
1486 fscal = _mm256_and_ps(fscal,cutoff_mask);
1488 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1490 /* Calculate temporary vectorial force */
1491 tx = _mm256_mul_ps(fscal,dx23);
1492 ty = _mm256_mul_ps(fscal,dy23);
1493 tz = _mm256_mul_ps(fscal,dz23);
1495 /* Update vectorial force */
1496 fix2 = _mm256_add_ps(fix2,tx);
1497 fiy2 = _mm256_add_ps(fiy2,ty);
1498 fiz2 = _mm256_add_ps(fiz2,tz);
1500 fjx3 = _mm256_add_ps(fjx3,tx);
1501 fjy3 = _mm256_add_ps(fjy3,ty);
1502 fjz3 = _mm256_add_ps(fjz3,tz);
1506 /**************************
1507 * CALCULATE INTERACTIONS *
1508 **************************/
1510 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1513 r31 = _mm256_mul_ps(rsq31,rinv31);
1514 r31 = _mm256_andnot_ps(dummy_mask,r31);
1516 /* EWALD ELECTROSTATICS */
1518 /* Analytical PME correction */
1519 zeta2 = _mm256_mul_ps(beta2,rsq31);
1520 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1521 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1522 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1523 felec = _mm256_mul_ps(qq31,felec);
1524 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1525 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1526 velec = _mm256_sub_ps(rinv31,pmecorrV);
1527 velec = _mm256_mul_ps(qq31,velec);
1529 d = _mm256_sub_ps(r31,rswitch);
1530 d = _mm256_max_ps(d,_mm256_setzero_ps());
1531 d2 = _mm256_mul_ps(d,d);
1532 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1534 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1536 /* Evaluate switch function */
1537 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1538 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
1539 velec = _mm256_mul_ps(velec,sw);
1540 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1542 /* Update potential sum for this i atom from the interaction with this j atom. */
1543 velec = _mm256_and_ps(velec,cutoff_mask);
1544 velec = _mm256_andnot_ps(dummy_mask,velec);
1545 velecsum = _mm256_add_ps(velecsum,velec);
1549 fscal = _mm256_and_ps(fscal,cutoff_mask);
1551 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1553 /* Calculate temporary vectorial force */
1554 tx = _mm256_mul_ps(fscal,dx31);
1555 ty = _mm256_mul_ps(fscal,dy31);
1556 tz = _mm256_mul_ps(fscal,dz31);
1558 /* Update vectorial force */
1559 fix3 = _mm256_add_ps(fix3,tx);
1560 fiy3 = _mm256_add_ps(fiy3,ty);
1561 fiz3 = _mm256_add_ps(fiz3,tz);
1563 fjx1 = _mm256_add_ps(fjx1,tx);
1564 fjy1 = _mm256_add_ps(fjy1,ty);
1565 fjz1 = _mm256_add_ps(fjz1,tz);
1569 /**************************
1570 * CALCULATE INTERACTIONS *
1571 **************************/
1573 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1576 r32 = _mm256_mul_ps(rsq32,rinv32);
1577 r32 = _mm256_andnot_ps(dummy_mask,r32);
1579 /* EWALD ELECTROSTATICS */
1581 /* Analytical PME correction */
1582 zeta2 = _mm256_mul_ps(beta2,rsq32);
1583 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1584 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1585 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1586 felec = _mm256_mul_ps(qq32,felec);
1587 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1588 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1589 velec = _mm256_sub_ps(rinv32,pmecorrV);
1590 velec = _mm256_mul_ps(qq32,velec);
1592 d = _mm256_sub_ps(r32,rswitch);
1593 d = _mm256_max_ps(d,_mm256_setzero_ps());
1594 d2 = _mm256_mul_ps(d,d);
1595 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1597 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1599 /* Evaluate switch function */
1600 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1601 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
1602 velec = _mm256_mul_ps(velec,sw);
1603 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1605 /* Update potential sum for this i atom from the interaction with this j atom. */
1606 velec = _mm256_and_ps(velec,cutoff_mask);
1607 velec = _mm256_andnot_ps(dummy_mask,velec);
1608 velecsum = _mm256_add_ps(velecsum,velec);
1612 fscal = _mm256_and_ps(fscal,cutoff_mask);
1614 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1616 /* Calculate temporary vectorial force */
1617 tx = _mm256_mul_ps(fscal,dx32);
1618 ty = _mm256_mul_ps(fscal,dy32);
1619 tz = _mm256_mul_ps(fscal,dz32);
1621 /* Update vectorial force */
1622 fix3 = _mm256_add_ps(fix3,tx);
1623 fiy3 = _mm256_add_ps(fiy3,ty);
1624 fiz3 = _mm256_add_ps(fiz3,tz);
1626 fjx2 = _mm256_add_ps(fjx2,tx);
1627 fjy2 = _mm256_add_ps(fjy2,ty);
1628 fjz2 = _mm256_add_ps(fjz2,tz);
1632 /**************************
1633 * CALCULATE INTERACTIONS *
1634 **************************/
1636 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1639 r33 = _mm256_mul_ps(rsq33,rinv33);
1640 r33 = _mm256_andnot_ps(dummy_mask,r33);
1642 /* EWALD ELECTROSTATICS */
1644 /* Analytical PME correction */
1645 zeta2 = _mm256_mul_ps(beta2,rsq33);
1646 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1647 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1648 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1649 felec = _mm256_mul_ps(qq33,felec);
1650 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1651 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1652 velec = _mm256_sub_ps(rinv33,pmecorrV);
1653 velec = _mm256_mul_ps(qq33,velec);
1655 d = _mm256_sub_ps(r33,rswitch);
1656 d = _mm256_max_ps(d,_mm256_setzero_ps());
1657 d2 = _mm256_mul_ps(d,d);
1658 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1660 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1662 /* Evaluate switch function */
1663 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1664 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
1665 velec = _mm256_mul_ps(velec,sw);
1666 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1668 /* Update potential sum for this i atom from the interaction with this j atom. */
1669 velec = _mm256_and_ps(velec,cutoff_mask);
1670 velec = _mm256_andnot_ps(dummy_mask,velec);
1671 velecsum = _mm256_add_ps(velecsum,velec);
1675 fscal = _mm256_and_ps(fscal,cutoff_mask);
1677 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1679 /* Calculate temporary vectorial force */
1680 tx = _mm256_mul_ps(fscal,dx33);
1681 ty = _mm256_mul_ps(fscal,dy33);
1682 tz = _mm256_mul_ps(fscal,dz33);
1684 /* Update vectorial force */
1685 fix3 = _mm256_add_ps(fix3,tx);
1686 fiy3 = _mm256_add_ps(fiy3,ty);
1687 fiz3 = _mm256_add_ps(fiz3,tz);
1689 fjx3 = _mm256_add_ps(fjx3,tx);
1690 fjy3 = _mm256_add_ps(fjy3,ty);
1691 fjz3 = _mm256_add_ps(fjz3,tz);
1695 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1696 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1697 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1698 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1699 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1700 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1701 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1702 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1704 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1705 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1706 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1708 /* Inner loop uses 1044 flops */
1711 /* End of innermost loop */
1713 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1714 f+i_coord_offset,fshift+i_shift_offset);
1717 /* Update potential energies */
1718 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1719 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1721 /* Increment number of inner iterations */
1722 inneriter += j_index_end - j_index_start;
1724 /* Outer loop uses 26 flops */
1727 /* Increment number of outer iterations */
1730 /* Update outer/inner flops */
1732 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*1044);
1735 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_single
1736 * Electrostatics interaction: Ewald
1737 * VdW interaction: LennardJones
1738 * Geometry: Water4-Water4
1739 * Calculate force/pot: Force
1742 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_single
1743 (t_nblist * gmx_restrict nlist,
1744 rvec * gmx_restrict xx,
1745 rvec * gmx_restrict ff,
1746 t_forcerec * gmx_restrict fr,
1747 t_mdatoms * gmx_restrict mdatoms,
1748 nb_kernel_data_t * gmx_restrict kernel_data,
1749 t_nrnb * gmx_restrict nrnb)
1751 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1752 * just 0 for non-waters.
1753 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1754 * jnr indices corresponding to data put in the four positions in the SIMD register.
1756 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1757 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1758 int jnrA,jnrB,jnrC,jnrD;
1759 int jnrE,jnrF,jnrG,jnrH;
1760 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1761 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1762 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1763 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1764 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1765 real rcutoff_scalar;
1766 real *shiftvec,*fshift,*x,*f;
1767 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1768 real scratch[4*DIM];
1769 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1770 real * vdwioffsetptr0;
1771 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1772 real * vdwioffsetptr1;
1773 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1774 real * vdwioffsetptr2;
1775 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1776 real * vdwioffsetptr3;
1777 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1778 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1779 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1780 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1781 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1782 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1783 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1784 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1785 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1786 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1787 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1788 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1789 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1790 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1791 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1792 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1793 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1794 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1795 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1796 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1799 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1802 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1803 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1805 __m128i ewitab_lo,ewitab_hi;
1806 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1807 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1809 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1810 real rswitch_scalar,d_scalar;
1811 __m256 dummy_mask,cutoff_mask;
1812 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1813 __m256 one = _mm256_set1_ps(1.0);
1814 __m256 two = _mm256_set1_ps(2.0);
1820 jindex = nlist->jindex;
1822 shiftidx = nlist->shift;
1824 shiftvec = fr->shift_vec[0];
1825 fshift = fr->fshift[0];
1826 facel = _mm256_set1_ps(fr->epsfac);
1827 charge = mdatoms->chargeA;
1828 nvdwtype = fr->ntype;
1829 vdwparam = fr->nbfp;
1830 vdwtype = mdatoms->typeA;
1832 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1833 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
1834 beta2 = _mm256_mul_ps(beta,beta);
1835 beta3 = _mm256_mul_ps(beta,beta2);
1837 ewtab = fr->ic->tabq_coul_FDV0;
1838 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1839 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1841 /* Setup water-specific parameters */
1842 inr = nlist->iinr[0];
1843 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1844 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1845 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1846 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1848 jq1 = _mm256_set1_ps(charge[inr+1]);
1849 jq2 = _mm256_set1_ps(charge[inr+2]);
1850 jq3 = _mm256_set1_ps(charge[inr+3]);
1851 vdwjidx0A = 2*vdwtype[inr+0];
1852 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1853 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1854 qq11 = _mm256_mul_ps(iq1,jq1);
1855 qq12 = _mm256_mul_ps(iq1,jq2);
1856 qq13 = _mm256_mul_ps(iq1,jq3);
1857 qq21 = _mm256_mul_ps(iq2,jq1);
1858 qq22 = _mm256_mul_ps(iq2,jq2);
1859 qq23 = _mm256_mul_ps(iq2,jq3);
1860 qq31 = _mm256_mul_ps(iq3,jq1);
1861 qq32 = _mm256_mul_ps(iq3,jq2);
1862 qq33 = _mm256_mul_ps(iq3,jq3);
1864 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1865 rcutoff_scalar = fr->rcoulomb;
1866 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1867 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1869 rswitch_scalar = fr->rcoulomb_switch;
1870 rswitch = _mm256_set1_ps(rswitch_scalar);
1871 /* Setup switch parameters */
1872 d_scalar = rcutoff_scalar-rswitch_scalar;
1873 d = _mm256_set1_ps(d_scalar);
1874 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1875 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1876 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1877 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1878 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1879 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1881 /* Avoid stupid compiler warnings */
1882 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1883 j_coord_offsetA = 0;
1884 j_coord_offsetB = 0;
1885 j_coord_offsetC = 0;
1886 j_coord_offsetD = 0;
1887 j_coord_offsetE = 0;
1888 j_coord_offsetF = 0;
1889 j_coord_offsetG = 0;
1890 j_coord_offsetH = 0;
1895 for(iidx=0;iidx<4*DIM;iidx++)
1897 scratch[iidx] = 0.0;
1900 /* Start outer loop over neighborlists */
1901 for(iidx=0; iidx<nri; iidx++)
1903 /* Load shift vector for this list */
1904 i_shift_offset = DIM*shiftidx[iidx];
1906 /* Load limits for loop over neighbors */
1907 j_index_start = jindex[iidx];
1908 j_index_end = jindex[iidx+1];
1910 /* Get outer coordinate index */
1912 i_coord_offset = DIM*inr;
1914 /* Load i particle coords and add shift vector */
1915 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1916 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1918 fix0 = _mm256_setzero_ps();
1919 fiy0 = _mm256_setzero_ps();
1920 fiz0 = _mm256_setzero_ps();
1921 fix1 = _mm256_setzero_ps();
1922 fiy1 = _mm256_setzero_ps();
1923 fiz1 = _mm256_setzero_ps();
1924 fix2 = _mm256_setzero_ps();
1925 fiy2 = _mm256_setzero_ps();
1926 fiz2 = _mm256_setzero_ps();
1927 fix3 = _mm256_setzero_ps();
1928 fiy3 = _mm256_setzero_ps();
1929 fiz3 = _mm256_setzero_ps();
1931 /* Start inner kernel loop */
1932 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1935 /* Get j neighbor index, and coordinate index */
1937 jnrB = jjnr[jidx+1];
1938 jnrC = jjnr[jidx+2];
1939 jnrD = jjnr[jidx+3];
1940 jnrE = jjnr[jidx+4];
1941 jnrF = jjnr[jidx+5];
1942 jnrG = jjnr[jidx+6];
1943 jnrH = jjnr[jidx+7];
1944 j_coord_offsetA = DIM*jnrA;
1945 j_coord_offsetB = DIM*jnrB;
1946 j_coord_offsetC = DIM*jnrC;
1947 j_coord_offsetD = DIM*jnrD;
1948 j_coord_offsetE = DIM*jnrE;
1949 j_coord_offsetF = DIM*jnrF;
1950 j_coord_offsetG = DIM*jnrG;
1951 j_coord_offsetH = DIM*jnrH;
1953 /* load j atom coordinates */
1954 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1955 x+j_coord_offsetC,x+j_coord_offsetD,
1956 x+j_coord_offsetE,x+j_coord_offsetF,
1957 x+j_coord_offsetG,x+j_coord_offsetH,
1958 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1959 &jy2,&jz2,&jx3,&jy3,&jz3);
1961 /* Calculate displacement vector */
1962 dx00 = _mm256_sub_ps(ix0,jx0);
1963 dy00 = _mm256_sub_ps(iy0,jy0);
1964 dz00 = _mm256_sub_ps(iz0,jz0);
1965 dx11 = _mm256_sub_ps(ix1,jx1);
1966 dy11 = _mm256_sub_ps(iy1,jy1);
1967 dz11 = _mm256_sub_ps(iz1,jz1);
1968 dx12 = _mm256_sub_ps(ix1,jx2);
1969 dy12 = _mm256_sub_ps(iy1,jy2);
1970 dz12 = _mm256_sub_ps(iz1,jz2);
1971 dx13 = _mm256_sub_ps(ix1,jx3);
1972 dy13 = _mm256_sub_ps(iy1,jy3);
1973 dz13 = _mm256_sub_ps(iz1,jz3);
1974 dx21 = _mm256_sub_ps(ix2,jx1);
1975 dy21 = _mm256_sub_ps(iy2,jy1);
1976 dz21 = _mm256_sub_ps(iz2,jz1);
1977 dx22 = _mm256_sub_ps(ix2,jx2);
1978 dy22 = _mm256_sub_ps(iy2,jy2);
1979 dz22 = _mm256_sub_ps(iz2,jz2);
1980 dx23 = _mm256_sub_ps(ix2,jx3);
1981 dy23 = _mm256_sub_ps(iy2,jy3);
1982 dz23 = _mm256_sub_ps(iz2,jz3);
1983 dx31 = _mm256_sub_ps(ix3,jx1);
1984 dy31 = _mm256_sub_ps(iy3,jy1);
1985 dz31 = _mm256_sub_ps(iz3,jz1);
1986 dx32 = _mm256_sub_ps(ix3,jx2);
1987 dy32 = _mm256_sub_ps(iy3,jy2);
1988 dz32 = _mm256_sub_ps(iz3,jz2);
1989 dx33 = _mm256_sub_ps(ix3,jx3);
1990 dy33 = _mm256_sub_ps(iy3,jy3);
1991 dz33 = _mm256_sub_ps(iz3,jz3);
1993 /* Calculate squared distance and things based on it */
1994 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1995 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1996 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1997 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1998 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1999 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2000 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2001 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2002 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2003 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2005 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2006 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2007 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2008 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2009 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2010 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2011 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2012 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2013 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2014 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2016 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2017 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2018 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2019 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2020 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2021 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2022 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2023 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2024 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2025 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2027 fjx0 = _mm256_setzero_ps();
2028 fjy0 = _mm256_setzero_ps();
2029 fjz0 = _mm256_setzero_ps();
2030 fjx1 = _mm256_setzero_ps();
2031 fjy1 = _mm256_setzero_ps();
2032 fjz1 = _mm256_setzero_ps();
2033 fjx2 = _mm256_setzero_ps();
2034 fjy2 = _mm256_setzero_ps();
2035 fjz2 = _mm256_setzero_ps();
2036 fjx3 = _mm256_setzero_ps();
2037 fjy3 = _mm256_setzero_ps();
2038 fjz3 = _mm256_setzero_ps();
2040 /**************************
2041 * CALCULATE INTERACTIONS *
2042 **************************/
2044 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2047 r00 = _mm256_mul_ps(rsq00,rinv00);
2049 /* LENNARD-JONES DISPERSION/REPULSION */
2051 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2052 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
2053 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
2054 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
2055 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
2057 d = _mm256_sub_ps(r00,rswitch);
2058 d = _mm256_max_ps(d,_mm256_setzero_ps());
2059 d2 = _mm256_mul_ps(d,d);
2060 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2062 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2064 /* Evaluate switch function */
2065 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2066 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
2067 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2071 fscal = _mm256_and_ps(fscal,cutoff_mask);
2073 /* Calculate temporary vectorial force */
2074 tx = _mm256_mul_ps(fscal,dx00);
2075 ty = _mm256_mul_ps(fscal,dy00);
2076 tz = _mm256_mul_ps(fscal,dz00);
2078 /* Update vectorial force */
2079 fix0 = _mm256_add_ps(fix0,tx);
2080 fiy0 = _mm256_add_ps(fiy0,ty);
2081 fiz0 = _mm256_add_ps(fiz0,tz);
2083 fjx0 = _mm256_add_ps(fjx0,tx);
2084 fjy0 = _mm256_add_ps(fjy0,ty);
2085 fjz0 = _mm256_add_ps(fjz0,tz);
2089 /**************************
2090 * CALCULATE INTERACTIONS *
2091 **************************/
2093 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2096 r11 = _mm256_mul_ps(rsq11,rinv11);
2098 /* EWALD ELECTROSTATICS */
2100 /* Analytical PME correction */
2101 zeta2 = _mm256_mul_ps(beta2,rsq11);
2102 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2103 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2104 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2105 felec = _mm256_mul_ps(qq11,felec);
2106 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2107 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2108 velec = _mm256_sub_ps(rinv11,pmecorrV);
2109 velec = _mm256_mul_ps(qq11,velec);
2111 d = _mm256_sub_ps(r11,rswitch);
2112 d = _mm256_max_ps(d,_mm256_setzero_ps());
2113 d2 = _mm256_mul_ps(d,d);
2114 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2116 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2118 /* Evaluate switch function */
2119 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2120 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2121 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2125 fscal = _mm256_and_ps(fscal,cutoff_mask);
2127 /* Calculate temporary vectorial force */
2128 tx = _mm256_mul_ps(fscal,dx11);
2129 ty = _mm256_mul_ps(fscal,dy11);
2130 tz = _mm256_mul_ps(fscal,dz11);
2132 /* Update vectorial force */
2133 fix1 = _mm256_add_ps(fix1,tx);
2134 fiy1 = _mm256_add_ps(fiy1,ty);
2135 fiz1 = _mm256_add_ps(fiz1,tz);
2137 fjx1 = _mm256_add_ps(fjx1,tx);
2138 fjy1 = _mm256_add_ps(fjy1,ty);
2139 fjz1 = _mm256_add_ps(fjz1,tz);
2143 /**************************
2144 * CALCULATE INTERACTIONS *
2145 **************************/
2147 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2150 r12 = _mm256_mul_ps(rsq12,rinv12);
2152 /* EWALD ELECTROSTATICS */
2154 /* Analytical PME correction */
2155 zeta2 = _mm256_mul_ps(beta2,rsq12);
2156 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2157 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2158 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2159 felec = _mm256_mul_ps(qq12,felec);
2160 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2161 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2162 velec = _mm256_sub_ps(rinv12,pmecorrV);
2163 velec = _mm256_mul_ps(qq12,velec);
2165 d = _mm256_sub_ps(r12,rswitch);
2166 d = _mm256_max_ps(d,_mm256_setzero_ps());
2167 d2 = _mm256_mul_ps(d,d);
2168 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2170 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2172 /* Evaluate switch function */
2173 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2174 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2175 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2179 fscal = _mm256_and_ps(fscal,cutoff_mask);
2181 /* Calculate temporary vectorial force */
2182 tx = _mm256_mul_ps(fscal,dx12);
2183 ty = _mm256_mul_ps(fscal,dy12);
2184 tz = _mm256_mul_ps(fscal,dz12);
2186 /* Update vectorial force */
2187 fix1 = _mm256_add_ps(fix1,tx);
2188 fiy1 = _mm256_add_ps(fiy1,ty);
2189 fiz1 = _mm256_add_ps(fiz1,tz);
2191 fjx2 = _mm256_add_ps(fjx2,tx);
2192 fjy2 = _mm256_add_ps(fjy2,ty);
2193 fjz2 = _mm256_add_ps(fjz2,tz);
2197 /**************************
2198 * CALCULATE INTERACTIONS *
2199 **************************/
2201 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2204 r13 = _mm256_mul_ps(rsq13,rinv13);
2206 /* EWALD ELECTROSTATICS */
2208 /* Analytical PME correction */
2209 zeta2 = _mm256_mul_ps(beta2,rsq13);
2210 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2211 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2212 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2213 felec = _mm256_mul_ps(qq13,felec);
2214 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2215 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2216 velec = _mm256_sub_ps(rinv13,pmecorrV);
2217 velec = _mm256_mul_ps(qq13,velec);
2219 d = _mm256_sub_ps(r13,rswitch);
2220 d = _mm256_max_ps(d,_mm256_setzero_ps());
2221 d2 = _mm256_mul_ps(d,d);
2222 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2224 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2226 /* Evaluate switch function */
2227 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2228 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2229 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2233 fscal = _mm256_and_ps(fscal,cutoff_mask);
2235 /* Calculate temporary vectorial force */
2236 tx = _mm256_mul_ps(fscal,dx13);
2237 ty = _mm256_mul_ps(fscal,dy13);
2238 tz = _mm256_mul_ps(fscal,dz13);
2240 /* Update vectorial force */
2241 fix1 = _mm256_add_ps(fix1,tx);
2242 fiy1 = _mm256_add_ps(fiy1,ty);
2243 fiz1 = _mm256_add_ps(fiz1,tz);
2245 fjx3 = _mm256_add_ps(fjx3,tx);
2246 fjy3 = _mm256_add_ps(fjy3,ty);
2247 fjz3 = _mm256_add_ps(fjz3,tz);
2251 /**************************
2252 * CALCULATE INTERACTIONS *
2253 **************************/
2255 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2258 r21 = _mm256_mul_ps(rsq21,rinv21);
2260 /* EWALD ELECTROSTATICS */
2262 /* Analytical PME correction */
2263 zeta2 = _mm256_mul_ps(beta2,rsq21);
2264 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2265 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2266 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2267 felec = _mm256_mul_ps(qq21,felec);
2268 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2269 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2270 velec = _mm256_sub_ps(rinv21,pmecorrV);
2271 velec = _mm256_mul_ps(qq21,velec);
2273 d = _mm256_sub_ps(r21,rswitch);
2274 d = _mm256_max_ps(d,_mm256_setzero_ps());
2275 d2 = _mm256_mul_ps(d,d);
2276 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2278 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2280 /* Evaluate switch function */
2281 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2282 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2283 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2287 fscal = _mm256_and_ps(fscal,cutoff_mask);
2289 /* Calculate temporary vectorial force */
2290 tx = _mm256_mul_ps(fscal,dx21);
2291 ty = _mm256_mul_ps(fscal,dy21);
2292 tz = _mm256_mul_ps(fscal,dz21);
2294 /* Update vectorial force */
2295 fix2 = _mm256_add_ps(fix2,tx);
2296 fiy2 = _mm256_add_ps(fiy2,ty);
2297 fiz2 = _mm256_add_ps(fiz2,tz);
2299 fjx1 = _mm256_add_ps(fjx1,tx);
2300 fjy1 = _mm256_add_ps(fjy1,ty);
2301 fjz1 = _mm256_add_ps(fjz1,tz);
2305 /**************************
2306 * CALCULATE INTERACTIONS *
2307 **************************/
2309 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2312 r22 = _mm256_mul_ps(rsq22,rinv22);
2314 /* EWALD ELECTROSTATICS */
2316 /* Analytical PME correction */
2317 zeta2 = _mm256_mul_ps(beta2,rsq22);
2318 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2319 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2320 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2321 felec = _mm256_mul_ps(qq22,felec);
2322 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2323 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2324 velec = _mm256_sub_ps(rinv22,pmecorrV);
2325 velec = _mm256_mul_ps(qq22,velec);
2327 d = _mm256_sub_ps(r22,rswitch);
2328 d = _mm256_max_ps(d,_mm256_setzero_ps());
2329 d2 = _mm256_mul_ps(d,d);
2330 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2332 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2334 /* Evaluate switch function */
2335 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2336 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2337 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2341 fscal = _mm256_and_ps(fscal,cutoff_mask);
2343 /* Calculate temporary vectorial force */
2344 tx = _mm256_mul_ps(fscal,dx22);
2345 ty = _mm256_mul_ps(fscal,dy22);
2346 tz = _mm256_mul_ps(fscal,dz22);
2348 /* Update vectorial force */
2349 fix2 = _mm256_add_ps(fix2,tx);
2350 fiy2 = _mm256_add_ps(fiy2,ty);
2351 fiz2 = _mm256_add_ps(fiz2,tz);
2353 fjx2 = _mm256_add_ps(fjx2,tx);
2354 fjy2 = _mm256_add_ps(fjy2,ty);
2355 fjz2 = _mm256_add_ps(fjz2,tz);
2359 /**************************
2360 * CALCULATE INTERACTIONS *
2361 **************************/
2363 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2366 r23 = _mm256_mul_ps(rsq23,rinv23);
2368 /* EWALD ELECTROSTATICS */
2370 /* Analytical PME correction */
2371 zeta2 = _mm256_mul_ps(beta2,rsq23);
2372 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2373 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2374 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2375 felec = _mm256_mul_ps(qq23,felec);
2376 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2377 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2378 velec = _mm256_sub_ps(rinv23,pmecorrV);
2379 velec = _mm256_mul_ps(qq23,velec);
2381 d = _mm256_sub_ps(r23,rswitch);
2382 d = _mm256_max_ps(d,_mm256_setzero_ps());
2383 d2 = _mm256_mul_ps(d,d);
2384 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2386 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2388 /* Evaluate switch function */
2389 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2390 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2391 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2395 fscal = _mm256_and_ps(fscal,cutoff_mask);
2397 /* Calculate temporary vectorial force */
2398 tx = _mm256_mul_ps(fscal,dx23);
2399 ty = _mm256_mul_ps(fscal,dy23);
2400 tz = _mm256_mul_ps(fscal,dz23);
2402 /* Update vectorial force */
2403 fix2 = _mm256_add_ps(fix2,tx);
2404 fiy2 = _mm256_add_ps(fiy2,ty);
2405 fiz2 = _mm256_add_ps(fiz2,tz);
2407 fjx3 = _mm256_add_ps(fjx3,tx);
2408 fjy3 = _mm256_add_ps(fjy3,ty);
2409 fjz3 = _mm256_add_ps(fjz3,tz);
2413 /**************************
2414 * CALCULATE INTERACTIONS *
2415 **************************/
2417 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2420 r31 = _mm256_mul_ps(rsq31,rinv31);
2422 /* EWALD ELECTROSTATICS */
2424 /* Analytical PME correction */
2425 zeta2 = _mm256_mul_ps(beta2,rsq31);
2426 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2427 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2428 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2429 felec = _mm256_mul_ps(qq31,felec);
2430 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2431 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2432 velec = _mm256_sub_ps(rinv31,pmecorrV);
2433 velec = _mm256_mul_ps(qq31,velec);
2435 d = _mm256_sub_ps(r31,rswitch);
2436 d = _mm256_max_ps(d,_mm256_setzero_ps());
2437 d2 = _mm256_mul_ps(d,d);
2438 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2440 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2442 /* Evaluate switch function */
2443 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2444 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2445 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2449 fscal = _mm256_and_ps(fscal,cutoff_mask);
2451 /* Calculate temporary vectorial force */
2452 tx = _mm256_mul_ps(fscal,dx31);
2453 ty = _mm256_mul_ps(fscal,dy31);
2454 tz = _mm256_mul_ps(fscal,dz31);
2456 /* Update vectorial force */
2457 fix3 = _mm256_add_ps(fix3,tx);
2458 fiy3 = _mm256_add_ps(fiy3,ty);
2459 fiz3 = _mm256_add_ps(fiz3,tz);
2461 fjx1 = _mm256_add_ps(fjx1,tx);
2462 fjy1 = _mm256_add_ps(fjy1,ty);
2463 fjz1 = _mm256_add_ps(fjz1,tz);
2467 /**************************
2468 * CALCULATE INTERACTIONS *
2469 **************************/
2471 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2474 r32 = _mm256_mul_ps(rsq32,rinv32);
2476 /* EWALD ELECTROSTATICS */
2478 /* Analytical PME correction */
2479 zeta2 = _mm256_mul_ps(beta2,rsq32);
2480 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2481 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2482 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2483 felec = _mm256_mul_ps(qq32,felec);
2484 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2485 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2486 velec = _mm256_sub_ps(rinv32,pmecorrV);
2487 velec = _mm256_mul_ps(qq32,velec);
2489 d = _mm256_sub_ps(r32,rswitch);
2490 d = _mm256_max_ps(d,_mm256_setzero_ps());
2491 d2 = _mm256_mul_ps(d,d);
2492 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2494 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2496 /* Evaluate switch function */
2497 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2498 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2499 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2503 fscal = _mm256_and_ps(fscal,cutoff_mask);
2505 /* Calculate temporary vectorial force */
2506 tx = _mm256_mul_ps(fscal,dx32);
2507 ty = _mm256_mul_ps(fscal,dy32);
2508 tz = _mm256_mul_ps(fscal,dz32);
2510 /* Update vectorial force */
2511 fix3 = _mm256_add_ps(fix3,tx);
2512 fiy3 = _mm256_add_ps(fiy3,ty);
2513 fiz3 = _mm256_add_ps(fiz3,tz);
2515 fjx2 = _mm256_add_ps(fjx2,tx);
2516 fjy2 = _mm256_add_ps(fjy2,ty);
2517 fjz2 = _mm256_add_ps(fjz2,tz);
2521 /**************************
2522 * CALCULATE INTERACTIONS *
2523 **************************/
2525 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2528 r33 = _mm256_mul_ps(rsq33,rinv33);
2530 /* EWALD ELECTROSTATICS */
2532 /* Analytical PME correction */
2533 zeta2 = _mm256_mul_ps(beta2,rsq33);
2534 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2535 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2536 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2537 felec = _mm256_mul_ps(qq33,felec);
2538 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2539 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2540 velec = _mm256_sub_ps(rinv33,pmecorrV);
2541 velec = _mm256_mul_ps(qq33,velec);
2543 d = _mm256_sub_ps(r33,rswitch);
2544 d = _mm256_max_ps(d,_mm256_setzero_ps());
2545 d2 = _mm256_mul_ps(d,d);
2546 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2548 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2550 /* Evaluate switch function */
2551 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2552 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2553 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2557 fscal = _mm256_and_ps(fscal,cutoff_mask);
2559 /* Calculate temporary vectorial force */
2560 tx = _mm256_mul_ps(fscal,dx33);
2561 ty = _mm256_mul_ps(fscal,dy33);
2562 tz = _mm256_mul_ps(fscal,dz33);
2564 /* Update vectorial force */
2565 fix3 = _mm256_add_ps(fix3,tx);
2566 fiy3 = _mm256_add_ps(fiy3,ty);
2567 fiz3 = _mm256_add_ps(fiz3,tz);
2569 fjx3 = _mm256_add_ps(fjx3,tx);
2570 fjy3 = _mm256_add_ps(fjy3,ty);
2571 fjz3 = _mm256_add_ps(fjz3,tz);
2575 fjptrA = f+j_coord_offsetA;
2576 fjptrB = f+j_coord_offsetB;
2577 fjptrC = f+j_coord_offsetC;
2578 fjptrD = f+j_coord_offsetD;
2579 fjptrE = f+j_coord_offsetE;
2580 fjptrF = f+j_coord_offsetF;
2581 fjptrG = f+j_coord_offsetG;
2582 fjptrH = f+j_coord_offsetH;
2584 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2585 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2586 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2588 /* Inner loop uses 1004 flops */
2591 if(jidx<j_index_end)
2594 /* Get j neighbor index, and coordinate index */
2595 jnrlistA = jjnr[jidx];
2596 jnrlistB = jjnr[jidx+1];
2597 jnrlistC = jjnr[jidx+2];
2598 jnrlistD = jjnr[jidx+3];
2599 jnrlistE = jjnr[jidx+4];
2600 jnrlistF = jjnr[jidx+5];
2601 jnrlistG = jjnr[jidx+6];
2602 jnrlistH = jjnr[jidx+7];
2603 /* Sign of each element will be negative for non-real atoms.
2604 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2605 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2607 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2608 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2610 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2611 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2612 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2613 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2614 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2615 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2616 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2617 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2618 j_coord_offsetA = DIM*jnrA;
2619 j_coord_offsetB = DIM*jnrB;
2620 j_coord_offsetC = DIM*jnrC;
2621 j_coord_offsetD = DIM*jnrD;
2622 j_coord_offsetE = DIM*jnrE;
2623 j_coord_offsetF = DIM*jnrF;
2624 j_coord_offsetG = DIM*jnrG;
2625 j_coord_offsetH = DIM*jnrH;
2627 /* load j atom coordinates */
2628 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2629 x+j_coord_offsetC,x+j_coord_offsetD,
2630 x+j_coord_offsetE,x+j_coord_offsetF,
2631 x+j_coord_offsetG,x+j_coord_offsetH,
2632 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2633 &jy2,&jz2,&jx3,&jy3,&jz3);
2635 /* Calculate displacement vector */
2636 dx00 = _mm256_sub_ps(ix0,jx0);
2637 dy00 = _mm256_sub_ps(iy0,jy0);
2638 dz00 = _mm256_sub_ps(iz0,jz0);
2639 dx11 = _mm256_sub_ps(ix1,jx1);
2640 dy11 = _mm256_sub_ps(iy1,jy1);
2641 dz11 = _mm256_sub_ps(iz1,jz1);
2642 dx12 = _mm256_sub_ps(ix1,jx2);
2643 dy12 = _mm256_sub_ps(iy1,jy2);
2644 dz12 = _mm256_sub_ps(iz1,jz2);
2645 dx13 = _mm256_sub_ps(ix1,jx3);
2646 dy13 = _mm256_sub_ps(iy1,jy3);
2647 dz13 = _mm256_sub_ps(iz1,jz3);
2648 dx21 = _mm256_sub_ps(ix2,jx1);
2649 dy21 = _mm256_sub_ps(iy2,jy1);
2650 dz21 = _mm256_sub_ps(iz2,jz1);
2651 dx22 = _mm256_sub_ps(ix2,jx2);
2652 dy22 = _mm256_sub_ps(iy2,jy2);
2653 dz22 = _mm256_sub_ps(iz2,jz2);
2654 dx23 = _mm256_sub_ps(ix2,jx3);
2655 dy23 = _mm256_sub_ps(iy2,jy3);
2656 dz23 = _mm256_sub_ps(iz2,jz3);
2657 dx31 = _mm256_sub_ps(ix3,jx1);
2658 dy31 = _mm256_sub_ps(iy3,jy1);
2659 dz31 = _mm256_sub_ps(iz3,jz1);
2660 dx32 = _mm256_sub_ps(ix3,jx2);
2661 dy32 = _mm256_sub_ps(iy3,jy2);
2662 dz32 = _mm256_sub_ps(iz3,jz2);
2663 dx33 = _mm256_sub_ps(ix3,jx3);
2664 dy33 = _mm256_sub_ps(iy3,jy3);
2665 dz33 = _mm256_sub_ps(iz3,jz3);
2667 /* Calculate squared distance and things based on it */
2668 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2669 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2670 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2671 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2672 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2673 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2674 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2675 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2676 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2677 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2679 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2680 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2681 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2682 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2683 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2684 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2685 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2686 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2687 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2688 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2690 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2691 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2692 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2693 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2694 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2695 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2696 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2697 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2698 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2699 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2701 fjx0 = _mm256_setzero_ps();
2702 fjy0 = _mm256_setzero_ps();
2703 fjz0 = _mm256_setzero_ps();
2704 fjx1 = _mm256_setzero_ps();
2705 fjy1 = _mm256_setzero_ps();
2706 fjz1 = _mm256_setzero_ps();
2707 fjx2 = _mm256_setzero_ps();
2708 fjy2 = _mm256_setzero_ps();
2709 fjz2 = _mm256_setzero_ps();
2710 fjx3 = _mm256_setzero_ps();
2711 fjy3 = _mm256_setzero_ps();
2712 fjz3 = _mm256_setzero_ps();
2714 /**************************
2715 * CALCULATE INTERACTIONS *
2716 **************************/
2718 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2721 r00 = _mm256_mul_ps(rsq00,rinv00);
2722 r00 = _mm256_andnot_ps(dummy_mask,r00);
2724 /* LENNARD-JONES DISPERSION/REPULSION */
2726 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2727 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
2728 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
2729 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
2730 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
2732 d = _mm256_sub_ps(r00,rswitch);
2733 d = _mm256_max_ps(d,_mm256_setzero_ps());
2734 d2 = _mm256_mul_ps(d,d);
2735 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2737 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2739 /* Evaluate switch function */
2740 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2741 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
2742 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2746 fscal = _mm256_and_ps(fscal,cutoff_mask);
2748 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2750 /* Calculate temporary vectorial force */
2751 tx = _mm256_mul_ps(fscal,dx00);
2752 ty = _mm256_mul_ps(fscal,dy00);
2753 tz = _mm256_mul_ps(fscal,dz00);
2755 /* Update vectorial force */
2756 fix0 = _mm256_add_ps(fix0,tx);
2757 fiy0 = _mm256_add_ps(fiy0,ty);
2758 fiz0 = _mm256_add_ps(fiz0,tz);
2760 fjx0 = _mm256_add_ps(fjx0,tx);
2761 fjy0 = _mm256_add_ps(fjy0,ty);
2762 fjz0 = _mm256_add_ps(fjz0,tz);
2766 /**************************
2767 * CALCULATE INTERACTIONS *
2768 **************************/
2770 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2773 r11 = _mm256_mul_ps(rsq11,rinv11);
2774 r11 = _mm256_andnot_ps(dummy_mask,r11);
2776 /* EWALD ELECTROSTATICS */
2778 /* Analytical PME correction */
2779 zeta2 = _mm256_mul_ps(beta2,rsq11);
2780 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2781 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2782 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2783 felec = _mm256_mul_ps(qq11,felec);
2784 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2785 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2786 velec = _mm256_sub_ps(rinv11,pmecorrV);
2787 velec = _mm256_mul_ps(qq11,velec);
2789 d = _mm256_sub_ps(r11,rswitch);
2790 d = _mm256_max_ps(d,_mm256_setzero_ps());
2791 d2 = _mm256_mul_ps(d,d);
2792 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2794 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2796 /* Evaluate switch function */
2797 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2798 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2799 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2803 fscal = _mm256_and_ps(fscal,cutoff_mask);
2805 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2807 /* Calculate temporary vectorial force */
2808 tx = _mm256_mul_ps(fscal,dx11);
2809 ty = _mm256_mul_ps(fscal,dy11);
2810 tz = _mm256_mul_ps(fscal,dz11);
2812 /* Update vectorial force */
2813 fix1 = _mm256_add_ps(fix1,tx);
2814 fiy1 = _mm256_add_ps(fiy1,ty);
2815 fiz1 = _mm256_add_ps(fiz1,tz);
2817 fjx1 = _mm256_add_ps(fjx1,tx);
2818 fjy1 = _mm256_add_ps(fjy1,ty);
2819 fjz1 = _mm256_add_ps(fjz1,tz);
2823 /**************************
2824 * CALCULATE INTERACTIONS *
2825 **************************/
2827 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2830 r12 = _mm256_mul_ps(rsq12,rinv12);
2831 r12 = _mm256_andnot_ps(dummy_mask,r12);
2833 /* EWALD ELECTROSTATICS */
2835 /* Analytical PME correction */
2836 zeta2 = _mm256_mul_ps(beta2,rsq12);
2837 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2838 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2839 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2840 felec = _mm256_mul_ps(qq12,felec);
2841 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2842 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2843 velec = _mm256_sub_ps(rinv12,pmecorrV);
2844 velec = _mm256_mul_ps(qq12,velec);
2846 d = _mm256_sub_ps(r12,rswitch);
2847 d = _mm256_max_ps(d,_mm256_setzero_ps());
2848 d2 = _mm256_mul_ps(d,d);
2849 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2851 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2853 /* Evaluate switch function */
2854 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2855 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2856 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2860 fscal = _mm256_and_ps(fscal,cutoff_mask);
2862 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2864 /* Calculate temporary vectorial force */
2865 tx = _mm256_mul_ps(fscal,dx12);
2866 ty = _mm256_mul_ps(fscal,dy12);
2867 tz = _mm256_mul_ps(fscal,dz12);
2869 /* Update vectorial force */
2870 fix1 = _mm256_add_ps(fix1,tx);
2871 fiy1 = _mm256_add_ps(fiy1,ty);
2872 fiz1 = _mm256_add_ps(fiz1,tz);
2874 fjx2 = _mm256_add_ps(fjx2,tx);
2875 fjy2 = _mm256_add_ps(fjy2,ty);
2876 fjz2 = _mm256_add_ps(fjz2,tz);
2880 /**************************
2881 * CALCULATE INTERACTIONS *
2882 **************************/
2884 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2887 r13 = _mm256_mul_ps(rsq13,rinv13);
2888 r13 = _mm256_andnot_ps(dummy_mask,r13);
2890 /* EWALD ELECTROSTATICS */
2892 /* Analytical PME correction */
2893 zeta2 = _mm256_mul_ps(beta2,rsq13);
2894 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2895 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2896 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2897 felec = _mm256_mul_ps(qq13,felec);
2898 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2899 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2900 velec = _mm256_sub_ps(rinv13,pmecorrV);
2901 velec = _mm256_mul_ps(qq13,velec);
2903 d = _mm256_sub_ps(r13,rswitch);
2904 d = _mm256_max_ps(d,_mm256_setzero_ps());
2905 d2 = _mm256_mul_ps(d,d);
2906 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2908 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2910 /* Evaluate switch function */
2911 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2912 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2913 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2917 fscal = _mm256_and_ps(fscal,cutoff_mask);
2919 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2921 /* Calculate temporary vectorial force */
2922 tx = _mm256_mul_ps(fscal,dx13);
2923 ty = _mm256_mul_ps(fscal,dy13);
2924 tz = _mm256_mul_ps(fscal,dz13);
2926 /* Update vectorial force */
2927 fix1 = _mm256_add_ps(fix1,tx);
2928 fiy1 = _mm256_add_ps(fiy1,ty);
2929 fiz1 = _mm256_add_ps(fiz1,tz);
2931 fjx3 = _mm256_add_ps(fjx3,tx);
2932 fjy3 = _mm256_add_ps(fjy3,ty);
2933 fjz3 = _mm256_add_ps(fjz3,tz);
2937 /**************************
2938 * CALCULATE INTERACTIONS *
2939 **************************/
2941 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2944 r21 = _mm256_mul_ps(rsq21,rinv21);
2945 r21 = _mm256_andnot_ps(dummy_mask,r21);
2947 /* EWALD ELECTROSTATICS */
2949 /* Analytical PME correction */
2950 zeta2 = _mm256_mul_ps(beta2,rsq21);
2951 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2952 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2953 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2954 felec = _mm256_mul_ps(qq21,felec);
2955 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2956 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2957 velec = _mm256_sub_ps(rinv21,pmecorrV);
2958 velec = _mm256_mul_ps(qq21,velec);
2960 d = _mm256_sub_ps(r21,rswitch);
2961 d = _mm256_max_ps(d,_mm256_setzero_ps());
2962 d2 = _mm256_mul_ps(d,d);
2963 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2965 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2967 /* Evaluate switch function */
2968 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2969 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2970 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2974 fscal = _mm256_and_ps(fscal,cutoff_mask);
2976 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2978 /* Calculate temporary vectorial force */
2979 tx = _mm256_mul_ps(fscal,dx21);
2980 ty = _mm256_mul_ps(fscal,dy21);
2981 tz = _mm256_mul_ps(fscal,dz21);
2983 /* Update vectorial force */
2984 fix2 = _mm256_add_ps(fix2,tx);
2985 fiy2 = _mm256_add_ps(fiy2,ty);
2986 fiz2 = _mm256_add_ps(fiz2,tz);
2988 fjx1 = _mm256_add_ps(fjx1,tx);
2989 fjy1 = _mm256_add_ps(fjy1,ty);
2990 fjz1 = _mm256_add_ps(fjz1,tz);
2994 /**************************
2995 * CALCULATE INTERACTIONS *
2996 **************************/
2998 if (gmx_mm256_any_lt(rsq22,rcutoff2))
3001 r22 = _mm256_mul_ps(rsq22,rinv22);
3002 r22 = _mm256_andnot_ps(dummy_mask,r22);
3004 /* EWALD ELECTROSTATICS */
3006 /* Analytical PME correction */
3007 zeta2 = _mm256_mul_ps(beta2,rsq22);
3008 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
3009 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3010 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3011 felec = _mm256_mul_ps(qq22,felec);
3012 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3013 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3014 velec = _mm256_sub_ps(rinv22,pmecorrV);
3015 velec = _mm256_mul_ps(qq22,velec);
3017 d = _mm256_sub_ps(r22,rswitch);
3018 d = _mm256_max_ps(d,_mm256_setzero_ps());
3019 d2 = _mm256_mul_ps(d,d);
3020 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3022 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3024 /* Evaluate switch function */
3025 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3026 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
3027 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
3031 fscal = _mm256_and_ps(fscal,cutoff_mask);
3033 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3035 /* Calculate temporary vectorial force */
3036 tx = _mm256_mul_ps(fscal,dx22);
3037 ty = _mm256_mul_ps(fscal,dy22);
3038 tz = _mm256_mul_ps(fscal,dz22);
3040 /* Update vectorial force */
3041 fix2 = _mm256_add_ps(fix2,tx);
3042 fiy2 = _mm256_add_ps(fiy2,ty);
3043 fiz2 = _mm256_add_ps(fiz2,tz);
3045 fjx2 = _mm256_add_ps(fjx2,tx);
3046 fjy2 = _mm256_add_ps(fjy2,ty);
3047 fjz2 = _mm256_add_ps(fjz2,tz);
3051 /**************************
3052 * CALCULATE INTERACTIONS *
3053 **************************/
3055 if (gmx_mm256_any_lt(rsq23,rcutoff2))
3058 r23 = _mm256_mul_ps(rsq23,rinv23);
3059 r23 = _mm256_andnot_ps(dummy_mask,r23);
3061 /* EWALD ELECTROSTATICS */
3063 /* Analytical PME correction */
3064 zeta2 = _mm256_mul_ps(beta2,rsq23);
3065 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
3066 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3067 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3068 felec = _mm256_mul_ps(qq23,felec);
3069 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3070 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3071 velec = _mm256_sub_ps(rinv23,pmecorrV);
3072 velec = _mm256_mul_ps(qq23,velec);
3074 d = _mm256_sub_ps(r23,rswitch);
3075 d = _mm256_max_ps(d,_mm256_setzero_ps());
3076 d2 = _mm256_mul_ps(d,d);
3077 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3079 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3081 /* Evaluate switch function */
3082 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3083 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
3084 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
3088 fscal = _mm256_and_ps(fscal,cutoff_mask);
3090 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3092 /* Calculate temporary vectorial force */
3093 tx = _mm256_mul_ps(fscal,dx23);
3094 ty = _mm256_mul_ps(fscal,dy23);
3095 tz = _mm256_mul_ps(fscal,dz23);
3097 /* Update vectorial force */
3098 fix2 = _mm256_add_ps(fix2,tx);
3099 fiy2 = _mm256_add_ps(fiy2,ty);
3100 fiz2 = _mm256_add_ps(fiz2,tz);
3102 fjx3 = _mm256_add_ps(fjx3,tx);
3103 fjy3 = _mm256_add_ps(fjy3,ty);
3104 fjz3 = _mm256_add_ps(fjz3,tz);
3108 /**************************
3109 * CALCULATE INTERACTIONS *
3110 **************************/
3112 if (gmx_mm256_any_lt(rsq31,rcutoff2))
3115 r31 = _mm256_mul_ps(rsq31,rinv31);
3116 r31 = _mm256_andnot_ps(dummy_mask,r31);
3118 /* EWALD ELECTROSTATICS */
3120 /* Analytical PME correction */
3121 zeta2 = _mm256_mul_ps(beta2,rsq31);
3122 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
3123 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3124 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3125 felec = _mm256_mul_ps(qq31,felec);
3126 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3127 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3128 velec = _mm256_sub_ps(rinv31,pmecorrV);
3129 velec = _mm256_mul_ps(qq31,velec);
3131 d = _mm256_sub_ps(r31,rswitch);
3132 d = _mm256_max_ps(d,_mm256_setzero_ps());
3133 d2 = _mm256_mul_ps(d,d);
3134 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3136 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3138 /* Evaluate switch function */
3139 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3140 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
3141 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
3145 fscal = _mm256_and_ps(fscal,cutoff_mask);
3147 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3149 /* Calculate temporary vectorial force */
3150 tx = _mm256_mul_ps(fscal,dx31);
3151 ty = _mm256_mul_ps(fscal,dy31);
3152 tz = _mm256_mul_ps(fscal,dz31);
3154 /* Update vectorial force */
3155 fix3 = _mm256_add_ps(fix3,tx);
3156 fiy3 = _mm256_add_ps(fiy3,ty);
3157 fiz3 = _mm256_add_ps(fiz3,tz);
3159 fjx1 = _mm256_add_ps(fjx1,tx);
3160 fjy1 = _mm256_add_ps(fjy1,ty);
3161 fjz1 = _mm256_add_ps(fjz1,tz);
3165 /**************************
3166 * CALCULATE INTERACTIONS *
3167 **************************/
3169 if (gmx_mm256_any_lt(rsq32,rcutoff2))
3172 r32 = _mm256_mul_ps(rsq32,rinv32);
3173 r32 = _mm256_andnot_ps(dummy_mask,r32);
3175 /* EWALD ELECTROSTATICS */
3177 /* Analytical PME correction */
3178 zeta2 = _mm256_mul_ps(beta2,rsq32);
3179 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
3180 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3181 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3182 felec = _mm256_mul_ps(qq32,felec);
3183 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3184 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3185 velec = _mm256_sub_ps(rinv32,pmecorrV);
3186 velec = _mm256_mul_ps(qq32,velec);
3188 d = _mm256_sub_ps(r32,rswitch);
3189 d = _mm256_max_ps(d,_mm256_setzero_ps());
3190 d2 = _mm256_mul_ps(d,d);
3191 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3193 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3195 /* Evaluate switch function */
3196 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3197 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
3198 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
3202 fscal = _mm256_and_ps(fscal,cutoff_mask);
3204 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3206 /* Calculate temporary vectorial force */
3207 tx = _mm256_mul_ps(fscal,dx32);
3208 ty = _mm256_mul_ps(fscal,dy32);
3209 tz = _mm256_mul_ps(fscal,dz32);
3211 /* Update vectorial force */
3212 fix3 = _mm256_add_ps(fix3,tx);
3213 fiy3 = _mm256_add_ps(fiy3,ty);
3214 fiz3 = _mm256_add_ps(fiz3,tz);
3216 fjx2 = _mm256_add_ps(fjx2,tx);
3217 fjy2 = _mm256_add_ps(fjy2,ty);
3218 fjz2 = _mm256_add_ps(fjz2,tz);
3222 /**************************
3223 * CALCULATE INTERACTIONS *
3224 **************************/
3226 if (gmx_mm256_any_lt(rsq33,rcutoff2))
3229 r33 = _mm256_mul_ps(rsq33,rinv33);
3230 r33 = _mm256_andnot_ps(dummy_mask,r33);
3232 /* EWALD ELECTROSTATICS */
3234 /* Analytical PME correction */
3235 zeta2 = _mm256_mul_ps(beta2,rsq33);
3236 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
3237 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3238 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3239 felec = _mm256_mul_ps(qq33,felec);
3240 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3241 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3242 velec = _mm256_sub_ps(rinv33,pmecorrV);
3243 velec = _mm256_mul_ps(qq33,velec);
3245 d = _mm256_sub_ps(r33,rswitch);
3246 d = _mm256_max_ps(d,_mm256_setzero_ps());
3247 d2 = _mm256_mul_ps(d,d);
3248 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3250 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3252 /* Evaluate switch function */
3253 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3254 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
3255 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
3259 fscal = _mm256_and_ps(fscal,cutoff_mask);
3261 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3263 /* Calculate temporary vectorial force */
3264 tx = _mm256_mul_ps(fscal,dx33);
3265 ty = _mm256_mul_ps(fscal,dy33);
3266 tz = _mm256_mul_ps(fscal,dz33);
3268 /* Update vectorial force */
3269 fix3 = _mm256_add_ps(fix3,tx);
3270 fiy3 = _mm256_add_ps(fiy3,ty);
3271 fiz3 = _mm256_add_ps(fiz3,tz);
3273 fjx3 = _mm256_add_ps(fjx3,tx);
3274 fjy3 = _mm256_add_ps(fjy3,ty);
3275 fjz3 = _mm256_add_ps(fjz3,tz);
3279 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3280 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3281 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3282 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3283 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3284 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3285 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3286 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3288 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
3289 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3290 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3292 /* Inner loop uses 1014 flops */
3295 /* End of innermost loop */
3297 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3298 f+i_coord_offset,fshift+i_shift_offset);
3300 /* Increment number of inner iterations */
3301 inneriter += j_index_end - j_index_start;
3303 /* Outer loop uses 24 flops */
3306 /* Increment number of outer iterations */
3309 /* Update outer/inner flops */
3311 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*1014);