2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 real * vdwioffsetptr3;
79 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
83 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
84 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
85 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
86 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
87 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
88 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
89 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
90 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
91 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
92 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
93 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
94 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
95 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
96 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
97 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
98 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
101 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
105 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
107 __m128i ewitab_lo,ewitab_hi;
108 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
109 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
111 __m256 dummy_mask,cutoff_mask;
112 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
113 __m256 one = _mm256_set1_ps(1.0);
114 __m256 two = _mm256_set1_ps(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm256_set1_ps(fr->epsfac);
127 charge = mdatoms->chargeA;
128 nvdwtype = fr->ntype;
130 vdwtype = mdatoms->typeA;
132 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
133 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
134 beta2 = _mm256_mul_ps(beta,beta);
135 beta3 = _mm256_mul_ps(beta,beta2);
137 ewtab = fr->ic->tabq_coul_FDV0;
138 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
139 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
144 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
145 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
146 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
148 jq1 = _mm256_set1_ps(charge[inr+1]);
149 jq2 = _mm256_set1_ps(charge[inr+2]);
150 jq3 = _mm256_set1_ps(charge[inr+3]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
153 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
154 qq11 = _mm256_mul_ps(iq1,jq1);
155 qq12 = _mm256_mul_ps(iq1,jq2);
156 qq13 = _mm256_mul_ps(iq1,jq3);
157 qq21 = _mm256_mul_ps(iq2,jq1);
158 qq22 = _mm256_mul_ps(iq2,jq2);
159 qq23 = _mm256_mul_ps(iq2,jq3);
160 qq31 = _mm256_mul_ps(iq3,jq1);
161 qq32 = _mm256_mul_ps(iq3,jq2);
162 qq33 = _mm256_mul_ps(iq3,jq3);
164 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
165 rcutoff_scalar = fr->rcoulomb;
166 rcutoff = _mm256_set1_ps(rcutoff_scalar);
167 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
169 sh_vdw_invrcut6 = _mm256_set1_ps(fr->ic->sh_invrc6);
170 rvdw = _mm256_set1_ps(fr->rvdw);
172 /* Avoid stupid compiler warnings */
173 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
186 for(iidx=0;iidx<4*DIM;iidx++)
191 /* Start outer loop over neighborlists */
192 for(iidx=0; iidx<nri; iidx++)
194 /* Load shift vector for this list */
195 i_shift_offset = DIM*shiftidx[iidx];
197 /* Load limits for loop over neighbors */
198 j_index_start = jindex[iidx];
199 j_index_end = jindex[iidx+1];
201 /* Get outer coordinate index */
203 i_coord_offset = DIM*inr;
205 /* Load i particle coords and add shift vector */
206 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
207 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
209 fix0 = _mm256_setzero_ps();
210 fiy0 = _mm256_setzero_ps();
211 fiz0 = _mm256_setzero_ps();
212 fix1 = _mm256_setzero_ps();
213 fiy1 = _mm256_setzero_ps();
214 fiz1 = _mm256_setzero_ps();
215 fix2 = _mm256_setzero_ps();
216 fiy2 = _mm256_setzero_ps();
217 fiz2 = _mm256_setzero_ps();
218 fix3 = _mm256_setzero_ps();
219 fiy3 = _mm256_setzero_ps();
220 fiz3 = _mm256_setzero_ps();
222 /* Reset potential sums */
223 velecsum = _mm256_setzero_ps();
224 vvdwsum = _mm256_setzero_ps();
226 /* Start inner kernel loop */
227 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
230 /* Get j neighbor index, and coordinate index */
239 j_coord_offsetA = DIM*jnrA;
240 j_coord_offsetB = DIM*jnrB;
241 j_coord_offsetC = DIM*jnrC;
242 j_coord_offsetD = DIM*jnrD;
243 j_coord_offsetE = DIM*jnrE;
244 j_coord_offsetF = DIM*jnrF;
245 j_coord_offsetG = DIM*jnrG;
246 j_coord_offsetH = DIM*jnrH;
248 /* load j atom coordinates */
249 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
250 x+j_coord_offsetC,x+j_coord_offsetD,
251 x+j_coord_offsetE,x+j_coord_offsetF,
252 x+j_coord_offsetG,x+j_coord_offsetH,
253 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
254 &jy2,&jz2,&jx3,&jy3,&jz3);
256 /* Calculate displacement vector */
257 dx00 = _mm256_sub_ps(ix0,jx0);
258 dy00 = _mm256_sub_ps(iy0,jy0);
259 dz00 = _mm256_sub_ps(iz0,jz0);
260 dx11 = _mm256_sub_ps(ix1,jx1);
261 dy11 = _mm256_sub_ps(iy1,jy1);
262 dz11 = _mm256_sub_ps(iz1,jz1);
263 dx12 = _mm256_sub_ps(ix1,jx2);
264 dy12 = _mm256_sub_ps(iy1,jy2);
265 dz12 = _mm256_sub_ps(iz1,jz2);
266 dx13 = _mm256_sub_ps(ix1,jx3);
267 dy13 = _mm256_sub_ps(iy1,jy3);
268 dz13 = _mm256_sub_ps(iz1,jz3);
269 dx21 = _mm256_sub_ps(ix2,jx1);
270 dy21 = _mm256_sub_ps(iy2,jy1);
271 dz21 = _mm256_sub_ps(iz2,jz1);
272 dx22 = _mm256_sub_ps(ix2,jx2);
273 dy22 = _mm256_sub_ps(iy2,jy2);
274 dz22 = _mm256_sub_ps(iz2,jz2);
275 dx23 = _mm256_sub_ps(ix2,jx3);
276 dy23 = _mm256_sub_ps(iy2,jy3);
277 dz23 = _mm256_sub_ps(iz2,jz3);
278 dx31 = _mm256_sub_ps(ix3,jx1);
279 dy31 = _mm256_sub_ps(iy3,jy1);
280 dz31 = _mm256_sub_ps(iz3,jz1);
281 dx32 = _mm256_sub_ps(ix3,jx2);
282 dy32 = _mm256_sub_ps(iy3,jy2);
283 dz32 = _mm256_sub_ps(iz3,jz2);
284 dx33 = _mm256_sub_ps(ix3,jx3);
285 dy33 = _mm256_sub_ps(iy3,jy3);
286 dz33 = _mm256_sub_ps(iz3,jz3);
288 /* Calculate squared distance and things based on it */
289 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
290 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
291 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
292 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
293 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
294 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
295 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
296 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
297 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
298 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
300 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
301 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
302 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
303 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
304 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
305 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
306 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
307 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
308 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
310 rinvsq00 = gmx_mm256_inv_ps(rsq00);
311 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
312 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
313 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
314 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
315 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
316 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
317 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
318 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
319 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
321 fjx0 = _mm256_setzero_ps();
322 fjy0 = _mm256_setzero_ps();
323 fjz0 = _mm256_setzero_ps();
324 fjx1 = _mm256_setzero_ps();
325 fjy1 = _mm256_setzero_ps();
326 fjz1 = _mm256_setzero_ps();
327 fjx2 = _mm256_setzero_ps();
328 fjy2 = _mm256_setzero_ps();
329 fjz2 = _mm256_setzero_ps();
330 fjx3 = _mm256_setzero_ps();
331 fjy3 = _mm256_setzero_ps();
332 fjz3 = _mm256_setzero_ps();
334 /**************************
335 * CALCULATE INTERACTIONS *
336 **************************/
338 if (gmx_mm256_any_lt(rsq00,rcutoff2))
341 /* LENNARD-JONES DISPERSION/REPULSION */
343 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
344 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
345 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
346 vvdw = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
347 _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
348 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
350 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
354 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
358 fscal = _mm256_and_ps(fscal,cutoff_mask);
360 /* Calculate temporary vectorial force */
361 tx = _mm256_mul_ps(fscal,dx00);
362 ty = _mm256_mul_ps(fscal,dy00);
363 tz = _mm256_mul_ps(fscal,dz00);
365 /* Update vectorial force */
366 fix0 = _mm256_add_ps(fix0,tx);
367 fiy0 = _mm256_add_ps(fiy0,ty);
368 fiz0 = _mm256_add_ps(fiz0,tz);
370 fjx0 = _mm256_add_ps(fjx0,tx);
371 fjy0 = _mm256_add_ps(fjy0,ty);
372 fjz0 = _mm256_add_ps(fjz0,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 if (gmx_mm256_any_lt(rsq11,rcutoff2))
383 r11 = _mm256_mul_ps(rsq11,rinv11);
385 /* EWALD ELECTROSTATICS */
387 /* Analytical PME correction */
388 zeta2 = _mm256_mul_ps(beta2,rsq11);
389 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
390 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
391 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
392 felec = _mm256_mul_ps(qq11,felec);
393 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
394 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
395 velec = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
396 velec = _mm256_mul_ps(qq11,velec);
398 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velec = _mm256_and_ps(velec,cutoff_mask);
402 velecsum = _mm256_add_ps(velecsum,velec);
406 fscal = _mm256_and_ps(fscal,cutoff_mask);
408 /* Calculate temporary vectorial force */
409 tx = _mm256_mul_ps(fscal,dx11);
410 ty = _mm256_mul_ps(fscal,dy11);
411 tz = _mm256_mul_ps(fscal,dz11);
413 /* Update vectorial force */
414 fix1 = _mm256_add_ps(fix1,tx);
415 fiy1 = _mm256_add_ps(fiy1,ty);
416 fiz1 = _mm256_add_ps(fiz1,tz);
418 fjx1 = _mm256_add_ps(fjx1,tx);
419 fjy1 = _mm256_add_ps(fjy1,ty);
420 fjz1 = _mm256_add_ps(fjz1,tz);
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 if (gmx_mm256_any_lt(rsq12,rcutoff2))
431 r12 = _mm256_mul_ps(rsq12,rinv12);
433 /* EWALD ELECTROSTATICS */
435 /* Analytical PME correction */
436 zeta2 = _mm256_mul_ps(beta2,rsq12);
437 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
438 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
439 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
440 felec = _mm256_mul_ps(qq12,felec);
441 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
442 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
443 velec = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
444 velec = _mm256_mul_ps(qq12,velec);
446 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
448 /* Update potential sum for this i atom from the interaction with this j atom. */
449 velec = _mm256_and_ps(velec,cutoff_mask);
450 velecsum = _mm256_add_ps(velecsum,velec);
454 fscal = _mm256_and_ps(fscal,cutoff_mask);
456 /* Calculate temporary vectorial force */
457 tx = _mm256_mul_ps(fscal,dx12);
458 ty = _mm256_mul_ps(fscal,dy12);
459 tz = _mm256_mul_ps(fscal,dz12);
461 /* Update vectorial force */
462 fix1 = _mm256_add_ps(fix1,tx);
463 fiy1 = _mm256_add_ps(fiy1,ty);
464 fiz1 = _mm256_add_ps(fiz1,tz);
466 fjx2 = _mm256_add_ps(fjx2,tx);
467 fjy2 = _mm256_add_ps(fjy2,ty);
468 fjz2 = _mm256_add_ps(fjz2,tz);
472 /**************************
473 * CALCULATE INTERACTIONS *
474 **************************/
476 if (gmx_mm256_any_lt(rsq13,rcutoff2))
479 r13 = _mm256_mul_ps(rsq13,rinv13);
481 /* EWALD ELECTROSTATICS */
483 /* Analytical PME correction */
484 zeta2 = _mm256_mul_ps(beta2,rsq13);
485 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
486 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
487 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
488 felec = _mm256_mul_ps(qq13,felec);
489 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
490 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
491 velec = _mm256_sub_ps(_mm256_sub_ps(rinv13,sh_ewald),pmecorrV);
492 velec = _mm256_mul_ps(qq13,velec);
494 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velec = _mm256_and_ps(velec,cutoff_mask);
498 velecsum = _mm256_add_ps(velecsum,velec);
502 fscal = _mm256_and_ps(fscal,cutoff_mask);
504 /* Calculate temporary vectorial force */
505 tx = _mm256_mul_ps(fscal,dx13);
506 ty = _mm256_mul_ps(fscal,dy13);
507 tz = _mm256_mul_ps(fscal,dz13);
509 /* Update vectorial force */
510 fix1 = _mm256_add_ps(fix1,tx);
511 fiy1 = _mm256_add_ps(fiy1,ty);
512 fiz1 = _mm256_add_ps(fiz1,tz);
514 fjx3 = _mm256_add_ps(fjx3,tx);
515 fjy3 = _mm256_add_ps(fjy3,ty);
516 fjz3 = _mm256_add_ps(fjz3,tz);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 if (gmx_mm256_any_lt(rsq21,rcutoff2))
527 r21 = _mm256_mul_ps(rsq21,rinv21);
529 /* EWALD ELECTROSTATICS */
531 /* Analytical PME correction */
532 zeta2 = _mm256_mul_ps(beta2,rsq21);
533 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
534 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
535 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
536 felec = _mm256_mul_ps(qq21,felec);
537 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
538 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
539 velec = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
540 velec = _mm256_mul_ps(qq21,velec);
542 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
544 /* Update potential sum for this i atom from the interaction with this j atom. */
545 velec = _mm256_and_ps(velec,cutoff_mask);
546 velecsum = _mm256_add_ps(velecsum,velec);
550 fscal = _mm256_and_ps(fscal,cutoff_mask);
552 /* Calculate temporary vectorial force */
553 tx = _mm256_mul_ps(fscal,dx21);
554 ty = _mm256_mul_ps(fscal,dy21);
555 tz = _mm256_mul_ps(fscal,dz21);
557 /* Update vectorial force */
558 fix2 = _mm256_add_ps(fix2,tx);
559 fiy2 = _mm256_add_ps(fiy2,ty);
560 fiz2 = _mm256_add_ps(fiz2,tz);
562 fjx1 = _mm256_add_ps(fjx1,tx);
563 fjy1 = _mm256_add_ps(fjy1,ty);
564 fjz1 = _mm256_add_ps(fjz1,tz);
568 /**************************
569 * CALCULATE INTERACTIONS *
570 **************************/
572 if (gmx_mm256_any_lt(rsq22,rcutoff2))
575 r22 = _mm256_mul_ps(rsq22,rinv22);
577 /* EWALD ELECTROSTATICS */
579 /* Analytical PME correction */
580 zeta2 = _mm256_mul_ps(beta2,rsq22);
581 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
582 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
583 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
584 felec = _mm256_mul_ps(qq22,felec);
585 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
586 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
587 velec = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
588 velec = _mm256_mul_ps(qq22,velec);
590 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
592 /* Update potential sum for this i atom from the interaction with this j atom. */
593 velec = _mm256_and_ps(velec,cutoff_mask);
594 velecsum = _mm256_add_ps(velecsum,velec);
598 fscal = _mm256_and_ps(fscal,cutoff_mask);
600 /* Calculate temporary vectorial force */
601 tx = _mm256_mul_ps(fscal,dx22);
602 ty = _mm256_mul_ps(fscal,dy22);
603 tz = _mm256_mul_ps(fscal,dz22);
605 /* Update vectorial force */
606 fix2 = _mm256_add_ps(fix2,tx);
607 fiy2 = _mm256_add_ps(fiy2,ty);
608 fiz2 = _mm256_add_ps(fiz2,tz);
610 fjx2 = _mm256_add_ps(fjx2,tx);
611 fjy2 = _mm256_add_ps(fjy2,ty);
612 fjz2 = _mm256_add_ps(fjz2,tz);
616 /**************************
617 * CALCULATE INTERACTIONS *
618 **************************/
620 if (gmx_mm256_any_lt(rsq23,rcutoff2))
623 r23 = _mm256_mul_ps(rsq23,rinv23);
625 /* EWALD ELECTROSTATICS */
627 /* Analytical PME correction */
628 zeta2 = _mm256_mul_ps(beta2,rsq23);
629 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
630 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
631 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
632 felec = _mm256_mul_ps(qq23,felec);
633 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
634 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
635 velec = _mm256_sub_ps(_mm256_sub_ps(rinv23,sh_ewald),pmecorrV);
636 velec = _mm256_mul_ps(qq23,velec);
638 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
640 /* Update potential sum for this i atom from the interaction with this j atom. */
641 velec = _mm256_and_ps(velec,cutoff_mask);
642 velecsum = _mm256_add_ps(velecsum,velec);
646 fscal = _mm256_and_ps(fscal,cutoff_mask);
648 /* Calculate temporary vectorial force */
649 tx = _mm256_mul_ps(fscal,dx23);
650 ty = _mm256_mul_ps(fscal,dy23);
651 tz = _mm256_mul_ps(fscal,dz23);
653 /* Update vectorial force */
654 fix2 = _mm256_add_ps(fix2,tx);
655 fiy2 = _mm256_add_ps(fiy2,ty);
656 fiz2 = _mm256_add_ps(fiz2,tz);
658 fjx3 = _mm256_add_ps(fjx3,tx);
659 fjy3 = _mm256_add_ps(fjy3,ty);
660 fjz3 = _mm256_add_ps(fjz3,tz);
664 /**************************
665 * CALCULATE INTERACTIONS *
666 **************************/
668 if (gmx_mm256_any_lt(rsq31,rcutoff2))
671 r31 = _mm256_mul_ps(rsq31,rinv31);
673 /* EWALD ELECTROSTATICS */
675 /* Analytical PME correction */
676 zeta2 = _mm256_mul_ps(beta2,rsq31);
677 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
678 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
679 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
680 felec = _mm256_mul_ps(qq31,felec);
681 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
682 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
683 velec = _mm256_sub_ps(_mm256_sub_ps(rinv31,sh_ewald),pmecorrV);
684 velec = _mm256_mul_ps(qq31,velec);
686 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
688 /* Update potential sum for this i atom from the interaction with this j atom. */
689 velec = _mm256_and_ps(velec,cutoff_mask);
690 velecsum = _mm256_add_ps(velecsum,velec);
694 fscal = _mm256_and_ps(fscal,cutoff_mask);
696 /* Calculate temporary vectorial force */
697 tx = _mm256_mul_ps(fscal,dx31);
698 ty = _mm256_mul_ps(fscal,dy31);
699 tz = _mm256_mul_ps(fscal,dz31);
701 /* Update vectorial force */
702 fix3 = _mm256_add_ps(fix3,tx);
703 fiy3 = _mm256_add_ps(fiy3,ty);
704 fiz3 = _mm256_add_ps(fiz3,tz);
706 fjx1 = _mm256_add_ps(fjx1,tx);
707 fjy1 = _mm256_add_ps(fjy1,ty);
708 fjz1 = _mm256_add_ps(fjz1,tz);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 if (gmx_mm256_any_lt(rsq32,rcutoff2))
719 r32 = _mm256_mul_ps(rsq32,rinv32);
721 /* EWALD ELECTROSTATICS */
723 /* Analytical PME correction */
724 zeta2 = _mm256_mul_ps(beta2,rsq32);
725 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
726 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
727 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
728 felec = _mm256_mul_ps(qq32,felec);
729 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
730 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
731 velec = _mm256_sub_ps(_mm256_sub_ps(rinv32,sh_ewald),pmecorrV);
732 velec = _mm256_mul_ps(qq32,velec);
734 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
736 /* Update potential sum for this i atom from the interaction with this j atom. */
737 velec = _mm256_and_ps(velec,cutoff_mask);
738 velecsum = _mm256_add_ps(velecsum,velec);
742 fscal = _mm256_and_ps(fscal,cutoff_mask);
744 /* Calculate temporary vectorial force */
745 tx = _mm256_mul_ps(fscal,dx32);
746 ty = _mm256_mul_ps(fscal,dy32);
747 tz = _mm256_mul_ps(fscal,dz32);
749 /* Update vectorial force */
750 fix3 = _mm256_add_ps(fix3,tx);
751 fiy3 = _mm256_add_ps(fiy3,ty);
752 fiz3 = _mm256_add_ps(fiz3,tz);
754 fjx2 = _mm256_add_ps(fjx2,tx);
755 fjy2 = _mm256_add_ps(fjy2,ty);
756 fjz2 = _mm256_add_ps(fjz2,tz);
760 /**************************
761 * CALCULATE INTERACTIONS *
762 **************************/
764 if (gmx_mm256_any_lt(rsq33,rcutoff2))
767 r33 = _mm256_mul_ps(rsq33,rinv33);
769 /* EWALD ELECTROSTATICS */
771 /* Analytical PME correction */
772 zeta2 = _mm256_mul_ps(beta2,rsq33);
773 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
774 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
775 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
776 felec = _mm256_mul_ps(qq33,felec);
777 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
778 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
779 velec = _mm256_sub_ps(_mm256_sub_ps(rinv33,sh_ewald),pmecorrV);
780 velec = _mm256_mul_ps(qq33,velec);
782 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
784 /* Update potential sum for this i atom from the interaction with this j atom. */
785 velec = _mm256_and_ps(velec,cutoff_mask);
786 velecsum = _mm256_add_ps(velecsum,velec);
790 fscal = _mm256_and_ps(fscal,cutoff_mask);
792 /* Calculate temporary vectorial force */
793 tx = _mm256_mul_ps(fscal,dx33);
794 ty = _mm256_mul_ps(fscal,dy33);
795 tz = _mm256_mul_ps(fscal,dz33);
797 /* Update vectorial force */
798 fix3 = _mm256_add_ps(fix3,tx);
799 fiy3 = _mm256_add_ps(fiy3,ty);
800 fiz3 = _mm256_add_ps(fiz3,tz);
802 fjx3 = _mm256_add_ps(fjx3,tx);
803 fjy3 = _mm256_add_ps(fjy3,ty);
804 fjz3 = _mm256_add_ps(fjz3,tz);
808 fjptrA = f+j_coord_offsetA;
809 fjptrB = f+j_coord_offsetB;
810 fjptrC = f+j_coord_offsetC;
811 fjptrD = f+j_coord_offsetD;
812 fjptrE = f+j_coord_offsetE;
813 fjptrF = f+j_coord_offsetF;
814 fjptrG = f+j_coord_offsetG;
815 fjptrH = f+j_coord_offsetH;
817 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
818 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
819 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
821 /* Inner loop uses 1025 flops */
827 /* Get j neighbor index, and coordinate index */
828 jnrlistA = jjnr[jidx];
829 jnrlistB = jjnr[jidx+1];
830 jnrlistC = jjnr[jidx+2];
831 jnrlistD = jjnr[jidx+3];
832 jnrlistE = jjnr[jidx+4];
833 jnrlistF = jjnr[jidx+5];
834 jnrlistG = jjnr[jidx+6];
835 jnrlistH = jjnr[jidx+7];
836 /* Sign of each element will be negative for non-real atoms.
837 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
838 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
840 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
841 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
843 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
844 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
845 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
846 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
847 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
848 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
849 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
850 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
851 j_coord_offsetA = DIM*jnrA;
852 j_coord_offsetB = DIM*jnrB;
853 j_coord_offsetC = DIM*jnrC;
854 j_coord_offsetD = DIM*jnrD;
855 j_coord_offsetE = DIM*jnrE;
856 j_coord_offsetF = DIM*jnrF;
857 j_coord_offsetG = DIM*jnrG;
858 j_coord_offsetH = DIM*jnrH;
860 /* load j atom coordinates */
861 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
862 x+j_coord_offsetC,x+j_coord_offsetD,
863 x+j_coord_offsetE,x+j_coord_offsetF,
864 x+j_coord_offsetG,x+j_coord_offsetH,
865 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
866 &jy2,&jz2,&jx3,&jy3,&jz3);
868 /* Calculate displacement vector */
869 dx00 = _mm256_sub_ps(ix0,jx0);
870 dy00 = _mm256_sub_ps(iy0,jy0);
871 dz00 = _mm256_sub_ps(iz0,jz0);
872 dx11 = _mm256_sub_ps(ix1,jx1);
873 dy11 = _mm256_sub_ps(iy1,jy1);
874 dz11 = _mm256_sub_ps(iz1,jz1);
875 dx12 = _mm256_sub_ps(ix1,jx2);
876 dy12 = _mm256_sub_ps(iy1,jy2);
877 dz12 = _mm256_sub_ps(iz1,jz2);
878 dx13 = _mm256_sub_ps(ix1,jx3);
879 dy13 = _mm256_sub_ps(iy1,jy3);
880 dz13 = _mm256_sub_ps(iz1,jz3);
881 dx21 = _mm256_sub_ps(ix2,jx1);
882 dy21 = _mm256_sub_ps(iy2,jy1);
883 dz21 = _mm256_sub_ps(iz2,jz1);
884 dx22 = _mm256_sub_ps(ix2,jx2);
885 dy22 = _mm256_sub_ps(iy2,jy2);
886 dz22 = _mm256_sub_ps(iz2,jz2);
887 dx23 = _mm256_sub_ps(ix2,jx3);
888 dy23 = _mm256_sub_ps(iy2,jy3);
889 dz23 = _mm256_sub_ps(iz2,jz3);
890 dx31 = _mm256_sub_ps(ix3,jx1);
891 dy31 = _mm256_sub_ps(iy3,jy1);
892 dz31 = _mm256_sub_ps(iz3,jz1);
893 dx32 = _mm256_sub_ps(ix3,jx2);
894 dy32 = _mm256_sub_ps(iy3,jy2);
895 dz32 = _mm256_sub_ps(iz3,jz2);
896 dx33 = _mm256_sub_ps(ix3,jx3);
897 dy33 = _mm256_sub_ps(iy3,jy3);
898 dz33 = _mm256_sub_ps(iz3,jz3);
900 /* Calculate squared distance and things based on it */
901 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
902 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
903 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
904 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
905 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
906 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
907 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
908 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
909 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
910 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
912 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
913 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
914 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
915 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
916 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
917 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
918 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
919 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
920 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
922 rinvsq00 = gmx_mm256_inv_ps(rsq00);
923 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
924 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
925 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
926 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
927 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
928 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
929 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
930 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
931 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
933 fjx0 = _mm256_setzero_ps();
934 fjy0 = _mm256_setzero_ps();
935 fjz0 = _mm256_setzero_ps();
936 fjx1 = _mm256_setzero_ps();
937 fjy1 = _mm256_setzero_ps();
938 fjz1 = _mm256_setzero_ps();
939 fjx2 = _mm256_setzero_ps();
940 fjy2 = _mm256_setzero_ps();
941 fjz2 = _mm256_setzero_ps();
942 fjx3 = _mm256_setzero_ps();
943 fjy3 = _mm256_setzero_ps();
944 fjz3 = _mm256_setzero_ps();
946 /**************************
947 * CALCULATE INTERACTIONS *
948 **************************/
950 if (gmx_mm256_any_lt(rsq00,rcutoff2))
953 /* LENNARD-JONES DISPERSION/REPULSION */
955 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
956 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
957 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
958 vvdw = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
959 _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
960 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
962 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
964 /* Update potential sum for this i atom from the interaction with this j atom. */
965 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
966 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
967 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
971 fscal = _mm256_and_ps(fscal,cutoff_mask);
973 fscal = _mm256_andnot_ps(dummy_mask,fscal);
975 /* Calculate temporary vectorial force */
976 tx = _mm256_mul_ps(fscal,dx00);
977 ty = _mm256_mul_ps(fscal,dy00);
978 tz = _mm256_mul_ps(fscal,dz00);
980 /* Update vectorial force */
981 fix0 = _mm256_add_ps(fix0,tx);
982 fiy0 = _mm256_add_ps(fiy0,ty);
983 fiz0 = _mm256_add_ps(fiz0,tz);
985 fjx0 = _mm256_add_ps(fjx0,tx);
986 fjy0 = _mm256_add_ps(fjy0,ty);
987 fjz0 = _mm256_add_ps(fjz0,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 if (gmx_mm256_any_lt(rsq11,rcutoff2))
998 r11 = _mm256_mul_ps(rsq11,rinv11);
999 r11 = _mm256_andnot_ps(dummy_mask,r11);
1001 /* EWALD ELECTROSTATICS */
1003 /* Analytical PME correction */
1004 zeta2 = _mm256_mul_ps(beta2,rsq11);
1005 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1006 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1007 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1008 felec = _mm256_mul_ps(qq11,felec);
1009 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1010 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1011 velec = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
1012 velec = _mm256_mul_ps(qq11,velec);
1014 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1016 /* Update potential sum for this i atom from the interaction with this j atom. */
1017 velec = _mm256_and_ps(velec,cutoff_mask);
1018 velec = _mm256_andnot_ps(dummy_mask,velec);
1019 velecsum = _mm256_add_ps(velecsum,velec);
1023 fscal = _mm256_and_ps(fscal,cutoff_mask);
1025 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1027 /* Calculate temporary vectorial force */
1028 tx = _mm256_mul_ps(fscal,dx11);
1029 ty = _mm256_mul_ps(fscal,dy11);
1030 tz = _mm256_mul_ps(fscal,dz11);
1032 /* Update vectorial force */
1033 fix1 = _mm256_add_ps(fix1,tx);
1034 fiy1 = _mm256_add_ps(fiy1,ty);
1035 fiz1 = _mm256_add_ps(fiz1,tz);
1037 fjx1 = _mm256_add_ps(fjx1,tx);
1038 fjy1 = _mm256_add_ps(fjy1,ty);
1039 fjz1 = _mm256_add_ps(fjz1,tz);
1043 /**************************
1044 * CALCULATE INTERACTIONS *
1045 **************************/
1047 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1050 r12 = _mm256_mul_ps(rsq12,rinv12);
1051 r12 = _mm256_andnot_ps(dummy_mask,r12);
1053 /* EWALD ELECTROSTATICS */
1055 /* Analytical PME correction */
1056 zeta2 = _mm256_mul_ps(beta2,rsq12);
1057 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1058 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1059 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1060 felec = _mm256_mul_ps(qq12,felec);
1061 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1062 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1063 velec = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
1064 velec = _mm256_mul_ps(qq12,velec);
1066 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1068 /* Update potential sum for this i atom from the interaction with this j atom. */
1069 velec = _mm256_and_ps(velec,cutoff_mask);
1070 velec = _mm256_andnot_ps(dummy_mask,velec);
1071 velecsum = _mm256_add_ps(velecsum,velec);
1075 fscal = _mm256_and_ps(fscal,cutoff_mask);
1077 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1079 /* Calculate temporary vectorial force */
1080 tx = _mm256_mul_ps(fscal,dx12);
1081 ty = _mm256_mul_ps(fscal,dy12);
1082 tz = _mm256_mul_ps(fscal,dz12);
1084 /* Update vectorial force */
1085 fix1 = _mm256_add_ps(fix1,tx);
1086 fiy1 = _mm256_add_ps(fiy1,ty);
1087 fiz1 = _mm256_add_ps(fiz1,tz);
1089 fjx2 = _mm256_add_ps(fjx2,tx);
1090 fjy2 = _mm256_add_ps(fjy2,ty);
1091 fjz2 = _mm256_add_ps(fjz2,tz);
1095 /**************************
1096 * CALCULATE INTERACTIONS *
1097 **************************/
1099 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1102 r13 = _mm256_mul_ps(rsq13,rinv13);
1103 r13 = _mm256_andnot_ps(dummy_mask,r13);
1105 /* EWALD ELECTROSTATICS */
1107 /* Analytical PME correction */
1108 zeta2 = _mm256_mul_ps(beta2,rsq13);
1109 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1110 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1111 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1112 felec = _mm256_mul_ps(qq13,felec);
1113 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1114 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1115 velec = _mm256_sub_ps(_mm256_sub_ps(rinv13,sh_ewald),pmecorrV);
1116 velec = _mm256_mul_ps(qq13,velec);
1118 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1120 /* Update potential sum for this i atom from the interaction with this j atom. */
1121 velec = _mm256_and_ps(velec,cutoff_mask);
1122 velec = _mm256_andnot_ps(dummy_mask,velec);
1123 velecsum = _mm256_add_ps(velecsum,velec);
1127 fscal = _mm256_and_ps(fscal,cutoff_mask);
1129 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1131 /* Calculate temporary vectorial force */
1132 tx = _mm256_mul_ps(fscal,dx13);
1133 ty = _mm256_mul_ps(fscal,dy13);
1134 tz = _mm256_mul_ps(fscal,dz13);
1136 /* Update vectorial force */
1137 fix1 = _mm256_add_ps(fix1,tx);
1138 fiy1 = _mm256_add_ps(fiy1,ty);
1139 fiz1 = _mm256_add_ps(fiz1,tz);
1141 fjx3 = _mm256_add_ps(fjx3,tx);
1142 fjy3 = _mm256_add_ps(fjy3,ty);
1143 fjz3 = _mm256_add_ps(fjz3,tz);
1147 /**************************
1148 * CALCULATE INTERACTIONS *
1149 **************************/
1151 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1154 r21 = _mm256_mul_ps(rsq21,rinv21);
1155 r21 = _mm256_andnot_ps(dummy_mask,r21);
1157 /* EWALD ELECTROSTATICS */
1159 /* Analytical PME correction */
1160 zeta2 = _mm256_mul_ps(beta2,rsq21);
1161 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1162 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1163 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1164 felec = _mm256_mul_ps(qq21,felec);
1165 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1166 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1167 velec = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
1168 velec = _mm256_mul_ps(qq21,velec);
1170 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1172 /* Update potential sum for this i atom from the interaction with this j atom. */
1173 velec = _mm256_and_ps(velec,cutoff_mask);
1174 velec = _mm256_andnot_ps(dummy_mask,velec);
1175 velecsum = _mm256_add_ps(velecsum,velec);
1179 fscal = _mm256_and_ps(fscal,cutoff_mask);
1181 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1183 /* Calculate temporary vectorial force */
1184 tx = _mm256_mul_ps(fscal,dx21);
1185 ty = _mm256_mul_ps(fscal,dy21);
1186 tz = _mm256_mul_ps(fscal,dz21);
1188 /* Update vectorial force */
1189 fix2 = _mm256_add_ps(fix2,tx);
1190 fiy2 = _mm256_add_ps(fiy2,ty);
1191 fiz2 = _mm256_add_ps(fiz2,tz);
1193 fjx1 = _mm256_add_ps(fjx1,tx);
1194 fjy1 = _mm256_add_ps(fjy1,ty);
1195 fjz1 = _mm256_add_ps(fjz1,tz);
1199 /**************************
1200 * CALCULATE INTERACTIONS *
1201 **************************/
1203 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1206 r22 = _mm256_mul_ps(rsq22,rinv22);
1207 r22 = _mm256_andnot_ps(dummy_mask,r22);
1209 /* EWALD ELECTROSTATICS */
1211 /* Analytical PME correction */
1212 zeta2 = _mm256_mul_ps(beta2,rsq22);
1213 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1214 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1215 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1216 felec = _mm256_mul_ps(qq22,felec);
1217 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1218 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1219 velec = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
1220 velec = _mm256_mul_ps(qq22,velec);
1222 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1224 /* Update potential sum for this i atom from the interaction with this j atom. */
1225 velec = _mm256_and_ps(velec,cutoff_mask);
1226 velec = _mm256_andnot_ps(dummy_mask,velec);
1227 velecsum = _mm256_add_ps(velecsum,velec);
1231 fscal = _mm256_and_ps(fscal,cutoff_mask);
1233 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1235 /* Calculate temporary vectorial force */
1236 tx = _mm256_mul_ps(fscal,dx22);
1237 ty = _mm256_mul_ps(fscal,dy22);
1238 tz = _mm256_mul_ps(fscal,dz22);
1240 /* Update vectorial force */
1241 fix2 = _mm256_add_ps(fix2,tx);
1242 fiy2 = _mm256_add_ps(fiy2,ty);
1243 fiz2 = _mm256_add_ps(fiz2,tz);
1245 fjx2 = _mm256_add_ps(fjx2,tx);
1246 fjy2 = _mm256_add_ps(fjy2,ty);
1247 fjz2 = _mm256_add_ps(fjz2,tz);
1251 /**************************
1252 * CALCULATE INTERACTIONS *
1253 **************************/
1255 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1258 r23 = _mm256_mul_ps(rsq23,rinv23);
1259 r23 = _mm256_andnot_ps(dummy_mask,r23);
1261 /* EWALD ELECTROSTATICS */
1263 /* Analytical PME correction */
1264 zeta2 = _mm256_mul_ps(beta2,rsq23);
1265 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1266 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1267 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1268 felec = _mm256_mul_ps(qq23,felec);
1269 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1270 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1271 velec = _mm256_sub_ps(_mm256_sub_ps(rinv23,sh_ewald),pmecorrV);
1272 velec = _mm256_mul_ps(qq23,velec);
1274 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1276 /* Update potential sum for this i atom from the interaction with this j atom. */
1277 velec = _mm256_and_ps(velec,cutoff_mask);
1278 velec = _mm256_andnot_ps(dummy_mask,velec);
1279 velecsum = _mm256_add_ps(velecsum,velec);
1283 fscal = _mm256_and_ps(fscal,cutoff_mask);
1285 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1287 /* Calculate temporary vectorial force */
1288 tx = _mm256_mul_ps(fscal,dx23);
1289 ty = _mm256_mul_ps(fscal,dy23);
1290 tz = _mm256_mul_ps(fscal,dz23);
1292 /* Update vectorial force */
1293 fix2 = _mm256_add_ps(fix2,tx);
1294 fiy2 = _mm256_add_ps(fiy2,ty);
1295 fiz2 = _mm256_add_ps(fiz2,tz);
1297 fjx3 = _mm256_add_ps(fjx3,tx);
1298 fjy3 = _mm256_add_ps(fjy3,ty);
1299 fjz3 = _mm256_add_ps(fjz3,tz);
1303 /**************************
1304 * CALCULATE INTERACTIONS *
1305 **************************/
1307 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1310 r31 = _mm256_mul_ps(rsq31,rinv31);
1311 r31 = _mm256_andnot_ps(dummy_mask,r31);
1313 /* EWALD ELECTROSTATICS */
1315 /* Analytical PME correction */
1316 zeta2 = _mm256_mul_ps(beta2,rsq31);
1317 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1318 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1319 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1320 felec = _mm256_mul_ps(qq31,felec);
1321 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1322 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1323 velec = _mm256_sub_ps(_mm256_sub_ps(rinv31,sh_ewald),pmecorrV);
1324 velec = _mm256_mul_ps(qq31,velec);
1326 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1328 /* Update potential sum for this i atom from the interaction with this j atom. */
1329 velec = _mm256_and_ps(velec,cutoff_mask);
1330 velec = _mm256_andnot_ps(dummy_mask,velec);
1331 velecsum = _mm256_add_ps(velecsum,velec);
1335 fscal = _mm256_and_ps(fscal,cutoff_mask);
1337 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1339 /* Calculate temporary vectorial force */
1340 tx = _mm256_mul_ps(fscal,dx31);
1341 ty = _mm256_mul_ps(fscal,dy31);
1342 tz = _mm256_mul_ps(fscal,dz31);
1344 /* Update vectorial force */
1345 fix3 = _mm256_add_ps(fix3,tx);
1346 fiy3 = _mm256_add_ps(fiy3,ty);
1347 fiz3 = _mm256_add_ps(fiz3,tz);
1349 fjx1 = _mm256_add_ps(fjx1,tx);
1350 fjy1 = _mm256_add_ps(fjy1,ty);
1351 fjz1 = _mm256_add_ps(fjz1,tz);
1355 /**************************
1356 * CALCULATE INTERACTIONS *
1357 **************************/
1359 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1362 r32 = _mm256_mul_ps(rsq32,rinv32);
1363 r32 = _mm256_andnot_ps(dummy_mask,r32);
1365 /* EWALD ELECTROSTATICS */
1367 /* Analytical PME correction */
1368 zeta2 = _mm256_mul_ps(beta2,rsq32);
1369 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1370 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1371 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1372 felec = _mm256_mul_ps(qq32,felec);
1373 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1374 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1375 velec = _mm256_sub_ps(_mm256_sub_ps(rinv32,sh_ewald),pmecorrV);
1376 velec = _mm256_mul_ps(qq32,velec);
1378 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1380 /* Update potential sum for this i atom from the interaction with this j atom. */
1381 velec = _mm256_and_ps(velec,cutoff_mask);
1382 velec = _mm256_andnot_ps(dummy_mask,velec);
1383 velecsum = _mm256_add_ps(velecsum,velec);
1387 fscal = _mm256_and_ps(fscal,cutoff_mask);
1389 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1391 /* Calculate temporary vectorial force */
1392 tx = _mm256_mul_ps(fscal,dx32);
1393 ty = _mm256_mul_ps(fscal,dy32);
1394 tz = _mm256_mul_ps(fscal,dz32);
1396 /* Update vectorial force */
1397 fix3 = _mm256_add_ps(fix3,tx);
1398 fiy3 = _mm256_add_ps(fiy3,ty);
1399 fiz3 = _mm256_add_ps(fiz3,tz);
1401 fjx2 = _mm256_add_ps(fjx2,tx);
1402 fjy2 = _mm256_add_ps(fjy2,ty);
1403 fjz2 = _mm256_add_ps(fjz2,tz);
1407 /**************************
1408 * CALCULATE INTERACTIONS *
1409 **************************/
1411 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1414 r33 = _mm256_mul_ps(rsq33,rinv33);
1415 r33 = _mm256_andnot_ps(dummy_mask,r33);
1417 /* EWALD ELECTROSTATICS */
1419 /* Analytical PME correction */
1420 zeta2 = _mm256_mul_ps(beta2,rsq33);
1421 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1422 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1423 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1424 felec = _mm256_mul_ps(qq33,felec);
1425 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1426 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1427 velec = _mm256_sub_ps(_mm256_sub_ps(rinv33,sh_ewald),pmecorrV);
1428 velec = _mm256_mul_ps(qq33,velec);
1430 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1432 /* Update potential sum for this i atom from the interaction with this j atom. */
1433 velec = _mm256_and_ps(velec,cutoff_mask);
1434 velec = _mm256_andnot_ps(dummy_mask,velec);
1435 velecsum = _mm256_add_ps(velecsum,velec);
1439 fscal = _mm256_and_ps(fscal,cutoff_mask);
1441 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1443 /* Calculate temporary vectorial force */
1444 tx = _mm256_mul_ps(fscal,dx33);
1445 ty = _mm256_mul_ps(fscal,dy33);
1446 tz = _mm256_mul_ps(fscal,dz33);
1448 /* Update vectorial force */
1449 fix3 = _mm256_add_ps(fix3,tx);
1450 fiy3 = _mm256_add_ps(fiy3,ty);
1451 fiz3 = _mm256_add_ps(fiz3,tz);
1453 fjx3 = _mm256_add_ps(fjx3,tx);
1454 fjy3 = _mm256_add_ps(fjy3,ty);
1455 fjz3 = _mm256_add_ps(fjz3,tz);
1459 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1460 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1461 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1462 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1463 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1464 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1465 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1466 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1468 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1469 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1470 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1472 /* Inner loop uses 1034 flops */
1475 /* End of innermost loop */
1477 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1478 f+i_coord_offset,fshift+i_shift_offset);
1481 /* Update potential energies */
1482 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1483 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1485 /* Increment number of inner iterations */
1486 inneriter += j_index_end - j_index_start;
1488 /* Outer loop uses 26 flops */
1491 /* Increment number of outer iterations */
1494 /* Update outer/inner flops */
1496 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*1034);
1499 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_single
1500 * Electrostatics interaction: Ewald
1501 * VdW interaction: LennardJones
1502 * Geometry: Water4-Water4
1503 * Calculate force/pot: Force
1506 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_single
1507 (t_nblist * gmx_restrict nlist,
1508 rvec * gmx_restrict xx,
1509 rvec * gmx_restrict ff,
1510 t_forcerec * gmx_restrict fr,
1511 t_mdatoms * gmx_restrict mdatoms,
1512 nb_kernel_data_t * gmx_restrict kernel_data,
1513 t_nrnb * gmx_restrict nrnb)
1515 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1516 * just 0 for non-waters.
1517 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1518 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1520 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1521 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1522 int jnrA,jnrB,jnrC,jnrD;
1523 int jnrE,jnrF,jnrG,jnrH;
1524 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1525 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1526 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1527 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1528 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1529 real rcutoff_scalar;
1530 real *shiftvec,*fshift,*x,*f;
1531 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1532 real scratch[4*DIM];
1533 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1534 real * vdwioffsetptr0;
1535 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1536 real * vdwioffsetptr1;
1537 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1538 real * vdwioffsetptr2;
1539 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1540 real * vdwioffsetptr3;
1541 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1542 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1543 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1544 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1545 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1546 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1547 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1548 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1549 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1550 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1551 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1552 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1553 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1554 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1555 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1556 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1557 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1558 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1559 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1560 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1563 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1566 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1567 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1569 __m128i ewitab_lo,ewitab_hi;
1570 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1571 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1573 __m256 dummy_mask,cutoff_mask;
1574 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1575 __m256 one = _mm256_set1_ps(1.0);
1576 __m256 two = _mm256_set1_ps(2.0);
1582 jindex = nlist->jindex;
1584 shiftidx = nlist->shift;
1586 shiftvec = fr->shift_vec[0];
1587 fshift = fr->fshift[0];
1588 facel = _mm256_set1_ps(fr->epsfac);
1589 charge = mdatoms->chargeA;
1590 nvdwtype = fr->ntype;
1591 vdwparam = fr->nbfp;
1592 vdwtype = mdatoms->typeA;
1594 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1595 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
1596 beta2 = _mm256_mul_ps(beta,beta);
1597 beta3 = _mm256_mul_ps(beta,beta2);
1599 ewtab = fr->ic->tabq_coul_F;
1600 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1601 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1603 /* Setup water-specific parameters */
1604 inr = nlist->iinr[0];
1605 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1606 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1607 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1608 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1610 jq1 = _mm256_set1_ps(charge[inr+1]);
1611 jq2 = _mm256_set1_ps(charge[inr+2]);
1612 jq3 = _mm256_set1_ps(charge[inr+3]);
1613 vdwjidx0A = 2*vdwtype[inr+0];
1614 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1615 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1616 qq11 = _mm256_mul_ps(iq1,jq1);
1617 qq12 = _mm256_mul_ps(iq1,jq2);
1618 qq13 = _mm256_mul_ps(iq1,jq3);
1619 qq21 = _mm256_mul_ps(iq2,jq1);
1620 qq22 = _mm256_mul_ps(iq2,jq2);
1621 qq23 = _mm256_mul_ps(iq2,jq3);
1622 qq31 = _mm256_mul_ps(iq3,jq1);
1623 qq32 = _mm256_mul_ps(iq3,jq2);
1624 qq33 = _mm256_mul_ps(iq3,jq3);
1626 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1627 rcutoff_scalar = fr->rcoulomb;
1628 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1629 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1631 sh_vdw_invrcut6 = _mm256_set1_ps(fr->ic->sh_invrc6);
1632 rvdw = _mm256_set1_ps(fr->rvdw);
1634 /* Avoid stupid compiler warnings */
1635 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1636 j_coord_offsetA = 0;
1637 j_coord_offsetB = 0;
1638 j_coord_offsetC = 0;
1639 j_coord_offsetD = 0;
1640 j_coord_offsetE = 0;
1641 j_coord_offsetF = 0;
1642 j_coord_offsetG = 0;
1643 j_coord_offsetH = 0;
1648 for(iidx=0;iidx<4*DIM;iidx++)
1650 scratch[iidx] = 0.0;
1653 /* Start outer loop over neighborlists */
1654 for(iidx=0; iidx<nri; iidx++)
1656 /* Load shift vector for this list */
1657 i_shift_offset = DIM*shiftidx[iidx];
1659 /* Load limits for loop over neighbors */
1660 j_index_start = jindex[iidx];
1661 j_index_end = jindex[iidx+1];
1663 /* Get outer coordinate index */
1665 i_coord_offset = DIM*inr;
1667 /* Load i particle coords and add shift vector */
1668 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1669 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1671 fix0 = _mm256_setzero_ps();
1672 fiy0 = _mm256_setzero_ps();
1673 fiz0 = _mm256_setzero_ps();
1674 fix1 = _mm256_setzero_ps();
1675 fiy1 = _mm256_setzero_ps();
1676 fiz1 = _mm256_setzero_ps();
1677 fix2 = _mm256_setzero_ps();
1678 fiy2 = _mm256_setzero_ps();
1679 fiz2 = _mm256_setzero_ps();
1680 fix3 = _mm256_setzero_ps();
1681 fiy3 = _mm256_setzero_ps();
1682 fiz3 = _mm256_setzero_ps();
1684 /* Start inner kernel loop */
1685 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1688 /* Get j neighbor index, and coordinate index */
1690 jnrB = jjnr[jidx+1];
1691 jnrC = jjnr[jidx+2];
1692 jnrD = jjnr[jidx+3];
1693 jnrE = jjnr[jidx+4];
1694 jnrF = jjnr[jidx+5];
1695 jnrG = jjnr[jidx+6];
1696 jnrH = jjnr[jidx+7];
1697 j_coord_offsetA = DIM*jnrA;
1698 j_coord_offsetB = DIM*jnrB;
1699 j_coord_offsetC = DIM*jnrC;
1700 j_coord_offsetD = DIM*jnrD;
1701 j_coord_offsetE = DIM*jnrE;
1702 j_coord_offsetF = DIM*jnrF;
1703 j_coord_offsetG = DIM*jnrG;
1704 j_coord_offsetH = DIM*jnrH;
1706 /* load j atom coordinates */
1707 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1708 x+j_coord_offsetC,x+j_coord_offsetD,
1709 x+j_coord_offsetE,x+j_coord_offsetF,
1710 x+j_coord_offsetG,x+j_coord_offsetH,
1711 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1712 &jy2,&jz2,&jx3,&jy3,&jz3);
1714 /* Calculate displacement vector */
1715 dx00 = _mm256_sub_ps(ix0,jx0);
1716 dy00 = _mm256_sub_ps(iy0,jy0);
1717 dz00 = _mm256_sub_ps(iz0,jz0);
1718 dx11 = _mm256_sub_ps(ix1,jx1);
1719 dy11 = _mm256_sub_ps(iy1,jy1);
1720 dz11 = _mm256_sub_ps(iz1,jz1);
1721 dx12 = _mm256_sub_ps(ix1,jx2);
1722 dy12 = _mm256_sub_ps(iy1,jy2);
1723 dz12 = _mm256_sub_ps(iz1,jz2);
1724 dx13 = _mm256_sub_ps(ix1,jx3);
1725 dy13 = _mm256_sub_ps(iy1,jy3);
1726 dz13 = _mm256_sub_ps(iz1,jz3);
1727 dx21 = _mm256_sub_ps(ix2,jx1);
1728 dy21 = _mm256_sub_ps(iy2,jy1);
1729 dz21 = _mm256_sub_ps(iz2,jz1);
1730 dx22 = _mm256_sub_ps(ix2,jx2);
1731 dy22 = _mm256_sub_ps(iy2,jy2);
1732 dz22 = _mm256_sub_ps(iz2,jz2);
1733 dx23 = _mm256_sub_ps(ix2,jx3);
1734 dy23 = _mm256_sub_ps(iy2,jy3);
1735 dz23 = _mm256_sub_ps(iz2,jz3);
1736 dx31 = _mm256_sub_ps(ix3,jx1);
1737 dy31 = _mm256_sub_ps(iy3,jy1);
1738 dz31 = _mm256_sub_ps(iz3,jz1);
1739 dx32 = _mm256_sub_ps(ix3,jx2);
1740 dy32 = _mm256_sub_ps(iy3,jy2);
1741 dz32 = _mm256_sub_ps(iz3,jz2);
1742 dx33 = _mm256_sub_ps(ix3,jx3);
1743 dy33 = _mm256_sub_ps(iy3,jy3);
1744 dz33 = _mm256_sub_ps(iz3,jz3);
1746 /* Calculate squared distance and things based on it */
1747 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1748 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1749 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1750 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1751 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1752 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1753 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1754 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1755 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1756 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1758 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1759 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1760 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1761 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1762 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1763 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1764 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1765 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1766 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1768 rinvsq00 = gmx_mm256_inv_ps(rsq00);
1769 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1770 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1771 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1772 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1773 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1774 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1775 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1776 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1777 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1779 fjx0 = _mm256_setzero_ps();
1780 fjy0 = _mm256_setzero_ps();
1781 fjz0 = _mm256_setzero_ps();
1782 fjx1 = _mm256_setzero_ps();
1783 fjy1 = _mm256_setzero_ps();
1784 fjz1 = _mm256_setzero_ps();
1785 fjx2 = _mm256_setzero_ps();
1786 fjy2 = _mm256_setzero_ps();
1787 fjz2 = _mm256_setzero_ps();
1788 fjx3 = _mm256_setzero_ps();
1789 fjy3 = _mm256_setzero_ps();
1790 fjz3 = _mm256_setzero_ps();
1792 /**************************
1793 * CALCULATE INTERACTIONS *
1794 **************************/
1796 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1799 /* LENNARD-JONES DISPERSION/REPULSION */
1801 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1802 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1804 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1808 fscal = _mm256_and_ps(fscal,cutoff_mask);
1810 /* Calculate temporary vectorial force */
1811 tx = _mm256_mul_ps(fscal,dx00);
1812 ty = _mm256_mul_ps(fscal,dy00);
1813 tz = _mm256_mul_ps(fscal,dz00);
1815 /* Update vectorial force */
1816 fix0 = _mm256_add_ps(fix0,tx);
1817 fiy0 = _mm256_add_ps(fiy0,ty);
1818 fiz0 = _mm256_add_ps(fiz0,tz);
1820 fjx0 = _mm256_add_ps(fjx0,tx);
1821 fjy0 = _mm256_add_ps(fjy0,ty);
1822 fjz0 = _mm256_add_ps(fjz0,tz);
1826 /**************************
1827 * CALCULATE INTERACTIONS *
1828 **************************/
1830 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1833 r11 = _mm256_mul_ps(rsq11,rinv11);
1835 /* EWALD ELECTROSTATICS */
1837 /* Analytical PME correction */
1838 zeta2 = _mm256_mul_ps(beta2,rsq11);
1839 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1840 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1841 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1842 felec = _mm256_mul_ps(qq11,felec);
1844 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1848 fscal = _mm256_and_ps(fscal,cutoff_mask);
1850 /* Calculate temporary vectorial force */
1851 tx = _mm256_mul_ps(fscal,dx11);
1852 ty = _mm256_mul_ps(fscal,dy11);
1853 tz = _mm256_mul_ps(fscal,dz11);
1855 /* Update vectorial force */
1856 fix1 = _mm256_add_ps(fix1,tx);
1857 fiy1 = _mm256_add_ps(fiy1,ty);
1858 fiz1 = _mm256_add_ps(fiz1,tz);
1860 fjx1 = _mm256_add_ps(fjx1,tx);
1861 fjy1 = _mm256_add_ps(fjy1,ty);
1862 fjz1 = _mm256_add_ps(fjz1,tz);
1866 /**************************
1867 * CALCULATE INTERACTIONS *
1868 **************************/
1870 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1873 r12 = _mm256_mul_ps(rsq12,rinv12);
1875 /* EWALD ELECTROSTATICS */
1877 /* Analytical PME correction */
1878 zeta2 = _mm256_mul_ps(beta2,rsq12);
1879 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1880 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1881 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1882 felec = _mm256_mul_ps(qq12,felec);
1884 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1888 fscal = _mm256_and_ps(fscal,cutoff_mask);
1890 /* Calculate temporary vectorial force */
1891 tx = _mm256_mul_ps(fscal,dx12);
1892 ty = _mm256_mul_ps(fscal,dy12);
1893 tz = _mm256_mul_ps(fscal,dz12);
1895 /* Update vectorial force */
1896 fix1 = _mm256_add_ps(fix1,tx);
1897 fiy1 = _mm256_add_ps(fiy1,ty);
1898 fiz1 = _mm256_add_ps(fiz1,tz);
1900 fjx2 = _mm256_add_ps(fjx2,tx);
1901 fjy2 = _mm256_add_ps(fjy2,ty);
1902 fjz2 = _mm256_add_ps(fjz2,tz);
1906 /**************************
1907 * CALCULATE INTERACTIONS *
1908 **************************/
1910 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1913 r13 = _mm256_mul_ps(rsq13,rinv13);
1915 /* EWALD ELECTROSTATICS */
1917 /* Analytical PME correction */
1918 zeta2 = _mm256_mul_ps(beta2,rsq13);
1919 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1920 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1921 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1922 felec = _mm256_mul_ps(qq13,felec);
1924 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1928 fscal = _mm256_and_ps(fscal,cutoff_mask);
1930 /* Calculate temporary vectorial force */
1931 tx = _mm256_mul_ps(fscal,dx13);
1932 ty = _mm256_mul_ps(fscal,dy13);
1933 tz = _mm256_mul_ps(fscal,dz13);
1935 /* Update vectorial force */
1936 fix1 = _mm256_add_ps(fix1,tx);
1937 fiy1 = _mm256_add_ps(fiy1,ty);
1938 fiz1 = _mm256_add_ps(fiz1,tz);
1940 fjx3 = _mm256_add_ps(fjx3,tx);
1941 fjy3 = _mm256_add_ps(fjy3,ty);
1942 fjz3 = _mm256_add_ps(fjz3,tz);
1946 /**************************
1947 * CALCULATE INTERACTIONS *
1948 **************************/
1950 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1953 r21 = _mm256_mul_ps(rsq21,rinv21);
1955 /* EWALD ELECTROSTATICS */
1957 /* Analytical PME correction */
1958 zeta2 = _mm256_mul_ps(beta2,rsq21);
1959 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1960 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1961 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1962 felec = _mm256_mul_ps(qq21,felec);
1964 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1968 fscal = _mm256_and_ps(fscal,cutoff_mask);
1970 /* Calculate temporary vectorial force */
1971 tx = _mm256_mul_ps(fscal,dx21);
1972 ty = _mm256_mul_ps(fscal,dy21);
1973 tz = _mm256_mul_ps(fscal,dz21);
1975 /* Update vectorial force */
1976 fix2 = _mm256_add_ps(fix2,tx);
1977 fiy2 = _mm256_add_ps(fiy2,ty);
1978 fiz2 = _mm256_add_ps(fiz2,tz);
1980 fjx1 = _mm256_add_ps(fjx1,tx);
1981 fjy1 = _mm256_add_ps(fjy1,ty);
1982 fjz1 = _mm256_add_ps(fjz1,tz);
1986 /**************************
1987 * CALCULATE INTERACTIONS *
1988 **************************/
1990 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1993 r22 = _mm256_mul_ps(rsq22,rinv22);
1995 /* EWALD ELECTROSTATICS */
1997 /* Analytical PME correction */
1998 zeta2 = _mm256_mul_ps(beta2,rsq22);
1999 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2000 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2001 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2002 felec = _mm256_mul_ps(qq22,felec);
2004 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2008 fscal = _mm256_and_ps(fscal,cutoff_mask);
2010 /* Calculate temporary vectorial force */
2011 tx = _mm256_mul_ps(fscal,dx22);
2012 ty = _mm256_mul_ps(fscal,dy22);
2013 tz = _mm256_mul_ps(fscal,dz22);
2015 /* Update vectorial force */
2016 fix2 = _mm256_add_ps(fix2,tx);
2017 fiy2 = _mm256_add_ps(fiy2,ty);
2018 fiz2 = _mm256_add_ps(fiz2,tz);
2020 fjx2 = _mm256_add_ps(fjx2,tx);
2021 fjy2 = _mm256_add_ps(fjy2,ty);
2022 fjz2 = _mm256_add_ps(fjz2,tz);
2026 /**************************
2027 * CALCULATE INTERACTIONS *
2028 **************************/
2030 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2033 r23 = _mm256_mul_ps(rsq23,rinv23);
2035 /* EWALD ELECTROSTATICS */
2037 /* Analytical PME correction */
2038 zeta2 = _mm256_mul_ps(beta2,rsq23);
2039 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2040 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2041 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2042 felec = _mm256_mul_ps(qq23,felec);
2044 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2048 fscal = _mm256_and_ps(fscal,cutoff_mask);
2050 /* Calculate temporary vectorial force */
2051 tx = _mm256_mul_ps(fscal,dx23);
2052 ty = _mm256_mul_ps(fscal,dy23);
2053 tz = _mm256_mul_ps(fscal,dz23);
2055 /* Update vectorial force */
2056 fix2 = _mm256_add_ps(fix2,tx);
2057 fiy2 = _mm256_add_ps(fiy2,ty);
2058 fiz2 = _mm256_add_ps(fiz2,tz);
2060 fjx3 = _mm256_add_ps(fjx3,tx);
2061 fjy3 = _mm256_add_ps(fjy3,ty);
2062 fjz3 = _mm256_add_ps(fjz3,tz);
2066 /**************************
2067 * CALCULATE INTERACTIONS *
2068 **************************/
2070 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2073 r31 = _mm256_mul_ps(rsq31,rinv31);
2075 /* EWALD ELECTROSTATICS */
2077 /* Analytical PME correction */
2078 zeta2 = _mm256_mul_ps(beta2,rsq31);
2079 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2080 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2081 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2082 felec = _mm256_mul_ps(qq31,felec);
2084 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2088 fscal = _mm256_and_ps(fscal,cutoff_mask);
2090 /* Calculate temporary vectorial force */
2091 tx = _mm256_mul_ps(fscal,dx31);
2092 ty = _mm256_mul_ps(fscal,dy31);
2093 tz = _mm256_mul_ps(fscal,dz31);
2095 /* Update vectorial force */
2096 fix3 = _mm256_add_ps(fix3,tx);
2097 fiy3 = _mm256_add_ps(fiy3,ty);
2098 fiz3 = _mm256_add_ps(fiz3,tz);
2100 fjx1 = _mm256_add_ps(fjx1,tx);
2101 fjy1 = _mm256_add_ps(fjy1,ty);
2102 fjz1 = _mm256_add_ps(fjz1,tz);
2106 /**************************
2107 * CALCULATE INTERACTIONS *
2108 **************************/
2110 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2113 r32 = _mm256_mul_ps(rsq32,rinv32);
2115 /* EWALD ELECTROSTATICS */
2117 /* Analytical PME correction */
2118 zeta2 = _mm256_mul_ps(beta2,rsq32);
2119 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2120 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2121 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2122 felec = _mm256_mul_ps(qq32,felec);
2124 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2128 fscal = _mm256_and_ps(fscal,cutoff_mask);
2130 /* Calculate temporary vectorial force */
2131 tx = _mm256_mul_ps(fscal,dx32);
2132 ty = _mm256_mul_ps(fscal,dy32);
2133 tz = _mm256_mul_ps(fscal,dz32);
2135 /* Update vectorial force */
2136 fix3 = _mm256_add_ps(fix3,tx);
2137 fiy3 = _mm256_add_ps(fiy3,ty);
2138 fiz3 = _mm256_add_ps(fiz3,tz);
2140 fjx2 = _mm256_add_ps(fjx2,tx);
2141 fjy2 = _mm256_add_ps(fjy2,ty);
2142 fjz2 = _mm256_add_ps(fjz2,tz);
2146 /**************************
2147 * CALCULATE INTERACTIONS *
2148 **************************/
2150 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2153 r33 = _mm256_mul_ps(rsq33,rinv33);
2155 /* EWALD ELECTROSTATICS */
2157 /* Analytical PME correction */
2158 zeta2 = _mm256_mul_ps(beta2,rsq33);
2159 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2160 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2161 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2162 felec = _mm256_mul_ps(qq33,felec);
2164 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2168 fscal = _mm256_and_ps(fscal,cutoff_mask);
2170 /* Calculate temporary vectorial force */
2171 tx = _mm256_mul_ps(fscal,dx33);
2172 ty = _mm256_mul_ps(fscal,dy33);
2173 tz = _mm256_mul_ps(fscal,dz33);
2175 /* Update vectorial force */
2176 fix3 = _mm256_add_ps(fix3,tx);
2177 fiy3 = _mm256_add_ps(fiy3,ty);
2178 fiz3 = _mm256_add_ps(fiz3,tz);
2180 fjx3 = _mm256_add_ps(fjx3,tx);
2181 fjy3 = _mm256_add_ps(fjy3,ty);
2182 fjz3 = _mm256_add_ps(fjz3,tz);
2186 fjptrA = f+j_coord_offsetA;
2187 fjptrB = f+j_coord_offsetB;
2188 fjptrC = f+j_coord_offsetC;
2189 fjptrD = f+j_coord_offsetD;
2190 fjptrE = f+j_coord_offsetE;
2191 fjptrF = f+j_coord_offsetF;
2192 fjptrG = f+j_coord_offsetG;
2193 fjptrH = f+j_coord_offsetH;
2195 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2196 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2197 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2199 /* Inner loop uses 564 flops */
2202 if(jidx<j_index_end)
2205 /* Get j neighbor index, and coordinate index */
2206 jnrlistA = jjnr[jidx];
2207 jnrlistB = jjnr[jidx+1];
2208 jnrlistC = jjnr[jidx+2];
2209 jnrlistD = jjnr[jidx+3];
2210 jnrlistE = jjnr[jidx+4];
2211 jnrlistF = jjnr[jidx+5];
2212 jnrlistG = jjnr[jidx+6];
2213 jnrlistH = jjnr[jidx+7];
2214 /* Sign of each element will be negative for non-real atoms.
2215 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2216 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2218 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2219 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2221 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2222 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2223 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2224 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2225 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2226 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2227 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2228 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2229 j_coord_offsetA = DIM*jnrA;
2230 j_coord_offsetB = DIM*jnrB;
2231 j_coord_offsetC = DIM*jnrC;
2232 j_coord_offsetD = DIM*jnrD;
2233 j_coord_offsetE = DIM*jnrE;
2234 j_coord_offsetF = DIM*jnrF;
2235 j_coord_offsetG = DIM*jnrG;
2236 j_coord_offsetH = DIM*jnrH;
2238 /* load j atom coordinates */
2239 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2240 x+j_coord_offsetC,x+j_coord_offsetD,
2241 x+j_coord_offsetE,x+j_coord_offsetF,
2242 x+j_coord_offsetG,x+j_coord_offsetH,
2243 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2244 &jy2,&jz2,&jx3,&jy3,&jz3);
2246 /* Calculate displacement vector */
2247 dx00 = _mm256_sub_ps(ix0,jx0);
2248 dy00 = _mm256_sub_ps(iy0,jy0);
2249 dz00 = _mm256_sub_ps(iz0,jz0);
2250 dx11 = _mm256_sub_ps(ix1,jx1);
2251 dy11 = _mm256_sub_ps(iy1,jy1);
2252 dz11 = _mm256_sub_ps(iz1,jz1);
2253 dx12 = _mm256_sub_ps(ix1,jx2);
2254 dy12 = _mm256_sub_ps(iy1,jy2);
2255 dz12 = _mm256_sub_ps(iz1,jz2);
2256 dx13 = _mm256_sub_ps(ix1,jx3);
2257 dy13 = _mm256_sub_ps(iy1,jy3);
2258 dz13 = _mm256_sub_ps(iz1,jz3);
2259 dx21 = _mm256_sub_ps(ix2,jx1);
2260 dy21 = _mm256_sub_ps(iy2,jy1);
2261 dz21 = _mm256_sub_ps(iz2,jz1);
2262 dx22 = _mm256_sub_ps(ix2,jx2);
2263 dy22 = _mm256_sub_ps(iy2,jy2);
2264 dz22 = _mm256_sub_ps(iz2,jz2);
2265 dx23 = _mm256_sub_ps(ix2,jx3);
2266 dy23 = _mm256_sub_ps(iy2,jy3);
2267 dz23 = _mm256_sub_ps(iz2,jz3);
2268 dx31 = _mm256_sub_ps(ix3,jx1);
2269 dy31 = _mm256_sub_ps(iy3,jy1);
2270 dz31 = _mm256_sub_ps(iz3,jz1);
2271 dx32 = _mm256_sub_ps(ix3,jx2);
2272 dy32 = _mm256_sub_ps(iy3,jy2);
2273 dz32 = _mm256_sub_ps(iz3,jz2);
2274 dx33 = _mm256_sub_ps(ix3,jx3);
2275 dy33 = _mm256_sub_ps(iy3,jy3);
2276 dz33 = _mm256_sub_ps(iz3,jz3);
2278 /* Calculate squared distance and things based on it */
2279 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2280 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2281 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2282 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2283 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2284 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2285 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2286 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2287 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2288 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2290 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2291 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2292 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2293 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2294 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2295 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2296 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2297 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2298 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2300 rinvsq00 = gmx_mm256_inv_ps(rsq00);
2301 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2302 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2303 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2304 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2305 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2306 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2307 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2308 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2309 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2311 fjx0 = _mm256_setzero_ps();
2312 fjy0 = _mm256_setzero_ps();
2313 fjz0 = _mm256_setzero_ps();
2314 fjx1 = _mm256_setzero_ps();
2315 fjy1 = _mm256_setzero_ps();
2316 fjz1 = _mm256_setzero_ps();
2317 fjx2 = _mm256_setzero_ps();
2318 fjy2 = _mm256_setzero_ps();
2319 fjz2 = _mm256_setzero_ps();
2320 fjx3 = _mm256_setzero_ps();
2321 fjy3 = _mm256_setzero_ps();
2322 fjz3 = _mm256_setzero_ps();
2324 /**************************
2325 * CALCULATE INTERACTIONS *
2326 **************************/
2328 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2331 /* LENNARD-JONES DISPERSION/REPULSION */
2333 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2334 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2336 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2340 fscal = _mm256_and_ps(fscal,cutoff_mask);
2342 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2344 /* Calculate temporary vectorial force */
2345 tx = _mm256_mul_ps(fscal,dx00);
2346 ty = _mm256_mul_ps(fscal,dy00);
2347 tz = _mm256_mul_ps(fscal,dz00);
2349 /* Update vectorial force */
2350 fix0 = _mm256_add_ps(fix0,tx);
2351 fiy0 = _mm256_add_ps(fiy0,ty);
2352 fiz0 = _mm256_add_ps(fiz0,tz);
2354 fjx0 = _mm256_add_ps(fjx0,tx);
2355 fjy0 = _mm256_add_ps(fjy0,ty);
2356 fjz0 = _mm256_add_ps(fjz0,tz);
2360 /**************************
2361 * CALCULATE INTERACTIONS *
2362 **************************/
2364 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2367 r11 = _mm256_mul_ps(rsq11,rinv11);
2368 r11 = _mm256_andnot_ps(dummy_mask,r11);
2370 /* EWALD ELECTROSTATICS */
2372 /* Analytical PME correction */
2373 zeta2 = _mm256_mul_ps(beta2,rsq11);
2374 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2375 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2376 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2377 felec = _mm256_mul_ps(qq11,felec);
2379 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2383 fscal = _mm256_and_ps(fscal,cutoff_mask);
2385 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2387 /* Calculate temporary vectorial force */
2388 tx = _mm256_mul_ps(fscal,dx11);
2389 ty = _mm256_mul_ps(fscal,dy11);
2390 tz = _mm256_mul_ps(fscal,dz11);
2392 /* Update vectorial force */
2393 fix1 = _mm256_add_ps(fix1,tx);
2394 fiy1 = _mm256_add_ps(fiy1,ty);
2395 fiz1 = _mm256_add_ps(fiz1,tz);
2397 fjx1 = _mm256_add_ps(fjx1,tx);
2398 fjy1 = _mm256_add_ps(fjy1,ty);
2399 fjz1 = _mm256_add_ps(fjz1,tz);
2403 /**************************
2404 * CALCULATE INTERACTIONS *
2405 **************************/
2407 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2410 r12 = _mm256_mul_ps(rsq12,rinv12);
2411 r12 = _mm256_andnot_ps(dummy_mask,r12);
2413 /* EWALD ELECTROSTATICS */
2415 /* Analytical PME correction */
2416 zeta2 = _mm256_mul_ps(beta2,rsq12);
2417 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2418 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2419 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2420 felec = _mm256_mul_ps(qq12,felec);
2422 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2426 fscal = _mm256_and_ps(fscal,cutoff_mask);
2428 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2430 /* Calculate temporary vectorial force */
2431 tx = _mm256_mul_ps(fscal,dx12);
2432 ty = _mm256_mul_ps(fscal,dy12);
2433 tz = _mm256_mul_ps(fscal,dz12);
2435 /* Update vectorial force */
2436 fix1 = _mm256_add_ps(fix1,tx);
2437 fiy1 = _mm256_add_ps(fiy1,ty);
2438 fiz1 = _mm256_add_ps(fiz1,tz);
2440 fjx2 = _mm256_add_ps(fjx2,tx);
2441 fjy2 = _mm256_add_ps(fjy2,ty);
2442 fjz2 = _mm256_add_ps(fjz2,tz);
2446 /**************************
2447 * CALCULATE INTERACTIONS *
2448 **************************/
2450 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2453 r13 = _mm256_mul_ps(rsq13,rinv13);
2454 r13 = _mm256_andnot_ps(dummy_mask,r13);
2456 /* EWALD ELECTROSTATICS */
2458 /* Analytical PME correction */
2459 zeta2 = _mm256_mul_ps(beta2,rsq13);
2460 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2461 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2462 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2463 felec = _mm256_mul_ps(qq13,felec);
2465 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2469 fscal = _mm256_and_ps(fscal,cutoff_mask);
2471 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2473 /* Calculate temporary vectorial force */
2474 tx = _mm256_mul_ps(fscal,dx13);
2475 ty = _mm256_mul_ps(fscal,dy13);
2476 tz = _mm256_mul_ps(fscal,dz13);
2478 /* Update vectorial force */
2479 fix1 = _mm256_add_ps(fix1,tx);
2480 fiy1 = _mm256_add_ps(fiy1,ty);
2481 fiz1 = _mm256_add_ps(fiz1,tz);
2483 fjx3 = _mm256_add_ps(fjx3,tx);
2484 fjy3 = _mm256_add_ps(fjy3,ty);
2485 fjz3 = _mm256_add_ps(fjz3,tz);
2489 /**************************
2490 * CALCULATE INTERACTIONS *
2491 **************************/
2493 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2496 r21 = _mm256_mul_ps(rsq21,rinv21);
2497 r21 = _mm256_andnot_ps(dummy_mask,r21);
2499 /* EWALD ELECTROSTATICS */
2501 /* Analytical PME correction */
2502 zeta2 = _mm256_mul_ps(beta2,rsq21);
2503 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2504 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2505 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2506 felec = _mm256_mul_ps(qq21,felec);
2508 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2512 fscal = _mm256_and_ps(fscal,cutoff_mask);
2514 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2516 /* Calculate temporary vectorial force */
2517 tx = _mm256_mul_ps(fscal,dx21);
2518 ty = _mm256_mul_ps(fscal,dy21);
2519 tz = _mm256_mul_ps(fscal,dz21);
2521 /* Update vectorial force */
2522 fix2 = _mm256_add_ps(fix2,tx);
2523 fiy2 = _mm256_add_ps(fiy2,ty);
2524 fiz2 = _mm256_add_ps(fiz2,tz);
2526 fjx1 = _mm256_add_ps(fjx1,tx);
2527 fjy1 = _mm256_add_ps(fjy1,ty);
2528 fjz1 = _mm256_add_ps(fjz1,tz);
2532 /**************************
2533 * CALCULATE INTERACTIONS *
2534 **************************/
2536 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2539 r22 = _mm256_mul_ps(rsq22,rinv22);
2540 r22 = _mm256_andnot_ps(dummy_mask,r22);
2542 /* EWALD ELECTROSTATICS */
2544 /* Analytical PME correction */
2545 zeta2 = _mm256_mul_ps(beta2,rsq22);
2546 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2547 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2548 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2549 felec = _mm256_mul_ps(qq22,felec);
2551 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2555 fscal = _mm256_and_ps(fscal,cutoff_mask);
2557 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2559 /* Calculate temporary vectorial force */
2560 tx = _mm256_mul_ps(fscal,dx22);
2561 ty = _mm256_mul_ps(fscal,dy22);
2562 tz = _mm256_mul_ps(fscal,dz22);
2564 /* Update vectorial force */
2565 fix2 = _mm256_add_ps(fix2,tx);
2566 fiy2 = _mm256_add_ps(fiy2,ty);
2567 fiz2 = _mm256_add_ps(fiz2,tz);
2569 fjx2 = _mm256_add_ps(fjx2,tx);
2570 fjy2 = _mm256_add_ps(fjy2,ty);
2571 fjz2 = _mm256_add_ps(fjz2,tz);
2575 /**************************
2576 * CALCULATE INTERACTIONS *
2577 **************************/
2579 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2582 r23 = _mm256_mul_ps(rsq23,rinv23);
2583 r23 = _mm256_andnot_ps(dummy_mask,r23);
2585 /* EWALD ELECTROSTATICS */
2587 /* Analytical PME correction */
2588 zeta2 = _mm256_mul_ps(beta2,rsq23);
2589 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2590 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2591 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2592 felec = _mm256_mul_ps(qq23,felec);
2594 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2598 fscal = _mm256_and_ps(fscal,cutoff_mask);
2600 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2602 /* Calculate temporary vectorial force */
2603 tx = _mm256_mul_ps(fscal,dx23);
2604 ty = _mm256_mul_ps(fscal,dy23);
2605 tz = _mm256_mul_ps(fscal,dz23);
2607 /* Update vectorial force */
2608 fix2 = _mm256_add_ps(fix2,tx);
2609 fiy2 = _mm256_add_ps(fiy2,ty);
2610 fiz2 = _mm256_add_ps(fiz2,tz);
2612 fjx3 = _mm256_add_ps(fjx3,tx);
2613 fjy3 = _mm256_add_ps(fjy3,ty);
2614 fjz3 = _mm256_add_ps(fjz3,tz);
2618 /**************************
2619 * CALCULATE INTERACTIONS *
2620 **************************/
2622 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2625 r31 = _mm256_mul_ps(rsq31,rinv31);
2626 r31 = _mm256_andnot_ps(dummy_mask,r31);
2628 /* EWALD ELECTROSTATICS */
2630 /* Analytical PME correction */
2631 zeta2 = _mm256_mul_ps(beta2,rsq31);
2632 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2633 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2634 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2635 felec = _mm256_mul_ps(qq31,felec);
2637 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2641 fscal = _mm256_and_ps(fscal,cutoff_mask);
2643 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2645 /* Calculate temporary vectorial force */
2646 tx = _mm256_mul_ps(fscal,dx31);
2647 ty = _mm256_mul_ps(fscal,dy31);
2648 tz = _mm256_mul_ps(fscal,dz31);
2650 /* Update vectorial force */
2651 fix3 = _mm256_add_ps(fix3,tx);
2652 fiy3 = _mm256_add_ps(fiy3,ty);
2653 fiz3 = _mm256_add_ps(fiz3,tz);
2655 fjx1 = _mm256_add_ps(fjx1,tx);
2656 fjy1 = _mm256_add_ps(fjy1,ty);
2657 fjz1 = _mm256_add_ps(fjz1,tz);
2661 /**************************
2662 * CALCULATE INTERACTIONS *
2663 **************************/
2665 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2668 r32 = _mm256_mul_ps(rsq32,rinv32);
2669 r32 = _mm256_andnot_ps(dummy_mask,r32);
2671 /* EWALD ELECTROSTATICS */
2673 /* Analytical PME correction */
2674 zeta2 = _mm256_mul_ps(beta2,rsq32);
2675 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2676 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2677 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2678 felec = _mm256_mul_ps(qq32,felec);
2680 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2684 fscal = _mm256_and_ps(fscal,cutoff_mask);
2686 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2688 /* Calculate temporary vectorial force */
2689 tx = _mm256_mul_ps(fscal,dx32);
2690 ty = _mm256_mul_ps(fscal,dy32);
2691 tz = _mm256_mul_ps(fscal,dz32);
2693 /* Update vectorial force */
2694 fix3 = _mm256_add_ps(fix3,tx);
2695 fiy3 = _mm256_add_ps(fiy3,ty);
2696 fiz3 = _mm256_add_ps(fiz3,tz);
2698 fjx2 = _mm256_add_ps(fjx2,tx);
2699 fjy2 = _mm256_add_ps(fjy2,ty);
2700 fjz2 = _mm256_add_ps(fjz2,tz);
2704 /**************************
2705 * CALCULATE INTERACTIONS *
2706 **************************/
2708 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2711 r33 = _mm256_mul_ps(rsq33,rinv33);
2712 r33 = _mm256_andnot_ps(dummy_mask,r33);
2714 /* EWALD ELECTROSTATICS */
2716 /* Analytical PME correction */
2717 zeta2 = _mm256_mul_ps(beta2,rsq33);
2718 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2719 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2720 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2721 felec = _mm256_mul_ps(qq33,felec);
2723 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2727 fscal = _mm256_and_ps(fscal,cutoff_mask);
2729 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2731 /* Calculate temporary vectorial force */
2732 tx = _mm256_mul_ps(fscal,dx33);
2733 ty = _mm256_mul_ps(fscal,dy33);
2734 tz = _mm256_mul_ps(fscal,dz33);
2736 /* Update vectorial force */
2737 fix3 = _mm256_add_ps(fix3,tx);
2738 fiy3 = _mm256_add_ps(fiy3,ty);
2739 fiz3 = _mm256_add_ps(fiz3,tz);
2741 fjx3 = _mm256_add_ps(fjx3,tx);
2742 fjy3 = _mm256_add_ps(fjy3,ty);
2743 fjz3 = _mm256_add_ps(fjz3,tz);
2747 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2748 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2749 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2750 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2751 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2752 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2753 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2754 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2756 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2757 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2758 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2760 /* Inner loop uses 573 flops */
2763 /* End of innermost loop */
2765 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2766 f+i_coord_offset,fshift+i_shift_offset);
2768 /* Increment number of inner iterations */
2769 inneriter += j_index_end - j_index_start;
2771 /* Outer loop uses 24 flops */
2774 /* Increment number of outer iterations */
2777 /* Update outer/inner flops */
2779 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*573);