2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72  * jnr indices corresponding to data put in the eight positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr0;
89 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90 real * vdwioffsetptr1;
91 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92 real * vdwioffsetptr2;
93 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
102 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
103 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
104 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
107 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
116 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
118 __m128i ewitab_lo,ewitab_hi;
119 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
120 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
122 __m256 dummy_mask,cutoff_mask;
123 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
124 __m256 one = _mm256_set1_ps(1.0);
125 __m256 two = _mm256_set1_ps(2.0);
131 jindex = nlist->jindex;
133 shiftidx = nlist->shift;
135 shiftvec = fr->shift_vec[0];
136 fshift = fr->fshift[0];
137 facel = _mm256_set1_ps(fr->epsfac);
138 charge = mdatoms->chargeA;
139 nvdwtype = fr->ntype;
141 vdwtype = mdatoms->typeA;
143 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
144 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
145 beta2 = _mm256_mul_ps(beta,beta);
146 beta3 = _mm256_mul_ps(beta,beta2);
148 ewtab = fr->ic->tabq_coul_FDV0;
149 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
150 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
152 /* Setup water-specific parameters */
153 inr = nlist->iinr[0];
154 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
155 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
156 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
157 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
159 jq0 = _mm256_set1_ps(charge[inr+0]);
160 jq1 = _mm256_set1_ps(charge[inr+1]);
161 jq2 = _mm256_set1_ps(charge[inr+2]);
162 vdwjidx0A = 2*vdwtype[inr+0];
163 qq00 = _mm256_mul_ps(iq0,jq0);
164 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
165 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
166 qq01 = _mm256_mul_ps(iq0,jq1);
167 qq02 = _mm256_mul_ps(iq0,jq2);
168 qq10 = _mm256_mul_ps(iq1,jq0);
169 qq11 = _mm256_mul_ps(iq1,jq1);
170 qq12 = _mm256_mul_ps(iq1,jq2);
171 qq20 = _mm256_mul_ps(iq2,jq0);
172 qq21 = _mm256_mul_ps(iq2,jq1);
173 qq22 = _mm256_mul_ps(iq2,jq2);
175 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
176 rcutoff_scalar = fr->rcoulomb;
177 rcutoff = _mm256_set1_ps(rcutoff_scalar);
178 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
180 sh_vdw_invrcut6 = _mm256_set1_ps(fr->ic->sh_invrc6);
181 rvdw = _mm256_set1_ps(fr->rvdw);
183 /* Avoid stupid compiler warnings */
184 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
197 for(iidx=0;iidx<4*DIM;iidx++)
202 /* Start outer loop over neighborlists */
203 for(iidx=0; iidx<nri; iidx++)
205 /* Load shift vector for this list */
206 i_shift_offset = DIM*shiftidx[iidx];
208 /* Load limits for loop over neighbors */
209 j_index_start = jindex[iidx];
210 j_index_end = jindex[iidx+1];
212 /* Get outer coordinate index */
214 i_coord_offset = DIM*inr;
216 /* Load i particle coords and add shift vector */
217 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
218 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
220 fix0 = _mm256_setzero_ps();
221 fiy0 = _mm256_setzero_ps();
222 fiz0 = _mm256_setzero_ps();
223 fix1 = _mm256_setzero_ps();
224 fiy1 = _mm256_setzero_ps();
225 fiz1 = _mm256_setzero_ps();
226 fix2 = _mm256_setzero_ps();
227 fiy2 = _mm256_setzero_ps();
228 fiz2 = _mm256_setzero_ps();
230 /* Reset potential sums */
231 velecsum = _mm256_setzero_ps();
232 vvdwsum = _mm256_setzero_ps();
234 /* Start inner kernel loop */
235 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
238 /* Get j neighbor index, and coordinate index */
247 j_coord_offsetA = DIM*jnrA;
248 j_coord_offsetB = DIM*jnrB;
249 j_coord_offsetC = DIM*jnrC;
250 j_coord_offsetD = DIM*jnrD;
251 j_coord_offsetE = DIM*jnrE;
252 j_coord_offsetF = DIM*jnrF;
253 j_coord_offsetG = DIM*jnrG;
254 j_coord_offsetH = DIM*jnrH;
256 /* load j atom coordinates */
257 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
258 x+j_coord_offsetC,x+j_coord_offsetD,
259 x+j_coord_offsetE,x+j_coord_offsetF,
260 x+j_coord_offsetG,x+j_coord_offsetH,
261 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
263 /* Calculate displacement vector */
264 dx00 = _mm256_sub_ps(ix0,jx0);
265 dy00 = _mm256_sub_ps(iy0,jy0);
266 dz00 = _mm256_sub_ps(iz0,jz0);
267 dx01 = _mm256_sub_ps(ix0,jx1);
268 dy01 = _mm256_sub_ps(iy0,jy1);
269 dz01 = _mm256_sub_ps(iz0,jz1);
270 dx02 = _mm256_sub_ps(ix0,jx2);
271 dy02 = _mm256_sub_ps(iy0,jy2);
272 dz02 = _mm256_sub_ps(iz0,jz2);
273 dx10 = _mm256_sub_ps(ix1,jx0);
274 dy10 = _mm256_sub_ps(iy1,jy0);
275 dz10 = _mm256_sub_ps(iz1,jz0);
276 dx11 = _mm256_sub_ps(ix1,jx1);
277 dy11 = _mm256_sub_ps(iy1,jy1);
278 dz11 = _mm256_sub_ps(iz1,jz1);
279 dx12 = _mm256_sub_ps(ix1,jx2);
280 dy12 = _mm256_sub_ps(iy1,jy2);
281 dz12 = _mm256_sub_ps(iz1,jz2);
282 dx20 = _mm256_sub_ps(ix2,jx0);
283 dy20 = _mm256_sub_ps(iy2,jy0);
284 dz20 = _mm256_sub_ps(iz2,jz0);
285 dx21 = _mm256_sub_ps(ix2,jx1);
286 dy21 = _mm256_sub_ps(iy2,jy1);
287 dz21 = _mm256_sub_ps(iz2,jz1);
288 dx22 = _mm256_sub_ps(ix2,jx2);
289 dy22 = _mm256_sub_ps(iy2,jy2);
290 dz22 = _mm256_sub_ps(iz2,jz2);
292 /* Calculate squared distance and things based on it */
293 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
294 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
295 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
296 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
297 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
298 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
299 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
300 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
301 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
303 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
304 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
305 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
306 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
307 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
308 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
309 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
310 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
311 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
313 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
314 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
315 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
316 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
317 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
318 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
319 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
320 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
321 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
323 fjx0 = _mm256_setzero_ps();
324 fjy0 = _mm256_setzero_ps();
325 fjz0 = _mm256_setzero_ps();
326 fjx1 = _mm256_setzero_ps();
327 fjy1 = _mm256_setzero_ps();
328 fjz1 = _mm256_setzero_ps();
329 fjx2 = _mm256_setzero_ps();
330 fjy2 = _mm256_setzero_ps();
331 fjz2 = _mm256_setzero_ps();
333 /**************************
334 * CALCULATE INTERACTIONS *
335 **************************/
337 if (gmx_mm256_any_lt(rsq00,rcutoff2))
340 r00 = _mm256_mul_ps(rsq00,rinv00);
342 /* EWALD ELECTROSTATICS */
344 /* Analytical PME correction */
345 zeta2 = _mm256_mul_ps(beta2,rsq00);
346 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
347 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
348 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
349 felec = _mm256_mul_ps(qq00,felec);
350 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
351 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
352 velec = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
353 velec = _mm256_mul_ps(qq00,velec);
355 /* LENNARD-JONES DISPERSION/REPULSION */
357 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
358 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
359 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
360 vvdw = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
361 _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
362 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
364 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 velec = _mm256_and_ps(velec,cutoff_mask);
368 velecsum = _mm256_add_ps(velecsum,velec);
369 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
370 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
372 fscal = _mm256_add_ps(felec,fvdw);
374 fscal = _mm256_and_ps(fscal,cutoff_mask);
376 /* Calculate temporary vectorial force */
377 tx = _mm256_mul_ps(fscal,dx00);
378 ty = _mm256_mul_ps(fscal,dy00);
379 tz = _mm256_mul_ps(fscal,dz00);
381 /* Update vectorial force */
382 fix0 = _mm256_add_ps(fix0,tx);
383 fiy0 = _mm256_add_ps(fiy0,ty);
384 fiz0 = _mm256_add_ps(fiz0,tz);
386 fjx0 = _mm256_add_ps(fjx0,tx);
387 fjy0 = _mm256_add_ps(fjy0,ty);
388 fjz0 = _mm256_add_ps(fjz0,tz);
392 /**************************
393 * CALCULATE INTERACTIONS *
394 **************************/
396 if (gmx_mm256_any_lt(rsq01,rcutoff2))
399 r01 = _mm256_mul_ps(rsq01,rinv01);
401 /* EWALD ELECTROSTATICS */
403 /* Analytical PME correction */
404 zeta2 = _mm256_mul_ps(beta2,rsq01);
405 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
406 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
407 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
408 felec = _mm256_mul_ps(qq01,felec);
409 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
410 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
411 velec = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
412 velec = _mm256_mul_ps(qq01,velec);
414 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
416 /* Update potential sum for this i atom from the interaction with this j atom. */
417 velec = _mm256_and_ps(velec,cutoff_mask);
418 velecsum = _mm256_add_ps(velecsum,velec);
422 fscal = _mm256_and_ps(fscal,cutoff_mask);
424 /* Calculate temporary vectorial force */
425 tx = _mm256_mul_ps(fscal,dx01);
426 ty = _mm256_mul_ps(fscal,dy01);
427 tz = _mm256_mul_ps(fscal,dz01);
429 /* Update vectorial force */
430 fix0 = _mm256_add_ps(fix0,tx);
431 fiy0 = _mm256_add_ps(fiy0,ty);
432 fiz0 = _mm256_add_ps(fiz0,tz);
434 fjx1 = _mm256_add_ps(fjx1,tx);
435 fjy1 = _mm256_add_ps(fjy1,ty);
436 fjz1 = _mm256_add_ps(fjz1,tz);
440 /**************************
441 * CALCULATE INTERACTIONS *
442 **************************/
444 if (gmx_mm256_any_lt(rsq02,rcutoff2))
447 r02 = _mm256_mul_ps(rsq02,rinv02);
449 /* EWALD ELECTROSTATICS */
451 /* Analytical PME correction */
452 zeta2 = _mm256_mul_ps(beta2,rsq02);
453 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
454 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
455 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
456 felec = _mm256_mul_ps(qq02,felec);
457 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
458 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
459 velec = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
460 velec = _mm256_mul_ps(qq02,velec);
462 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
464 /* Update potential sum for this i atom from the interaction with this j atom. */
465 velec = _mm256_and_ps(velec,cutoff_mask);
466 velecsum = _mm256_add_ps(velecsum,velec);
470 fscal = _mm256_and_ps(fscal,cutoff_mask);
472 /* Calculate temporary vectorial force */
473 tx = _mm256_mul_ps(fscal,dx02);
474 ty = _mm256_mul_ps(fscal,dy02);
475 tz = _mm256_mul_ps(fscal,dz02);
477 /* Update vectorial force */
478 fix0 = _mm256_add_ps(fix0,tx);
479 fiy0 = _mm256_add_ps(fiy0,ty);
480 fiz0 = _mm256_add_ps(fiz0,tz);
482 fjx2 = _mm256_add_ps(fjx2,tx);
483 fjy2 = _mm256_add_ps(fjy2,ty);
484 fjz2 = _mm256_add_ps(fjz2,tz);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 if (gmx_mm256_any_lt(rsq10,rcutoff2))
495 r10 = _mm256_mul_ps(rsq10,rinv10);
497 /* EWALD ELECTROSTATICS */
499 /* Analytical PME correction */
500 zeta2 = _mm256_mul_ps(beta2,rsq10);
501 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
502 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
503 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
504 felec = _mm256_mul_ps(qq10,felec);
505 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
506 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
507 velec = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
508 velec = _mm256_mul_ps(qq10,velec);
510 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
512 /* Update potential sum for this i atom from the interaction with this j atom. */
513 velec = _mm256_and_ps(velec,cutoff_mask);
514 velecsum = _mm256_add_ps(velecsum,velec);
518 fscal = _mm256_and_ps(fscal,cutoff_mask);
520 /* Calculate temporary vectorial force */
521 tx = _mm256_mul_ps(fscal,dx10);
522 ty = _mm256_mul_ps(fscal,dy10);
523 tz = _mm256_mul_ps(fscal,dz10);
525 /* Update vectorial force */
526 fix1 = _mm256_add_ps(fix1,tx);
527 fiy1 = _mm256_add_ps(fiy1,ty);
528 fiz1 = _mm256_add_ps(fiz1,tz);
530 fjx0 = _mm256_add_ps(fjx0,tx);
531 fjy0 = _mm256_add_ps(fjy0,ty);
532 fjz0 = _mm256_add_ps(fjz0,tz);
536 /**************************
537 * CALCULATE INTERACTIONS *
538 **************************/
540 if (gmx_mm256_any_lt(rsq11,rcutoff2))
543 r11 = _mm256_mul_ps(rsq11,rinv11);
545 /* EWALD ELECTROSTATICS */
547 /* Analytical PME correction */
548 zeta2 = _mm256_mul_ps(beta2,rsq11);
549 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
550 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
551 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
552 felec = _mm256_mul_ps(qq11,felec);
553 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
554 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
555 velec = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
556 velec = _mm256_mul_ps(qq11,velec);
558 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 velec = _mm256_and_ps(velec,cutoff_mask);
562 velecsum = _mm256_add_ps(velecsum,velec);
566 fscal = _mm256_and_ps(fscal,cutoff_mask);
568 /* Calculate temporary vectorial force */
569 tx = _mm256_mul_ps(fscal,dx11);
570 ty = _mm256_mul_ps(fscal,dy11);
571 tz = _mm256_mul_ps(fscal,dz11);
573 /* Update vectorial force */
574 fix1 = _mm256_add_ps(fix1,tx);
575 fiy1 = _mm256_add_ps(fiy1,ty);
576 fiz1 = _mm256_add_ps(fiz1,tz);
578 fjx1 = _mm256_add_ps(fjx1,tx);
579 fjy1 = _mm256_add_ps(fjy1,ty);
580 fjz1 = _mm256_add_ps(fjz1,tz);
584 /**************************
585 * CALCULATE INTERACTIONS *
586 **************************/
588 if (gmx_mm256_any_lt(rsq12,rcutoff2))
591 r12 = _mm256_mul_ps(rsq12,rinv12);
593 /* EWALD ELECTROSTATICS */
595 /* Analytical PME correction */
596 zeta2 = _mm256_mul_ps(beta2,rsq12);
597 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
598 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
599 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
600 felec = _mm256_mul_ps(qq12,felec);
601 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
602 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
603 velec = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
604 velec = _mm256_mul_ps(qq12,velec);
606 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velec = _mm256_and_ps(velec,cutoff_mask);
610 velecsum = _mm256_add_ps(velecsum,velec);
614 fscal = _mm256_and_ps(fscal,cutoff_mask);
616 /* Calculate temporary vectorial force */
617 tx = _mm256_mul_ps(fscal,dx12);
618 ty = _mm256_mul_ps(fscal,dy12);
619 tz = _mm256_mul_ps(fscal,dz12);
621 /* Update vectorial force */
622 fix1 = _mm256_add_ps(fix1,tx);
623 fiy1 = _mm256_add_ps(fiy1,ty);
624 fiz1 = _mm256_add_ps(fiz1,tz);
626 fjx2 = _mm256_add_ps(fjx2,tx);
627 fjy2 = _mm256_add_ps(fjy2,ty);
628 fjz2 = _mm256_add_ps(fjz2,tz);
632 /**************************
633 * CALCULATE INTERACTIONS *
634 **************************/
636 if (gmx_mm256_any_lt(rsq20,rcutoff2))
639 r20 = _mm256_mul_ps(rsq20,rinv20);
641 /* EWALD ELECTROSTATICS */
643 /* Analytical PME correction */
644 zeta2 = _mm256_mul_ps(beta2,rsq20);
645 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
646 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
647 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
648 felec = _mm256_mul_ps(qq20,felec);
649 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
650 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
651 velec = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
652 velec = _mm256_mul_ps(qq20,velec);
654 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
656 /* Update potential sum for this i atom from the interaction with this j atom. */
657 velec = _mm256_and_ps(velec,cutoff_mask);
658 velecsum = _mm256_add_ps(velecsum,velec);
662 fscal = _mm256_and_ps(fscal,cutoff_mask);
664 /* Calculate temporary vectorial force */
665 tx = _mm256_mul_ps(fscal,dx20);
666 ty = _mm256_mul_ps(fscal,dy20);
667 tz = _mm256_mul_ps(fscal,dz20);
669 /* Update vectorial force */
670 fix2 = _mm256_add_ps(fix2,tx);
671 fiy2 = _mm256_add_ps(fiy2,ty);
672 fiz2 = _mm256_add_ps(fiz2,tz);
674 fjx0 = _mm256_add_ps(fjx0,tx);
675 fjy0 = _mm256_add_ps(fjy0,ty);
676 fjz0 = _mm256_add_ps(fjz0,tz);
680 /**************************
681 * CALCULATE INTERACTIONS *
682 **************************/
684 if (gmx_mm256_any_lt(rsq21,rcutoff2))
687 r21 = _mm256_mul_ps(rsq21,rinv21);
689 /* EWALD ELECTROSTATICS */
691 /* Analytical PME correction */
692 zeta2 = _mm256_mul_ps(beta2,rsq21);
693 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
694 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
695 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
696 felec = _mm256_mul_ps(qq21,felec);
697 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
698 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
699 velec = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
700 velec = _mm256_mul_ps(qq21,velec);
702 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
704 /* Update potential sum for this i atom from the interaction with this j atom. */
705 velec = _mm256_and_ps(velec,cutoff_mask);
706 velecsum = _mm256_add_ps(velecsum,velec);
710 fscal = _mm256_and_ps(fscal,cutoff_mask);
712 /* Calculate temporary vectorial force */
713 tx = _mm256_mul_ps(fscal,dx21);
714 ty = _mm256_mul_ps(fscal,dy21);
715 tz = _mm256_mul_ps(fscal,dz21);
717 /* Update vectorial force */
718 fix2 = _mm256_add_ps(fix2,tx);
719 fiy2 = _mm256_add_ps(fiy2,ty);
720 fiz2 = _mm256_add_ps(fiz2,tz);
722 fjx1 = _mm256_add_ps(fjx1,tx);
723 fjy1 = _mm256_add_ps(fjy1,ty);
724 fjz1 = _mm256_add_ps(fjz1,tz);
728 /**************************
729 * CALCULATE INTERACTIONS *
730 **************************/
732 if (gmx_mm256_any_lt(rsq22,rcutoff2))
735 r22 = _mm256_mul_ps(rsq22,rinv22);
737 /* EWALD ELECTROSTATICS */
739 /* Analytical PME correction */
740 zeta2 = _mm256_mul_ps(beta2,rsq22);
741 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
742 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
743 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
744 felec = _mm256_mul_ps(qq22,felec);
745 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
746 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
747 velec = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
748 velec = _mm256_mul_ps(qq22,velec);
750 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
752 /* Update potential sum for this i atom from the interaction with this j atom. */
753 velec = _mm256_and_ps(velec,cutoff_mask);
754 velecsum = _mm256_add_ps(velecsum,velec);
758 fscal = _mm256_and_ps(fscal,cutoff_mask);
760 /* Calculate temporary vectorial force */
761 tx = _mm256_mul_ps(fscal,dx22);
762 ty = _mm256_mul_ps(fscal,dy22);
763 tz = _mm256_mul_ps(fscal,dz22);
765 /* Update vectorial force */
766 fix2 = _mm256_add_ps(fix2,tx);
767 fiy2 = _mm256_add_ps(fiy2,ty);
768 fiz2 = _mm256_add_ps(fiz2,tz);
770 fjx2 = _mm256_add_ps(fjx2,tx);
771 fjy2 = _mm256_add_ps(fjy2,ty);
772 fjz2 = _mm256_add_ps(fjz2,tz);
776 fjptrA = f+j_coord_offsetA;
777 fjptrB = f+j_coord_offsetB;
778 fjptrC = f+j_coord_offsetC;
779 fjptrD = f+j_coord_offsetD;
780 fjptrE = f+j_coord_offsetE;
781 fjptrF = f+j_coord_offsetF;
782 fjptrG = f+j_coord_offsetG;
783 fjptrH = f+j_coord_offsetH;
785 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
786 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
788 /* Inner loop uses 999 flops */
794 /* Get j neighbor index, and coordinate index */
795 jnrlistA = jjnr[jidx];
796 jnrlistB = jjnr[jidx+1];
797 jnrlistC = jjnr[jidx+2];
798 jnrlistD = jjnr[jidx+3];
799 jnrlistE = jjnr[jidx+4];
800 jnrlistF = jjnr[jidx+5];
801 jnrlistG = jjnr[jidx+6];
802 jnrlistH = jjnr[jidx+7];
803 /* Sign of each element will be negative for non-real atoms.
804 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
805 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
807 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
808 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
810 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
811 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
812 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
813 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
814 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
815 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
816 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
817 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
818 j_coord_offsetA = DIM*jnrA;
819 j_coord_offsetB = DIM*jnrB;
820 j_coord_offsetC = DIM*jnrC;
821 j_coord_offsetD = DIM*jnrD;
822 j_coord_offsetE = DIM*jnrE;
823 j_coord_offsetF = DIM*jnrF;
824 j_coord_offsetG = DIM*jnrG;
825 j_coord_offsetH = DIM*jnrH;
827 /* load j atom coordinates */
828 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
829 x+j_coord_offsetC,x+j_coord_offsetD,
830 x+j_coord_offsetE,x+j_coord_offsetF,
831 x+j_coord_offsetG,x+j_coord_offsetH,
832 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
834 /* Calculate displacement vector */
835 dx00 = _mm256_sub_ps(ix0,jx0);
836 dy00 = _mm256_sub_ps(iy0,jy0);
837 dz00 = _mm256_sub_ps(iz0,jz0);
838 dx01 = _mm256_sub_ps(ix0,jx1);
839 dy01 = _mm256_sub_ps(iy0,jy1);
840 dz01 = _mm256_sub_ps(iz0,jz1);
841 dx02 = _mm256_sub_ps(ix0,jx2);
842 dy02 = _mm256_sub_ps(iy0,jy2);
843 dz02 = _mm256_sub_ps(iz0,jz2);
844 dx10 = _mm256_sub_ps(ix1,jx0);
845 dy10 = _mm256_sub_ps(iy1,jy0);
846 dz10 = _mm256_sub_ps(iz1,jz0);
847 dx11 = _mm256_sub_ps(ix1,jx1);
848 dy11 = _mm256_sub_ps(iy1,jy1);
849 dz11 = _mm256_sub_ps(iz1,jz1);
850 dx12 = _mm256_sub_ps(ix1,jx2);
851 dy12 = _mm256_sub_ps(iy1,jy2);
852 dz12 = _mm256_sub_ps(iz1,jz2);
853 dx20 = _mm256_sub_ps(ix2,jx0);
854 dy20 = _mm256_sub_ps(iy2,jy0);
855 dz20 = _mm256_sub_ps(iz2,jz0);
856 dx21 = _mm256_sub_ps(ix2,jx1);
857 dy21 = _mm256_sub_ps(iy2,jy1);
858 dz21 = _mm256_sub_ps(iz2,jz1);
859 dx22 = _mm256_sub_ps(ix2,jx2);
860 dy22 = _mm256_sub_ps(iy2,jy2);
861 dz22 = _mm256_sub_ps(iz2,jz2);
863 /* Calculate squared distance and things based on it */
864 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
865 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
866 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
867 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
868 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
869 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
870 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
871 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
872 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
874 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
875 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
876 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
877 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
878 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
879 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
880 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
881 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
882 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
884 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
885 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
886 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
887 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
888 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
889 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
890 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
891 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
892 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
894 fjx0 = _mm256_setzero_ps();
895 fjy0 = _mm256_setzero_ps();
896 fjz0 = _mm256_setzero_ps();
897 fjx1 = _mm256_setzero_ps();
898 fjy1 = _mm256_setzero_ps();
899 fjz1 = _mm256_setzero_ps();
900 fjx2 = _mm256_setzero_ps();
901 fjy2 = _mm256_setzero_ps();
902 fjz2 = _mm256_setzero_ps();
904 /**************************
905 * CALCULATE INTERACTIONS *
906 **************************/
908 if (gmx_mm256_any_lt(rsq00,rcutoff2))
911 r00 = _mm256_mul_ps(rsq00,rinv00);
912 r00 = _mm256_andnot_ps(dummy_mask,r00);
914 /* EWALD ELECTROSTATICS */
916 /* Analytical PME correction */
917 zeta2 = _mm256_mul_ps(beta2,rsq00);
918 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
919 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
920 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
921 felec = _mm256_mul_ps(qq00,felec);
922 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
923 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
924 velec = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
925 velec = _mm256_mul_ps(qq00,velec);
927 /* LENNARD-JONES DISPERSION/REPULSION */
929 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
930 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
931 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
932 vvdw = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
933 _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
934 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
936 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
938 /* Update potential sum for this i atom from the interaction with this j atom. */
939 velec = _mm256_and_ps(velec,cutoff_mask);
940 velec = _mm256_andnot_ps(dummy_mask,velec);
941 velecsum = _mm256_add_ps(velecsum,velec);
942 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
943 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
944 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
946 fscal = _mm256_add_ps(felec,fvdw);
948 fscal = _mm256_and_ps(fscal,cutoff_mask);
950 fscal = _mm256_andnot_ps(dummy_mask,fscal);
952 /* Calculate temporary vectorial force */
953 tx = _mm256_mul_ps(fscal,dx00);
954 ty = _mm256_mul_ps(fscal,dy00);
955 tz = _mm256_mul_ps(fscal,dz00);
957 /* Update vectorial force */
958 fix0 = _mm256_add_ps(fix0,tx);
959 fiy0 = _mm256_add_ps(fiy0,ty);
960 fiz0 = _mm256_add_ps(fiz0,tz);
962 fjx0 = _mm256_add_ps(fjx0,tx);
963 fjy0 = _mm256_add_ps(fjy0,ty);
964 fjz0 = _mm256_add_ps(fjz0,tz);
968 /**************************
969 * CALCULATE INTERACTIONS *
970 **************************/
972 if (gmx_mm256_any_lt(rsq01,rcutoff2))
975 r01 = _mm256_mul_ps(rsq01,rinv01);
976 r01 = _mm256_andnot_ps(dummy_mask,r01);
978 /* EWALD ELECTROSTATICS */
980 /* Analytical PME correction */
981 zeta2 = _mm256_mul_ps(beta2,rsq01);
982 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
983 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
984 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
985 felec = _mm256_mul_ps(qq01,felec);
986 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
987 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
988 velec = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
989 velec = _mm256_mul_ps(qq01,velec);
991 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
993 /* Update potential sum for this i atom from the interaction with this j atom. */
994 velec = _mm256_and_ps(velec,cutoff_mask);
995 velec = _mm256_andnot_ps(dummy_mask,velec);
996 velecsum = _mm256_add_ps(velecsum,velec);
1000 fscal = _mm256_and_ps(fscal,cutoff_mask);
1002 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1004 /* Calculate temporary vectorial force */
1005 tx = _mm256_mul_ps(fscal,dx01);
1006 ty = _mm256_mul_ps(fscal,dy01);
1007 tz = _mm256_mul_ps(fscal,dz01);
1009 /* Update vectorial force */
1010 fix0 = _mm256_add_ps(fix0,tx);
1011 fiy0 = _mm256_add_ps(fiy0,ty);
1012 fiz0 = _mm256_add_ps(fiz0,tz);
1014 fjx1 = _mm256_add_ps(fjx1,tx);
1015 fjy1 = _mm256_add_ps(fjy1,ty);
1016 fjz1 = _mm256_add_ps(fjz1,tz);
1020 /**************************
1021 * CALCULATE INTERACTIONS *
1022 **************************/
1024 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1027 r02 = _mm256_mul_ps(rsq02,rinv02);
1028 r02 = _mm256_andnot_ps(dummy_mask,r02);
1030 /* EWALD ELECTROSTATICS */
1032 /* Analytical PME correction */
1033 zeta2 = _mm256_mul_ps(beta2,rsq02);
1034 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
1035 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1036 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1037 felec = _mm256_mul_ps(qq02,felec);
1038 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1039 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1040 velec = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
1041 velec = _mm256_mul_ps(qq02,velec);
1043 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1045 /* Update potential sum for this i atom from the interaction with this j atom. */
1046 velec = _mm256_and_ps(velec,cutoff_mask);
1047 velec = _mm256_andnot_ps(dummy_mask,velec);
1048 velecsum = _mm256_add_ps(velecsum,velec);
1052 fscal = _mm256_and_ps(fscal,cutoff_mask);
1054 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1056 /* Calculate temporary vectorial force */
1057 tx = _mm256_mul_ps(fscal,dx02);
1058 ty = _mm256_mul_ps(fscal,dy02);
1059 tz = _mm256_mul_ps(fscal,dz02);
1061 /* Update vectorial force */
1062 fix0 = _mm256_add_ps(fix0,tx);
1063 fiy0 = _mm256_add_ps(fiy0,ty);
1064 fiz0 = _mm256_add_ps(fiz0,tz);
1066 fjx2 = _mm256_add_ps(fjx2,tx);
1067 fjy2 = _mm256_add_ps(fjy2,ty);
1068 fjz2 = _mm256_add_ps(fjz2,tz);
1072 /**************************
1073 * CALCULATE INTERACTIONS *
1074 **************************/
1076 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1079 r10 = _mm256_mul_ps(rsq10,rinv10);
1080 r10 = _mm256_andnot_ps(dummy_mask,r10);
1082 /* EWALD ELECTROSTATICS */
1084 /* Analytical PME correction */
1085 zeta2 = _mm256_mul_ps(beta2,rsq10);
1086 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
1087 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1088 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1089 felec = _mm256_mul_ps(qq10,felec);
1090 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1091 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1092 velec = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
1093 velec = _mm256_mul_ps(qq10,velec);
1095 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1097 /* Update potential sum for this i atom from the interaction with this j atom. */
1098 velec = _mm256_and_ps(velec,cutoff_mask);
1099 velec = _mm256_andnot_ps(dummy_mask,velec);
1100 velecsum = _mm256_add_ps(velecsum,velec);
1104 fscal = _mm256_and_ps(fscal,cutoff_mask);
1106 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1108 /* Calculate temporary vectorial force */
1109 tx = _mm256_mul_ps(fscal,dx10);
1110 ty = _mm256_mul_ps(fscal,dy10);
1111 tz = _mm256_mul_ps(fscal,dz10);
1113 /* Update vectorial force */
1114 fix1 = _mm256_add_ps(fix1,tx);
1115 fiy1 = _mm256_add_ps(fiy1,ty);
1116 fiz1 = _mm256_add_ps(fiz1,tz);
1118 fjx0 = _mm256_add_ps(fjx0,tx);
1119 fjy0 = _mm256_add_ps(fjy0,ty);
1120 fjz0 = _mm256_add_ps(fjz0,tz);
1124 /**************************
1125 * CALCULATE INTERACTIONS *
1126 **************************/
1128 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1131 r11 = _mm256_mul_ps(rsq11,rinv11);
1132 r11 = _mm256_andnot_ps(dummy_mask,r11);
1134 /* EWALD ELECTROSTATICS */
1136 /* Analytical PME correction */
1137 zeta2 = _mm256_mul_ps(beta2,rsq11);
1138 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1139 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1140 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1141 felec = _mm256_mul_ps(qq11,felec);
1142 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1143 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1144 velec = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
1145 velec = _mm256_mul_ps(qq11,velec);
1147 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1149 /* Update potential sum for this i atom from the interaction with this j atom. */
1150 velec = _mm256_and_ps(velec,cutoff_mask);
1151 velec = _mm256_andnot_ps(dummy_mask,velec);
1152 velecsum = _mm256_add_ps(velecsum,velec);
1156 fscal = _mm256_and_ps(fscal,cutoff_mask);
1158 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1160 /* Calculate temporary vectorial force */
1161 tx = _mm256_mul_ps(fscal,dx11);
1162 ty = _mm256_mul_ps(fscal,dy11);
1163 tz = _mm256_mul_ps(fscal,dz11);
1165 /* Update vectorial force */
1166 fix1 = _mm256_add_ps(fix1,tx);
1167 fiy1 = _mm256_add_ps(fiy1,ty);
1168 fiz1 = _mm256_add_ps(fiz1,tz);
1170 fjx1 = _mm256_add_ps(fjx1,tx);
1171 fjy1 = _mm256_add_ps(fjy1,ty);
1172 fjz1 = _mm256_add_ps(fjz1,tz);
1176 /**************************
1177 * CALCULATE INTERACTIONS *
1178 **************************/
1180 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1183 r12 = _mm256_mul_ps(rsq12,rinv12);
1184 r12 = _mm256_andnot_ps(dummy_mask,r12);
1186 /* EWALD ELECTROSTATICS */
1188 /* Analytical PME correction */
1189 zeta2 = _mm256_mul_ps(beta2,rsq12);
1190 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1191 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1192 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1193 felec = _mm256_mul_ps(qq12,felec);
1194 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1195 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1196 velec = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
1197 velec = _mm256_mul_ps(qq12,velec);
1199 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1201 /* Update potential sum for this i atom from the interaction with this j atom. */
1202 velec = _mm256_and_ps(velec,cutoff_mask);
1203 velec = _mm256_andnot_ps(dummy_mask,velec);
1204 velecsum = _mm256_add_ps(velecsum,velec);
1208 fscal = _mm256_and_ps(fscal,cutoff_mask);
1210 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1212 /* Calculate temporary vectorial force */
1213 tx = _mm256_mul_ps(fscal,dx12);
1214 ty = _mm256_mul_ps(fscal,dy12);
1215 tz = _mm256_mul_ps(fscal,dz12);
1217 /* Update vectorial force */
1218 fix1 = _mm256_add_ps(fix1,tx);
1219 fiy1 = _mm256_add_ps(fiy1,ty);
1220 fiz1 = _mm256_add_ps(fiz1,tz);
1222 fjx2 = _mm256_add_ps(fjx2,tx);
1223 fjy2 = _mm256_add_ps(fjy2,ty);
1224 fjz2 = _mm256_add_ps(fjz2,tz);
1228 /**************************
1229 * CALCULATE INTERACTIONS *
1230 **************************/
1232 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1235 r20 = _mm256_mul_ps(rsq20,rinv20);
1236 r20 = _mm256_andnot_ps(dummy_mask,r20);
1238 /* EWALD ELECTROSTATICS */
1240 /* Analytical PME correction */
1241 zeta2 = _mm256_mul_ps(beta2,rsq20);
1242 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1243 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1244 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1245 felec = _mm256_mul_ps(qq20,felec);
1246 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1247 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1248 velec = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
1249 velec = _mm256_mul_ps(qq20,velec);
1251 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1253 /* Update potential sum for this i atom from the interaction with this j atom. */
1254 velec = _mm256_and_ps(velec,cutoff_mask);
1255 velec = _mm256_andnot_ps(dummy_mask,velec);
1256 velecsum = _mm256_add_ps(velecsum,velec);
1260 fscal = _mm256_and_ps(fscal,cutoff_mask);
1262 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1264 /* Calculate temporary vectorial force */
1265 tx = _mm256_mul_ps(fscal,dx20);
1266 ty = _mm256_mul_ps(fscal,dy20);
1267 tz = _mm256_mul_ps(fscal,dz20);
1269 /* Update vectorial force */
1270 fix2 = _mm256_add_ps(fix2,tx);
1271 fiy2 = _mm256_add_ps(fiy2,ty);
1272 fiz2 = _mm256_add_ps(fiz2,tz);
1274 fjx0 = _mm256_add_ps(fjx0,tx);
1275 fjy0 = _mm256_add_ps(fjy0,ty);
1276 fjz0 = _mm256_add_ps(fjz0,tz);
1280 /**************************
1281 * CALCULATE INTERACTIONS *
1282 **************************/
1284 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1287 r21 = _mm256_mul_ps(rsq21,rinv21);
1288 r21 = _mm256_andnot_ps(dummy_mask,r21);
1290 /* EWALD ELECTROSTATICS */
1292 /* Analytical PME correction */
1293 zeta2 = _mm256_mul_ps(beta2,rsq21);
1294 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1295 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1296 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1297 felec = _mm256_mul_ps(qq21,felec);
1298 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1299 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1300 velec = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
1301 velec = _mm256_mul_ps(qq21,velec);
1303 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1305 /* Update potential sum for this i atom from the interaction with this j atom. */
1306 velec = _mm256_and_ps(velec,cutoff_mask);
1307 velec = _mm256_andnot_ps(dummy_mask,velec);
1308 velecsum = _mm256_add_ps(velecsum,velec);
1312 fscal = _mm256_and_ps(fscal,cutoff_mask);
1314 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1316 /* Calculate temporary vectorial force */
1317 tx = _mm256_mul_ps(fscal,dx21);
1318 ty = _mm256_mul_ps(fscal,dy21);
1319 tz = _mm256_mul_ps(fscal,dz21);
1321 /* Update vectorial force */
1322 fix2 = _mm256_add_ps(fix2,tx);
1323 fiy2 = _mm256_add_ps(fiy2,ty);
1324 fiz2 = _mm256_add_ps(fiz2,tz);
1326 fjx1 = _mm256_add_ps(fjx1,tx);
1327 fjy1 = _mm256_add_ps(fjy1,ty);
1328 fjz1 = _mm256_add_ps(fjz1,tz);
1332 /**************************
1333 * CALCULATE INTERACTIONS *
1334 **************************/
1336 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1339 r22 = _mm256_mul_ps(rsq22,rinv22);
1340 r22 = _mm256_andnot_ps(dummy_mask,r22);
1342 /* EWALD ELECTROSTATICS */
1344 /* Analytical PME correction */
1345 zeta2 = _mm256_mul_ps(beta2,rsq22);
1346 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1347 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1348 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1349 felec = _mm256_mul_ps(qq22,felec);
1350 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1351 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1352 velec = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
1353 velec = _mm256_mul_ps(qq22,velec);
1355 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1357 /* Update potential sum for this i atom from the interaction with this j atom. */
1358 velec = _mm256_and_ps(velec,cutoff_mask);
1359 velec = _mm256_andnot_ps(dummy_mask,velec);
1360 velecsum = _mm256_add_ps(velecsum,velec);
1364 fscal = _mm256_and_ps(fscal,cutoff_mask);
1366 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1368 /* Calculate temporary vectorial force */
1369 tx = _mm256_mul_ps(fscal,dx22);
1370 ty = _mm256_mul_ps(fscal,dy22);
1371 tz = _mm256_mul_ps(fscal,dz22);
1373 /* Update vectorial force */
1374 fix2 = _mm256_add_ps(fix2,tx);
1375 fiy2 = _mm256_add_ps(fiy2,ty);
1376 fiz2 = _mm256_add_ps(fiz2,tz);
1378 fjx2 = _mm256_add_ps(fjx2,tx);
1379 fjy2 = _mm256_add_ps(fjy2,ty);
1380 fjz2 = _mm256_add_ps(fjz2,tz);
1384 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1385 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1386 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1387 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1388 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1389 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1390 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1391 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1393 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1394 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1396 /* Inner loop uses 1008 flops */
1399 /* End of innermost loop */
1401 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1402 f+i_coord_offset,fshift+i_shift_offset);
1405 /* Update potential energies */
1406 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1407 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1409 /* Increment number of inner iterations */
1410 inneriter += j_index_end - j_index_start;
1412 /* Outer loop uses 20 flops */
1415 /* Increment number of outer iterations */
1418 /* Update outer/inner flops */
1420 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*1008);
1423 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_single
1424 * Electrostatics interaction: Ewald
1425 * VdW interaction: LennardJones
1426 * Geometry: Water3-Water3
1427 * Calculate force/pot: Force
1430 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_single
1431 (t_nblist * gmx_restrict nlist,
1432 rvec * gmx_restrict xx,
1433 rvec * gmx_restrict ff,
1434 t_forcerec * gmx_restrict fr,
1435 t_mdatoms * gmx_restrict mdatoms,
1436 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1437 t_nrnb * gmx_restrict nrnb)
1439 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1440 * just 0 for non-waters.
1441 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1442 * jnr indices corresponding to data put in the four positions in the SIMD register.
1444 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1445 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1446 int jnrA,jnrB,jnrC,jnrD;
1447 int jnrE,jnrF,jnrG,jnrH;
1448 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1449 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1450 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1451 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1452 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1453 real rcutoff_scalar;
1454 real *shiftvec,*fshift,*x,*f;
1455 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1456 real scratch[4*DIM];
1457 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1458 real * vdwioffsetptr0;
1459 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1460 real * vdwioffsetptr1;
1461 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1462 real * vdwioffsetptr2;
1463 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1464 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1465 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1466 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1467 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1468 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1469 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1470 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1471 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1472 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1473 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1474 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1475 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1476 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1477 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1478 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1479 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1482 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1485 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1486 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1488 __m128i ewitab_lo,ewitab_hi;
1489 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1490 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1492 __m256 dummy_mask,cutoff_mask;
1493 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1494 __m256 one = _mm256_set1_ps(1.0);
1495 __m256 two = _mm256_set1_ps(2.0);
1501 jindex = nlist->jindex;
1503 shiftidx = nlist->shift;
1505 shiftvec = fr->shift_vec[0];
1506 fshift = fr->fshift[0];
1507 facel = _mm256_set1_ps(fr->epsfac);
1508 charge = mdatoms->chargeA;
1509 nvdwtype = fr->ntype;
1510 vdwparam = fr->nbfp;
1511 vdwtype = mdatoms->typeA;
1513 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1514 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1515 beta2 = _mm256_mul_ps(beta,beta);
1516 beta3 = _mm256_mul_ps(beta,beta2);
1518 ewtab = fr->ic->tabq_coul_F;
1519 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1520 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1522 /* Setup water-specific parameters */
1523 inr = nlist->iinr[0];
1524 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1525 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1526 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1527 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1529 jq0 = _mm256_set1_ps(charge[inr+0]);
1530 jq1 = _mm256_set1_ps(charge[inr+1]);
1531 jq2 = _mm256_set1_ps(charge[inr+2]);
1532 vdwjidx0A = 2*vdwtype[inr+0];
1533 qq00 = _mm256_mul_ps(iq0,jq0);
1534 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1535 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1536 qq01 = _mm256_mul_ps(iq0,jq1);
1537 qq02 = _mm256_mul_ps(iq0,jq2);
1538 qq10 = _mm256_mul_ps(iq1,jq0);
1539 qq11 = _mm256_mul_ps(iq1,jq1);
1540 qq12 = _mm256_mul_ps(iq1,jq2);
1541 qq20 = _mm256_mul_ps(iq2,jq0);
1542 qq21 = _mm256_mul_ps(iq2,jq1);
1543 qq22 = _mm256_mul_ps(iq2,jq2);
1545 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1546 rcutoff_scalar = fr->rcoulomb;
1547 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1548 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1550 sh_vdw_invrcut6 = _mm256_set1_ps(fr->ic->sh_invrc6);
1551 rvdw = _mm256_set1_ps(fr->rvdw);
1553 /* Avoid stupid compiler warnings */
1554 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1555 j_coord_offsetA = 0;
1556 j_coord_offsetB = 0;
1557 j_coord_offsetC = 0;
1558 j_coord_offsetD = 0;
1559 j_coord_offsetE = 0;
1560 j_coord_offsetF = 0;
1561 j_coord_offsetG = 0;
1562 j_coord_offsetH = 0;
1567 for(iidx=0;iidx<4*DIM;iidx++)
1569 scratch[iidx] = 0.0;
1572 /* Start outer loop over neighborlists */
1573 for(iidx=0; iidx<nri; iidx++)
1575 /* Load shift vector for this list */
1576 i_shift_offset = DIM*shiftidx[iidx];
1578 /* Load limits for loop over neighbors */
1579 j_index_start = jindex[iidx];
1580 j_index_end = jindex[iidx+1];
1582 /* Get outer coordinate index */
1584 i_coord_offset = DIM*inr;
1586 /* Load i particle coords and add shift vector */
1587 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1588 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1590 fix0 = _mm256_setzero_ps();
1591 fiy0 = _mm256_setzero_ps();
1592 fiz0 = _mm256_setzero_ps();
1593 fix1 = _mm256_setzero_ps();
1594 fiy1 = _mm256_setzero_ps();
1595 fiz1 = _mm256_setzero_ps();
1596 fix2 = _mm256_setzero_ps();
1597 fiy2 = _mm256_setzero_ps();
1598 fiz2 = _mm256_setzero_ps();
1600 /* Start inner kernel loop */
1601 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1604 /* Get j neighbor index, and coordinate index */
1606 jnrB = jjnr[jidx+1];
1607 jnrC = jjnr[jidx+2];
1608 jnrD = jjnr[jidx+3];
1609 jnrE = jjnr[jidx+4];
1610 jnrF = jjnr[jidx+5];
1611 jnrG = jjnr[jidx+6];
1612 jnrH = jjnr[jidx+7];
1613 j_coord_offsetA = DIM*jnrA;
1614 j_coord_offsetB = DIM*jnrB;
1615 j_coord_offsetC = DIM*jnrC;
1616 j_coord_offsetD = DIM*jnrD;
1617 j_coord_offsetE = DIM*jnrE;
1618 j_coord_offsetF = DIM*jnrF;
1619 j_coord_offsetG = DIM*jnrG;
1620 j_coord_offsetH = DIM*jnrH;
1622 /* load j atom coordinates */
1623 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1624 x+j_coord_offsetC,x+j_coord_offsetD,
1625 x+j_coord_offsetE,x+j_coord_offsetF,
1626 x+j_coord_offsetG,x+j_coord_offsetH,
1627 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1629 /* Calculate displacement vector */
1630 dx00 = _mm256_sub_ps(ix0,jx0);
1631 dy00 = _mm256_sub_ps(iy0,jy0);
1632 dz00 = _mm256_sub_ps(iz0,jz0);
1633 dx01 = _mm256_sub_ps(ix0,jx1);
1634 dy01 = _mm256_sub_ps(iy0,jy1);
1635 dz01 = _mm256_sub_ps(iz0,jz1);
1636 dx02 = _mm256_sub_ps(ix0,jx2);
1637 dy02 = _mm256_sub_ps(iy0,jy2);
1638 dz02 = _mm256_sub_ps(iz0,jz2);
1639 dx10 = _mm256_sub_ps(ix1,jx0);
1640 dy10 = _mm256_sub_ps(iy1,jy0);
1641 dz10 = _mm256_sub_ps(iz1,jz0);
1642 dx11 = _mm256_sub_ps(ix1,jx1);
1643 dy11 = _mm256_sub_ps(iy1,jy1);
1644 dz11 = _mm256_sub_ps(iz1,jz1);
1645 dx12 = _mm256_sub_ps(ix1,jx2);
1646 dy12 = _mm256_sub_ps(iy1,jy2);
1647 dz12 = _mm256_sub_ps(iz1,jz2);
1648 dx20 = _mm256_sub_ps(ix2,jx0);
1649 dy20 = _mm256_sub_ps(iy2,jy0);
1650 dz20 = _mm256_sub_ps(iz2,jz0);
1651 dx21 = _mm256_sub_ps(ix2,jx1);
1652 dy21 = _mm256_sub_ps(iy2,jy1);
1653 dz21 = _mm256_sub_ps(iz2,jz1);
1654 dx22 = _mm256_sub_ps(ix2,jx2);
1655 dy22 = _mm256_sub_ps(iy2,jy2);
1656 dz22 = _mm256_sub_ps(iz2,jz2);
1658 /* Calculate squared distance and things based on it */
1659 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1660 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1661 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1662 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1663 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1664 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1665 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1666 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1667 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1669 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1670 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1671 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1672 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1673 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1674 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1675 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1676 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1677 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1679 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1680 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1681 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1682 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1683 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1684 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1685 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1686 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1687 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1689 fjx0 = _mm256_setzero_ps();
1690 fjy0 = _mm256_setzero_ps();
1691 fjz0 = _mm256_setzero_ps();
1692 fjx1 = _mm256_setzero_ps();
1693 fjy1 = _mm256_setzero_ps();
1694 fjz1 = _mm256_setzero_ps();
1695 fjx2 = _mm256_setzero_ps();
1696 fjy2 = _mm256_setzero_ps();
1697 fjz2 = _mm256_setzero_ps();
1699 /**************************
1700 * CALCULATE INTERACTIONS *
1701 **************************/
1703 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1706 r00 = _mm256_mul_ps(rsq00,rinv00);
1708 /* EWALD ELECTROSTATICS */
1710 /* Analytical PME correction */
1711 zeta2 = _mm256_mul_ps(beta2,rsq00);
1712 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
1713 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1714 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1715 felec = _mm256_mul_ps(qq00,felec);
1717 /* LENNARD-JONES DISPERSION/REPULSION */
1719 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1720 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1722 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1724 fscal = _mm256_add_ps(felec,fvdw);
1726 fscal = _mm256_and_ps(fscal,cutoff_mask);
1728 /* Calculate temporary vectorial force */
1729 tx = _mm256_mul_ps(fscal,dx00);
1730 ty = _mm256_mul_ps(fscal,dy00);
1731 tz = _mm256_mul_ps(fscal,dz00);
1733 /* Update vectorial force */
1734 fix0 = _mm256_add_ps(fix0,tx);
1735 fiy0 = _mm256_add_ps(fiy0,ty);
1736 fiz0 = _mm256_add_ps(fiz0,tz);
1738 fjx0 = _mm256_add_ps(fjx0,tx);
1739 fjy0 = _mm256_add_ps(fjy0,ty);
1740 fjz0 = _mm256_add_ps(fjz0,tz);
1744 /**************************
1745 * CALCULATE INTERACTIONS *
1746 **************************/
1748 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1751 r01 = _mm256_mul_ps(rsq01,rinv01);
1753 /* EWALD ELECTROSTATICS */
1755 /* Analytical PME correction */
1756 zeta2 = _mm256_mul_ps(beta2,rsq01);
1757 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
1758 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1759 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1760 felec = _mm256_mul_ps(qq01,felec);
1762 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1766 fscal = _mm256_and_ps(fscal,cutoff_mask);
1768 /* Calculate temporary vectorial force */
1769 tx = _mm256_mul_ps(fscal,dx01);
1770 ty = _mm256_mul_ps(fscal,dy01);
1771 tz = _mm256_mul_ps(fscal,dz01);
1773 /* Update vectorial force */
1774 fix0 = _mm256_add_ps(fix0,tx);
1775 fiy0 = _mm256_add_ps(fiy0,ty);
1776 fiz0 = _mm256_add_ps(fiz0,tz);
1778 fjx1 = _mm256_add_ps(fjx1,tx);
1779 fjy1 = _mm256_add_ps(fjy1,ty);
1780 fjz1 = _mm256_add_ps(fjz1,tz);
1784 /**************************
1785 * CALCULATE INTERACTIONS *
1786 **************************/
1788 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1791 r02 = _mm256_mul_ps(rsq02,rinv02);
1793 /* EWALD ELECTROSTATICS */
1795 /* Analytical PME correction */
1796 zeta2 = _mm256_mul_ps(beta2,rsq02);
1797 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
1798 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1799 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1800 felec = _mm256_mul_ps(qq02,felec);
1802 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1806 fscal = _mm256_and_ps(fscal,cutoff_mask);
1808 /* Calculate temporary vectorial force */
1809 tx = _mm256_mul_ps(fscal,dx02);
1810 ty = _mm256_mul_ps(fscal,dy02);
1811 tz = _mm256_mul_ps(fscal,dz02);
1813 /* Update vectorial force */
1814 fix0 = _mm256_add_ps(fix0,tx);
1815 fiy0 = _mm256_add_ps(fiy0,ty);
1816 fiz0 = _mm256_add_ps(fiz0,tz);
1818 fjx2 = _mm256_add_ps(fjx2,tx);
1819 fjy2 = _mm256_add_ps(fjy2,ty);
1820 fjz2 = _mm256_add_ps(fjz2,tz);
1824 /**************************
1825 * CALCULATE INTERACTIONS *
1826 **************************/
1828 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1831 r10 = _mm256_mul_ps(rsq10,rinv10);
1833 /* EWALD ELECTROSTATICS */
1835 /* Analytical PME correction */
1836 zeta2 = _mm256_mul_ps(beta2,rsq10);
1837 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
1838 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1839 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1840 felec = _mm256_mul_ps(qq10,felec);
1842 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1846 fscal = _mm256_and_ps(fscal,cutoff_mask);
1848 /* Calculate temporary vectorial force */
1849 tx = _mm256_mul_ps(fscal,dx10);
1850 ty = _mm256_mul_ps(fscal,dy10);
1851 tz = _mm256_mul_ps(fscal,dz10);
1853 /* Update vectorial force */
1854 fix1 = _mm256_add_ps(fix1,tx);
1855 fiy1 = _mm256_add_ps(fiy1,ty);
1856 fiz1 = _mm256_add_ps(fiz1,tz);
1858 fjx0 = _mm256_add_ps(fjx0,tx);
1859 fjy0 = _mm256_add_ps(fjy0,ty);
1860 fjz0 = _mm256_add_ps(fjz0,tz);
1864 /**************************
1865 * CALCULATE INTERACTIONS *
1866 **************************/
1868 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1871 r11 = _mm256_mul_ps(rsq11,rinv11);
1873 /* EWALD ELECTROSTATICS */
1875 /* Analytical PME correction */
1876 zeta2 = _mm256_mul_ps(beta2,rsq11);
1877 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1878 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1879 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1880 felec = _mm256_mul_ps(qq11,felec);
1882 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1886 fscal = _mm256_and_ps(fscal,cutoff_mask);
1888 /* Calculate temporary vectorial force */
1889 tx = _mm256_mul_ps(fscal,dx11);
1890 ty = _mm256_mul_ps(fscal,dy11);
1891 tz = _mm256_mul_ps(fscal,dz11);
1893 /* Update vectorial force */
1894 fix1 = _mm256_add_ps(fix1,tx);
1895 fiy1 = _mm256_add_ps(fiy1,ty);
1896 fiz1 = _mm256_add_ps(fiz1,tz);
1898 fjx1 = _mm256_add_ps(fjx1,tx);
1899 fjy1 = _mm256_add_ps(fjy1,ty);
1900 fjz1 = _mm256_add_ps(fjz1,tz);
1904 /**************************
1905 * CALCULATE INTERACTIONS *
1906 **************************/
1908 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1911 r12 = _mm256_mul_ps(rsq12,rinv12);
1913 /* EWALD ELECTROSTATICS */
1915 /* Analytical PME correction */
1916 zeta2 = _mm256_mul_ps(beta2,rsq12);
1917 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1918 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1919 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1920 felec = _mm256_mul_ps(qq12,felec);
1922 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1926 fscal = _mm256_and_ps(fscal,cutoff_mask);
1928 /* Calculate temporary vectorial force */
1929 tx = _mm256_mul_ps(fscal,dx12);
1930 ty = _mm256_mul_ps(fscal,dy12);
1931 tz = _mm256_mul_ps(fscal,dz12);
1933 /* Update vectorial force */
1934 fix1 = _mm256_add_ps(fix1,tx);
1935 fiy1 = _mm256_add_ps(fiy1,ty);
1936 fiz1 = _mm256_add_ps(fiz1,tz);
1938 fjx2 = _mm256_add_ps(fjx2,tx);
1939 fjy2 = _mm256_add_ps(fjy2,ty);
1940 fjz2 = _mm256_add_ps(fjz2,tz);
1944 /**************************
1945 * CALCULATE INTERACTIONS *
1946 **************************/
1948 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1951 r20 = _mm256_mul_ps(rsq20,rinv20);
1953 /* EWALD ELECTROSTATICS */
1955 /* Analytical PME correction */
1956 zeta2 = _mm256_mul_ps(beta2,rsq20);
1957 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1958 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1959 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1960 felec = _mm256_mul_ps(qq20,felec);
1962 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1966 fscal = _mm256_and_ps(fscal,cutoff_mask);
1968 /* Calculate temporary vectorial force */
1969 tx = _mm256_mul_ps(fscal,dx20);
1970 ty = _mm256_mul_ps(fscal,dy20);
1971 tz = _mm256_mul_ps(fscal,dz20);
1973 /* Update vectorial force */
1974 fix2 = _mm256_add_ps(fix2,tx);
1975 fiy2 = _mm256_add_ps(fiy2,ty);
1976 fiz2 = _mm256_add_ps(fiz2,tz);
1978 fjx0 = _mm256_add_ps(fjx0,tx);
1979 fjy0 = _mm256_add_ps(fjy0,ty);
1980 fjz0 = _mm256_add_ps(fjz0,tz);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1991 r21 = _mm256_mul_ps(rsq21,rinv21);
1993 /* EWALD ELECTROSTATICS */
1995 /* Analytical PME correction */
1996 zeta2 = _mm256_mul_ps(beta2,rsq21);
1997 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1998 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1999 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2000 felec = _mm256_mul_ps(qq21,felec);
2002 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2006 fscal = _mm256_and_ps(fscal,cutoff_mask);
2008 /* Calculate temporary vectorial force */
2009 tx = _mm256_mul_ps(fscal,dx21);
2010 ty = _mm256_mul_ps(fscal,dy21);
2011 tz = _mm256_mul_ps(fscal,dz21);
2013 /* Update vectorial force */
2014 fix2 = _mm256_add_ps(fix2,tx);
2015 fiy2 = _mm256_add_ps(fiy2,ty);
2016 fiz2 = _mm256_add_ps(fiz2,tz);
2018 fjx1 = _mm256_add_ps(fjx1,tx);
2019 fjy1 = _mm256_add_ps(fjy1,ty);
2020 fjz1 = _mm256_add_ps(fjz1,tz);
2024 /**************************
2025 * CALCULATE INTERACTIONS *
2026 **************************/
2028 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2031 r22 = _mm256_mul_ps(rsq22,rinv22);
2033 /* EWALD ELECTROSTATICS */
2035 /* Analytical PME correction */
2036 zeta2 = _mm256_mul_ps(beta2,rsq22);
2037 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2038 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2039 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2040 felec = _mm256_mul_ps(qq22,felec);
2042 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2046 fscal = _mm256_and_ps(fscal,cutoff_mask);
2048 /* Calculate temporary vectorial force */
2049 tx = _mm256_mul_ps(fscal,dx22);
2050 ty = _mm256_mul_ps(fscal,dy22);
2051 tz = _mm256_mul_ps(fscal,dz22);
2053 /* Update vectorial force */
2054 fix2 = _mm256_add_ps(fix2,tx);
2055 fiy2 = _mm256_add_ps(fiy2,ty);
2056 fiz2 = _mm256_add_ps(fiz2,tz);
2058 fjx2 = _mm256_add_ps(fjx2,tx);
2059 fjy2 = _mm256_add_ps(fjy2,ty);
2060 fjz2 = _mm256_add_ps(fjz2,tz);
2064 fjptrA = f+j_coord_offsetA;
2065 fjptrB = f+j_coord_offsetB;
2066 fjptrC = f+j_coord_offsetC;
2067 fjptrD = f+j_coord_offsetD;
2068 fjptrE = f+j_coord_offsetE;
2069 fjptrF = f+j_coord_offsetF;
2070 fjptrG = f+j_coord_offsetG;
2071 fjptrH = f+j_coord_offsetH;
2073 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2074 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2076 /* Inner loop uses 538 flops */
2079 if(jidx<j_index_end)
2082 /* Get j neighbor index, and coordinate index */
2083 jnrlistA = jjnr[jidx];
2084 jnrlistB = jjnr[jidx+1];
2085 jnrlistC = jjnr[jidx+2];
2086 jnrlistD = jjnr[jidx+3];
2087 jnrlistE = jjnr[jidx+4];
2088 jnrlistF = jjnr[jidx+5];
2089 jnrlistG = jjnr[jidx+6];
2090 jnrlistH = jjnr[jidx+7];
2091 /* Sign of each element will be negative for non-real atoms.
2092 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2093 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2095 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2096 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2098 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2099 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2100 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2101 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2102 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2103 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2104 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2105 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2106 j_coord_offsetA = DIM*jnrA;
2107 j_coord_offsetB = DIM*jnrB;
2108 j_coord_offsetC = DIM*jnrC;
2109 j_coord_offsetD = DIM*jnrD;
2110 j_coord_offsetE = DIM*jnrE;
2111 j_coord_offsetF = DIM*jnrF;
2112 j_coord_offsetG = DIM*jnrG;
2113 j_coord_offsetH = DIM*jnrH;
2115 /* load j atom coordinates */
2116 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2117 x+j_coord_offsetC,x+j_coord_offsetD,
2118 x+j_coord_offsetE,x+j_coord_offsetF,
2119 x+j_coord_offsetG,x+j_coord_offsetH,
2120 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2122 /* Calculate displacement vector */
2123 dx00 = _mm256_sub_ps(ix0,jx0);
2124 dy00 = _mm256_sub_ps(iy0,jy0);
2125 dz00 = _mm256_sub_ps(iz0,jz0);
2126 dx01 = _mm256_sub_ps(ix0,jx1);
2127 dy01 = _mm256_sub_ps(iy0,jy1);
2128 dz01 = _mm256_sub_ps(iz0,jz1);
2129 dx02 = _mm256_sub_ps(ix0,jx2);
2130 dy02 = _mm256_sub_ps(iy0,jy2);
2131 dz02 = _mm256_sub_ps(iz0,jz2);
2132 dx10 = _mm256_sub_ps(ix1,jx0);
2133 dy10 = _mm256_sub_ps(iy1,jy0);
2134 dz10 = _mm256_sub_ps(iz1,jz0);
2135 dx11 = _mm256_sub_ps(ix1,jx1);
2136 dy11 = _mm256_sub_ps(iy1,jy1);
2137 dz11 = _mm256_sub_ps(iz1,jz1);
2138 dx12 = _mm256_sub_ps(ix1,jx2);
2139 dy12 = _mm256_sub_ps(iy1,jy2);
2140 dz12 = _mm256_sub_ps(iz1,jz2);
2141 dx20 = _mm256_sub_ps(ix2,jx0);
2142 dy20 = _mm256_sub_ps(iy2,jy0);
2143 dz20 = _mm256_sub_ps(iz2,jz0);
2144 dx21 = _mm256_sub_ps(ix2,jx1);
2145 dy21 = _mm256_sub_ps(iy2,jy1);
2146 dz21 = _mm256_sub_ps(iz2,jz1);
2147 dx22 = _mm256_sub_ps(ix2,jx2);
2148 dy22 = _mm256_sub_ps(iy2,jy2);
2149 dz22 = _mm256_sub_ps(iz2,jz2);
2151 /* Calculate squared distance and things based on it */
2152 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2153 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2154 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2155 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2156 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2157 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2158 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2159 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2160 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2162 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2163 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2164 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2165 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2166 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2167 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2168 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2169 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2170 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2172 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2173 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
2174 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
2175 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
2176 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2177 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2178 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
2179 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2180 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2182 fjx0 = _mm256_setzero_ps();
2183 fjy0 = _mm256_setzero_ps();
2184 fjz0 = _mm256_setzero_ps();
2185 fjx1 = _mm256_setzero_ps();
2186 fjy1 = _mm256_setzero_ps();
2187 fjz1 = _mm256_setzero_ps();
2188 fjx2 = _mm256_setzero_ps();
2189 fjy2 = _mm256_setzero_ps();
2190 fjz2 = _mm256_setzero_ps();
2192 /**************************
2193 * CALCULATE INTERACTIONS *
2194 **************************/
2196 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2199 r00 = _mm256_mul_ps(rsq00,rinv00);
2200 r00 = _mm256_andnot_ps(dummy_mask,r00);
2202 /* EWALD ELECTROSTATICS */
2204 /* Analytical PME correction */
2205 zeta2 = _mm256_mul_ps(beta2,rsq00);
2206 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
2207 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2208 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2209 felec = _mm256_mul_ps(qq00,felec);
2211 /* LENNARD-JONES DISPERSION/REPULSION */
2213 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2214 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2216 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2218 fscal = _mm256_add_ps(felec,fvdw);
2220 fscal = _mm256_and_ps(fscal,cutoff_mask);
2222 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2224 /* Calculate temporary vectorial force */
2225 tx = _mm256_mul_ps(fscal,dx00);
2226 ty = _mm256_mul_ps(fscal,dy00);
2227 tz = _mm256_mul_ps(fscal,dz00);
2229 /* Update vectorial force */
2230 fix0 = _mm256_add_ps(fix0,tx);
2231 fiy0 = _mm256_add_ps(fiy0,ty);
2232 fiz0 = _mm256_add_ps(fiz0,tz);
2234 fjx0 = _mm256_add_ps(fjx0,tx);
2235 fjy0 = _mm256_add_ps(fjy0,ty);
2236 fjz0 = _mm256_add_ps(fjz0,tz);
2240 /**************************
2241 * CALCULATE INTERACTIONS *
2242 **************************/
2244 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2247 r01 = _mm256_mul_ps(rsq01,rinv01);
2248 r01 = _mm256_andnot_ps(dummy_mask,r01);
2250 /* EWALD ELECTROSTATICS */
2252 /* Analytical PME correction */
2253 zeta2 = _mm256_mul_ps(beta2,rsq01);
2254 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
2255 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2256 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2257 felec = _mm256_mul_ps(qq01,felec);
2259 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2263 fscal = _mm256_and_ps(fscal,cutoff_mask);
2265 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2267 /* Calculate temporary vectorial force */
2268 tx = _mm256_mul_ps(fscal,dx01);
2269 ty = _mm256_mul_ps(fscal,dy01);
2270 tz = _mm256_mul_ps(fscal,dz01);
2272 /* Update vectorial force */
2273 fix0 = _mm256_add_ps(fix0,tx);
2274 fiy0 = _mm256_add_ps(fiy0,ty);
2275 fiz0 = _mm256_add_ps(fiz0,tz);
2277 fjx1 = _mm256_add_ps(fjx1,tx);
2278 fjy1 = _mm256_add_ps(fjy1,ty);
2279 fjz1 = _mm256_add_ps(fjz1,tz);
2283 /**************************
2284 * CALCULATE INTERACTIONS *
2285 **************************/
2287 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2290 r02 = _mm256_mul_ps(rsq02,rinv02);
2291 r02 = _mm256_andnot_ps(dummy_mask,r02);
2293 /* EWALD ELECTROSTATICS */
2295 /* Analytical PME correction */
2296 zeta2 = _mm256_mul_ps(beta2,rsq02);
2297 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
2298 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2299 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2300 felec = _mm256_mul_ps(qq02,felec);
2302 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2306 fscal = _mm256_and_ps(fscal,cutoff_mask);
2308 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2310 /* Calculate temporary vectorial force */
2311 tx = _mm256_mul_ps(fscal,dx02);
2312 ty = _mm256_mul_ps(fscal,dy02);
2313 tz = _mm256_mul_ps(fscal,dz02);
2315 /* Update vectorial force */
2316 fix0 = _mm256_add_ps(fix0,tx);
2317 fiy0 = _mm256_add_ps(fiy0,ty);
2318 fiz0 = _mm256_add_ps(fiz0,tz);
2320 fjx2 = _mm256_add_ps(fjx2,tx);
2321 fjy2 = _mm256_add_ps(fjy2,ty);
2322 fjz2 = _mm256_add_ps(fjz2,tz);
2326 /**************************
2327 * CALCULATE INTERACTIONS *
2328 **************************/
2330 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2333 r10 = _mm256_mul_ps(rsq10,rinv10);
2334 r10 = _mm256_andnot_ps(dummy_mask,r10);
2336 /* EWALD ELECTROSTATICS */
2338 /* Analytical PME correction */
2339 zeta2 = _mm256_mul_ps(beta2,rsq10);
2340 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
2341 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2342 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2343 felec = _mm256_mul_ps(qq10,felec);
2345 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2349 fscal = _mm256_and_ps(fscal,cutoff_mask);
2351 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2353 /* Calculate temporary vectorial force */
2354 tx = _mm256_mul_ps(fscal,dx10);
2355 ty = _mm256_mul_ps(fscal,dy10);
2356 tz = _mm256_mul_ps(fscal,dz10);
2358 /* Update vectorial force */
2359 fix1 = _mm256_add_ps(fix1,tx);
2360 fiy1 = _mm256_add_ps(fiy1,ty);
2361 fiz1 = _mm256_add_ps(fiz1,tz);
2363 fjx0 = _mm256_add_ps(fjx0,tx);
2364 fjy0 = _mm256_add_ps(fjy0,ty);
2365 fjz0 = _mm256_add_ps(fjz0,tz);
2369 /**************************
2370 * CALCULATE INTERACTIONS *
2371 **************************/
2373 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2376 r11 = _mm256_mul_ps(rsq11,rinv11);
2377 r11 = _mm256_andnot_ps(dummy_mask,r11);
2379 /* EWALD ELECTROSTATICS */
2381 /* Analytical PME correction */
2382 zeta2 = _mm256_mul_ps(beta2,rsq11);
2383 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2384 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2385 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2386 felec = _mm256_mul_ps(qq11,felec);
2388 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2392 fscal = _mm256_and_ps(fscal,cutoff_mask);
2394 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2396 /* Calculate temporary vectorial force */
2397 tx = _mm256_mul_ps(fscal,dx11);
2398 ty = _mm256_mul_ps(fscal,dy11);
2399 tz = _mm256_mul_ps(fscal,dz11);
2401 /* Update vectorial force */
2402 fix1 = _mm256_add_ps(fix1,tx);
2403 fiy1 = _mm256_add_ps(fiy1,ty);
2404 fiz1 = _mm256_add_ps(fiz1,tz);
2406 fjx1 = _mm256_add_ps(fjx1,tx);
2407 fjy1 = _mm256_add_ps(fjy1,ty);
2408 fjz1 = _mm256_add_ps(fjz1,tz);
2412 /**************************
2413 * CALCULATE INTERACTIONS *
2414 **************************/
2416 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2419 r12 = _mm256_mul_ps(rsq12,rinv12);
2420 r12 = _mm256_andnot_ps(dummy_mask,r12);
2422 /* EWALD ELECTROSTATICS */
2424 /* Analytical PME correction */
2425 zeta2 = _mm256_mul_ps(beta2,rsq12);
2426 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2427 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2428 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2429 felec = _mm256_mul_ps(qq12,felec);
2431 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2435 fscal = _mm256_and_ps(fscal,cutoff_mask);
2437 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2439 /* Calculate temporary vectorial force */
2440 tx = _mm256_mul_ps(fscal,dx12);
2441 ty = _mm256_mul_ps(fscal,dy12);
2442 tz = _mm256_mul_ps(fscal,dz12);
2444 /* Update vectorial force */
2445 fix1 = _mm256_add_ps(fix1,tx);
2446 fiy1 = _mm256_add_ps(fiy1,ty);
2447 fiz1 = _mm256_add_ps(fiz1,tz);
2449 fjx2 = _mm256_add_ps(fjx2,tx);
2450 fjy2 = _mm256_add_ps(fjy2,ty);
2451 fjz2 = _mm256_add_ps(fjz2,tz);
2455 /**************************
2456 * CALCULATE INTERACTIONS *
2457 **************************/
2459 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2462 r20 = _mm256_mul_ps(rsq20,rinv20);
2463 r20 = _mm256_andnot_ps(dummy_mask,r20);
2465 /* EWALD ELECTROSTATICS */
2467 /* Analytical PME correction */
2468 zeta2 = _mm256_mul_ps(beta2,rsq20);
2469 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
2470 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2471 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2472 felec = _mm256_mul_ps(qq20,felec);
2474 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2478 fscal = _mm256_and_ps(fscal,cutoff_mask);
2480 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2482 /* Calculate temporary vectorial force */
2483 tx = _mm256_mul_ps(fscal,dx20);
2484 ty = _mm256_mul_ps(fscal,dy20);
2485 tz = _mm256_mul_ps(fscal,dz20);
2487 /* Update vectorial force */
2488 fix2 = _mm256_add_ps(fix2,tx);
2489 fiy2 = _mm256_add_ps(fiy2,ty);
2490 fiz2 = _mm256_add_ps(fiz2,tz);
2492 fjx0 = _mm256_add_ps(fjx0,tx);
2493 fjy0 = _mm256_add_ps(fjy0,ty);
2494 fjz0 = _mm256_add_ps(fjz0,tz);
2498 /**************************
2499 * CALCULATE INTERACTIONS *
2500 **************************/
2502 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2505 r21 = _mm256_mul_ps(rsq21,rinv21);
2506 r21 = _mm256_andnot_ps(dummy_mask,r21);
2508 /* EWALD ELECTROSTATICS */
2510 /* Analytical PME correction */
2511 zeta2 = _mm256_mul_ps(beta2,rsq21);
2512 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2513 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2514 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2515 felec = _mm256_mul_ps(qq21,felec);
2517 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2521 fscal = _mm256_and_ps(fscal,cutoff_mask);
2523 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2525 /* Calculate temporary vectorial force */
2526 tx = _mm256_mul_ps(fscal,dx21);
2527 ty = _mm256_mul_ps(fscal,dy21);
2528 tz = _mm256_mul_ps(fscal,dz21);
2530 /* Update vectorial force */
2531 fix2 = _mm256_add_ps(fix2,tx);
2532 fiy2 = _mm256_add_ps(fiy2,ty);
2533 fiz2 = _mm256_add_ps(fiz2,tz);
2535 fjx1 = _mm256_add_ps(fjx1,tx);
2536 fjy1 = _mm256_add_ps(fjy1,ty);
2537 fjz1 = _mm256_add_ps(fjz1,tz);
2541 /**************************
2542 * CALCULATE INTERACTIONS *
2543 **************************/
2545 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2548 r22 = _mm256_mul_ps(rsq22,rinv22);
2549 r22 = _mm256_andnot_ps(dummy_mask,r22);
2551 /* EWALD ELECTROSTATICS */
2553 /* Analytical PME correction */
2554 zeta2 = _mm256_mul_ps(beta2,rsq22);
2555 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2556 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2557 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2558 felec = _mm256_mul_ps(qq22,felec);
2560 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2564 fscal = _mm256_and_ps(fscal,cutoff_mask);
2566 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2568 /* Calculate temporary vectorial force */
2569 tx = _mm256_mul_ps(fscal,dx22);
2570 ty = _mm256_mul_ps(fscal,dy22);
2571 tz = _mm256_mul_ps(fscal,dz22);
2573 /* Update vectorial force */
2574 fix2 = _mm256_add_ps(fix2,tx);
2575 fiy2 = _mm256_add_ps(fiy2,ty);
2576 fiz2 = _mm256_add_ps(fiz2,tz);
2578 fjx2 = _mm256_add_ps(fjx2,tx);
2579 fjy2 = _mm256_add_ps(fjy2,ty);
2580 fjz2 = _mm256_add_ps(fjz2,tz);
2584 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2585 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2586 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2587 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2588 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2589 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2590 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2591 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2593 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2594 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2596 /* Inner loop uses 547 flops */
2599 /* End of innermost loop */
2601 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2602 f+i_coord_offset,fshift+i_shift_offset);
2604 /* Increment number of inner iterations */
2605 inneriter += j_index_end - j_index_start;
2607 /* Outer loop uses 18 flops */
2610 /* Increment number of outer iterations */
2613 /* Update outer/inner flops */
2615 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*547);