2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LJEwald
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 real * vdwgridioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 real * vdwgridioffsetptr1;
88 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 real * vdwioffsetptr2;
90 real * vdwgridioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
114 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
125 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
126 __m256d one_half = _mm256_set1_pd(0.5);
127 __m256d minus_one = _mm256_set1_pd(-1.0);
129 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
130 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
132 __m256d dummy_mask,cutoff_mask;
133 __m128 tmpmask0,tmpmask1;
134 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
135 __m256d one = _mm256_set1_pd(1.0);
136 __m256d two = _mm256_set1_pd(2.0);
142 jindex = nlist->jindex;
144 shiftidx = nlist->shift;
146 shiftvec = fr->shift_vec[0];
147 fshift = fr->fshift[0];
148 facel = _mm256_set1_pd(fr->ic->epsfac);
149 charge = mdatoms->chargeA;
150 nvdwtype = fr->ntype;
152 vdwtype = mdatoms->typeA;
153 vdwgridparam = fr->ljpme_c6grid;
154 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
155 ewclj = _mm256_set1_pd(fr->ic->ewaldcoeff_lj);
156 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
158 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
159 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
160 beta2 = _mm256_mul_pd(beta,beta);
161 beta3 = _mm256_mul_pd(beta,beta2);
163 ewtab = fr->ic->tabq_coul_FDV0;
164 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
165 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
167 /* Setup water-specific parameters */
168 inr = nlist->iinr[0];
169 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
170 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
171 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
172 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
173 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
175 jq0 = _mm256_set1_pd(charge[inr+0]);
176 jq1 = _mm256_set1_pd(charge[inr+1]);
177 jq2 = _mm256_set1_pd(charge[inr+2]);
178 vdwjidx0A = 2*vdwtype[inr+0];
179 qq00 = _mm256_mul_pd(iq0,jq0);
180 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
181 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
182 c6grid_00 = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
183 qq01 = _mm256_mul_pd(iq0,jq1);
184 qq02 = _mm256_mul_pd(iq0,jq2);
185 qq10 = _mm256_mul_pd(iq1,jq0);
186 qq11 = _mm256_mul_pd(iq1,jq1);
187 qq12 = _mm256_mul_pd(iq1,jq2);
188 qq20 = _mm256_mul_pd(iq2,jq0);
189 qq21 = _mm256_mul_pd(iq2,jq1);
190 qq22 = _mm256_mul_pd(iq2,jq2);
192 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
193 rcutoff_scalar = fr->ic->rcoulomb;
194 rcutoff = _mm256_set1_pd(rcutoff_scalar);
195 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
197 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
198 rvdw = _mm256_set1_pd(fr->ic->rvdw);
200 /* Avoid stupid compiler warnings */
201 jnrA = jnrB = jnrC = jnrD = 0;
210 for(iidx=0;iidx<4*DIM;iidx++)
215 /* Start outer loop over neighborlists */
216 for(iidx=0; iidx<nri; iidx++)
218 /* Load shift vector for this list */
219 i_shift_offset = DIM*shiftidx[iidx];
221 /* Load limits for loop over neighbors */
222 j_index_start = jindex[iidx];
223 j_index_end = jindex[iidx+1];
225 /* Get outer coordinate index */
227 i_coord_offset = DIM*inr;
229 /* Load i particle coords and add shift vector */
230 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
231 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
233 fix0 = _mm256_setzero_pd();
234 fiy0 = _mm256_setzero_pd();
235 fiz0 = _mm256_setzero_pd();
236 fix1 = _mm256_setzero_pd();
237 fiy1 = _mm256_setzero_pd();
238 fiz1 = _mm256_setzero_pd();
239 fix2 = _mm256_setzero_pd();
240 fiy2 = _mm256_setzero_pd();
241 fiz2 = _mm256_setzero_pd();
243 /* Reset potential sums */
244 velecsum = _mm256_setzero_pd();
245 vvdwsum = _mm256_setzero_pd();
247 /* Start inner kernel loop */
248 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
251 /* Get j neighbor index, and coordinate index */
256 j_coord_offsetA = DIM*jnrA;
257 j_coord_offsetB = DIM*jnrB;
258 j_coord_offsetC = DIM*jnrC;
259 j_coord_offsetD = DIM*jnrD;
261 /* load j atom coordinates */
262 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
263 x+j_coord_offsetC,x+j_coord_offsetD,
264 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
266 /* Calculate displacement vector */
267 dx00 = _mm256_sub_pd(ix0,jx0);
268 dy00 = _mm256_sub_pd(iy0,jy0);
269 dz00 = _mm256_sub_pd(iz0,jz0);
270 dx01 = _mm256_sub_pd(ix0,jx1);
271 dy01 = _mm256_sub_pd(iy0,jy1);
272 dz01 = _mm256_sub_pd(iz0,jz1);
273 dx02 = _mm256_sub_pd(ix0,jx2);
274 dy02 = _mm256_sub_pd(iy0,jy2);
275 dz02 = _mm256_sub_pd(iz0,jz2);
276 dx10 = _mm256_sub_pd(ix1,jx0);
277 dy10 = _mm256_sub_pd(iy1,jy0);
278 dz10 = _mm256_sub_pd(iz1,jz0);
279 dx11 = _mm256_sub_pd(ix1,jx1);
280 dy11 = _mm256_sub_pd(iy1,jy1);
281 dz11 = _mm256_sub_pd(iz1,jz1);
282 dx12 = _mm256_sub_pd(ix1,jx2);
283 dy12 = _mm256_sub_pd(iy1,jy2);
284 dz12 = _mm256_sub_pd(iz1,jz2);
285 dx20 = _mm256_sub_pd(ix2,jx0);
286 dy20 = _mm256_sub_pd(iy2,jy0);
287 dz20 = _mm256_sub_pd(iz2,jz0);
288 dx21 = _mm256_sub_pd(ix2,jx1);
289 dy21 = _mm256_sub_pd(iy2,jy1);
290 dz21 = _mm256_sub_pd(iz2,jz1);
291 dx22 = _mm256_sub_pd(ix2,jx2);
292 dy22 = _mm256_sub_pd(iy2,jy2);
293 dz22 = _mm256_sub_pd(iz2,jz2);
295 /* Calculate squared distance and things based on it */
296 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
297 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
298 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
299 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
300 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
301 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
302 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
303 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
304 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
306 rinv00 = avx256_invsqrt_d(rsq00);
307 rinv01 = avx256_invsqrt_d(rsq01);
308 rinv02 = avx256_invsqrt_d(rsq02);
309 rinv10 = avx256_invsqrt_d(rsq10);
310 rinv11 = avx256_invsqrt_d(rsq11);
311 rinv12 = avx256_invsqrt_d(rsq12);
312 rinv20 = avx256_invsqrt_d(rsq20);
313 rinv21 = avx256_invsqrt_d(rsq21);
314 rinv22 = avx256_invsqrt_d(rsq22);
316 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
317 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
318 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
319 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
320 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
321 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
322 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
323 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
324 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
326 fjx0 = _mm256_setzero_pd();
327 fjy0 = _mm256_setzero_pd();
328 fjz0 = _mm256_setzero_pd();
329 fjx1 = _mm256_setzero_pd();
330 fjy1 = _mm256_setzero_pd();
331 fjz1 = _mm256_setzero_pd();
332 fjx2 = _mm256_setzero_pd();
333 fjy2 = _mm256_setzero_pd();
334 fjz2 = _mm256_setzero_pd();
336 /**************************
337 * CALCULATE INTERACTIONS *
338 **************************/
340 if (gmx_mm256_any_lt(rsq00,rcutoff2))
343 r00 = _mm256_mul_pd(rsq00,rinv00);
345 /* EWALD ELECTROSTATICS */
347 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
348 ewrt = _mm256_mul_pd(r00,ewtabscale);
349 ewitab = _mm256_cvttpd_epi32(ewrt);
350 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
351 ewitab = _mm_slli_epi32(ewitab,2);
352 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
353 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
354 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
355 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
356 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
357 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
358 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
359 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
360 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
362 /* Analytical LJ-PME */
363 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
364 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
365 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
366 exponent = avx256_exp_d(ewcljrsq);
367 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
368 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
369 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
370 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
371 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
372 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
373 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
374 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
375 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
377 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
379 /* Update potential sum for this i atom from the interaction with this j atom. */
380 velec = _mm256_and_pd(velec,cutoff_mask);
381 velecsum = _mm256_add_pd(velecsum,velec);
382 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
383 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
385 fscal = _mm256_add_pd(felec,fvdw);
387 fscal = _mm256_and_pd(fscal,cutoff_mask);
389 /* Calculate temporary vectorial force */
390 tx = _mm256_mul_pd(fscal,dx00);
391 ty = _mm256_mul_pd(fscal,dy00);
392 tz = _mm256_mul_pd(fscal,dz00);
394 /* Update vectorial force */
395 fix0 = _mm256_add_pd(fix0,tx);
396 fiy0 = _mm256_add_pd(fiy0,ty);
397 fiz0 = _mm256_add_pd(fiz0,tz);
399 fjx0 = _mm256_add_pd(fjx0,tx);
400 fjy0 = _mm256_add_pd(fjy0,ty);
401 fjz0 = _mm256_add_pd(fjz0,tz);
405 /**************************
406 * CALCULATE INTERACTIONS *
407 **************************/
409 if (gmx_mm256_any_lt(rsq01,rcutoff2))
412 r01 = _mm256_mul_pd(rsq01,rinv01);
414 /* EWALD ELECTROSTATICS */
416 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
417 ewrt = _mm256_mul_pd(r01,ewtabscale);
418 ewitab = _mm256_cvttpd_epi32(ewrt);
419 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
420 ewitab = _mm_slli_epi32(ewitab,2);
421 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
422 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
423 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
424 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
425 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
426 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
427 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
428 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
429 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
431 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velec = _mm256_and_pd(velec,cutoff_mask);
435 velecsum = _mm256_add_pd(velecsum,velec);
439 fscal = _mm256_and_pd(fscal,cutoff_mask);
441 /* Calculate temporary vectorial force */
442 tx = _mm256_mul_pd(fscal,dx01);
443 ty = _mm256_mul_pd(fscal,dy01);
444 tz = _mm256_mul_pd(fscal,dz01);
446 /* Update vectorial force */
447 fix0 = _mm256_add_pd(fix0,tx);
448 fiy0 = _mm256_add_pd(fiy0,ty);
449 fiz0 = _mm256_add_pd(fiz0,tz);
451 fjx1 = _mm256_add_pd(fjx1,tx);
452 fjy1 = _mm256_add_pd(fjy1,ty);
453 fjz1 = _mm256_add_pd(fjz1,tz);
457 /**************************
458 * CALCULATE INTERACTIONS *
459 **************************/
461 if (gmx_mm256_any_lt(rsq02,rcutoff2))
464 r02 = _mm256_mul_pd(rsq02,rinv02);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt = _mm256_mul_pd(r02,ewtabscale);
470 ewitab = _mm256_cvttpd_epi32(ewrt);
471 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
475 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
476 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
477 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
478 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
479 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
480 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
481 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
483 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velec = _mm256_and_pd(velec,cutoff_mask);
487 velecsum = _mm256_add_pd(velecsum,velec);
491 fscal = _mm256_and_pd(fscal,cutoff_mask);
493 /* Calculate temporary vectorial force */
494 tx = _mm256_mul_pd(fscal,dx02);
495 ty = _mm256_mul_pd(fscal,dy02);
496 tz = _mm256_mul_pd(fscal,dz02);
498 /* Update vectorial force */
499 fix0 = _mm256_add_pd(fix0,tx);
500 fiy0 = _mm256_add_pd(fiy0,ty);
501 fiz0 = _mm256_add_pd(fiz0,tz);
503 fjx2 = _mm256_add_pd(fjx2,tx);
504 fjy2 = _mm256_add_pd(fjy2,ty);
505 fjz2 = _mm256_add_pd(fjz2,tz);
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 if (gmx_mm256_any_lt(rsq10,rcutoff2))
516 r10 = _mm256_mul_pd(rsq10,rinv10);
518 /* EWALD ELECTROSTATICS */
520 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521 ewrt = _mm256_mul_pd(r10,ewtabscale);
522 ewitab = _mm256_cvttpd_epi32(ewrt);
523 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
524 ewitab = _mm_slli_epi32(ewitab,2);
525 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
526 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
527 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
528 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
529 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
530 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
531 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
532 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
533 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
535 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velec = _mm256_and_pd(velec,cutoff_mask);
539 velecsum = _mm256_add_pd(velecsum,velec);
543 fscal = _mm256_and_pd(fscal,cutoff_mask);
545 /* Calculate temporary vectorial force */
546 tx = _mm256_mul_pd(fscal,dx10);
547 ty = _mm256_mul_pd(fscal,dy10);
548 tz = _mm256_mul_pd(fscal,dz10);
550 /* Update vectorial force */
551 fix1 = _mm256_add_pd(fix1,tx);
552 fiy1 = _mm256_add_pd(fiy1,ty);
553 fiz1 = _mm256_add_pd(fiz1,tz);
555 fjx0 = _mm256_add_pd(fjx0,tx);
556 fjy0 = _mm256_add_pd(fjy0,ty);
557 fjz0 = _mm256_add_pd(fjz0,tz);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 if (gmx_mm256_any_lt(rsq11,rcutoff2))
568 r11 = _mm256_mul_pd(rsq11,rinv11);
570 /* EWALD ELECTROSTATICS */
572 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
573 ewrt = _mm256_mul_pd(r11,ewtabscale);
574 ewitab = _mm256_cvttpd_epi32(ewrt);
575 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
576 ewitab = _mm_slli_epi32(ewitab,2);
577 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
578 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
579 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
580 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
581 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
582 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
583 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
584 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
585 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
587 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 velec = _mm256_and_pd(velec,cutoff_mask);
591 velecsum = _mm256_add_pd(velecsum,velec);
595 fscal = _mm256_and_pd(fscal,cutoff_mask);
597 /* Calculate temporary vectorial force */
598 tx = _mm256_mul_pd(fscal,dx11);
599 ty = _mm256_mul_pd(fscal,dy11);
600 tz = _mm256_mul_pd(fscal,dz11);
602 /* Update vectorial force */
603 fix1 = _mm256_add_pd(fix1,tx);
604 fiy1 = _mm256_add_pd(fiy1,ty);
605 fiz1 = _mm256_add_pd(fiz1,tz);
607 fjx1 = _mm256_add_pd(fjx1,tx);
608 fjy1 = _mm256_add_pd(fjy1,ty);
609 fjz1 = _mm256_add_pd(fjz1,tz);
613 /**************************
614 * CALCULATE INTERACTIONS *
615 **************************/
617 if (gmx_mm256_any_lt(rsq12,rcutoff2))
620 r12 = _mm256_mul_pd(rsq12,rinv12);
622 /* EWALD ELECTROSTATICS */
624 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
625 ewrt = _mm256_mul_pd(r12,ewtabscale);
626 ewitab = _mm256_cvttpd_epi32(ewrt);
627 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
628 ewitab = _mm_slli_epi32(ewitab,2);
629 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
630 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
631 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
632 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
633 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
634 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
635 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
636 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
637 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
639 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
641 /* Update potential sum for this i atom from the interaction with this j atom. */
642 velec = _mm256_and_pd(velec,cutoff_mask);
643 velecsum = _mm256_add_pd(velecsum,velec);
647 fscal = _mm256_and_pd(fscal,cutoff_mask);
649 /* Calculate temporary vectorial force */
650 tx = _mm256_mul_pd(fscal,dx12);
651 ty = _mm256_mul_pd(fscal,dy12);
652 tz = _mm256_mul_pd(fscal,dz12);
654 /* Update vectorial force */
655 fix1 = _mm256_add_pd(fix1,tx);
656 fiy1 = _mm256_add_pd(fiy1,ty);
657 fiz1 = _mm256_add_pd(fiz1,tz);
659 fjx2 = _mm256_add_pd(fjx2,tx);
660 fjy2 = _mm256_add_pd(fjy2,ty);
661 fjz2 = _mm256_add_pd(fjz2,tz);
665 /**************************
666 * CALCULATE INTERACTIONS *
667 **************************/
669 if (gmx_mm256_any_lt(rsq20,rcutoff2))
672 r20 = _mm256_mul_pd(rsq20,rinv20);
674 /* EWALD ELECTROSTATICS */
676 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
677 ewrt = _mm256_mul_pd(r20,ewtabscale);
678 ewitab = _mm256_cvttpd_epi32(ewrt);
679 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
680 ewitab = _mm_slli_epi32(ewitab,2);
681 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
682 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
683 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
684 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
685 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
686 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
687 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
688 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
689 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
691 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velec = _mm256_and_pd(velec,cutoff_mask);
695 velecsum = _mm256_add_pd(velecsum,velec);
699 fscal = _mm256_and_pd(fscal,cutoff_mask);
701 /* Calculate temporary vectorial force */
702 tx = _mm256_mul_pd(fscal,dx20);
703 ty = _mm256_mul_pd(fscal,dy20);
704 tz = _mm256_mul_pd(fscal,dz20);
706 /* Update vectorial force */
707 fix2 = _mm256_add_pd(fix2,tx);
708 fiy2 = _mm256_add_pd(fiy2,ty);
709 fiz2 = _mm256_add_pd(fiz2,tz);
711 fjx0 = _mm256_add_pd(fjx0,tx);
712 fjy0 = _mm256_add_pd(fjy0,ty);
713 fjz0 = _mm256_add_pd(fjz0,tz);
717 /**************************
718 * CALCULATE INTERACTIONS *
719 **************************/
721 if (gmx_mm256_any_lt(rsq21,rcutoff2))
724 r21 = _mm256_mul_pd(rsq21,rinv21);
726 /* EWALD ELECTROSTATICS */
728 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
729 ewrt = _mm256_mul_pd(r21,ewtabscale);
730 ewitab = _mm256_cvttpd_epi32(ewrt);
731 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
732 ewitab = _mm_slli_epi32(ewitab,2);
733 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
734 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
735 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
736 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
737 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
738 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
739 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
740 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
741 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
743 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
745 /* Update potential sum for this i atom from the interaction with this j atom. */
746 velec = _mm256_and_pd(velec,cutoff_mask);
747 velecsum = _mm256_add_pd(velecsum,velec);
751 fscal = _mm256_and_pd(fscal,cutoff_mask);
753 /* Calculate temporary vectorial force */
754 tx = _mm256_mul_pd(fscal,dx21);
755 ty = _mm256_mul_pd(fscal,dy21);
756 tz = _mm256_mul_pd(fscal,dz21);
758 /* Update vectorial force */
759 fix2 = _mm256_add_pd(fix2,tx);
760 fiy2 = _mm256_add_pd(fiy2,ty);
761 fiz2 = _mm256_add_pd(fiz2,tz);
763 fjx1 = _mm256_add_pd(fjx1,tx);
764 fjy1 = _mm256_add_pd(fjy1,ty);
765 fjz1 = _mm256_add_pd(fjz1,tz);
769 /**************************
770 * CALCULATE INTERACTIONS *
771 **************************/
773 if (gmx_mm256_any_lt(rsq22,rcutoff2))
776 r22 = _mm256_mul_pd(rsq22,rinv22);
778 /* EWALD ELECTROSTATICS */
780 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
781 ewrt = _mm256_mul_pd(r22,ewtabscale);
782 ewitab = _mm256_cvttpd_epi32(ewrt);
783 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
784 ewitab = _mm_slli_epi32(ewitab,2);
785 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
786 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
787 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
788 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
789 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
790 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
791 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
792 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
793 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
795 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
797 /* Update potential sum for this i atom from the interaction with this j atom. */
798 velec = _mm256_and_pd(velec,cutoff_mask);
799 velecsum = _mm256_add_pd(velecsum,velec);
803 fscal = _mm256_and_pd(fscal,cutoff_mask);
805 /* Calculate temporary vectorial force */
806 tx = _mm256_mul_pd(fscal,dx22);
807 ty = _mm256_mul_pd(fscal,dy22);
808 tz = _mm256_mul_pd(fscal,dz22);
810 /* Update vectorial force */
811 fix2 = _mm256_add_pd(fix2,tx);
812 fiy2 = _mm256_add_pd(fiy2,ty);
813 fiz2 = _mm256_add_pd(fiz2,tz);
815 fjx2 = _mm256_add_pd(fjx2,tx);
816 fjy2 = _mm256_add_pd(fjy2,ty);
817 fjz2 = _mm256_add_pd(fjz2,tz);
821 fjptrA = f+j_coord_offsetA;
822 fjptrB = f+j_coord_offsetB;
823 fjptrC = f+j_coord_offsetC;
824 fjptrD = f+j_coord_offsetD;
826 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
827 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
829 /* Inner loop uses 450 flops */
835 /* Get j neighbor index, and coordinate index */
836 jnrlistA = jjnr[jidx];
837 jnrlistB = jjnr[jidx+1];
838 jnrlistC = jjnr[jidx+2];
839 jnrlistD = jjnr[jidx+3];
840 /* Sign of each element will be negative for non-real atoms.
841 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
842 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
844 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
846 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
847 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
848 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
850 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
851 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
852 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
853 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
854 j_coord_offsetA = DIM*jnrA;
855 j_coord_offsetB = DIM*jnrB;
856 j_coord_offsetC = DIM*jnrC;
857 j_coord_offsetD = DIM*jnrD;
859 /* load j atom coordinates */
860 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
861 x+j_coord_offsetC,x+j_coord_offsetD,
862 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
864 /* Calculate displacement vector */
865 dx00 = _mm256_sub_pd(ix0,jx0);
866 dy00 = _mm256_sub_pd(iy0,jy0);
867 dz00 = _mm256_sub_pd(iz0,jz0);
868 dx01 = _mm256_sub_pd(ix0,jx1);
869 dy01 = _mm256_sub_pd(iy0,jy1);
870 dz01 = _mm256_sub_pd(iz0,jz1);
871 dx02 = _mm256_sub_pd(ix0,jx2);
872 dy02 = _mm256_sub_pd(iy0,jy2);
873 dz02 = _mm256_sub_pd(iz0,jz2);
874 dx10 = _mm256_sub_pd(ix1,jx0);
875 dy10 = _mm256_sub_pd(iy1,jy0);
876 dz10 = _mm256_sub_pd(iz1,jz0);
877 dx11 = _mm256_sub_pd(ix1,jx1);
878 dy11 = _mm256_sub_pd(iy1,jy1);
879 dz11 = _mm256_sub_pd(iz1,jz1);
880 dx12 = _mm256_sub_pd(ix1,jx2);
881 dy12 = _mm256_sub_pd(iy1,jy2);
882 dz12 = _mm256_sub_pd(iz1,jz2);
883 dx20 = _mm256_sub_pd(ix2,jx0);
884 dy20 = _mm256_sub_pd(iy2,jy0);
885 dz20 = _mm256_sub_pd(iz2,jz0);
886 dx21 = _mm256_sub_pd(ix2,jx1);
887 dy21 = _mm256_sub_pd(iy2,jy1);
888 dz21 = _mm256_sub_pd(iz2,jz1);
889 dx22 = _mm256_sub_pd(ix2,jx2);
890 dy22 = _mm256_sub_pd(iy2,jy2);
891 dz22 = _mm256_sub_pd(iz2,jz2);
893 /* Calculate squared distance and things based on it */
894 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
895 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
896 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
897 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
898 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
899 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
900 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
901 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
902 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
904 rinv00 = avx256_invsqrt_d(rsq00);
905 rinv01 = avx256_invsqrt_d(rsq01);
906 rinv02 = avx256_invsqrt_d(rsq02);
907 rinv10 = avx256_invsqrt_d(rsq10);
908 rinv11 = avx256_invsqrt_d(rsq11);
909 rinv12 = avx256_invsqrt_d(rsq12);
910 rinv20 = avx256_invsqrt_d(rsq20);
911 rinv21 = avx256_invsqrt_d(rsq21);
912 rinv22 = avx256_invsqrt_d(rsq22);
914 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
915 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
916 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
917 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
918 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
919 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
920 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
921 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
922 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
924 fjx0 = _mm256_setzero_pd();
925 fjy0 = _mm256_setzero_pd();
926 fjz0 = _mm256_setzero_pd();
927 fjx1 = _mm256_setzero_pd();
928 fjy1 = _mm256_setzero_pd();
929 fjz1 = _mm256_setzero_pd();
930 fjx2 = _mm256_setzero_pd();
931 fjy2 = _mm256_setzero_pd();
932 fjz2 = _mm256_setzero_pd();
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 if (gmx_mm256_any_lt(rsq00,rcutoff2))
941 r00 = _mm256_mul_pd(rsq00,rinv00);
942 r00 = _mm256_andnot_pd(dummy_mask,r00);
944 /* EWALD ELECTROSTATICS */
946 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
947 ewrt = _mm256_mul_pd(r00,ewtabscale);
948 ewitab = _mm256_cvttpd_epi32(ewrt);
949 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
950 ewitab = _mm_slli_epi32(ewitab,2);
951 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
952 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
953 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
954 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
955 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
956 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
957 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
958 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
959 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
961 /* Analytical LJ-PME */
962 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
963 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
964 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
965 exponent = avx256_exp_d(ewcljrsq);
966 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
967 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
968 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
969 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
970 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
971 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
972 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
973 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
974 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
976 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
978 /* Update potential sum for this i atom from the interaction with this j atom. */
979 velec = _mm256_and_pd(velec,cutoff_mask);
980 velec = _mm256_andnot_pd(dummy_mask,velec);
981 velecsum = _mm256_add_pd(velecsum,velec);
982 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
983 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
984 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
986 fscal = _mm256_add_pd(felec,fvdw);
988 fscal = _mm256_and_pd(fscal,cutoff_mask);
990 fscal = _mm256_andnot_pd(dummy_mask,fscal);
992 /* Calculate temporary vectorial force */
993 tx = _mm256_mul_pd(fscal,dx00);
994 ty = _mm256_mul_pd(fscal,dy00);
995 tz = _mm256_mul_pd(fscal,dz00);
997 /* Update vectorial force */
998 fix0 = _mm256_add_pd(fix0,tx);
999 fiy0 = _mm256_add_pd(fiy0,ty);
1000 fiz0 = _mm256_add_pd(fiz0,tz);
1002 fjx0 = _mm256_add_pd(fjx0,tx);
1003 fjy0 = _mm256_add_pd(fjy0,ty);
1004 fjz0 = _mm256_add_pd(fjz0,tz);
1008 /**************************
1009 * CALCULATE INTERACTIONS *
1010 **************************/
1012 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1015 r01 = _mm256_mul_pd(rsq01,rinv01);
1016 r01 = _mm256_andnot_pd(dummy_mask,r01);
1018 /* EWALD ELECTROSTATICS */
1020 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1021 ewrt = _mm256_mul_pd(r01,ewtabscale);
1022 ewitab = _mm256_cvttpd_epi32(ewrt);
1023 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1024 ewitab = _mm_slli_epi32(ewitab,2);
1025 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1026 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1027 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1028 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1029 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1030 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1031 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1032 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
1033 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1035 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1037 /* Update potential sum for this i atom from the interaction with this j atom. */
1038 velec = _mm256_and_pd(velec,cutoff_mask);
1039 velec = _mm256_andnot_pd(dummy_mask,velec);
1040 velecsum = _mm256_add_pd(velecsum,velec);
1044 fscal = _mm256_and_pd(fscal,cutoff_mask);
1046 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1048 /* Calculate temporary vectorial force */
1049 tx = _mm256_mul_pd(fscal,dx01);
1050 ty = _mm256_mul_pd(fscal,dy01);
1051 tz = _mm256_mul_pd(fscal,dz01);
1053 /* Update vectorial force */
1054 fix0 = _mm256_add_pd(fix0,tx);
1055 fiy0 = _mm256_add_pd(fiy0,ty);
1056 fiz0 = _mm256_add_pd(fiz0,tz);
1058 fjx1 = _mm256_add_pd(fjx1,tx);
1059 fjy1 = _mm256_add_pd(fjy1,ty);
1060 fjz1 = _mm256_add_pd(fjz1,tz);
1064 /**************************
1065 * CALCULATE INTERACTIONS *
1066 **************************/
1068 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1071 r02 = _mm256_mul_pd(rsq02,rinv02);
1072 r02 = _mm256_andnot_pd(dummy_mask,r02);
1074 /* EWALD ELECTROSTATICS */
1076 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1077 ewrt = _mm256_mul_pd(r02,ewtabscale);
1078 ewitab = _mm256_cvttpd_epi32(ewrt);
1079 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1080 ewitab = _mm_slli_epi32(ewitab,2);
1081 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1082 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1083 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1084 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1085 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1086 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1087 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1088 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
1089 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1091 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1093 /* Update potential sum for this i atom from the interaction with this j atom. */
1094 velec = _mm256_and_pd(velec,cutoff_mask);
1095 velec = _mm256_andnot_pd(dummy_mask,velec);
1096 velecsum = _mm256_add_pd(velecsum,velec);
1100 fscal = _mm256_and_pd(fscal,cutoff_mask);
1102 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1104 /* Calculate temporary vectorial force */
1105 tx = _mm256_mul_pd(fscal,dx02);
1106 ty = _mm256_mul_pd(fscal,dy02);
1107 tz = _mm256_mul_pd(fscal,dz02);
1109 /* Update vectorial force */
1110 fix0 = _mm256_add_pd(fix0,tx);
1111 fiy0 = _mm256_add_pd(fiy0,ty);
1112 fiz0 = _mm256_add_pd(fiz0,tz);
1114 fjx2 = _mm256_add_pd(fjx2,tx);
1115 fjy2 = _mm256_add_pd(fjy2,ty);
1116 fjz2 = _mm256_add_pd(fjz2,tz);
1120 /**************************
1121 * CALCULATE INTERACTIONS *
1122 **************************/
1124 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1127 r10 = _mm256_mul_pd(rsq10,rinv10);
1128 r10 = _mm256_andnot_pd(dummy_mask,r10);
1130 /* EWALD ELECTROSTATICS */
1132 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1133 ewrt = _mm256_mul_pd(r10,ewtabscale);
1134 ewitab = _mm256_cvttpd_epi32(ewrt);
1135 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1136 ewitab = _mm_slli_epi32(ewitab,2);
1137 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1138 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1139 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1140 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1141 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1142 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1143 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1144 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
1145 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1147 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1149 /* Update potential sum for this i atom from the interaction with this j atom. */
1150 velec = _mm256_and_pd(velec,cutoff_mask);
1151 velec = _mm256_andnot_pd(dummy_mask,velec);
1152 velecsum = _mm256_add_pd(velecsum,velec);
1156 fscal = _mm256_and_pd(fscal,cutoff_mask);
1158 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1160 /* Calculate temporary vectorial force */
1161 tx = _mm256_mul_pd(fscal,dx10);
1162 ty = _mm256_mul_pd(fscal,dy10);
1163 tz = _mm256_mul_pd(fscal,dz10);
1165 /* Update vectorial force */
1166 fix1 = _mm256_add_pd(fix1,tx);
1167 fiy1 = _mm256_add_pd(fiy1,ty);
1168 fiz1 = _mm256_add_pd(fiz1,tz);
1170 fjx0 = _mm256_add_pd(fjx0,tx);
1171 fjy0 = _mm256_add_pd(fjy0,ty);
1172 fjz0 = _mm256_add_pd(fjz0,tz);
1176 /**************************
1177 * CALCULATE INTERACTIONS *
1178 **************************/
1180 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1183 r11 = _mm256_mul_pd(rsq11,rinv11);
1184 r11 = _mm256_andnot_pd(dummy_mask,r11);
1186 /* EWALD ELECTROSTATICS */
1188 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1189 ewrt = _mm256_mul_pd(r11,ewtabscale);
1190 ewitab = _mm256_cvttpd_epi32(ewrt);
1191 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1192 ewitab = _mm_slli_epi32(ewitab,2);
1193 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1194 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1195 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1196 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1197 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1198 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1199 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1200 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1201 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1203 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1205 /* Update potential sum for this i atom from the interaction with this j atom. */
1206 velec = _mm256_and_pd(velec,cutoff_mask);
1207 velec = _mm256_andnot_pd(dummy_mask,velec);
1208 velecsum = _mm256_add_pd(velecsum,velec);
1212 fscal = _mm256_and_pd(fscal,cutoff_mask);
1214 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1216 /* Calculate temporary vectorial force */
1217 tx = _mm256_mul_pd(fscal,dx11);
1218 ty = _mm256_mul_pd(fscal,dy11);
1219 tz = _mm256_mul_pd(fscal,dz11);
1221 /* Update vectorial force */
1222 fix1 = _mm256_add_pd(fix1,tx);
1223 fiy1 = _mm256_add_pd(fiy1,ty);
1224 fiz1 = _mm256_add_pd(fiz1,tz);
1226 fjx1 = _mm256_add_pd(fjx1,tx);
1227 fjy1 = _mm256_add_pd(fjy1,ty);
1228 fjz1 = _mm256_add_pd(fjz1,tz);
1232 /**************************
1233 * CALCULATE INTERACTIONS *
1234 **************************/
1236 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1239 r12 = _mm256_mul_pd(rsq12,rinv12);
1240 r12 = _mm256_andnot_pd(dummy_mask,r12);
1242 /* EWALD ELECTROSTATICS */
1244 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1245 ewrt = _mm256_mul_pd(r12,ewtabscale);
1246 ewitab = _mm256_cvttpd_epi32(ewrt);
1247 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1248 ewitab = _mm_slli_epi32(ewitab,2);
1249 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1250 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1251 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1252 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1253 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1254 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1255 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1256 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1257 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1259 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1261 /* Update potential sum for this i atom from the interaction with this j atom. */
1262 velec = _mm256_and_pd(velec,cutoff_mask);
1263 velec = _mm256_andnot_pd(dummy_mask,velec);
1264 velecsum = _mm256_add_pd(velecsum,velec);
1268 fscal = _mm256_and_pd(fscal,cutoff_mask);
1270 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1272 /* Calculate temporary vectorial force */
1273 tx = _mm256_mul_pd(fscal,dx12);
1274 ty = _mm256_mul_pd(fscal,dy12);
1275 tz = _mm256_mul_pd(fscal,dz12);
1277 /* Update vectorial force */
1278 fix1 = _mm256_add_pd(fix1,tx);
1279 fiy1 = _mm256_add_pd(fiy1,ty);
1280 fiz1 = _mm256_add_pd(fiz1,tz);
1282 fjx2 = _mm256_add_pd(fjx2,tx);
1283 fjy2 = _mm256_add_pd(fjy2,ty);
1284 fjz2 = _mm256_add_pd(fjz2,tz);
1288 /**************************
1289 * CALCULATE INTERACTIONS *
1290 **************************/
1292 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1295 r20 = _mm256_mul_pd(rsq20,rinv20);
1296 r20 = _mm256_andnot_pd(dummy_mask,r20);
1298 /* EWALD ELECTROSTATICS */
1300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1301 ewrt = _mm256_mul_pd(r20,ewtabscale);
1302 ewitab = _mm256_cvttpd_epi32(ewrt);
1303 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1304 ewitab = _mm_slli_epi32(ewitab,2);
1305 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1306 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1307 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1308 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1309 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1310 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1311 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1312 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
1313 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1315 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1317 /* Update potential sum for this i atom from the interaction with this j atom. */
1318 velec = _mm256_and_pd(velec,cutoff_mask);
1319 velec = _mm256_andnot_pd(dummy_mask,velec);
1320 velecsum = _mm256_add_pd(velecsum,velec);
1324 fscal = _mm256_and_pd(fscal,cutoff_mask);
1326 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1328 /* Calculate temporary vectorial force */
1329 tx = _mm256_mul_pd(fscal,dx20);
1330 ty = _mm256_mul_pd(fscal,dy20);
1331 tz = _mm256_mul_pd(fscal,dz20);
1333 /* Update vectorial force */
1334 fix2 = _mm256_add_pd(fix2,tx);
1335 fiy2 = _mm256_add_pd(fiy2,ty);
1336 fiz2 = _mm256_add_pd(fiz2,tz);
1338 fjx0 = _mm256_add_pd(fjx0,tx);
1339 fjy0 = _mm256_add_pd(fjy0,ty);
1340 fjz0 = _mm256_add_pd(fjz0,tz);
1344 /**************************
1345 * CALCULATE INTERACTIONS *
1346 **************************/
1348 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1351 r21 = _mm256_mul_pd(rsq21,rinv21);
1352 r21 = _mm256_andnot_pd(dummy_mask,r21);
1354 /* EWALD ELECTROSTATICS */
1356 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1357 ewrt = _mm256_mul_pd(r21,ewtabscale);
1358 ewitab = _mm256_cvttpd_epi32(ewrt);
1359 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1360 ewitab = _mm_slli_epi32(ewitab,2);
1361 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1362 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1363 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1364 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1365 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1366 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1367 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1368 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1369 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1371 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1373 /* Update potential sum for this i atom from the interaction with this j atom. */
1374 velec = _mm256_and_pd(velec,cutoff_mask);
1375 velec = _mm256_andnot_pd(dummy_mask,velec);
1376 velecsum = _mm256_add_pd(velecsum,velec);
1380 fscal = _mm256_and_pd(fscal,cutoff_mask);
1382 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1384 /* Calculate temporary vectorial force */
1385 tx = _mm256_mul_pd(fscal,dx21);
1386 ty = _mm256_mul_pd(fscal,dy21);
1387 tz = _mm256_mul_pd(fscal,dz21);
1389 /* Update vectorial force */
1390 fix2 = _mm256_add_pd(fix2,tx);
1391 fiy2 = _mm256_add_pd(fiy2,ty);
1392 fiz2 = _mm256_add_pd(fiz2,tz);
1394 fjx1 = _mm256_add_pd(fjx1,tx);
1395 fjy1 = _mm256_add_pd(fjy1,ty);
1396 fjz1 = _mm256_add_pd(fjz1,tz);
1400 /**************************
1401 * CALCULATE INTERACTIONS *
1402 **************************/
1404 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1407 r22 = _mm256_mul_pd(rsq22,rinv22);
1408 r22 = _mm256_andnot_pd(dummy_mask,r22);
1410 /* EWALD ELECTROSTATICS */
1412 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1413 ewrt = _mm256_mul_pd(r22,ewtabscale);
1414 ewitab = _mm256_cvttpd_epi32(ewrt);
1415 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1416 ewitab = _mm_slli_epi32(ewitab,2);
1417 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1418 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1419 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1420 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1421 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1422 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1423 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1424 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1425 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1427 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1429 /* Update potential sum for this i atom from the interaction with this j atom. */
1430 velec = _mm256_and_pd(velec,cutoff_mask);
1431 velec = _mm256_andnot_pd(dummy_mask,velec);
1432 velecsum = _mm256_add_pd(velecsum,velec);
1436 fscal = _mm256_and_pd(fscal,cutoff_mask);
1438 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1440 /* Calculate temporary vectorial force */
1441 tx = _mm256_mul_pd(fscal,dx22);
1442 ty = _mm256_mul_pd(fscal,dy22);
1443 tz = _mm256_mul_pd(fscal,dz22);
1445 /* Update vectorial force */
1446 fix2 = _mm256_add_pd(fix2,tx);
1447 fiy2 = _mm256_add_pd(fiy2,ty);
1448 fiz2 = _mm256_add_pd(fiz2,tz);
1450 fjx2 = _mm256_add_pd(fjx2,tx);
1451 fjy2 = _mm256_add_pd(fjy2,ty);
1452 fjz2 = _mm256_add_pd(fjz2,tz);
1456 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1457 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1458 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1459 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1461 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1462 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1464 /* Inner loop uses 459 flops */
1467 /* End of innermost loop */
1469 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1470 f+i_coord_offset,fshift+i_shift_offset);
1473 /* Update potential energies */
1474 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1475 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1477 /* Increment number of inner iterations */
1478 inneriter += j_index_end - j_index_start;
1480 /* Outer loop uses 20 flops */
1483 /* Increment number of outer iterations */
1486 /* Update outer/inner flops */
1488 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
1491 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_double
1492 * Electrostatics interaction: Ewald
1493 * VdW interaction: LJEwald
1494 * Geometry: Water3-Water3
1495 * Calculate force/pot: Force
1498 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_double
1499 (t_nblist * gmx_restrict nlist,
1500 rvec * gmx_restrict xx,
1501 rvec * gmx_restrict ff,
1502 struct t_forcerec * gmx_restrict fr,
1503 t_mdatoms * gmx_restrict mdatoms,
1504 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1505 t_nrnb * gmx_restrict nrnb)
1507 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1508 * just 0 for non-waters.
1509 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1510 * jnr indices corresponding to data put in the four positions in the SIMD register.
1512 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1513 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1514 int jnrA,jnrB,jnrC,jnrD;
1515 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1516 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1517 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1518 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1519 real rcutoff_scalar;
1520 real *shiftvec,*fshift,*x,*f;
1521 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1522 real scratch[4*DIM];
1523 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1524 real * vdwioffsetptr0;
1525 real * vdwgridioffsetptr0;
1526 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1527 real * vdwioffsetptr1;
1528 real * vdwgridioffsetptr1;
1529 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1530 real * vdwioffsetptr2;
1531 real * vdwgridioffsetptr2;
1532 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1533 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1534 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1535 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1536 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1537 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1538 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1539 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1540 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1541 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1542 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1543 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1544 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1545 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1546 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1547 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1548 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1551 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1554 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1555 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1566 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1567 __m256d one_half = _mm256_set1_pd(0.5);
1568 __m256d minus_one = _mm256_set1_pd(-1.0);
1570 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1571 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1573 __m256d dummy_mask,cutoff_mask;
1574 __m128 tmpmask0,tmpmask1;
1575 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1576 __m256d one = _mm256_set1_pd(1.0);
1577 __m256d two = _mm256_set1_pd(2.0);
1583 jindex = nlist->jindex;
1585 shiftidx = nlist->shift;
1587 shiftvec = fr->shift_vec[0];
1588 fshift = fr->fshift[0];
1589 facel = _mm256_set1_pd(fr->ic->epsfac);
1590 charge = mdatoms->chargeA;
1591 nvdwtype = fr->ntype;
1592 vdwparam = fr->nbfp;
1593 vdwtype = mdatoms->typeA;
1594 vdwgridparam = fr->ljpme_c6grid;
1595 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
1596 ewclj = _mm256_set1_pd(fr->ic->ewaldcoeff_lj);
1597 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
1599 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1600 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1601 beta2 = _mm256_mul_pd(beta,beta);
1602 beta3 = _mm256_mul_pd(beta,beta2);
1604 ewtab = fr->ic->tabq_coul_F;
1605 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1606 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1608 /* Setup water-specific parameters */
1609 inr = nlist->iinr[0];
1610 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1611 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1612 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1613 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1614 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
1616 jq0 = _mm256_set1_pd(charge[inr+0]);
1617 jq1 = _mm256_set1_pd(charge[inr+1]);
1618 jq2 = _mm256_set1_pd(charge[inr+2]);
1619 vdwjidx0A = 2*vdwtype[inr+0];
1620 qq00 = _mm256_mul_pd(iq0,jq0);
1621 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1622 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1623 c6grid_00 = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
1624 qq01 = _mm256_mul_pd(iq0,jq1);
1625 qq02 = _mm256_mul_pd(iq0,jq2);
1626 qq10 = _mm256_mul_pd(iq1,jq0);
1627 qq11 = _mm256_mul_pd(iq1,jq1);
1628 qq12 = _mm256_mul_pd(iq1,jq2);
1629 qq20 = _mm256_mul_pd(iq2,jq0);
1630 qq21 = _mm256_mul_pd(iq2,jq1);
1631 qq22 = _mm256_mul_pd(iq2,jq2);
1633 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1634 rcutoff_scalar = fr->ic->rcoulomb;
1635 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1636 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1638 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
1639 rvdw = _mm256_set1_pd(fr->ic->rvdw);
1641 /* Avoid stupid compiler warnings */
1642 jnrA = jnrB = jnrC = jnrD = 0;
1643 j_coord_offsetA = 0;
1644 j_coord_offsetB = 0;
1645 j_coord_offsetC = 0;
1646 j_coord_offsetD = 0;
1651 for(iidx=0;iidx<4*DIM;iidx++)
1653 scratch[iidx] = 0.0;
1656 /* Start outer loop over neighborlists */
1657 for(iidx=0; iidx<nri; iidx++)
1659 /* Load shift vector for this list */
1660 i_shift_offset = DIM*shiftidx[iidx];
1662 /* Load limits for loop over neighbors */
1663 j_index_start = jindex[iidx];
1664 j_index_end = jindex[iidx+1];
1666 /* Get outer coordinate index */
1668 i_coord_offset = DIM*inr;
1670 /* Load i particle coords and add shift vector */
1671 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1672 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1674 fix0 = _mm256_setzero_pd();
1675 fiy0 = _mm256_setzero_pd();
1676 fiz0 = _mm256_setzero_pd();
1677 fix1 = _mm256_setzero_pd();
1678 fiy1 = _mm256_setzero_pd();
1679 fiz1 = _mm256_setzero_pd();
1680 fix2 = _mm256_setzero_pd();
1681 fiy2 = _mm256_setzero_pd();
1682 fiz2 = _mm256_setzero_pd();
1684 /* Start inner kernel loop */
1685 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1688 /* Get j neighbor index, and coordinate index */
1690 jnrB = jjnr[jidx+1];
1691 jnrC = jjnr[jidx+2];
1692 jnrD = jjnr[jidx+3];
1693 j_coord_offsetA = DIM*jnrA;
1694 j_coord_offsetB = DIM*jnrB;
1695 j_coord_offsetC = DIM*jnrC;
1696 j_coord_offsetD = DIM*jnrD;
1698 /* load j atom coordinates */
1699 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1700 x+j_coord_offsetC,x+j_coord_offsetD,
1701 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1703 /* Calculate displacement vector */
1704 dx00 = _mm256_sub_pd(ix0,jx0);
1705 dy00 = _mm256_sub_pd(iy0,jy0);
1706 dz00 = _mm256_sub_pd(iz0,jz0);
1707 dx01 = _mm256_sub_pd(ix0,jx1);
1708 dy01 = _mm256_sub_pd(iy0,jy1);
1709 dz01 = _mm256_sub_pd(iz0,jz1);
1710 dx02 = _mm256_sub_pd(ix0,jx2);
1711 dy02 = _mm256_sub_pd(iy0,jy2);
1712 dz02 = _mm256_sub_pd(iz0,jz2);
1713 dx10 = _mm256_sub_pd(ix1,jx0);
1714 dy10 = _mm256_sub_pd(iy1,jy0);
1715 dz10 = _mm256_sub_pd(iz1,jz0);
1716 dx11 = _mm256_sub_pd(ix1,jx1);
1717 dy11 = _mm256_sub_pd(iy1,jy1);
1718 dz11 = _mm256_sub_pd(iz1,jz1);
1719 dx12 = _mm256_sub_pd(ix1,jx2);
1720 dy12 = _mm256_sub_pd(iy1,jy2);
1721 dz12 = _mm256_sub_pd(iz1,jz2);
1722 dx20 = _mm256_sub_pd(ix2,jx0);
1723 dy20 = _mm256_sub_pd(iy2,jy0);
1724 dz20 = _mm256_sub_pd(iz2,jz0);
1725 dx21 = _mm256_sub_pd(ix2,jx1);
1726 dy21 = _mm256_sub_pd(iy2,jy1);
1727 dz21 = _mm256_sub_pd(iz2,jz1);
1728 dx22 = _mm256_sub_pd(ix2,jx2);
1729 dy22 = _mm256_sub_pd(iy2,jy2);
1730 dz22 = _mm256_sub_pd(iz2,jz2);
1732 /* Calculate squared distance and things based on it */
1733 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1734 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1735 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1736 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1737 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1738 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1739 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1740 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1741 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1743 rinv00 = avx256_invsqrt_d(rsq00);
1744 rinv01 = avx256_invsqrt_d(rsq01);
1745 rinv02 = avx256_invsqrt_d(rsq02);
1746 rinv10 = avx256_invsqrt_d(rsq10);
1747 rinv11 = avx256_invsqrt_d(rsq11);
1748 rinv12 = avx256_invsqrt_d(rsq12);
1749 rinv20 = avx256_invsqrt_d(rsq20);
1750 rinv21 = avx256_invsqrt_d(rsq21);
1751 rinv22 = avx256_invsqrt_d(rsq22);
1753 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1754 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1755 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1756 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1757 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1758 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1759 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1760 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1761 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1763 fjx0 = _mm256_setzero_pd();
1764 fjy0 = _mm256_setzero_pd();
1765 fjz0 = _mm256_setzero_pd();
1766 fjx1 = _mm256_setzero_pd();
1767 fjy1 = _mm256_setzero_pd();
1768 fjz1 = _mm256_setzero_pd();
1769 fjx2 = _mm256_setzero_pd();
1770 fjy2 = _mm256_setzero_pd();
1771 fjz2 = _mm256_setzero_pd();
1773 /**************************
1774 * CALCULATE INTERACTIONS *
1775 **************************/
1777 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1780 r00 = _mm256_mul_pd(rsq00,rinv00);
1782 /* EWALD ELECTROSTATICS */
1784 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1785 ewrt = _mm256_mul_pd(r00,ewtabscale);
1786 ewitab = _mm256_cvttpd_epi32(ewrt);
1787 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1788 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1789 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1791 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1792 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1794 /* Analytical LJ-PME */
1795 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1796 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
1797 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1798 exponent = avx256_exp_d(ewcljrsq);
1799 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1800 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1801 /* f6A = 6 * C6grid * (1 - poly) */
1802 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1803 /* f6B = C6grid * exponent * beta^6 */
1804 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1805 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1806 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1808 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1810 fscal = _mm256_add_pd(felec,fvdw);
1812 fscal = _mm256_and_pd(fscal,cutoff_mask);
1814 /* Calculate temporary vectorial force */
1815 tx = _mm256_mul_pd(fscal,dx00);
1816 ty = _mm256_mul_pd(fscal,dy00);
1817 tz = _mm256_mul_pd(fscal,dz00);
1819 /* Update vectorial force */
1820 fix0 = _mm256_add_pd(fix0,tx);
1821 fiy0 = _mm256_add_pd(fiy0,ty);
1822 fiz0 = _mm256_add_pd(fiz0,tz);
1824 fjx0 = _mm256_add_pd(fjx0,tx);
1825 fjy0 = _mm256_add_pd(fjy0,ty);
1826 fjz0 = _mm256_add_pd(fjz0,tz);
1830 /**************************
1831 * CALCULATE INTERACTIONS *
1832 **************************/
1834 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1837 r01 = _mm256_mul_pd(rsq01,rinv01);
1839 /* EWALD ELECTROSTATICS */
1841 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1842 ewrt = _mm256_mul_pd(r01,ewtabscale);
1843 ewitab = _mm256_cvttpd_epi32(ewrt);
1844 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1845 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1846 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1848 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1849 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1851 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1855 fscal = _mm256_and_pd(fscal,cutoff_mask);
1857 /* Calculate temporary vectorial force */
1858 tx = _mm256_mul_pd(fscal,dx01);
1859 ty = _mm256_mul_pd(fscal,dy01);
1860 tz = _mm256_mul_pd(fscal,dz01);
1862 /* Update vectorial force */
1863 fix0 = _mm256_add_pd(fix0,tx);
1864 fiy0 = _mm256_add_pd(fiy0,ty);
1865 fiz0 = _mm256_add_pd(fiz0,tz);
1867 fjx1 = _mm256_add_pd(fjx1,tx);
1868 fjy1 = _mm256_add_pd(fjy1,ty);
1869 fjz1 = _mm256_add_pd(fjz1,tz);
1873 /**************************
1874 * CALCULATE INTERACTIONS *
1875 **************************/
1877 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1880 r02 = _mm256_mul_pd(rsq02,rinv02);
1882 /* EWALD ELECTROSTATICS */
1884 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1885 ewrt = _mm256_mul_pd(r02,ewtabscale);
1886 ewitab = _mm256_cvttpd_epi32(ewrt);
1887 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1888 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1889 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1891 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1892 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1894 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1898 fscal = _mm256_and_pd(fscal,cutoff_mask);
1900 /* Calculate temporary vectorial force */
1901 tx = _mm256_mul_pd(fscal,dx02);
1902 ty = _mm256_mul_pd(fscal,dy02);
1903 tz = _mm256_mul_pd(fscal,dz02);
1905 /* Update vectorial force */
1906 fix0 = _mm256_add_pd(fix0,tx);
1907 fiy0 = _mm256_add_pd(fiy0,ty);
1908 fiz0 = _mm256_add_pd(fiz0,tz);
1910 fjx2 = _mm256_add_pd(fjx2,tx);
1911 fjy2 = _mm256_add_pd(fjy2,ty);
1912 fjz2 = _mm256_add_pd(fjz2,tz);
1916 /**************************
1917 * CALCULATE INTERACTIONS *
1918 **************************/
1920 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1923 r10 = _mm256_mul_pd(rsq10,rinv10);
1925 /* EWALD ELECTROSTATICS */
1927 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1928 ewrt = _mm256_mul_pd(r10,ewtabscale);
1929 ewitab = _mm256_cvttpd_epi32(ewrt);
1930 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1931 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1932 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1934 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1935 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1937 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1941 fscal = _mm256_and_pd(fscal,cutoff_mask);
1943 /* Calculate temporary vectorial force */
1944 tx = _mm256_mul_pd(fscal,dx10);
1945 ty = _mm256_mul_pd(fscal,dy10);
1946 tz = _mm256_mul_pd(fscal,dz10);
1948 /* Update vectorial force */
1949 fix1 = _mm256_add_pd(fix1,tx);
1950 fiy1 = _mm256_add_pd(fiy1,ty);
1951 fiz1 = _mm256_add_pd(fiz1,tz);
1953 fjx0 = _mm256_add_pd(fjx0,tx);
1954 fjy0 = _mm256_add_pd(fjy0,ty);
1955 fjz0 = _mm256_add_pd(fjz0,tz);
1959 /**************************
1960 * CALCULATE INTERACTIONS *
1961 **************************/
1963 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1966 r11 = _mm256_mul_pd(rsq11,rinv11);
1968 /* EWALD ELECTROSTATICS */
1970 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1971 ewrt = _mm256_mul_pd(r11,ewtabscale);
1972 ewitab = _mm256_cvttpd_epi32(ewrt);
1973 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1974 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1975 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1977 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1978 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1980 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1984 fscal = _mm256_and_pd(fscal,cutoff_mask);
1986 /* Calculate temporary vectorial force */
1987 tx = _mm256_mul_pd(fscal,dx11);
1988 ty = _mm256_mul_pd(fscal,dy11);
1989 tz = _mm256_mul_pd(fscal,dz11);
1991 /* Update vectorial force */
1992 fix1 = _mm256_add_pd(fix1,tx);
1993 fiy1 = _mm256_add_pd(fiy1,ty);
1994 fiz1 = _mm256_add_pd(fiz1,tz);
1996 fjx1 = _mm256_add_pd(fjx1,tx);
1997 fjy1 = _mm256_add_pd(fjy1,ty);
1998 fjz1 = _mm256_add_pd(fjz1,tz);
2002 /**************************
2003 * CALCULATE INTERACTIONS *
2004 **************************/
2006 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2009 r12 = _mm256_mul_pd(rsq12,rinv12);
2011 /* EWALD ELECTROSTATICS */
2013 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2014 ewrt = _mm256_mul_pd(r12,ewtabscale);
2015 ewitab = _mm256_cvttpd_epi32(ewrt);
2016 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2017 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2018 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2020 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2021 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2023 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2027 fscal = _mm256_and_pd(fscal,cutoff_mask);
2029 /* Calculate temporary vectorial force */
2030 tx = _mm256_mul_pd(fscal,dx12);
2031 ty = _mm256_mul_pd(fscal,dy12);
2032 tz = _mm256_mul_pd(fscal,dz12);
2034 /* Update vectorial force */
2035 fix1 = _mm256_add_pd(fix1,tx);
2036 fiy1 = _mm256_add_pd(fiy1,ty);
2037 fiz1 = _mm256_add_pd(fiz1,tz);
2039 fjx2 = _mm256_add_pd(fjx2,tx);
2040 fjy2 = _mm256_add_pd(fjy2,ty);
2041 fjz2 = _mm256_add_pd(fjz2,tz);
2045 /**************************
2046 * CALCULATE INTERACTIONS *
2047 **************************/
2049 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2052 r20 = _mm256_mul_pd(rsq20,rinv20);
2054 /* EWALD ELECTROSTATICS */
2056 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2057 ewrt = _mm256_mul_pd(r20,ewtabscale);
2058 ewitab = _mm256_cvttpd_epi32(ewrt);
2059 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2060 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2061 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2063 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2064 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2066 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2070 fscal = _mm256_and_pd(fscal,cutoff_mask);
2072 /* Calculate temporary vectorial force */
2073 tx = _mm256_mul_pd(fscal,dx20);
2074 ty = _mm256_mul_pd(fscal,dy20);
2075 tz = _mm256_mul_pd(fscal,dz20);
2077 /* Update vectorial force */
2078 fix2 = _mm256_add_pd(fix2,tx);
2079 fiy2 = _mm256_add_pd(fiy2,ty);
2080 fiz2 = _mm256_add_pd(fiz2,tz);
2082 fjx0 = _mm256_add_pd(fjx0,tx);
2083 fjy0 = _mm256_add_pd(fjy0,ty);
2084 fjz0 = _mm256_add_pd(fjz0,tz);
2088 /**************************
2089 * CALCULATE INTERACTIONS *
2090 **************************/
2092 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2095 r21 = _mm256_mul_pd(rsq21,rinv21);
2097 /* EWALD ELECTROSTATICS */
2099 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2100 ewrt = _mm256_mul_pd(r21,ewtabscale);
2101 ewitab = _mm256_cvttpd_epi32(ewrt);
2102 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2103 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2104 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2106 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2107 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2109 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2113 fscal = _mm256_and_pd(fscal,cutoff_mask);
2115 /* Calculate temporary vectorial force */
2116 tx = _mm256_mul_pd(fscal,dx21);
2117 ty = _mm256_mul_pd(fscal,dy21);
2118 tz = _mm256_mul_pd(fscal,dz21);
2120 /* Update vectorial force */
2121 fix2 = _mm256_add_pd(fix2,tx);
2122 fiy2 = _mm256_add_pd(fiy2,ty);
2123 fiz2 = _mm256_add_pd(fiz2,tz);
2125 fjx1 = _mm256_add_pd(fjx1,tx);
2126 fjy1 = _mm256_add_pd(fjy1,ty);
2127 fjz1 = _mm256_add_pd(fjz1,tz);
2131 /**************************
2132 * CALCULATE INTERACTIONS *
2133 **************************/
2135 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2138 r22 = _mm256_mul_pd(rsq22,rinv22);
2140 /* EWALD ELECTROSTATICS */
2142 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2143 ewrt = _mm256_mul_pd(r22,ewtabscale);
2144 ewitab = _mm256_cvttpd_epi32(ewrt);
2145 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2146 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2147 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2149 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2150 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2152 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2156 fscal = _mm256_and_pd(fscal,cutoff_mask);
2158 /* Calculate temporary vectorial force */
2159 tx = _mm256_mul_pd(fscal,dx22);
2160 ty = _mm256_mul_pd(fscal,dy22);
2161 tz = _mm256_mul_pd(fscal,dz22);
2163 /* Update vectorial force */
2164 fix2 = _mm256_add_pd(fix2,tx);
2165 fiy2 = _mm256_add_pd(fiy2,ty);
2166 fiz2 = _mm256_add_pd(fiz2,tz);
2168 fjx2 = _mm256_add_pd(fjx2,tx);
2169 fjy2 = _mm256_add_pd(fjy2,ty);
2170 fjz2 = _mm256_add_pd(fjz2,tz);
2174 fjptrA = f+j_coord_offsetA;
2175 fjptrB = f+j_coord_offsetB;
2176 fjptrC = f+j_coord_offsetC;
2177 fjptrD = f+j_coord_offsetD;
2179 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2180 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2182 /* Inner loop uses 374 flops */
2185 if(jidx<j_index_end)
2188 /* Get j neighbor index, and coordinate index */
2189 jnrlistA = jjnr[jidx];
2190 jnrlistB = jjnr[jidx+1];
2191 jnrlistC = jjnr[jidx+2];
2192 jnrlistD = jjnr[jidx+3];
2193 /* Sign of each element will be negative for non-real atoms.
2194 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2195 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2197 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2199 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2200 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2201 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2203 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2204 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2205 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2206 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2207 j_coord_offsetA = DIM*jnrA;
2208 j_coord_offsetB = DIM*jnrB;
2209 j_coord_offsetC = DIM*jnrC;
2210 j_coord_offsetD = DIM*jnrD;
2212 /* load j atom coordinates */
2213 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2214 x+j_coord_offsetC,x+j_coord_offsetD,
2215 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2217 /* Calculate displacement vector */
2218 dx00 = _mm256_sub_pd(ix0,jx0);
2219 dy00 = _mm256_sub_pd(iy0,jy0);
2220 dz00 = _mm256_sub_pd(iz0,jz0);
2221 dx01 = _mm256_sub_pd(ix0,jx1);
2222 dy01 = _mm256_sub_pd(iy0,jy1);
2223 dz01 = _mm256_sub_pd(iz0,jz1);
2224 dx02 = _mm256_sub_pd(ix0,jx2);
2225 dy02 = _mm256_sub_pd(iy0,jy2);
2226 dz02 = _mm256_sub_pd(iz0,jz2);
2227 dx10 = _mm256_sub_pd(ix1,jx0);
2228 dy10 = _mm256_sub_pd(iy1,jy0);
2229 dz10 = _mm256_sub_pd(iz1,jz0);
2230 dx11 = _mm256_sub_pd(ix1,jx1);
2231 dy11 = _mm256_sub_pd(iy1,jy1);
2232 dz11 = _mm256_sub_pd(iz1,jz1);
2233 dx12 = _mm256_sub_pd(ix1,jx2);
2234 dy12 = _mm256_sub_pd(iy1,jy2);
2235 dz12 = _mm256_sub_pd(iz1,jz2);
2236 dx20 = _mm256_sub_pd(ix2,jx0);
2237 dy20 = _mm256_sub_pd(iy2,jy0);
2238 dz20 = _mm256_sub_pd(iz2,jz0);
2239 dx21 = _mm256_sub_pd(ix2,jx1);
2240 dy21 = _mm256_sub_pd(iy2,jy1);
2241 dz21 = _mm256_sub_pd(iz2,jz1);
2242 dx22 = _mm256_sub_pd(ix2,jx2);
2243 dy22 = _mm256_sub_pd(iy2,jy2);
2244 dz22 = _mm256_sub_pd(iz2,jz2);
2246 /* Calculate squared distance and things based on it */
2247 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2248 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2249 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2250 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2251 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2252 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2253 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2254 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2255 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2257 rinv00 = avx256_invsqrt_d(rsq00);
2258 rinv01 = avx256_invsqrt_d(rsq01);
2259 rinv02 = avx256_invsqrt_d(rsq02);
2260 rinv10 = avx256_invsqrt_d(rsq10);
2261 rinv11 = avx256_invsqrt_d(rsq11);
2262 rinv12 = avx256_invsqrt_d(rsq12);
2263 rinv20 = avx256_invsqrt_d(rsq20);
2264 rinv21 = avx256_invsqrt_d(rsq21);
2265 rinv22 = avx256_invsqrt_d(rsq22);
2267 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2268 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2269 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2270 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2271 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2272 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2273 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2274 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2275 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2277 fjx0 = _mm256_setzero_pd();
2278 fjy0 = _mm256_setzero_pd();
2279 fjz0 = _mm256_setzero_pd();
2280 fjx1 = _mm256_setzero_pd();
2281 fjy1 = _mm256_setzero_pd();
2282 fjz1 = _mm256_setzero_pd();
2283 fjx2 = _mm256_setzero_pd();
2284 fjy2 = _mm256_setzero_pd();
2285 fjz2 = _mm256_setzero_pd();
2287 /**************************
2288 * CALCULATE INTERACTIONS *
2289 **************************/
2291 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2294 r00 = _mm256_mul_pd(rsq00,rinv00);
2295 r00 = _mm256_andnot_pd(dummy_mask,r00);
2297 /* EWALD ELECTROSTATICS */
2299 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2300 ewrt = _mm256_mul_pd(r00,ewtabscale);
2301 ewitab = _mm256_cvttpd_epi32(ewrt);
2302 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2303 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2304 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2306 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2307 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2309 /* Analytical LJ-PME */
2310 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2311 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
2312 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
2313 exponent = avx256_exp_d(ewcljrsq);
2314 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2315 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2316 /* f6A = 6 * C6grid * (1 - poly) */
2317 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
2318 /* f6B = C6grid * exponent * beta^6 */
2319 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
2320 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2321 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2323 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2325 fscal = _mm256_add_pd(felec,fvdw);
2327 fscal = _mm256_and_pd(fscal,cutoff_mask);
2329 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2331 /* Calculate temporary vectorial force */
2332 tx = _mm256_mul_pd(fscal,dx00);
2333 ty = _mm256_mul_pd(fscal,dy00);
2334 tz = _mm256_mul_pd(fscal,dz00);
2336 /* Update vectorial force */
2337 fix0 = _mm256_add_pd(fix0,tx);
2338 fiy0 = _mm256_add_pd(fiy0,ty);
2339 fiz0 = _mm256_add_pd(fiz0,tz);
2341 fjx0 = _mm256_add_pd(fjx0,tx);
2342 fjy0 = _mm256_add_pd(fjy0,ty);
2343 fjz0 = _mm256_add_pd(fjz0,tz);
2347 /**************************
2348 * CALCULATE INTERACTIONS *
2349 **************************/
2351 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2354 r01 = _mm256_mul_pd(rsq01,rinv01);
2355 r01 = _mm256_andnot_pd(dummy_mask,r01);
2357 /* EWALD ELECTROSTATICS */
2359 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2360 ewrt = _mm256_mul_pd(r01,ewtabscale);
2361 ewitab = _mm256_cvttpd_epi32(ewrt);
2362 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2363 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2364 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2366 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2367 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2369 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2373 fscal = _mm256_and_pd(fscal,cutoff_mask);
2375 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2377 /* Calculate temporary vectorial force */
2378 tx = _mm256_mul_pd(fscal,dx01);
2379 ty = _mm256_mul_pd(fscal,dy01);
2380 tz = _mm256_mul_pd(fscal,dz01);
2382 /* Update vectorial force */
2383 fix0 = _mm256_add_pd(fix0,tx);
2384 fiy0 = _mm256_add_pd(fiy0,ty);
2385 fiz0 = _mm256_add_pd(fiz0,tz);
2387 fjx1 = _mm256_add_pd(fjx1,tx);
2388 fjy1 = _mm256_add_pd(fjy1,ty);
2389 fjz1 = _mm256_add_pd(fjz1,tz);
2393 /**************************
2394 * CALCULATE INTERACTIONS *
2395 **************************/
2397 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2400 r02 = _mm256_mul_pd(rsq02,rinv02);
2401 r02 = _mm256_andnot_pd(dummy_mask,r02);
2403 /* EWALD ELECTROSTATICS */
2405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2406 ewrt = _mm256_mul_pd(r02,ewtabscale);
2407 ewitab = _mm256_cvttpd_epi32(ewrt);
2408 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2409 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2410 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2412 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2413 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2415 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2419 fscal = _mm256_and_pd(fscal,cutoff_mask);
2421 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2423 /* Calculate temporary vectorial force */
2424 tx = _mm256_mul_pd(fscal,dx02);
2425 ty = _mm256_mul_pd(fscal,dy02);
2426 tz = _mm256_mul_pd(fscal,dz02);
2428 /* Update vectorial force */
2429 fix0 = _mm256_add_pd(fix0,tx);
2430 fiy0 = _mm256_add_pd(fiy0,ty);
2431 fiz0 = _mm256_add_pd(fiz0,tz);
2433 fjx2 = _mm256_add_pd(fjx2,tx);
2434 fjy2 = _mm256_add_pd(fjy2,ty);
2435 fjz2 = _mm256_add_pd(fjz2,tz);
2439 /**************************
2440 * CALCULATE INTERACTIONS *
2441 **************************/
2443 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2446 r10 = _mm256_mul_pd(rsq10,rinv10);
2447 r10 = _mm256_andnot_pd(dummy_mask,r10);
2449 /* EWALD ELECTROSTATICS */
2451 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2452 ewrt = _mm256_mul_pd(r10,ewtabscale);
2453 ewitab = _mm256_cvttpd_epi32(ewrt);
2454 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2455 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2456 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2458 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2459 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2461 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2465 fscal = _mm256_and_pd(fscal,cutoff_mask);
2467 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2469 /* Calculate temporary vectorial force */
2470 tx = _mm256_mul_pd(fscal,dx10);
2471 ty = _mm256_mul_pd(fscal,dy10);
2472 tz = _mm256_mul_pd(fscal,dz10);
2474 /* Update vectorial force */
2475 fix1 = _mm256_add_pd(fix1,tx);
2476 fiy1 = _mm256_add_pd(fiy1,ty);
2477 fiz1 = _mm256_add_pd(fiz1,tz);
2479 fjx0 = _mm256_add_pd(fjx0,tx);
2480 fjy0 = _mm256_add_pd(fjy0,ty);
2481 fjz0 = _mm256_add_pd(fjz0,tz);
2485 /**************************
2486 * CALCULATE INTERACTIONS *
2487 **************************/
2489 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2492 r11 = _mm256_mul_pd(rsq11,rinv11);
2493 r11 = _mm256_andnot_pd(dummy_mask,r11);
2495 /* EWALD ELECTROSTATICS */
2497 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2498 ewrt = _mm256_mul_pd(r11,ewtabscale);
2499 ewitab = _mm256_cvttpd_epi32(ewrt);
2500 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2501 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2502 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2504 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2505 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2507 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2511 fscal = _mm256_and_pd(fscal,cutoff_mask);
2513 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2515 /* Calculate temporary vectorial force */
2516 tx = _mm256_mul_pd(fscal,dx11);
2517 ty = _mm256_mul_pd(fscal,dy11);
2518 tz = _mm256_mul_pd(fscal,dz11);
2520 /* Update vectorial force */
2521 fix1 = _mm256_add_pd(fix1,tx);
2522 fiy1 = _mm256_add_pd(fiy1,ty);
2523 fiz1 = _mm256_add_pd(fiz1,tz);
2525 fjx1 = _mm256_add_pd(fjx1,tx);
2526 fjy1 = _mm256_add_pd(fjy1,ty);
2527 fjz1 = _mm256_add_pd(fjz1,tz);
2531 /**************************
2532 * CALCULATE INTERACTIONS *
2533 **************************/
2535 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2538 r12 = _mm256_mul_pd(rsq12,rinv12);
2539 r12 = _mm256_andnot_pd(dummy_mask,r12);
2541 /* EWALD ELECTROSTATICS */
2543 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2544 ewrt = _mm256_mul_pd(r12,ewtabscale);
2545 ewitab = _mm256_cvttpd_epi32(ewrt);
2546 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2547 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2548 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2550 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2551 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2553 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2557 fscal = _mm256_and_pd(fscal,cutoff_mask);
2559 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2561 /* Calculate temporary vectorial force */
2562 tx = _mm256_mul_pd(fscal,dx12);
2563 ty = _mm256_mul_pd(fscal,dy12);
2564 tz = _mm256_mul_pd(fscal,dz12);
2566 /* Update vectorial force */
2567 fix1 = _mm256_add_pd(fix1,tx);
2568 fiy1 = _mm256_add_pd(fiy1,ty);
2569 fiz1 = _mm256_add_pd(fiz1,tz);
2571 fjx2 = _mm256_add_pd(fjx2,tx);
2572 fjy2 = _mm256_add_pd(fjy2,ty);
2573 fjz2 = _mm256_add_pd(fjz2,tz);
2577 /**************************
2578 * CALCULATE INTERACTIONS *
2579 **************************/
2581 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2584 r20 = _mm256_mul_pd(rsq20,rinv20);
2585 r20 = _mm256_andnot_pd(dummy_mask,r20);
2587 /* EWALD ELECTROSTATICS */
2589 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2590 ewrt = _mm256_mul_pd(r20,ewtabscale);
2591 ewitab = _mm256_cvttpd_epi32(ewrt);
2592 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2593 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2594 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2596 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2597 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2599 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2603 fscal = _mm256_and_pd(fscal,cutoff_mask);
2605 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2607 /* Calculate temporary vectorial force */
2608 tx = _mm256_mul_pd(fscal,dx20);
2609 ty = _mm256_mul_pd(fscal,dy20);
2610 tz = _mm256_mul_pd(fscal,dz20);
2612 /* Update vectorial force */
2613 fix2 = _mm256_add_pd(fix2,tx);
2614 fiy2 = _mm256_add_pd(fiy2,ty);
2615 fiz2 = _mm256_add_pd(fiz2,tz);
2617 fjx0 = _mm256_add_pd(fjx0,tx);
2618 fjy0 = _mm256_add_pd(fjy0,ty);
2619 fjz0 = _mm256_add_pd(fjz0,tz);
2623 /**************************
2624 * CALCULATE INTERACTIONS *
2625 **************************/
2627 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2630 r21 = _mm256_mul_pd(rsq21,rinv21);
2631 r21 = _mm256_andnot_pd(dummy_mask,r21);
2633 /* EWALD ELECTROSTATICS */
2635 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2636 ewrt = _mm256_mul_pd(r21,ewtabscale);
2637 ewitab = _mm256_cvttpd_epi32(ewrt);
2638 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2639 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2640 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2642 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2643 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2645 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2649 fscal = _mm256_and_pd(fscal,cutoff_mask);
2651 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2653 /* Calculate temporary vectorial force */
2654 tx = _mm256_mul_pd(fscal,dx21);
2655 ty = _mm256_mul_pd(fscal,dy21);
2656 tz = _mm256_mul_pd(fscal,dz21);
2658 /* Update vectorial force */
2659 fix2 = _mm256_add_pd(fix2,tx);
2660 fiy2 = _mm256_add_pd(fiy2,ty);
2661 fiz2 = _mm256_add_pd(fiz2,tz);
2663 fjx1 = _mm256_add_pd(fjx1,tx);
2664 fjy1 = _mm256_add_pd(fjy1,ty);
2665 fjz1 = _mm256_add_pd(fjz1,tz);
2669 /**************************
2670 * CALCULATE INTERACTIONS *
2671 **************************/
2673 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2676 r22 = _mm256_mul_pd(rsq22,rinv22);
2677 r22 = _mm256_andnot_pd(dummy_mask,r22);
2679 /* EWALD ELECTROSTATICS */
2681 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2682 ewrt = _mm256_mul_pd(r22,ewtabscale);
2683 ewitab = _mm256_cvttpd_epi32(ewrt);
2684 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2685 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2686 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2688 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2689 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2691 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2695 fscal = _mm256_and_pd(fscal,cutoff_mask);
2697 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2699 /* Calculate temporary vectorial force */
2700 tx = _mm256_mul_pd(fscal,dx22);
2701 ty = _mm256_mul_pd(fscal,dy22);
2702 tz = _mm256_mul_pd(fscal,dz22);
2704 /* Update vectorial force */
2705 fix2 = _mm256_add_pd(fix2,tx);
2706 fiy2 = _mm256_add_pd(fiy2,ty);
2707 fiz2 = _mm256_add_pd(fiz2,tz);
2709 fjx2 = _mm256_add_pd(fjx2,tx);
2710 fjy2 = _mm256_add_pd(fjy2,ty);
2711 fjz2 = _mm256_add_pd(fjz2,tz);
2715 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2716 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2717 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2718 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2720 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2721 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2723 /* Inner loop uses 383 flops */
2726 /* End of innermost loop */
2728 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2729 f+i_coord_offset,fshift+i_shift_offset);
2731 /* Increment number of inner iterations */
2732 inneriter += j_index_end - j_index_start;
2734 /* Outer loop uses 18 flops */
2737 /* Increment number of outer iterations */
2740 /* Update outer/inner flops */
2742 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*383);