2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 real * vdwgridioffsetptr0;
86 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 real * vdwioffsetptr1;
88 real * vdwgridioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 real * vdwgridioffsetptr2;
92 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
101 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
102 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
103 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
106 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
111 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
114 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
115 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
126 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
127 __m256d one_half = _mm256_set1_pd(0.5);
128 __m256d minus_one = _mm256_set1_pd(-1.0);
130 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
131 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
133 __m256d dummy_mask,cutoff_mask;
134 __m128 tmpmask0,tmpmask1;
135 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
136 __m256d one = _mm256_set1_pd(1.0);
137 __m256d two = _mm256_set1_pd(2.0);
143 jindex = nlist->jindex;
145 shiftidx = nlist->shift;
147 shiftvec = fr->shift_vec[0];
148 fshift = fr->fshift[0];
149 facel = _mm256_set1_pd(fr->epsfac);
150 charge = mdatoms->chargeA;
151 nvdwtype = fr->ntype;
153 vdwtype = mdatoms->typeA;
154 vdwgridparam = fr->ljpme_c6grid;
155 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
156 ewclj = _mm256_set1_pd(fr->ewaldcoeff_lj);
157 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
159 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
160 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
161 beta2 = _mm256_mul_pd(beta,beta);
162 beta3 = _mm256_mul_pd(beta,beta2);
164 ewtab = fr->ic->tabq_coul_FDV0;
165 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
166 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
168 /* Setup water-specific parameters */
169 inr = nlist->iinr[0];
170 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
171 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
172 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
173 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
174 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
176 jq0 = _mm256_set1_pd(charge[inr+0]);
177 jq1 = _mm256_set1_pd(charge[inr+1]);
178 jq2 = _mm256_set1_pd(charge[inr+2]);
179 vdwjidx0A = 2*vdwtype[inr+0];
180 qq00 = _mm256_mul_pd(iq0,jq0);
181 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
182 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
183 c6grid_00 = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
184 qq01 = _mm256_mul_pd(iq0,jq1);
185 qq02 = _mm256_mul_pd(iq0,jq2);
186 qq10 = _mm256_mul_pd(iq1,jq0);
187 qq11 = _mm256_mul_pd(iq1,jq1);
188 qq12 = _mm256_mul_pd(iq1,jq2);
189 qq20 = _mm256_mul_pd(iq2,jq0);
190 qq21 = _mm256_mul_pd(iq2,jq1);
191 qq22 = _mm256_mul_pd(iq2,jq2);
193 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
194 rcutoff_scalar = fr->rcoulomb;
195 rcutoff = _mm256_set1_pd(rcutoff_scalar);
196 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
198 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
199 rvdw = _mm256_set1_pd(fr->rvdw);
201 /* Avoid stupid compiler warnings */
202 jnrA = jnrB = jnrC = jnrD = 0;
211 for(iidx=0;iidx<4*DIM;iidx++)
216 /* Start outer loop over neighborlists */
217 for(iidx=0; iidx<nri; iidx++)
219 /* Load shift vector for this list */
220 i_shift_offset = DIM*shiftidx[iidx];
222 /* Load limits for loop over neighbors */
223 j_index_start = jindex[iidx];
224 j_index_end = jindex[iidx+1];
226 /* Get outer coordinate index */
228 i_coord_offset = DIM*inr;
230 /* Load i particle coords and add shift vector */
231 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
232 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
234 fix0 = _mm256_setzero_pd();
235 fiy0 = _mm256_setzero_pd();
236 fiz0 = _mm256_setzero_pd();
237 fix1 = _mm256_setzero_pd();
238 fiy1 = _mm256_setzero_pd();
239 fiz1 = _mm256_setzero_pd();
240 fix2 = _mm256_setzero_pd();
241 fiy2 = _mm256_setzero_pd();
242 fiz2 = _mm256_setzero_pd();
244 /* Reset potential sums */
245 velecsum = _mm256_setzero_pd();
246 vvdwsum = _mm256_setzero_pd();
248 /* Start inner kernel loop */
249 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
252 /* Get j neighbor index, and coordinate index */
257 j_coord_offsetA = DIM*jnrA;
258 j_coord_offsetB = DIM*jnrB;
259 j_coord_offsetC = DIM*jnrC;
260 j_coord_offsetD = DIM*jnrD;
262 /* load j atom coordinates */
263 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
264 x+j_coord_offsetC,x+j_coord_offsetD,
265 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
267 /* Calculate displacement vector */
268 dx00 = _mm256_sub_pd(ix0,jx0);
269 dy00 = _mm256_sub_pd(iy0,jy0);
270 dz00 = _mm256_sub_pd(iz0,jz0);
271 dx01 = _mm256_sub_pd(ix0,jx1);
272 dy01 = _mm256_sub_pd(iy0,jy1);
273 dz01 = _mm256_sub_pd(iz0,jz1);
274 dx02 = _mm256_sub_pd(ix0,jx2);
275 dy02 = _mm256_sub_pd(iy0,jy2);
276 dz02 = _mm256_sub_pd(iz0,jz2);
277 dx10 = _mm256_sub_pd(ix1,jx0);
278 dy10 = _mm256_sub_pd(iy1,jy0);
279 dz10 = _mm256_sub_pd(iz1,jz0);
280 dx11 = _mm256_sub_pd(ix1,jx1);
281 dy11 = _mm256_sub_pd(iy1,jy1);
282 dz11 = _mm256_sub_pd(iz1,jz1);
283 dx12 = _mm256_sub_pd(ix1,jx2);
284 dy12 = _mm256_sub_pd(iy1,jy2);
285 dz12 = _mm256_sub_pd(iz1,jz2);
286 dx20 = _mm256_sub_pd(ix2,jx0);
287 dy20 = _mm256_sub_pd(iy2,jy0);
288 dz20 = _mm256_sub_pd(iz2,jz0);
289 dx21 = _mm256_sub_pd(ix2,jx1);
290 dy21 = _mm256_sub_pd(iy2,jy1);
291 dz21 = _mm256_sub_pd(iz2,jz1);
292 dx22 = _mm256_sub_pd(ix2,jx2);
293 dy22 = _mm256_sub_pd(iy2,jy2);
294 dz22 = _mm256_sub_pd(iz2,jz2);
296 /* Calculate squared distance and things based on it */
297 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
298 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
299 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
300 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
301 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
302 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
303 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
304 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
305 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
307 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
308 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
309 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
310 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
311 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
312 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
313 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
314 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
315 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
317 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
318 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
319 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
320 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
321 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
322 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
323 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
324 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
325 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
327 fjx0 = _mm256_setzero_pd();
328 fjy0 = _mm256_setzero_pd();
329 fjz0 = _mm256_setzero_pd();
330 fjx1 = _mm256_setzero_pd();
331 fjy1 = _mm256_setzero_pd();
332 fjz1 = _mm256_setzero_pd();
333 fjx2 = _mm256_setzero_pd();
334 fjy2 = _mm256_setzero_pd();
335 fjz2 = _mm256_setzero_pd();
337 /**************************
338 * CALCULATE INTERACTIONS *
339 **************************/
341 if (gmx_mm256_any_lt(rsq00,rcutoff2))
344 r00 = _mm256_mul_pd(rsq00,rinv00);
346 /* EWALD ELECTROSTATICS */
348 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
349 ewrt = _mm256_mul_pd(r00,ewtabscale);
350 ewitab = _mm256_cvttpd_epi32(ewrt);
351 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
352 ewitab = _mm_slli_epi32(ewitab,2);
353 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
354 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
355 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
356 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
357 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
358 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
359 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
360 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
361 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
363 /* Analytical LJ-PME */
364 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
365 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
366 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
367 exponent = gmx_simd_exp_d(ewcljrsq);
368 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
369 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
370 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
371 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
372 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
373 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
374 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
375 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
376 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
378 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
380 /* Update potential sum for this i atom from the interaction with this j atom. */
381 velec = _mm256_and_pd(velec,cutoff_mask);
382 velecsum = _mm256_add_pd(velecsum,velec);
383 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
384 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
386 fscal = _mm256_add_pd(felec,fvdw);
388 fscal = _mm256_and_pd(fscal,cutoff_mask);
390 /* Calculate temporary vectorial force */
391 tx = _mm256_mul_pd(fscal,dx00);
392 ty = _mm256_mul_pd(fscal,dy00);
393 tz = _mm256_mul_pd(fscal,dz00);
395 /* Update vectorial force */
396 fix0 = _mm256_add_pd(fix0,tx);
397 fiy0 = _mm256_add_pd(fiy0,ty);
398 fiz0 = _mm256_add_pd(fiz0,tz);
400 fjx0 = _mm256_add_pd(fjx0,tx);
401 fjy0 = _mm256_add_pd(fjy0,ty);
402 fjz0 = _mm256_add_pd(fjz0,tz);
406 /**************************
407 * CALCULATE INTERACTIONS *
408 **************************/
410 if (gmx_mm256_any_lt(rsq01,rcutoff2))
413 r01 = _mm256_mul_pd(rsq01,rinv01);
415 /* EWALD ELECTROSTATICS */
417 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
418 ewrt = _mm256_mul_pd(r01,ewtabscale);
419 ewitab = _mm256_cvttpd_epi32(ewrt);
420 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
421 ewitab = _mm_slli_epi32(ewitab,2);
422 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
423 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
424 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
425 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
426 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
427 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
428 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
429 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
430 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
432 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velec = _mm256_and_pd(velec,cutoff_mask);
436 velecsum = _mm256_add_pd(velecsum,velec);
440 fscal = _mm256_and_pd(fscal,cutoff_mask);
442 /* Calculate temporary vectorial force */
443 tx = _mm256_mul_pd(fscal,dx01);
444 ty = _mm256_mul_pd(fscal,dy01);
445 tz = _mm256_mul_pd(fscal,dz01);
447 /* Update vectorial force */
448 fix0 = _mm256_add_pd(fix0,tx);
449 fiy0 = _mm256_add_pd(fiy0,ty);
450 fiz0 = _mm256_add_pd(fiz0,tz);
452 fjx1 = _mm256_add_pd(fjx1,tx);
453 fjy1 = _mm256_add_pd(fjy1,ty);
454 fjz1 = _mm256_add_pd(fjz1,tz);
458 /**************************
459 * CALCULATE INTERACTIONS *
460 **************************/
462 if (gmx_mm256_any_lt(rsq02,rcutoff2))
465 r02 = _mm256_mul_pd(rsq02,rinv02);
467 /* EWALD ELECTROSTATICS */
469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
470 ewrt = _mm256_mul_pd(r02,ewtabscale);
471 ewitab = _mm256_cvttpd_epi32(ewrt);
472 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
473 ewitab = _mm_slli_epi32(ewitab,2);
474 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
475 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
476 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
477 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
478 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
479 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
480 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
481 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
482 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
484 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
486 /* Update potential sum for this i atom from the interaction with this j atom. */
487 velec = _mm256_and_pd(velec,cutoff_mask);
488 velecsum = _mm256_add_pd(velecsum,velec);
492 fscal = _mm256_and_pd(fscal,cutoff_mask);
494 /* Calculate temporary vectorial force */
495 tx = _mm256_mul_pd(fscal,dx02);
496 ty = _mm256_mul_pd(fscal,dy02);
497 tz = _mm256_mul_pd(fscal,dz02);
499 /* Update vectorial force */
500 fix0 = _mm256_add_pd(fix0,tx);
501 fiy0 = _mm256_add_pd(fiy0,ty);
502 fiz0 = _mm256_add_pd(fiz0,tz);
504 fjx2 = _mm256_add_pd(fjx2,tx);
505 fjy2 = _mm256_add_pd(fjy2,ty);
506 fjz2 = _mm256_add_pd(fjz2,tz);
510 /**************************
511 * CALCULATE INTERACTIONS *
512 **************************/
514 if (gmx_mm256_any_lt(rsq10,rcutoff2))
517 r10 = _mm256_mul_pd(rsq10,rinv10);
519 /* EWALD ELECTROSTATICS */
521 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
522 ewrt = _mm256_mul_pd(r10,ewtabscale);
523 ewitab = _mm256_cvttpd_epi32(ewrt);
524 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
525 ewitab = _mm_slli_epi32(ewitab,2);
526 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
527 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
528 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
529 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
530 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
531 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
532 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
533 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
534 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
536 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velec = _mm256_and_pd(velec,cutoff_mask);
540 velecsum = _mm256_add_pd(velecsum,velec);
544 fscal = _mm256_and_pd(fscal,cutoff_mask);
546 /* Calculate temporary vectorial force */
547 tx = _mm256_mul_pd(fscal,dx10);
548 ty = _mm256_mul_pd(fscal,dy10);
549 tz = _mm256_mul_pd(fscal,dz10);
551 /* Update vectorial force */
552 fix1 = _mm256_add_pd(fix1,tx);
553 fiy1 = _mm256_add_pd(fiy1,ty);
554 fiz1 = _mm256_add_pd(fiz1,tz);
556 fjx0 = _mm256_add_pd(fjx0,tx);
557 fjy0 = _mm256_add_pd(fjy0,ty);
558 fjz0 = _mm256_add_pd(fjz0,tz);
562 /**************************
563 * CALCULATE INTERACTIONS *
564 **************************/
566 if (gmx_mm256_any_lt(rsq11,rcutoff2))
569 r11 = _mm256_mul_pd(rsq11,rinv11);
571 /* EWALD ELECTROSTATICS */
573 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
574 ewrt = _mm256_mul_pd(r11,ewtabscale);
575 ewitab = _mm256_cvttpd_epi32(ewrt);
576 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
577 ewitab = _mm_slli_epi32(ewitab,2);
578 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
579 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
580 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
581 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
582 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
583 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
584 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
585 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
586 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
588 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
590 /* Update potential sum for this i atom from the interaction with this j atom. */
591 velec = _mm256_and_pd(velec,cutoff_mask);
592 velecsum = _mm256_add_pd(velecsum,velec);
596 fscal = _mm256_and_pd(fscal,cutoff_mask);
598 /* Calculate temporary vectorial force */
599 tx = _mm256_mul_pd(fscal,dx11);
600 ty = _mm256_mul_pd(fscal,dy11);
601 tz = _mm256_mul_pd(fscal,dz11);
603 /* Update vectorial force */
604 fix1 = _mm256_add_pd(fix1,tx);
605 fiy1 = _mm256_add_pd(fiy1,ty);
606 fiz1 = _mm256_add_pd(fiz1,tz);
608 fjx1 = _mm256_add_pd(fjx1,tx);
609 fjy1 = _mm256_add_pd(fjy1,ty);
610 fjz1 = _mm256_add_pd(fjz1,tz);
614 /**************************
615 * CALCULATE INTERACTIONS *
616 **************************/
618 if (gmx_mm256_any_lt(rsq12,rcutoff2))
621 r12 = _mm256_mul_pd(rsq12,rinv12);
623 /* EWALD ELECTROSTATICS */
625 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
626 ewrt = _mm256_mul_pd(r12,ewtabscale);
627 ewitab = _mm256_cvttpd_epi32(ewrt);
628 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
629 ewitab = _mm_slli_epi32(ewitab,2);
630 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
631 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
632 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
633 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
634 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
635 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
636 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
637 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
638 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
640 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
642 /* Update potential sum for this i atom from the interaction with this j atom. */
643 velec = _mm256_and_pd(velec,cutoff_mask);
644 velecsum = _mm256_add_pd(velecsum,velec);
648 fscal = _mm256_and_pd(fscal,cutoff_mask);
650 /* Calculate temporary vectorial force */
651 tx = _mm256_mul_pd(fscal,dx12);
652 ty = _mm256_mul_pd(fscal,dy12);
653 tz = _mm256_mul_pd(fscal,dz12);
655 /* Update vectorial force */
656 fix1 = _mm256_add_pd(fix1,tx);
657 fiy1 = _mm256_add_pd(fiy1,ty);
658 fiz1 = _mm256_add_pd(fiz1,tz);
660 fjx2 = _mm256_add_pd(fjx2,tx);
661 fjy2 = _mm256_add_pd(fjy2,ty);
662 fjz2 = _mm256_add_pd(fjz2,tz);
666 /**************************
667 * CALCULATE INTERACTIONS *
668 **************************/
670 if (gmx_mm256_any_lt(rsq20,rcutoff2))
673 r20 = _mm256_mul_pd(rsq20,rinv20);
675 /* EWALD ELECTROSTATICS */
677 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
678 ewrt = _mm256_mul_pd(r20,ewtabscale);
679 ewitab = _mm256_cvttpd_epi32(ewrt);
680 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
681 ewitab = _mm_slli_epi32(ewitab,2);
682 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
683 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
684 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
685 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
686 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
687 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
688 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
689 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
690 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
692 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velec = _mm256_and_pd(velec,cutoff_mask);
696 velecsum = _mm256_add_pd(velecsum,velec);
700 fscal = _mm256_and_pd(fscal,cutoff_mask);
702 /* Calculate temporary vectorial force */
703 tx = _mm256_mul_pd(fscal,dx20);
704 ty = _mm256_mul_pd(fscal,dy20);
705 tz = _mm256_mul_pd(fscal,dz20);
707 /* Update vectorial force */
708 fix2 = _mm256_add_pd(fix2,tx);
709 fiy2 = _mm256_add_pd(fiy2,ty);
710 fiz2 = _mm256_add_pd(fiz2,tz);
712 fjx0 = _mm256_add_pd(fjx0,tx);
713 fjy0 = _mm256_add_pd(fjy0,ty);
714 fjz0 = _mm256_add_pd(fjz0,tz);
718 /**************************
719 * CALCULATE INTERACTIONS *
720 **************************/
722 if (gmx_mm256_any_lt(rsq21,rcutoff2))
725 r21 = _mm256_mul_pd(rsq21,rinv21);
727 /* EWALD ELECTROSTATICS */
729 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
730 ewrt = _mm256_mul_pd(r21,ewtabscale);
731 ewitab = _mm256_cvttpd_epi32(ewrt);
732 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
733 ewitab = _mm_slli_epi32(ewitab,2);
734 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
735 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
736 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
737 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
738 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
739 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
740 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
741 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
742 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
744 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
746 /* Update potential sum for this i atom from the interaction with this j atom. */
747 velec = _mm256_and_pd(velec,cutoff_mask);
748 velecsum = _mm256_add_pd(velecsum,velec);
752 fscal = _mm256_and_pd(fscal,cutoff_mask);
754 /* Calculate temporary vectorial force */
755 tx = _mm256_mul_pd(fscal,dx21);
756 ty = _mm256_mul_pd(fscal,dy21);
757 tz = _mm256_mul_pd(fscal,dz21);
759 /* Update vectorial force */
760 fix2 = _mm256_add_pd(fix2,tx);
761 fiy2 = _mm256_add_pd(fiy2,ty);
762 fiz2 = _mm256_add_pd(fiz2,tz);
764 fjx1 = _mm256_add_pd(fjx1,tx);
765 fjy1 = _mm256_add_pd(fjy1,ty);
766 fjz1 = _mm256_add_pd(fjz1,tz);
770 /**************************
771 * CALCULATE INTERACTIONS *
772 **************************/
774 if (gmx_mm256_any_lt(rsq22,rcutoff2))
777 r22 = _mm256_mul_pd(rsq22,rinv22);
779 /* EWALD ELECTROSTATICS */
781 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
782 ewrt = _mm256_mul_pd(r22,ewtabscale);
783 ewitab = _mm256_cvttpd_epi32(ewrt);
784 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
785 ewitab = _mm_slli_epi32(ewitab,2);
786 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
787 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
788 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
789 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
790 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
791 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
792 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
793 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
794 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
796 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
798 /* Update potential sum for this i atom from the interaction with this j atom. */
799 velec = _mm256_and_pd(velec,cutoff_mask);
800 velecsum = _mm256_add_pd(velecsum,velec);
804 fscal = _mm256_and_pd(fscal,cutoff_mask);
806 /* Calculate temporary vectorial force */
807 tx = _mm256_mul_pd(fscal,dx22);
808 ty = _mm256_mul_pd(fscal,dy22);
809 tz = _mm256_mul_pd(fscal,dz22);
811 /* Update vectorial force */
812 fix2 = _mm256_add_pd(fix2,tx);
813 fiy2 = _mm256_add_pd(fiy2,ty);
814 fiz2 = _mm256_add_pd(fiz2,tz);
816 fjx2 = _mm256_add_pd(fjx2,tx);
817 fjy2 = _mm256_add_pd(fjy2,ty);
818 fjz2 = _mm256_add_pd(fjz2,tz);
822 fjptrA = f+j_coord_offsetA;
823 fjptrB = f+j_coord_offsetB;
824 fjptrC = f+j_coord_offsetC;
825 fjptrD = f+j_coord_offsetD;
827 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
828 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
830 /* Inner loop uses 450 flops */
836 /* Get j neighbor index, and coordinate index */
837 jnrlistA = jjnr[jidx];
838 jnrlistB = jjnr[jidx+1];
839 jnrlistC = jjnr[jidx+2];
840 jnrlistD = jjnr[jidx+3];
841 /* Sign of each element will be negative for non-real atoms.
842 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
843 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
845 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
847 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
848 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
849 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
851 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
852 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
853 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
854 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
855 j_coord_offsetA = DIM*jnrA;
856 j_coord_offsetB = DIM*jnrB;
857 j_coord_offsetC = DIM*jnrC;
858 j_coord_offsetD = DIM*jnrD;
860 /* load j atom coordinates */
861 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
862 x+j_coord_offsetC,x+j_coord_offsetD,
863 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
865 /* Calculate displacement vector */
866 dx00 = _mm256_sub_pd(ix0,jx0);
867 dy00 = _mm256_sub_pd(iy0,jy0);
868 dz00 = _mm256_sub_pd(iz0,jz0);
869 dx01 = _mm256_sub_pd(ix0,jx1);
870 dy01 = _mm256_sub_pd(iy0,jy1);
871 dz01 = _mm256_sub_pd(iz0,jz1);
872 dx02 = _mm256_sub_pd(ix0,jx2);
873 dy02 = _mm256_sub_pd(iy0,jy2);
874 dz02 = _mm256_sub_pd(iz0,jz2);
875 dx10 = _mm256_sub_pd(ix1,jx0);
876 dy10 = _mm256_sub_pd(iy1,jy0);
877 dz10 = _mm256_sub_pd(iz1,jz0);
878 dx11 = _mm256_sub_pd(ix1,jx1);
879 dy11 = _mm256_sub_pd(iy1,jy1);
880 dz11 = _mm256_sub_pd(iz1,jz1);
881 dx12 = _mm256_sub_pd(ix1,jx2);
882 dy12 = _mm256_sub_pd(iy1,jy2);
883 dz12 = _mm256_sub_pd(iz1,jz2);
884 dx20 = _mm256_sub_pd(ix2,jx0);
885 dy20 = _mm256_sub_pd(iy2,jy0);
886 dz20 = _mm256_sub_pd(iz2,jz0);
887 dx21 = _mm256_sub_pd(ix2,jx1);
888 dy21 = _mm256_sub_pd(iy2,jy1);
889 dz21 = _mm256_sub_pd(iz2,jz1);
890 dx22 = _mm256_sub_pd(ix2,jx2);
891 dy22 = _mm256_sub_pd(iy2,jy2);
892 dz22 = _mm256_sub_pd(iz2,jz2);
894 /* Calculate squared distance and things based on it */
895 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
896 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
897 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
898 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
899 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
900 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
901 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
902 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
903 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
905 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
906 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
907 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
908 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
909 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
910 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
911 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
912 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
913 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
915 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
916 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
917 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
918 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
919 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
920 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
921 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
922 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
923 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
925 fjx0 = _mm256_setzero_pd();
926 fjy0 = _mm256_setzero_pd();
927 fjz0 = _mm256_setzero_pd();
928 fjx1 = _mm256_setzero_pd();
929 fjy1 = _mm256_setzero_pd();
930 fjz1 = _mm256_setzero_pd();
931 fjx2 = _mm256_setzero_pd();
932 fjy2 = _mm256_setzero_pd();
933 fjz2 = _mm256_setzero_pd();
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 if (gmx_mm256_any_lt(rsq00,rcutoff2))
942 r00 = _mm256_mul_pd(rsq00,rinv00);
943 r00 = _mm256_andnot_pd(dummy_mask,r00);
945 /* EWALD ELECTROSTATICS */
947 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
948 ewrt = _mm256_mul_pd(r00,ewtabscale);
949 ewitab = _mm256_cvttpd_epi32(ewrt);
950 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
951 ewitab = _mm_slli_epi32(ewitab,2);
952 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
953 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
954 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
955 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
956 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
957 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
958 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
959 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
960 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
962 /* Analytical LJ-PME */
963 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
964 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
965 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
966 exponent = gmx_simd_exp_d(ewcljrsq);
967 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
968 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
969 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
970 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
971 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
972 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
973 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
974 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
975 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
977 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 velec = _mm256_and_pd(velec,cutoff_mask);
981 velec = _mm256_andnot_pd(dummy_mask,velec);
982 velecsum = _mm256_add_pd(velecsum,velec);
983 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
984 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
985 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
987 fscal = _mm256_add_pd(felec,fvdw);
989 fscal = _mm256_and_pd(fscal,cutoff_mask);
991 fscal = _mm256_andnot_pd(dummy_mask,fscal);
993 /* Calculate temporary vectorial force */
994 tx = _mm256_mul_pd(fscal,dx00);
995 ty = _mm256_mul_pd(fscal,dy00);
996 tz = _mm256_mul_pd(fscal,dz00);
998 /* Update vectorial force */
999 fix0 = _mm256_add_pd(fix0,tx);
1000 fiy0 = _mm256_add_pd(fiy0,ty);
1001 fiz0 = _mm256_add_pd(fiz0,tz);
1003 fjx0 = _mm256_add_pd(fjx0,tx);
1004 fjy0 = _mm256_add_pd(fjy0,ty);
1005 fjz0 = _mm256_add_pd(fjz0,tz);
1009 /**************************
1010 * CALCULATE INTERACTIONS *
1011 **************************/
1013 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1016 r01 = _mm256_mul_pd(rsq01,rinv01);
1017 r01 = _mm256_andnot_pd(dummy_mask,r01);
1019 /* EWALD ELECTROSTATICS */
1021 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1022 ewrt = _mm256_mul_pd(r01,ewtabscale);
1023 ewitab = _mm256_cvttpd_epi32(ewrt);
1024 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1025 ewitab = _mm_slli_epi32(ewitab,2);
1026 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1027 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1028 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1029 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1030 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1031 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1032 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1033 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
1034 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1036 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1038 /* Update potential sum for this i atom from the interaction with this j atom. */
1039 velec = _mm256_and_pd(velec,cutoff_mask);
1040 velec = _mm256_andnot_pd(dummy_mask,velec);
1041 velecsum = _mm256_add_pd(velecsum,velec);
1045 fscal = _mm256_and_pd(fscal,cutoff_mask);
1047 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1049 /* Calculate temporary vectorial force */
1050 tx = _mm256_mul_pd(fscal,dx01);
1051 ty = _mm256_mul_pd(fscal,dy01);
1052 tz = _mm256_mul_pd(fscal,dz01);
1054 /* Update vectorial force */
1055 fix0 = _mm256_add_pd(fix0,tx);
1056 fiy0 = _mm256_add_pd(fiy0,ty);
1057 fiz0 = _mm256_add_pd(fiz0,tz);
1059 fjx1 = _mm256_add_pd(fjx1,tx);
1060 fjy1 = _mm256_add_pd(fjy1,ty);
1061 fjz1 = _mm256_add_pd(fjz1,tz);
1065 /**************************
1066 * CALCULATE INTERACTIONS *
1067 **************************/
1069 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1072 r02 = _mm256_mul_pd(rsq02,rinv02);
1073 r02 = _mm256_andnot_pd(dummy_mask,r02);
1075 /* EWALD ELECTROSTATICS */
1077 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1078 ewrt = _mm256_mul_pd(r02,ewtabscale);
1079 ewitab = _mm256_cvttpd_epi32(ewrt);
1080 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1081 ewitab = _mm_slli_epi32(ewitab,2);
1082 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1083 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1084 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1085 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1086 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1087 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1088 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1089 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
1090 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1092 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1094 /* Update potential sum for this i atom from the interaction with this j atom. */
1095 velec = _mm256_and_pd(velec,cutoff_mask);
1096 velec = _mm256_andnot_pd(dummy_mask,velec);
1097 velecsum = _mm256_add_pd(velecsum,velec);
1101 fscal = _mm256_and_pd(fscal,cutoff_mask);
1103 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1105 /* Calculate temporary vectorial force */
1106 tx = _mm256_mul_pd(fscal,dx02);
1107 ty = _mm256_mul_pd(fscal,dy02);
1108 tz = _mm256_mul_pd(fscal,dz02);
1110 /* Update vectorial force */
1111 fix0 = _mm256_add_pd(fix0,tx);
1112 fiy0 = _mm256_add_pd(fiy0,ty);
1113 fiz0 = _mm256_add_pd(fiz0,tz);
1115 fjx2 = _mm256_add_pd(fjx2,tx);
1116 fjy2 = _mm256_add_pd(fjy2,ty);
1117 fjz2 = _mm256_add_pd(fjz2,tz);
1121 /**************************
1122 * CALCULATE INTERACTIONS *
1123 **************************/
1125 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1128 r10 = _mm256_mul_pd(rsq10,rinv10);
1129 r10 = _mm256_andnot_pd(dummy_mask,r10);
1131 /* EWALD ELECTROSTATICS */
1133 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1134 ewrt = _mm256_mul_pd(r10,ewtabscale);
1135 ewitab = _mm256_cvttpd_epi32(ewrt);
1136 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1137 ewitab = _mm_slli_epi32(ewitab,2);
1138 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1139 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1140 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1141 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1142 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1143 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1144 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1145 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
1146 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1148 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1150 /* Update potential sum for this i atom from the interaction with this j atom. */
1151 velec = _mm256_and_pd(velec,cutoff_mask);
1152 velec = _mm256_andnot_pd(dummy_mask,velec);
1153 velecsum = _mm256_add_pd(velecsum,velec);
1157 fscal = _mm256_and_pd(fscal,cutoff_mask);
1159 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1161 /* Calculate temporary vectorial force */
1162 tx = _mm256_mul_pd(fscal,dx10);
1163 ty = _mm256_mul_pd(fscal,dy10);
1164 tz = _mm256_mul_pd(fscal,dz10);
1166 /* Update vectorial force */
1167 fix1 = _mm256_add_pd(fix1,tx);
1168 fiy1 = _mm256_add_pd(fiy1,ty);
1169 fiz1 = _mm256_add_pd(fiz1,tz);
1171 fjx0 = _mm256_add_pd(fjx0,tx);
1172 fjy0 = _mm256_add_pd(fjy0,ty);
1173 fjz0 = _mm256_add_pd(fjz0,tz);
1177 /**************************
1178 * CALCULATE INTERACTIONS *
1179 **************************/
1181 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1184 r11 = _mm256_mul_pd(rsq11,rinv11);
1185 r11 = _mm256_andnot_pd(dummy_mask,r11);
1187 /* EWALD ELECTROSTATICS */
1189 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1190 ewrt = _mm256_mul_pd(r11,ewtabscale);
1191 ewitab = _mm256_cvttpd_epi32(ewrt);
1192 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1193 ewitab = _mm_slli_epi32(ewitab,2);
1194 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1195 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1196 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1197 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1198 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1199 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1200 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1201 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1202 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1204 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1206 /* Update potential sum for this i atom from the interaction with this j atom. */
1207 velec = _mm256_and_pd(velec,cutoff_mask);
1208 velec = _mm256_andnot_pd(dummy_mask,velec);
1209 velecsum = _mm256_add_pd(velecsum,velec);
1213 fscal = _mm256_and_pd(fscal,cutoff_mask);
1215 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1217 /* Calculate temporary vectorial force */
1218 tx = _mm256_mul_pd(fscal,dx11);
1219 ty = _mm256_mul_pd(fscal,dy11);
1220 tz = _mm256_mul_pd(fscal,dz11);
1222 /* Update vectorial force */
1223 fix1 = _mm256_add_pd(fix1,tx);
1224 fiy1 = _mm256_add_pd(fiy1,ty);
1225 fiz1 = _mm256_add_pd(fiz1,tz);
1227 fjx1 = _mm256_add_pd(fjx1,tx);
1228 fjy1 = _mm256_add_pd(fjy1,ty);
1229 fjz1 = _mm256_add_pd(fjz1,tz);
1233 /**************************
1234 * CALCULATE INTERACTIONS *
1235 **************************/
1237 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1240 r12 = _mm256_mul_pd(rsq12,rinv12);
1241 r12 = _mm256_andnot_pd(dummy_mask,r12);
1243 /* EWALD ELECTROSTATICS */
1245 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1246 ewrt = _mm256_mul_pd(r12,ewtabscale);
1247 ewitab = _mm256_cvttpd_epi32(ewrt);
1248 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1249 ewitab = _mm_slli_epi32(ewitab,2);
1250 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1251 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1252 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1253 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1254 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1255 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1256 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1257 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1258 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1260 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1262 /* Update potential sum for this i atom from the interaction with this j atom. */
1263 velec = _mm256_and_pd(velec,cutoff_mask);
1264 velec = _mm256_andnot_pd(dummy_mask,velec);
1265 velecsum = _mm256_add_pd(velecsum,velec);
1269 fscal = _mm256_and_pd(fscal,cutoff_mask);
1271 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1273 /* Calculate temporary vectorial force */
1274 tx = _mm256_mul_pd(fscal,dx12);
1275 ty = _mm256_mul_pd(fscal,dy12);
1276 tz = _mm256_mul_pd(fscal,dz12);
1278 /* Update vectorial force */
1279 fix1 = _mm256_add_pd(fix1,tx);
1280 fiy1 = _mm256_add_pd(fiy1,ty);
1281 fiz1 = _mm256_add_pd(fiz1,tz);
1283 fjx2 = _mm256_add_pd(fjx2,tx);
1284 fjy2 = _mm256_add_pd(fjy2,ty);
1285 fjz2 = _mm256_add_pd(fjz2,tz);
1289 /**************************
1290 * CALCULATE INTERACTIONS *
1291 **************************/
1293 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1296 r20 = _mm256_mul_pd(rsq20,rinv20);
1297 r20 = _mm256_andnot_pd(dummy_mask,r20);
1299 /* EWALD ELECTROSTATICS */
1301 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1302 ewrt = _mm256_mul_pd(r20,ewtabscale);
1303 ewitab = _mm256_cvttpd_epi32(ewrt);
1304 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1305 ewitab = _mm_slli_epi32(ewitab,2);
1306 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1307 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1308 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1309 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1310 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1311 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1312 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1313 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
1314 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1316 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1318 /* Update potential sum for this i atom from the interaction with this j atom. */
1319 velec = _mm256_and_pd(velec,cutoff_mask);
1320 velec = _mm256_andnot_pd(dummy_mask,velec);
1321 velecsum = _mm256_add_pd(velecsum,velec);
1325 fscal = _mm256_and_pd(fscal,cutoff_mask);
1327 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1329 /* Calculate temporary vectorial force */
1330 tx = _mm256_mul_pd(fscal,dx20);
1331 ty = _mm256_mul_pd(fscal,dy20);
1332 tz = _mm256_mul_pd(fscal,dz20);
1334 /* Update vectorial force */
1335 fix2 = _mm256_add_pd(fix2,tx);
1336 fiy2 = _mm256_add_pd(fiy2,ty);
1337 fiz2 = _mm256_add_pd(fiz2,tz);
1339 fjx0 = _mm256_add_pd(fjx0,tx);
1340 fjy0 = _mm256_add_pd(fjy0,ty);
1341 fjz0 = _mm256_add_pd(fjz0,tz);
1345 /**************************
1346 * CALCULATE INTERACTIONS *
1347 **************************/
1349 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1352 r21 = _mm256_mul_pd(rsq21,rinv21);
1353 r21 = _mm256_andnot_pd(dummy_mask,r21);
1355 /* EWALD ELECTROSTATICS */
1357 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1358 ewrt = _mm256_mul_pd(r21,ewtabscale);
1359 ewitab = _mm256_cvttpd_epi32(ewrt);
1360 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1361 ewitab = _mm_slli_epi32(ewitab,2);
1362 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1363 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1364 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1365 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1366 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1367 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1368 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1369 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1370 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1372 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1374 /* Update potential sum for this i atom from the interaction with this j atom. */
1375 velec = _mm256_and_pd(velec,cutoff_mask);
1376 velec = _mm256_andnot_pd(dummy_mask,velec);
1377 velecsum = _mm256_add_pd(velecsum,velec);
1381 fscal = _mm256_and_pd(fscal,cutoff_mask);
1383 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1385 /* Calculate temporary vectorial force */
1386 tx = _mm256_mul_pd(fscal,dx21);
1387 ty = _mm256_mul_pd(fscal,dy21);
1388 tz = _mm256_mul_pd(fscal,dz21);
1390 /* Update vectorial force */
1391 fix2 = _mm256_add_pd(fix2,tx);
1392 fiy2 = _mm256_add_pd(fiy2,ty);
1393 fiz2 = _mm256_add_pd(fiz2,tz);
1395 fjx1 = _mm256_add_pd(fjx1,tx);
1396 fjy1 = _mm256_add_pd(fjy1,ty);
1397 fjz1 = _mm256_add_pd(fjz1,tz);
1401 /**************************
1402 * CALCULATE INTERACTIONS *
1403 **************************/
1405 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1408 r22 = _mm256_mul_pd(rsq22,rinv22);
1409 r22 = _mm256_andnot_pd(dummy_mask,r22);
1411 /* EWALD ELECTROSTATICS */
1413 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1414 ewrt = _mm256_mul_pd(r22,ewtabscale);
1415 ewitab = _mm256_cvttpd_epi32(ewrt);
1416 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1417 ewitab = _mm_slli_epi32(ewitab,2);
1418 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1419 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1420 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1421 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1422 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1423 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1424 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1425 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1426 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1428 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1430 /* Update potential sum for this i atom from the interaction with this j atom. */
1431 velec = _mm256_and_pd(velec,cutoff_mask);
1432 velec = _mm256_andnot_pd(dummy_mask,velec);
1433 velecsum = _mm256_add_pd(velecsum,velec);
1437 fscal = _mm256_and_pd(fscal,cutoff_mask);
1439 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1441 /* Calculate temporary vectorial force */
1442 tx = _mm256_mul_pd(fscal,dx22);
1443 ty = _mm256_mul_pd(fscal,dy22);
1444 tz = _mm256_mul_pd(fscal,dz22);
1446 /* Update vectorial force */
1447 fix2 = _mm256_add_pd(fix2,tx);
1448 fiy2 = _mm256_add_pd(fiy2,ty);
1449 fiz2 = _mm256_add_pd(fiz2,tz);
1451 fjx2 = _mm256_add_pd(fjx2,tx);
1452 fjy2 = _mm256_add_pd(fjy2,ty);
1453 fjz2 = _mm256_add_pd(fjz2,tz);
1457 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1458 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1459 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1460 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1462 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1463 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1465 /* Inner loop uses 459 flops */
1468 /* End of innermost loop */
1470 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1471 f+i_coord_offset,fshift+i_shift_offset);
1474 /* Update potential energies */
1475 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1476 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1478 /* Increment number of inner iterations */
1479 inneriter += j_index_end - j_index_start;
1481 /* Outer loop uses 20 flops */
1484 /* Increment number of outer iterations */
1487 /* Update outer/inner flops */
1489 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
1492 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_double
1493 * Electrostatics interaction: Ewald
1494 * VdW interaction: LJEwald
1495 * Geometry: Water3-Water3
1496 * Calculate force/pot: Force
1499 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_double
1500 (t_nblist * gmx_restrict nlist,
1501 rvec * gmx_restrict xx,
1502 rvec * gmx_restrict ff,
1503 t_forcerec * gmx_restrict fr,
1504 t_mdatoms * gmx_restrict mdatoms,
1505 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1506 t_nrnb * gmx_restrict nrnb)
1508 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1509 * just 0 for non-waters.
1510 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1511 * jnr indices corresponding to data put in the four positions in the SIMD register.
1513 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1514 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1515 int jnrA,jnrB,jnrC,jnrD;
1516 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1517 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1518 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1519 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1520 real rcutoff_scalar;
1521 real *shiftvec,*fshift,*x,*f;
1522 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1523 real scratch[4*DIM];
1524 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1525 real * vdwioffsetptr0;
1526 real * vdwgridioffsetptr0;
1527 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1528 real * vdwioffsetptr1;
1529 real * vdwgridioffsetptr1;
1530 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1531 real * vdwioffsetptr2;
1532 real * vdwgridioffsetptr2;
1533 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1534 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1535 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1536 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1537 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1538 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1539 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1540 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1541 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1542 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1543 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1544 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1545 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1546 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1547 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1548 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1549 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1552 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1555 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1556 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1567 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1568 __m256d one_half = _mm256_set1_pd(0.5);
1569 __m256d minus_one = _mm256_set1_pd(-1.0);
1571 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1572 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1574 __m256d dummy_mask,cutoff_mask;
1575 __m128 tmpmask0,tmpmask1;
1576 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1577 __m256d one = _mm256_set1_pd(1.0);
1578 __m256d two = _mm256_set1_pd(2.0);
1584 jindex = nlist->jindex;
1586 shiftidx = nlist->shift;
1588 shiftvec = fr->shift_vec[0];
1589 fshift = fr->fshift[0];
1590 facel = _mm256_set1_pd(fr->epsfac);
1591 charge = mdatoms->chargeA;
1592 nvdwtype = fr->ntype;
1593 vdwparam = fr->nbfp;
1594 vdwtype = mdatoms->typeA;
1595 vdwgridparam = fr->ljpme_c6grid;
1596 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
1597 ewclj = _mm256_set1_pd(fr->ewaldcoeff_lj);
1598 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
1600 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1601 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1602 beta2 = _mm256_mul_pd(beta,beta);
1603 beta3 = _mm256_mul_pd(beta,beta2);
1605 ewtab = fr->ic->tabq_coul_F;
1606 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1607 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1609 /* Setup water-specific parameters */
1610 inr = nlist->iinr[0];
1611 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1612 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1613 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1614 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1615 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
1617 jq0 = _mm256_set1_pd(charge[inr+0]);
1618 jq1 = _mm256_set1_pd(charge[inr+1]);
1619 jq2 = _mm256_set1_pd(charge[inr+2]);
1620 vdwjidx0A = 2*vdwtype[inr+0];
1621 qq00 = _mm256_mul_pd(iq0,jq0);
1622 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1623 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1624 c6grid_00 = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
1625 qq01 = _mm256_mul_pd(iq0,jq1);
1626 qq02 = _mm256_mul_pd(iq0,jq2);
1627 qq10 = _mm256_mul_pd(iq1,jq0);
1628 qq11 = _mm256_mul_pd(iq1,jq1);
1629 qq12 = _mm256_mul_pd(iq1,jq2);
1630 qq20 = _mm256_mul_pd(iq2,jq0);
1631 qq21 = _mm256_mul_pd(iq2,jq1);
1632 qq22 = _mm256_mul_pd(iq2,jq2);
1634 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1635 rcutoff_scalar = fr->rcoulomb;
1636 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1637 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1639 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
1640 rvdw = _mm256_set1_pd(fr->rvdw);
1642 /* Avoid stupid compiler warnings */
1643 jnrA = jnrB = jnrC = jnrD = 0;
1644 j_coord_offsetA = 0;
1645 j_coord_offsetB = 0;
1646 j_coord_offsetC = 0;
1647 j_coord_offsetD = 0;
1652 for(iidx=0;iidx<4*DIM;iidx++)
1654 scratch[iidx] = 0.0;
1657 /* Start outer loop over neighborlists */
1658 for(iidx=0; iidx<nri; iidx++)
1660 /* Load shift vector for this list */
1661 i_shift_offset = DIM*shiftidx[iidx];
1663 /* Load limits for loop over neighbors */
1664 j_index_start = jindex[iidx];
1665 j_index_end = jindex[iidx+1];
1667 /* Get outer coordinate index */
1669 i_coord_offset = DIM*inr;
1671 /* Load i particle coords and add shift vector */
1672 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1673 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1675 fix0 = _mm256_setzero_pd();
1676 fiy0 = _mm256_setzero_pd();
1677 fiz0 = _mm256_setzero_pd();
1678 fix1 = _mm256_setzero_pd();
1679 fiy1 = _mm256_setzero_pd();
1680 fiz1 = _mm256_setzero_pd();
1681 fix2 = _mm256_setzero_pd();
1682 fiy2 = _mm256_setzero_pd();
1683 fiz2 = _mm256_setzero_pd();
1685 /* Start inner kernel loop */
1686 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1689 /* Get j neighbor index, and coordinate index */
1691 jnrB = jjnr[jidx+1];
1692 jnrC = jjnr[jidx+2];
1693 jnrD = jjnr[jidx+3];
1694 j_coord_offsetA = DIM*jnrA;
1695 j_coord_offsetB = DIM*jnrB;
1696 j_coord_offsetC = DIM*jnrC;
1697 j_coord_offsetD = DIM*jnrD;
1699 /* load j atom coordinates */
1700 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1701 x+j_coord_offsetC,x+j_coord_offsetD,
1702 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1704 /* Calculate displacement vector */
1705 dx00 = _mm256_sub_pd(ix0,jx0);
1706 dy00 = _mm256_sub_pd(iy0,jy0);
1707 dz00 = _mm256_sub_pd(iz0,jz0);
1708 dx01 = _mm256_sub_pd(ix0,jx1);
1709 dy01 = _mm256_sub_pd(iy0,jy1);
1710 dz01 = _mm256_sub_pd(iz0,jz1);
1711 dx02 = _mm256_sub_pd(ix0,jx2);
1712 dy02 = _mm256_sub_pd(iy0,jy2);
1713 dz02 = _mm256_sub_pd(iz0,jz2);
1714 dx10 = _mm256_sub_pd(ix1,jx0);
1715 dy10 = _mm256_sub_pd(iy1,jy0);
1716 dz10 = _mm256_sub_pd(iz1,jz0);
1717 dx11 = _mm256_sub_pd(ix1,jx1);
1718 dy11 = _mm256_sub_pd(iy1,jy1);
1719 dz11 = _mm256_sub_pd(iz1,jz1);
1720 dx12 = _mm256_sub_pd(ix1,jx2);
1721 dy12 = _mm256_sub_pd(iy1,jy2);
1722 dz12 = _mm256_sub_pd(iz1,jz2);
1723 dx20 = _mm256_sub_pd(ix2,jx0);
1724 dy20 = _mm256_sub_pd(iy2,jy0);
1725 dz20 = _mm256_sub_pd(iz2,jz0);
1726 dx21 = _mm256_sub_pd(ix2,jx1);
1727 dy21 = _mm256_sub_pd(iy2,jy1);
1728 dz21 = _mm256_sub_pd(iz2,jz1);
1729 dx22 = _mm256_sub_pd(ix2,jx2);
1730 dy22 = _mm256_sub_pd(iy2,jy2);
1731 dz22 = _mm256_sub_pd(iz2,jz2);
1733 /* Calculate squared distance and things based on it */
1734 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1735 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1736 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1737 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1738 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1739 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1740 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1741 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1742 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1744 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1745 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1746 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1747 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1748 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1749 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1750 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1751 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1752 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1754 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1755 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1756 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1757 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1758 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1759 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1760 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1761 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1762 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1764 fjx0 = _mm256_setzero_pd();
1765 fjy0 = _mm256_setzero_pd();
1766 fjz0 = _mm256_setzero_pd();
1767 fjx1 = _mm256_setzero_pd();
1768 fjy1 = _mm256_setzero_pd();
1769 fjz1 = _mm256_setzero_pd();
1770 fjx2 = _mm256_setzero_pd();
1771 fjy2 = _mm256_setzero_pd();
1772 fjz2 = _mm256_setzero_pd();
1774 /**************************
1775 * CALCULATE INTERACTIONS *
1776 **************************/
1778 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1781 r00 = _mm256_mul_pd(rsq00,rinv00);
1783 /* EWALD ELECTROSTATICS */
1785 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1786 ewrt = _mm256_mul_pd(r00,ewtabscale);
1787 ewitab = _mm256_cvttpd_epi32(ewrt);
1788 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1789 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1790 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1792 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1793 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1795 /* Analytical LJ-PME */
1796 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1797 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
1798 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1799 exponent = gmx_simd_exp_d(ewcljrsq);
1800 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1801 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1802 /* f6A = 6 * C6grid * (1 - poly) */
1803 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1804 /* f6B = C6grid * exponent * beta^6 */
1805 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1806 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1807 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1809 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1811 fscal = _mm256_add_pd(felec,fvdw);
1813 fscal = _mm256_and_pd(fscal,cutoff_mask);
1815 /* Calculate temporary vectorial force */
1816 tx = _mm256_mul_pd(fscal,dx00);
1817 ty = _mm256_mul_pd(fscal,dy00);
1818 tz = _mm256_mul_pd(fscal,dz00);
1820 /* Update vectorial force */
1821 fix0 = _mm256_add_pd(fix0,tx);
1822 fiy0 = _mm256_add_pd(fiy0,ty);
1823 fiz0 = _mm256_add_pd(fiz0,tz);
1825 fjx0 = _mm256_add_pd(fjx0,tx);
1826 fjy0 = _mm256_add_pd(fjy0,ty);
1827 fjz0 = _mm256_add_pd(fjz0,tz);
1831 /**************************
1832 * CALCULATE INTERACTIONS *
1833 **************************/
1835 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1838 r01 = _mm256_mul_pd(rsq01,rinv01);
1840 /* EWALD ELECTROSTATICS */
1842 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1843 ewrt = _mm256_mul_pd(r01,ewtabscale);
1844 ewitab = _mm256_cvttpd_epi32(ewrt);
1845 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1846 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1847 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1849 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1850 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1852 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1856 fscal = _mm256_and_pd(fscal,cutoff_mask);
1858 /* Calculate temporary vectorial force */
1859 tx = _mm256_mul_pd(fscal,dx01);
1860 ty = _mm256_mul_pd(fscal,dy01);
1861 tz = _mm256_mul_pd(fscal,dz01);
1863 /* Update vectorial force */
1864 fix0 = _mm256_add_pd(fix0,tx);
1865 fiy0 = _mm256_add_pd(fiy0,ty);
1866 fiz0 = _mm256_add_pd(fiz0,tz);
1868 fjx1 = _mm256_add_pd(fjx1,tx);
1869 fjy1 = _mm256_add_pd(fjy1,ty);
1870 fjz1 = _mm256_add_pd(fjz1,tz);
1874 /**************************
1875 * CALCULATE INTERACTIONS *
1876 **************************/
1878 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1881 r02 = _mm256_mul_pd(rsq02,rinv02);
1883 /* EWALD ELECTROSTATICS */
1885 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1886 ewrt = _mm256_mul_pd(r02,ewtabscale);
1887 ewitab = _mm256_cvttpd_epi32(ewrt);
1888 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1889 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1890 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1892 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1893 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1895 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1899 fscal = _mm256_and_pd(fscal,cutoff_mask);
1901 /* Calculate temporary vectorial force */
1902 tx = _mm256_mul_pd(fscal,dx02);
1903 ty = _mm256_mul_pd(fscal,dy02);
1904 tz = _mm256_mul_pd(fscal,dz02);
1906 /* Update vectorial force */
1907 fix0 = _mm256_add_pd(fix0,tx);
1908 fiy0 = _mm256_add_pd(fiy0,ty);
1909 fiz0 = _mm256_add_pd(fiz0,tz);
1911 fjx2 = _mm256_add_pd(fjx2,tx);
1912 fjy2 = _mm256_add_pd(fjy2,ty);
1913 fjz2 = _mm256_add_pd(fjz2,tz);
1917 /**************************
1918 * CALCULATE INTERACTIONS *
1919 **************************/
1921 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1924 r10 = _mm256_mul_pd(rsq10,rinv10);
1926 /* EWALD ELECTROSTATICS */
1928 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1929 ewrt = _mm256_mul_pd(r10,ewtabscale);
1930 ewitab = _mm256_cvttpd_epi32(ewrt);
1931 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1932 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1933 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1935 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1936 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1938 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1942 fscal = _mm256_and_pd(fscal,cutoff_mask);
1944 /* Calculate temporary vectorial force */
1945 tx = _mm256_mul_pd(fscal,dx10);
1946 ty = _mm256_mul_pd(fscal,dy10);
1947 tz = _mm256_mul_pd(fscal,dz10);
1949 /* Update vectorial force */
1950 fix1 = _mm256_add_pd(fix1,tx);
1951 fiy1 = _mm256_add_pd(fiy1,ty);
1952 fiz1 = _mm256_add_pd(fiz1,tz);
1954 fjx0 = _mm256_add_pd(fjx0,tx);
1955 fjy0 = _mm256_add_pd(fjy0,ty);
1956 fjz0 = _mm256_add_pd(fjz0,tz);
1960 /**************************
1961 * CALCULATE INTERACTIONS *
1962 **************************/
1964 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1967 r11 = _mm256_mul_pd(rsq11,rinv11);
1969 /* EWALD ELECTROSTATICS */
1971 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1972 ewrt = _mm256_mul_pd(r11,ewtabscale);
1973 ewitab = _mm256_cvttpd_epi32(ewrt);
1974 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1975 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1976 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1978 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1979 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1981 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1985 fscal = _mm256_and_pd(fscal,cutoff_mask);
1987 /* Calculate temporary vectorial force */
1988 tx = _mm256_mul_pd(fscal,dx11);
1989 ty = _mm256_mul_pd(fscal,dy11);
1990 tz = _mm256_mul_pd(fscal,dz11);
1992 /* Update vectorial force */
1993 fix1 = _mm256_add_pd(fix1,tx);
1994 fiy1 = _mm256_add_pd(fiy1,ty);
1995 fiz1 = _mm256_add_pd(fiz1,tz);
1997 fjx1 = _mm256_add_pd(fjx1,tx);
1998 fjy1 = _mm256_add_pd(fjy1,ty);
1999 fjz1 = _mm256_add_pd(fjz1,tz);
2003 /**************************
2004 * CALCULATE INTERACTIONS *
2005 **************************/
2007 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2010 r12 = _mm256_mul_pd(rsq12,rinv12);
2012 /* EWALD ELECTROSTATICS */
2014 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2015 ewrt = _mm256_mul_pd(r12,ewtabscale);
2016 ewitab = _mm256_cvttpd_epi32(ewrt);
2017 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2018 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2019 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2021 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2022 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2024 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2028 fscal = _mm256_and_pd(fscal,cutoff_mask);
2030 /* Calculate temporary vectorial force */
2031 tx = _mm256_mul_pd(fscal,dx12);
2032 ty = _mm256_mul_pd(fscal,dy12);
2033 tz = _mm256_mul_pd(fscal,dz12);
2035 /* Update vectorial force */
2036 fix1 = _mm256_add_pd(fix1,tx);
2037 fiy1 = _mm256_add_pd(fiy1,ty);
2038 fiz1 = _mm256_add_pd(fiz1,tz);
2040 fjx2 = _mm256_add_pd(fjx2,tx);
2041 fjy2 = _mm256_add_pd(fjy2,ty);
2042 fjz2 = _mm256_add_pd(fjz2,tz);
2046 /**************************
2047 * CALCULATE INTERACTIONS *
2048 **************************/
2050 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2053 r20 = _mm256_mul_pd(rsq20,rinv20);
2055 /* EWALD ELECTROSTATICS */
2057 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2058 ewrt = _mm256_mul_pd(r20,ewtabscale);
2059 ewitab = _mm256_cvttpd_epi32(ewrt);
2060 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2061 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2062 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2064 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2065 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2067 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2071 fscal = _mm256_and_pd(fscal,cutoff_mask);
2073 /* Calculate temporary vectorial force */
2074 tx = _mm256_mul_pd(fscal,dx20);
2075 ty = _mm256_mul_pd(fscal,dy20);
2076 tz = _mm256_mul_pd(fscal,dz20);
2078 /* Update vectorial force */
2079 fix2 = _mm256_add_pd(fix2,tx);
2080 fiy2 = _mm256_add_pd(fiy2,ty);
2081 fiz2 = _mm256_add_pd(fiz2,tz);
2083 fjx0 = _mm256_add_pd(fjx0,tx);
2084 fjy0 = _mm256_add_pd(fjy0,ty);
2085 fjz0 = _mm256_add_pd(fjz0,tz);
2089 /**************************
2090 * CALCULATE INTERACTIONS *
2091 **************************/
2093 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2096 r21 = _mm256_mul_pd(rsq21,rinv21);
2098 /* EWALD ELECTROSTATICS */
2100 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2101 ewrt = _mm256_mul_pd(r21,ewtabscale);
2102 ewitab = _mm256_cvttpd_epi32(ewrt);
2103 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2104 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2105 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2107 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2108 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2110 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2114 fscal = _mm256_and_pd(fscal,cutoff_mask);
2116 /* Calculate temporary vectorial force */
2117 tx = _mm256_mul_pd(fscal,dx21);
2118 ty = _mm256_mul_pd(fscal,dy21);
2119 tz = _mm256_mul_pd(fscal,dz21);
2121 /* Update vectorial force */
2122 fix2 = _mm256_add_pd(fix2,tx);
2123 fiy2 = _mm256_add_pd(fiy2,ty);
2124 fiz2 = _mm256_add_pd(fiz2,tz);
2126 fjx1 = _mm256_add_pd(fjx1,tx);
2127 fjy1 = _mm256_add_pd(fjy1,ty);
2128 fjz1 = _mm256_add_pd(fjz1,tz);
2132 /**************************
2133 * CALCULATE INTERACTIONS *
2134 **************************/
2136 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2139 r22 = _mm256_mul_pd(rsq22,rinv22);
2141 /* EWALD ELECTROSTATICS */
2143 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2144 ewrt = _mm256_mul_pd(r22,ewtabscale);
2145 ewitab = _mm256_cvttpd_epi32(ewrt);
2146 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2147 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2148 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2150 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2151 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2153 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2157 fscal = _mm256_and_pd(fscal,cutoff_mask);
2159 /* Calculate temporary vectorial force */
2160 tx = _mm256_mul_pd(fscal,dx22);
2161 ty = _mm256_mul_pd(fscal,dy22);
2162 tz = _mm256_mul_pd(fscal,dz22);
2164 /* Update vectorial force */
2165 fix2 = _mm256_add_pd(fix2,tx);
2166 fiy2 = _mm256_add_pd(fiy2,ty);
2167 fiz2 = _mm256_add_pd(fiz2,tz);
2169 fjx2 = _mm256_add_pd(fjx2,tx);
2170 fjy2 = _mm256_add_pd(fjy2,ty);
2171 fjz2 = _mm256_add_pd(fjz2,tz);
2175 fjptrA = f+j_coord_offsetA;
2176 fjptrB = f+j_coord_offsetB;
2177 fjptrC = f+j_coord_offsetC;
2178 fjptrD = f+j_coord_offsetD;
2180 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2181 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2183 /* Inner loop uses 374 flops */
2186 if(jidx<j_index_end)
2189 /* Get j neighbor index, and coordinate index */
2190 jnrlistA = jjnr[jidx];
2191 jnrlistB = jjnr[jidx+1];
2192 jnrlistC = jjnr[jidx+2];
2193 jnrlistD = jjnr[jidx+3];
2194 /* Sign of each element will be negative for non-real atoms.
2195 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2196 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2198 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2200 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2201 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2202 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2204 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2205 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2206 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2207 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2208 j_coord_offsetA = DIM*jnrA;
2209 j_coord_offsetB = DIM*jnrB;
2210 j_coord_offsetC = DIM*jnrC;
2211 j_coord_offsetD = DIM*jnrD;
2213 /* load j atom coordinates */
2214 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2215 x+j_coord_offsetC,x+j_coord_offsetD,
2216 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2218 /* Calculate displacement vector */
2219 dx00 = _mm256_sub_pd(ix0,jx0);
2220 dy00 = _mm256_sub_pd(iy0,jy0);
2221 dz00 = _mm256_sub_pd(iz0,jz0);
2222 dx01 = _mm256_sub_pd(ix0,jx1);
2223 dy01 = _mm256_sub_pd(iy0,jy1);
2224 dz01 = _mm256_sub_pd(iz0,jz1);
2225 dx02 = _mm256_sub_pd(ix0,jx2);
2226 dy02 = _mm256_sub_pd(iy0,jy2);
2227 dz02 = _mm256_sub_pd(iz0,jz2);
2228 dx10 = _mm256_sub_pd(ix1,jx0);
2229 dy10 = _mm256_sub_pd(iy1,jy0);
2230 dz10 = _mm256_sub_pd(iz1,jz0);
2231 dx11 = _mm256_sub_pd(ix1,jx1);
2232 dy11 = _mm256_sub_pd(iy1,jy1);
2233 dz11 = _mm256_sub_pd(iz1,jz1);
2234 dx12 = _mm256_sub_pd(ix1,jx2);
2235 dy12 = _mm256_sub_pd(iy1,jy2);
2236 dz12 = _mm256_sub_pd(iz1,jz2);
2237 dx20 = _mm256_sub_pd(ix2,jx0);
2238 dy20 = _mm256_sub_pd(iy2,jy0);
2239 dz20 = _mm256_sub_pd(iz2,jz0);
2240 dx21 = _mm256_sub_pd(ix2,jx1);
2241 dy21 = _mm256_sub_pd(iy2,jy1);
2242 dz21 = _mm256_sub_pd(iz2,jz1);
2243 dx22 = _mm256_sub_pd(ix2,jx2);
2244 dy22 = _mm256_sub_pd(iy2,jy2);
2245 dz22 = _mm256_sub_pd(iz2,jz2);
2247 /* Calculate squared distance and things based on it */
2248 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2249 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2250 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2251 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2252 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2253 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2254 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2255 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2256 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2258 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2259 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
2260 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
2261 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
2262 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2263 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2264 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
2265 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2266 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2268 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2269 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2270 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2271 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2272 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2273 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2274 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2275 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2276 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2278 fjx0 = _mm256_setzero_pd();
2279 fjy0 = _mm256_setzero_pd();
2280 fjz0 = _mm256_setzero_pd();
2281 fjx1 = _mm256_setzero_pd();
2282 fjy1 = _mm256_setzero_pd();
2283 fjz1 = _mm256_setzero_pd();
2284 fjx2 = _mm256_setzero_pd();
2285 fjy2 = _mm256_setzero_pd();
2286 fjz2 = _mm256_setzero_pd();
2288 /**************************
2289 * CALCULATE INTERACTIONS *
2290 **************************/
2292 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2295 r00 = _mm256_mul_pd(rsq00,rinv00);
2296 r00 = _mm256_andnot_pd(dummy_mask,r00);
2298 /* EWALD ELECTROSTATICS */
2300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2301 ewrt = _mm256_mul_pd(r00,ewtabscale);
2302 ewitab = _mm256_cvttpd_epi32(ewrt);
2303 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2304 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2305 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2307 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2308 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2310 /* Analytical LJ-PME */
2311 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2312 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
2313 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
2314 exponent = gmx_simd_exp_d(ewcljrsq);
2315 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2316 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2317 /* f6A = 6 * C6grid * (1 - poly) */
2318 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
2319 /* f6B = C6grid * exponent * beta^6 */
2320 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
2321 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2322 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2324 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2326 fscal = _mm256_add_pd(felec,fvdw);
2328 fscal = _mm256_and_pd(fscal,cutoff_mask);
2330 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2332 /* Calculate temporary vectorial force */
2333 tx = _mm256_mul_pd(fscal,dx00);
2334 ty = _mm256_mul_pd(fscal,dy00);
2335 tz = _mm256_mul_pd(fscal,dz00);
2337 /* Update vectorial force */
2338 fix0 = _mm256_add_pd(fix0,tx);
2339 fiy0 = _mm256_add_pd(fiy0,ty);
2340 fiz0 = _mm256_add_pd(fiz0,tz);
2342 fjx0 = _mm256_add_pd(fjx0,tx);
2343 fjy0 = _mm256_add_pd(fjy0,ty);
2344 fjz0 = _mm256_add_pd(fjz0,tz);
2348 /**************************
2349 * CALCULATE INTERACTIONS *
2350 **************************/
2352 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2355 r01 = _mm256_mul_pd(rsq01,rinv01);
2356 r01 = _mm256_andnot_pd(dummy_mask,r01);
2358 /* EWALD ELECTROSTATICS */
2360 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2361 ewrt = _mm256_mul_pd(r01,ewtabscale);
2362 ewitab = _mm256_cvttpd_epi32(ewrt);
2363 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2364 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2365 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2367 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2368 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2370 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2374 fscal = _mm256_and_pd(fscal,cutoff_mask);
2376 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2378 /* Calculate temporary vectorial force */
2379 tx = _mm256_mul_pd(fscal,dx01);
2380 ty = _mm256_mul_pd(fscal,dy01);
2381 tz = _mm256_mul_pd(fscal,dz01);
2383 /* Update vectorial force */
2384 fix0 = _mm256_add_pd(fix0,tx);
2385 fiy0 = _mm256_add_pd(fiy0,ty);
2386 fiz0 = _mm256_add_pd(fiz0,tz);
2388 fjx1 = _mm256_add_pd(fjx1,tx);
2389 fjy1 = _mm256_add_pd(fjy1,ty);
2390 fjz1 = _mm256_add_pd(fjz1,tz);
2394 /**************************
2395 * CALCULATE INTERACTIONS *
2396 **************************/
2398 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2401 r02 = _mm256_mul_pd(rsq02,rinv02);
2402 r02 = _mm256_andnot_pd(dummy_mask,r02);
2404 /* EWALD ELECTROSTATICS */
2406 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2407 ewrt = _mm256_mul_pd(r02,ewtabscale);
2408 ewitab = _mm256_cvttpd_epi32(ewrt);
2409 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2410 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2411 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2413 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2414 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2416 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2420 fscal = _mm256_and_pd(fscal,cutoff_mask);
2422 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2424 /* Calculate temporary vectorial force */
2425 tx = _mm256_mul_pd(fscal,dx02);
2426 ty = _mm256_mul_pd(fscal,dy02);
2427 tz = _mm256_mul_pd(fscal,dz02);
2429 /* Update vectorial force */
2430 fix0 = _mm256_add_pd(fix0,tx);
2431 fiy0 = _mm256_add_pd(fiy0,ty);
2432 fiz0 = _mm256_add_pd(fiz0,tz);
2434 fjx2 = _mm256_add_pd(fjx2,tx);
2435 fjy2 = _mm256_add_pd(fjy2,ty);
2436 fjz2 = _mm256_add_pd(fjz2,tz);
2440 /**************************
2441 * CALCULATE INTERACTIONS *
2442 **************************/
2444 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2447 r10 = _mm256_mul_pd(rsq10,rinv10);
2448 r10 = _mm256_andnot_pd(dummy_mask,r10);
2450 /* EWALD ELECTROSTATICS */
2452 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2453 ewrt = _mm256_mul_pd(r10,ewtabscale);
2454 ewitab = _mm256_cvttpd_epi32(ewrt);
2455 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2456 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2457 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2459 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2460 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2462 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2466 fscal = _mm256_and_pd(fscal,cutoff_mask);
2468 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2470 /* Calculate temporary vectorial force */
2471 tx = _mm256_mul_pd(fscal,dx10);
2472 ty = _mm256_mul_pd(fscal,dy10);
2473 tz = _mm256_mul_pd(fscal,dz10);
2475 /* Update vectorial force */
2476 fix1 = _mm256_add_pd(fix1,tx);
2477 fiy1 = _mm256_add_pd(fiy1,ty);
2478 fiz1 = _mm256_add_pd(fiz1,tz);
2480 fjx0 = _mm256_add_pd(fjx0,tx);
2481 fjy0 = _mm256_add_pd(fjy0,ty);
2482 fjz0 = _mm256_add_pd(fjz0,tz);
2486 /**************************
2487 * CALCULATE INTERACTIONS *
2488 **************************/
2490 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2493 r11 = _mm256_mul_pd(rsq11,rinv11);
2494 r11 = _mm256_andnot_pd(dummy_mask,r11);
2496 /* EWALD ELECTROSTATICS */
2498 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2499 ewrt = _mm256_mul_pd(r11,ewtabscale);
2500 ewitab = _mm256_cvttpd_epi32(ewrt);
2501 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2502 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2503 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2505 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2506 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2508 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2512 fscal = _mm256_and_pd(fscal,cutoff_mask);
2514 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2516 /* Calculate temporary vectorial force */
2517 tx = _mm256_mul_pd(fscal,dx11);
2518 ty = _mm256_mul_pd(fscal,dy11);
2519 tz = _mm256_mul_pd(fscal,dz11);
2521 /* Update vectorial force */
2522 fix1 = _mm256_add_pd(fix1,tx);
2523 fiy1 = _mm256_add_pd(fiy1,ty);
2524 fiz1 = _mm256_add_pd(fiz1,tz);
2526 fjx1 = _mm256_add_pd(fjx1,tx);
2527 fjy1 = _mm256_add_pd(fjy1,ty);
2528 fjz1 = _mm256_add_pd(fjz1,tz);
2532 /**************************
2533 * CALCULATE INTERACTIONS *
2534 **************************/
2536 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2539 r12 = _mm256_mul_pd(rsq12,rinv12);
2540 r12 = _mm256_andnot_pd(dummy_mask,r12);
2542 /* EWALD ELECTROSTATICS */
2544 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2545 ewrt = _mm256_mul_pd(r12,ewtabscale);
2546 ewitab = _mm256_cvttpd_epi32(ewrt);
2547 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2548 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2549 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2551 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2552 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2554 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2558 fscal = _mm256_and_pd(fscal,cutoff_mask);
2560 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2562 /* Calculate temporary vectorial force */
2563 tx = _mm256_mul_pd(fscal,dx12);
2564 ty = _mm256_mul_pd(fscal,dy12);
2565 tz = _mm256_mul_pd(fscal,dz12);
2567 /* Update vectorial force */
2568 fix1 = _mm256_add_pd(fix1,tx);
2569 fiy1 = _mm256_add_pd(fiy1,ty);
2570 fiz1 = _mm256_add_pd(fiz1,tz);
2572 fjx2 = _mm256_add_pd(fjx2,tx);
2573 fjy2 = _mm256_add_pd(fjy2,ty);
2574 fjz2 = _mm256_add_pd(fjz2,tz);
2578 /**************************
2579 * CALCULATE INTERACTIONS *
2580 **************************/
2582 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2585 r20 = _mm256_mul_pd(rsq20,rinv20);
2586 r20 = _mm256_andnot_pd(dummy_mask,r20);
2588 /* EWALD ELECTROSTATICS */
2590 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2591 ewrt = _mm256_mul_pd(r20,ewtabscale);
2592 ewitab = _mm256_cvttpd_epi32(ewrt);
2593 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2594 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2595 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2597 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2598 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2600 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2604 fscal = _mm256_and_pd(fscal,cutoff_mask);
2606 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2608 /* Calculate temporary vectorial force */
2609 tx = _mm256_mul_pd(fscal,dx20);
2610 ty = _mm256_mul_pd(fscal,dy20);
2611 tz = _mm256_mul_pd(fscal,dz20);
2613 /* Update vectorial force */
2614 fix2 = _mm256_add_pd(fix2,tx);
2615 fiy2 = _mm256_add_pd(fiy2,ty);
2616 fiz2 = _mm256_add_pd(fiz2,tz);
2618 fjx0 = _mm256_add_pd(fjx0,tx);
2619 fjy0 = _mm256_add_pd(fjy0,ty);
2620 fjz0 = _mm256_add_pd(fjz0,tz);
2624 /**************************
2625 * CALCULATE INTERACTIONS *
2626 **************************/
2628 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2631 r21 = _mm256_mul_pd(rsq21,rinv21);
2632 r21 = _mm256_andnot_pd(dummy_mask,r21);
2634 /* EWALD ELECTROSTATICS */
2636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2637 ewrt = _mm256_mul_pd(r21,ewtabscale);
2638 ewitab = _mm256_cvttpd_epi32(ewrt);
2639 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2640 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2641 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2643 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2644 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2646 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2650 fscal = _mm256_and_pd(fscal,cutoff_mask);
2652 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2654 /* Calculate temporary vectorial force */
2655 tx = _mm256_mul_pd(fscal,dx21);
2656 ty = _mm256_mul_pd(fscal,dy21);
2657 tz = _mm256_mul_pd(fscal,dz21);
2659 /* Update vectorial force */
2660 fix2 = _mm256_add_pd(fix2,tx);
2661 fiy2 = _mm256_add_pd(fiy2,ty);
2662 fiz2 = _mm256_add_pd(fiz2,tz);
2664 fjx1 = _mm256_add_pd(fjx1,tx);
2665 fjy1 = _mm256_add_pd(fjy1,ty);
2666 fjz1 = _mm256_add_pd(fjz1,tz);
2670 /**************************
2671 * CALCULATE INTERACTIONS *
2672 **************************/
2674 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2677 r22 = _mm256_mul_pd(rsq22,rinv22);
2678 r22 = _mm256_andnot_pd(dummy_mask,r22);
2680 /* EWALD ELECTROSTATICS */
2682 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2683 ewrt = _mm256_mul_pd(r22,ewtabscale);
2684 ewitab = _mm256_cvttpd_epi32(ewrt);
2685 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2686 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2687 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2689 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2690 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2692 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2696 fscal = _mm256_and_pd(fscal,cutoff_mask);
2698 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2700 /* Calculate temporary vectorial force */
2701 tx = _mm256_mul_pd(fscal,dx22);
2702 ty = _mm256_mul_pd(fscal,dy22);
2703 tz = _mm256_mul_pd(fscal,dz22);
2705 /* Update vectorial force */
2706 fix2 = _mm256_add_pd(fix2,tx);
2707 fiy2 = _mm256_add_pd(fiy2,ty);
2708 fiz2 = _mm256_add_pd(fiz2,tz);
2710 fjx2 = _mm256_add_pd(fjx2,tx);
2711 fjy2 = _mm256_add_pd(fjy2,ty);
2712 fjz2 = _mm256_add_pd(fjz2,tz);
2716 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2717 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2718 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2719 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2721 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2722 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2724 /* Inner loop uses 383 flops */
2727 /* End of innermost loop */
2729 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2730 f+i_coord_offset,fshift+i_shift_offset);
2732 /* Increment number of inner iterations */
2733 inneriter += j_index_end - j_index_start;
2735 /* Outer loop uses 18 flops */
2738 /* Increment number of outer iterations */
2741 /* Update outer/inner flops */
2743 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*383);