2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of GROMACS.
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
105 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
106 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
108 __m256d dummy_mask,cutoff_mask;
109 __m128 tmpmask0,tmpmask1;
110 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
111 __m256d one = _mm256_set1_pd(1.0);
112 __m256d two = _mm256_set1_pd(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm256_set1_pd(fr->epsfac);
125 charge = mdatoms->chargeA;
126 nvdwtype = fr->ntype;
128 vdwtype = mdatoms->typeA;
130 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
131 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
132 beta2 = _mm256_mul_pd(beta,beta);
133 beta3 = _mm256_mul_pd(beta,beta2);
135 ewtab = fr->ic->tabq_coul_FDV0;
136 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
137 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
142 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
143 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
144 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
146 jq1 = _mm256_set1_pd(charge[inr+1]);
147 jq2 = _mm256_set1_pd(charge[inr+2]);
148 jq3 = _mm256_set1_pd(charge[inr+3]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
151 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
152 qq11 = _mm256_mul_pd(iq1,jq1);
153 qq12 = _mm256_mul_pd(iq1,jq2);
154 qq13 = _mm256_mul_pd(iq1,jq3);
155 qq21 = _mm256_mul_pd(iq2,jq1);
156 qq22 = _mm256_mul_pd(iq2,jq2);
157 qq23 = _mm256_mul_pd(iq2,jq3);
158 qq31 = _mm256_mul_pd(iq3,jq1);
159 qq32 = _mm256_mul_pd(iq3,jq2);
160 qq33 = _mm256_mul_pd(iq3,jq3);
162 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163 rcutoff_scalar = fr->rcoulomb;
164 rcutoff = _mm256_set1_pd(rcutoff_scalar);
165 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
167 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
168 rvdw = _mm256_set1_pd(fr->rvdw);
170 /* Avoid stupid compiler warnings */
171 jnrA = jnrB = jnrC = jnrD = 0;
180 for(iidx=0;iidx<4*DIM;iidx++)
185 /* Start outer loop over neighborlists */
186 for(iidx=0; iidx<nri; iidx++)
188 /* Load shift vector for this list */
189 i_shift_offset = DIM*shiftidx[iidx];
191 /* Load limits for loop over neighbors */
192 j_index_start = jindex[iidx];
193 j_index_end = jindex[iidx+1];
195 /* Get outer coordinate index */
197 i_coord_offset = DIM*inr;
199 /* Load i particle coords and add shift vector */
200 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
201 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
203 fix0 = _mm256_setzero_pd();
204 fiy0 = _mm256_setzero_pd();
205 fiz0 = _mm256_setzero_pd();
206 fix1 = _mm256_setzero_pd();
207 fiy1 = _mm256_setzero_pd();
208 fiz1 = _mm256_setzero_pd();
209 fix2 = _mm256_setzero_pd();
210 fiy2 = _mm256_setzero_pd();
211 fiz2 = _mm256_setzero_pd();
212 fix3 = _mm256_setzero_pd();
213 fiy3 = _mm256_setzero_pd();
214 fiz3 = _mm256_setzero_pd();
216 /* Reset potential sums */
217 velecsum = _mm256_setzero_pd();
218 vvdwsum = _mm256_setzero_pd();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
224 /* Get j neighbor index, and coordinate index */
229 j_coord_offsetA = DIM*jnrA;
230 j_coord_offsetB = DIM*jnrB;
231 j_coord_offsetC = DIM*jnrC;
232 j_coord_offsetD = DIM*jnrD;
234 /* load j atom coordinates */
235 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
236 x+j_coord_offsetC,x+j_coord_offsetD,
237 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
238 &jy2,&jz2,&jx3,&jy3,&jz3);
240 /* Calculate displacement vector */
241 dx00 = _mm256_sub_pd(ix0,jx0);
242 dy00 = _mm256_sub_pd(iy0,jy0);
243 dz00 = _mm256_sub_pd(iz0,jz0);
244 dx11 = _mm256_sub_pd(ix1,jx1);
245 dy11 = _mm256_sub_pd(iy1,jy1);
246 dz11 = _mm256_sub_pd(iz1,jz1);
247 dx12 = _mm256_sub_pd(ix1,jx2);
248 dy12 = _mm256_sub_pd(iy1,jy2);
249 dz12 = _mm256_sub_pd(iz1,jz2);
250 dx13 = _mm256_sub_pd(ix1,jx3);
251 dy13 = _mm256_sub_pd(iy1,jy3);
252 dz13 = _mm256_sub_pd(iz1,jz3);
253 dx21 = _mm256_sub_pd(ix2,jx1);
254 dy21 = _mm256_sub_pd(iy2,jy1);
255 dz21 = _mm256_sub_pd(iz2,jz1);
256 dx22 = _mm256_sub_pd(ix2,jx2);
257 dy22 = _mm256_sub_pd(iy2,jy2);
258 dz22 = _mm256_sub_pd(iz2,jz2);
259 dx23 = _mm256_sub_pd(ix2,jx3);
260 dy23 = _mm256_sub_pd(iy2,jy3);
261 dz23 = _mm256_sub_pd(iz2,jz3);
262 dx31 = _mm256_sub_pd(ix3,jx1);
263 dy31 = _mm256_sub_pd(iy3,jy1);
264 dz31 = _mm256_sub_pd(iz3,jz1);
265 dx32 = _mm256_sub_pd(ix3,jx2);
266 dy32 = _mm256_sub_pd(iy3,jy2);
267 dz32 = _mm256_sub_pd(iz3,jz2);
268 dx33 = _mm256_sub_pd(ix3,jx3);
269 dy33 = _mm256_sub_pd(iy3,jy3);
270 dz33 = _mm256_sub_pd(iz3,jz3);
272 /* Calculate squared distance and things based on it */
273 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
274 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
275 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
276 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
277 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
278 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
279 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
280 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
281 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
282 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
284 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
285 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
286 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
287 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
288 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
289 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
290 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
291 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
292 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
294 rinvsq00 = gmx_mm256_inv_pd(rsq00);
295 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
296 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
297 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
298 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
299 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
300 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
301 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
302 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
303 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
305 fjx0 = _mm256_setzero_pd();
306 fjy0 = _mm256_setzero_pd();
307 fjz0 = _mm256_setzero_pd();
308 fjx1 = _mm256_setzero_pd();
309 fjy1 = _mm256_setzero_pd();
310 fjz1 = _mm256_setzero_pd();
311 fjx2 = _mm256_setzero_pd();
312 fjy2 = _mm256_setzero_pd();
313 fjz2 = _mm256_setzero_pd();
314 fjx3 = _mm256_setzero_pd();
315 fjy3 = _mm256_setzero_pd();
316 fjz3 = _mm256_setzero_pd();
318 /**************************
319 * CALCULATE INTERACTIONS *
320 **************************/
322 if (gmx_mm256_any_lt(rsq00,rcutoff2))
325 /* LENNARD-JONES DISPERSION/REPULSION */
327 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
328 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
329 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
330 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
331 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
332 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
334 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
336 /* Update potential sum for this i atom from the interaction with this j atom. */
337 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
338 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
342 fscal = _mm256_and_pd(fscal,cutoff_mask);
344 /* Calculate temporary vectorial force */
345 tx = _mm256_mul_pd(fscal,dx00);
346 ty = _mm256_mul_pd(fscal,dy00);
347 tz = _mm256_mul_pd(fscal,dz00);
349 /* Update vectorial force */
350 fix0 = _mm256_add_pd(fix0,tx);
351 fiy0 = _mm256_add_pd(fiy0,ty);
352 fiz0 = _mm256_add_pd(fiz0,tz);
354 fjx0 = _mm256_add_pd(fjx0,tx);
355 fjy0 = _mm256_add_pd(fjy0,ty);
356 fjz0 = _mm256_add_pd(fjz0,tz);
360 /**************************
361 * CALCULATE INTERACTIONS *
362 **************************/
364 if (gmx_mm256_any_lt(rsq11,rcutoff2))
367 r11 = _mm256_mul_pd(rsq11,rinv11);
369 /* EWALD ELECTROSTATICS */
371 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
372 ewrt = _mm256_mul_pd(r11,ewtabscale);
373 ewitab = _mm256_cvttpd_epi32(ewrt);
374 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
375 ewitab = _mm_slli_epi32(ewitab,2);
376 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
377 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
378 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
379 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
380 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
381 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
382 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
383 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
384 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
386 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
388 /* Update potential sum for this i atom from the interaction with this j atom. */
389 velec = _mm256_and_pd(velec,cutoff_mask);
390 velecsum = _mm256_add_pd(velecsum,velec);
394 fscal = _mm256_and_pd(fscal,cutoff_mask);
396 /* Calculate temporary vectorial force */
397 tx = _mm256_mul_pd(fscal,dx11);
398 ty = _mm256_mul_pd(fscal,dy11);
399 tz = _mm256_mul_pd(fscal,dz11);
401 /* Update vectorial force */
402 fix1 = _mm256_add_pd(fix1,tx);
403 fiy1 = _mm256_add_pd(fiy1,ty);
404 fiz1 = _mm256_add_pd(fiz1,tz);
406 fjx1 = _mm256_add_pd(fjx1,tx);
407 fjy1 = _mm256_add_pd(fjy1,ty);
408 fjz1 = _mm256_add_pd(fjz1,tz);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 if (gmx_mm256_any_lt(rsq12,rcutoff2))
419 r12 = _mm256_mul_pd(rsq12,rinv12);
421 /* EWALD ELECTROSTATICS */
423 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
424 ewrt = _mm256_mul_pd(r12,ewtabscale);
425 ewitab = _mm256_cvttpd_epi32(ewrt);
426 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
427 ewitab = _mm_slli_epi32(ewitab,2);
428 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
429 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
430 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
431 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
432 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
433 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
434 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
435 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
436 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
438 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
440 /* Update potential sum for this i atom from the interaction with this j atom. */
441 velec = _mm256_and_pd(velec,cutoff_mask);
442 velecsum = _mm256_add_pd(velecsum,velec);
446 fscal = _mm256_and_pd(fscal,cutoff_mask);
448 /* Calculate temporary vectorial force */
449 tx = _mm256_mul_pd(fscal,dx12);
450 ty = _mm256_mul_pd(fscal,dy12);
451 tz = _mm256_mul_pd(fscal,dz12);
453 /* Update vectorial force */
454 fix1 = _mm256_add_pd(fix1,tx);
455 fiy1 = _mm256_add_pd(fiy1,ty);
456 fiz1 = _mm256_add_pd(fiz1,tz);
458 fjx2 = _mm256_add_pd(fjx2,tx);
459 fjy2 = _mm256_add_pd(fjy2,ty);
460 fjz2 = _mm256_add_pd(fjz2,tz);
464 /**************************
465 * CALCULATE INTERACTIONS *
466 **************************/
468 if (gmx_mm256_any_lt(rsq13,rcutoff2))
471 r13 = _mm256_mul_pd(rsq13,rinv13);
473 /* EWALD ELECTROSTATICS */
475 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
476 ewrt = _mm256_mul_pd(r13,ewtabscale);
477 ewitab = _mm256_cvttpd_epi32(ewrt);
478 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
479 ewitab = _mm_slli_epi32(ewitab,2);
480 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
481 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
482 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
483 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
484 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
485 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
486 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
487 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
488 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
490 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
492 /* Update potential sum for this i atom from the interaction with this j atom. */
493 velec = _mm256_and_pd(velec,cutoff_mask);
494 velecsum = _mm256_add_pd(velecsum,velec);
498 fscal = _mm256_and_pd(fscal,cutoff_mask);
500 /* Calculate temporary vectorial force */
501 tx = _mm256_mul_pd(fscal,dx13);
502 ty = _mm256_mul_pd(fscal,dy13);
503 tz = _mm256_mul_pd(fscal,dz13);
505 /* Update vectorial force */
506 fix1 = _mm256_add_pd(fix1,tx);
507 fiy1 = _mm256_add_pd(fiy1,ty);
508 fiz1 = _mm256_add_pd(fiz1,tz);
510 fjx3 = _mm256_add_pd(fjx3,tx);
511 fjy3 = _mm256_add_pd(fjy3,ty);
512 fjz3 = _mm256_add_pd(fjz3,tz);
516 /**************************
517 * CALCULATE INTERACTIONS *
518 **************************/
520 if (gmx_mm256_any_lt(rsq21,rcutoff2))
523 r21 = _mm256_mul_pd(rsq21,rinv21);
525 /* EWALD ELECTROSTATICS */
527 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
528 ewrt = _mm256_mul_pd(r21,ewtabscale);
529 ewitab = _mm256_cvttpd_epi32(ewrt);
530 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
531 ewitab = _mm_slli_epi32(ewitab,2);
532 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
533 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
534 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
535 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
536 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
537 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
538 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
539 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
540 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
542 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
544 /* Update potential sum for this i atom from the interaction with this j atom. */
545 velec = _mm256_and_pd(velec,cutoff_mask);
546 velecsum = _mm256_add_pd(velecsum,velec);
550 fscal = _mm256_and_pd(fscal,cutoff_mask);
552 /* Calculate temporary vectorial force */
553 tx = _mm256_mul_pd(fscal,dx21);
554 ty = _mm256_mul_pd(fscal,dy21);
555 tz = _mm256_mul_pd(fscal,dz21);
557 /* Update vectorial force */
558 fix2 = _mm256_add_pd(fix2,tx);
559 fiy2 = _mm256_add_pd(fiy2,ty);
560 fiz2 = _mm256_add_pd(fiz2,tz);
562 fjx1 = _mm256_add_pd(fjx1,tx);
563 fjy1 = _mm256_add_pd(fjy1,ty);
564 fjz1 = _mm256_add_pd(fjz1,tz);
568 /**************************
569 * CALCULATE INTERACTIONS *
570 **************************/
572 if (gmx_mm256_any_lt(rsq22,rcutoff2))
575 r22 = _mm256_mul_pd(rsq22,rinv22);
577 /* EWALD ELECTROSTATICS */
579 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
580 ewrt = _mm256_mul_pd(r22,ewtabscale);
581 ewitab = _mm256_cvttpd_epi32(ewrt);
582 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
583 ewitab = _mm_slli_epi32(ewitab,2);
584 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
585 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
586 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
587 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
588 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
589 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
590 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
591 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
592 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
594 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
596 /* Update potential sum for this i atom from the interaction with this j atom. */
597 velec = _mm256_and_pd(velec,cutoff_mask);
598 velecsum = _mm256_add_pd(velecsum,velec);
602 fscal = _mm256_and_pd(fscal,cutoff_mask);
604 /* Calculate temporary vectorial force */
605 tx = _mm256_mul_pd(fscal,dx22);
606 ty = _mm256_mul_pd(fscal,dy22);
607 tz = _mm256_mul_pd(fscal,dz22);
609 /* Update vectorial force */
610 fix2 = _mm256_add_pd(fix2,tx);
611 fiy2 = _mm256_add_pd(fiy2,ty);
612 fiz2 = _mm256_add_pd(fiz2,tz);
614 fjx2 = _mm256_add_pd(fjx2,tx);
615 fjy2 = _mm256_add_pd(fjy2,ty);
616 fjz2 = _mm256_add_pd(fjz2,tz);
620 /**************************
621 * CALCULATE INTERACTIONS *
622 **************************/
624 if (gmx_mm256_any_lt(rsq23,rcutoff2))
627 r23 = _mm256_mul_pd(rsq23,rinv23);
629 /* EWALD ELECTROSTATICS */
631 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
632 ewrt = _mm256_mul_pd(r23,ewtabscale);
633 ewitab = _mm256_cvttpd_epi32(ewrt);
634 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
635 ewitab = _mm_slli_epi32(ewitab,2);
636 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
637 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
638 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
639 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
640 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
641 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
642 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
643 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
644 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
646 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
648 /* Update potential sum for this i atom from the interaction with this j atom. */
649 velec = _mm256_and_pd(velec,cutoff_mask);
650 velecsum = _mm256_add_pd(velecsum,velec);
654 fscal = _mm256_and_pd(fscal,cutoff_mask);
656 /* Calculate temporary vectorial force */
657 tx = _mm256_mul_pd(fscal,dx23);
658 ty = _mm256_mul_pd(fscal,dy23);
659 tz = _mm256_mul_pd(fscal,dz23);
661 /* Update vectorial force */
662 fix2 = _mm256_add_pd(fix2,tx);
663 fiy2 = _mm256_add_pd(fiy2,ty);
664 fiz2 = _mm256_add_pd(fiz2,tz);
666 fjx3 = _mm256_add_pd(fjx3,tx);
667 fjy3 = _mm256_add_pd(fjy3,ty);
668 fjz3 = _mm256_add_pd(fjz3,tz);
672 /**************************
673 * CALCULATE INTERACTIONS *
674 **************************/
676 if (gmx_mm256_any_lt(rsq31,rcutoff2))
679 r31 = _mm256_mul_pd(rsq31,rinv31);
681 /* EWALD ELECTROSTATICS */
683 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
684 ewrt = _mm256_mul_pd(r31,ewtabscale);
685 ewitab = _mm256_cvttpd_epi32(ewrt);
686 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
687 ewitab = _mm_slli_epi32(ewitab,2);
688 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
689 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
690 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
691 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
692 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
693 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
694 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
695 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
696 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
698 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
700 /* Update potential sum for this i atom from the interaction with this j atom. */
701 velec = _mm256_and_pd(velec,cutoff_mask);
702 velecsum = _mm256_add_pd(velecsum,velec);
706 fscal = _mm256_and_pd(fscal,cutoff_mask);
708 /* Calculate temporary vectorial force */
709 tx = _mm256_mul_pd(fscal,dx31);
710 ty = _mm256_mul_pd(fscal,dy31);
711 tz = _mm256_mul_pd(fscal,dz31);
713 /* Update vectorial force */
714 fix3 = _mm256_add_pd(fix3,tx);
715 fiy3 = _mm256_add_pd(fiy3,ty);
716 fiz3 = _mm256_add_pd(fiz3,tz);
718 fjx1 = _mm256_add_pd(fjx1,tx);
719 fjy1 = _mm256_add_pd(fjy1,ty);
720 fjz1 = _mm256_add_pd(fjz1,tz);
724 /**************************
725 * CALCULATE INTERACTIONS *
726 **************************/
728 if (gmx_mm256_any_lt(rsq32,rcutoff2))
731 r32 = _mm256_mul_pd(rsq32,rinv32);
733 /* EWALD ELECTROSTATICS */
735 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
736 ewrt = _mm256_mul_pd(r32,ewtabscale);
737 ewitab = _mm256_cvttpd_epi32(ewrt);
738 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
739 ewitab = _mm_slli_epi32(ewitab,2);
740 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
741 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
742 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
743 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
744 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
745 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
746 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
747 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
748 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
750 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
752 /* Update potential sum for this i atom from the interaction with this j atom. */
753 velec = _mm256_and_pd(velec,cutoff_mask);
754 velecsum = _mm256_add_pd(velecsum,velec);
758 fscal = _mm256_and_pd(fscal,cutoff_mask);
760 /* Calculate temporary vectorial force */
761 tx = _mm256_mul_pd(fscal,dx32);
762 ty = _mm256_mul_pd(fscal,dy32);
763 tz = _mm256_mul_pd(fscal,dz32);
765 /* Update vectorial force */
766 fix3 = _mm256_add_pd(fix3,tx);
767 fiy3 = _mm256_add_pd(fiy3,ty);
768 fiz3 = _mm256_add_pd(fiz3,tz);
770 fjx2 = _mm256_add_pd(fjx2,tx);
771 fjy2 = _mm256_add_pd(fjy2,ty);
772 fjz2 = _mm256_add_pd(fjz2,tz);
776 /**************************
777 * CALCULATE INTERACTIONS *
778 **************************/
780 if (gmx_mm256_any_lt(rsq33,rcutoff2))
783 r33 = _mm256_mul_pd(rsq33,rinv33);
785 /* EWALD ELECTROSTATICS */
787 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
788 ewrt = _mm256_mul_pd(r33,ewtabscale);
789 ewitab = _mm256_cvttpd_epi32(ewrt);
790 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
791 ewitab = _mm_slli_epi32(ewitab,2);
792 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
793 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
794 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
795 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
796 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
797 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
798 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
799 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
800 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
802 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
804 /* Update potential sum for this i atom from the interaction with this j atom. */
805 velec = _mm256_and_pd(velec,cutoff_mask);
806 velecsum = _mm256_add_pd(velecsum,velec);
810 fscal = _mm256_and_pd(fscal,cutoff_mask);
812 /* Calculate temporary vectorial force */
813 tx = _mm256_mul_pd(fscal,dx33);
814 ty = _mm256_mul_pd(fscal,dy33);
815 tz = _mm256_mul_pd(fscal,dz33);
817 /* Update vectorial force */
818 fix3 = _mm256_add_pd(fix3,tx);
819 fiy3 = _mm256_add_pd(fiy3,ty);
820 fiz3 = _mm256_add_pd(fiz3,tz);
822 fjx3 = _mm256_add_pd(fjx3,tx);
823 fjy3 = _mm256_add_pd(fjy3,ty);
824 fjz3 = _mm256_add_pd(fjz3,tz);
828 fjptrA = f+j_coord_offsetA;
829 fjptrB = f+j_coord_offsetB;
830 fjptrC = f+j_coord_offsetC;
831 fjptrD = f+j_coord_offsetD;
833 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
834 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
835 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
837 /* Inner loop uses 458 flops */
843 /* Get j neighbor index, and coordinate index */
844 jnrlistA = jjnr[jidx];
845 jnrlistB = jjnr[jidx+1];
846 jnrlistC = jjnr[jidx+2];
847 jnrlistD = jjnr[jidx+3];
848 /* Sign of each element will be negative for non-real atoms.
849 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
850 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
852 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
854 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
855 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
856 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
858 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
859 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
860 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
861 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
862 j_coord_offsetA = DIM*jnrA;
863 j_coord_offsetB = DIM*jnrB;
864 j_coord_offsetC = DIM*jnrC;
865 j_coord_offsetD = DIM*jnrD;
867 /* load j atom coordinates */
868 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
869 x+j_coord_offsetC,x+j_coord_offsetD,
870 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
871 &jy2,&jz2,&jx3,&jy3,&jz3);
873 /* Calculate displacement vector */
874 dx00 = _mm256_sub_pd(ix0,jx0);
875 dy00 = _mm256_sub_pd(iy0,jy0);
876 dz00 = _mm256_sub_pd(iz0,jz0);
877 dx11 = _mm256_sub_pd(ix1,jx1);
878 dy11 = _mm256_sub_pd(iy1,jy1);
879 dz11 = _mm256_sub_pd(iz1,jz1);
880 dx12 = _mm256_sub_pd(ix1,jx2);
881 dy12 = _mm256_sub_pd(iy1,jy2);
882 dz12 = _mm256_sub_pd(iz1,jz2);
883 dx13 = _mm256_sub_pd(ix1,jx3);
884 dy13 = _mm256_sub_pd(iy1,jy3);
885 dz13 = _mm256_sub_pd(iz1,jz3);
886 dx21 = _mm256_sub_pd(ix2,jx1);
887 dy21 = _mm256_sub_pd(iy2,jy1);
888 dz21 = _mm256_sub_pd(iz2,jz1);
889 dx22 = _mm256_sub_pd(ix2,jx2);
890 dy22 = _mm256_sub_pd(iy2,jy2);
891 dz22 = _mm256_sub_pd(iz2,jz2);
892 dx23 = _mm256_sub_pd(ix2,jx3);
893 dy23 = _mm256_sub_pd(iy2,jy3);
894 dz23 = _mm256_sub_pd(iz2,jz3);
895 dx31 = _mm256_sub_pd(ix3,jx1);
896 dy31 = _mm256_sub_pd(iy3,jy1);
897 dz31 = _mm256_sub_pd(iz3,jz1);
898 dx32 = _mm256_sub_pd(ix3,jx2);
899 dy32 = _mm256_sub_pd(iy3,jy2);
900 dz32 = _mm256_sub_pd(iz3,jz2);
901 dx33 = _mm256_sub_pd(ix3,jx3);
902 dy33 = _mm256_sub_pd(iy3,jy3);
903 dz33 = _mm256_sub_pd(iz3,jz3);
905 /* Calculate squared distance and things based on it */
906 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
907 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
908 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
909 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
910 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
911 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
912 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
913 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
914 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
915 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
917 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
918 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
919 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
920 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
921 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
922 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
923 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
924 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
925 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
927 rinvsq00 = gmx_mm256_inv_pd(rsq00);
928 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
929 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
930 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
931 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
932 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
933 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
934 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
935 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
936 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
938 fjx0 = _mm256_setzero_pd();
939 fjy0 = _mm256_setzero_pd();
940 fjz0 = _mm256_setzero_pd();
941 fjx1 = _mm256_setzero_pd();
942 fjy1 = _mm256_setzero_pd();
943 fjz1 = _mm256_setzero_pd();
944 fjx2 = _mm256_setzero_pd();
945 fjy2 = _mm256_setzero_pd();
946 fjz2 = _mm256_setzero_pd();
947 fjx3 = _mm256_setzero_pd();
948 fjy3 = _mm256_setzero_pd();
949 fjz3 = _mm256_setzero_pd();
951 /**************************
952 * CALCULATE INTERACTIONS *
953 **************************/
955 if (gmx_mm256_any_lt(rsq00,rcutoff2))
958 /* LENNARD-JONES DISPERSION/REPULSION */
960 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
961 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
962 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
963 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
964 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
965 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
967 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
971 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
972 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
976 fscal = _mm256_and_pd(fscal,cutoff_mask);
978 fscal = _mm256_andnot_pd(dummy_mask,fscal);
980 /* Calculate temporary vectorial force */
981 tx = _mm256_mul_pd(fscal,dx00);
982 ty = _mm256_mul_pd(fscal,dy00);
983 tz = _mm256_mul_pd(fscal,dz00);
985 /* Update vectorial force */
986 fix0 = _mm256_add_pd(fix0,tx);
987 fiy0 = _mm256_add_pd(fiy0,ty);
988 fiz0 = _mm256_add_pd(fiz0,tz);
990 fjx0 = _mm256_add_pd(fjx0,tx);
991 fjy0 = _mm256_add_pd(fjy0,ty);
992 fjz0 = _mm256_add_pd(fjz0,tz);
996 /**************************
997 * CALCULATE INTERACTIONS *
998 **************************/
1000 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1003 r11 = _mm256_mul_pd(rsq11,rinv11);
1004 r11 = _mm256_andnot_pd(dummy_mask,r11);
1006 /* EWALD ELECTROSTATICS */
1008 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1009 ewrt = _mm256_mul_pd(r11,ewtabscale);
1010 ewitab = _mm256_cvttpd_epi32(ewrt);
1011 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1012 ewitab = _mm_slli_epi32(ewitab,2);
1013 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1014 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1015 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1016 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1017 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1018 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1019 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1020 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1021 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1023 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec = _mm256_and_pd(velec,cutoff_mask);
1027 velec = _mm256_andnot_pd(dummy_mask,velec);
1028 velecsum = _mm256_add_pd(velecsum,velec);
1032 fscal = _mm256_and_pd(fscal,cutoff_mask);
1034 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1036 /* Calculate temporary vectorial force */
1037 tx = _mm256_mul_pd(fscal,dx11);
1038 ty = _mm256_mul_pd(fscal,dy11);
1039 tz = _mm256_mul_pd(fscal,dz11);
1041 /* Update vectorial force */
1042 fix1 = _mm256_add_pd(fix1,tx);
1043 fiy1 = _mm256_add_pd(fiy1,ty);
1044 fiz1 = _mm256_add_pd(fiz1,tz);
1046 fjx1 = _mm256_add_pd(fjx1,tx);
1047 fjy1 = _mm256_add_pd(fjy1,ty);
1048 fjz1 = _mm256_add_pd(fjz1,tz);
1052 /**************************
1053 * CALCULATE INTERACTIONS *
1054 **************************/
1056 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1059 r12 = _mm256_mul_pd(rsq12,rinv12);
1060 r12 = _mm256_andnot_pd(dummy_mask,r12);
1062 /* EWALD ELECTROSTATICS */
1064 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1065 ewrt = _mm256_mul_pd(r12,ewtabscale);
1066 ewitab = _mm256_cvttpd_epi32(ewrt);
1067 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1068 ewitab = _mm_slli_epi32(ewitab,2);
1069 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1070 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1071 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1072 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1073 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1074 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1075 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1076 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1077 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1079 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1081 /* Update potential sum for this i atom from the interaction with this j atom. */
1082 velec = _mm256_and_pd(velec,cutoff_mask);
1083 velec = _mm256_andnot_pd(dummy_mask,velec);
1084 velecsum = _mm256_add_pd(velecsum,velec);
1088 fscal = _mm256_and_pd(fscal,cutoff_mask);
1090 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1092 /* Calculate temporary vectorial force */
1093 tx = _mm256_mul_pd(fscal,dx12);
1094 ty = _mm256_mul_pd(fscal,dy12);
1095 tz = _mm256_mul_pd(fscal,dz12);
1097 /* Update vectorial force */
1098 fix1 = _mm256_add_pd(fix1,tx);
1099 fiy1 = _mm256_add_pd(fiy1,ty);
1100 fiz1 = _mm256_add_pd(fiz1,tz);
1102 fjx2 = _mm256_add_pd(fjx2,tx);
1103 fjy2 = _mm256_add_pd(fjy2,ty);
1104 fjz2 = _mm256_add_pd(fjz2,tz);
1108 /**************************
1109 * CALCULATE INTERACTIONS *
1110 **************************/
1112 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1115 r13 = _mm256_mul_pd(rsq13,rinv13);
1116 r13 = _mm256_andnot_pd(dummy_mask,r13);
1118 /* EWALD ELECTROSTATICS */
1120 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1121 ewrt = _mm256_mul_pd(r13,ewtabscale);
1122 ewitab = _mm256_cvttpd_epi32(ewrt);
1123 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1124 ewitab = _mm_slli_epi32(ewitab,2);
1125 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1126 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1127 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1128 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1129 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1130 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1131 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1132 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
1133 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1135 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1137 /* Update potential sum for this i atom from the interaction with this j atom. */
1138 velec = _mm256_and_pd(velec,cutoff_mask);
1139 velec = _mm256_andnot_pd(dummy_mask,velec);
1140 velecsum = _mm256_add_pd(velecsum,velec);
1144 fscal = _mm256_and_pd(fscal,cutoff_mask);
1146 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1148 /* Calculate temporary vectorial force */
1149 tx = _mm256_mul_pd(fscal,dx13);
1150 ty = _mm256_mul_pd(fscal,dy13);
1151 tz = _mm256_mul_pd(fscal,dz13);
1153 /* Update vectorial force */
1154 fix1 = _mm256_add_pd(fix1,tx);
1155 fiy1 = _mm256_add_pd(fiy1,ty);
1156 fiz1 = _mm256_add_pd(fiz1,tz);
1158 fjx3 = _mm256_add_pd(fjx3,tx);
1159 fjy3 = _mm256_add_pd(fjy3,ty);
1160 fjz3 = _mm256_add_pd(fjz3,tz);
1164 /**************************
1165 * CALCULATE INTERACTIONS *
1166 **************************/
1168 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1171 r21 = _mm256_mul_pd(rsq21,rinv21);
1172 r21 = _mm256_andnot_pd(dummy_mask,r21);
1174 /* EWALD ELECTROSTATICS */
1176 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1177 ewrt = _mm256_mul_pd(r21,ewtabscale);
1178 ewitab = _mm256_cvttpd_epi32(ewrt);
1179 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1180 ewitab = _mm_slli_epi32(ewitab,2);
1181 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1182 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1183 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1184 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1185 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1186 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1187 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1188 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1189 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1191 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1193 /* Update potential sum for this i atom from the interaction with this j atom. */
1194 velec = _mm256_and_pd(velec,cutoff_mask);
1195 velec = _mm256_andnot_pd(dummy_mask,velec);
1196 velecsum = _mm256_add_pd(velecsum,velec);
1200 fscal = _mm256_and_pd(fscal,cutoff_mask);
1202 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1204 /* Calculate temporary vectorial force */
1205 tx = _mm256_mul_pd(fscal,dx21);
1206 ty = _mm256_mul_pd(fscal,dy21);
1207 tz = _mm256_mul_pd(fscal,dz21);
1209 /* Update vectorial force */
1210 fix2 = _mm256_add_pd(fix2,tx);
1211 fiy2 = _mm256_add_pd(fiy2,ty);
1212 fiz2 = _mm256_add_pd(fiz2,tz);
1214 fjx1 = _mm256_add_pd(fjx1,tx);
1215 fjy1 = _mm256_add_pd(fjy1,ty);
1216 fjz1 = _mm256_add_pd(fjz1,tz);
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1227 r22 = _mm256_mul_pd(rsq22,rinv22);
1228 r22 = _mm256_andnot_pd(dummy_mask,r22);
1230 /* EWALD ELECTROSTATICS */
1232 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1233 ewrt = _mm256_mul_pd(r22,ewtabscale);
1234 ewitab = _mm256_cvttpd_epi32(ewrt);
1235 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1236 ewitab = _mm_slli_epi32(ewitab,2);
1237 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1238 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1239 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1240 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1241 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1242 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1243 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1244 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1245 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1247 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1249 /* Update potential sum for this i atom from the interaction with this j atom. */
1250 velec = _mm256_and_pd(velec,cutoff_mask);
1251 velec = _mm256_andnot_pd(dummy_mask,velec);
1252 velecsum = _mm256_add_pd(velecsum,velec);
1256 fscal = _mm256_and_pd(fscal,cutoff_mask);
1258 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1260 /* Calculate temporary vectorial force */
1261 tx = _mm256_mul_pd(fscal,dx22);
1262 ty = _mm256_mul_pd(fscal,dy22);
1263 tz = _mm256_mul_pd(fscal,dz22);
1265 /* Update vectorial force */
1266 fix2 = _mm256_add_pd(fix2,tx);
1267 fiy2 = _mm256_add_pd(fiy2,ty);
1268 fiz2 = _mm256_add_pd(fiz2,tz);
1270 fjx2 = _mm256_add_pd(fjx2,tx);
1271 fjy2 = _mm256_add_pd(fjy2,ty);
1272 fjz2 = _mm256_add_pd(fjz2,tz);
1276 /**************************
1277 * CALCULATE INTERACTIONS *
1278 **************************/
1280 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1283 r23 = _mm256_mul_pd(rsq23,rinv23);
1284 r23 = _mm256_andnot_pd(dummy_mask,r23);
1286 /* EWALD ELECTROSTATICS */
1288 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1289 ewrt = _mm256_mul_pd(r23,ewtabscale);
1290 ewitab = _mm256_cvttpd_epi32(ewrt);
1291 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1292 ewitab = _mm_slli_epi32(ewitab,2);
1293 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1294 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1295 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1296 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1297 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1298 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1299 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1300 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
1301 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1303 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1305 /* Update potential sum for this i atom from the interaction with this j atom. */
1306 velec = _mm256_and_pd(velec,cutoff_mask);
1307 velec = _mm256_andnot_pd(dummy_mask,velec);
1308 velecsum = _mm256_add_pd(velecsum,velec);
1312 fscal = _mm256_and_pd(fscal,cutoff_mask);
1314 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1316 /* Calculate temporary vectorial force */
1317 tx = _mm256_mul_pd(fscal,dx23);
1318 ty = _mm256_mul_pd(fscal,dy23);
1319 tz = _mm256_mul_pd(fscal,dz23);
1321 /* Update vectorial force */
1322 fix2 = _mm256_add_pd(fix2,tx);
1323 fiy2 = _mm256_add_pd(fiy2,ty);
1324 fiz2 = _mm256_add_pd(fiz2,tz);
1326 fjx3 = _mm256_add_pd(fjx3,tx);
1327 fjy3 = _mm256_add_pd(fjy3,ty);
1328 fjz3 = _mm256_add_pd(fjz3,tz);
1332 /**************************
1333 * CALCULATE INTERACTIONS *
1334 **************************/
1336 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1339 r31 = _mm256_mul_pd(rsq31,rinv31);
1340 r31 = _mm256_andnot_pd(dummy_mask,r31);
1342 /* EWALD ELECTROSTATICS */
1344 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1345 ewrt = _mm256_mul_pd(r31,ewtabscale);
1346 ewitab = _mm256_cvttpd_epi32(ewrt);
1347 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1348 ewitab = _mm_slli_epi32(ewitab,2);
1349 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1350 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1351 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1352 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1353 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1354 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1355 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1356 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
1357 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1359 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1361 /* Update potential sum for this i atom from the interaction with this j atom. */
1362 velec = _mm256_and_pd(velec,cutoff_mask);
1363 velec = _mm256_andnot_pd(dummy_mask,velec);
1364 velecsum = _mm256_add_pd(velecsum,velec);
1368 fscal = _mm256_and_pd(fscal,cutoff_mask);
1370 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1372 /* Calculate temporary vectorial force */
1373 tx = _mm256_mul_pd(fscal,dx31);
1374 ty = _mm256_mul_pd(fscal,dy31);
1375 tz = _mm256_mul_pd(fscal,dz31);
1377 /* Update vectorial force */
1378 fix3 = _mm256_add_pd(fix3,tx);
1379 fiy3 = _mm256_add_pd(fiy3,ty);
1380 fiz3 = _mm256_add_pd(fiz3,tz);
1382 fjx1 = _mm256_add_pd(fjx1,tx);
1383 fjy1 = _mm256_add_pd(fjy1,ty);
1384 fjz1 = _mm256_add_pd(fjz1,tz);
1388 /**************************
1389 * CALCULATE INTERACTIONS *
1390 **************************/
1392 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1395 r32 = _mm256_mul_pd(rsq32,rinv32);
1396 r32 = _mm256_andnot_pd(dummy_mask,r32);
1398 /* EWALD ELECTROSTATICS */
1400 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1401 ewrt = _mm256_mul_pd(r32,ewtabscale);
1402 ewitab = _mm256_cvttpd_epi32(ewrt);
1403 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1404 ewitab = _mm_slli_epi32(ewitab,2);
1405 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1406 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1407 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1408 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1409 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1410 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1411 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1412 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
1413 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1415 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1417 /* Update potential sum for this i atom from the interaction with this j atom. */
1418 velec = _mm256_and_pd(velec,cutoff_mask);
1419 velec = _mm256_andnot_pd(dummy_mask,velec);
1420 velecsum = _mm256_add_pd(velecsum,velec);
1424 fscal = _mm256_and_pd(fscal,cutoff_mask);
1426 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1428 /* Calculate temporary vectorial force */
1429 tx = _mm256_mul_pd(fscal,dx32);
1430 ty = _mm256_mul_pd(fscal,dy32);
1431 tz = _mm256_mul_pd(fscal,dz32);
1433 /* Update vectorial force */
1434 fix3 = _mm256_add_pd(fix3,tx);
1435 fiy3 = _mm256_add_pd(fiy3,ty);
1436 fiz3 = _mm256_add_pd(fiz3,tz);
1438 fjx2 = _mm256_add_pd(fjx2,tx);
1439 fjy2 = _mm256_add_pd(fjy2,ty);
1440 fjz2 = _mm256_add_pd(fjz2,tz);
1444 /**************************
1445 * CALCULATE INTERACTIONS *
1446 **************************/
1448 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1451 r33 = _mm256_mul_pd(rsq33,rinv33);
1452 r33 = _mm256_andnot_pd(dummy_mask,r33);
1454 /* EWALD ELECTROSTATICS */
1456 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1457 ewrt = _mm256_mul_pd(r33,ewtabscale);
1458 ewitab = _mm256_cvttpd_epi32(ewrt);
1459 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1460 ewitab = _mm_slli_epi32(ewitab,2);
1461 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1462 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1463 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1464 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1465 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1466 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1467 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1468 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
1469 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1471 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1473 /* Update potential sum for this i atom from the interaction with this j atom. */
1474 velec = _mm256_and_pd(velec,cutoff_mask);
1475 velec = _mm256_andnot_pd(dummy_mask,velec);
1476 velecsum = _mm256_add_pd(velecsum,velec);
1480 fscal = _mm256_and_pd(fscal,cutoff_mask);
1482 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1484 /* Calculate temporary vectorial force */
1485 tx = _mm256_mul_pd(fscal,dx33);
1486 ty = _mm256_mul_pd(fscal,dy33);
1487 tz = _mm256_mul_pd(fscal,dz33);
1489 /* Update vectorial force */
1490 fix3 = _mm256_add_pd(fix3,tx);
1491 fiy3 = _mm256_add_pd(fiy3,ty);
1492 fiz3 = _mm256_add_pd(fiz3,tz);
1494 fjx3 = _mm256_add_pd(fjx3,tx);
1495 fjy3 = _mm256_add_pd(fjy3,ty);
1496 fjz3 = _mm256_add_pd(fjz3,tz);
1500 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1501 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1502 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1503 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1505 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1506 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1507 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1509 /* Inner loop uses 467 flops */
1512 /* End of innermost loop */
1514 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1515 f+i_coord_offset,fshift+i_shift_offset);
1518 /* Update potential energies */
1519 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1520 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1522 /* Increment number of inner iterations */
1523 inneriter += j_index_end - j_index_start;
1525 /* Outer loop uses 26 flops */
1528 /* Increment number of outer iterations */
1531 /* Update outer/inner flops */
1533 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*467);
1536 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
1537 * Electrostatics interaction: Ewald
1538 * VdW interaction: LennardJones
1539 * Geometry: Water4-Water4
1540 * Calculate force/pot: Force
1543 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_double
1544 (t_nblist * gmx_restrict nlist,
1545 rvec * gmx_restrict xx,
1546 rvec * gmx_restrict ff,
1547 t_forcerec * gmx_restrict fr,
1548 t_mdatoms * gmx_restrict mdatoms,
1549 nb_kernel_data_t * gmx_restrict kernel_data,
1550 t_nrnb * gmx_restrict nrnb)
1552 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1553 * just 0 for non-waters.
1554 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1555 * jnr indices corresponding to data put in the four positions in the SIMD register.
1557 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1558 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1559 int jnrA,jnrB,jnrC,jnrD;
1560 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1561 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1562 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1563 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1564 real rcutoff_scalar;
1565 real *shiftvec,*fshift,*x,*f;
1566 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1567 real scratch[4*DIM];
1568 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1569 real * vdwioffsetptr0;
1570 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1571 real * vdwioffsetptr1;
1572 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1573 real * vdwioffsetptr2;
1574 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1575 real * vdwioffsetptr3;
1576 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1577 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1578 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1579 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1580 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1581 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1582 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1583 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1584 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1585 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1586 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1587 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1588 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1589 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1590 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1591 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1592 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1593 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1594 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1595 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1598 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1601 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1602 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1604 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1605 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1607 __m256d dummy_mask,cutoff_mask;
1608 __m128 tmpmask0,tmpmask1;
1609 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1610 __m256d one = _mm256_set1_pd(1.0);
1611 __m256d two = _mm256_set1_pd(2.0);
1617 jindex = nlist->jindex;
1619 shiftidx = nlist->shift;
1621 shiftvec = fr->shift_vec[0];
1622 fshift = fr->fshift[0];
1623 facel = _mm256_set1_pd(fr->epsfac);
1624 charge = mdatoms->chargeA;
1625 nvdwtype = fr->ntype;
1626 vdwparam = fr->nbfp;
1627 vdwtype = mdatoms->typeA;
1629 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1630 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1631 beta2 = _mm256_mul_pd(beta,beta);
1632 beta3 = _mm256_mul_pd(beta,beta2);
1634 ewtab = fr->ic->tabq_coul_F;
1635 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1636 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1638 /* Setup water-specific parameters */
1639 inr = nlist->iinr[0];
1640 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1641 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1642 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1643 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1645 jq1 = _mm256_set1_pd(charge[inr+1]);
1646 jq2 = _mm256_set1_pd(charge[inr+2]);
1647 jq3 = _mm256_set1_pd(charge[inr+3]);
1648 vdwjidx0A = 2*vdwtype[inr+0];
1649 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1650 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1651 qq11 = _mm256_mul_pd(iq1,jq1);
1652 qq12 = _mm256_mul_pd(iq1,jq2);
1653 qq13 = _mm256_mul_pd(iq1,jq3);
1654 qq21 = _mm256_mul_pd(iq2,jq1);
1655 qq22 = _mm256_mul_pd(iq2,jq2);
1656 qq23 = _mm256_mul_pd(iq2,jq3);
1657 qq31 = _mm256_mul_pd(iq3,jq1);
1658 qq32 = _mm256_mul_pd(iq3,jq2);
1659 qq33 = _mm256_mul_pd(iq3,jq3);
1661 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1662 rcutoff_scalar = fr->rcoulomb;
1663 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1664 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1666 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
1667 rvdw = _mm256_set1_pd(fr->rvdw);
1669 /* Avoid stupid compiler warnings */
1670 jnrA = jnrB = jnrC = jnrD = 0;
1671 j_coord_offsetA = 0;
1672 j_coord_offsetB = 0;
1673 j_coord_offsetC = 0;
1674 j_coord_offsetD = 0;
1679 for(iidx=0;iidx<4*DIM;iidx++)
1681 scratch[iidx] = 0.0;
1684 /* Start outer loop over neighborlists */
1685 for(iidx=0; iidx<nri; iidx++)
1687 /* Load shift vector for this list */
1688 i_shift_offset = DIM*shiftidx[iidx];
1690 /* Load limits for loop over neighbors */
1691 j_index_start = jindex[iidx];
1692 j_index_end = jindex[iidx+1];
1694 /* Get outer coordinate index */
1696 i_coord_offset = DIM*inr;
1698 /* Load i particle coords and add shift vector */
1699 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1700 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1702 fix0 = _mm256_setzero_pd();
1703 fiy0 = _mm256_setzero_pd();
1704 fiz0 = _mm256_setzero_pd();
1705 fix1 = _mm256_setzero_pd();
1706 fiy1 = _mm256_setzero_pd();
1707 fiz1 = _mm256_setzero_pd();
1708 fix2 = _mm256_setzero_pd();
1709 fiy2 = _mm256_setzero_pd();
1710 fiz2 = _mm256_setzero_pd();
1711 fix3 = _mm256_setzero_pd();
1712 fiy3 = _mm256_setzero_pd();
1713 fiz3 = _mm256_setzero_pd();
1715 /* Start inner kernel loop */
1716 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1719 /* Get j neighbor index, and coordinate index */
1721 jnrB = jjnr[jidx+1];
1722 jnrC = jjnr[jidx+2];
1723 jnrD = jjnr[jidx+3];
1724 j_coord_offsetA = DIM*jnrA;
1725 j_coord_offsetB = DIM*jnrB;
1726 j_coord_offsetC = DIM*jnrC;
1727 j_coord_offsetD = DIM*jnrD;
1729 /* load j atom coordinates */
1730 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1731 x+j_coord_offsetC,x+j_coord_offsetD,
1732 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1733 &jy2,&jz2,&jx3,&jy3,&jz3);
1735 /* Calculate displacement vector */
1736 dx00 = _mm256_sub_pd(ix0,jx0);
1737 dy00 = _mm256_sub_pd(iy0,jy0);
1738 dz00 = _mm256_sub_pd(iz0,jz0);
1739 dx11 = _mm256_sub_pd(ix1,jx1);
1740 dy11 = _mm256_sub_pd(iy1,jy1);
1741 dz11 = _mm256_sub_pd(iz1,jz1);
1742 dx12 = _mm256_sub_pd(ix1,jx2);
1743 dy12 = _mm256_sub_pd(iy1,jy2);
1744 dz12 = _mm256_sub_pd(iz1,jz2);
1745 dx13 = _mm256_sub_pd(ix1,jx3);
1746 dy13 = _mm256_sub_pd(iy1,jy3);
1747 dz13 = _mm256_sub_pd(iz1,jz3);
1748 dx21 = _mm256_sub_pd(ix2,jx1);
1749 dy21 = _mm256_sub_pd(iy2,jy1);
1750 dz21 = _mm256_sub_pd(iz2,jz1);
1751 dx22 = _mm256_sub_pd(ix2,jx2);
1752 dy22 = _mm256_sub_pd(iy2,jy2);
1753 dz22 = _mm256_sub_pd(iz2,jz2);
1754 dx23 = _mm256_sub_pd(ix2,jx3);
1755 dy23 = _mm256_sub_pd(iy2,jy3);
1756 dz23 = _mm256_sub_pd(iz2,jz3);
1757 dx31 = _mm256_sub_pd(ix3,jx1);
1758 dy31 = _mm256_sub_pd(iy3,jy1);
1759 dz31 = _mm256_sub_pd(iz3,jz1);
1760 dx32 = _mm256_sub_pd(ix3,jx2);
1761 dy32 = _mm256_sub_pd(iy3,jy2);
1762 dz32 = _mm256_sub_pd(iz3,jz2);
1763 dx33 = _mm256_sub_pd(ix3,jx3);
1764 dy33 = _mm256_sub_pd(iy3,jy3);
1765 dz33 = _mm256_sub_pd(iz3,jz3);
1767 /* Calculate squared distance and things based on it */
1768 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1769 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1770 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1771 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1772 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1773 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1774 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1775 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1776 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1777 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1779 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1780 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1781 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1782 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1783 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1784 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1785 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1786 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1787 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1789 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1790 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1791 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1792 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1793 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1794 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1795 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1796 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1797 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1798 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1800 fjx0 = _mm256_setzero_pd();
1801 fjy0 = _mm256_setzero_pd();
1802 fjz0 = _mm256_setzero_pd();
1803 fjx1 = _mm256_setzero_pd();
1804 fjy1 = _mm256_setzero_pd();
1805 fjz1 = _mm256_setzero_pd();
1806 fjx2 = _mm256_setzero_pd();
1807 fjy2 = _mm256_setzero_pd();
1808 fjz2 = _mm256_setzero_pd();
1809 fjx3 = _mm256_setzero_pd();
1810 fjy3 = _mm256_setzero_pd();
1811 fjz3 = _mm256_setzero_pd();
1813 /**************************
1814 * CALCULATE INTERACTIONS *
1815 **************************/
1817 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1820 /* LENNARD-JONES DISPERSION/REPULSION */
1822 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1823 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1825 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1829 fscal = _mm256_and_pd(fscal,cutoff_mask);
1831 /* Calculate temporary vectorial force */
1832 tx = _mm256_mul_pd(fscal,dx00);
1833 ty = _mm256_mul_pd(fscal,dy00);
1834 tz = _mm256_mul_pd(fscal,dz00);
1836 /* Update vectorial force */
1837 fix0 = _mm256_add_pd(fix0,tx);
1838 fiy0 = _mm256_add_pd(fiy0,ty);
1839 fiz0 = _mm256_add_pd(fiz0,tz);
1841 fjx0 = _mm256_add_pd(fjx0,tx);
1842 fjy0 = _mm256_add_pd(fjy0,ty);
1843 fjz0 = _mm256_add_pd(fjz0,tz);
1847 /**************************
1848 * CALCULATE INTERACTIONS *
1849 **************************/
1851 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1854 r11 = _mm256_mul_pd(rsq11,rinv11);
1856 /* EWALD ELECTROSTATICS */
1858 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1859 ewrt = _mm256_mul_pd(r11,ewtabscale);
1860 ewitab = _mm256_cvttpd_epi32(ewrt);
1861 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1862 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1863 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1865 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1866 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1868 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1872 fscal = _mm256_and_pd(fscal,cutoff_mask);
1874 /* Calculate temporary vectorial force */
1875 tx = _mm256_mul_pd(fscal,dx11);
1876 ty = _mm256_mul_pd(fscal,dy11);
1877 tz = _mm256_mul_pd(fscal,dz11);
1879 /* Update vectorial force */
1880 fix1 = _mm256_add_pd(fix1,tx);
1881 fiy1 = _mm256_add_pd(fiy1,ty);
1882 fiz1 = _mm256_add_pd(fiz1,tz);
1884 fjx1 = _mm256_add_pd(fjx1,tx);
1885 fjy1 = _mm256_add_pd(fjy1,ty);
1886 fjz1 = _mm256_add_pd(fjz1,tz);
1890 /**************************
1891 * CALCULATE INTERACTIONS *
1892 **************************/
1894 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1897 r12 = _mm256_mul_pd(rsq12,rinv12);
1899 /* EWALD ELECTROSTATICS */
1901 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1902 ewrt = _mm256_mul_pd(r12,ewtabscale);
1903 ewitab = _mm256_cvttpd_epi32(ewrt);
1904 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1905 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1906 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1908 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1909 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1911 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1915 fscal = _mm256_and_pd(fscal,cutoff_mask);
1917 /* Calculate temporary vectorial force */
1918 tx = _mm256_mul_pd(fscal,dx12);
1919 ty = _mm256_mul_pd(fscal,dy12);
1920 tz = _mm256_mul_pd(fscal,dz12);
1922 /* Update vectorial force */
1923 fix1 = _mm256_add_pd(fix1,tx);
1924 fiy1 = _mm256_add_pd(fiy1,ty);
1925 fiz1 = _mm256_add_pd(fiz1,tz);
1927 fjx2 = _mm256_add_pd(fjx2,tx);
1928 fjy2 = _mm256_add_pd(fjy2,ty);
1929 fjz2 = _mm256_add_pd(fjz2,tz);
1933 /**************************
1934 * CALCULATE INTERACTIONS *
1935 **************************/
1937 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1940 r13 = _mm256_mul_pd(rsq13,rinv13);
1942 /* EWALD ELECTROSTATICS */
1944 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1945 ewrt = _mm256_mul_pd(r13,ewtabscale);
1946 ewitab = _mm256_cvttpd_epi32(ewrt);
1947 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1948 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1949 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1951 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1952 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1954 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1958 fscal = _mm256_and_pd(fscal,cutoff_mask);
1960 /* Calculate temporary vectorial force */
1961 tx = _mm256_mul_pd(fscal,dx13);
1962 ty = _mm256_mul_pd(fscal,dy13);
1963 tz = _mm256_mul_pd(fscal,dz13);
1965 /* Update vectorial force */
1966 fix1 = _mm256_add_pd(fix1,tx);
1967 fiy1 = _mm256_add_pd(fiy1,ty);
1968 fiz1 = _mm256_add_pd(fiz1,tz);
1970 fjx3 = _mm256_add_pd(fjx3,tx);
1971 fjy3 = _mm256_add_pd(fjy3,ty);
1972 fjz3 = _mm256_add_pd(fjz3,tz);
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1983 r21 = _mm256_mul_pd(rsq21,rinv21);
1985 /* EWALD ELECTROSTATICS */
1987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1988 ewrt = _mm256_mul_pd(r21,ewtabscale);
1989 ewitab = _mm256_cvttpd_epi32(ewrt);
1990 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1991 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1992 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1994 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1995 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1997 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2001 fscal = _mm256_and_pd(fscal,cutoff_mask);
2003 /* Calculate temporary vectorial force */
2004 tx = _mm256_mul_pd(fscal,dx21);
2005 ty = _mm256_mul_pd(fscal,dy21);
2006 tz = _mm256_mul_pd(fscal,dz21);
2008 /* Update vectorial force */
2009 fix2 = _mm256_add_pd(fix2,tx);
2010 fiy2 = _mm256_add_pd(fiy2,ty);
2011 fiz2 = _mm256_add_pd(fiz2,tz);
2013 fjx1 = _mm256_add_pd(fjx1,tx);
2014 fjy1 = _mm256_add_pd(fjy1,ty);
2015 fjz1 = _mm256_add_pd(fjz1,tz);
2019 /**************************
2020 * CALCULATE INTERACTIONS *
2021 **************************/
2023 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2026 r22 = _mm256_mul_pd(rsq22,rinv22);
2028 /* EWALD ELECTROSTATICS */
2030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2031 ewrt = _mm256_mul_pd(r22,ewtabscale);
2032 ewitab = _mm256_cvttpd_epi32(ewrt);
2033 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2034 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2035 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2037 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2038 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2040 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2044 fscal = _mm256_and_pd(fscal,cutoff_mask);
2046 /* Calculate temporary vectorial force */
2047 tx = _mm256_mul_pd(fscal,dx22);
2048 ty = _mm256_mul_pd(fscal,dy22);
2049 tz = _mm256_mul_pd(fscal,dz22);
2051 /* Update vectorial force */
2052 fix2 = _mm256_add_pd(fix2,tx);
2053 fiy2 = _mm256_add_pd(fiy2,ty);
2054 fiz2 = _mm256_add_pd(fiz2,tz);
2056 fjx2 = _mm256_add_pd(fjx2,tx);
2057 fjy2 = _mm256_add_pd(fjy2,ty);
2058 fjz2 = _mm256_add_pd(fjz2,tz);
2062 /**************************
2063 * CALCULATE INTERACTIONS *
2064 **************************/
2066 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2069 r23 = _mm256_mul_pd(rsq23,rinv23);
2071 /* EWALD ELECTROSTATICS */
2073 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2074 ewrt = _mm256_mul_pd(r23,ewtabscale);
2075 ewitab = _mm256_cvttpd_epi32(ewrt);
2076 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2077 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2078 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2080 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2081 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2083 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2087 fscal = _mm256_and_pd(fscal,cutoff_mask);
2089 /* Calculate temporary vectorial force */
2090 tx = _mm256_mul_pd(fscal,dx23);
2091 ty = _mm256_mul_pd(fscal,dy23);
2092 tz = _mm256_mul_pd(fscal,dz23);
2094 /* Update vectorial force */
2095 fix2 = _mm256_add_pd(fix2,tx);
2096 fiy2 = _mm256_add_pd(fiy2,ty);
2097 fiz2 = _mm256_add_pd(fiz2,tz);
2099 fjx3 = _mm256_add_pd(fjx3,tx);
2100 fjy3 = _mm256_add_pd(fjy3,ty);
2101 fjz3 = _mm256_add_pd(fjz3,tz);
2105 /**************************
2106 * CALCULATE INTERACTIONS *
2107 **************************/
2109 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2112 r31 = _mm256_mul_pd(rsq31,rinv31);
2114 /* EWALD ELECTROSTATICS */
2116 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2117 ewrt = _mm256_mul_pd(r31,ewtabscale);
2118 ewitab = _mm256_cvttpd_epi32(ewrt);
2119 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2120 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2121 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2123 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2124 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2126 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2130 fscal = _mm256_and_pd(fscal,cutoff_mask);
2132 /* Calculate temporary vectorial force */
2133 tx = _mm256_mul_pd(fscal,dx31);
2134 ty = _mm256_mul_pd(fscal,dy31);
2135 tz = _mm256_mul_pd(fscal,dz31);
2137 /* Update vectorial force */
2138 fix3 = _mm256_add_pd(fix3,tx);
2139 fiy3 = _mm256_add_pd(fiy3,ty);
2140 fiz3 = _mm256_add_pd(fiz3,tz);
2142 fjx1 = _mm256_add_pd(fjx1,tx);
2143 fjy1 = _mm256_add_pd(fjy1,ty);
2144 fjz1 = _mm256_add_pd(fjz1,tz);
2148 /**************************
2149 * CALCULATE INTERACTIONS *
2150 **************************/
2152 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2155 r32 = _mm256_mul_pd(rsq32,rinv32);
2157 /* EWALD ELECTROSTATICS */
2159 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2160 ewrt = _mm256_mul_pd(r32,ewtabscale);
2161 ewitab = _mm256_cvttpd_epi32(ewrt);
2162 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2163 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2164 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2166 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2167 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2169 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2173 fscal = _mm256_and_pd(fscal,cutoff_mask);
2175 /* Calculate temporary vectorial force */
2176 tx = _mm256_mul_pd(fscal,dx32);
2177 ty = _mm256_mul_pd(fscal,dy32);
2178 tz = _mm256_mul_pd(fscal,dz32);
2180 /* Update vectorial force */
2181 fix3 = _mm256_add_pd(fix3,tx);
2182 fiy3 = _mm256_add_pd(fiy3,ty);
2183 fiz3 = _mm256_add_pd(fiz3,tz);
2185 fjx2 = _mm256_add_pd(fjx2,tx);
2186 fjy2 = _mm256_add_pd(fjy2,ty);
2187 fjz2 = _mm256_add_pd(fjz2,tz);
2191 /**************************
2192 * CALCULATE INTERACTIONS *
2193 **************************/
2195 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2198 r33 = _mm256_mul_pd(rsq33,rinv33);
2200 /* EWALD ELECTROSTATICS */
2202 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2203 ewrt = _mm256_mul_pd(r33,ewtabscale);
2204 ewitab = _mm256_cvttpd_epi32(ewrt);
2205 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2206 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2207 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2209 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2210 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2212 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2216 fscal = _mm256_and_pd(fscal,cutoff_mask);
2218 /* Calculate temporary vectorial force */
2219 tx = _mm256_mul_pd(fscal,dx33);
2220 ty = _mm256_mul_pd(fscal,dy33);
2221 tz = _mm256_mul_pd(fscal,dz33);
2223 /* Update vectorial force */
2224 fix3 = _mm256_add_pd(fix3,tx);
2225 fiy3 = _mm256_add_pd(fiy3,ty);
2226 fiz3 = _mm256_add_pd(fiz3,tz);
2228 fjx3 = _mm256_add_pd(fjx3,tx);
2229 fjy3 = _mm256_add_pd(fjy3,ty);
2230 fjz3 = _mm256_add_pd(fjz3,tz);
2234 fjptrA = f+j_coord_offsetA;
2235 fjptrB = f+j_coord_offsetB;
2236 fjptrC = f+j_coord_offsetC;
2237 fjptrD = f+j_coord_offsetD;
2239 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2240 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2241 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2243 /* Inner loop uses 384 flops */
2246 if(jidx<j_index_end)
2249 /* Get j neighbor index, and coordinate index */
2250 jnrlistA = jjnr[jidx];
2251 jnrlistB = jjnr[jidx+1];
2252 jnrlistC = jjnr[jidx+2];
2253 jnrlistD = jjnr[jidx+3];
2254 /* Sign of each element will be negative for non-real atoms.
2255 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2256 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2258 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2260 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2261 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2262 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2264 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2265 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2266 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2267 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2268 j_coord_offsetA = DIM*jnrA;
2269 j_coord_offsetB = DIM*jnrB;
2270 j_coord_offsetC = DIM*jnrC;
2271 j_coord_offsetD = DIM*jnrD;
2273 /* load j atom coordinates */
2274 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2275 x+j_coord_offsetC,x+j_coord_offsetD,
2276 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2277 &jy2,&jz2,&jx3,&jy3,&jz3);
2279 /* Calculate displacement vector */
2280 dx00 = _mm256_sub_pd(ix0,jx0);
2281 dy00 = _mm256_sub_pd(iy0,jy0);
2282 dz00 = _mm256_sub_pd(iz0,jz0);
2283 dx11 = _mm256_sub_pd(ix1,jx1);
2284 dy11 = _mm256_sub_pd(iy1,jy1);
2285 dz11 = _mm256_sub_pd(iz1,jz1);
2286 dx12 = _mm256_sub_pd(ix1,jx2);
2287 dy12 = _mm256_sub_pd(iy1,jy2);
2288 dz12 = _mm256_sub_pd(iz1,jz2);
2289 dx13 = _mm256_sub_pd(ix1,jx3);
2290 dy13 = _mm256_sub_pd(iy1,jy3);
2291 dz13 = _mm256_sub_pd(iz1,jz3);
2292 dx21 = _mm256_sub_pd(ix2,jx1);
2293 dy21 = _mm256_sub_pd(iy2,jy1);
2294 dz21 = _mm256_sub_pd(iz2,jz1);
2295 dx22 = _mm256_sub_pd(ix2,jx2);
2296 dy22 = _mm256_sub_pd(iy2,jy2);
2297 dz22 = _mm256_sub_pd(iz2,jz2);
2298 dx23 = _mm256_sub_pd(ix2,jx3);
2299 dy23 = _mm256_sub_pd(iy2,jy3);
2300 dz23 = _mm256_sub_pd(iz2,jz3);
2301 dx31 = _mm256_sub_pd(ix3,jx1);
2302 dy31 = _mm256_sub_pd(iy3,jy1);
2303 dz31 = _mm256_sub_pd(iz3,jz1);
2304 dx32 = _mm256_sub_pd(ix3,jx2);
2305 dy32 = _mm256_sub_pd(iy3,jy2);
2306 dz32 = _mm256_sub_pd(iz3,jz2);
2307 dx33 = _mm256_sub_pd(ix3,jx3);
2308 dy33 = _mm256_sub_pd(iy3,jy3);
2309 dz33 = _mm256_sub_pd(iz3,jz3);
2311 /* Calculate squared distance and things based on it */
2312 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2313 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2314 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2315 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2316 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2317 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2318 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2319 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2320 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2321 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2323 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2324 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2325 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2326 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2327 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2328 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2329 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2330 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2331 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2333 rinvsq00 = gmx_mm256_inv_pd(rsq00);
2334 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2335 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2336 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2337 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2338 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2339 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2340 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2341 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2342 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2344 fjx0 = _mm256_setzero_pd();
2345 fjy0 = _mm256_setzero_pd();
2346 fjz0 = _mm256_setzero_pd();
2347 fjx1 = _mm256_setzero_pd();
2348 fjy1 = _mm256_setzero_pd();
2349 fjz1 = _mm256_setzero_pd();
2350 fjx2 = _mm256_setzero_pd();
2351 fjy2 = _mm256_setzero_pd();
2352 fjz2 = _mm256_setzero_pd();
2353 fjx3 = _mm256_setzero_pd();
2354 fjy3 = _mm256_setzero_pd();
2355 fjz3 = _mm256_setzero_pd();
2357 /**************************
2358 * CALCULATE INTERACTIONS *
2359 **************************/
2361 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2364 /* LENNARD-JONES DISPERSION/REPULSION */
2366 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2367 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2369 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2373 fscal = _mm256_and_pd(fscal,cutoff_mask);
2375 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2377 /* Calculate temporary vectorial force */
2378 tx = _mm256_mul_pd(fscal,dx00);
2379 ty = _mm256_mul_pd(fscal,dy00);
2380 tz = _mm256_mul_pd(fscal,dz00);
2382 /* Update vectorial force */
2383 fix0 = _mm256_add_pd(fix0,tx);
2384 fiy0 = _mm256_add_pd(fiy0,ty);
2385 fiz0 = _mm256_add_pd(fiz0,tz);
2387 fjx0 = _mm256_add_pd(fjx0,tx);
2388 fjy0 = _mm256_add_pd(fjy0,ty);
2389 fjz0 = _mm256_add_pd(fjz0,tz);
2393 /**************************
2394 * CALCULATE INTERACTIONS *
2395 **************************/
2397 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2400 r11 = _mm256_mul_pd(rsq11,rinv11);
2401 r11 = _mm256_andnot_pd(dummy_mask,r11);
2403 /* EWALD ELECTROSTATICS */
2405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2406 ewrt = _mm256_mul_pd(r11,ewtabscale);
2407 ewitab = _mm256_cvttpd_epi32(ewrt);
2408 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2409 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2410 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2412 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2413 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2415 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2419 fscal = _mm256_and_pd(fscal,cutoff_mask);
2421 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2423 /* Calculate temporary vectorial force */
2424 tx = _mm256_mul_pd(fscal,dx11);
2425 ty = _mm256_mul_pd(fscal,dy11);
2426 tz = _mm256_mul_pd(fscal,dz11);
2428 /* Update vectorial force */
2429 fix1 = _mm256_add_pd(fix1,tx);
2430 fiy1 = _mm256_add_pd(fiy1,ty);
2431 fiz1 = _mm256_add_pd(fiz1,tz);
2433 fjx1 = _mm256_add_pd(fjx1,tx);
2434 fjy1 = _mm256_add_pd(fjy1,ty);
2435 fjz1 = _mm256_add_pd(fjz1,tz);
2439 /**************************
2440 * CALCULATE INTERACTIONS *
2441 **************************/
2443 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2446 r12 = _mm256_mul_pd(rsq12,rinv12);
2447 r12 = _mm256_andnot_pd(dummy_mask,r12);
2449 /* EWALD ELECTROSTATICS */
2451 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2452 ewrt = _mm256_mul_pd(r12,ewtabscale);
2453 ewitab = _mm256_cvttpd_epi32(ewrt);
2454 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2455 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2456 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2458 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2459 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2461 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2465 fscal = _mm256_and_pd(fscal,cutoff_mask);
2467 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2469 /* Calculate temporary vectorial force */
2470 tx = _mm256_mul_pd(fscal,dx12);
2471 ty = _mm256_mul_pd(fscal,dy12);
2472 tz = _mm256_mul_pd(fscal,dz12);
2474 /* Update vectorial force */
2475 fix1 = _mm256_add_pd(fix1,tx);
2476 fiy1 = _mm256_add_pd(fiy1,ty);
2477 fiz1 = _mm256_add_pd(fiz1,tz);
2479 fjx2 = _mm256_add_pd(fjx2,tx);
2480 fjy2 = _mm256_add_pd(fjy2,ty);
2481 fjz2 = _mm256_add_pd(fjz2,tz);
2485 /**************************
2486 * CALCULATE INTERACTIONS *
2487 **************************/
2489 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2492 r13 = _mm256_mul_pd(rsq13,rinv13);
2493 r13 = _mm256_andnot_pd(dummy_mask,r13);
2495 /* EWALD ELECTROSTATICS */
2497 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2498 ewrt = _mm256_mul_pd(r13,ewtabscale);
2499 ewitab = _mm256_cvttpd_epi32(ewrt);
2500 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2501 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2502 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2504 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2505 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2507 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2511 fscal = _mm256_and_pd(fscal,cutoff_mask);
2513 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2515 /* Calculate temporary vectorial force */
2516 tx = _mm256_mul_pd(fscal,dx13);
2517 ty = _mm256_mul_pd(fscal,dy13);
2518 tz = _mm256_mul_pd(fscal,dz13);
2520 /* Update vectorial force */
2521 fix1 = _mm256_add_pd(fix1,tx);
2522 fiy1 = _mm256_add_pd(fiy1,ty);
2523 fiz1 = _mm256_add_pd(fiz1,tz);
2525 fjx3 = _mm256_add_pd(fjx3,tx);
2526 fjy3 = _mm256_add_pd(fjy3,ty);
2527 fjz3 = _mm256_add_pd(fjz3,tz);
2531 /**************************
2532 * CALCULATE INTERACTIONS *
2533 **************************/
2535 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2538 r21 = _mm256_mul_pd(rsq21,rinv21);
2539 r21 = _mm256_andnot_pd(dummy_mask,r21);
2541 /* EWALD ELECTROSTATICS */
2543 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2544 ewrt = _mm256_mul_pd(r21,ewtabscale);
2545 ewitab = _mm256_cvttpd_epi32(ewrt);
2546 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2547 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2548 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2550 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2551 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2553 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2557 fscal = _mm256_and_pd(fscal,cutoff_mask);
2559 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2561 /* Calculate temporary vectorial force */
2562 tx = _mm256_mul_pd(fscal,dx21);
2563 ty = _mm256_mul_pd(fscal,dy21);
2564 tz = _mm256_mul_pd(fscal,dz21);
2566 /* Update vectorial force */
2567 fix2 = _mm256_add_pd(fix2,tx);
2568 fiy2 = _mm256_add_pd(fiy2,ty);
2569 fiz2 = _mm256_add_pd(fiz2,tz);
2571 fjx1 = _mm256_add_pd(fjx1,tx);
2572 fjy1 = _mm256_add_pd(fjy1,ty);
2573 fjz1 = _mm256_add_pd(fjz1,tz);
2577 /**************************
2578 * CALCULATE INTERACTIONS *
2579 **************************/
2581 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2584 r22 = _mm256_mul_pd(rsq22,rinv22);
2585 r22 = _mm256_andnot_pd(dummy_mask,r22);
2587 /* EWALD ELECTROSTATICS */
2589 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2590 ewrt = _mm256_mul_pd(r22,ewtabscale);
2591 ewitab = _mm256_cvttpd_epi32(ewrt);
2592 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2593 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2594 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2596 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2597 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2599 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2603 fscal = _mm256_and_pd(fscal,cutoff_mask);
2605 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2607 /* Calculate temporary vectorial force */
2608 tx = _mm256_mul_pd(fscal,dx22);
2609 ty = _mm256_mul_pd(fscal,dy22);
2610 tz = _mm256_mul_pd(fscal,dz22);
2612 /* Update vectorial force */
2613 fix2 = _mm256_add_pd(fix2,tx);
2614 fiy2 = _mm256_add_pd(fiy2,ty);
2615 fiz2 = _mm256_add_pd(fiz2,tz);
2617 fjx2 = _mm256_add_pd(fjx2,tx);
2618 fjy2 = _mm256_add_pd(fjy2,ty);
2619 fjz2 = _mm256_add_pd(fjz2,tz);
2623 /**************************
2624 * CALCULATE INTERACTIONS *
2625 **************************/
2627 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2630 r23 = _mm256_mul_pd(rsq23,rinv23);
2631 r23 = _mm256_andnot_pd(dummy_mask,r23);
2633 /* EWALD ELECTROSTATICS */
2635 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2636 ewrt = _mm256_mul_pd(r23,ewtabscale);
2637 ewitab = _mm256_cvttpd_epi32(ewrt);
2638 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2639 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2640 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2642 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2643 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2645 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2649 fscal = _mm256_and_pd(fscal,cutoff_mask);
2651 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2653 /* Calculate temporary vectorial force */
2654 tx = _mm256_mul_pd(fscal,dx23);
2655 ty = _mm256_mul_pd(fscal,dy23);
2656 tz = _mm256_mul_pd(fscal,dz23);
2658 /* Update vectorial force */
2659 fix2 = _mm256_add_pd(fix2,tx);
2660 fiy2 = _mm256_add_pd(fiy2,ty);
2661 fiz2 = _mm256_add_pd(fiz2,tz);
2663 fjx3 = _mm256_add_pd(fjx3,tx);
2664 fjy3 = _mm256_add_pd(fjy3,ty);
2665 fjz3 = _mm256_add_pd(fjz3,tz);
2669 /**************************
2670 * CALCULATE INTERACTIONS *
2671 **************************/
2673 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2676 r31 = _mm256_mul_pd(rsq31,rinv31);
2677 r31 = _mm256_andnot_pd(dummy_mask,r31);
2679 /* EWALD ELECTROSTATICS */
2681 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2682 ewrt = _mm256_mul_pd(r31,ewtabscale);
2683 ewitab = _mm256_cvttpd_epi32(ewrt);
2684 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2685 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2686 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2688 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2689 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2691 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2695 fscal = _mm256_and_pd(fscal,cutoff_mask);
2697 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2699 /* Calculate temporary vectorial force */
2700 tx = _mm256_mul_pd(fscal,dx31);
2701 ty = _mm256_mul_pd(fscal,dy31);
2702 tz = _mm256_mul_pd(fscal,dz31);
2704 /* Update vectorial force */
2705 fix3 = _mm256_add_pd(fix3,tx);
2706 fiy3 = _mm256_add_pd(fiy3,ty);
2707 fiz3 = _mm256_add_pd(fiz3,tz);
2709 fjx1 = _mm256_add_pd(fjx1,tx);
2710 fjy1 = _mm256_add_pd(fjy1,ty);
2711 fjz1 = _mm256_add_pd(fjz1,tz);
2715 /**************************
2716 * CALCULATE INTERACTIONS *
2717 **************************/
2719 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2722 r32 = _mm256_mul_pd(rsq32,rinv32);
2723 r32 = _mm256_andnot_pd(dummy_mask,r32);
2725 /* EWALD ELECTROSTATICS */
2727 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2728 ewrt = _mm256_mul_pd(r32,ewtabscale);
2729 ewitab = _mm256_cvttpd_epi32(ewrt);
2730 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2731 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2732 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2734 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2735 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2737 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2741 fscal = _mm256_and_pd(fscal,cutoff_mask);
2743 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2745 /* Calculate temporary vectorial force */
2746 tx = _mm256_mul_pd(fscal,dx32);
2747 ty = _mm256_mul_pd(fscal,dy32);
2748 tz = _mm256_mul_pd(fscal,dz32);
2750 /* Update vectorial force */
2751 fix3 = _mm256_add_pd(fix3,tx);
2752 fiy3 = _mm256_add_pd(fiy3,ty);
2753 fiz3 = _mm256_add_pd(fiz3,tz);
2755 fjx2 = _mm256_add_pd(fjx2,tx);
2756 fjy2 = _mm256_add_pd(fjy2,ty);
2757 fjz2 = _mm256_add_pd(fjz2,tz);
2761 /**************************
2762 * CALCULATE INTERACTIONS *
2763 **************************/
2765 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2768 r33 = _mm256_mul_pd(rsq33,rinv33);
2769 r33 = _mm256_andnot_pd(dummy_mask,r33);
2771 /* EWALD ELECTROSTATICS */
2773 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2774 ewrt = _mm256_mul_pd(r33,ewtabscale);
2775 ewitab = _mm256_cvttpd_epi32(ewrt);
2776 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2777 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2778 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2780 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2781 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2783 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2787 fscal = _mm256_and_pd(fscal,cutoff_mask);
2789 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2791 /* Calculate temporary vectorial force */
2792 tx = _mm256_mul_pd(fscal,dx33);
2793 ty = _mm256_mul_pd(fscal,dy33);
2794 tz = _mm256_mul_pd(fscal,dz33);
2796 /* Update vectorial force */
2797 fix3 = _mm256_add_pd(fix3,tx);
2798 fiy3 = _mm256_add_pd(fiy3,ty);
2799 fiz3 = _mm256_add_pd(fiz3,tz);
2801 fjx3 = _mm256_add_pd(fjx3,tx);
2802 fjy3 = _mm256_add_pd(fjy3,ty);
2803 fjz3 = _mm256_add_pd(fjz3,tz);
2807 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2808 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2809 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2810 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2812 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2813 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2814 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2816 /* Inner loop uses 393 flops */
2819 /* End of innermost loop */
2821 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2822 f+i_coord_offset,fshift+i_shift_offset);
2824 /* Increment number of inner iterations */
2825 inneriter += j_index_end - j_index_start;
2827 /* Outer loop uses 24 flops */
2830 /* Increment number of outer iterations */
2833 /* Update outer/inner flops */
2835 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*393);