2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 real * vdwioffsetptr1;
86 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
107 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
111 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
113 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
116 __m256d dummy_mask,cutoff_mask;
117 __m128 tmpmask0,tmpmask1;
118 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
119 __m256d one = _mm256_set1_pd(1.0);
120 __m256d two = _mm256_set1_pd(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm256_set1_pd(fr->ic->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
136 vdwtype = mdatoms->typeA;
138 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
139 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
140 beta2 = _mm256_mul_pd(beta,beta);
141 beta3 = _mm256_mul_pd(beta,beta2);
143 ewtab = fr->ic->tabq_coul_FDV0;
144 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
145 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
147 /* Setup water-specific parameters */
148 inr = nlist->iinr[0];
149 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
150 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
151 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
152 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
154 jq0 = _mm256_set1_pd(charge[inr+0]);
155 jq1 = _mm256_set1_pd(charge[inr+1]);
156 jq2 = _mm256_set1_pd(charge[inr+2]);
157 vdwjidx0A = 2*vdwtype[inr+0];
158 qq00 = _mm256_mul_pd(iq0,jq0);
159 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
160 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
161 qq01 = _mm256_mul_pd(iq0,jq1);
162 qq02 = _mm256_mul_pd(iq0,jq2);
163 qq10 = _mm256_mul_pd(iq1,jq0);
164 qq11 = _mm256_mul_pd(iq1,jq1);
165 qq12 = _mm256_mul_pd(iq1,jq2);
166 qq20 = _mm256_mul_pd(iq2,jq0);
167 qq21 = _mm256_mul_pd(iq2,jq1);
168 qq22 = _mm256_mul_pd(iq2,jq2);
170 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
171 rcutoff_scalar = fr->ic->rcoulomb;
172 rcutoff = _mm256_set1_pd(rcutoff_scalar);
173 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
175 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
176 rvdw = _mm256_set1_pd(fr->ic->rvdw);
178 /* Avoid stupid compiler warnings */
179 jnrA = jnrB = jnrC = jnrD = 0;
188 for(iidx=0;iidx<4*DIM;iidx++)
193 /* Start outer loop over neighborlists */
194 for(iidx=0; iidx<nri; iidx++)
196 /* Load shift vector for this list */
197 i_shift_offset = DIM*shiftidx[iidx];
199 /* Load limits for loop over neighbors */
200 j_index_start = jindex[iidx];
201 j_index_end = jindex[iidx+1];
203 /* Get outer coordinate index */
205 i_coord_offset = DIM*inr;
207 /* Load i particle coords and add shift vector */
208 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
209 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
211 fix0 = _mm256_setzero_pd();
212 fiy0 = _mm256_setzero_pd();
213 fiz0 = _mm256_setzero_pd();
214 fix1 = _mm256_setzero_pd();
215 fiy1 = _mm256_setzero_pd();
216 fiz1 = _mm256_setzero_pd();
217 fix2 = _mm256_setzero_pd();
218 fiy2 = _mm256_setzero_pd();
219 fiz2 = _mm256_setzero_pd();
221 /* Reset potential sums */
222 velecsum = _mm256_setzero_pd();
223 vvdwsum = _mm256_setzero_pd();
225 /* Start inner kernel loop */
226 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
229 /* Get j neighbor index, and coordinate index */
234 j_coord_offsetA = DIM*jnrA;
235 j_coord_offsetB = DIM*jnrB;
236 j_coord_offsetC = DIM*jnrC;
237 j_coord_offsetD = DIM*jnrD;
239 /* load j atom coordinates */
240 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
241 x+j_coord_offsetC,x+j_coord_offsetD,
242 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
244 /* Calculate displacement vector */
245 dx00 = _mm256_sub_pd(ix0,jx0);
246 dy00 = _mm256_sub_pd(iy0,jy0);
247 dz00 = _mm256_sub_pd(iz0,jz0);
248 dx01 = _mm256_sub_pd(ix0,jx1);
249 dy01 = _mm256_sub_pd(iy0,jy1);
250 dz01 = _mm256_sub_pd(iz0,jz1);
251 dx02 = _mm256_sub_pd(ix0,jx2);
252 dy02 = _mm256_sub_pd(iy0,jy2);
253 dz02 = _mm256_sub_pd(iz0,jz2);
254 dx10 = _mm256_sub_pd(ix1,jx0);
255 dy10 = _mm256_sub_pd(iy1,jy0);
256 dz10 = _mm256_sub_pd(iz1,jz0);
257 dx11 = _mm256_sub_pd(ix1,jx1);
258 dy11 = _mm256_sub_pd(iy1,jy1);
259 dz11 = _mm256_sub_pd(iz1,jz1);
260 dx12 = _mm256_sub_pd(ix1,jx2);
261 dy12 = _mm256_sub_pd(iy1,jy2);
262 dz12 = _mm256_sub_pd(iz1,jz2);
263 dx20 = _mm256_sub_pd(ix2,jx0);
264 dy20 = _mm256_sub_pd(iy2,jy0);
265 dz20 = _mm256_sub_pd(iz2,jz0);
266 dx21 = _mm256_sub_pd(ix2,jx1);
267 dy21 = _mm256_sub_pd(iy2,jy1);
268 dz21 = _mm256_sub_pd(iz2,jz1);
269 dx22 = _mm256_sub_pd(ix2,jx2);
270 dy22 = _mm256_sub_pd(iy2,jy2);
271 dz22 = _mm256_sub_pd(iz2,jz2);
273 /* Calculate squared distance and things based on it */
274 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
275 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
276 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
277 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
278 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
279 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
280 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
281 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
282 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
284 rinv00 = avx256_invsqrt_d(rsq00);
285 rinv01 = avx256_invsqrt_d(rsq01);
286 rinv02 = avx256_invsqrt_d(rsq02);
287 rinv10 = avx256_invsqrt_d(rsq10);
288 rinv11 = avx256_invsqrt_d(rsq11);
289 rinv12 = avx256_invsqrt_d(rsq12);
290 rinv20 = avx256_invsqrt_d(rsq20);
291 rinv21 = avx256_invsqrt_d(rsq21);
292 rinv22 = avx256_invsqrt_d(rsq22);
294 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
295 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
296 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
297 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
298 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
299 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
300 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
301 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
302 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
304 fjx0 = _mm256_setzero_pd();
305 fjy0 = _mm256_setzero_pd();
306 fjz0 = _mm256_setzero_pd();
307 fjx1 = _mm256_setzero_pd();
308 fjy1 = _mm256_setzero_pd();
309 fjz1 = _mm256_setzero_pd();
310 fjx2 = _mm256_setzero_pd();
311 fjy2 = _mm256_setzero_pd();
312 fjz2 = _mm256_setzero_pd();
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 if (gmx_mm256_any_lt(rsq00,rcutoff2))
321 r00 = _mm256_mul_pd(rsq00,rinv00);
323 /* EWALD ELECTROSTATICS */
325 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
326 ewrt = _mm256_mul_pd(r00,ewtabscale);
327 ewitab = _mm256_cvttpd_epi32(ewrt);
328 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
329 ewitab = _mm_slli_epi32(ewitab,2);
330 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
331 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
332 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
333 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
334 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
335 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
336 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
337 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
338 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
340 /* LENNARD-JONES DISPERSION/REPULSION */
342 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
343 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
344 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
345 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
346 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
347 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
349 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velec = _mm256_and_pd(velec,cutoff_mask);
353 velecsum = _mm256_add_pd(velecsum,velec);
354 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
355 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
357 fscal = _mm256_add_pd(felec,fvdw);
359 fscal = _mm256_and_pd(fscal,cutoff_mask);
361 /* Calculate temporary vectorial force */
362 tx = _mm256_mul_pd(fscal,dx00);
363 ty = _mm256_mul_pd(fscal,dy00);
364 tz = _mm256_mul_pd(fscal,dz00);
366 /* Update vectorial force */
367 fix0 = _mm256_add_pd(fix0,tx);
368 fiy0 = _mm256_add_pd(fiy0,ty);
369 fiz0 = _mm256_add_pd(fiz0,tz);
371 fjx0 = _mm256_add_pd(fjx0,tx);
372 fjy0 = _mm256_add_pd(fjy0,ty);
373 fjz0 = _mm256_add_pd(fjz0,tz);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 if (gmx_mm256_any_lt(rsq01,rcutoff2))
384 r01 = _mm256_mul_pd(rsq01,rinv01);
386 /* EWALD ELECTROSTATICS */
388 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
389 ewrt = _mm256_mul_pd(r01,ewtabscale);
390 ewitab = _mm256_cvttpd_epi32(ewrt);
391 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
392 ewitab = _mm_slli_epi32(ewitab,2);
393 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
394 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
395 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
396 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
397 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
398 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
399 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
400 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
401 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
403 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velec = _mm256_and_pd(velec,cutoff_mask);
407 velecsum = _mm256_add_pd(velecsum,velec);
411 fscal = _mm256_and_pd(fscal,cutoff_mask);
413 /* Calculate temporary vectorial force */
414 tx = _mm256_mul_pd(fscal,dx01);
415 ty = _mm256_mul_pd(fscal,dy01);
416 tz = _mm256_mul_pd(fscal,dz01);
418 /* Update vectorial force */
419 fix0 = _mm256_add_pd(fix0,tx);
420 fiy0 = _mm256_add_pd(fiy0,ty);
421 fiz0 = _mm256_add_pd(fiz0,tz);
423 fjx1 = _mm256_add_pd(fjx1,tx);
424 fjy1 = _mm256_add_pd(fjy1,ty);
425 fjz1 = _mm256_add_pd(fjz1,tz);
429 /**************************
430 * CALCULATE INTERACTIONS *
431 **************************/
433 if (gmx_mm256_any_lt(rsq02,rcutoff2))
436 r02 = _mm256_mul_pd(rsq02,rinv02);
438 /* EWALD ELECTROSTATICS */
440 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
441 ewrt = _mm256_mul_pd(r02,ewtabscale);
442 ewitab = _mm256_cvttpd_epi32(ewrt);
443 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
444 ewitab = _mm_slli_epi32(ewitab,2);
445 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
446 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
447 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
448 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
449 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
450 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
451 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
452 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
453 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
455 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velec = _mm256_and_pd(velec,cutoff_mask);
459 velecsum = _mm256_add_pd(velecsum,velec);
463 fscal = _mm256_and_pd(fscal,cutoff_mask);
465 /* Calculate temporary vectorial force */
466 tx = _mm256_mul_pd(fscal,dx02);
467 ty = _mm256_mul_pd(fscal,dy02);
468 tz = _mm256_mul_pd(fscal,dz02);
470 /* Update vectorial force */
471 fix0 = _mm256_add_pd(fix0,tx);
472 fiy0 = _mm256_add_pd(fiy0,ty);
473 fiz0 = _mm256_add_pd(fiz0,tz);
475 fjx2 = _mm256_add_pd(fjx2,tx);
476 fjy2 = _mm256_add_pd(fjy2,ty);
477 fjz2 = _mm256_add_pd(fjz2,tz);
481 /**************************
482 * CALCULATE INTERACTIONS *
483 **************************/
485 if (gmx_mm256_any_lt(rsq10,rcutoff2))
488 r10 = _mm256_mul_pd(rsq10,rinv10);
490 /* EWALD ELECTROSTATICS */
492 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
493 ewrt = _mm256_mul_pd(r10,ewtabscale);
494 ewitab = _mm256_cvttpd_epi32(ewrt);
495 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
496 ewitab = _mm_slli_epi32(ewitab,2);
497 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
498 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
499 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
500 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
501 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
502 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
503 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
504 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
505 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
507 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velec = _mm256_and_pd(velec,cutoff_mask);
511 velecsum = _mm256_add_pd(velecsum,velec);
515 fscal = _mm256_and_pd(fscal,cutoff_mask);
517 /* Calculate temporary vectorial force */
518 tx = _mm256_mul_pd(fscal,dx10);
519 ty = _mm256_mul_pd(fscal,dy10);
520 tz = _mm256_mul_pd(fscal,dz10);
522 /* Update vectorial force */
523 fix1 = _mm256_add_pd(fix1,tx);
524 fiy1 = _mm256_add_pd(fiy1,ty);
525 fiz1 = _mm256_add_pd(fiz1,tz);
527 fjx0 = _mm256_add_pd(fjx0,tx);
528 fjy0 = _mm256_add_pd(fjy0,ty);
529 fjz0 = _mm256_add_pd(fjz0,tz);
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 if (gmx_mm256_any_lt(rsq11,rcutoff2))
540 r11 = _mm256_mul_pd(rsq11,rinv11);
542 /* EWALD ELECTROSTATICS */
544 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
545 ewrt = _mm256_mul_pd(r11,ewtabscale);
546 ewitab = _mm256_cvttpd_epi32(ewrt);
547 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
548 ewitab = _mm_slli_epi32(ewitab,2);
549 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
550 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
551 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
552 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
553 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
554 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
555 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
556 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
557 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
559 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
561 /* Update potential sum for this i atom from the interaction with this j atom. */
562 velec = _mm256_and_pd(velec,cutoff_mask);
563 velecsum = _mm256_add_pd(velecsum,velec);
567 fscal = _mm256_and_pd(fscal,cutoff_mask);
569 /* Calculate temporary vectorial force */
570 tx = _mm256_mul_pd(fscal,dx11);
571 ty = _mm256_mul_pd(fscal,dy11);
572 tz = _mm256_mul_pd(fscal,dz11);
574 /* Update vectorial force */
575 fix1 = _mm256_add_pd(fix1,tx);
576 fiy1 = _mm256_add_pd(fiy1,ty);
577 fiz1 = _mm256_add_pd(fiz1,tz);
579 fjx1 = _mm256_add_pd(fjx1,tx);
580 fjy1 = _mm256_add_pd(fjy1,ty);
581 fjz1 = _mm256_add_pd(fjz1,tz);
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
589 if (gmx_mm256_any_lt(rsq12,rcutoff2))
592 r12 = _mm256_mul_pd(rsq12,rinv12);
594 /* EWALD ELECTROSTATICS */
596 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
597 ewrt = _mm256_mul_pd(r12,ewtabscale);
598 ewitab = _mm256_cvttpd_epi32(ewrt);
599 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
600 ewitab = _mm_slli_epi32(ewitab,2);
601 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
602 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
603 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
604 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
605 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
606 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
607 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
608 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
609 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
611 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
613 /* Update potential sum for this i atom from the interaction with this j atom. */
614 velec = _mm256_and_pd(velec,cutoff_mask);
615 velecsum = _mm256_add_pd(velecsum,velec);
619 fscal = _mm256_and_pd(fscal,cutoff_mask);
621 /* Calculate temporary vectorial force */
622 tx = _mm256_mul_pd(fscal,dx12);
623 ty = _mm256_mul_pd(fscal,dy12);
624 tz = _mm256_mul_pd(fscal,dz12);
626 /* Update vectorial force */
627 fix1 = _mm256_add_pd(fix1,tx);
628 fiy1 = _mm256_add_pd(fiy1,ty);
629 fiz1 = _mm256_add_pd(fiz1,tz);
631 fjx2 = _mm256_add_pd(fjx2,tx);
632 fjy2 = _mm256_add_pd(fjy2,ty);
633 fjz2 = _mm256_add_pd(fjz2,tz);
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 if (gmx_mm256_any_lt(rsq20,rcutoff2))
644 r20 = _mm256_mul_pd(rsq20,rinv20);
646 /* EWALD ELECTROSTATICS */
648 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
649 ewrt = _mm256_mul_pd(r20,ewtabscale);
650 ewitab = _mm256_cvttpd_epi32(ewrt);
651 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
652 ewitab = _mm_slli_epi32(ewitab,2);
653 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
654 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
655 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
656 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
657 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
658 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
659 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
660 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
661 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
663 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
665 /* Update potential sum for this i atom from the interaction with this j atom. */
666 velec = _mm256_and_pd(velec,cutoff_mask);
667 velecsum = _mm256_add_pd(velecsum,velec);
671 fscal = _mm256_and_pd(fscal,cutoff_mask);
673 /* Calculate temporary vectorial force */
674 tx = _mm256_mul_pd(fscal,dx20);
675 ty = _mm256_mul_pd(fscal,dy20);
676 tz = _mm256_mul_pd(fscal,dz20);
678 /* Update vectorial force */
679 fix2 = _mm256_add_pd(fix2,tx);
680 fiy2 = _mm256_add_pd(fiy2,ty);
681 fiz2 = _mm256_add_pd(fiz2,tz);
683 fjx0 = _mm256_add_pd(fjx0,tx);
684 fjy0 = _mm256_add_pd(fjy0,ty);
685 fjz0 = _mm256_add_pd(fjz0,tz);
689 /**************************
690 * CALCULATE INTERACTIONS *
691 **************************/
693 if (gmx_mm256_any_lt(rsq21,rcutoff2))
696 r21 = _mm256_mul_pd(rsq21,rinv21);
698 /* EWALD ELECTROSTATICS */
700 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
701 ewrt = _mm256_mul_pd(r21,ewtabscale);
702 ewitab = _mm256_cvttpd_epi32(ewrt);
703 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
704 ewitab = _mm_slli_epi32(ewitab,2);
705 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
706 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
707 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
708 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
709 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
710 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
711 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
712 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
713 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
715 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
717 /* Update potential sum for this i atom from the interaction with this j atom. */
718 velec = _mm256_and_pd(velec,cutoff_mask);
719 velecsum = _mm256_add_pd(velecsum,velec);
723 fscal = _mm256_and_pd(fscal,cutoff_mask);
725 /* Calculate temporary vectorial force */
726 tx = _mm256_mul_pd(fscal,dx21);
727 ty = _mm256_mul_pd(fscal,dy21);
728 tz = _mm256_mul_pd(fscal,dz21);
730 /* Update vectorial force */
731 fix2 = _mm256_add_pd(fix2,tx);
732 fiy2 = _mm256_add_pd(fiy2,ty);
733 fiz2 = _mm256_add_pd(fiz2,tz);
735 fjx1 = _mm256_add_pd(fjx1,tx);
736 fjy1 = _mm256_add_pd(fjy1,ty);
737 fjz1 = _mm256_add_pd(fjz1,tz);
741 /**************************
742 * CALCULATE INTERACTIONS *
743 **************************/
745 if (gmx_mm256_any_lt(rsq22,rcutoff2))
748 r22 = _mm256_mul_pd(rsq22,rinv22);
750 /* EWALD ELECTROSTATICS */
752 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
753 ewrt = _mm256_mul_pd(r22,ewtabscale);
754 ewitab = _mm256_cvttpd_epi32(ewrt);
755 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
756 ewitab = _mm_slli_epi32(ewitab,2);
757 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
758 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
759 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
760 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
761 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
762 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
763 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
764 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
765 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
767 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
769 /* Update potential sum for this i atom from the interaction with this j atom. */
770 velec = _mm256_and_pd(velec,cutoff_mask);
771 velecsum = _mm256_add_pd(velecsum,velec);
775 fscal = _mm256_and_pd(fscal,cutoff_mask);
777 /* Calculate temporary vectorial force */
778 tx = _mm256_mul_pd(fscal,dx22);
779 ty = _mm256_mul_pd(fscal,dy22);
780 tz = _mm256_mul_pd(fscal,dz22);
782 /* Update vectorial force */
783 fix2 = _mm256_add_pd(fix2,tx);
784 fiy2 = _mm256_add_pd(fiy2,ty);
785 fiz2 = _mm256_add_pd(fiz2,tz);
787 fjx2 = _mm256_add_pd(fjx2,tx);
788 fjy2 = _mm256_add_pd(fjy2,ty);
789 fjz2 = _mm256_add_pd(fjz2,tz);
793 fjptrA = f+j_coord_offsetA;
794 fjptrB = f+j_coord_offsetB;
795 fjptrC = f+j_coord_offsetC;
796 fjptrD = f+j_coord_offsetD;
798 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
799 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
801 /* Inner loop uses 432 flops */
807 /* Get j neighbor index, and coordinate index */
808 jnrlistA = jjnr[jidx];
809 jnrlistB = jjnr[jidx+1];
810 jnrlistC = jjnr[jidx+2];
811 jnrlistD = jjnr[jidx+3];
812 /* Sign of each element will be negative for non-real atoms.
813 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
814 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
816 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
818 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
819 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
820 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
822 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
823 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
824 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
825 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
826 j_coord_offsetA = DIM*jnrA;
827 j_coord_offsetB = DIM*jnrB;
828 j_coord_offsetC = DIM*jnrC;
829 j_coord_offsetD = DIM*jnrD;
831 /* load j atom coordinates */
832 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
833 x+j_coord_offsetC,x+j_coord_offsetD,
834 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
836 /* Calculate displacement vector */
837 dx00 = _mm256_sub_pd(ix0,jx0);
838 dy00 = _mm256_sub_pd(iy0,jy0);
839 dz00 = _mm256_sub_pd(iz0,jz0);
840 dx01 = _mm256_sub_pd(ix0,jx1);
841 dy01 = _mm256_sub_pd(iy0,jy1);
842 dz01 = _mm256_sub_pd(iz0,jz1);
843 dx02 = _mm256_sub_pd(ix0,jx2);
844 dy02 = _mm256_sub_pd(iy0,jy2);
845 dz02 = _mm256_sub_pd(iz0,jz2);
846 dx10 = _mm256_sub_pd(ix1,jx0);
847 dy10 = _mm256_sub_pd(iy1,jy0);
848 dz10 = _mm256_sub_pd(iz1,jz0);
849 dx11 = _mm256_sub_pd(ix1,jx1);
850 dy11 = _mm256_sub_pd(iy1,jy1);
851 dz11 = _mm256_sub_pd(iz1,jz1);
852 dx12 = _mm256_sub_pd(ix1,jx2);
853 dy12 = _mm256_sub_pd(iy1,jy2);
854 dz12 = _mm256_sub_pd(iz1,jz2);
855 dx20 = _mm256_sub_pd(ix2,jx0);
856 dy20 = _mm256_sub_pd(iy2,jy0);
857 dz20 = _mm256_sub_pd(iz2,jz0);
858 dx21 = _mm256_sub_pd(ix2,jx1);
859 dy21 = _mm256_sub_pd(iy2,jy1);
860 dz21 = _mm256_sub_pd(iz2,jz1);
861 dx22 = _mm256_sub_pd(ix2,jx2);
862 dy22 = _mm256_sub_pd(iy2,jy2);
863 dz22 = _mm256_sub_pd(iz2,jz2);
865 /* Calculate squared distance and things based on it */
866 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
867 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
868 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
869 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
870 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
871 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
872 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
873 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
874 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
876 rinv00 = avx256_invsqrt_d(rsq00);
877 rinv01 = avx256_invsqrt_d(rsq01);
878 rinv02 = avx256_invsqrt_d(rsq02);
879 rinv10 = avx256_invsqrt_d(rsq10);
880 rinv11 = avx256_invsqrt_d(rsq11);
881 rinv12 = avx256_invsqrt_d(rsq12);
882 rinv20 = avx256_invsqrt_d(rsq20);
883 rinv21 = avx256_invsqrt_d(rsq21);
884 rinv22 = avx256_invsqrt_d(rsq22);
886 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
887 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
888 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
889 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
890 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
891 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
892 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
893 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
894 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
896 fjx0 = _mm256_setzero_pd();
897 fjy0 = _mm256_setzero_pd();
898 fjz0 = _mm256_setzero_pd();
899 fjx1 = _mm256_setzero_pd();
900 fjy1 = _mm256_setzero_pd();
901 fjz1 = _mm256_setzero_pd();
902 fjx2 = _mm256_setzero_pd();
903 fjy2 = _mm256_setzero_pd();
904 fjz2 = _mm256_setzero_pd();
906 /**************************
907 * CALCULATE INTERACTIONS *
908 **************************/
910 if (gmx_mm256_any_lt(rsq00,rcutoff2))
913 r00 = _mm256_mul_pd(rsq00,rinv00);
914 r00 = _mm256_andnot_pd(dummy_mask,r00);
916 /* EWALD ELECTROSTATICS */
918 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
919 ewrt = _mm256_mul_pd(r00,ewtabscale);
920 ewitab = _mm256_cvttpd_epi32(ewrt);
921 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
922 ewitab = _mm_slli_epi32(ewitab,2);
923 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
924 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
925 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
926 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
927 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
928 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
929 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
930 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
931 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
933 /* LENNARD-JONES DISPERSION/REPULSION */
935 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
936 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
937 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
938 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
939 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
940 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
942 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
944 /* Update potential sum for this i atom from the interaction with this j atom. */
945 velec = _mm256_and_pd(velec,cutoff_mask);
946 velec = _mm256_andnot_pd(dummy_mask,velec);
947 velecsum = _mm256_add_pd(velecsum,velec);
948 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
949 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
950 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
952 fscal = _mm256_add_pd(felec,fvdw);
954 fscal = _mm256_and_pd(fscal,cutoff_mask);
956 fscal = _mm256_andnot_pd(dummy_mask,fscal);
958 /* Calculate temporary vectorial force */
959 tx = _mm256_mul_pd(fscal,dx00);
960 ty = _mm256_mul_pd(fscal,dy00);
961 tz = _mm256_mul_pd(fscal,dz00);
963 /* Update vectorial force */
964 fix0 = _mm256_add_pd(fix0,tx);
965 fiy0 = _mm256_add_pd(fiy0,ty);
966 fiz0 = _mm256_add_pd(fiz0,tz);
968 fjx0 = _mm256_add_pd(fjx0,tx);
969 fjy0 = _mm256_add_pd(fjy0,ty);
970 fjz0 = _mm256_add_pd(fjz0,tz);
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 if (gmx_mm256_any_lt(rsq01,rcutoff2))
981 r01 = _mm256_mul_pd(rsq01,rinv01);
982 r01 = _mm256_andnot_pd(dummy_mask,r01);
984 /* EWALD ELECTROSTATICS */
986 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
987 ewrt = _mm256_mul_pd(r01,ewtabscale);
988 ewitab = _mm256_cvttpd_epi32(ewrt);
989 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
990 ewitab = _mm_slli_epi32(ewitab,2);
991 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
992 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
993 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
994 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
995 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
996 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
997 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
998 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
999 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1001 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1003 /* Update potential sum for this i atom from the interaction with this j atom. */
1004 velec = _mm256_and_pd(velec,cutoff_mask);
1005 velec = _mm256_andnot_pd(dummy_mask,velec);
1006 velecsum = _mm256_add_pd(velecsum,velec);
1010 fscal = _mm256_and_pd(fscal,cutoff_mask);
1012 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1014 /* Calculate temporary vectorial force */
1015 tx = _mm256_mul_pd(fscal,dx01);
1016 ty = _mm256_mul_pd(fscal,dy01);
1017 tz = _mm256_mul_pd(fscal,dz01);
1019 /* Update vectorial force */
1020 fix0 = _mm256_add_pd(fix0,tx);
1021 fiy0 = _mm256_add_pd(fiy0,ty);
1022 fiz0 = _mm256_add_pd(fiz0,tz);
1024 fjx1 = _mm256_add_pd(fjx1,tx);
1025 fjy1 = _mm256_add_pd(fjy1,ty);
1026 fjz1 = _mm256_add_pd(fjz1,tz);
1030 /**************************
1031 * CALCULATE INTERACTIONS *
1032 **************************/
1034 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1037 r02 = _mm256_mul_pd(rsq02,rinv02);
1038 r02 = _mm256_andnot_pd(dummy_mask,r02);
1040 /* EWALD ELECTROSTATICS */
1042 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1043 ewrt = _mm256_mul_pd(r02,ewtabscale);
1044 ewitab = _mm256_cvttpd_epi32(ewrt);
1045 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1046 ewitab = _mm_slli_epi32(ewitab,2);
1047 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1048 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1049 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1050 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1051 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1052 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1053 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1054 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
1055 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1057 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1059 /* Update potential sum for this i atom from the interaction with this j atom. */
1060 velec = _mm256_and_pd(velec,cutoff_mask);
1061 velec = _mm256_andnot_pd(dummy_mask,velec);
1062 velecsum = _mm256_add_pd(velecsum,velec);
1066 fscal = _mm256_and_pd(fscal,cutoff_mask);
1068 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1070 /* Calculate temporary vectorial force */
1071 tx = _mm256_mul_pd(fscal,dx02);
1072 ty = _mm256_mul_pd(fscal,dy02);
1073 tz = _mm256_mul_pd(fscal,dz02);
1075 /* Update vectorial force */
1076 fix0 = _mm256_add_pd(fix0,tx);
1077 fiy0 = _mm256_add_pd(fiy0,ty);
1078 fiz0 = _mm256_add_pd(fiz0,tz);
1080 fjx2 = _mm256_add_pd(fjx2,tx);
1081 fjy2 = _mm256_add_pd(fjy2,ty);
1082 fjz2 = _mm256_add_pd(fjz2,tz);
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1093 r10 = _mm256_mul_pd(rsq10,rinv10);
1094 r10 = _mm256_andnot_pd(dummy_mask,r10);
1096 /* EWALD ELECTROSTATICS */
1098 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1099 ewrt = _mm256_mul_pd(r10,ewtabscale);
1100 ewitab = _mm256_cvttpd_epi32(ewrt);
1101 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1102 ewitab = _mm_slli_epi32(ewitab,2);
1103 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1104 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1105 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1106 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1107 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1108 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1109 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1110 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
1111 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1113 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1115 /* Update potential sum for this i atom from the interaction with this j atom. */
1116 velec = _mm256_and_pd(velec,cutoff_mask);
1117 velec = _mm256_andnot_pd(dummy_mask,velec);
1118 velecsum = _mm256_add_pd(velecsum,velec);
1122 fscal = _mm256_and_pd(fscal,cutoff_mask);
1124 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1126 /* Calculate temporary vectorial force */
1127 tx = _mm256_mul_pd(fscal,dx10);
1128 ty = _mm256_mul_pd(fscal,dy10);
1129 tz = _mm256_mul_pd(fscal,dz10);
1131 /* Update vectorial force */
1132 fix1 = _mm256_add_pd(fix1,tx);
1133 fiy1 = _mm256_add_pd(fiy1,ty);
1134 fiz1 = _mm256_add_pd(fiz1,tz);
1136 fjx0 = _mm256_add_pd(fjx0,tx);
1137 fjy0 = _mm256_add_pd(fjy0,ty);
1138 fjz0 = _mm256_add_pd(fjz0,tz);
1142 /**************************
1143 * CALCULATE INTERACTIONS *
1144 **************************/
1146 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1149 r11 = _mm256_mul_pd(rsq11,rinv11);
1150 r11 = _mm256_andnot_pd(dummy_mask,r11);
1152 /* EWALD ELECTROSTATICS */
1154 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1155 ewrt = _mm256_mul_pd(r11,ewtabscale);
1156 ewitab = _mm256_cvttpd_epi32(ewrt);
1157 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1158 ewitab = _mm_slli_epi32(ewitab,2);
1159 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1160 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1161 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1162 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1163 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1164 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1165 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1166 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1167 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1169 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1171 /* Update potential sum for this i atom from the interaction with this j atom. */
1172 velec = _mm256_and_pd(velec,cutoff_mask);
1173 velec = _mm256_andnot_pd(dummy_mask,velec);
1174 velecsum = _mm256_add_pd(velecsum,velec);
1178 fscal = _mm256_and_pd(fscal,cutoff_mask);
1180 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1182 /* Calculate temporary vectorial force */
1183 tx = _mm256_mul_pd(fscal,dx11);
1184 ty = _mm256_mul_pd(fscal,dy11);
1185 tz = _mm256_mul_pd(fscal,dz11);
1187 /* Update vectorial force */
1188 fix1 = _mm256_add_pd(fix1,tx);
1189 fiy1 = _mm256_add_pd(fiy1,ty);
1190 fiz1 = _mm256_add_pd(fiz1,tz);
1192 fjx1 = _mm256_add_pd(fjx1,tx);
1193 fjy1 = _mm256_add_pd(fjy1,ty);
1194 fjz1 = _mm256_add_pd(fjz1,tz);
1198 /**************************
1199 * CALCULATE INTERACTIONS *
1200 **************************/
1202 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1205 r12 = _mm256_mul_pd(rsq12,rinv12);
1206 r12 = _mm256_andnot_pd(dummy_mask,r12);
1208 /* EWALD ELECTROSTATICS */
1210 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1211 ewrt = _mm256_mul_pd(r12,ewtabscale);
1212 ewitab = _mm256_cvttpd_epi32(ewrt);
1213 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1214 ewitab = _mm_slli_epi32(ewitab,2);
1215 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1216 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1217 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1218 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1219 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1220 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1221 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1222 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1223 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1225 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1227 /* Update potential sum for this i atom from the interaction with this j atom. */
1228 velec = _mm256_and_pd(velec,cutoff_mask);
1229 velec = _mm256_andnot_pd(dummy_mask,velec);
1230 velecsum = _mm256_add_pd(velecsum,velec);
1234 fscal = _mm256_and_pd(fscal,cutoff_mask);
1236 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1238 /* Calculate temporary vectorial force */
1239 tx = _mm256_mul_pd(fscal,dx12);
1240 ty = _mm256_mul_pd(fscal,dy12);
1241 tz = _mm256_mul_pd(fscal,dz12);
1243 /* Update vectorial force */
1244 fix1 = _mm256_add_pd(fix1,tx);
1245 fiy1 = _mm256_add_pd(fiy1,ty);
1246 fiz1 = _mm256_add_pd(fiz1,tz);
1248 fjx2 = _mm256_add_pd(fjx2,tx);
1249 fjy2 = _mm256_add_pd(fjy2,ty);
1250 fjz2 = _mm256_add_pd(fjz2,tz);
1254 /**************************
1255 * CALCULATE INTERACTIONS *
1256 **************************/
1258 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1261 r20 = _mm256_mul_pd(rsq20,rinv20);
1262 r20 = _mm256_andnot_pd(dummy_mask,r20);
1264 /* EWALD ELECTROSTATICS */
1266 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1267 ewrt = _mm256_mul_pd(r20,ewtabscale);
1268 ewitab = _mm256_cvttpd_epi32(ewrt);
1269 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1270 ewitab = _mm_slli_epi32(ewitab,2);
1271 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1272 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1273 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1274 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1275 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1276 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1277 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1278 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
1279 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1281 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1283 /* Update potential sum for this i atom from the interaction with this j atom. */
1284 velec = _mm256_and_pd(velec,cutoff_mask);
1285 velec = _mm256_andnot_pd(dummy_mask,velec);
1286 velecsum = _mm256_add_pd(velecsum,velec);
1290 fscal = _mm256_and_pd(fscal,cutoff_mask);
1292 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1294 /* Calculate temporary vectorial force */
1295 tx = _mm256_mul_pd(fscal,dx20);
1296 ty = _mm256_mul_pd(fscal,dy20);
1297 tz = _mm256_mul_pd(fscal,dz20);
1299 /* Update vectorial force */
1300 fix2 = _mm256_add_pd(fix2,tx);
1301 fiy2 = _mm256_add_pd(fiy2,ty);
1302 fiz2 = _mm256_add_pd(fiz2,tz);
1304 fjx0 = _mm256_add_pd(fjx0,tx);
1305 fjy0 = _mm256_add_pd(fjy0,ty);
1306 fjz0 = _mm256_add_pd(fjz0,tz);
1310 /**************************
1311 * CALCULATE INTERACTIONS *
1312 **************************/
1314 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1317 r21 = _mm256_mul_pd(rsq21,rinv21);
1318 r21 = _mm256_andnot_pd(dummy_mask,r21);
1320 /* EWALD ELECTROSTATICS */
1322 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1323 ewrt = _mm256_mul_pd(r21,ewtabscale);
1324 ewitab = _mm256_cvttpd_epi32(ewrt);
1325 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1326 ewitab = _mm_slli_epi32(ewitab,2);
1327 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1328 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1329 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1330 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1331 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1332 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1333 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1334 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1335 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1337 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1339 /* Update potential sum for this i atom from the interaction with this j atom. */
1340 velec = _mm256_and_pd(velec,cutoff_mask);
1341 velec = _mm256_andnot_pd(dummy_mask,velec);
1342 velecsum = _mm256_add_pd(velecsum,velec);
1346 fscal = _mm256_and_pd(fscal,cutoff_mask);
1348 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1350 /* Calculate temporary vectorial force */
1351 tx = _mm256_mul_pd(fscal,dx21);
1352 ty = _mm256_mul_pd(fscal,dy21);
1353 tz = _mm256_mul_pd(fscal,dz21);
1355 /* Update vectorial force */
1356 fix2 = _mm256_add_pd(fix2,tx);
1357 fiy2 = _mm256_add_pd(fiy2,ty);
1358 fiz2 = _mm256_add_pd(fiz2,tz);
1360 fjx1 = _mm256_add_pd(fjx1,tx);
1361 fjy1 = _mm256_add_pd(fjy1,ty);
1362 fjz1 = _mm256_add_pd(fjz1,tz);
1366 /**************************
1367 * CALCULATE INTERACTIONS *
1368 **************************/
1370 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1373 r22 = _mm256_mul_pd(rsq22,rinv22);
1374 r22 = _mm256_andnot_pd(dummy_mask,r22);
1376 /* EWALD ELECTROSTATICS */
1378 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1379 ewrt = _mm256_mul_pd(r22,ewtabscale);
1380 ewitab = _mm256_cvttpd_epi32(ewrt);
1381 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1382 ewitab = _mm_slli_epi32(ewitab,2);
1383 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1384 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1385 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1386 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1387 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1388 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1389 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1390 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1391 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1393 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1395 /* Update potential sum for this i atom from the interaction with this j atom. */
1396 velec = _mm256_and_pd(velec,cutoff_mask);
1397 velec = _mm256_andnot_pd(dummy_mask,velec);
1398 velecsum = _mm256_add_pd(velecsum,velec);
1402 fscal = _mm256_and_pd(fscal,cutoff_mask);
1404 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1406 /* Calculate temporary vectorial force */
1407 tx = _mm256_mul_pd(fscal,dx22);
1408 ty = _mm256_mul_pd(fscal,dy22);
1409 tz = _mm256_mul_pd(fscal,dz22);
1411 /* Update vectorial force */
1412 fix2 = _mm256_add_pd(fix2,tx);
1413 fiy2 = _mm256_add_pd(fiy2,ty);
1414 fiz2 = _mm256_add_pd(fiz2,tz);
1416 fjx2 = _mm256_add_pd(fjx2,tx);
1417 fjy2 = _mm256_add_pd(fjy2,ty);
1418 fjz2 = _mm256_add_pd(fjz2,tz);
1422 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1423 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1424 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1425 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1427 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1428 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1430 /* Inner loop uses 441 flops */
1433 /* End of innermost loop */
1435 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1436 f+i_coord_offset,fshift+i_shift_offset);
1439 /* Update potential energies */
1440 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1441 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1443 /* Increment number of inner iterations */
1444 inneriter += j_index_end - j_index_start;
1446 /* Outer loop uses 20 flops */
1449 /* Increment number of outer iterations */
1452 /* Update outer/inner flops */
1454 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1457 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1458 * Electrostatics interaction: Ewald
1459 * VdW interaction: LennardJones
1460 * Geometry: Water3-Water3
1461 * Calculate force/pot: Force
1464 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1465 (t_nblist * gmx_restrict nlist,
1466 rvec * gmx_restrict xx,
1467 rvec * gmx_restrict ff,
1468 struct t_forcerec * gmx_restrict fr,
1469 t_mdatoms * gmx_restrict mdatoms,
1470 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1471 t_nrnb * gmx_restrict nrnb)
1473 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1474 * just 0 for non-waters.
1475 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1476 * jnr indices corresponding to data put in the four positions in the SIMD register.
1478 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1479 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1480 int jnrA,jnrB,jnrC,jnrD;
1481 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1482 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1483 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1484 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1485 real rcutoff_scalar;
1486 real *shiftvec,*fshift,*x,*f;
1487 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1488 real scratch[4*DIM];
1489 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1490 real * vdwioffsetptr0;
1491 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1492 real * vdwioffsetptr1;
1493 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1494 real * vdwioffsetptr2;
1495 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1496 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1497 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1498 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1499 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1500 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1501 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1502 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1503 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1504 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1505 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1506 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1507 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1508 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1509 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1510 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1511 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1514 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1517 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1518 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1520 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1521 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1523 __m256d dummy_mask,cutoff_mask;
1524 __m128 tmpmask0,tmpmask1;
1525 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1526 __m256d one = _mm256_set1_pd(1.0);
1527 __m256d two = _mm256_set1_pd(2.0);
1533 jindex = nlist->jindex;
1535 shiftidx = nlist->shift;
1537 shiftvec = fr->shift_vec[0];
1538 fshift = fr->fshift[0];
1539 facel = _mm256_set1_pd(fr->ic->epsfac);
1540 charge = mdatoms->chargeA;
1541 nvdwtype = fr->ntype;
1542 vdwparam = fr->nbfp;
1543 vdwtype = mdatoms->typeA;
1545 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1546 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1547 beta2 = _mm256_mul_pd(beta,beta);
1548 beta3 = _mm256_mul_pd(beta,beta2);
1550 ewtab = fr->ic->tabq_coul_F;
1551 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1552 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1554 /* Setup water-specific parameters */
1555 inr = nlist->iinr[0];
1556 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1557 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1558 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1559 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1561 jq0 = _mm256_set1_pd(charge[inr+0]);
1562 jq1 = _mm256_set1_pd(charge[inr+1]);
1563 jq2 = _mm256_set1_pd(charge[inr+2]);
1564 vdwjidx0A = 2*vdwtype[inr+0];
1565 qq00 = _mm256_mul_pd(iq0,jq0);
1566 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1567 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1568 qq01 = _mm256_mul_pd(iq0,jq1);
1569 qq02 = _mm256_mul_pd(iq0,jq2);
1570 qq10 = _mm256_mul_pd(iq1,jq0);
1571 qq11 = _mm256_mul_pd(iq1,jq1);
1572 qq12 = _mm256_mul_pd(iq1,jq2);
1573 qq20 = _mm256_mul_pd(iq2,jq0);
1574 qq21 = _mm256_mul_pd(iq2,jq1);
1575 qq22 = _mm256_mul_pd(iq2,jq2);
1577 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1578 rcutoff_scalar = fr->ic->rcoulomb;
1579 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1580 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1582 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
1583 rvdw = _mm256_set1_pd(fr->ic->rvdw);
1585 /* Avoid stupid compiler warnings */
1586 jnrA = jnrB = jnrC = jnrD = 0;
1587 j_coord_offsetA = 0;
1588 j_coord_offsetB = 0;
1589 j_coord_offsetC = 0;
1590 j_coord_offsetD = 0;
1595 for(iidx=0;iidx<4*DIM;iidx++)
1597 scratch[iidx] = 0.0;
1600 /* Start outer loop over neighborlists */
1601 for(iidx=0; iidx<nri; iidx++)
1603 /* Load shift vector for this list */
1604 i_shift_offset = DIM*shiftidx[iidx];
1606 /* Load limits for loop over neighbors */
1607 j_index_start = jindex[iidx];
1608 j_index_end = jindex[iidx+1];
1610 /* Get outer coordinate index */
1612 i_coord_offset = DIM*inr;
1614 /* Load i particle coords and add shift vector */
1615 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1616 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1618 fix0 = _mm256_setzero_pd();
1619 fiy0 = _mm256_setzero_pd();
1620 fiz0 = _mm256_setzero_pd();
1621 fix1 = _mm256_setzero_pd();
1622 fiy1 = _mm256_setzero_pd();
1623 fiz1 = _mm256_setzero_pd();
1624 fix2 = _mm256_setzero_pd();
1625 fiy2 = _mm256_setzero_pd();
1626 fiz2 = _mm256_setzero_pd();
1628 /* Start inner kernel loop */
1629 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1632 /* Get j neighbor index, and coordinate index */
1634 jnrB = jjnr[jidx+1];
1635 jnrC = jjnr[jidx+2];
1636 jnrD = jjnr[jidx+3];
1637 j_coord_offsetA = DIM*jnrA;
1638 j_coord_offsetB = DIM*jnrB;
1639 j_coord_offsetC = DIM*jnrC;
1640 j_coord_offsetD = DIM*jnrD;
1642 /* load j atom coordinates */
1643 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1644 x+j_coord_offsetC,x+j_coord_offsetD,
1645 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1647 /* Calculate displacement vector */
1648 dx00 = _mm256_sub_pd(ix0,jx0);
1649 dy00 = _mm256_sub_pd(iy0,jy0);
1650 dz00 = _mm256_sub_pd(iz0,jz0);
1651 dx01 = _mm256_sub_pd(ix0,jx1);
1652 dy01 = _mm256_sub_pd(iy0,jy1);
1653 dz01 = _mm256_sub_pd(iz0,jz1);
1654 dx02 = _mm256_sub_pd(ix0,jx2);
1655 dy02 = _mm256_sub_pd(iy0,jy2);
1656 dz02 = _mm256_sub_pd(iz0,jz2);
1657 dx10 = _mm256_sub_pd(ix1,jx0);
1658 dy10 = _mm256_sub_pd(iy1,jy0);
1659 dz10 = _mm256_sub_pd(iz1,jz0);
1660 dx11 = _mm256_sub_pd(ix1,jx1);
1661 dy11 = _mm256_sub_pd(iy1,jy1);
1662 dz11 = _mm256_sub_pd(iz1,jz1);
1663 dx12 = _mm256_sub_pd(ix1,jx2);
1664 dy12 = _mm256_sub_pd(iy1,jy2);
1665 dz12 = _mm256_sub_pd(iz1,jz2);
1666 dx20 = _mm256_sub_pd(ix2,jx0);
1667 dy20 = _mm256_sub_pd(iy2,jy0);
1668 dz20 = _mm256_sub_pd(iz2,jz0);
1669 dx21 = _mm256_sub_pd(ix2,jx1);
1670 dy21 = _mm256_sub_pd(iy2,jy1);
1671 dz21 = _mm256_sub_pd(iz2,jz1);
1672 dx22 = _mm256_sub_pd(ix2,jx2);
1673 dy22 = _mm256_sub_pd(iy2,jy2);
1674 dz22 = _mm256_sub_pd(iz2,jz2);
1676 /* Calculate squared distance and things based on it */
1677 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1678 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1679 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1680 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1681 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1682 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1683 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1684 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1685 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1687 rinv00 = avx256_invsqrt_d(rsq00);
1688 rinv01 = avx256_invsqrt_d(rsq01);
1689 rinv02 = avx256_invsqrt_d(rsq02);
1690 rinv10 = avx256_invsqrt_d(rsq10);
1691 rinv11 = avx256_invsqrt_d(rsq11);
1692 rinv12 = avx256_invsqrt_d(rsq12);
1693 rinv20 = avx256_invsqrt_d(rsq20);
1694 rinv21 = avx256_invsqrt_d(rsq21);
1695 rinv22 = avx256_invsqrt_d(rsq22);
1697 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1698 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1699 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1700 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1701 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1702 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1703 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1704 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1705 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1707 fjx0 = _mm256_setzero_pd();
1708 fjy0 = _mm256_setzero_pd();
1709 fjz0 = _mm256_setzero_pd();
1710 fjx1 = _mm256_setzero_pd();
1711 fjy1 = _mm256_setzero_pd();
1712 fjz1 = _mm256_setzero_pd();
1713 fjx2 = _mm256_setzero_pd();
1714 fjy2 = _mm256_setzero_pd();
1715 fjz2 = _mm256_setzero_pd();
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1724 r00 = _mm256_mul_pd(rsq00,rinv00);
1726 /* EWALD ELECTROSTATICS */
1728 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1729 ewrt = _mm256_mul_pd(r00,ewtabscale);
1730 ewitab = _mm256_cvttpd_epi32(ewrt);
1731 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1732 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1733 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1735 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1736 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1738 /* LENNARD-JONES DISPERSION/REPULSION */
1740 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1741 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1743 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1745 fscal = _mm256_add_pd(felec,fvdw);
1747 fscal = _mm256_and_pd(fscal,cutoff_mask);
1749 /* Calculate temporary vectorial force */
1750 tx = _mm256_mul_pd(fscal,dx00);
1751 ty = _mm256_mul_pd(fscal,dy00);
1752 tz = _mm256_mul_pd(fscal,dz00);
1754 /* Update vectorial force */
1755 fix0 = _mm256_add_pd(fix0,tx);
1756 fiy0 = _mm256_add_pd(fiy0,ty);
1757 fiz0 = _mm256_add_pd(fiz0,tz);
1759 fjx0 = _mm256_add_pd(fjx0,tx);
1760 fjy0 = _mm256_add_pd(fjy0,ty);
1761 fjz0 = _mm256_add_pd(fjz0,tz);
1765 /**************************
1766 * CALCULATE INTERACTIONS *
1767 **************************/
1769 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1772 r01 = _mm256_mul_pd(rsq01,rinv01);
1774 /* EWALD ELECTROSTATICS */
1776 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1777 ewrt = _mm256_mul_pd(r01,ewtabscale);
1778 ewitab = _mm256_cvttpd_epi32(ewrt);
1779 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1780 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1781 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1783 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1784 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1786 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1790 fscal = _mm256_and_pd(fscal,cutoff_mask);
1792 /* Calculate temporary vectorial force */
1793 tx = _mm256_mul_pd(fscal,dx01);
1794 ty = _mm256_mul_pd(fscal,dy01);
1795 tz = _mm256_mul_pd(fscal,dz01);
1797 /* Update vectorial force */
1798 fix0 = _mm256_add_pd(fix0,tx);
1799 fiy0 = _mm256_add_pd(fiy0,ty);
1800 fiz0 = _mm256_add_pd(fiz0,tz);
1802 fjx1 = _mm256_add_pd(fjx1,tx);
1803 fjy1 = _mm256_add_pd(fjy1,ty);
1804 fjz1 = _mm256_add_pd(fjz1,tz);
1808 /**************************
1809 * CALCULATE INTERACTIONS *
1810 **************************/
1812 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1815 r02 = _mm256_mul_pd(rsq02,rinv02);
1817 /* EWALD ELECTROSTATICS */
1819 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1820 ewrt = _mm256_mul_pd(r02,ewtabscale);
1821 ewitab = _mm256_cvttpd_epi32(ewrt);
1822 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1823 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1824 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1826 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1827 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1829 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1833 fscal = _mm256_and_pd(fscal,cutoff_mask);
1835 /* Calculate temporary vectorial force */
1836 tx = _mm256_mul_pd(fscal,dx02);
1837 ty = _mm256_mul_pd(fscal,dy02);
1838 tz = _mm256_mul_pd(fscal,dz02);
1840 /* Update vectorial force */
1841 fix0 = _mm256_add_pd(fix0,tx);
1842 fiy0 = _mm256_add_pd(fiy0,ty);
1843 fiz0 = _mm256_add_pd(fiz0,tz);
1845 fjx2 = _mm256_add_pd(fjx2,tx);
1846 fjy2 = _mm256_add_pd(fjy2,ty);
1847 fjz2 = _mm256_add_pd(fjz2,tz);
1851 /**************************
1852 * CALCULATE INTERACTIONS *
1853 **************************/
1855 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1858 r10 = _mm256_mul_pd(rsq10,rinv10);
1860 /* EWALD ELECTROSTATICS */
1862 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1863 ewrt = _mm256_mul_pd(r10,ewtabscale);
1864 ewitab = _mm256_cvttpd_epi32(ewrt);
1865 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1866 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1867 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1869 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1870 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1872 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1876 fscal = _mm256_and_pd(fscal,cutoff_mask);
1878 /* Calculate temporary vectorial force */
1879 tx = _mm256_mul_pd(fscal,dx10);
1880 ty = _mm256_mul_pd(fscal,dy10);
1881 tz = _mm256_mul_pd(fscal,dz10);
1883 /* Update vectorial force */
1884 fix1 = _mm256_add_pd(fix1,tx);
1885 fiy1 = _mm256_add_pd(fiy1,ty);
1886 fiz1 = _mm256_add_pd(fiz1,tz);
1888 fjx0 = _mm256_add_pd(fjx0,tx);
1889 fjy0 = _mm256_add_pd(fjy0,ty);
1890 fjz0 = _mm256_add_pd(fjz0,tz);
1894 /**************************
1895 * CALCULATE INTERACTIONS *
1896 **************************/
1898 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1901 r11 = _mm256_mul_pd(rsq11,rinv11);
1903 /* EWALD ELECTROSTATICS */
1905 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1906 ewrt = _mm256_mul_pd(r11,ewtabscale);
1907 ewitab = _mm256_cvttpd_epi32(ewrt);
1908 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1909 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1910 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1912 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1913 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1915 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1919 fscal = _mm256_and_pd(fscal,cutoff_mask);
1921 /* Calculate temporary vectorial force */
1922 tx = _mm256_mul_pd(fscal,dx11);
1923 ty = _mm256_mul_pd(fscal,dy11);
1924 tz = _mm256_mul_pd(fscal,dz11);
1926 /* Update vectorial force */
1927 fix1 = _mm256_add_pd(fix1,tx);
1928 fiy1 = _mm256_add_pd(fiy1,ty);
1929 fiz1 = _mm256_add_pd(fiz1,tz);
1931 fjx1 = _mm256_add_pd(fjx1,tx);
1932 fjy1 = _mm256_add_pd(fjy1,ty);
1933 fjz1 = _mm256_add_pd(fjz1,tz);
1937 /**************************
1938 * CALCULATE INTERACTIONS *
1939 **************************/
1941 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1944 r12 = _mm256_mul_pd(rsq12,rinv12);
1946 /* EWALD ELECTROSTATICS */
1948 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1949 ewrt = _mm256_mul_pd(r12,ewtabscale);
1950 ewitab = _mm256_cvttpd_epi32(ewrt);
1951 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1952 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1953 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1955 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1956 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1958 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1962 fscal = _mm256_and_pd(fscal,cutoff_mask);
1964 /* Calculate temporary vectorial force */
1965 tx = _mm256_mul_pd(fscal,dx12);
1966 ty = _mm256_mul_pd(fscal,dy12);
1967 tz = _mm256_mul_pd(fscal,dz12);
1969 /* Update vectorial force */
1970 fix1 = _mm256_add_pd(fix1,tx);
1971 fiy1 = _mm256_add_pd(fiy1,ty);
1972 fiz1 = _mm256_add_pd(fiz1,tz);
1974 fjx2 = _mm256_add_pd(fjx2,tx);
1975 fjy2 = _mm256_add_pd(fjy2,ty);
1976 fjz2 = _mm256_add_pd(fjz2,tz);
1980 /**************************
1981 * CALCULATE INTERACTIONS *
1982 **************************/
1984 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1987 r20 = _mm256_mul_pd(rsq20,rinv20);
1989 /* EWALD ELECTROSTATICS */
1991 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1992 ewrt = _mm256_mul_pd(r20,ewtabscale);
1993 ewitab = _mm256_cvttpd_epi32(ewrt);
1994 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1995 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1996 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1998 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1999 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2001 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2005 fscal = _mm256_and_pd(fscal,cutoff_mask);
2007 /* Calculate temporary vectorial force */
2008 tx = _mm256_mul_pd(fscal,dx20);
2009 ty = _mm256_mul_pd(fscal,dy20);
2010 tz = _mm256_mul_pd(fscal,dz20);
2012 /* Update vectorial force */
2013 fix2 = _mm256_add_pd(fix2,tx);
2014 fiy2 = _mm256_add_pd(fiy2,ty);
2015 fiz2 = _mm256_add_pd(fiz2,tz);
2017 fjx0 = _mm256_add_pd(fjx0,tx);
2018 fjy0 = _mm256_add_pd(fjy0,ty);
2019 fjz0 = _mm256_add_pd(fjz0,tz);
2023 /**************************
2024 * CALCULATE INTERACTIONS *
2025 **************************/
2027 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2030 r21 = _mm256_mul_pd(rsq21,rinv21);
2032 /* EWALD ELECTROSTATICS */
2034 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2035 ewrt = _mm256_mul_pd(r21,ewtabscale);
2036 ewitab = _mm256_cvttpd_epi32(ewrt);
2037 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2038 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2039 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2041 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2042 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2044 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2048 fscal = _mm256_and_pd(fscal,cutoff_mask);
2050 /* Calculate temporary vectorial force */
2051 tx = _mm256_mul_pd(fscal,dx21);
2052 ty = _mm256_mul_pd(fscal,dy21);
2053 tz = _mm256_mul_pd(fscal,dz21);
2055 /* Update vectorial force */
2056 fix2 = _mm256_add_pd(fix2,tx);
2057 fiy2 = _mm256_add_pd(fiy2,ty);
2058 fiz2 = _mm256_add_pd(fiz2,tz);
2060 fjx1 = _mm256_add_pd(fjx1,tx);
2061 fjy1 = _mm256_add_pd(fjy1,ty);
2062 fjz1 = _mm256_add_pd(fjz1,tz);
2066 /**************************
2067 * CALCULATE INTERACTIONS *
2068 **************************/
2070 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2073 r22 = _mm256_mul_pd(rsq22,rinv22);
2075 /* EWALD ELECTROSTATICS */
2077 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2078 ewrt = _mm256_mul_pd(r22,ewtabscale);
2079 ewitab = _mm256_cvttpd_epi32(ewrt);
2080 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2081 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2082 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2084 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2085 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2087 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2091 fscal = _mm256_and_pd(fscal,cutoff_mask);
2093 /* Calculate temporary vectorial force */
2094 tx = _mm256_mul_pd(fscal,dx22);
2095 ty = _mm256_mul_pd(fscal,dy22);
2096 tz = _mm256_mul_pd(fscal,dz22);
2098 /* Update vectorial force */
2099 fix2 = _mm256_add_pd(fix2,tx);
2100 fiy2 = _mm256_add_pd(fiy2,ty);
2101 fiz2 = _mm256_add_pd(fiz2,tz);
2103 fjx2 = _mm256_add_pd(fjx2,tx);
2104 fjy2 = _mm256_add_pd(fjy2,ty);
2105 fjz2 = _mm256_add_pd(fjz2,tz);
2109 fjptrA = f+j_coord_offsetA;
2110 fjptrB = f+j_coord_offsetB;
2111 fjptrC = f+j_coord_offsetC;
2112 fjptrD = f+j_coord_offsetD;
2114 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2115 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2117 /* Inner loop uses 358 flops */
2120 if(jidx<j_index_end)
2123 /* Get j neighbor index, and coordinate index */
2124 jnrlistA = jjnr[jidx];
2125 jnrlistB = jjnr[jidx+1];
2126 jnrlistC = jjnr[jidx+2];
2127 jnrlistD = jjnr[jidx+3];
2128 /* Sign of each element will be negative for non-real atoms.
2129 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2130 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2132 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2134 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2135 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2136 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2138 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2139 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2140 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2141 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2142 j_coord_offsetA = DIM*jnrA;
2143 j_coord_offsetB = DIM*jnrB;
2144 j_coord_offsetC = DIM*jnrC;
2145 j_coord_offsetD = DIM*jnrD;
2147 /* load j atom coordinates */
2148 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2149 x+j_coord_offsetC,x+j_coord_offsetD,
2150 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2152 /* Calculate displacement vector */
2153 dx00 = _mm256_sub_pd(ix0,jx0);
2154 dy00 = _mm256_sub_pd(iy0,jy0);
2155 dz00 = _mm256_sub_pd(iz0,jz0);
2156 dx01 = _mm256_sub_pd(ix0,jx1);
2157 dy01 = _mm256_sub_pd(iy0,jy1);
2158 dz01 = _mm256_sub_pd(iz0,jz1);
2159 dx02 = _mm256_sub_pd(ix0,jx2);
2160 dy02 = _mm256_sub_pd(iy0,jy2);
2161 dz02 = _mm256_sub_pd(iz0,jz2);
2162 dx10 = _mm256_sub_pd(ix1,jx0);
2163 dy10 = _mm256_sub_pd(iy1,jy0);
2164 dz10 = _mm256_sub_pd(iz1,jz0);
2165 dx11 = _mm256_sub_pd(ix1,jx1);
2166 dy11 = _mm256_sub_pd(iy1,jy1);
2167 dz11 = _mm256_sub_pd(iz1,jz1);
2168 dx12 = _mm256_sub_pd(ix1,jx2);
2169 dy12 = _mm256_sub_pd(iy1,jy2);
2170 dz12 = _mm256_sub_pd(iz1,jz2);
2171 dx20 = _mm256_sub_pd(ix2,jx0);
2172 dy20 = _mm256_sub_pd(iy2,jy0);
2173 dz20 = _mm256_sub_pd(iz2,jz0);
2174 dx21 = _mm256_sub_pd(ix2,jx1);
2175 dy21 = _mm256_sub_pd(iy2,jy1);
2176 dz21 = _mm256_sub_pd(iz2,jz1);
2177 dx22 = _mm256_sub_pd(ix2,jx2);
2178 dy22 = _mm256_sub_pd(iy2,jy2);
2179 dz22 = _mm256_sub_pd(iz2,jz2);
2181 /* Calculate squared distance and things based on it */
2182 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2183 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2184 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2185 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2186 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2187 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2188 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2189 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2190 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2192 rinv00 = avx256_invsqrt_d(rsq00);
2193 rinv01 = avx256_invsqrt_d(rsq01);
2194 rinv02 = avx256_invsqrt_d(rsq02);
2195 rinv10 = avx256_invsqrt_d(rsq10);
2196 rinv11 = avx256_invsqrt_d(rsq11);
2197 rinv12 = avx256_invsqrt_d(rsq12);
2198 rinv20 = avx256_invsqrt_d(rsq20);
2199 rinv21 = avx256_invsqrt_d(rsq21);
2200 rinv22 = avx256_invsqrt_d(rsq22);
2202 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2203 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2204 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2205 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2206 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2207 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2208 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2209 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2210 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2212 fjx0 = _mm256_setzero_pd();
2213 fjy0 = _mm256_setzero_pd();
2214 fjz0 = _mm256_setzero_pd();
2215 fjx1 = _mm256_setzero_pd();
2216 fjy1 = _mm256_setzero_pd();
2217 fjz1 = _mm256_setzero_pd();
2218 fjx2 = _mm256_setzero_pd();
2219 fjy2 = _mm256_setzero_pd();
2220 fjz2 = _mm256_setzero_pd();
2222 /**************************
2223 * CALCULATE INTERACTIONS *
2224 **************************/
2226 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2229 r00 = _mm256_mul_pd(rsq00,rinv00);
2230 r00 = _mm256_andnot_pd(dummy_mask,r00);
2232 /* EWALD ELECTROSTATICS */
2234 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2235 ewrt = _mm256_mul_pd(r00,ewtabscale);
2236 ewitab = _mm256_cvttpd_epi32(ewrt);
2237 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2238 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2239 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2241 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2242 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2244 /* LENNARD-JONES DISPERSION/REPULSION */
2246 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2247 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2249 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2251 fscal = _mm256_add_pd(felec,fvdw);
2253 fscal = _mm256_and_pd(fscal,cutoff_mask);
2255 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2257 /* Calculate temporary vectorial force */
2258 tx = _mm256_mul_pd(fscal,dx00);
2259 ty = _mm256_mul_pd(fscal,dy00);
2260 tz = _mm256_mul_pd(fscal,dz00);
2262 /* Update vectorial force */
2263 fix0 = _mm256_add_pd(fix0,tx);
2264 fiy0 = _mm256_add_pd(fiy0,ty);
2265 fiz0 = _mm256_add_pd(fiz0,tz);
2267 fjx0 = _mm256_add_pd(fjx0,tx);
2268 fjy0 = _mm256_add_pd(fjy0,ty);
2269 fjz0 = _mm256_add_pd(fjz0,tz);
2273 /**************************
2274 * CALCULATE INTERACTIONS *
2275 **************************/
2277 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2280 r01 = _mm256_mul_pd(rsq01,rinv01);
2281 r01 = _mm256_andnot_pd(dummy_mask,r01);
2283 /* EWALD ELECTROSTATICS */
2285 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2286 ewrt = _mm256_mul_pd(r01,ewtabscale);
2287 ewitab = _mm256_cvttpd_epi32(ewrt);
2288 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2289 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2290 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2292 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2293 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2295 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2299 fscal = _mm256_and_pd(fscal,cutoff_mask);
2301 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2303 /* Calculate temporary vectorial force */
2304 tx = _mm256_mul_pd(fscal,dx01);
2305 ty = _mm256_mul_pd(fscal,dy01);
2306 tz = _mm256_mul_pd(fscal,dz01);
2308 /* Update vectorial force */
2309 fix0 = _mm256_add_pd(fix0,tx);
2310 fiy0 = _mm256_add_pd(fiy0,ty);
2311 fiz0 = _mm256_add_pd(fiz0,tz);
2313 fjx1 = _mm256_add_pd(fjx1,tx);
2314 fjy1 = _mm256_add_pd(fjy1,ty);
2315 fjz1 = _mm256_add_pd(fjz1,tz);
2319 /**************************
2320 * CALCULATE INTERACTIONS *
2321 **************************/
2323 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2326 r02 = _mm256_mul_pd(rsq02,rinv02);
2327 r02 = _mm256_andnot_pd(dummy_mask,r02);
2329 /* EWALD ELECTROSTATICS */
2331 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2332 ewrt = _mm256_mul_pd(r02,ewtabscale);
2333 ewitab = _mm256_cvttpd_epi32(ewrt);
2334 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2335 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2336 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2338 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2339 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2341 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2345 fscal = _mm256_and_pd(fscal,cutoff_mask);
2347 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2349 /* Calculate temporary vectorial force */
2350 tx = _mm256_mul_pd(fscal,dx02);
2351 ty = _mm256_mul_pd(fscal,dy02);
2352 tz = _mm256_mul_pd(fscal,dz02);
2354 /* Update vectorial force */
2355 fix0 = _mm256_add_pd(fix0,tx);
2356 fiy0 = _mm256_add_pd(fiy0,ty);
2357 fiz0 = _mm256_add_pd(fiz0,tz);
2359 fjx2 = _mm256_add_pd(fjx2,tx);
2360 fjy2 = _mm256_add_pd(fjy2,ty);
2361 fjz2 = _mm256_add_pd(fjz2,tz);
2365 /**************************
2366 * CALCULATE INTERACTIONS *
2367 **************************/
2369 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2372 r10 = _mm256_mul_pd(rsq10,rinv10);
2373 r10 = _mm256_andnot_pd(dummy_mask,r10);
2375 /* EWALD ELECTROSTATICS */
2377 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2378 ewrt = _mm256_mul_pd(r10,ewtabscale);
2379 ewitab = _mm256_cvttpd_epi32(ewrt);
2380 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2381 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2382 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2384 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2385 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2387 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2391 fscal = _mm256_and_pd(fscal,cutoff_mask);
2393 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2395 /* Calculate temporary vectorial force */
2396 tx = _mm256_mul_pd(fscal,dx10);
2397 ty = _mm256_mul_pd(fscal,dy10);
2398 tz = _mm256_mul_pd(fscal,dz10);
2400 /* Update vectorial force */
2401 fix1 = _mm256_add_pd(fix1,tx);
2402 fiy1 = _mm256_add_pd(fiy1,ty);
2403 fiz1 = _mm256_add_pd(fiz1,tz);
2405 fjx0 = _mm256_add_pd(fjx0,tx);
2406 fjy0 = _mm256_add_pd(fjy0,ty);
2407 fjz0 = _mm256_add_pd(fjz0,tz);
2411 /**************************
2412 * CALCULATE INTERACTIONS *
2413 **************************/
2415 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2418 r11 = _mm256_mul_pd(rsq11,rinv11);
2419 r11 = _mm256_andnot_pd(dummy_mask,r11);
2421 /* EWALD ELECTROSTATICS */
2423 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2424 ewrt = _mm256_mul_pd(r11,ewtabscale);
2425 ewitab = _mm256_cvttpd_epi32(ewrt);
2426 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2427 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2428 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2430 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2431 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2433 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2437 fscal = _mm256_and_pd(fscal,cutoff_mask);
2439 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2441 /* Calculate temporary vectorial force */
2442 tx = _mm256_mul_pd(fscal,dx11);
2443 ty = _mm256_mul_pd(fscal,dy11);
2444 tz = _mm256_mul_pd(fscal,dz11);
2446 /* Update vectorial force */
2447 fix1 = _mm256_add_pd(fix1,tx);
2448 fiy1 = _mm256_add_pd(fiy1,ty);
2449 fiz1 = _mm256_add_pd(fiz1,tz);
2451 fjx1 = _mm256_add_pd(fjx1,tx);
2452 fjy1 = _mm256_add_pd(fjy1,ty);
2453 fjz1 = _mm256_add_pd(fjz1,tz);
2457 /**************************
2458 * CALCULATE INTERACTIONS *
2459 **************************/
2461 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2464 r12 = _mm256_mul_pd(rsq12,rinv12);
2465 r12 = _mm256_andnot_pd(dummy_mask,r12);
2467 /* EWALD ELECTROSTATICS */
2469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2470 ewrt = _mm256_mul_pd(r12,ewtabscale);
2471 ewitab = _mm256_cvttpd_epi32(ewrt);
2472 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2473 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2474 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2476 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2477 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2479 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2483 fscal = _mm256_and_pd(fscal,cutoff_mask);
2485 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2487 /* Calculate temporary vectorial force */
2488 tx = _mm256_mul_pd(fscal,dx12);
2489 ty = _mm256_mul_pd(fscal,dy12);
2490 tz = _mm256_mul_pd(fscal,dz12);
2492 /* Update vectorial force */
2493 fix1 = _mm256_add_pd(fix1,tx);
2494 fiy1 = _mm256_add_pd(fiy1,ty);
2495 fiz1 = _mm256_add_pd(fiz1,tz);
2497 fjx2 = _mm256_add_pd(fjx2,tx);
2498 fjy2 = _mm256_add_pd(fjy2,ty);
2499 fjz2 = _mm256_add_pd(fjz2,tz);
2503 /**************************
2504 * CALCULATE INTERACTIONS *
2505 **************************/
2507 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2510 r20 = _mm256_mul_pd(rsq20,rinv20);
2511 r20 = _mm256_andnot_pd(dummy_mask,r20);
2513 /* EWALD ELECTROSTATICS */
2515 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2516 ewrt = _mm256_mul_pd(r20,ewtabscale);
2517 ewitab = _mm256_cvttpd_epi32(ewrt);
2518 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2519 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2520 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2522 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2523 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2525 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2529 fscal = _mm256_and_pd(fscal,cutoff_mask);
2531 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2533 /* Calculate temporary vectorial force */
2534 tx = _mm256_mul_pd(fscal,dx20);
2535 ty = _mm256_mul_pd(fscal,dy20);
2536 tz = _mm256_mul_pd(fscal,dz20);
2538 /* Update vectorial force */
2539 fix2 = _mm256_add_pd(fix2,tx);
2540 fiy2 = _mm256_add_pd(fiy2,ty);
2541 fiz2 = _mm256_add_pd(fiz2,tz);
2543 fjx0 = _mm256_add_pd(fjx0,tx);
2544 fjy0 = _mm256_add_pd(fjy0,ty);
2545 fjz0 = _mm256_add_pd(fjz0,tz);
2549 /**************************
2550 * CALCULATE INTERACTIONS *
2551 **************************/
2553 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2556 r21 = _mm256_mul_pd(rsq21,rinv21);
2557 r21 = _mm256_andnot_pd(dummy_mask,r21);
2559 /* EWALD ELECTROSTATICS */
2561 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2562 ewrt = _mm256_mul_pd(r21,ewtabscale);
2563 ewitab = _mm256_cvttpd_epi32(ewrt);
2564 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2565 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2566 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2568 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2569 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2571 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2575 fscal = _mm256_and_pd(fscal,cutoff_mask);
2577 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2579 /* Calculate temporary vectorial force */
2580 tx = _mm256_mul_pd(fscal,dx21);
2581 ty = _mm256_mul_pd(fscal,dy21);
2582 tz = _mm256_mul_pd(fscal,dz21);
2584 /* Update vectorial force */
2585 fix2 = _mm256_add_pd(fix2,tx);
2586 fiy2 = _mm256_add_pd(fiy2,ty);
2587 fiz2 = _mm256_add_pd(fiz2,tz);
2589 fjx1 = _mm256_add_pd(fjx1,tx);
2590 fjy1 = _mm256_add_pd(fjy1,ty);
2591 fjz1 = _mm256_add_pd(fjz1,tz);
2595 /**************************
2596 * CALCULATE INTERACTIONS *
2597 **************************/
2599 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2602 r22 = _mm256_mul_pd(rsq22,rinv22);
2603 r22 = _mm256_andnot_pd(dummy_mask,r22);
2605 /* EWALD ELECTROSTATICS */
2607 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2608 ewrt = _mm256_mul_pd(r22,ewtabscale);
2609 ewitab = _mm256_cvttpd_epi32(ewrt);
2610 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2611 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2612 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2614 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2615 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2617 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2621 fscal = _mm256_and_pd(fscal,cutoff_mask);
2623 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2625 /* Calculate temporary vectorial force */
2626 tx = _mm256_mul_pd(fscal,dx22);
2627 ty = _mm256_mul_pd(fscal,dy22);
2628 tz = _mm256_mul_pd(fscal,dz22);
2630 /* Update vectorial force */
2631 fix2 = _mm256_add_pd(fix2,tx);
2632 fiy2 = _mm256_add_pd(fiy2,ty);
2633 fiz2 = _mm256_add_pd(fiz2,tz);
2635 fjx2 = _mm256_add_pd(fjx2,tx);
2636 fjy2 = _mm256_add_pd(fjy2,ty);
2637 fjz2 = _mm256_add_pd(fjz2,tz);
2641 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2642 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2643 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2644 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2646 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2647 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2649 /* Inner loop uses 367 flops */
2652 /* End of innermost loop */
2654 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2655 f+i_coord_offset,fshift+i_shift_offset);
2657 /* Increment number of inner iterations */
2658 inneriter += j_index_end - j_index_start;
2660 /* Outer loop uses 18 flops */
2663 /* Increment number of outer iterations */
2666 /* Update outer/inner flops */
2668 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);