2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
112 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
114 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
115 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
117 __m256d dummy_mask,cutoff_mask;
118 __m128 tmpmask0,tmpmask1;
119 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
120 __m256d one = _mm256_set1_pd(1.0);
121 __m256d two = _mm256_set1_pd(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm256_set1_pd(fr->epsfac);
134 charge = mdatoms->chargeA;
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
140 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
141 beta2 = _mm256_mul_pd(beta,beta);
142 beta3 = _mm256_mul_pd(beta,beta2);
144 ewtab = fr->ic->tabq_coul_FDV0;
145 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
146 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
148 /* Setup water-specific parameters */
149 inr = nlist->iinr[0];
150 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
151 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
152 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
153 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
155 jq0 = _mm256_set1_pd(charge[inr+0]);
156 jq1 = _mm256_set1_pd(charge[inr+1]);
157 jq2 = _mm256_set1_pd(charge[inr+2]);
158 vdwjidx0A = 2*vdwtype[inr+0];
159 qq00 = _mm256_mul_pd(iq0,jq0);
160 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
161 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
162 qq01 = _mm256_mul_pd(iq0,jq1);
163 qq02 = _mm256_mul_pd(iq0,jq2);
164 qq10 = _mm256_mul_pd(iq1,jq0);
165 qq11 = _mm256_mul_pd(iq1,jq1);
166 qq12 = _mm256_mul_pd(iq1,jq2);
167 qq20 = _mm256_mul_pd(iq2,jq0);
168 qq21 = _mm256_mul_pd(iq2,jq1);
169 qq22 = _mm256_mul_pd(iq2,jq2);
171 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
172 rcutoff_scalar = fr->rcoulomb;
173 rcutoff = _mm256_set1_pd(rcutoff_scalar);
174 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
176 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
177 rvdw = _mm256_set1_pd(fr->rvdw);
179 /* Avoid stupid compiler warnings */
180 jnrA = jnrB = jnrC = jnrD = 0;
189 for(iidx=0;iidx<4*DIM;iidx++)
194 /* Start outer loop over neighborlists */
195 for(iidx=0; iidx<nri; iidx++)
197 /* Load shift vector for this list */
198 i_shift_offset = DIM*shiftidx[iidx];
200 /* Load limits for loop over neighbors */
201 j_index_start = jindex[iidx];
202 j_index_end = jindex[iidx+1];
204 /* Get outer coordinate index */
206 i_coord_offset = DIM*inr;
208 /* Load i particle coords and add shift vector */
209 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
210 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
212 fix0 = _mm256_setzero_pd();
213 fiy0 = _mm256_setzero_pd();
214 fiz0 = _mm256_setzero_pd();
215 fix1 = _mm256_setzero_pd();
216 fiy1 = _mm256_setzero_pd();
217 fiz1 = _mm256_setzero_pd();
218 fix2 = _mm256_setzero_pd();
219 fiy2 = _mm256_setzero_pd();
220 fiz2 = _mm256_setzero_pd();
222 /* Reset potential sums */
223 velecsum = _mm256_setzero_pd();
224 vvdwsum = _mm256_setzero_pd();
226 /* Start inner kernel loop */
227 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
230 /* Get j neighbor index, and coordinate index */
235 j_coord_offsetA = DIM*jnrA;
236 j_coord_offsetB = DIM*jnrB;
237 j_coord_offsetC = DIM*jnrC;
238 j_coord_offsetD = DIM*jnrD;
240 /* load j atom coordinates */
241 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
242 x+j_coord_offsetC,x+j_coord_offsetD,
243 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
245 /* Calculate displacement vector */
246 dx00 = _mm256_sub_pd(ix0,jx0);
247 dy00 = _mm256_sub_pd(iy0,jy0);
248 dz00 = _mm256_sub_pd(iz0,jz0);
249 dx01 = _mm256_sub_pd(ix0,jx1);
250 dy01 = _mm256_sub_pd(iy0,jy1);
251 dz01 = _mm256_sub_pd(iz0,jz1);
252 dx02 = _mm256_sub_pd(ix0,jx2);
253 dy02 = _mm256_sub_pd(iy0,jy2);
254 dz02 = _mm256_sub_pd(iz0,jz2);
255 dx10 = _mm256_sub_pd(ix1,jx0);
256 dy10 = _mm256_sub_pd(iy1,jy0);
257 dz10 = _mm256_sub_pd(iz1,jz0);
258 dx11 = _mm256_sub_pd(ix1,jx1);
259 dy11 = _mm256_sub_pd(iy1,jy1);
260 dz11 = _mm256_sub_pd(iz1,jz1);
261 dx12 = _mm256_sub_pd(ix1,jx2);
262 dy12 = _mm256_sub_pd(iy1,jy2);
263 dz12 = _mm256_sub_pd(iz1,jz2);
264 dx20 = _mm256_sub_pd(ix2,jx0);
265 dy20 = _mm256_sub_pd(iy2,jy0);
266 dz20 = _mm256_sub_pd(iz2,jz0);
267 dx21 = _mm256_sub_pd(ix2,jx1);
268 dy21 = _mm256_sub_pd(iy2,jy1);
269 dz21 = _mm256_sub_pd(iz2,jz1);
270 dx22 = _mm256_sub_pd(ix2,jx2);
271 dy22 = _mm256_sub_pd(iy2,jy2);
272 dz22 = _mm256_sub_pd(iz2,jz2);
274 /* Calculate squared distance and things based on it */
275 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
276 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
277 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
278 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
279 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
280 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
281 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
282 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
283 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
285 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
286 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
287 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
288 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
289 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
290 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
291 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
292 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
293 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
295 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
296 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
297 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
298 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
299 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
300 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
301 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
302 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
303 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
305 fjx0 = _mm256_setzero_pd();
306 fjy0 = _mm256_setzero_pd();
307 fjz0 = _mm256_setzero_pd();
308 fjx1 = _mm256_setzero_pd();
309 fjy1 = _mm256_setzero_pd();
310 fjz1 = _mm256_setzero_pd();
311 fjx2 = _mm256_setzero_pd();
312 fjy2 = _mm256_setzero_pd();
313 fjz2 = _mm256_setzero_pd();
315 /**************************
316 * CALCULATE INTERACTIONS *
317 **************************/
319 if (gmx_mm256_any_lt(rsq00,rcutoff2))
322 r00 = _mm256_mul_pd(rsq00,rinv00);
324 /* EWALD ELECTROSTATICS */
326 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
327 ewrt = _mm256_mul_pd(r00,ewtabscale);
328 ewitab = _mm256_cvttpd_epi32(ewrt);
329 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
330 ewitab = _mm_slli_epi32(ewitab,2);
331 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
332 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
333 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
334 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
335 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
336 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
337 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
338 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
339 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
341 /* LENNARD-JONES DISPERSION/REPULSION */
343 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
344 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
345 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
346 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
347 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
348 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
350 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 velec = _mm256_and_pd(velec,cutoff_mask);
354 velecsum = _mm256_add_pd(velecsum,velec);
355 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
356 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
358 fscal = _mm256_add_pd(felec,fvdw);
360 fscal = _mm256_and_pd(fscal,cutoff_mask);
362 /* Calculate temporary vectorial force */
363 tx = _mm256_mul_pd(fscal,dx00);
364 ty = _mm256_mul_pd(fscal,dy00);
365 tz = _mm256_mul_pd(fscal,dz00);
367 /* Update vectorial force */
368 fix0 = _mm256_add_pd(fix0,tx);
369 fiy0 = _mm256_add_pd(fiy0,ty);
370 fiz0 = _mm256_add_pd(fiz0,tz);
372 fjx0 = _mm256_add_pd(fjx0,tx);
373 fjy0 = _mm256_add_pd(fjy0,ty);
374 fjz0 = _mm256_add_pd(fjz0,tz);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 if (gmx_mm256_any_lt(rsq01,rcutoff2))
385 r01 = _mm256_mul_pd(rsq01,rinv01);
387 /* EWALD ELECTROSTATICS */
389 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
390 ewrt = _mm256_mul_pd(r01,ewtabscale);
391 ewitab = _mm256_cvttpd_epi32(ewrt);
392 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
393 ewitab = _mm_slli_epi32(ewitab,2);
394 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
395 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
396 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
397 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
398 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
399 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
400 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
401 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
402 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
404 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
406 /* Update potential sum for this i atom from the interaction with this j atom. */
407 velec = _mm256_and_pd(velec,cutoff_mask);
408 velecsum = _mm256_add_pd(velecsum,velec);
412 fscal = _mm256_and_pd(fscal,cutoff_mask);
414 /* Calculate temporary vectorial force */
415 tx = _mm256_mul_pd(fscal,dx01);
416 ty = _mm256_mul_pd(fscal,dy01);
417 tz = _mm256_mul_pd(fscal,dz01);
419 /* Update vectorial force */
420 fix0 = _mm256_add_pd(fix0,tx);
421 fiy0 = _mm256_add_pd(fiy0,ty);
422 fiz0 = _mm256_add_pd(fiz0,tz);
424 fjx1 = _mm256_add_pd(fjx1,tx);
425 fjy1 = _mm256_add_pd(fjy1,ty);
426 fjz1 = _mm256_add_pd(fjz1,tz);
430 /**************************
431 * CALCULATE INTERACTIONS *
432 **************************/
434 if (gmx_mm256_any_lt(rsq02,rcutoff2))
437 r02 = _mm256_mul_pd(rsq02,rinv02);
439 /* EWALD ELECTROSTATICS */
441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442 ewrt = _mm256_mul_pd(r02,ewtabscale);
443 ewitab = _mm256_cvttpd_epi32(ewrt);
444 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
445 ewitab = _mm_slli_epi32(ewitab,2);
446 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
447 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
448 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
449 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
450 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
451 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
452 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
453 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
454 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
456 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velec = _mm256_and_pd(velec,cutoff_mask);
460 velecsum = _mm256_add_pd(velecsum,velec);
464 fscal = _mm256_and_pd(fscal,cutoff_mask);
466 /* Calculate temporary vectorial force */
467 tx = _mm256_mul_pd(fscal,dx02);
468 ty = _mm256_mul_pd(fscal,dy02);
469 tz = _mm256_mul_pd(fscal,dz02);
471 /* Update vectorial force */
472 fix0 = _mm256_add_pd(fix0,tx);
473 fiy0 = _mm256_add_pd(fiy0,ty);
474 fiz0 = _mm256_add_pd(fiz0,tz);
476 fjx2 = _mm256_add_pd(fjx2,tx);
477 fjy2 = _mm256_add_pd(fjy2,ty);
478 fjz2 = _mm256_add_pd(fjz2,tz);
482 /**************************
483 * CALCULATE INTERACTIONS *
484 **************************/
486 if (gmx_mm256_any_lt(rsq10,rcutoff2))
489 r10 = _mm256_mul_pd(rsq10,rinv10);
491 /* EWALD ELECTROSTATICS */
493 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
494 ewrt = _mm256_mul_pd(r10,ewtabscale);
495 ewitab = _mm256_cvttpd_epi32(ewrt);
496 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
497 ewitab = _mm_slli_epi32(ewitab,2);
498 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
499 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
500 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
501 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
502 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
503 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
504 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
505 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
506 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
508 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velec = _mm256_and_pd(velec,cutoff_mask);
512 velecsum = _mm256_add_pd(velecsum,velec);
516 fscal = _mm256_and_pd(fscal,cutoff_mask);
518 /* Calculate temporary vectorial force */
519 tx = _mm256_mul_pd(fscal,dx10);
520 ty = _mm256_mul_pd(fscal,dy10);
521 tz = _mm256_mul_pd(fscal,dz10);
523 /* Update vectorial force */
524 fix1 = _mm256_add_pd(fix1,tx);
525 fiy1 = _mm256_add_pd(fiy1,ty);
526 fiz1 = _mm256_add_pd(fiz1,tz);
528 fjx0 = _mm256_add_pd(fjx0,tx);
529 fjy0 = _mm256_add_pd(fjy0,ty);
530 fjz0 = _mm256_add_pd(fjz0,tz);
534 /**************************
535 * CALCULATE INTERACTIONS *
536 **************************/
538 if (gmx_mm256_any_lt(rsq11,rcutoff2))
541 r11 = _mm256_mul_pd(rsq11,rinv11);
543 /* EWALD ELECTROSTATICS */
545 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
546 ewrt = _mm256_mul_pd(r11,ewtabscale);
547 ewitab = _mm256_cvttpd_epi32(ewrt);
548 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
549 ewitab = _mm_slli_epi32(ewitab,2);
550 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
551 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
552 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
553 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
554 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
555 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
556 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
557 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
558 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
560 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
562 /* Update potential sum for this i atom from the interaction with this j atom. */
563 velec = _mm256_and_pd(velec,cutoff_mask);
564 velecsum = _mm256_add_pd(velecsum,velec);
568 fscal = _mm256_and_pd(fscal,cutoff_mask);
570 /* Calculate temporary vectorial force */
571 tx = _mm256_mul_pd(fscal,dx11);
572 ty = _mm256_mul_pd(fscal,dy11);
573 tz = _mm256_mul_pd(fscal,dz11);
575 /* Update vectorial force */
576 fix1 = _mm256_add_pd(fix1,tx);
577 fiy1 = _mm256_add_pd(fiy1,ty);
578 fiz1 = _mm256_add_pd(fiz1,tz);
580 fjx1 = _mm256_add_pd(fjx1,tx);
581 fjy1 = _mm256_add_pd(fjy1,ty);
582 fjz1 = _mm256_add_pd(fjz1,tz);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 if (gmx_mm256_any_lt(rsq12,rcutoff2))
593 r12 = _mm256_mul_pd(rsq12,rinv12);
595 /* EWALD ELECTROSTATICS */
597 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
598 ewrt = _mm256_mul_pd(r12,ewtabscale);
599 ewitab = _mm256_cvttpd_epi32(ewrt);
600 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
601 ewitab = _mm_slli_epi32(ewitab,2);
602 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
603 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
604 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
605 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
606 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
607 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
608 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
609 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
610 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
612 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
614 /* Update potential sum for this i atom from the interaction with this j atom. */
615 velec = _mm256_and_pd(velec,cutoff_mask);
616 velecsum = _mm256_add_pd(velecsum,velec);
620 fscal = _mm256_and_pd(fscal,cutoff_mask);
622 /* Calculate temporary vectorial force */
623 tx = _mm256_mul_pd(fscal,dx12);
624 ty = _mm256_mul_pd(fscal,dy12);
625 tz = _mm256_mul_pd(fscal,dz12);
627 /* Update vectorial force */
628 fix1 = _mm256_add_pd(fix1,tx);
629 fiy1 = _mm256_add_pd(fiy1,ty);
630 fiz1 = _mm256_add_pd(fiz1,tz);
632 fjx2 = _mm256_add_pd(fjx2,tx);
633 fjy2 = _mm256_add_pd(fjy2,ty);
634 fjz2 = _mm256_add_pd(fjz2,tz);
638 /**************************
639 * CALCULATE INTERACTIONS *
640 **************************/
642 if (gmx_mm256_any_lt(rsq20,rcutoff2))
645 r20 = _mm256_mul_pd(rsq20,rinv20);
647 /* EWALD ELECTROSTATICS */
649 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
650 ewrt = _mm256_mul_pd(r20,ewtabscale);
651 ewitab = _mm256_cvttpd_epi32(ewrt);
652 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
653 ewitab = _mm_slli_epi32(ewitab,2);
654 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
655 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
656 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
657 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
658 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
659 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
660 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
661 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
662 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
664 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
666 /* Update potential sum for this i atom from the interaction with this j atom. */
667 velec = _mm256_and_pd(velec,cutoff_mask);
668 velecsum = _mm256_add_pd(velecsum,velec);
672 fscal = _mm256_and_pd(fscal,cutoff_mask);
674 /* Calculate temporary vectorial force */
675 tx = _mm256_mul_pd(fscal,dx20);
676 ty = _mm256_mul_pd(fscal,dy20);
677 tz = _mm256_mul_pd(fscal,dz20);
679 /* Update vectorial force */
680 fix2 = _mm256_add_pd(fix2,tx);
681 fiy2 = _mm256_add_pd(fiy2,ty);
682 fiz2 = _mm256_add_pd(fiz2,tz);
684 fjx0 = _mm256_add_pd(fjx0,tx);
685 fjy0 = _mm256_add_pd(fjy0,ty);
686 fjz0 = _mm256_add_pd(fjz0,tz);
690 /**************************
691 * CALCULATE INTERACTIONS *
692 **************************/
694 if (gmx_mm256_any_lt(rsq21,rcutoff2))
697 r21 = _mm256_mul_pd(rsq21,rinv21);
699 /* EWALD ELECTROSTATICS */
701 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
702 ewrt = _mm256_mul_pd(r21,ewtabscale);
703 ewitab = _mm256_cvttpd_epi32(ewrt);
704 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
705 ewitab = _mm_slli_epi32(ewitab,2);
706 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
707 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
708 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
709 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
710 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
711 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
712 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
713 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
714 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
716 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
718 /* Update potential sum for this i atom from the interaction with this j atom. */
719 velec = _mm256_and_pd(velec,cutoff_mask);
720 velecsum = _mm256_add_pd(velecsum,velec);
724 fscal = _mm256_and_pd(fscal,cutoff_mask);
726 /* Calculate temporary vectorial force */
727 tx = _mm256_mul_pd(fscal,dx21);
728 ty = _mm256_mul_pd(fscal,dy21);
729 tz = _mm256_mul_pd(fscal,dz21);
731 /* Update vectorial force */
732 fix2 = _mm256_add_pd(fix2,tx);
733 fiy2 = _mm256_add_pd(fiy2,ty);
734 fiz2 = _mm256_add_pd(fiz2,tz);
736 fjx1 = _mm256_add_pd(fjx1,tx);
737 fjy1 = _mm256_add_pd(fjy1,ty);
738 fjz1 = _mm256_add_pd(fjz1,tz);
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 if (gmx_mm256_any_lt(rsq22,rcutoff2))
749 r22 = _mm256_mul_pd(rsq22,rinv22);
751 /* EWALD ELECTROSTATICS */
753 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
754 ewrt = _mm256_mul_pd(r22,ewtabscale);
755 ewitab = _mm256_cvttpd_epi32(ewrt);
756 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
757 ewitab = _mm_slli_epi32(ewitab,2);
758 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
759 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
760 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
761 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
762 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
763 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
764 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
765 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
766 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
768 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
770 /* Update potential sum for this i atom from the interaction with this j atom. */
771 velec = _mm256_and_pd(velec,cutoff_mask);
772 velecsum = _mm256_add_pd(velecsum,velec);
776 fscal = _mm256_and_pd(fscal,cutoff_mask);
778 /* Calculate temporary vectorial force */
779 tx = _mm256_mul_pd(fscal,dx22);
780 ty = _mm256_mul_pd(fscal,dy22);
781 tz = _mm256_mul_pd(fscal,dz22);
783 /* Update vectorial force */
784 fix2 = _mm256_add_pd(fix2,tx);
785 fiy2 = _mm256_add_pd(fiy2,ty);
786 fiz2 = _mm256_add_pd(fiz2,tz);
788 fjx2 = _mm256_add_pd(fjx2,tx);
789 fjy2 = _mm256_add_pd(fjy2,ty);
790 fjz2 = _mm256_add_pd(fjz2,tz);
794 fjptrA = f+j_coord_offsetA;
795 fjptrB = f+j_coord_offsetB;
796 fjptrC = f+j_coord_offsetC;
797 fjptrD = f+j_coord_offsetD;
799 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
800 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
802 /* Inner loop uses 432 flops */
808 /* Get j neighbor index, and coordinate index */
809 jnrlistA = jjnr[jidx];
810 jnrlistB = jjnr[jidx+1];
811 jnrlistC = jjnr[jidx+2];
812 jnrlistD = jjnr[jidx+3];
813 /* Sign of each element will be negative for non-real atoms.
814 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
815              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
817 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
819 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
820 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
821 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
823 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
824 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
825 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
826 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
827 j_coord_offsetA = DIM*jnrA;
828 j_coord_offsetB = DIM*jnrB;
829 j_coord_offsetC = DIM*jnrC;
830 j_coord_offsetD = DIM*jnrD;
832 /* load j atom coordinates */
833 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
834 x+j_coord_offsetC,x+j_coord_offsetD,
835 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
837 /* Calculate displacement vector */
838 dx00 = _mm256_sub_pd(ix0,jx0);
839 dy00 = _mm256_sub_pd(iy0,jy0);
840 dz00 = _mm256_sub_pd(iz0,jz0);
841 dx01 = _mm256_sub_pd(ix0,jx1);
842 dy01 = _mm256_sub_pd(iy0,jy1);
843 dz01 = _mm256_sub_pd(iz0,jz1);
844 dx02 = _mm256_sub_pd(ix0,jx2);
845 dy02 = _mm256_sub_pd(iy0,jy2);
846 dz02 = _mm256_sub_pd(iz0,jz2);
847 dx10 = _mm256_sub_pd(ix1,jx0);
848 dy10 = _mm256_sub_pd(iy1,jy0);
849 dz10 = _mm256_sub_pd(iz1,jz0);
850 dx11 = _mm256_sub_pd(ix1,jx1);
851 dy11 = _mm256_sub_pd(iy1,jy1);
852 dz11 = _mm256_sub_pd(iz1,jz1);
853 dx12 = _mm256_sub_pd(ix1,jx2);
854 dy12 = _mm256_sub_pd(iy1,jy2);
855 dz12 = _mm256_sub_pd(iz1,jz2);
856 dx20 = _mm256_sub_pd(ix2,jx0);
857 dy20 = _mm256_sub_pd(iy2,jy0);
858 dz20 = _mm256_sub_pd(iz2,jz0);
859 dx21 = _mm256_sub_pd(ix2,jx1);
860 dy21 = _mm256_sub_pd(iy2,jy1);
861 dz21 = _mm256_sub_pd(iz2,jz1);
862 dx22 = _mm256_sub_pd(ix2,jx2);
863 dy22 = _mm256_sub_pd(iy2,jy2);
864 dz22 = _mm256_sub_pd(iz2,jz2);
866 /* Calculate squared distance and things based on it */
867 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
868 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
869 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
870 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
871 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
872 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
873 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
874 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
875 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
877 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
878 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
879 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
880 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
881 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
882 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
883 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
884 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
885 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
887 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
888 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
889 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
890 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
891 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
892 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
893 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
894 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
895 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
897 fjx0 = _mm256_setzero_pd();
898 fjy0 = _mm256_setzero_pd();
899 fjz0 = _mm256_setzero_pd();
900 fjx1 = _mm256_setzero_pd();
901 fjy1 = _mm256_setzero_pd();
902 fjz1 = _mm256_setzero_pd();
903 fjx2 = _mm256_setzero_pd();
904 fjy2 = _mm256_setzero_pd();
905 fjz2 = _mm256_setzero_pd();
907 /**************************
908 * CALCULATE INTERACTIONS *
909 **************************/
911 if (gmx_mm256_any_lt(rsq00,rcutoff2))
914 r00 = _mm256_mul_pd(rsq00,rinv00);
915 r00 = _mm256_andnot_pd(dummy_mask,r00);
917 /* EWALD ELECTROSTATICS */
919 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
920 ewrt = _mm256_mul_pd(r00,ewtabscale);
921 ewitab = _mm256_cvttpd_epi32(ewrt);
922 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
923 ewitab = _mm_slli_epi32(ewitab,2);
924 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
925 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
926 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
927 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
928 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
929 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
930 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
931 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
932 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
934 /* LENNARD-JONES DISPERSION/REPULSION */
936 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
937 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
938 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
939 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
940 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
941 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
943 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
945 /* Update potential sum for this i atom from the interaction with this j atom. */
946 velec = _mm256_and_pd(velec,cutoff_mask);
947 velec = _mm256_andnot_pd(dummy_mask,velec);
948 velecsum = _mm256_add_pd(velecsum,velec);
949 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
950 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
951 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
953 fscal = _mm256_add_pd(felec,fvdw);
955 fscal = _mm256_and_pd(fscal,cutoff_mask);
957 fscal = _mm256_andnot_pd(dummy_mask,fscal);
959 /* Calculate temporary vectorial force */
960 tx = _mm256_mul_pd(fscal,dx00);
961 ty = _mm256_mul_pd(fscal,dy00);
962 tz = _mm256_mul_pd(fscal,dz00);
964 /* Update vectorial force */
965 fix0 = _mm256_add_pd(fix0,tx);
966 fiy0 = _mm256_add_pd(fiy0,ty);
967 fiz0 = _mm256_add_pd(fiz0,tz);
969 fjx0 = _mm256_add_pd(fjx0,tx);
970 fjy0 = _mm256_add_pd(fjy0,ty);
971 fjz0 = _mm256_add_pd(fjz0,tz);
975 /**************************
976 * CALCULATE INTERACTIONS *
977 **************************/
979 if (gmx_mm256_any_lt(rsq01,rcutoff2))
982 r01 = _mm256_mul_pd(rsq01,rinv01);
983 r01 = _mm256_andnot_pd(dummy_mask,r01);
985 /* EWALD ELECTROSTATICS */
987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988 ewrt = _mm256_mul_pd(r01,ewtabscale);
989 ewitab = _mm256_cvttpd_epi32(ewrt);
990 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
991 ewitab = _mm_slli_epi32(ewitab,2);
992 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
993 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
994 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
995 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
996 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
997 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
998 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
999 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
1000 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1002 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1004 /* Update potential sum for this i atom from the interaction with this j atom. */
1005 velec = _mm256_and_pd(velec,cutoff_mask);
1006 velec = _mm256_andnot_pd(dummy_mask,velec);
1007 velecsum = _mm256_add_pd(velecsum,velec);
1011 fscal = _mm256_and_pd(fscal,cutoff_mask);
1013 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1015 /* Calculate temporary vectorial force */
1016 tx = _mm256_mul_pd(fscal,dx01);
1017 ty = _mm256_mul_pd(fscal,dy01);
1018 tz = _mm256_mul_pd(fscal,dz01);
1020 /* Update vectorial force */
1021 fix0 = _mm256_add_pd(fix0,tx);
1022 fiy0 = _mm256_add_pd(fiy0,ty);
1023 fiz0 = _mm256_add_pd(fiz0,tz);
1025 fjx1 = _mm256_add_pd(fjx1,tx);
1026 fjy1 = _mm256_add_pd(fjy1,ty);
1027 fjz1 = _mm256_add_pd(fjz1,tz);
1031 /**************************
1032 * CALCULATE INTERACTIONS *
1033 **************************/
1035 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1038 r02 = _mm256_mul_pd(rsq02,rinv02);
1039 r02 = _mm256_andnot_pd(dummy_mask,r02);
1041 /* EWALD ELECTROSTATICS */
1043 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1044 ewrt = _mm256_mul_pd(r02,ewtabscale);
1045 ewitab = _mm256_cvttpd_epi32(ewrt);
1046 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1047 ewitab = _mm_slli_epi32(ewitab,2);
1048 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1049 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1050 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1051 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1052 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1053 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1054 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1055 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
1056 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1058 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1060 /* Update potential sum for this i atom from the interaction with this j atom. */
1061 velec = _mm256_and_pd(velec,cutoff_mask);
1062 velec = _mm256_andnot_pd(dummy_mask,velec);
1063 velecsum = _mm256_add_pd(velecsum,velec);
1067 fscal = _mm256_and_pd(fscal,cutoff_mask);
1069 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1071 /* Calculate temporary vectorial force */
1072 tx = _mm256_mul_pd(fscal,dx02);
1073 ty = _mm256_mul_pd(fscal,dy02);
1074 tz = _mm256_mul_pd(fscal,dz02);
1076 /* Update vectorial force */
1077 fix0 = _mm256_add_pd(fix0,tx);
1078 fiy0 = _mm256_add_pd(fiy0,ty);
1079 fiz0 = _mm256_add_pd(fiz0,tz);
1081 fjx2 = _mm256_add_pd(fjx2,tx);
1082 fjy2 = _mm256_add_pd(fjy2,ty);
1083 fjz2 = _mm256_add_pd(fjz2,tz);
1087 /**************************
1088 * CALCULATE INTERACTIONS *
1089 **************************/
1091 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1094 r10 = _mm256_mul_pd(rsq10,rinv10);
1095 r10 = _mm256_andnot_pd(dummy_mask,r10);
1097 /* EWALD ELECTROSTATICS */
1099 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1100 ewrt = _mm256_mul_pd(r10,ewtabscale);
1101 ewitab = _mm256_cvttpd_epi32(ewrt);
1102 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1103 ewitab = _mm_slli_epi32(ewitab,2);
1104 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1105 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1106 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1107 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1108 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1109 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1110 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1111 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
1112 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1114 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1116 /* Update potential sum for this i atom from the interaction with this j atom. */
1117 velec = _mm256_and_pd(velec,cutoff_mask);
1118 velec = _mm256_andnot_pd(dummy_mask,velec);
1119 velecsum = _mm256_add_pd(velecsum,velec);
1123 fscal = _mm256_and_pd(fscal,cutoff_mask);
1125 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1127 /* Calculate temporary vectorial force */
1128 tx = _mm256_mul_pd(fscal,dx10);
1129 ty = _mm256_mul_pd(fscal,dy10);
1130 tz = _mm256_mul_pd(fscal,dz10);
1132 /* Update vectorial force */
1133 fix1 = _mm256_add_pd(fix1,tx);
1134 fiy1 = _mm256_add_pd(fiy1,ty);
1135 fiz1 = _mm256_add_pd(fiz1,tz);
1137 fjx0 = _mm256_add_pd(fjx0,tx);
1138 fjy0 = _mm256_add_pd(fjy0,ty);
1139 fjz0 = _mm256_add_pd(fjz0,tz);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1150 r11 = _mm256_mul_pd(rsq11,rinv11);
1151 r11 = _mm256_andnot_pd(dummy_mask,r11);
1153 /* EWALD ELECTROSTATICS */
1155 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1156 ewrt = _mm256_mul_pd(r11,ewtabscale);
1157 ewitab = _mm256_cvttpd_epi32(ewrt);
1158 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1159 ewitab = _mm_slli_epi32(ewitab,2);
1160 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1161 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1162 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1163 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1164 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1165 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1166 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1167 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1168 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1170 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1172 /* Update potential sum for this i atom from the interaction with this j atom. */
1173 velec = _mm256_and_pd(velec,cutoff_mask);
1174 velec = _mm256_andnot_pd(dummy_mask,velec);
1175 velecsum = _mm256_add_pd(velecsum,velec);
1179 fscal = _mm256_and_pd(fscal,cutoff_mask);
1181 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1183 /* Calculate temporary vectorial force */
1184 tx = _mm256_mul_pd(fscal,dx11);
1185 ty = _mm256_mul_pd(fscal,dy11);
1186 tz = _mm256_mul_pd(fscal,dz11);
1188 /* Update vectorial force */
1189 fix1 = _mm256_add_pd(fix1,tx);
1190 fiy1 = _mm256_add_pd(fiy1,ty);
1191 fiz1 = _mm256_add_pd(fiz1,tz);
1193 fjx1 = _mm256_add_pd(fjx1,tx);
1194 fjy1 = _mm256_add_pd(fjy1,ty);
1195 fjz1 = _mm256_add_pd(fjz1,tz);
1199 /**************************
1200 * CALCULATE INTERACTIONS *
1201 **************************/
1203 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1206 r12 = _mm256_mul_pd(rsq12,rinv12);
1207 r12 = _mm256_andnot_pd(dummy_mask,r12);
1209 /* EWALD ELECTROSTATICS */
1211 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1212 ewrt = _mm256_mul_pd(r12,ewtabscale);
1213 ewitab = _mm256_cvttpd_epi32(ewrt);
1214 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1215 ewitab = _mm_slli_epi32(ewitab,2);
1216 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1217 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1218 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1219 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1220 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1221 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1222 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1223 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1224 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1226 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1228 /* Update potential sum for this i atom from the interaction with this j atom. */
1229 velec = _mm256_and_pd(velec,cutoff_mask);
1230 velec = _mm256_andnot_pd(dummy_mask,velec);
1231 velecsum = _mm256_add_pd(velecsum,velec);
1235 fscal = _mm256_and_pd(fscal,cutoff_mask);
1237 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1239 /* Calculate temporary vectorial force */
1240 tx = _mm256_mul_pd(fscal,dx12);
1241 ty = _mm256_mul_pd(fscal,dy12);
1242 tz = _mm256_mul_pd(fscal,dz12);
1244 /* Update vectorial force */
1245 fix1 = _mm256_add_pd(fix1,tx);
1246 fiy1 = _mm256_add_pd(fiy1,ty);
1247 fiz1 = _mm256_add_pd(fiz1,tz);
1249 fjx2 = _mm256_add_pd(fjx2,tx);
1250 fjy2 = _mm256_add_pd(fjy2,ty);
1251 fjz2 = _mm256_add_pd(fjz2,tz);
1255 /**************************
1256 * CALCULATE INTERACTIONS *
1257 **************************/
1259 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1262 r20 = _mm256_mul_pd(rsq20,rinv20);
1263 r20 = _mm256_andnot_pd(dummy_mask,r20);
1265 /* EWALD ELECTROSTATICS */
1267 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1268 ewrt = _mm256_mul_pd(r20,ewtabscale);
1269 ewitab = _mm256_cvttpd_epi32(ewrt);
1270 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1271 ewitab = _mm_slli_epi32(ewitab,2);
1272 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1273 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1274 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1275 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1276 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1277 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1278 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1279 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
1280 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1282 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1284 /* Update potential sum for this i atom from the interaction with this j atom. */
1285 velec = _mm256_and_pd(velec,cutoff_mask);
1286 velec = _mm256_andnot_pd(dummy_mask,velec);
1287 velecsum = _mm256_add_pd(velecsum,velec);
1291 fscal = _mm256_and_pd(fscal,cutoff_mask);
1293 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1295 /* Calculate temporary vectorial force */
1296 tx = _mm256_mul_pd(fscal,dx20);
1297 ty = _mm256_mul_pd(fscal,dy20);
1298 tz = _mm256_mul_pd(fscal,dz20);
1300 /* Update vectorial force */
1301 fix2 = _mm256_add_pd(fix2,tx);
1302 fiy2 = _mm256_add_pd(fiy2,ty);
1303 fiz2 = _mm256_add_pd(fiz2,tz);
1305 fjx0 = _mm256_add_pd(fjx0,tx);
1306 fjy0 = _mm256_add_pd(fjy0,ty);
1307 fjz0 = _mm256_add_pd(fjz0,tz);
1311 /**************************
1312 * CALCULATE INTERACTIONS *
1313 **************************/
1315 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1318 r21 = _mm256_mul_pd(rsq21,rinv21);
1319 r21 = _mm256_andnot_pd(dummy_mask,r21);
1321 /* EWALD ELECTROSTATICS */
1323 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1324 ewrt = _mm256_mul_pd(r21,ewtabscale);
1325 ewitab = _mm256_cvttpd_epi32(ewrt);
1326 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1327 ewitab = _mm_slli_epi32(ewitab,2);
1328 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1329 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1330 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1331 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1332 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1333 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1334 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1335 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1336 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1338 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1340 /* Update potential sum for this i atom from the interaction with this j atom. */
1341 velec = _mm256_and_pd(velec,cutoff_mask);
1342 velec = _mm256_andnot_pd(dummy_mask,velec);
1343 velecsum = _mm256_add_pd(velecsum,velec);
1347 fscal = _mm256_and_pd(fscal,cutoff_mask);
1349 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1351 /* Calculate temporary vectorial force */
1352 tx = _mm256_mul_pd(fscal,dx21);
1353 ty = _mm256_mul_pd(fscal,dy21);
1354 tz = _mm256_mul_pd(fscal,dz21);
1356 /* Update vectorial force */
1357 fix2 = _mm256_add_pd(fix2,tx);
1358 fiy2 = _mm256_add_pd(fiy2,ty);
1359 fiz2 = _mm256_add_pd(fiz2,tz);
1361 fjx1 = _mm256_add_pd(fjx1,tx);
1362 fjy1 = _mm256_add_pd(fjy1,ty);
1363 fjz1 = _mm256_add_pd(fjz1,tz);
1367 /**************************
1368 * CALCULATE INTERACTIONS *
1369 **************************/
1371 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1374 r22 = _mm256_mul_pd(rsq22,rinv22);
1375 r22 = _mm256_andnot_pd(dummy_mask,r22);
1377 /* EWALD ELECTROSTATICS */
1379 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1380 ewrt = _mm256_mul_pd(r22,ewtabscale);
1381 ewitab = _mm256_cvttpd_epi32(ewrt);
1382 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1383 ewitab = _mm_slli_epi32(ewitab,2);
1384 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1385 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1386 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1387 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1388 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1389 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1390 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1391 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1392 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1394 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1396 /* Update potential sum for this i atom from the interaction with this j atom. */
1397 velec = _mm256_and_pd(velec,cutoff_mask);
1398 velec = _mm256_andnot_pd(dummy_mask,velec);
1399 velecsum = _mm256_add_pd(velecsum,velec);
1403 fscal = _mm256_and_pd(fscal,cutoff_mask);
1405 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1407 /* Calculate temporary vectorial force */
1408 tx = _mm256_mul_pd(fscal,dx22);
1409 ty = _mm256_mul_pd(fscal,dy22);
1410 tz = _mm256_mul_pd(fscal,dz22);
1412 /* Update vectorial force */
1413 fix2 = _mm256_add_pd(fix2,tx);
1414 fiy2 = _mm256_add_pd(fiy2,ty);
1415 fiz2 = _mm256_add_pd(fiz2,tz);
1417 fjx2 = _mm256_add_pd(fjx2,tx);
1418 fjy2 = _mm256_add_pd(fjy2,ty);
1419 fjz2 = _mm256_add_pd(fjz2,tz);
1423 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1424 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1425 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1426 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1428 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1429 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1431 /* Inner loop uses 441 flops */
1434 /* End of innermost loop */
1436 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1437 f+i_coord_offset,fshift+i_shift_offset);
1440 /* Update potential energies */
1441 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1442 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1444 /* Increment number of inner iterations */
1445 inneriter += j_index_end - j_index_start;
1447 /* Outer loop uses 20 flops */
1450 /* Increment number of outer iterations */
1453 /* Update outer/inner flops */
1455 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1458 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1459 * Electrostatics interaction: Ewald
1460 * VdW interaction: LennardJones
1461 * Geometry: Water3-Water3
1462 * Calculate force/pot: Force
1465 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1466 (t_nblist * gmx_restrict nlist,
1467 rvec * gmx_restrict xx,
1468 rvec * gmx_restrict ff,
1469 t_forcerec * gmx_restrict fr,
1470 t_mdatoms * gmx_restrict mdatoms,
1471 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1472 t_nrnb * gmx_restrict nrnb)
1474 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1475 * just 0 for non-waters.
1476 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1477 * jnr indices corresponding to data put in the four positions in the SIMD register.
1479 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1480 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1481 int jnrA,jnrB,jnrC,jnrD;
1482 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1483 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1484 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1485 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1486 real rcutoff_scalar;
1487 real *shiftvec,*fshift,*x,*f;
1488 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1489 real scratch[4*DIM];
1490 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1491 real * vdwioffsetptr0;
1492 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1493 real * vdwioffsetptr1;
1494 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1495 real * vdwioffsetptr2;
1496 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1497 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1498 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1499 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1500 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1501 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1502 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1503 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1504 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1505 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1506 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1507 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1508 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1509 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1510 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1511 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1512 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1515 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1518 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1519 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1521 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1522 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1524 __m256d dummy_mask,cutoff_mask;
1525 __m128 tmpmask0,tmpmask1;
1526 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1527 __m256d one = _mm256_set1_pd(1.0);
1528 __m256d two = _mm256_set1_pd(2.0);
1534 jindex = nlist->jindex;
1536 shiftidx = nlist->shift;
1538 shiftvec = fr->shift_vec[0];
1539 fshift = fr->fshift[0];
1540 facel = _mm256_set1_pd(fr->epsfac);
1541 charge = mdatoms->chargeA;
1542 nvdwtype = fr->ntype;
1543 vdwparam = fr->nbfp;
1544 vdwtype = mdatoms->typeA;
1546 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1547 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1548 beta2 = _mm256_mul_pd(beta,beta);
1549 beta3 = _mm256_mul_pd(beta,beta2);
1551 ewtab = fr->ic->tabq_coul_F;
1552 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1553 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1555 /* Setup water-specific parameters */
1556 inr = nlist->iinr[0];
1557 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1558 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1559 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1560 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1562 jq0 = _mm256_set1_pd(charge[inr+0]);
1563 jq1 = _mm256_set1_pd(charge[inr+1]);
1564 jq2 = _mm256_set1_pd(charge[inr+2]);
1565 vdwjidx0A = 2*vdwtype[inr+0];
1566 qq00 = _mm256_mul_pd(iq0,jq0);
1567 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1568 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1569 qq01 = _mm256_mul_pd(iq0,jq1);
1570 qq02 = _mm256_mul_pd(iq0,jq2);
1571 qq10 = _mm256_mul_pd(iq1,jq0);
1572 qq11 = _mm256_mul_pd(iq1,jq1);
1573 qq12 = _mm256_mul_pd(iq1,jq2);
1574 qq20 = _mm256_mul_pd(iq2,jq0);
1575 qq21 = _mm256_mul_pd(iq2,jq1);
1576 qq22 = _mm256_mul_pd(iq2,jq2);
1578 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1579 rcutoff_scalar = fr->rcoulomb;
1580 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1581 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1583 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
1584 rvdw = _mm256_set1_pd(fr->rvdw);
1586 /* Avoid stupid compiler warnings */
1587 jnrA = jnrB = jnrC = jnrD = 0;
1588 j_coord_offsetA = 0;
1589 j_coord_offsetB = 0;
1590 j_coord_offsetC = 0;
1591 j_coord_offsetD = 0;
1596 for(iidx=0;iidx<4*DIM;iidx++)
1598 scratch[iidx] = 0.0;
1601 /* Start outer loop over neighborlists */
1602 for(iidx=0; iidx<nri; iidx++)
1604 /* Load shift vector for this list */
1605 i_shift_offset = DIM*shiftidx[iidx];
1607 /* Load limits for loop over neighbors */
1608 j_index_start = jindex[iidx];
1609 j_index_end = jindex[iidx+1];
1611 /* Get outer coordinate index */
1613 i_coord_offset = DIM*inr;
1615 /* Load i particle coords and add shift vector */
1616 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1617 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1619 fix0 = _mm256_setzero_pd();
1620 fiy0 = _mm256_setzero_pd();
1621 fiz0 = _mm256_setzero_pd();
1622 fix1 = _mm256_setzero_pd();
1623 fiy1 = _mm256_setzero_pd();
1624 fiz1 = _mm256_setzero_pd();
1625 fix2 = _mm256_setzero_pd();
1626 fiy2 = _mm256_setzero_pd();
1627 fiz2 = _mm256_setzero_pd();
1629 /* Start inner kernel loop */
1630 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1633 /* Get j neighbor index, and coordinate index */
1635 jnrB = jjnr[jidx+1];
1636 jnrC = jjnr[jidx+2];
1637 jnrD = jjnr[jidx+3];
1638 j_coord_offsetA = DIM*jnrA;
1639 j_coord_offsetB = DIM*jnrB;
1640 j_coord_offsetC = DIM*jnrC;
1641 j_coord_offsetD = DIM*jnrD;
1643 /* load j atom coordinates */
1644 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1645 x+j_coord_offsetC,x+j_coord_offsetD,
1646 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1648 /* Calculate displacement vector */
1649 dx00 = _mm256_sub_pd(ix0,jx0);
1650 dy00 = _mm256_sub_pd(iy0,jy0);
1651 dz00 = _mm256_sub_pd(iz0,jz0);
1652 dx01 = _mm256_sub_pd(ix0,jx1);
1653 dy01 = _mm256_sub_pd(iy0,jy1);
1654 dz01 = _mm256_sub_pd(iz0,jz1);
1655 dx02 = _mm256_sub_pd(ix0,jx2);
1656 dy02 = _mm256_sub_pd(iy0,jy2);
1657 dz02 = _mm256_sub_pd(iz0,jz2);
1658 dx10 = _mm256_sub_pd(ix1,jx0);
1659 dy10 = _mm256_sub_pd(iy1,jy0);
1660 dz10 = _mm256_sub_pd(iz1,jz0);
1661 dx11 = _mm256_sub_pd(ix1,jx1);
1662 dy11 = _mm256_sub_pd(iy1,jy1);
1663 dz11 = _mm256_sub_pd(iz1,jz1);
1664 dx12 = _mm256_sub_pd(ix1,jx2);
1665 dy12 = _mm256_sub_pd(iy1,jy2);
1666 dz12 = _mm256_sub_pd(iz1,jz2);
1667 dx20 = _mm256_sub_pd(ix2,jx0);
1668 dy20 = _mm256_sub_pd(iy2,jy0);
1669 dz20 = _mm256_sub_pd(iz2,jz0);
1670 dx21 = _mm256_sub_pd(ix2,jx1);
1671 dy21 = _mm256_sub_pd(iy2,jy1);
1672 dz21 = _mm256_sub_pd(iz2,jz1);
1673 dx22 = _mm256_sub_pd(ix2,jx2);
1674 dy22 = _mm256_sub_pd(iy2,jy2);
1675 dz22 = _mm256_sub_pd(iz2,jz2);
1677 /* Calculate squared distance and things based on it */
1678 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1679 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1680 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1681 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1682 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1683 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1684 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1685 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1686 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1688 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1689 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1690 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1691 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1692 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1693 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1694 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1695 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1696 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1698 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1699 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1700 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1701 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1702 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1703 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1704 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1705 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1706 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1708 fjx0 = _mm256_setzero_pd();
1709 fjy0 = _mm256_setzero_pd();
1710 fjz0 = _mm256_setzero_pd();
1711 fjx1 = _mm256_setzero_pd();
1712 fjy1 = _mm256_setzero_pd();
1713 fjz1 = _mm256_setzero_pd();
1714 fjx2 = _mm256_setzero_pd();
1715 fjy2 = _mm256_setzero_pd();
1716 fjz2 = _mm256_setzero_pd();
1718 /**************************
1719 * CALCULATE INTERACTIONS *
1720 **************************/
1722 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1725 r00 = _mm256_mul_pd(rsq00,rinv00);
1727 /* EWALD ELECTROSTATICS */
1729 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1730 ewrt = _mm256_mul_pd(r00,ewtabscale);
1731 ewitab = _mm256_cvttpd_epi32(ewrt);
1732 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1733 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1734 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1736 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1737 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1739 /* LENNARD-JONES DISPERSION/REPULSION */
1741 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1742 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1744 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1746 fscal = _mm256_add_pd(felec,fvdw);
1748 fscal = _mm256_and_pd(fscal,cutoff_mask);
1750 /* Calculate temporary vectorial force */
1751 tx = _mm256_mul_pd(fscal,dx00);
1752 ty = _mm256_mul_pd(fscal,dy00);
1753 tz = _mm256_mul_pd(fscal,dz00);
1755 /* Update vectorial force */
1756 fix0 = _mm256_add_pd(fix0,tx);
1757 fiy0 = _mm256_add_pd(fiy0,ty);
1758 fiz0 = _mm256_add_pd(fiz0,tz);
1760 fjx0 = _mm256_add_pd(fjx0,tx);
1761 fjy0 = _mm256_add_pd(fjy0,ty);
1762 fjz0 = _mm256_add_pd(fjz0,tz);
1766 /**************************
1767 * CALCULATE INTERACTIONS *
1768 **************************/
1770 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1773 r01 = _mm256_mul_pd(rsq01,rinv01);
1775 /* EWALD ELECTROSTATICS */
1777 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1778 ewrt = _mm256_mul_pd(r01,ewtabscale);
1779 ewitab = _mm256_cvttpd_epi32(ewrt);
1780 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1781 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1782 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1784 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1785 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1787 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1791 fscal = _mm256_and_pd(fscal,cutoff_mask);
1793 /* Calculate temporary vectorial force */
1794 tx = _mm256_mul_pd(fscal,dx01);
1795 ty = _mm256_mul_pd(fscal,dy01);
1796 tz = _mm256_mul_pd(fscal,dz01);
1798 /* Update vectorial force */
1799 fix0 = _mm256_add_pd(fix0,tx);
1800 fiy0 = _mm256_add_pd(fiy0,ty);
1801 fiz0 = _mm256_add_pd(fiz0,tz);
1803 fjx1 = _mm256_add_pd(fjx1,tx);
1804 fjy1 = _mm256_add_pd(fjy1,ty);
1805 fjz1 = _mm256_add_pd(fjz1,tz);
1809 /**************************
1810 * CALCULATE INTERACTIONS *
1811 **************************/
1813 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1816 r02 = _mm256_mul_pd(rsq02,rinv02);
1818 /* EWALD ELECTROSTATICS */
1820 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1821 ewrt = _mm256_mul_pd(r02,ewtabscale);
1822 ewitab = _mm256_cvttpd_epi32(ewrt);
1823 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1824 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1825 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1827 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1828 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1830 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1834 fscal = _mm256_and_pd(fscal,cutoff_mask);
1836 /* Calculate temporary vectorial force */
1837 tx = _mm256_mul_pd(fscal,dx02);
1838 ty = _mm256_mul_pd(fscal,dy02);
1839 tz = _mm256_mul_pd(fscal,dz02);
1841 /* Update vectorial force */
1842 fix0 = _mm256_add_pd(fix0,tx);
1843 fiy0 = _mm256_add_pd(fiy0,ty);
1844 fiz0 = _mm256_add_pd(fiz0,tz);
1846 fjx2 = _mm256_add_pd(fjx2,tx);
1847 fjy2 = _mm256_add_pd(fjy2,ty);
1848 fjz2 = _mm256_add_pd(fjz2,tz);
1852 /**************************
1853 * CALCULATE INTERACTIONS *
1854 **************************/
1856 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1859 r10 = _mm256_mul_pd(rsq10,rinv10);
1861 /* EWALD ELECTROSTATICS */
1863 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1864 ewrt = _mm256_mul_pd(r10,ewtabscale);
1865 ewitab = _mm256_cvttpd_epi32(ewrt);
1866 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1867 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1868 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1870 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1871 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1873 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1877 fscal = _mm256_and_pd(fscal,cutoff_mask);
1879 /* Calculate temporary vectorial force */
1880 tx = _mm256_mul_pd(fscal,dx10);
1881 ty = _mm256_mul_pd(fscal,dy10);
1882 tz = _mm256_mul_pd(fscal,dz10);
1884 /* Update vectorial force */
1885 fix1 = _mm256_add_pd(fix1,tx);
1886 fiy1 = _mm256_add_pd(fiy1,ty);
1887 fiz1 = _mm256_add_pd(fiz1,tz);
1889 fjx0 = _mm256_add_pd(fjx0,tx);
1890 fjy0 = _mm256_add_pd(fjy0,ty);
1891 fjz0 = _mm256_add_pd(fjz0,tz);
1895 /**************************
1896 * CALCULATE INTERACTIONS *
1897 **************************/
1899 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1902 r11 = _mm256_mul_pd(rsq11,rinv11);
1904 /* EWALD ELECTROSTATICS */
1906 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1907 ewrt = _mm256_mul_pd(r11,ewtabscale);
1908 ewitab = _mm256_cvttpd_epi32(ewrt);
1909 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1910 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1911 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1913 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1914 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1916 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1920 fscal = _mm256_and_pd(fscal,cutoff_mask);
1922 /* Calculate temporary vectorial force */
1923 tx = _mm256_mul_pd(fscal,dx11);
1924 ty = _mm256_mul_pd(fscal,dy11);
1925 tz = _mm256_mul_pd(fscal,dz11);
1927 /* Update vectorial force */
1928 fix1 = _mm256_add_pd(fix1,tx);
1929 fiy1 = _mm256_add_pd(fiy1,ty);
1930 fiz1 = _mm256_add_pd(fiz1,tz);
1932 fjx1 = _mm256_add_pd(fjx1,tx);
1933 fjy1 = _mm256_add_pd(fjy1,ty);
1934 fjz1 = _mm256_add_pd(fjz1,tz);
1938 /**************************
1939 * CALCULATE INTERACTIONS *
1940 **************************/
1942 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1945 r12 = _mm256_mul_pd(rsq12,rinv12);
1947 /* EWALD ELECTROSTATICS */
1949 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1950 ewrt = _mm256_mul_pd(r12,ewtabscale);
1951 ewitab = _mm256_cvttpd_epi32(ewrt);
1952 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1953 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1954 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1956 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1957 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1959 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1963 fscal = _mm256_and_pd(fscal,cutoff_mask);
1965 /* Calculate temporary vectorial force */
1966 tx = _mm256_mul_pd(fscal,dx12);
1967 ty = _mm256_mul_pd(fscal,dy12);
1968 tz = _mm256_mul_pd(fscal,dz12);
1970 /* Update vectorial force */
1971 fix1 = _mm256_add_pd(fix1,tx);
1972 fiy1 = _mm256_add_pd(fiy1,ty);
1973 fiz1 = _mm256_add_pd(fiz1,tz);
1975 fjx2 = _mm256_add_pd(fjx2,tx);
1976 fjy2 = _mm256_add_pd(fjy2,ty);
1977 fjz2 = _mm256_add_pd(fjz2,tz);
1981 /**************************
1982 * CALCULATE INTERACTIONS *
1983 **************************/
1985 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1988 r20 = _mm256_mul_pd(rsq20,rinv20);
1990 /* EWALD ELECTROSTATICS */
1992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993 ewrt = _mm256_mul_pd(r20,ewtabscale);
1994 ewitab = _mm256_cvttpd_epi32(ewrt);
1995 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1996 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1997 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1999 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2000 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2002 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2006 fscal = _mm256_and_pd(fscal,cutoff_mask);
2008 /* Calculate temporary vectorial force */
2009 tx = _mm256_mul_pd(fscal,dx20);
2010 ty = _mm256_mul_pd(fscal,dy20);
2011 tz = _mm256_mul_pd(fscal,dz20);
2013 /* Update vectorial force */
2014 fix2 = _mm256_add_pd(fix2,tx);
2015 fiy2 = _mm256_add_pd(fiy2,ty);
2016 fiz2 = _mm256_add_pd(fiz2,tz);
2018 fjx0 = _mm256_add_pd(fjx0,tx);
2019 fjy0 = _mm256_add_pd(fjy0,ty);
2020 fjz0 = _mm256_add_pd(fjz0,tz);
2024 /**************************
2025 * CALCULATE INTERACTIONS *
2026 **************************/
2028 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2031 r21 = _mm256_mul_pd(rsq21,rinv21);
2033 /* EWALD ELECTROSTATICS */
2035 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2036 ewrt = _mm256_mul_pd(r21,ewtabscale);
2037 ewitab = _mm256_cvttpd_epi32(ewrt);
2038 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2039 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2040 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2042 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2043 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2045 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2049 fscal = _mm256_and_pd(fscal,cutoff_mask);
2051 /* Calculate temporary vectorial force */
2052 tx = _mm256_mul_pd(fscal,dx21);
2053 ty = _mm256_mul_pd(fscal,dy21);
2054 tz = _mm256_mul_pd(fscal,dz21);
2056 /* Update vectorial force */
2057 fix2 = _mm256_add_pd(fix2,tx);
2058 fiy2 = _mm256_add_pd(fiy2,ty);
2059 fiz2 = _mm256_add_pd(fiz2,tz);
2061 fjx1 = _mm256_add_pd(fjx1,tx);
2062 fjy1 = _mm256_add_pd(fjy1,ty);
2063 fjz1 = _mm256_add_pd(fjz1,tz);
2067 /**************************
2068 * CALCULATE INTERACTIONS *
2069 **************************/
2071 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2074 r22 = _mm256_mul_pd(rsq22,rinv22);
2076 /* EWALD ELECTROSTATICS */
2078 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2079 ewrt = _mm256_mul_pd(r22,ewtabscale);
2080 ewitab = _mm256_cvttpd_epi32(ewrt);
2081 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2082 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2083 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2085 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2086 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2088 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2092 fscal = _mm256_and_pd(fscal,cutoff_mask);
2094 /* Calculate temporary vectorial force */
2095 tx = _mm256_mul_pd(fscal,dx22);
2096 ty = _mm256_mul_pd(fscal,dy22);
2097 tz = _mm256_mul_pd(fscal,dz22);
2099 /* Update vectorial force */
2100 fix2 = _mm256_add_pd(fix2,tx);
2101 fiy2 = _mm256_add_pd(fiy2,ty);
2102 fiz2 = _mm256_add_pd(fiz2,tz);
2104 fjx2 = _mm256_add_pd(fjx2,tx);
2105 fjy2 = _mm256_add_pd(fjy2,ty);
2106 fjz2 = _mm256_add_pd(fjz2,tz);
2110 fjptrA = f+j_coord_offsetA;
2111 fjptrB = f+j_coord_offsetB;
2112 fjptrC = f+j_coord_offsetC;
2113 fjptrD = f+j_coord_offsetD;
2115 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2116 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2118 /* Inner loop uses 358 flops */
2121 if(jidx<j_index_end)
2124 /* Get j neighbor index, and coordinate index */
2125 jnrlistA = jjnr[jidx];
2126 jnrlistB = jjnr[jidx+1];
2127 jnrlistC = jjnr[jidx+2];
2128 jnrlistD = jjnr[jidx+3];
2129 /* Sign of each element will be negative for non-real atoms.
2130 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2131 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2133 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2135 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2136 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2137 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2139 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2140 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2141 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2142 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2143 j_coord_offsetA = DIM*jnrA;
2144 j_coord_offsetB = DIM*jnrB;
2145 j_coord_offsetC = DIM*jnrC;
2146 j_coord_offsetD = DIM*jnrD;
2148 /* load j atom coordinates */
2149 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2150 x+j_coord_offsetC,x+j_coord_offsetD,
2151 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2153 /* Calculate displacement vector */
2154 dx00 = _mm256_sub_pd(ix0,jx0);
2155 dy00 = _mm256_sub_pd(iy0,jy0);
2156 dz00 = _mm256_sub_pd(iz0,jz0);
2157 dx01 = _mm256_sub_pd(ix0,jx1);
2158 dy01 = _mm256_sub_pd(iy0,jy1);
2159 dz01 = _mm256_sub_pd(iz0,jz1);
2160 dx02 = _mm256_sub_pd(ix0,jx2);
2161 dy02 = _mm256_sub_pd(iy0,jy2);
2162 dz02 = _mm256_sub_pd(iz0,jz2);
2163 dx10 = _mm256_sub_pd(ix1,jx0);
2164 dy10 = _mm256_sub_pd(iy1,jy0);
2165 dz10 = _mm256_sub_pd(iz1,jz0);
2166 dx11 = _mm256_sub_pd(ix1,jx1);
2167 dy11 = _mm256_sub_pd(iy1,jy1);
2168 dz11 = _mm256_sub_pd(iz1,jz1);
2169 dx12 = _mm256_sub_pd(ix1,jx2);
2170 dy12 = _mm256_sub_pd(iy1,jy2);
2171 dz12 = _mm256_sub_pd(iz1,jz2);
2172 dx20 = _mm256_sub_pd(ix2,jx0);
2173 dy20 = _mm256_sub_pd(iy2,jy0);
2174 dz20 = _mm256_sub_pd(iz2,jz0);
2175 dx21 = _mm256_sub_pd(ix2,jx1);
2176 dy21 = _mm256_sub_pd(iy2,jy1);
2177 dz21 = _mm256_sub_pd(iz2,jz1);
2178 dx22 = _mm256_sub_pd(ix2,jx2);
2179 dy22 = _mm256_sub_pd(iy2,jy2);
2180 dz22 = _mm256_sub_pd(iz2,jz2);
2182 /* Calculate squared distance and things based on it */
2183 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2184 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2185 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2186 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2187 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2188 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2189 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2190 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2191 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2193 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2194 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
2195 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
2196 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
2197 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2198 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2199 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
2200 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2201 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2203 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2204 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2205 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2206 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2207 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2208 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2209 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2210 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2211 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2213 fjx0 = _mm256_setzero_pd();
2214 fjy0 = _mm256_setzero_pd();
2215 fjz0 = _mm256_setzero_pd();
2216 fjx1 = _mm256_setzero_pd();
2217 fjy1 = _mm256_setzero_pd();
2218 fjz1 = _mm256_setzero_pd();
2219 fjx2 = _mm256_setzero_pd();
2220 fjy2 = _mm256_setzero_pd();
2221 fjz2 = _mm256_setzero_pd();
2223 /**************************
2224 * CALCULATE INTERACTIONS *
2225 **************************/
2227 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2230 r00 = _mm256_mul_pd(rsq00,rinv00);
2231 r00 = _mm256_andnot_pd(dummy_mask,r00);
2233 /* EWALD ELECTROSTATICS */
2235 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2236 ewrt = _mm256_mul_pd(r00,ewtabscale);
2237 ewitab = _mm256_cvttpd_epi32(ewrt);
2238 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2239 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2240 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2242 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2243 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2245 /* LENNARD-JONES DISPERSION/REPULSION */
2247 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2248 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2250 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2252 fscal = _mm256_add_pd(felec,fvdw);
2254 fscal = _mm256_and_pd(fscal,cutoff_mask);
2256 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2258 /* Calculate temporary vectorial force */
2259 tx = _mm256_mul_pd(fscal,dx00);
2260 ty = _mm256_mul_pd(fscal,dy00);
2261 tz = _mm256_mul_pd(fscal,dz00);
2263 /* Update vectorial force */
2264 fix0 = _mm256_add_pd(fix0,tx);
2265 fiy0 = _mm256_add_pd(fiy0,ty);
2266 fiz0 = _mm256_add_pd(fiz0,tz);
2268 fjx0 = _mm256_add_pd(fjx0,tx);
2269 fjy0 = _mm256_add_pd(fjy0,ty);
2270 fjz0 = _mm256_add_pd(fjz0,tz);
2274 /**************************
2275 * CALCULATE INTERACTIONS *
2276 **************************/
2278 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2281 r01 = _mm256_mul_pd(rsq01,rinv01);
2282 r01 = _mm256_andnot_pd(dummy_mask,r01);
2284 /* EWALD ELECTROSTATICS */
2286 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2287 ewrt = _mm256_mul_pd(r01,ewtabscale);
2288 ewitab = _mm256_cvttpd_epi32(ewrt);
2289 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2290 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2291 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2293 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2294 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2296 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2300 fscal = _mm256_and_pd(fscal,cutoff_mask);
2302 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2304 /* Calculate temporary vectorial force */
2305 tx = _mm256_mul_pd(fscal,dx01);
2306 ty = _mm256_mul_pd(fscal,dy01);
2307 tz = _mm256_mul_pd(fscal,dz01);
2309 /* Update vectorial force */
2310 fix0 = _mm256_add_pd(fix0,tx);
2311 fiy0 = _mm256_add_pd(fiy0,ty);
2312 fiz0 = _mm256_add_pd(fiz0,tz);
2314 fjx1 = _mm256_add_pd(fjx1,tx);
2315 fjy1 = _mm256_add_pd(fjy1,ty);
2316 fjz1 = _mm256_add_pd(fjz1,tz);
2320 /**************************
2321 * CALCULATE INTERACTIONS *
2322 **************************/
2324 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2327 r02 = _mm256_mul_pd(rsq02,rinv02);
2328 r02 = _mm256_andnot_pd(dummy_mask,r02);
2330 /* EWALD ELECTROSTATICS */
2332 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2333 ewrt = _mm256_mul_pd(r02,ewtabscale);
2334 ewitab = _mm256_cvttpd_epi32(ewrt);
2335 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2336 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2337 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2339 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2340 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2342 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2346 fscal = _mm256_and_pd(fscal,cutoff_mask);
2348 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2350 /* Calculate temporary vectorial force */
2351 tx = _mm256_mul_pd(fscal,dx02);
2352 ty = _mm256_mul_pd(fscal,dy02);
2353 tz = _mm256_mul_pd(fscal,dz02);
2355 /* Update vectorial force */
2356 fix0 = _mm256_add_pd(fix0,tx);
2357 fiy0 = _mm256_add_pd(fiy0,ty);
2358 fiz0 = _mm256_add_pd(fiz0,tz);
2360 fjx2 = _mm256_add_pd(fjx2,tx);
2361 fjy2 = _mm256_add_pd(fjy2,ty);
2362 fjz2 = _mm256_add_pd(fjz2,tz);
2366 /**************************
2367 * CALCULATE INTERACTIONS *
2368 **************************/
2370 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2373 r10 = _mm256_mul_pd(rsq10,rinv10);
2374 r10 = _mm256_andnot_pd(dummy_mask,r10);
2376 /* EWALD ELECTROSTATICS */
2378 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2379 ewrt = _mm256_mul_pd(r10,ewtabscale);
2380 ewitab = _mm256_cvttpd_epi32(ewrt);
2381 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2382 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2383 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2385 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2386 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2388 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2392 fscal = _mm256_and_pd(fscal,cutoff_mask);
2394 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2396 /* Calculate temporary vectorial force */
2397 tx = _mm256_mul_pd(fscal,dx10);
2398 ty = _mm256_mul_pd(fscal,dy10);
2399 tz = _mm256_mul_pd(fscal,dz10);
2401 /* Update vectorial force */
2402 fix1 = _mm256_add_pd(fix1,tx);
2403 fiy1 = _mm256_add_pd(fiy1,ty);
2404 fiz1 = _mm256_add_pd(fiz1,tz);
2406 fjx0 = _mm256_add_pd(fjx0,tx);
2407 fjy0 = _mm256_add_pd(fjy0,ty);
2408 fjz0 = _mm256_add_pd(fjz0,tz);
2412 /**************************
2413 * CALCULATE INTERACTIONS *
2414 **************************/
2416 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2419 r11 = _mm256_mul_pd(rsq11,rinv11);
2420 r11 = _mm256_andnot_pd(dummy_mask,r11);
2422 /* EWALD ELECTROSTATICS */
2424 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2425 ewrt = _mm256_mul_pd(r11,ewtabscale);
2426 ewitab = _mm256_cvttpd_epi32(ewrt);
2427 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2428 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2429 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2431 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2432 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2434 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2438 fscal = _mm256_and_pd(fscal,cutoff_mask);
2440 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2442 /* Calculate temporary vectorial force */
2443 tx = _mm256_mul_pd(fscal,dx11);
2444 ty = _mm256_mul_pd(fscal,dy11);
2445 tz = _mm256_mul_pd(fscal,dz11);
2447 /* Update vectorial force */
2448 fix1 = _mm256_add_pd(fix1,tx);
2449 fiy1 = _mm256_add_pd(fiy1,ty);
2450 fiz1 = _mm256_add_pd(fiz1,tz);
2452 fjx1 = _mm256_add_pd(fjx1,tx);
2453 fjy1 = _mm256_add_pd(fjy1,ty);
2454 fjz1 = _mm256_add_pd(fjz1,tz);
2458 /**************************
2459 * CALCULATE INTERACTIONS *
2460 **************************/
2462 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2465 r12 = _mm256_mul_pd(rsq12,rinv12);
2466 r12 = _mm256_andnot_pd(dummy_mask,r12);
2468 /* EWALD ELECTROSTATICS */
2470 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2471 ewrt = _mm256_mul_pd(r12,ewtabscale);
2472 ewitab = _mm256_cvttpd_epi32(ewrt);
2473 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2474 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2475 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2477 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2478 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2480 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2484 fscal = _mm256_and_pd(fscal,cutoff_mask);
2486 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2488 /* Calculate temporary vectorial force */
2489 tx = _mm256_mul_pd(fscal,dx12);
2490 ty = _mm256_mul_pd(fscal,dy12);
2491 tz = _mm256_mul_pd(fscal,dz12);
2493 /* Update vectorial force */
2494 fix1 = _mm256_add_pd(fix1,tx);
2495 fiy1 = _mm256_add_pd(fiy1,ty);
2496 fiz1 = _mm256_add_pd(fiz1,tz);
2498 fjx2 = _mm256_add_pd(fjx2,tx);
2499 fjy2 = _mm256_add_pd(fjy2,ty);
2500 fjz2 = _mm256_add_pd(fjz2,tz);
2504 /**************************
2505 * CALCULATE INTERACTIONS *
2506 **************************/
2508 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2511 r20 = _mm256_mul_pd(rsq20,rinv20);
2512 r20 = _mm256_andnot_pd(dummy_mask,r20);
2514 /* EWALD ELECTROSTATICS */
2516 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2517 ewrt = _mm256_mul_pd(r20,ewtabscale);
2518 ewitab = _mm256_cvttpd_epi32(ewrt);
2519 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2520 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2521 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2523 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2524 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2526 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2530 fscal = _mm256_and_pd(fscal,cutoff_mask);
2532 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2534 /* Calculate temporary vectorial force */
2535 tx = _mm256_mul_pd(fscal,dx20);
2536 ty = _mm256_mul_pd(fscal,dy20);
2537 tz = _mm256_mul_pd(fscal,dz20);
2539 /* Update vectorial force */
2540 fix2 = _mm256_add_pd(fix2,tx);
2541 fiy2 = _mm256_add_pd(fiy2,ty);
2542 fiz2 = _mm256_add_pd(fiz2,tz);
2544 fjx0 = _mm256_add_pd(fjx0,tx);
2545 fjy0 = _mm256_add_pd(fjy0,ty);
2546 fjz0 = _mm256_add_pd(fjz0,tz);
2550 /**************************
2551 * CALCULATE INTERACTIONS *
2552 **************************/
2554 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2557 r21 = _mm256_mul_pd(rsq21,rinv21);
2558 r21 = _mm256_andnot_pd(dummy_mask,r21);
2560 /* EWALD ELECTROSTATICS */
2562 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2563 ewrt = _mm256_mul_pd(r21,ewtabscale);
2564 ewitab = _mm256_cvttpd_epi32(ewrt);
2565 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2566 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2567 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2569 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2570 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2572 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2576 fscal = _mm256_and_pd(fscal,cutoff_mask);
2578 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2580 /* Calculate temporary vectorial force */
2581 tx = _mm256_mul_pd(fscal,dx21);
2582 ty = _mm256_mul_pd(fscal,dy21);
2583 tz = _mm256_mul_pd(fscal,dz21);
2585 /* Update vectorial force */
2586 fix2 = _mm256_add_pd(fix2,tx);
2587 fiy2 = _mm256_add_pd(fiy2,ty);
2588 fiz2 = _mm256_add_pd(fiz2,tz);
2590 fjx1 = _mm256_add_pd(fjx1,tx);
2591 fjy1 = _mm256_add_pd(fjy1,ty);
2592 fjz1 = _mm256_add_pd(fjz1,tz);
2596 /**************************
2597 * CALCULATE INTERACTIONS *
2598 **************************/
2600 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2603 r22 = _mm256_mul_pd(rsq22,rinv22);
2604 r22 = _mm256_andnot_pd(dummy_mask,r22);
2606 /* EWALD ELECTROSTATICS */
2608 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2609 ewrt = _mm256_mul_pd(r22,ewtabscale);
2610 ewitab = _mm256_cvttpd_epi32(ewrt);
2611 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2612 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2613 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2615 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2616 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2618 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2622 fscal = _mm256_and_pd(fscal,cutoff_mask);
2624 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2626 /* Calculate temporary vectorial force */
2627 tx = _mm256_mul_pd(fscal,dx22);
2628 ty = _mm256_mul_pd(fscal,dy22);
2629 tz = _mm256_mul_pd(fscal,dz22);
2631 /* Update vectorial force */
2632 fix2 = _mm256_add_pd(fix2,tx);
2633 fiy2 = _mm256_add_pd(fiy2,ty);
2634 fiz2 = _mm256_add_pd(fiz2,tz);
2636 fjx2 = _mm256_add_pd(fjx2,tx);
2637 fjy2 = _mm256_add_pd(fjy2,ty);
2638 fjz2 = _mm256_add_pd(fjz2,tz);
2642 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2643 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2644 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2645 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2647 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2648 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2650 /* Inner loop uses 367 flops */
2653 /* End of innermost loop */
2655 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2656 f+i_coord_offset,fshift+i_shift_offset);
2658 /* Increment number of inner iterations */
2659 inneriter += j_index_end - j_index_start;
2661 /* Outer loop uses 18 flops */
2664 /* Increment number of outer iterations */
2667 /* Update outer/inner flops */
2669 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);