2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gmx_math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
114 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
116 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
117 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
119 __m256d dummy_mask,cutoff_mask;
120 __m128 tmpmask0,tmpmask1;
121 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
122 __m256d one = _mm256_set1_pd(1.0);
123 __m256d two = _mm256_set1_pd(2.0);
129 jindex = nlist->jindex;
131 shiftidx = nlist->shift;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm256_set1_pd(fr->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
139 vdwtype = mdatoms->typeA;
141 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
142 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
143 beta2 = _mm256_mul_pd(beta,beta);
144 beta3 = _mm256_mul_pd(beta,beta2);
146 ewtab = fr->ic->tabq_coul_FDV0;
147 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
148 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
150 /* Setup water-specific parameters */
151 inr = nlist->iinr[0];
152 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
153 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
154 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
155 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
157 jq0 = _mm256_set1_pd(charge[inr+0]);
158 jq1 = _mm256_set1_pd(charge[inr+1]);
159 jq2 = _mm256_set1_pd(charge[inr+2]);
160 vdwjidx0A = 2*vdwtype[inr+0];
161 qq00 = _mm256_mul_pd(iq0,jq0);
162 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
163 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
164 qq01 = _mm256_mul_pd(iq0,jq1);
165 qq02 = _mm256_mul_pd(iq0,jq2);
166 qq10 = _mm256_mul_pd(iq1,jq0);
167 qq11 = _mm256_mul_pd(iq1,jq1);
168 qq12 = _mm256_mul_pd(iq1,jq2);
169 qq20 = _mm256_mul_pd(iq2,jq0);
170 qq21 = _mm256_mul_pd(iq2,jq1);
171 qq22 = _mm256_mul_pd(iq2,jq2);
173 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
174 rcutoff_scalar = fr->rcoulomb;
175 rcutoff = _mm256_set1_pd(rcutoff_scalar);
176 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
178 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
179 rvdw = _mm256_set1_pd(fr->rvdw);
181 /* Avoid stupid compiler warnings */
182 jnrA = jnrB = jnrC = jnrD = 0;
191 for(iidx=0;iidx<4*DIM;iidx++)
196 /* Start outer loop over neighborlists */
197 for(iidx=0; iidx<nri; iidx++)
199 /* Load shift vector for this list */
200 i_shift_offset = DIM*shiftidx[iidx];
202 /* Load limits for loop over neighbors */
203 j_index_start = jindex[iidx];
204 j_index_end = jindex[iidx+1];
206 /* Get outer coordinate index */
208 i_coord_offset = DIM*inr;
210 /* Load i particle coords and add shift vector */
211 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
212 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
214 fix0 = _mm256_setzero_pd();
215 fiy0 = _mm256_setzero_pd();
216 fiz0 = _mm256_setzero_pd();
217 fix1 = _mm256_setzero_pd();
218 fiy1 = _mm256_setzero_pd();
219 fiz1 = _mm256_setzero_pd();
220 fix2 = _mm256_setzero_pd();
221 fiy2 = _mm256_setzero_pd();
222 fiz2 = _mm256_setzero_pd();
224 /* Reset potential sums */
225 velecsum = _mm256_setzero_pd();
226 vvdwsum = _mm256_setzero_pd();
228 /* Start inner kernel loop */
229 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
232 /* Get j neighbor index, and coordinate index */
237 j_coord_offsetA = DIM*jnrA;
238 j_coord_offsetB = DIM*jnrB;
239 j_coord_offsetC = DIM*jnrC;
240 j_coord_offsetD = DIM*jnrD;
242 /* load j atom coordinates */
243 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
244 x+j_coord_offsetC,x+j_coord_offsetD,
245 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
247 /* Calculate displacement vector */
248 dx00 = _mm256_sub_pd(ix0,jx0);
249 dy00 = _mm256_sub_pd(iy0,jy0);
250 dz00 = _mm256_sub_pd(iz0,jz0);
251 dx01 = _mm256_sub_pd(ix0,jx1);
252 dy01 = _mm256_sub_pd(iy0,jy1);
253 dz01 = _mm256_sub_pd(iz0,jz1);
254 dx02 = _mm256_sub_pd(ix0,jx2);
255 dy02 = _mm256_sub_pd(iy0,jy2);
256 dz02 = _mm256_sub_pd(iz0,jz2);
257 dx10 = _mm256_sub_pd(ix1,jx0);
258 dy10 = _mm256_sub_pd(iy1,jy0);
259 dz10 = _mm256_sub_pd(iz1,jz0);
260 dx11 = _mm256_sub_pd(ix1,jx1);
261 dy11 = _mm256_sub_pd(iy1,jy1);
262 dz11 = _mm256_sub_pd(iz1,jz1);
263 dx12 = _mm256_sub_pd(ix1,jx2);
264 dy12 = _mm256_sub_pd(iy1,jy2);
265 dz12 = _mm256_sub_pd(iz1,jz2);
266 dx20 = _mm256_sub_pd(ix2,jx0);
267 dy20 = _mm256_sub_pd(iy2,jy0);
268 dz20 = _mm256_sub_pd(iz2,jz0);
269 dx21 = _mm256_sub_pd(ix2,jx1);
270 dy21 = _mm256_sub_pd(iy2,jy1);
271 dz21 = _mm256_sub_pd(iz2,jz1);
272 dx22 = _mm256_sub_pd(ix2,jx2);
273 dy22 = _mm256_sub_pd(iy2,jy2);
274 dz22 = _mm256_sub_pd(iz2,jz2);
276 /* Calculate squared distance and things based on it */
277 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
278 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
279 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
280 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
281 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
282 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
283 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
284 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
285 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
287 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
288 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
289 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
290 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
291 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
292 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
293 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
294 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
295 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
297 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
298 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
299 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
300 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
301 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
302 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
303 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
304 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
305 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
307 fjx0 = _mm256_setzero_pd();
308 fjy0 = _mm256_setzero_pd();
309 fjz0 = _mm256_setzero_pd();
310 fjx1 = _mm256_setzero_pd();
311 fjy1 = _mm256_setzero_pd();
312 fjz1 = _mm256_setzero_pd();
313 fjx2 = _mm256_setzero_pd();
314 fjy2 = _mm256_setzero_pd();
315 fjz2 = _mm256_setzero_pd();
317 /**************************
318 * CALCULATE INTERACTIONS *
319 **************************/
321 if (gmx_mm256_any_lt(rsq00,rcutoff2))
324 r00 = _mm256_mul_pd(rsq00,rinv00);
326 /* EWALD ELECTROSTATICS */
328 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
329 ewrt = _mm256_mul_pd(r00,ewtabscale);
330 ewitab = _mm256_cvttpd_epi32(ewrt);
331 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
332 ewitab = _mm_slli_epi32(ewitab,2);
333 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
334 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
335 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
336 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
337 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
338 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
339 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
340 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
341 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
343 /* LENNARD-JONES DISPERSION/REPULSION */
345 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
346 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
347 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
348 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
349 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
350 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
352 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
354 /* Update potential sum for this i atom from the interaction with this j atom. */
355 velec = _mm256_and_pd(velec,cutoff_mask);
356 velecsum = _mm256_add_pd(velecsum,velec);
357 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
358 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
360 fscal = _mm256_add_pd(felec,fvdw);
362 fscal = _mm256_and_pd(fscal,cutoff_mask);
364 /* Calculate temporary vectorial force */
365 tx = _mm256_mul_pd(fscal,dx00);
366 ty = _mm256_mul_pd(fscal,dy00);
367 tz = _mm256_mul_pd(fscal,dz00);
369 /* Update vectorial force */
370 fix0 = _mm256_add_pd(fix0,tx);
371 fiy0 = _mm256_add_pd(fiy0,ty);
372 fiz0 = _mm256_add_pd(fiz0,tz);
374 fjx0 = _mm256_add_pd(fjx0,tx);
375 fjy0 = _mm256_add_pd(fjy0,ty);
376 fjz0 = _mm256_add_pd(fjz0,tz);
380 /**************************
381 * CALCULATE INTERACTIONS *
382 **************************/
384 if (gmx_mm256_any_lt(rsq01,rcutoff2))
387 r01 = _mm256_mul_pd(rsq01,rinv01);
389 /* EWALD ELECTROSTATICS */
391 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
392 ewrt = _mm256_mul_pd(r01,ewtabscale);
393 ewitab = _mm256_cvttpd_epi32(ewrt);
394 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
395 ewitab = _mm_slli_epi32(ewitab,2);
396 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
397 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
398 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
399 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
400 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
401 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
402 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
403 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
404 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
406 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
408 /* Update potential sum for this i atom from the interaction with this j atom. */
409 velec = _mm256_and_pd(velec,cutoff_mask);
410 velecsum = _mm256_add_pd(velecsum,velec);
414 fscal = _mm256_and_pd(fscal,cutoff_mask);
416 /* Calculate temporary vectorial force */
417 tx = _mm256_mul_pd(fscal,dx01);
418 ty = _mm256_mul_pd(fscal,dy01);
419 tz = _mm256_mul_pd(fscal,dz01);
421 /* Update vectorial force */
422 fix0 = _mm256_add_pd(fix0,tx);
423 fiy0 = _mm256_add_pd(fiy0,ty);
424 fiz0 = _mm256_add_pd(fiz0,tz);
426 fjx1 = _mm256_add_pd(fjx1,tx);
427 fjy1 = _mm256_add_pd(fjy1,ty);
428 fjz1 = _mm256_add_pd(fjz1,tz);
432 /**************************
433 * CALCULATE INTERACTIONS *
434 **************************/
436 if (gmx_mm256_any_lt(rsq02,rcutoff2))
439 r02 = _mm256_mul_pd(rsq02,rinv02);
441 /* EWALD ELECTROSTATICS */
443 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
444 ewrt = _mm256_mul_pd(r02,ewtabscale);
445 ewitab = _mm256_cvttpd_epi32(ewrt);
446 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
447 ewitab = _mm_slli_epi32(ewitab,2);
448 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
449 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
450 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
451 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
452 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
453 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
454 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
455 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
456 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
458 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
460 /* Update potential sum for this i atom from the interaction with this j atom. */
461 velec = _mm256_and_pd(velec,cutoff_mask);
462 velecsum = _mm256_add_pd(velecsum,velec);
466 fscal = _mm256_and_pd(fscal,cutoff_mask);
468 /* Calculate temporary vectorial force */
469 tx = _mm256_mul_pd(fscal,dx02);
470 ty = _mm256_mul_pd(fscal,dy02);
471 tz = _mm256_mul_pd(fscal,dz02);
473 /* Update vectorial force */
474 fix0 = _mm256_add_pd(fix0,tx);
475 fiy0 = _mm256_add_pd(fiy0,ty);
476 fiz0 = _mm256_add_pd(fiz0,tz);
478 fjx2 = _mm256_add_pd(fjx2,tx);
479 fjy2 = _mm256_add_pd(fjy2,ty);
480 fjz2 = _mm256_add_pd(fjz2,tz);
484 /**************************
485 * CALCULATE INTERACTIONS *
486 **************************/
488 if (gmx_mm256_any_lt(rsq10,rcutoff2))
491 r10 = _mm256_mul_pd(rsq10,rinv10);
493 /* EWALD ELECTROSTATICS */
495 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
496 ewrt = _mm256_mul_pd(r10,ewtabscale);
497 ewitab = _mm256_cvttpd_epi32(ewrt);
498 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
499 ewitab = _mm_slli_epi32(ewitab,2);
500 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
501 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
502 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
503 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
504 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
505 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
506 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
507 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
508 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
510 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
512 /* Update potential sum for this i atom from the interaction with this j atom. */
513 velec = _mm256_and_pd(velec,cutoff_mask);
514 velecsum = _mm256_add_pd(velecsum,velec);
518 fscal = _mm256_and_pd(fscal,cutoff_mask);
520 /* Calculate temporary vectorial force */
521 tx = _mm256_mul_pd(fscal,dx10);
522 ty = _mm256_mul_pd(fscal,dy10);
523 tz = _mm256_mul_pd(fscal,dz10);
525 /* Update vectorial force */
526 fix1 = _mm256_add_pd(fix1,tx);
527 fiy1 = _mm256_add_pd(fiy1,ty);
528 fiz1 = _mm256_add_pd(fiz1,tz);
530 fjx0 = _mm256_add_pd(fjx0,tx);
531 fjy0 = _mm256_add_pd(fjy0,ty);
532 fjz0 = _mm256_add_pd(fjz0,tz);
536 /**************************
537 * CALCULATE INTERACTIONS *
538 **************************/
540 if (gmx_mm256_any_lt(rsq11,rcutoff2))
543 r11 = _mm256_mul_pd(rsq11,rinv11);
545 /* EWALD ELECTROSTATICS */
547 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
548 ewrt = _mm256_mul_pd(r11,ewtabscale);
549 ewitab = _mm256_cvttpd_epi32(ewrt);
550 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
551 ewitab = _mm_slli_epi32(ewitab,2);
552 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
553 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
554 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
555 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
556 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
557 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
558 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
559 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
560 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
562 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
564 /* Update potential sum for this i atom from the interaction with this j atom. */
565 velec = _mm256_and_pd(velec,cutoff_mask);
566 velecsum = _mm256_add_pd(velecsum,velec);
570 fscal = _mm256_and_pd(fscal,cutoff_mask);
572 /* Calculate temporary vectorial force */
573 tx = _mm256_mul_pd(fscal,dx11);
574 ty = _mm256_mul_pd(fscal,dy11);
575 tz = _mm256_mul_pd(fscal,dz11);
577 /* Update vectorial force */
578 fix1 = _mm256_add_pd(fix1,tx);
579 fiy1 = _mm256_add_pd(fiy1,ty);
580 fiz1 = _mm256_add_pd(fiz1,tz);
582 fjx1 = _mm256_add_pd(fjx1,tx);
583 fjy1 = _mm256_add_pd(fjy1,ty);
584 fjz1 = _mm256_add_pd(fjz1,tz);
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
592 if (gmx_mm256_any_lt(rsq12,rcutoff2))
595 r12 = _mm256_mul_pd(rsq12,rinv12);
597 /* EWALD ELECTROSTATICS */
599 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
600 ewrt = _mm256_mul_pd(r12,ewtabscale);
601 ewitab = _mm256_cvttpd_epi32(ewrt);
602 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
603 ewitab = _mm_slli_epi32(ewitab,2);
604 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
605 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
606 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
607 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
608 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
609 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
610 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
611 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
612 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
614 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
616 /* Update potential sum for this i atom from the interaction with this j atom. */
617 velec = _mm256_and_pd(velec,cutoff_mask);
618 velecsum = _mm256_add_pd(velecsum,velec);
622 fscal = _mm256_and_pd(fscal,cutoff_mask);
624 /* Calculate temporary vectorial force */
625 tx = _mm256_mul_pd(fscal,dx12);
626 ty = _mm256_mul_pd(fscal,dy12);
627 tz = _mm256_mul_pd(fscal,dz12);
629 /* Update vectorial force */
630 fix1 = _mm256_add_pd(fix1,tx);
631 fiy1 = _mm256_add_pd(fiy1,ty);
632 fiz1 = _mm256_add_pd(fiz1,tz);
634 fjx2 = _mm256_add_pd(fjx2,tx);
635 fjy2 = _mm256_add_pd(fjy2,ty);
636 fjz2 = _mm256_add_pd(fjz2,tz);
640 /**************************
641 * CALCULATE INTERACTIONS *
642 **************************/
644 if (gmx_mm256_any_lt(rsq20,rcutoff2))
647 r20 = _mm256_mul_pd(rsq20,rinv20);
649 /* EWALD ELECTROSTATICS */
651 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
652 ewrt = _mm256_mul_pd(r20,ewtabscale);
653 ewitab = _mm256_cvttpd_epi32(ewrt);
654 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
655 ewitab = _mm_slli_epi32(ewitab,2);
656 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
657 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
658 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
659 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
660 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
661 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
662 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
663 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
664 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
666 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
668 /* Update potential sum for this i atom from the interaction with this j atom. */
669 velec = _mm256_and_pd(velec,cutoff_mask);
670 velecsum = _mm256_add_pd(velecsum,velec);
674 fscal = _mm256_and_pd(fscal,cutoff_mask);
676 /* Calculate temporary vectorial force */
677 tx = _mm256_mul_pd(fscal,dx20);
678 ty = _mm256_mul_pd(fscal,dy20);
679 tz = _mm256_mul_pd(fscal,dz20);
681 /* Update vectorial force */
682 fix2 = _mm256_add_pd(fix2,tx);
683 fiy2 = _mm256_add_pd(fiy2,ty);
684 fiz2 = _mm256_add_pd(fiz2,tz);
686 fjx0 = _mm256_add_pd(fjx0,tx);
687 fjy0 = _mm256_add_pd(fjy0,ty);
688 fjz0 = _mm256_add_pd(fjz0,tz);
692 /**************************
693 * CALCULATE INTERACTIONS *
694 **************************/
696 if (gmx_mm256_any_lt(rsq21,rcutoff2))
699 r21 = _mm256_mul_pd(rsq21,rinv21);
701 /* EWALD ELECTROSTATICS */
703 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
704 ewrt = _mm256_mul_pd(r21,ewtabscale);
705 ewitab = _mm256_cvttpd_epi32(ewrt);
706 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
707 ewitab = _mm_slli_epi32(ewitab,2);
708 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
709 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
710 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
711 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
712 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
713 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
714 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
715 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
716 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
718 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
720 /* Update potential sum for this i atom from the interaction with this j atom. */
721 velec = _mm256_and_pd(velec,cutoff_mask);
722 velecsum = _mm256_add_pd(velecsum,velec);
726 fscal = _mm256_and_pd(fscal,cutoff_mask);
728 /* Calculate temporary vectorial force */
729 tx = _mm256_mul_pd(fscal,dx21);
730 ty = _mm256_mul_pd(fscal,dy21);
731 tz = _mm256_mul_pd(fscal,dz21);
733 /* Update vectorial force */
734 fix2 = _mm256_add_pd(fix2,tx);
735 fiy2 = _mm256_add_pd(fiy2,ty);
736 fiz2 = _mm256_add_pd(fiz2,tz);
738 fjx1 = _mm256_add_pd(fjx1,tx);
739 fjy1 = _mm256_add_pd(fjy1,ty);
740 fjz1 = _mm256_add_pd(fjz1,tz);
744 /**************************
745 * CALCULATE INTERACTIONS *
746 **************************/
748 if (gmx_mm256_any_lt(rsq22,rcutoff2))
751 r22 = _mm256_mul_pd(rsq22,rinv22);
753 /* EWALD ELECTROSTATICS */
755 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
756 ewrt = _mm256_mul_pd(r22,ewtabscale);
757 ewitab = _mm256_cvttpd_epi32(ewrt);
758 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
759 ewitab = _mm_slli_epi32(ewitab,2);
760 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
761 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
762 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
763 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
764 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
765 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
766 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
767 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
768 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
770 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
772 /* Update potential sum for this i atom from the interaction with this j atom. */
773 velec = _mm256_and_pd(velec,cutoff_mask);
774 velecsum = _mm256_add_pd(velecsum,velec);
778 fscal = _mm256_and_pd(fscal,cutoff_mask);
780 /* Calculate temporary vectorial force */
781 tx = _mm256_mul_pd(fscal,dx22);
782 ty = _mm256_mul_pd(fscal,dy22);
783 tz = _mm256_mul_pd(fscal,dz22);
785 /* Update vectorial force */
786 fix2 = _mm256_add_pd(fix2,tx);
787 fiy2 = _mm256_add_pd(fiy2,ty);
788 fiz2 = _mm256_add_pd(fiz2,tz);
790 fjx2 = _mm256_add_pd(fjx2,tx);
791 fjy2 = _mm256_add_pd(fjy2,ty);
792 fjz2 = _mm256_add_pd(fjz2,tz);
796 fjptrA = f+j_coord_offsetA;
797 fjptrB = f+j_coord_offsetB;
798 fjptrC = f+j_coord_offsetC;
799 fjptrD = f+j_coord_offsetD;
801 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
802 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
804 /* Inner loop uses 432 flops */
810 /* Get j neighbor index, and coordinate index */
811 jnrlistA = jjnr[jidx];
812 jnrlistB = jjnr[jidx+1];
813 jnrlistC = jjnr[jidx+2];
814 jnrlistD = jjnr[jidx+3];
815 /* Sign of each element will be negative for non-real atoms.
816 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
817 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
819 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
821 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
822 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
823 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
825 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
826 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
827 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
828 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
829 j_coord_offsetA = DIM*jnrA;
830 j_coord_offsetB = DIM*jnrB;
831 j_coord_offsetC = DIM*jnrC;
832 j_coord_offsetD = DIM*jnrD;
834 /* load j atom coordinates */
835 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
836 x+j_coord_offsetC,x+j_coord_offsetD,
837 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
839 /* Calculate displacement vector */
840 dx00 = _mm256_sub_pd(ix0,jx0);
841 dy00 = _mm256_sub_pd(iy0,jy0);
842 dz00 = _mm256_sub_pd(iz0,jz0);
843 dx01 = _mm256_sub_pd(ix0,jx1);
844 dy01 = _mm256_sub_pd(iy0,jy1);
845 dz01 = _mm256_sub_pd(iz0,jz1);
846 dx02 = _mm256_sub_pd(ix0,jx2);
847 dy02 = _mm256_sub_pd(iy0,jy2);
848 dz02 = _mm256_sub_pd(iz0,jz2);
849 dx10 = _mm256_sub_pd(ix1,jx0);
850 dy10 = _mm256_sub_pd(iy1,jy0);
851 dz10 = _mm256_sub_pd(iz1,jz0);
852 dx11 = _mm256_sub_pd(ix1,jx1);
853 dy11 = _mm256_sub_pd(iy1,jy1);
854 dz11 = _mm256_sub_pd(iz1,jz1);
855 dx12 = _mm256_sub_pd(ix1,jx2);
856 dy12 = _mm256_sub_pd(iy1,jy2);
857 dz12 = _mm256_sub_pd(iz1,jz2);
858 dx20 = _mm256_sub_pd(ix2,jx0);
859 dy20 = _mm256_sub_pd(iy2,jy0);
860 dz20 = _mm256_sub_pd(iz2,jz0);
861 dx21 = _mm256_sub_pd(ix2,jx1);
862 dy21 = _mm256_sub_pd(iy2,jy1);
863 dz21 = _mm256_sub_pd(iz2,jz1);
864 dx22 = _mm256_sub_pd(ix2,jx2);
865 dy22 = _mm256_sub_pd(iy2,jy2);
866 dz22 = _mm256_sub_pd(iz2,jz2);
868 /* Calculate squared distance and things based on it */
869 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
870 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
871 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
872 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
873 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
874 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
875 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
876 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
877 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
879 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
880 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
881 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
882 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
883 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
884 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
885 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
886 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
887 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
889 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
890 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
891 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
892 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
893 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
894 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
895 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
896 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
897 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
899 fjx0 = _mm256_setzero_pd();
900 fjy0 = _mm256_setzero_pd();
901 fjz0 = _mm256_setzero_pd();
902 fjx1 = _mm256_setzero_pd();
903 fjy1 = _mm256_setzero_pd();
904 fjz1 = _mm256_setzero_pd();
905 fjx2 = _mm256_setzero_pd();
906 fjy2 = _mm256_setzero_pd();
907 fjz2 = _mm256_setzero_pd();
909 /**************************
910 * CALCULATE INTERACTIONS *
911 **************************/
913 if (gmx_mm256_any_lt(rsq00,rcutoff2))
916 r00 = _mm256_mul_pd(rsq00,rinv00);
917 r00 = _mm256_andnot_pd(dummy_mask,r00);
919 /* EWALD ELECTROSTATICS */
921 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
922 ewrt = _mm256_mul_pd(r00,ewtabscale);
923 ewitab = _mm256_cvttpd_epi32(ewrt);
924 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
925 ewitab = _mm_slli_epi32(ewitab,2);
926 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
927 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
928 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
929 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
930 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
931 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
932 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
933 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
934 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
936 /* LENNARD-JONES DISPERSION/REPULSION */
938 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
939 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
940 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
941 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
942 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
943 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
945 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
947 /* Update potential sum for this i atom from the interaction with this j atom. */
948 velec = _mm256_and_pd(velec,cutoff_mask);
949 velec = _mm256_andnot_pd(dummy_mask,velec);
950 velecsum = _mm256_add_pd(velecsum,velec);
951 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
952 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
953 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
955 fscal = _mm256_add_pd(felec,fvdw);
957 fscal = _mm256_and_pd(fscal,cutoff_mask);
959 fscal = _mm256_andnot_pd(dummy_mask,fscal);
961 /* Calculate temporary vectorial force */
962 tx = _mm256_mul_pd(fscal,dx00);
963 ty = _mm256_mul_pd(fscal,dy00);
964 tz = _mm256_mul_pd(fscal,dz00);
966 /* Update vectorial force */
967 fix0 = _mm256_add_pd(fix0,tx);
968 fiy0 = _mm256_add_pd(fiy0,ty);
969 fiz0 = _mm256_add_pd(fiz0,tz);
971 fjx0 = _mm256_add_pd(fjx0,tx);
972 fjy0 = _mm256_add_pd(fjy0,ty);
973 fjz0 = _mm256_add_pd(fjz0,tz);
977 /**************************
978 * CALCULATE INTERACTIONS *
979 **************************/
981 if (gmx_mm256_any_lt(rsq01,rcutoff2))
984 r01 = _mm256_mul_pd(rsq01,rinv01);
985 r01 = _mm256_andnot_pd(dummy_mask,r01);
987 /* EWALD ELECTROSTATICS */
989 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
990 ewrt = _mm256_mul_pd(r01,ewtabscale);
991 ewitab = _mm256_cvttpd_epi32(ewrt);
992 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
993 ewitab = _mm_slli_epi32(ewitab,2);
994 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
995 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
996 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
997 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
998 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
999 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1000 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1001 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
1002 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1004 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1006 /* Update potential sum for this i atom from the interaction with this j atom. */
1007 velec = _mm256_and_pd(velec,cutoff_mask);
1008 velec = _mm256_andnot_pd(dummy_mask,velec);
1009 velecsum = _mm256_add_pd(velecsum,velec);
1013 fscal = _mm256_and_pd(fscal,cutoff_mask);
1015 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1017 /* Calculate temporary vectorial force */
1018 tx = _mm256_mul_pd(fscal,dx01);
1019 ty = _mm256_mul_pd(fscal,dy01);
1020 tz = _mm256_mul_pd(fscal,dz01);
1022 /* Update vectorial force */
1023 fix0 = _mm256_add_pd(fix0,tx);
1024 fiy0 = _mm256_add_pd(fiy0,ty);
1025 fiz0 = _mm256_add_pd(fiz0,tz);
1027 fjx1 = _mm256_add_pd(fjx1,tx);
1028 fjy1 = _mm256_add_pd(fjy1,ty);
1029 fjz1 = _mm256_add_pd(fjz1,tz);
1033 /**************************
1034 * CALCULATE INTERACTIONS *
1035 **************************/
1037 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1040 r02 = _mm256_mul_pd(rsq02,rinv02);
1041 r02 = _mm256_andnot_pd(dummy_mask,r02);
1043 /* EWALD ELECTROSTATICS */
1045 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1046 ewrt = _mm256_mul_pd(r02,ewtabscale);
1047 ewitab = _mm256_cvttpd_epi32(ewrt);
1048 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1049 ewitab = _mm_slli_epi32(ewitab,2);
1050 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1051 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1052 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1053 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1054 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1055 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1056 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1057 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
1058 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1060 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1062 /* Update potential sum for this i atom from the interaction with this j atom. */
1063 velec = _mm256_and_pd(velec,cutoff_mask);
1064 velec = _mm256_andnot_pd(dummy_mask,velec);
1065 velecsum = _mm256_add_pd(velecsum,velec);
1069 fscal = _mm256_and_pd(fscal,cutoff_mask);
1071 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1073 /* Calculate temporary vectorial force */
1074 tx = _mm256_mul_pd(fscal,dx02);
1075 ty = _mm256_mul_pd(fscal,dy02);
1076 tz = _mm256_mul_pd(fscal,dz02);
1078 /* Update vectorial force */
1079 fix0 = _mm256_add_pd(fix0,tx);
1080 fiy0 = _mm256_add_pd(fiy0,ty);
1081 fiz0 = _mm256_add_pd(fiz0,tz);
1083 fjx2 = _mm256_add_pd(fjx2,tx);
1084 fjy2 = _mm256_add_pd(fjy2,ty);
1085 fjz2 = _mm256_add_pd(fjz2,tz);
1089 /**************************
1090 * CALCULATE INTERACTIONS *
1091 **************************/
1093 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1096 r10 = _mm256_mul_pd(rsq10,rinv10);
1097 r10 = _mm256_andnot_pd(dummy_mask,r10);
1099 /* EWALD ELECTROSTATICS */
1101 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1102 ewrt = _mm256_mul_pd(r10,ewtabscale);
1103 ewitab = _mm256_cvttpd_epi32(ewrt);
1104 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1105 ewitab = _mm_slli_epi32(ewitab,2);
1106 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1107 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1108 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1109 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1110 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1111 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1112 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1113 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
1114 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1116 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1118 /* Update potential sum for this i atom from the interaction with this j atom. */
1119 velec = _mm256_and_pd(velec,cutoff_mask);
1120 velec = _mm256_andnot_pd(dummy_mask,velec);
1121 velecsum = _mm256_add_pd(velecsum,velec);
1125 fscal = _mm256_and_pd(fscal,cutoff_mask);
1127 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1129 /* Calculate temporary vectorial force */
1130 tx = _mm256_mul_pd(fscal,dx10);
1131 ty = _mm256_mul_pd(fscal,dy10);
1132 tz = _mm256_mul_pd(fscal,dz10);
1134 /* Update vectorial force */
1135 fix1 = _mm256_add_pd(fix1,tx);
1136 fiy1 = _mm256_add_pd(fiy1,ty);
1137 fiz1 = _mm256_add_pd(fiz1,tz);
1139 fjx0 = _mm256_add_pd(fjx0,tx);
1140 fjy0 = _mm256_add_pd(fjy0,ty);
1141 fjz0 = _mm256_add_pd(fjz0,tz);
1145 /**************************
1146 * CALCULATE INTERACTIONS *
1147 **************************/
1149 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1152 r11 = _mm256_mul_pd(rsq11,rinv11);
1153 r11 = _mm256_andnot_pd(dummy_mask,r11);
1155 /* EWALD ELECTROSTATICS */
1157 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1158 ewrt = _mm256_mul_pd(r11,ewtabscale);
1159 ewitab = _mm256_cvttpd_epi32(ewrt);
1160 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1161 ewitab = _mm_slli_epi32(ewitab,2);
1162 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1163 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1164 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1165 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1166 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1167 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1168 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1169 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1170 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1172 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1174 /* Update potential sum for this i atom from the interaction with this j atom. */
1175 velec = _mm256_and_pd(velec,cutoff_mask);
1176 velec = _mm256_andnot_pd(dummy_mask,velec);
1177 velecsum = _mm256_add_pd(velecsum,velec);
1181 fscal = _mm256_and_pd(fscal,cutoff_mask);
1183 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1185 /* Calculate temporary vectorial force */
1186 tx = _mm256_mul_pd(fscal,dx11);
1187 ty = _mm256_mul_pd(fscal,dy11);
1188 tz = _mm256_mul_pd(fscal,dz11);
1190 /* Update vectorial force */
1191 fix1 = _mm256_add_pd(fix1,tx);
1192 fiy1 = _mm256_add_pd(fiy1,ty);
1193 fiz1 = _mm256_add_pd(fiz1,tz);
1195 fjx1 = _mm256_add_pd(fjx1,tx);
1196 fjy1 = _mm256_add_pd(fjy1,ty);
1197 fjz1 = _mm256_add_pd(fjz1,tz);
1201 /**************************
1202 * CALCULATE INTERACTIONS *
1203 **************************/
1205 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1208 r12 = _mm256_mul_pd(rsq12,rinv12);
1209 r12 = _mm256_andnot_pd(dummy_mask,r12);
1211 /* EWALD ELECTROSTATICS */
1213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1214 ewrt = _mm256_mul_pd(r12,ewtabscale);
1215 ewitab = _mm256_cvttpd_epi32(ewrt);
1216 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1217 ewitab = _mm_slli_epi32(ewitab,2);
1218 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1219 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1220 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1221 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1222 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1223 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1224 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1225 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1226 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1228 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1230 /* Update potential sum for this i atom from the interaction with this j atom. */
1231 velec = _mm256_and_pd(velec,cutoff_mask);
1232 velec = _mm256_andnot_pd(dummy_mask,velec);
1233 velecsum = _mm256_add_pd(velecsum,velec);
1237 fscal = _mm256_and_pd(fscal,cutoff_mask);
1239 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1241 /* Calculate temporary vectorial force */
1242 tx = _mm256_mul_pd(fscal,dx12);
1243 ty = _mm256_mul_pd(fscal,dy12);
1244 tz = _mm256_mul_pd(fscal,dz12);
1246 /* Update vectorial force */
1247 fix1 = _mm256_add_pd(fix1,tx);
1248 fiy1 = _mm256_add_pd(fiy1,ty);
1249 fiz1 = _mm256_add_pd(fiz1,tz);
1251 fjx2 = _mm256_add_pd(fjx2,tx);
1252 fjy2 = _mm256_add_pd(fjy2,ty);
1253 fjz2 = _mm256_add_pd(fjz2,tz);
1257 /**************************
1258 * CALCULATE INTERACTIONS *
1259 **************************/
1261 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1264 r20 = _mm256_mul_pd(rsq20,rinv20);
1265 r20 = _mm256_andnot_pd(dummy_mask,r20);
1267 /* EWALD ELECTROSTATICS */
1269 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1270 ewrt = _mm256_mul_pd(r20,ewtabscale);
1271 ewitab = _mm256_cvttpd_epi32(ewrt);
1272 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1273 ewitab = _mm_slli_epi32(ewitab,2);
1274 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1275 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1276 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1277 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1278 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1279 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1280 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1281 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
1282 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1284 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1286 /* Update potential sum for this i atom from the interaction with this j atom. */
1287 velec = _mm256_and_pd(velec,cutoff_mask);
1288 velec = _mm256_andnot_pd(dummy_mask,velec);
1289 velecsum = _mm256_add_pd(velecsum,velec);
1293 fscal = _mm256_and_pd(fscal,cutoff_mask);
1295 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1297 /* Calculate temporary vectorial force */
1298 tx = _mm256_mul_pd(fscal,dx20);
1299 ty = _mm256_mul_pd(fscal,dy20);
1300 tz = _mm256_mul_pd(fscal,dz20);
1302 /* Update vectorial force */
1303 fix2 = _mm256_add_pd(fix2,tx);
1304 fiy2 = _mm256_add_pd(fiy2,ty);
1305 fiz2 = _mm256_add_pd(fiz2,tz);
1307 fjx0 = _mm256_add_pd(fjx0,tx);
1308 fjy0 = _mm256_add_pd(fjy0,ty);
1309 fjz0 = _mm256_add_pd(fjz0,tz);
1313 /**************************
1314 * CALCULATE INTERACTIONS *
1315 **************************/
1317 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1320 r21 = _mm256_mul_pd(rsq21,rinv21);
1321 r21 = _mm256_andnot_pd(dummy_mask,r21);
1323 /* EWALD ELECTROSTATICS */
1325 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1326 ewrt = _mm256_mul_pd(r21,ewtabscale);
1327 ewitab = _mm256_cvttpd_epi32(ewrt);
1328 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1329 ewitab = _mm_slli_epi32(ewitab,2);
1330 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1331 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1332 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1333 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1334 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1335 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1336 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1337 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1338 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1340 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1342 /* Update potential sum for this i atom from the interaction with this j atom. */
1343 velec = _mm256_and_pd(velec,cutoff_mask);
1344 velec = _mm256_andnot_pd(dummy_mask,velec);
1345 velecsum = _mm256_add_pd(velecsum,velec);
1349 fscal = _mm256_and_pd(fscal,cutoff_mask);
1351 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1353 /* Calculate temporary vectorial force */
1354 tx = _mm256_mul_pd(fscal,dx21);
1355 ty = _mm256_mul_pd(fscal,dy21);
1356 tz = _mm256_mul_pd(fscal,dz21);
1358 /* Update vectorial force */
1359 fix2 = _mm256_add_pd(fix2,tx);
1360 fiy2 = _mm256_add_pd(fiy2,ty);
1361 fiz2 = _mm256_add_pd(fiz2,tz);
1363 fjx1 = _mm256_add_pd(fjx1,tx);
1364 fjy1 = _mm256_add_pd(fjy1,ty);
1365 fjz1 = _mm256_add_pd(fjz1,tz);
1369 /**************************
1370 * CALCULATE INTERACTIONS *
1371 **************************/
1373 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1376 r22 = _mm256_mul_pd(rsq22,rinv22);
1377 r22 = _mm256_andnot_pd(dummy_mask,r22);
1379 /* EWALD ELECTROSTATICS */
1381 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1382 ewrt = _mm256_mul_pd(r22,ewtabscale);
1383 ewitab = _mm256_cvttpd_epi32(ewrt);
1384 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1385 ewitab = _mm_slli_epi32(ewitab,2);
1386 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1387 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1388 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1389 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1390 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1391 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1392 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1393 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1394 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1396 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1398 /* Update potential sum for this i atom from the interaction with this j atom. */
1399 velec = _mm256_and_pd(velec,cutoff_mask);
1400 velec = _mm256_andnot_pd(dummy_mask,velec);
1401 velecsum = _mm256_add_pd(velecsum,velec);
1405 fscal = _mm256_and_pd(fscal,cutoff_mask);
1407 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1409 /* Calculate temporary vectorial force */
1410 tx = _mm256_mul_pd(fscal,dx22);
1411 ty = _mm256_mul_pd(fscal,dy22);
1412 tz = _mm256_mul_pd(fscal,dz22);
1414 /* Update vectorial force */
1415 fix2 = _mm256_add_pd(fix2,tx);
1416 fiy2 = _mm256_add_pd(fiy2,ty);
1417 fiz2 = _mm256_add_pd(fiz2,tz);
1419 fjx2 = _mm256_add_pd(fjx2,tx);
1420 fjy2 = _mm256_add_pd(fjy2,ty);
1421 fjz2 = _mm256_add_pd(fjz2,tz);
1425 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1426 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1427 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1428 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1430 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1431 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1433 /* Inner loop uses 441 flops */
1436 /* End of innermost loop */
1438 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1439 f+i_coord_offset,fshift+i_shift_offset);
1442 /* Update potential energies */
1443 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1444 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1446 /* Increment number of inner iterations */
1447 inneriter += j_index_end - j_index_start;
1449 /* Outer loop uses 20 flops */
1452 /* Increment number of outer iterations */
1455 /* Update outer/inner flops */
1457 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1460 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1461 * Electrostatics interaction: Ewald
1462 * VdW interaction: LennardJones
1463 * Geometry: Water3-Water3
1464 * Calculate force/pot: Force
1467 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1468 (t_nblist * gmx_restrict nlist,
1469 rvec * gmx_restrict xx,
1470 rvec * gmx_restrict ff,
1471 t_forcerec * gmx_restrict fr,
1472 t_mdatoms * gmx_restrict mdatoms,
1473 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1474 t_nrnb * gmx_restrict nrnb)
1476 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1477 * just 0 for non-waters.
1478 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1479 * jnr indices corresponding to data put in the four positions in the SIMD register.
1481 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1482 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1483 int jnrA,jnrB,jnrC,jnrD;
1484 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1485 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1486 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1487 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1488 real rcutoff_scalar;
1489 real *shiftvec,*fshift,*x,*f;
1490 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1491 real scratch[4*DIM];
1492 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1493 real * vdwioffsetptr0;
1494 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1495 real * vdwioffsetptr1;
1496 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1497 real * vdwioffsetptr2;
1498 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1499 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1500 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1501 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1502 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1503 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1504 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1505 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1506 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1507 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1508 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1509 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1510 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1511 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1512 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1513 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1514 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1517 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1520 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1521 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1523 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1524 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1526 __m256d dummy_mask,cutoff_mask;
1527 __m128 tmpmask0,tmpmask1;
1528 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1529 __m256d one = _mm256_set1_pd(1.0);
1530 __m256d two = _mm256_set1_pd(2.0);
1536 jindex = nlist->jindex;
1538 shiftidx = nlist->shift;
1540 shiftvec = fr->shift_vec[0];
1541 fshift = fr->fshift[0];
1542 facel = _mm256_set1_pd(fr->epsfac);
1543 charge = mdatoms->chargeA;
1544 nvdwtype = fr->ntype;
1545 vdwparam = fr->nbfp;
1546 vdwtype = mdatoms->typeA;
1548 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1549 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1550 beta2 = _mm256_mul_pd(beta,beta);
1551 beta3 = _mm256_mul_pd(beta,beta2);
1553 ewtab = fr->ic->tabq_coul_F;
1554 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1555 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1557 /* Setup water-specific parameters */
1558 inr = nlist->iinr[0];
1559 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1560 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1561 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1562 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1564 jq0 = _mm256_set1_pd(charge[inr+0]);
1565 jq1 = _mm256_set1_pd(charge[inr+1]);
1566 jq2 = _mm256_set1_pd(charge[inr+2]);
1567 vdwjidx0A = 2*vdwtype[inr+0];
1568 qq00 = _mm256_mul_pd(iq0,jq0);
1569 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1570 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1571 qq01 = _mm256_mul_pd(iq0,jq1);
1572 qq02 = _mm256_mul_pd(iq0,jq2);
1573 qq10 = _mm256_mul_pd(iq1,jq0);
1574 qq11 = _mm256_mul_pd(iq1,jq1);
1575 qq12 = _mm256_mul_pd(iq1,jq2);
1576 qq20 = _mm256_mul_pd(iq2,jq0);
1577 qq21 = _mm256_mul_pd(iq2,jq1);
1578 qq22 = _mm256_mul_pd(iq2,jq2);
1580 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1581 rcutoff_scalar = fr->rcoulomb;
1582 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1583 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1585 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
1586 rvdw = _mm256_set1_pd(fr->rvdw);
1588 /* Avoid stupid compiler warnings */
1589 jnrA = jnrB = jnrC = jnrD = 0;
1590 j_coord_offsetA = 0;
1591 j_coord_offsetB = 0;
1592 j_coord_offsetC = 0;
1593 j_coord_offsetD = 0;
1598 for(iidx=0;iidx<4*DIM;iidx++)
1600 scratch[iidx] = 0.0;
1603 /* Start outer loop over neighborlists */
1604 for(iidx=0; iidx<nri; iidx++)
1606 /* Load shift vector for this list */
1607 i_shift_offset = DIM*shiftidx[iidx];
1609 /* Load limits for loop over neighbors */
1610 j_index_start = jindex[iidx];
1611 j_index_end = jindex[iidx+1];
1613 /* Get outer coordinate index */
1615 i_coord_offset = DIM*inr;
1617 /* Load i particle coords and add shift vector */
1618 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1619 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1621 fix0 = _mm256_setzero_pd();
1622 fiy0 = _mm256_setzero_pd();
1623 fiz0 = _mm256_setzero_pd();
1624 fix1 = _mm256_setzero_pd();
1625 fiy1 = _mm256_setzero_pd();
1626 fiz1 = _mm256_setzero_pd();
1627 fix2 = _mm256_setzero_pd();
1628 fiy2 = _mm256_setzero_pd();
1629 fiz2 = _mm256_setzero_pd();
1631 /* Start inner kernel loop */
1632 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1635 /* Get j neighbor index, and coordinate index */
1637 jnrB = jjnr[jidx+1];
1638 jnrC = jjnr[jidx+2];
1639 jnrD = jjnr[jidx+3];
1640 j_coord_offsetA = DIM*jnrA;
1641 j_coord_offsetB = DIM*jnrB;
1642 j_coord_offsetC = DIM*jnrC;
1643 j_coord_offsetD = DIM*jnrD;
1645 /* load j atom coordinates */
1646 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1647 x+j_coord_offsetC,x+j_coord_offsetD,
1648 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1650 /* Calculate displacement vector */
1651 dx00 = _mm256_sub_pd(ix0,jx0);
1652 dy00 = _mm256_sub_pd(iy0,jy0);
1653 dz00 = _mm256_sub_pd(iz0,jz0);
1654 dx01 = _mm256_sub_pd(ix0,jx1);
1655 dy01 = _mm256_sub_pd(iy0,jy1);
1656 dz01 = _mm256_sub_pd(iz0,jz1);
1657 dx02 = _mm256_sub_pd(ix0,jx2);
1658 dy02 = _mm256_sub_pd(iy0,jy2);
1659 dz02 = _mm256_sub_pd(iz0,jz2);
1660 dx10 = _mm256_sub_pd(ix1,jx0);
1661 dy10 = _mm256_sub_pd(iy1,jy0);
1662 dz10 = _mm256_sub_pd(iz1,jz0);
1663 dx11 = _mm256_sub_pd(ix1,jx1);
1664 dy11 = _mm256_sub_pd(iy1,jy1);
1665 dz11 = _mm256_sub_pd(iz1,jz1);
1666 dx12 = _mm256_sub_pd(ix1,jx2);
1667 dy12 = _mm256_sub_pd(iy1,jy2);
1668 dz12 = _mm256_sub_pd(iz1,jz2);
1669 dx20 = _mm256_sub_pd(ix2,jx0);
1670 dy20 = _mm256_sub_pd(iy2,jy0);
1671 dz20 = _mm256_sub_pd(iz2,jz0);
1672 dx21 = _mm256_sub_pd(ix2,jx1);
1673 dy21 = _mm256_sub_pd(iy2,jy1);
1674 dz21 = _mm256_sub_pd(iz2,jz1);
1675 dx22 = _mm256_sub_pd(ix2,jx2);
1676 dy22 = _mm256_sub_pd(iy2,jy2);
1677 dz22 = _mm256_sub_pd(iz2,jz2);
1679 /* Calculate squared distance and things based on it */
1680 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1681 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1682 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1683 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1684 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1685 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1686 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1687 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1688 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1690 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1691 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1692 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1693 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1694 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1695 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1696 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1697 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1698 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1700 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1701 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1702 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1703 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1704 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1705 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1706 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1707 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1708 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1710 fjx0 = _mm256_setzero_pd();
1711 fjy0 = _mm256_setzero_pd();
1712 fjz0 = _mm256_setzero_pd();
1713 fjx1 = _mm256_setzero_pd();
1714 fjy1 = _mm256_setzero_pd();
1715 fjz1 = _mm256_setzero_pd();
1716 fjx2 = _mm256_setzero_pd();
1717 fjy2 = _mm256_setzero_pd();
1718 fjz2 = _mm256_setzero_pd();
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1727 r00 = _mm256_mul_pd(rsq00,rinv00);
1729 /* EWALD ELECTROSTATICS */
1731 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1732 ewrt = _mm256_mul_pd(r00,ewtabscale);
1733 ewitab = _mm256_cvttpd_epi32(ewrt);
1734 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1735 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1736 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1738 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1739 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1741 /* LENNARD-JONES DISPERSION/REPULSION */
1743 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1744 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1746 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1748 fscal = _mm256_add_pd(felec,fvdw);
1750 fscal = _mm256_and_pd(fscal,cutoff_mask);
1752 /* Calculate temporary vectorial force */
1753 tx = _mm256_mul_pd(fscal,dx00);
1754 ty = _mm256_mul_pd(fscal,dy00);
1755 tz = _mm256_mul_pd(fscal,dz00);
1757 /* Update vectorial force */
1758 fix0 = _mm256_add_pd(fix0,tx);
1759 fiy0 = _mm256_add_pd(fiy0,ty);
1760 fiz0 = _mm256_add_pd(fiz0,tz);
1762 fjx0 = _mm256_add_pd(fjx0,tx);
1763 fjy0 = _mm256_add_pd(fjy0,ty);
1764 fjz0 = _mm256_add_pd(fjz0,tz);
1768 /**************************
1769 * CALCULATE INTERACTIONS *
1770 **************************/
1772 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1775 r01 = _mm256_mul_pd(rsq01,rinv01);
1777 /* EWALD ELECTROSTATICS */
1779 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1780 ewrt = _mm256_mul_pd(r01,ewtabscale);
1781 ewitab = _mm256_cvttpd_epi32(ewrt);
1782 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1783 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1784 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1786 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1787 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1789 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1793 fscal = _mm256_and_pd(fscal,cutoff_mask);
1795 /* Calculate temporary vectorial force */
1796 tx = _mm256_mul_pd(fscal,dx01);
1797 ty = _mm256_mul_pd(fscal,dy01);
1798 tz = _mm256_mul_pd(fscal,dz01);
1800 /* Update vectorial force */
1801 fix0 = _mm256_add_pd(fix0,tx);
1802 fiy0 = _mm256_add_pd(fiy0,ty);
1803 fiz0 = _mm256_add_pd(fiz0,tz);
1805 fjx1 = _mm256_add_pd(fjx1,tx);
1806 fjy1 = _mm256_add_pd(fjy1,ty);
1807 fjz1 = _mm256_add_pd(fjz1,tz);
1811 /**************************
1812 * CALCULATE INTERACTIONS *
1813 **************************/
1815 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1818 r02 = _mm256_mul_pd(rsq02,rinv02);
1820 /* EWALD ELECTROSTATICS */
1822 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1823 ewrt = _mm256_mul_pd(r02,ewtabscale);
1824 ewitab = _mm256_cvttpd_epi32(ewrt);
1825 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1826 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1827 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1829 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1830 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1832 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1836 fscal = _mm256_and_pd(fscal,cutoff_mask);
1838 /* Calculate temporary vectorial force */
1839 tx = _mm256_mul_pd(fscal,dx02);
1840 ty = _mm256_mul_pd(fscal,dy02);
1841 tz = _mm256_mul_pd(fscal,dz02);
1843 /* Update vectorial force */
1844 fix0 = _mm256_add_pd(fix0,tx);
1845 fiy0 = _mm256_add_pd(fiy0,ty);
1846 fiz0 = _mm256_add_pd(fiz0,tz);
1848 fjx2 = _mm256_add_pd(fjx2,tx);
1849 fjy2 = _mm256_add_pd(fjy2,ty);
1850 fjz2 = _mm256_add_pd(fjz2,tz);
1854 /**************************
1855 * CALCULATE INTERACTIONS *
1856 **************************/
1858 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1861 r10 = _mm256_mul_pd(rsq10,rinv10);
1863 /* EWALD ELECTROSTATICS */
1865 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1866 ewrt = _mm256_mul_pd(r10,ewtabscale);
1867 ewitab = _mm256_cvttpd_epi32(ewrt);
1868 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1869 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1870 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1872 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1873 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1875 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1879 fscal = _mm256_and_pd(fscal,cutoff_mask);
1881 /* Calculate temporary vectorial force */
1882 tx = _mm256_mul_pd(fscal,dx10);
1883 ty = _mm256_mul_pd(fscal,dy10);
1884 tz = _mm256_mul_pd(fscal,dz10);
1886 /* Update vectorial force */
1887 fix1 = _mm256_add_pd(fix1,tx);
1888 fiy1 = _mm256_add_pd(fiy1,ty);
1889 fiz1 = _mm256_add_pd(fiz1,tz);
1891 fjx0 = _mm256_add_pd(fjx0,tx);
1892 fjy0 = _mm256_add_pd(fjy0,ty);
1893 fjz0 = _mm256_add_pd(fjz0,tz);
1897 /**************************
1898 * CALCULATE INTERACTIONS *
1899 **************************/
1901 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1904 r11 = _mm256_mul_pd(rsq11,rinv11);
1906 /* EWALD ELECTROSTATICS */
1908 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1909 ewrt = _mm256_mul_pd(r11,ewtabscale);
1910 ewitab = _mm256_cvttpd_epi32(ewrt);
1911 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1912 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1913 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1915 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1916 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1918 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1922 fscal = _mm256_and_pd(fscal,cutoff_mask);
1924 /* Calculate temporary vectorial force */
1925 tx = _mm256_mul_pd(fscal,dx11);
1926 ty = _mm256_mul_pd(fscal,dy11);
1927 tz = _mm256_mul_pd(fscal,dz11);
1929 /* Update vectorial force */
1930 fix1 = _mm256_add_pd(fix1,tx);
1931 fiy1 = _mm256_add_pd(fiy1,ty);
1932 fiz1 = _mm256_add_pd(fiz1,tz);
1934 fjx1 = _mm256_add_pd(fjx1,tx);
1935 fjy1 = _mm256_add_pd(fjy1,ty);
1936 fjz1 = _mm256_add_pd(fjz1,tz);
1940 /**************************
1941 * CALCULATE INTERACTIONS *
1942 **************************/
1944 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1947 r12 = _mm256_mul_pd(rsq12,rinv12);
1949 /* EWALD ELECTROSTATICS */
1951 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1952 ewrt = _mm256_mul_pd(r12,ewtabscale);
1953 ewitab = _mm256_cvttpd_epi32(ewrt);
1954 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1955 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1956 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1958 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1959 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1961 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1965 fscal = _mm256_and_pd(fscal,cutoff_mask);
1967 /* Calculate temporary vectorial force */
1968 tx = _mm256_mul_pd(fscal,dx12);
1969 ty = _mm256_mul_pd(fscal,dy12);
1970 tz = _mm256_mul_pd(fscal,dz12);
1972 /* Update vectorial force */
1973 fix1 = _mm256_add_pd(fix1,tx);
1974 fiy1 = _mm256_add_pd(fiy1,ty);
1975 fiz1 = _mm256_add_pd(fiz1,tz);
1977 fjx2 = _mm256_add_pd(fjx2,tx);
1978 fjy2 = _mm256_add_pd(fjy2,ty);
1979 fjz2 = _mm256_add_pd(fjz2,tz);
1983 /**************************
1984 * CALCULATE INTERACTIONS *
1985 **************************/
1987 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1990 r20 = _mm256_mul_pd(rsq20,rinv20);
1992 /* EWALD ELECTROSTATICS */
1994 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1995 ewrt = _mm256_mul_pd(r20,ewtabscale);
1996 ewitab = _mm256_cvttpd_epi32(ewrt);
1997 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1998 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1999 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2001 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2002 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2004 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2008 fscal = _mm256_and_pd(fscal,cutoff_mask);
2010 /* Calculate temporary vectorial force */
2011 tx = _mm256_mul_pd(fscal,dx20);
2012 ty = _mm256_mul_pd(fscal,dy20);
2013 tz = _mm256_mul_pd(fscal,dz20);
2015 /* Update vectorial force */
2016 fix2 = _mm256_add_pd(fix2,tx);
2017 fiy2 = _mm256_add_pd(fiy2,ty);
2018 fiz2 = _mm256_add_pd(fiz2,tz);
2020 fjx0 = _mm256_add_pd(fjx0,tx);
2021 fjy0 = _mm256_add_pd(fjy0,ty);
2022 fjz0 = _mm256_add_pd(fjz0,tz);
2026 /**************************
2027 * CALCULATE INTERACTIONS *
2028 **************************/
2030 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2033 r21 = _mm256_mul_pd(rsq21,rinv21);
2035 /* EWALD ELECTROSTATICS */
2037 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2038 ewrt = _mm256_mul_pd(r21,ewtabscale);
2039 ewitab = _mm256_cvttpd_epi32(ewrt);
2040 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2041 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2042 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2044 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2045 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2047 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2051 fscal = _mm256_and_pd(fscal,cutoff_mask);
2053 /* Calculate temporary vectorial force */
2054 tx = _mm256_mul_pd(fscal,dx21);
2055 ty = _mm256_mul_pd(fscal,dy21);
2056 tz = _mm256_mul_pd(fscal,dz21);
2058 /* Update vectorial force */
2059 fix2 = _mm256_add_pd(fix2,tx);
2060 fiy2 = _mm256_add_pd(fiy2,ty);
2061 fiz2 = _mm256_add_pd(fiz2,tz);
2063 fjx1 = _mm256_add_pd(fjx1,tx);
2064 fjy1 = _mm256_add_pd(fjy1,ty);
2065 fjz1 = _mm256_add_pd(fjz1,tz);
2069 /**************************
2070 * CALCULATE INTERACTIONS *
2071 **************************/
2073 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2076 r22 = _mm256_mul_pd(rsq22,rinv22);
2078 /* EWALD ELECTROSTATICS */
2080 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2081 ewrt = _mm256_mul_pd(r22,ewtabscale);
2082 ewitab = _mm256_cvttpd_epi32(ewrt);
2083 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2084 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2085 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2087 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2088 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2090 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2094 fscal = _mm256_and_pd(fscal,cutoff_mask);
2096 /* Calculate temporary vectorial force */
2097 tx = _mm256_mul_pd(fscal,dx22);
2098 ty = _mm256_mul_pd(fscal,dy22);
2099 tz = _mm256_mul_pd(fscal,dz22);
2101 /* Update vectorial force */
2102 fix2 = _mm256_add_pd(fix2,tx);
2103 fiy2 = _mm256_add_pd(fiy2,ty);
2104 fiz2 = _mm256_add_pd(fiz2,tz);
2106 fjx2 = _mm256_add_pd(fjx2,tx);
2107 fjy2 = _mm256_add_pd(fjy2,ty);
2108 fjz2 = _mm256_add_pd(fjz2,tz);
2112 fjptrA = f+j_coord_offsetA;
2113 fjptrB = f+j_coord_offsetB;
2114 fjptrC = f+j_coord_offsetC;
2115 fjptrD = f+j_coord_offsetD;
2117 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2118 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2120 /* Inner loop uses 358 flops */
2123 if(jidx<j_index_end)
2126 /* Get j neighbor index, and coordinate index */
2127 jnrlistA = jjnr[jidx];
2128 jnrlistB = jjnr[jidx+1];
2129 jnrlistC = jjnr[jidx+2];
2130 jnrlistD = jjnr[jidx+3];
2131 /* Sign of each element will be negative for non-real atoms.
2132 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2133 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2135 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2137 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2138 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2139 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2141 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2142 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2143 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2144 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2145 j_coord_offsetA = DIM*jnrA;
2146 j_coord_offsetB = DIM*jnrB;
2147 j_coord_offsetC = DIM*jnrC;
2148 j_coord_offsetD = DIM*jnrD;
2150 /* load j atom coordinates */
2151 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2152 x+j_coord_offsetC,x+j_coord_offsetD,
2153 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2155 /* Calculate displacement vector */
2156 dx00 = _mm256_sub_pd(ix0,jx0);
2157 dy00 = _mm256_sub_pd(iy0,jy0);
2158 dz00 = _mm256_sub_pd(iz0,jz0);
2159 dx01 = _mm256_sub_pd(ix0,jx1);
2160 dy01 = _mm256_sub_pd(iy0,jy1);
2161 dz01 = _mm256_sub_pd(iz0,jz1);
2162 dx02 = _mm256_sub_pd(ix0,jx2);
2163 dy02 = _mm256_sub_pd(iy0,jy2);
2164 dz02 = _mm256_sub_pd(iz0,jz2);
2165 dx10 = _mm256_sub_pd(ix1,jx0);
2166 dy10 = _mm256_sub_pd(iy1,jy0);
2167 dz10 = _mm256_sub_pd(iz1,jz0);
2168 dx11 = _mm256_sub_pd(ix1,jx1);
2169 dy11 = _mm256_sub_pd(iy1,jy1);
2170 dz11 = _mm256_sub_pd(iz1,jz1);
2171 dx12 = _mm256_sub_pd(ix1,jx2);
2172 dy12 = _mm256_sub_pd(iy1,jy2);
2173 dz12 = _mm256_sub_pd(iz1,jz2);
2174 dx20 = _mm256_sub_pd(ix2,jx0);
2175 dy20 = _mm256_sub_pd(iy2,jy0);
2176 dz20 = _mm256_sub_pd(iz2,jz0);
2177 dx21 = _mm256_sub_pd(ix2,jx1);
2178 dy21 = _mm256_sub_pd(iy2,jy1);
2179 dz21 = _mm256_sub_pd(iz2,jz1);
2180 dx22 = _mm256_sub_pd(ix2,jx2);
2181 dy22 = _mm256_sub_pd(iy2,jy2);
2182 dz22 = _mm256_sub_pd(iz2,jz2);
2184 /* Calculate squared distance and things based on it */
2185 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2186 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2187 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2188 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2189 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2190 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2191 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2192 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2193 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2195 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2196 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
2197 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
2198 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
2199 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2200 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2201 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
2202 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2203 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2205 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2206 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2207 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2208 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2209 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2210 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2211 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2212 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2213 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2215 fjx0 = _mm256_setzero_pd();
2216 fjy0 = _mm256_setzero_pd();
2217 fjz0 = _mm256_setzero_pd();
2218 fjx1 = _mm256_setzero_pd();
2219 fjy1 = _mm256_setzero_pd();
2220 fjz1 = _mm256_setzero_pd();
2221 fjx2 = _mm256_setzero_pd();
2222 fjy2 = _mm256_setzero_pd();
2223 fjz2 = _mm256_setzero_pd();
2225 /**************************
2226 * CALCULATE INTERACTIONS *
2227 **************************/
2229 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2232 r00 = _mm256_mul_pd(rsq00,rinv00);
2233 r00 = _mm256_andnot_pd(dummy_mask,r00);
2235 /* EWALD ELECTROSTATICS */
2237 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2238 ewrt = _mm256_mul_pd(r00,ewtabscale);
2239 ewitab = _mm256_cvttpd_epi32(ewrt);
2240 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2241 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2242 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2244 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2245 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2247 /* LENNARD-JONES DISPERSION/REPULSION */
2249 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2250 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2252 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2254 fscal = _mm256_add_pd(felec,fvdw);
2256 fscal = _mm256_and_pd(fscal,cutoff_mask);
2258 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2260 /* Calculate temporary vectorial force */
2261 tx = _mm256_mul_pd(fscal,dx00);
2262 ty = _mm256_mul_pd(fscal,dy00);
2263 tz = _mm256_mul_pd(fscal,dz00);
2265 /* Update vectorial force */
2266 fix0 = _mm256_add_pd(fix0,tx);
2267 fiy0 = _mm256_add_pd(fiy0,ty);
2268 fiz0 = _mm256_add_pd(fiz0,tz);
2270 fjx0 = _mm256_add_pd(fjx0,tx);
2271 fjy0 = _mm256_add_pd(fjy0,ty);
2272 fjz0 = _mm256_add_pd(fjz0,tz);
2276 /**************************
2277 * CALCULATE INTERACTIONS *
2278 **************************/
2280 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2283 r01 = _mm256_mul_pd(rsq01,rinv01);
2284 r01 = _mm256_andnot_pd(dummy_mask,r01);
2286 /* EWALD ELECTROSTATICS */
2288 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2289 ewrt = _mm256_mul_pd(r01,ewtabscale);
2290 ewitab = _mm256_cvttpd_epi32(ewrt);
2291 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2292 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2293 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2295 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2296 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2298 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2302 fscal = _mm256_and_pd(fscal,cutoff_mask);
2304 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2306 /* Calculate temporary vectorial force */
2307 tx = _mm256_mul_pd(fscal,dx01);
2308 ty = _mm256_mul_pd(fscal,dy01);
2309 tz = _mm256_mul_pd(fscal,dz01);
2311 /* Update vectorial force */
2312 fix0 = _mm256_add_pd(fix0,tx);
2313 fiy0 = _mm256_add_pd(fiy0,ty);
2314 fiz0 = _mm256_add_pd(fiz0,tz);
2316 fjx1 = _mm256_add_pd(fjx1,tx);
2317 fjy1 = _mm256_add_pd(fjy1,ty);
2318 fjz1 = _mm256_add_pd(fjz1,tz);
2322 /**************************
2323 * CALCULATE INTERACTIONS *
2324 **************************/
2326 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2329 r02 = _mm256_mul_pd(rsq02,rinv02);
2330 r02 = _mm256_andnot_pd(dummy_mask,r02);
2332 /* EWALD ELECTROSTATICS */
2334 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2335 ewrt = _mm256_mul_pd(r02,ewtabscale);
2336 ewitab = _mm256_cvttpd_epi32(ewrt);
2337 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2338 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2339 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2341 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2342 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2344 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2348 fscal = _mm256_and_pd(fscal,cutoff_mask);
2350 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2352 /* Calculate temporary vectorial force */
2353 tx = _mm256_mul_pd(fscal,dx02);
2354 ty = _mm256_mul_pd(fscal,dy02);
2355 tz = _mm256_mul_pd(fscal,dz02);
2357 /* Update vectorial force */
2358 fix0 = _mm256_add_pd(fix0,tx);
2359 fiy0 = _mm256_add_pd(fiy0,ty);
2360 fiz0 = _mm256_add_pd(fiz0,tz);
2362 fjx2 = _mm256_add_pd(fjx2,tx);
2363 fjy2 = _mm256_add_pd(fjy2,ty);
2364 fjz2 = _mm256_add_pd(fjz2,tz);
2368 /**************************
2369 * CALCULATE INTERACTIONS *
2370 **************************/
2372 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2375 r10 = _mm256_mul_pd(rsq10,rinv10);
2376 r10 = _mm256_andnot_pd(dummy_mask,r10);
2378 /* EWALD ELECTROSTATICS */
2380 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2381 ewrt = _mm256_mul_pd(r10,ewtabscale);
2382 ewitab = _mm256_cvttpd_epi32(ewrt);
2383 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2384 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2385 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2387 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2388 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2390 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2394 fscal = _mm256_and_pd(fscal,cutoff_mask);
2396 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2398 /* Calculate temporary vectorial force */
2399 tx = _mm256_mul_pd(fscal,dx10);
2400 ty = _mm256_mul_pd(fscal,dy10);
2401 tz = _mm256_mul_pd(fscal,dz10);
2403 /* Update vectorial force */
2404 fix1 = _mm256_add_pd(fix1,tx);
2405 fiy1 = _mm256_add_pd(fiy1,ty);
2406 fiz1 = _mm256_add_pd(fiz1,tz);
2408 fjx0 = _mm256_add_pd(fjx0,tx);
2409 fjy0 = _mm256_add_pd(fjy0,ty);
2410 fjz0 = _mm256_add_pd(fjz0,tz);
2414 /**************************
2415 * CALCULATE INTERACTIONS *
2416 **************************/
2418 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2421 r11 = _mm256_mul_pd(rsq11,rinv11);
2422 r11 = _mm256_andnot_pd(dummy_mask,r11);
2424 /* EWALD ELECTROSTATICS */
2426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2427 ewrt = _mm256_mul_pd(r11,ewtabscale);
2428 ewitab = _mm256_cvttpd_epi32(ewrt);
2429 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2430 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2431 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2433 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2434 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2436 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2440 fscal = _mm256_and_pd(fscal,cutoff_mask);
2442 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2444 /* Calculate temporary vectorial force */
2445 tx = _mm256_mul_pd(fscal,dx11);
2446 ty = _mm256_mul_pd(fscal,dy11);
2447 tz = _mm256_mul_pd(fscal,dz11);
2449 /* Update vectorial force */
2450 fix1 = _mm256_add_pd(fix1,tx);
2451 fiy1 = _mm256_add_pd(fiy1,ty);
2452 fiz1 = _mm256_add_pd(fiz1,tz);
2454 fjx1 = _mm256_add_pd(fjx1,tx);
2455 fjy1 = _mm256_add_pd(fjy1,ty);
2456 fjz1 = _mm256_add_pd(fjz1,tz);
2460 /**************************
2461 * CALCULATE INTERACTIONS *
2462 **************************/
2464 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2467 r12 = _mm256_mul_pd(rsq12,rinv12);
2468 r12 = _mm256_andnot_pd(dummy_mask,r12);
2470 /* EWALD ELECTROSTATICS */
2472 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2473 ewrt = _mm256_mul_pd(r12,ewtabscale);
2474 ewitab = _mm256_cvttpd_epi32(ewrt);
2475 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2476 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2477 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2479 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2480 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2482 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2486 fscal = _mm256_and_pd(fscal,cutoff_mask);
2488 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2490 /* Calculate temporary vectorial force */
2491 tx = _mm256_mul_pd(fscal,dx12);
2492 ty = _mm256_mul_pd(fscal,dy12);
2493 tz = _mm256_mul_pd(fscal,dz12);
2495 /* Update vectorial force */
2496 fix1 = _mm256_add_pd(fix1,tx);
2497 fiy1 = _mm256_add_pd(fiy1,ty);
2498 fiz1 = _mm256_add_pd(fiz1,tz);
2500 fjx2 = _mm256_add_pd(fjx2,tx);
2501 fjy2 = _mm256_add_pd(fjy2,ty);
2502 fjz2 = _mm256_add_pd(fjz2,tz);
2506 /**************************
2507 * CALCULATE INTERACTIONS *
2508 **************************/
2510 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2513 r20 = _mm256_mul_pd(rsq20,rinv20);
2514 r20 = _mm256_andnot_pd(dummy_mask,r20);
2516 /* EWALD ELECTROSTATICS */
2518 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2519 ewrt = _mm256_mul_pd(r20,ewtabscale);
2520 ewitab = _mm256_cvttpd_epi32(ewrt);
2521 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2522 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2523 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2525 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2526 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2528 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2532 fscal = _mm256_and_pd(fscal,cutoff_mask);
2534 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2536 /* Calculate temporary vectorial force */
2537 tx = _mm256_mul_pd(fscal,dx20);
2538 ty = _mm256_mul_pd(fscal,dy20);
2539 tz = _mm256_mul_pd(fscal,dz20);
2541 /* Update vectorial force */
2542 fix2 = _mm256_add_pd(fix2,tx);
2543 fiy2 = _mm256_add_pd(fiy2,ty);
2544 fiz2 = _mm256_add_pd(fiz2,tz);
2546 fjx0 = _mm256_add_pd(fjx0,tx);
2547 fjy0 = _mm256_add_pd(fjy0,ty);
2548 fjz0 = _mm256_add_pd(fjz0,tz);
2552 /**************************
2553 * CALCULATE INTERACTIONS *
2554 **************************/
2556 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2559 r21 = _mm256_mul_pd(rsq21,rinv21);
2560 r21 = _mm256_andnot_pd(dummy_mask,r21);
2562 /* EWALD ELECTROSTATICS */
2564 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2565 ewrt = _mm256_mul_pd(r21,ewtabscale);
2566 ewitab = _mm256_cvttpd_epi32(ewrt);
2567 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2568 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2569 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2571 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2572 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2574 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2578 fscal = _mm256_and_pd(fscal,cutoff_mask);
2580 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2582 /* Calculate temporary vectorial force */
2583 tx = _mm256_mul_pd(fscal,dx21);
2584 ty = _mm256_mul_pd(fscal,dy21);
2585 tz = _mm256_mul_pd(fscal,dz21);
2587 /* Update vectorial force */
2588 fix2 = _mm256_add_pd(fix2,tx);
2589 fiy2 = _mm256_add_pd(fiy2,ty);
2590 fiz2 = _mm256_add_pd(fiz2,tz);
2592 fjx1 = _mm256_add_pd(fjx1,tx);
2593 fjy1 = _mm256_add_pd(fjy1,ty);
2594 fjz1 = _mm256_add_pd(fjz1,tz);
2598 /**************************
2599 * CALCULATE INTERACTIONS *
2600 **************************/
2602 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2605 r22 = _mm256_mul_pd(rsq22,rinv22);
2606 r22 = _mm256_andnot_pd(dummy_mask,r22);
2608 /* EWALD ELECTROSTATICS */
2610 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2611 ewrt = _mm256_mul_pd(r22,ewtabscale);
2612 ewitab = _mm256_cvttpd_epi32(ewrt);
2613 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2614 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2615 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2617 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2618 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2620 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2624 fscal = _mm256_and_pd(fscal,cutoff_mask);
2626 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2628 /* Calculate temporary vectorial force */
2629 tx = _mm256_mul_pd(fscal,dx22);
2630 ty = _mm256_mul_pd(fscal,dy22);
2631 tz = _mm256_mul_pd(fscal,dz22);
2633 /* Update vectorial force */
2634 fix2 = _mm256_add_pd(fix2,tx);
2635 fiy2 = _mm256_add_pd(fiy2,ty);
2636 fiz2 = _mm256_add_pd(fiz2,tz);
2638 fjx2 = _mm256_add_pd(fjx2,tx);
2639 fjy2 = _mm256_add_pd(fjy2,ty);
2640 fjz2 = _mm256_add_pd(fjz2,tz);
2644 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2645 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2646 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2647 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2649 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2650 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2652 /* Inner loop uses 367 flops */
2655 /* End of innermost loop */
2657 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2658 f+i_coord_offset,fshift+i_shift_offset);
2660 /* Increment number of inner iterations */
2661 inneriter += j_index_end - j_index_start;
2663 /* Outer loop uses 18 flops */
2666 /* Increment number of outer iterations */
2669 /* Update outer/inner flops */
2671 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);