2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LJEwald
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 real * vdwgridioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 real * vdwgridioffsetptr1;
88 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 real * vdwioffsetptr2;
90 real * vdwgridioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
114 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
125 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
126 __m256d one_half = _mm256_set1_pd(0.5);
127 __m256d minus_one = _mm256_set1_pd(-1.0);
129 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
130 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
132 __m256d dummy_mask,cutoff_mask;
133 __m128 tmpmask0,tmpmask1;
134 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
135 __m256d one = _mm256_set1_pd(1.0);
136 __m256d two = _mm256_set1_pd(2.0);
142 jindex = nlist->jindex;
144 shiftidx = nlist->shift;
146 shiftvec = fr->shift_vec[0];
147 fshift = fr->fshift[0];
148 facel = _mm256_set1_pd(fr->ic->epsfac);
149 charge = mdatoms->chargeA;
150 nvdwtype = fr->ntype;
152 vdwtype = mdatoms->typeA;
153 vdwgridparam = fr->ljpme_c6grid;
154 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
155 ewclj = _mm256_set1_pd(fr->ic->ewaldcoeff_lj);
156 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
158 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
159 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
160 beta2 = _mm256_mul_pd(beta,beta);
161 beta3 = _mm256_mul_pd(beta,beta2);
163 ewtab = fr->ic->tabq_coul_FDV0;
164 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
165 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
167 /* Setup water-specific parameters */
168 inr = nlist->iinr[0];
169 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
170 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
171 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
172 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
173 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
175 jq0 = _mm256_set1_pd(charge[inr+0]);
176 jq1 = _mm256_set1_pd(charge[inr+1]);
177 jq2 = _mm256_set1_pd(charge[inr+2]);
178 vdwjidx0A = 2*vdwtype[inr+0];
179 qq00 = _mm256_mul_pd(iq0,jq0);
180 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
181 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
182 c6grid_00 = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
183 qq01 = _mm256_mul_pd(iq0,jq1);
184 qq02 = _mm256_mul_pd(iq0,jq2);
185 qq10 = _mm256_mul_pd(iq1,jq0);
186 qq11 = _mm256_mul_pd(iq1,jq1);
187 qq12 = _mm256_mul_pd(iq1,jq2);
188 qq20 = _mm256_mul_pd(iq2,jq0);
189 qq21 = _mm256_mul_pd(iq2,jq1);
190 qq22 = _mm256_mul_pd(iq2,jq2);
192 /* Avoid stupid compiler warnings */
193 jnrA = jnrB = jnrC = jnrD = 0;
202 for(iidx=0;iidx<4*DIM;iidx++)
207 /* Start outer loop over neighborlists */
208 for(iidx=0; iidx<nri; iidx++)
210 /* Load shift vector for this list */
211 i_shift_offset = DIM*shiftidx[iidx];
213 /* Load limits for loop over neighbors */
214 j_index_start = jindex[iidx];
215 j_index_end = jindex[iidx+1];
217 /* Get outer coordinate index */
219 i_coord_offset = DIM*inr;
221 /* Load i particle coords and add shift vector */
222 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
223 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
225 fix0 = _mm256_setzero_pd();
226 fiy0 = _mm256_setzero_pd();
227 fiz0 = _mm256_setzero_pd();
228 fix1 = _mm256_setzero_pd();
229 fiy1 = _mm256_setzero_pd();
230 fiz1 = _mm256_setzero_pd();
231 fix2 = _mm256_setzero_pd();
232 fiy2 = _mm256_setzero_pd();
233 fiz2 = _mm256_setzero_pd();
235 /* Reset potential sums */
236 velecsum = _mm256_setzero_pd();
237 vvdwsum = _mm256_setzero_pd();
239 /* Start inner kernel loop */
240 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
243 /* Get j neighbor index, and coordinate index */
248 j_coord_offsetA = DIM*jnrA;
249 j_coord_offsetB = DIM*jnrB;
250 j_coord_offsetC = DIM*jnrC;
251 j_coord_offsetD = DIM*jnrD;
253 /* load j atom coordinates */
254 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
255 x+j_coord_offsetC,x+j_coord_offsetD,
256 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
258 /* Calculate displacement vector */
259 dx00 = _mm256_sub_pd(ix0,jx0);
260 dy00 = _mm256_sub_pd(iy0,jy0);
261 dz00 = _mm256_sub_pd(iz0,jz0);
262 dx01 = _mm256_sub_pd(ix0,jx1);
263 dy01 = _mm256_sub_pd(iy0,jy1);
264 dz01 = _mm256_sub_pd(iz0,jz1);
265 dx02 = _mm256_sub_pd(ix0,jx2);
266 dy02 = _mm256_sub_pd(iy0,jy2);
267 dz02 = _mm256_sub_pd(iz0,jz2);
268 dx10 = _mm256_sub_pd(ix1,jx0);
269 dy10 = _mm256_sub_pd(iy1,jy0);
270 dz10 = _mm256_sub_pd(iz1,jz0);
271 dx11 = _mm256_sub_pd(ix1,jx1);
272 dy11 = _mm256_sub_pd(iy1,jy1);
273 dz11 = _mm256_sub_pd(iz1,jz1);
274 dx12 = _mm256_sub_pd(ix1,jx2);
275 dy12 = _mm256_sub_pd(iy1,jy2);
276 dz12 = _mm256_sub_pd(iz1,jz2);
277 dx20 = _mm256_sub_pd(ix2,jx0);
278 dy20 = _mm256_sub_pd(iy2,jy0);
279 dz20 = _mm256_sub_pd(iz2,jz0);
280 dx21 = _mm256_sub_pd(ix2,jx1);
281 dy21 = _mm256_sub_pd(iy2,jy1);
282 dz21 = _mm256_sub_pd(iz2,jz1);
283 dx22 = _mm256_sub_pd(ix2,jx2);
284 dy22 = _mm256_sub_pd(iy2,jy2);
285 dz22 = _mm256_sub_pd(iz2,jz2);
287 /* Calculate squared distance and things based on it */
288 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
289 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
290 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
291 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
292 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
293 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
294 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
295 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
296 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
298 rinv00 = avx256_invsqrt_d(rsq00);
299 rinv01 = avx256_invsqrt_d(rsq01);
300 rinv02 = avx256_invsqrt_d(rsq02);
301 rinv10 = avx256_invsqrt_d(rsq10);
302 rinv11 = avx256_invsqrt_d(rsq11);
303 rinv12 = avx256_invsqrt_d(rsq12);
304 rinv20 = avx256_invsqrt_d(rsq20);
305 rinv21 = avx256_invsqrt_d(rsq21);
306 rinv22 = avx256_invsqrt_d(rsq22);
308 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
309 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
310 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
311 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
312 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
313 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
314 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
315 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
316 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
318 fjx0 = _mm256_setzero_pd();
319 fjy0 = _mm256_setzero_pd();
320 fjz0 = _mm256_setzero_pd();
321 fjx1 = _mm256_setzero_pd();
322 fjy1 = _mm256_setzero_pd();
323 fjz1 = _mm256_setzero_pd();
324 fjx2 = _mm256_setzero_pd();
325 fjy2 = _mm256_setzero_pd();
326 fjz2 = _mm256_setzero_pd();
328 /**************************
329 * CALCULATE INTERACTIONS *
330 **************************/
332 r00 = _mm256_mul_pd(rsq00,rinv00);
334 /* EWALD ELECTROSTATICS */
336 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
337 ewrt = _mm256_mul_pd(r00,ewtabscale);
338 ewitab = _mm256_cvttpd_epi32(ewrt);
339 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
340 ewitab = _mm_slli_epi32(ewitab,2);
341 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
342 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
343 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
344 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
345 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
346 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
347 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
348 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
349 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
351 /* Analytical LJ-PME */
352 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
353 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
354 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
355 exponent = avx256_exp_d(ewcljrsq);
356 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
357 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
358 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
359 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
360 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
361 vvdw = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
362 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
363 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
365 /* Update potential sum for this i atom from the interaction with this j atom. */
366 velecsum = _mm256_add_pd(velecsum,velec);
367 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
369 fscal = _mm256_add_pd(felec,fvdw);
371 /* Calculate temporary vectorial force */
372 tx = _mm256_mul_pd(fscal,dx00);
373 ty = _mm256_mul_pd(fscal,dy00);
374 tz = _mm256_mul_pd(fscal,dz00);
376 /* Update vectorial force */
377 fix0 = _mm256_add_pd(fix0,tx);
378 fiy0 = _mm256_add_pd(fiy0,ty);
379 fiz0 = _mm256_add_pd(fiz0,tz);
381 fjx0 = _mm256_add_pd(fjx0,tx);
382 fjy0 = _mm256_add_pd(fjy0,ty);
383 fjz0 = _mm256_add_pd(fjz0,tz);
385 /**************************
386 * CALCULATE INTERACTIONS *
387 **************************/
389 r01 = _mm256_mul_pd(rsq01,rinv01);
391 /* EWALD ELECTROSTATICS */
393 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
394 ewrt = _mm256_mul_pd(r01,ewtabscale);
395 ewitab = _mm256_cvttpd_epi32(ewrt);
396 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
397 ewitab = _mm_slli_epi32(ewitab,2);
398 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
399 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
400 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
401 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
402 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
403 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
404 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
405 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
406 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
408 /* Update potential sum for this i atom from the interaction with this j atom. */
409 velecsum = _mm256_add_pd(velecsum,velec);
413 /* Calculate temporary vectorial force */
414 tx = _mm256_mul_pd(fscal,dx01);
415 ty = _mm256_mul_pd(fscal,dy01);
416 tz = _mm256_mul_pd(fscal,dz01);
418 /* Update vectorial force */
419 fix0 = _mm256_add_pd(fix0,tx);
420 fiy0 = _mm256_add_pd(fiy0,ty);
421 fiz0 = _mm256_add_pd(fiz0,tz);
423 fjx1 = _mm256_add_pd(fjx1,tx);
424 fjy1 = _mm256_add_pd(fjy1,ty);
425 fjz1 = _mm256_add_pd(fjz1,tz);
427 /**************************
428 * CALCULATE INTERACTIONS *
429 **************************/
431 r02 = _mm256_mul_pd(rsq02,rinv02);
433 /* EWALD ELECTROSTATICS */
435 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
436 ewrt = _mm256_mul_pd(r02,ewtabscale);
437 ewitab = _mm256_cvttpd_epi32(ewrt);
438 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
439 ewitab = _mm_slli_epi32(ewitab,2);
440 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
441 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
442 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
443 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
444 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
445 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
446 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
447 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
448 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velecsum = _mm256_add_pd(velecsum,velec);
455 /* Calculate temporary vectorial force */
456 tx = _mm256_mul_pd(fscal,dx02);
457 ty = _mm256_mul_pd(fscal,dy02);
458 tz = _mm256_mul_pd(fscal,dz02);
460 /* Update vectorial force */
461 fix0 = _mm256_add_pd(fix0,tx);
462 fiy0 = _mm256_add_pd(fiy0,ty);
463 fiz0 = _mm256_add_pd(fiz0,tz);
465 fjx2 = _mm256_add_pd(fjx2,tx);
466 fjy2 = _mm256_add_pd(fjy2,ty);
467 fjz2 = _mm256_add_pd(fjz2,tz);
469 /**************************
470 * CALCULATE INTERACTIONS *
471 **************************/
473 r10 = _mm256_mul_pd(rsq10,rinv10);
475 /* EWALD ELECTROSTATICS */
477 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
478 ewrt = _mm256_mul_pd(r10,ewtabscale);
479 ewitab = _mm256_cvttpd_epi32(ewrt);
480 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
481 ewitab = _mm_slli_epi32(ewitab,2);
482 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
483 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
484 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
485 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
486 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
487 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
488 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
489 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
490 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
492 /* Update potential sum for this i atom from the interaction with this j atom. */
493 velecsum = _mm256_add_pd(velecsum,velec);
497 /* Calculate temporary vectorial force */
498 tx = _mm256_mul_pd(fscal,dx10);
499 ty = _mm256_mul_pd(fscal,dy10);
500 tz = _mm256_mul_pd(fscal,dz10);
502 /* Update vectorial force */
503 fix1 = _mm256_add_pd(fix1,tx);
504 fiy1 = _mm256_add_pd(fiy1,ty);
505 fiz1 = _mm256_add_pd(fiz1,tz);
507 fjx0 = _mm256_add_pd(fjx0,tx);
508 fjy0 = _mm256_add_pd(fjy0,ty);
509 fjz0 = _mm256_add_pd(fjz0,tz);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 r11 = _mm256_mul_pd(rsq11,rinv11);
517 /* EWALD ELECTROSTATICS */
519 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
520 ewrt = _mm256_mul_pd(r11,ewtabscale);
521 ewitab = _mm256_cvttpd_epi32(ewrt);
522 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
523 ewitab = _mm_slli_epi32(ewitab,2);
524 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
525 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
526 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
527 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
528 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
529 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
530 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
531 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
532 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velecsum = _mm256_add_pd(velecsum,velec);
539 /* Calculate temporary vectorial force */
540 tx = _mm256_mul_pd(fscal,dx11);
541 ty = _mm256_mul_pd(fscal,dy11);
542 tz = _mm256_mul_pd(fscal,dz11);
544 /* Update vectorial force */
545 fix1 = _mm256_add_pd(fix1,tx);
546 fiy1 = _mm256_add_pd(fiy1,ty);
547 fiz1 = _mm256_add_pd(fiz1,tz);
549 fjx1 = _mm256_add_pd(fjx1,tx);
550 fjy1 = _mm256_add_pd(fjy1,ty);
551 fjz1 = _mm256_add_pd(fjz1,tz);
553 /**************************
554 * CALCULATE INTERACTIONS *
555 **************************/
557 r12 = _mm256_mul_pd(rsq12,rinv12);
559 /* EWALD ELECTROSTATICS */
561 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
562 ewrt = _mm256_mul_pd(r12,ewtabscale);
563 ewitab = _mm256_cvttpd_epi32(ewrt);
564 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
565 ewitab = _mm_slli_epi32(ewitab,2);
566 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
567 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
568 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
569 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
570 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
571 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
572 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
573 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
574 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
576 /* Update potential sum for this i atom from the interaction with this j atom. */
577 velecsum = _mm256_add_pd(velecsum,velec);
581 /* Calculate temporary vectorial force */
582 tx = _mm256_mul_pd(fscal,dx12);
583 ty = _mm256_mul_pd(fscal,dy12);
584 tz = _mm256_mul_pd(fscal,dz12);
586 /* Update vectorial force */
587 fix1 = _mm256_add_pd(fix1,tx);
588 fiy1 = _mm256_add_pd(fiy1,ty);
589 fiz1 = _mm256_add_pd(fiz1,tz);
591 fjx2 = _mm256_add_pd(fjx2,tx);
592 fjy2 = _mm256_add_pd(fjy2,ty);
593 fjz2 = _mm256_add_pd(fjz2,tz);
595 /**************************
596 * CALCULATE INTERACTIONS *
597 **************************/
599 r20 = _mm256_mul_pd(rsq20,rinv20);
601 /* EWALD ELECTROSTATICS */
603 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
604 ewrt = _mm256_mul_pd(r20,ewtabscale);
605 ewitab = _mm256_cvttpd_epi32(ewrt);
606 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
607 ewitab = _mm_slli_epi32(ewitab,2);
608 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
609 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
610 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
611 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
612 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
613 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
614 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
615 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
616 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
618 /* Update potential sum for this i atom from the interaction with this j atom. */
619 velecsum = _mm256_add_pd(velecsum,velec);
623 /* Calculate temporary vectorial force */
624 tx = _mm256_mul_pd(fscal,dx20);
625 ty = _mm256_mul_pd(fscal,dy20);
626 tz = _mm256_mul_pd(fscal,dz20);
628 /* Update vectorial force */
629 fix2 = _mm256_add_pd(fix2,tx);
630 fiy2 = _mm256_add_pd(fiy2,ty);
631 fiz2 = _mm256_add_pd(fiz2,tz);
633 fjx0 = _mm256_add_pd(fjx0,tx);
634 fjy0 = _mm256_add_pd(fjy0,ty);
635 fjz0 = _mm256_add_pd(fjz0,tz);
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 r21 = _mm256_mul_pd(rsq21,rinv21);
643 /* EWALD ELECTROSTATICS */
645 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
646 ewrt = _mm256_mul_pd(r21,ewtabscale);
647 ewitab = _mm256_cvttpd_epi32(ewrt);
648 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
649 ewitab = _mm_slli_epi32(ewitab,2);
650 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
651 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
652 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
653 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
654 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
655 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
656 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
657 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
658 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
660 /* Update potential sum for this i atom from the interaction with this j atom. */
661 velecsum = _mm256_add_pd(velecsum,velec);
665 /* Calculate temporary vectorial force */
666 tx = _mm256_mul_pd(fscal,dx21);
667 ty = _mm256_mul_pd(fscal,dy21);
668 tz = _mm256_mul_pd(fscal,dz21);
670 /* Update vectorial force */
671 fix2 = _mm256_add_pd(fix2,tx);
672 fiy2 = _mm256_add_pd(fiy2,ty);
673 fiz2 = _mm256_add_pd(fiz2,tz);
675 fjx1 = _mm256_add_pd(fjx1,tx);
676 fjy1 = _mm256_add_pd(fjy1,ty);
677 fjz1 = _mm256_add_pd(fjz1,tz);
679 /**************************
680 * CALCULATE INTERACTIONS *
681 **************************/
683 r22 = _mm256_mul_pd(rsq22,rinv22);
685 /* EWALD ELECTROSTATICS */
687 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
688 ewrt = _mm256_mul_pd(r22,ewtabscale);
689 ewitab = _mm256_cvttpd_epi32(ewrt);
690 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
691 ewitab = _mm_slli_epi32(ewitab,2);
692 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
693 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
694 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
695 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
696 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
697 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
698 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
699 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
700 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
702 /* Update potential sum for this i atom from the interaction with this j atom. */
703 velecsum = _mm256_add_pd(velecsum,velec);
707 /* Calculate temporary vectorial force */
708 tx = _mm256_mul_pd(fscal,dx22);
709 ty = _mm256_mul_pd(fscal,dy22);
710 tz = _mm256_mul_pd(fscal,dz22);
712 /* Update vectorial force */
713 fix2 = _mm256_add_pd(fix2,tx);
714 fiy2 = _mm256_add_pd(fiy2,ty);
715 fiz2 = _mm256_add_pd(fiz2,tz);
717 fjx2 = _mm256_add_pd(fjx2,tx);
718 fjy2 = _mm256_add_pd(fjy2,ty);
719 fjz2 = _mm256_add_pd(fjz2,tz);
721 fjptrA = f+j_coord_offsetA;
722 fjptrB = f+j_coord_offsetB;
723 fjptrC = f+j_coord_offsetC;
724 fjptrD = f+j_coord_offsetD;
726 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
727 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
729 /* Inner loop uses 400 flops */
735 /* Get j neighbor index, and coordinate index */
736 jnrlistA = jjnr[jidx];
737 jnrlistB = jjnr[jidx+1];
738 jnrlistC = jjnr[jidx+2];
739 jnrlistD = jjnr[jidx+3];
740 /* Sign of each element will be negative for non-real atoms.
741 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
742 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
744 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
746 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
747 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
748 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
750 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
751 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
752 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
753 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
754 j_coord_offsetA = DIM*jnrA;
755 j_coord_offsetB = DIM*jnrB;
756 j_coord_offsetC = DIM*jnrC;
757 j_coord_offsetD = DIM*jnrD;
759 /* load j atom coordinates */
760 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
761 x+j_coord_offsetC,x+j_coord_offsetD,
762 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
764 /* Calculate displacement vector */
765 dx00 = _mm256_sub_pd(ix0,jx0);
766 dy00 = _mm256_sub_pd(iy0,jy0);
767 dz00 = _mm256_sub_pd(iz0,jz0);
768 dx01 = _mm256_sub_pd(ix0,jx1);
769 dy01 = _mm256_sub_pd(iy0,jy1);
770 dz01 = _mm256_sub_pd(iz0,jz1);
771 dx02 = _mm256_sub_pd(ix0,jx2);
772 dy02 = _mm256_sub_pd(iy0,jy2);
773 dz02 = _mm256_sub_pd(iz0,jz2);
774 dx10 = _mm256_sub_pd(ix1,jx0);
775 dy10 = _mm256_sub_pd(iy1,jy0);
776 dz10 = _mm256_sub_pd(iz1,jz0);
777 dx11 = _mm256_sub_pd(ix1,jx1);
778 dy11 = _mm256_sub_pd(iy1,jy1);
779 dz11 = _mm256_sub_pd(iz1,jz1);
780 dx12 = _mm256_sub_pd(ix1,jx2);
781 dy12 = _mm256_sub_pd(iy1,jy2);
782 dz12 = _mm256_sub_pd(iz1,jz2);
783 dx20 = _mm256_sub_pd(ix2,jx0);
784 dy20 = _mm256_sub_pd(iy2,jy0);
785 dz20 = _mm256_sub_pd(iz2,jz0);
786 dx21 = _mm256_sub_pd(ix2,jx1);
787 dy21 = _mm256_sub_pd(iy2,jy1);
788 dz21 = _mm256_sub_pd(iz2,jz1);
789 dx22 = _mm256_sub_pd(ix2,jx2);
790 dy22 = _mm256_sub_pd(iy2,jy2);
791 dz22 = _mm256_sub_pd(iz2,jz2);
793 /* Calculate squared distance and things based on it */
794 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
795 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
796 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
797 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
798 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
799 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
800 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
801 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
802 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
804 rinv00 = avx256_invsqrt_d(rsq00);
805 rinv01 = avx256_invsqrt_d(rsq01);
806 rinv02 = avx256_invsqrt_d(rsq02);
807 rinv10 = avx256_invsqrt_d(rsq10);
808 rinv11 = avx256_invsqrt_d(rsq11);
809 rinv12 = avx256_invsqrt_d(rsq12);
810 rinv20 = avx256_invsqrt_d(rsq20);
811 rinv21 = avx256_invsqrt_d(rsq21);
812 rinv22 = avx256_invsqrt_d(rsq22);
814 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
815 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
816 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
817 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
818 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
819 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
820 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
821 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
822 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
824 fjx0 = _mm256_setzero_pd();
825 fjy0 = _mm256_setzero_pd();
826 fjz0 = _mm256_setzero_pd();
827 fjx1 = _mm256_setzero_pd();
828 fjy1 = _mm256_setzero_pd();
829 fjz1 = _mm256_setzero_pd();
830 fjx2 = _mm256_setzero_pd();
831 fjy2 = _mm256_setzero_pd();
832 fjz2 = _mm256_setzero_pd();
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
838 r00 = _mm256_mul_pd(rsq00,rinv00);
839 r00 = _mm256_andnot_pd(dummy_mask,r00);
841 /* EWALD ELECTROSTATICS */
843 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
844 ewrt = _mm256_mul_pd(r00,ewtabscale);
845 ewitab = _mm256_cvttpd_epi32(ewrt);
846 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
847 ewitab = _mm_slli_epi32(ewitab,2);
848 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
849 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
850 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
851 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
852 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
853 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
854 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
855 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
856 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
858 /* Analytical LJ-PME */
859 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
860 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
861 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
862 exponent = avx256_exp_d(ewcljrsq);
863 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
864 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
865 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
866 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
867 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
868 vvdw = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
869 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
870 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
872 /* Update potential sum for this i atom from the interaction with this j atom. */
873 velec = _mm256_andnot_pd(dummy_mask,velec);
874 velecsum = _mm256_add_pd(velecsum,velec);
875 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
876 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
878 fscal = _mm256_add_pd(felec,fvdw);
880 fscal = _mm256_andnot_pd(dummy_mask,fscal);
882 /* Calculate temporary vectorial force */
883 tx = _mm256_mul_pd(fscal,dx00);
884 ty = _mm256_mul_pd(fscal,dy00);
885 tz = _mm256_mul_pd(fscal,dz00);
887 /* Update vectorial force */
888 fix0 = _mm256_add_pd(fix0,tx);
889 fiy0 = _mm256_add_pd(fiy0,ty);
890 fiz0 = _mm256_add_pd(fiz0,tz);
892 fjx0 = _mm256_add_pd(fjx0,tx);
893 fjy0 = _mm256_add_pd(fjy0,ty);
894 fjz0 = _mm256_add_pd(fjz0,tz);
896 /**************************
897 * CALCULATE INTERACTIONS *
898 **************************/
900 r01 = _mm256_mul_pd(rsq01,rinv01);
901 r01 = _mm256_andnot_pd(dummy_mask,r01);
903 /* EWALD ELECTROSTATICS */
905 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
906 ewrt = _mm256_mul_pd(r01,ewtabscale);
907 ewitab = _mm256_cvttpd_epi32(ewrt);
908 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
909 ewitab = _mm_slli_epi32(ewitab,2);
910 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
911 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
912 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
913 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
914 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
915 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
916 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
917 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
918 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
920 /* Update potential sum for this i atom from the interaction with this j atom. */
921 velec = _mm256_andnot_pd(dummy_mask,velec);
922 velecsum = _mm256_add_pd(velecsum,velec);
926 fscal = _mm256_andnot_pd(dummy_mask,fscal);
928 /* Calculate temporary vectorial force */
929 tx = _mm256_mul_pd(fscal,dx01);
930 ty = _mm256_mul_pd(fscal,dy01);
931 tz = _mm256_mul_pd(fscal,dz01);
933 /* Update vectorial force */
934 fix0 = _mm256_add_pd(fix0,tx);
935 fiy0 = _mm256_add_pd(fiy0,ty);
936 fiz0 = _mm256_add_pd(fiz0,tz);
938 fjx1 = _mm256_add_pd(fjx1,tx);
939 fjy1 = _mm256_add_pd(fjy1,ty);
940 fjz1 = _mm256_add_pd(fjz1,tz);
942 /**************************
943 * CALCULATE INTERACTIONS *
944 **************************/
946 r02 = _mm256_mul_pd(rsq02,rinv02);
947 r02 = _mm256_andnot_pd(dummy_mask,r02);
949 /* EWALD ELECTROSTATICS */
951 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
952 ewrt = _mm256_mul_pd(r02,ewtabscale);
953 ewitab = _mm256_cvttpd_epi32(ewrt);
954 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
955 ewitab = _mm_slli_epi32(ewitab,2);
956 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
957 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
958 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
959 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
960 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
961 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
962 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
963 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
964 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
966 /* Update potential sum for this i atom from the interaction with this j atom. */
967 velec = _mm256_andnot_pd(dummy_mask,velec);
968 velecsum = _mm256_add_pd(velecsum,velec);
972 fscal = _mm256_andnot_pd(dummy_mask,fscal);
974 /* Calculate temporary vectorial force */
975 tx = _mm256_mul_pd(fscal,dx02);
976 ty = _mm256_mul_pd(fscal,dy02);
977 tz = _mm256_mul_pd(fscal,dz02);
979 /* Update vectorial force */
980 fix0 = _mm256_add_pd(fix0,tx);
981 fiy0 = _mm256_add_pd(fiy0,ty);
982 fiz0 = _mm256_add_pd(fiz0,tz);
984 fjx2 = _mm256_add_pd(fjx2,tx);
985 fjy2 = _mm256_add_pd(fjy2,ty);
986 fjz2 = _mm256_add_pd(fjz2,tz);
988 /**************************
989 * CALCULATE INTERACTIONS *
990 **************************/
992 r10 = _mm256_mul_pd(rsq10,rinv10);
993 r10 = _mm256_andnot_pd(dummy_mask,r10);
995 /* EWALD ELECTROSTATICS */
997 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
998 ewrt = _mm256_mul_pd(r10,ewtabscale);
999 ewitab = _mm256_cvttpd_epi32(ewrt);
1000 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1001 ewitab = _mm_slli_epi32(ewitab,2);
1002 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1003 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1004 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1005 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1006 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1007 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1008 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1009 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1010 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1012 /* Update potential sum for this i atom from the interaction with this j atom. */
1013 velec = _mm256_andnot_pd(dummy_mask,velec);
1014 velecsum = _mm256_add_pd(velecsum,velec);
1018 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1020 /* Calculate temporary vectorial force */
1021 tx = _mm256_mul_pd(fscal,dx10);
1022 ty = _mm256_mul_pd(fscal,dy10);
1023 tz = _mm256_mul_pd(fscal,dz10);
1025 /* Update vectorial force */
1026 fix1 = _mm256_add_pd(fix1,tx);
1027 fiy1 = _mm256_add_pd(fiy1,ty);
1028 fiz1 = _mm256_add_pd(fiz1,tz);
1030 fjx0 = _mm256_add_pd(fjx0,tx);
1031 fjy0 = _mm256_add_pd(fjy0,ty);
1032 fjz0 = _mm256_add_pd(fjz0,tz);
1034 /**************************
1035 * CALCULATE INTERACTIONS *
1036 **************************/
1038 r11 = _mm256_mul_pd(rsq11,rinv11);
1039 r11 = _mm256_andnot_pd(dummy_mask,r11);
1041 /* EWALD ELECTROSTATICS */
1043 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1044 ewrt = _mm256_mul_pd(r11,ewtabscale);
1045 ewitab = _mm256_cvttpd_epi32(ewrt);
1046 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1047 ewitab = _mm_slli_epi32(ewitab,2);
1048 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1049 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1050 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1051 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1052 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1053 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1054 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1055 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1056 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _mm256_andnot_pd(dummy_mask,velec);
1060 velecsum = _mm256_add_pd(velecsum,velec);
1064 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1066 /* Calculate temporary vectorial force */
1067 tx = _mm256_mul_pd(fscal,dx11);
1068 ty = _mm256_mul_pd(fscal,dy11);
1069 tz = _mm256_mul_pd(fscal,dz11);
1071 /* Update vectorial force */
1072 fix1 = _mm256_add_pd(fix1,tx);
1073 fiy1 = _mm256_add_pd(fiy1,ty);
1074 fiz1 = _mm256_add_pd(fiz1,tz);
1076 fjx1 = _mm256_add_pd(fjx1,tx);
1077 fjy1 = _mm256_add_pd(fjy1,ty);
1078 fjz1 = _mm256_add_pd(fjz1,tz);
1080 /**************************
1081 * CALCULATE INTERACTIONS *
1082 **************************/
1084 r12 = _mm256_mul_pd(rsq12,rinv12);
1085 r12 = _mm256_andnot_pd(dummy_mask,r12);
1087 /* EWALD ELECTROSTATICS */
1089 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1090 ewrt = _mm256_mul_pd(r12,ewtabscale);
1091 ewitab = _mm256_cvttpd_epi32(ewrt);
1092 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1093 ewitab = _mm_slli_epi32(ewitab,2);
1094 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1095 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1096 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1097 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1098 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1099 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1100 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1101 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1102 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1104 /* Update potential sum for this i atom from the interaction with this j atom. */
1105 velec = _mm256_andnot_pd(dummy_mask,velec);
1106 velecsum = _mm256_add_pd(velecsum,velec);
1110 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1112 /* Calculate temporary vectorial force */
1113 tx = _mm256_mul_pd(fscal,dx12);
1114 ty = _mm256_mul_pd(fscal,dy12);
1115 tz = _mm256_mul_pd(fscal,dz12);
1117 /* Update vectorial force */
1118 fix1 = _mm256_add_pd(fix1,tx);
1119 fiy1 = _mm256_add_pd(fiy1,ty);
1120 fiz1 = _mm256_add_pd(fiz1,tz);
1122 fjx2 = _mm256_add_pd(fjx2,tx);
1123 fjy2 = _mm256_add_pd(fjy2,ty);
1124 fjz2 = _mm256_add_pd(fjz2,tz);
1126 /**************************
1127 * CALCULATE INTERACTIONS *
1128 **************************/
1130 r20 = _mm256_mul_pd(rsq20,rinv20);
1131 r20 = _mm256_andnot_pd(dummy_mask,r20);
1133 /* EWALD ELECTROSTATICS */
1135 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1136 ewrt = _mm256_mul_pd(r20,ewtabscale);
1137 ewitab = _mm256_cvttpd_epi32(ewrt);
1138 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1139 ewitab = _mm_slli_epi32(ewitab,2);
1140 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1141 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1142 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1143 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1144 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1145 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1146 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1147 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1148 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1150 /* Update potential sum for this i atom from the interaction with this j atom. */
1151 velec = _mm256_andnot_pd(dummy_mask,velec);
1152 velecsum = _mm256_add_pd(velecsum,velec);
1156 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1158 /* Calculate temporary vectorial force */
1159 tx = _mm256_mul_pd(fscal,dx20);
1160 ty = _mm256_mul_pd(fscal,dy20);
1161 tz = _mm256_mul_pd(fscal,dz20);
1163 /* Update vectorial force */
1164 fix2 = _mm256_add_pd(fix2,tx);
1165 fiy2 = _mm256_add_pd(fiy2,ty);
1166 fiz2 = _mm256_add_pd(fiz2,tz);
1168 fjx0 = _mm256_add_pd(fjx0,tx);
1169 fjy0 = _mm256_add_pd(fjy0,ty);
1170 fjz0 = _mm256_add_pd(fjz0,tz);
1172 /**************************
1173 * CALCULATE INTERACTIONS *
1174 **************************/
1176 r21 = _mm256_mul_pd(rsq21,rinv21);
1177 r21 = _mm256_andnot_pd(dummy_mask,r21);
1179 /* EWALD ELECTROSTATICS */
1181 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1182 ewrt = _mm256_mul_pd(r21,ewtabscale);
1183 ewitab = _mm256_cvttpd_epi32(ewrt);
1184 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1185 ewitab = _mm_slli_epi32(ewitab,2);
1186 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1187 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1188 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1189 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1190 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1191 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1192 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1193 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1194 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1196 /* Update potential sum for this i atom from the interaction with this j atom. */
1197 velec = _mm256_andnot_pd(dummy_mask,velec);
1198 velecsum = _mm256_add_pd(velecsum,velec);
1202 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1204 /* Calculate temporary vectorial force */
1205 tx = _mm256_mul_pd(fscal,dx21);
1206 ty = _mm256_mul_pd(fscal,dy21);
1207 tz = _mm256_mul_pd(fscal,dz21);
1209 /* Update vectorial force */
1210 fix2 = _mm256_add_pd(fix2,tx);
1211 fiy2 = _mm256_add_pd(fiy2,ty);
1212 fiz2 = _mm256_add_pd(fiz2,tz);
1214 fjx1 = _mm256_add_pd(fjx1,tx);
1215 fjy1 = _mm256_add_pd(fjy1,ty);
1216 fjz1 = _mm256_add_pd(fjz1,tz);
1218 /**************************
1219 * CALCULATE INTERACTIONS *
1220 **************************/
1222 r22 = _mm256_mul_pd(rsq22,rinv22);
1223 r22 = _mm256_andnot_pd(dummy_mask,r22);
1225 /* EWALD ELECTROSTATICS */
1227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1228 ewrt = _mm256_mul_pd(r22,ewtabscale);
1229 ewitab = _mm256_cvttpd_epi32(ewrt);
1230 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1231 ewitab = _mm_slli_epi32(ewitab,2);
1232 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1233 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1234 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1235 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1236 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1237 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1238 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1239 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1240 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1242 /* Update potential sum for this i atom from the interaction with this j atom. */
1243 velec = _mm256_andnot_pd(dummy_mask,velec);
1244 velecsum = _mm256_add_pd(velecsum,velec);
1248 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1250 /* Calculate temporary vectorial force */
1251 tx = _mm256_mul_pd(fscal,dx22);
1252 ty = _mm256_mul_pd(fscal,dy22);
1253 tz = _mm256_mul_pd(fscal,dz22);
1255 /* Update vectorial force */
1256 fix2 = _mm256_add_pd(fix2,tx);
1257 fiy2 = _mm256_add_pd(fiy2,ty);
1258 fiz2 = _mm256_add_pd(fiz2,tz);
1260 fjx2 = _mm256_add_pd(fjx2,tx);
1261 fjy2 = _mm256_add_pd(fjy2,ty);
1262 fjz2 = _mm256_add_pd(fjz2,tz);
1264 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1265 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1266 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1267 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1269 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1270 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1272 /* Inner loop uses 409 flops */
1275 /* End of innermost loop */
1277 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1278 f+i_coord_offset,fshift+i_shift_offset);
1281 /* Update potential energies */
1282 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1283 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1285 /* Increment number of inner iterations */
1286 inneriter += j_index_end - j_index_start;
1288 /* Outer loop uses 20 flops */
1291 /* Increment number of outer iterations */
1294 /* Update outer/inner flops */
1296 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1299 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_double
1300 * Electrostatics interaction: Ewald
1301 * VdW interaction: LJEwald
1302 * Geometry: Water3-Water3
1303 * Calculate force/pot: Force
1306 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_double
1307 (t_nblist * gmx_restrict nlist,
1308 rvec * gmx_restrict xx,
1309 rvec * gmx_restrict ff,
1310 struct t_forcerec * gmx_restrict fr,
1311 t_mdatoms * gmx_restrict mdatoms,
1312 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1313 t_nrnb * gmx_restrict nrnb)
1315 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1316 * just 0 for non-waters.
1317 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1318 * jnr indices corresponding to data put in the four positions in the SIMD register.
1320 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1321 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1322 int jnrA,jnrB,jnrC,jnrD;
1323 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1324 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1325 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1326 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1327 real rcutoff_scalar;
1328 real *shiftvec,*fshift,*x,*f;
1329 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1330 real scratch[4*DIM];
1331 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1332 real * vdwioffsetptr0;
1333 real * vdwgridioffsetptr0;
1334 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1335 real * vdwioffsetptr1;
1336 real * vdwgridioffsetptr1;
1337 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1338 real * vdwioffsetptr2;
1339 real * vdwgridioffsetptr2;
1340 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1341 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1342 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1343 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1344 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1345 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1346 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1347 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1348 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1349 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1350 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1351 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1352 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1353 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1354 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1355 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1356 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1359 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1362 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1363 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1374 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1375 __m256d one_half = _mm256_set1_pd(0.5);
1376 __m256d minus_one = _mm256_set1_pd(-1.0);
1378 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1379 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1381 __m256d dummy_mask,cutoff_mask;
1382 __m128 tmpmask0,tmpmask1;
1383 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1384 __m256d one = _mm256_set1_pd(1.0);
1385 __m256d two = _mm256_set1_pd(2.0);
1391 jindex = nlist->jindex;
1393 shiftidx = nlist->shift;
1395 shiftvec = fr->shift_vec[0];
1396 fshift = fr->fshift[0];
1397 facel = _mm256_set1_pd(fr->ic->epsfac);
1398 charge = mdatoms->chargeA;
1399 nvdwtype = fr->ntype;
1400 vdwparam = fr->nbfp;
1401 vdwtype = mdatoms->typeA;
1402 vdwgridparam = fr->ljpme_c6grid;
1403 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
1404 ewclj = _mm256_set1_pd(fr->ic->ewaldcoeff_lj);
1405 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
1407 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1408 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1409 beta2 = _mm256_mul_pd(beta,beta);
1410 beta3 = _mm256_mul_pd(beta,beta2);
1412 ewtab = fr->ic->tabq_coul_F;
1413 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1414 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1416 /* Setup water-specific parameters */
1417 inr = nlist->iinr[0];
1418 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1419 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1420 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1421 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1422 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
1424 jq0 = _mm256_set1_pd(charge[inr+0]);
1425 jq1 = _mm256_set1_pd(charge[inr+1]);
1426 jq2 = _mm256_set1_pd(charge[inr+2]);
1427 vdwjidx0A = 2*vdwtype[inr+0];
1428 qq00 = _mm256_mul_pd(iq0,jq0);
1429 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1430 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1431 c6grid_00 = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
1432 qq01 = _mm256_mul_pd(iq0,jq1);
1433 qq02 = _mm256_mul_pd(iq0,jq2);
1434 qq10 = _mm256_mul_pd(iq1,jq0);
1435 qq11 = _mm256_mul_pd(iq1,jq1);
1436 qq12 = _mm256_mul_pd(iq1,jq2);
1437 qq20 = _mm256_mul_pd(iq2,jq0);
1438 qq21 = _mm256_mul_pd(iq2,jq1);
1439 qq22 = _mm256_mul_pd(iq2,jq2);
1441 /* Avoid stupid compiler warnings */
1442 jnrA = jnrB = jnrC = jnrD = 0;
1443 j_coord_offsetA = 0;
1444 j_coord_offsetB = 0;
1445 j_coord_offsetC = 0;
1446 j_coord_offsetD = 0;
1451 for(iidx=0;iidx<4*DIM;iidx++)
1453 scratch[iidx] = 0.0;
1456 /* Start outer loop over neighborlists */
1457 for(iidx=0; iidx<nri; iidx++)
1459 /* Load shift vector for this list */
1460 i_shift_offset = DIM*shiftidx[iidx];
1462 /* Load limits for loop over neighbors */
1463 j_index_start = jindex[iidx];
1464 j_index_end = jindex[iidx+1];
1466 /* Get outer coordinate index */
1468 i_coord_offset = DIM*inr;
1470 /* Load i particle coords and add shift vector */
1471 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1472 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1474 fix0 = _mm256_setzero_pd();
1475 fiy0 = _mm256_setzero_pd();
1476 fiz0 = _mm256_setzero_pd();
1477 fix1 = _mm256_setzero_pd();
1478 fiy1 = _mm256_setzero_pd();
1479 fiz1 = _mm256_setzero_pd();
1480 fix2 = _mm256_setzero_pd();
1481 fiy2 = _mm256_setzero_pd();
1482 fiz2 = _mm256_setzero_pd();
1484 /* Start inner kernel loop */
1485 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1488 /* Get j neighbor index, and coordinate index */
1490 jnrB = jjnr[jidx+1];
1491 jnrC = jjnr[jidx+2];
1492 jnrD = jjnr[jidx+3];
1493 j_coord_offsetA = DIM*jnrA;
1494 j_coord_offsetB = DIM*jnrB;
1495 j_coord_offsetC = DIM*jnrC;
1496 j_coord_offsetD = DIM*jnrD;
1498 /* load j atom coordinates */
1499 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1500 x+j_coord_offsetC,x+j_coord_offsetD,
1501 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1503 /* Calculate displacement vector */
1504 dx00 = _mm256_sub_pd(ix0,jx0);
1505 dy00 = _mm256_sub_pd(iy0,jy0);
1506 dz00 = _mm256_sub_pd(iz0,jz0);
1507 dx01 = _mm256_sub_pd(ix0,jx1);
1508 dy01 = _mm256_sub_pd(iy0,jy1);
1509 dz01 = _mm256_sub_pd(iz0,jz1);
1510 dx02 = _mm256_sub_pd(ix0,jx2);
1511 dy02 = _mm256_sub_pd(iy0,jy2);
1512 dz02 = _mm256_sub_pd(iz0,jz2);
1513 dx10 = _mm256_sub_pd(ix1,jx0);
1514 dy10 = _mm256_sub_pd(iy1,jy0);
1515 dz10 = _mm256_sub_pd(iz1,jz0);
1516 dx11 = _mm256_sub_pd(ix1,jx1);
1517 dy11 = _mm256_sub_pd(iy1,jy1);
1518 dz11 = _mm256_sub_pd(iz1,jz1);
1519 dx12 = _mm256_sub_pd(ix1,jx2);
1520 dy12 = _mm256_sub_pd(iy1,jy2);
1521 dz12 = _mm256_sub_pd(iz1,jz2);
1522 dx20 = _mm256_sub_pd(ix2,jx0);
1523 dy20 = _mm256_sub_pd(iy2,jy0);
1524 dz20 = _mm256_sub_pd(iz2,jz0);
1525 dx21 = _mm256_sub_pd(ix2,jx1);
1526 dy21 = _mm256_sub_pd(iy2,jy1);
1527 dz21 = _mm256_sub_pd(iz2,jz1);
1528 dx22 = _mm256_sub_pd(ix2,jx2);
1529 dy22 = _mm256_sub_pd(iy2,jy2);
1530 dz22 = _mm256_sub_pd(iz2,jz2);
1532 /* Calculate squared distance and things based on it */
1533 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1534 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1535 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1536 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1537 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1538 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1539 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1540 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1541 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1543 rinv00 = avx256_invsqrt_d(rsq00);
1544 rinv01 = avx256_invsqrt_d(rsq01);
1545 rinv02 = avx256_invsqrt_d(rsq02);
1546 rinv10 = avx256_invsqrt_d(rsq10);
1547 rinv11 = avx256_invsqrt_d(rsq11);
1548 rinv12 = avx256_invsqrt_d(rsq12);
1549 rinv20 = avx256_invsqrt_d(rsq20);
1550 rinv21 = avx256_invsqrt_d(rsq21);
1551 rinv22 = avx256_invsqrt_d(rsq22);
1553 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1554 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1555 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1556 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1557 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1558 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1559 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1560 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1561 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1563 fjx0 = _mm256_setzero_pd();
1564 fjy0 = _mm256_setzero_pd();
1565 fjz0 = _mm256_setzero_pd();
1566 fjx1 = _mm256_setzero_pd();
1567 fjy1 = _mm256_setzero_pd();
1568 fjz1 = _mm256_setzero_pd();
1569 fjx2 = _mm256_setzero_pd();
1570 fjy2 = _mm256_setzero_pd();
1571 fjz2 = _mm256_setzero_pd();
1573 /**************************
1574 * CALCULATE INTERACTIONS *
1575 **************************/
1577 r00 = _mm256_mul_pd(rsq00,rinv00);
1579 /* EWALD ELECTROSTATICS */
1581 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1582 ewrt = _mm256_mul_pd(r00,ewtabscale);
1583 ewitab = _mm256_cvttpd_epi32(ewrt);
1584 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1585 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1586 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1588 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1589 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1591 /* Analytical LJ-PME */
1592 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1593 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
1594 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1595 exponent = avx256_exp_d(ewcljrsq);
1596 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1597 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1598 /* f6A = 6 * C6grid * (1 - poly) */
1599 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1600 /* f6B = C6grid * exponent * beta^6 */
1601 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1602 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1603 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1605 fscal = _mm256_add_pd(felec,fvdw);
1607 /* Calculate temporary vectorial force */
1608 tx = _mm256_mul_pd(fscal,dx00);
1609 ty = _mm256_mul_pd(fscal,dy00);
1610 tz = _mm256_mul_pd(fscal,dz00);
1612 /* Update vectorial force */
1613 fix0 = _mm256_add_pd(fix0,tx);
1614 fiy0 = _mm256_add_pd(fiy0,ty);
1615 fiz0 = _mm256_add_pd(fiz0,tz);
1617 fjx0 = _mm256_add_pd(fjx0,tx);
1618 fjy0 = _mm256_add_pd(fjy0,ty);
1619 fjz0 = _mm256_add_pd(fjz0,tz);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 r01 = _mm256_mul_pd(rsq01,rinv01);
1627 /* EWALD ELECTROSTATICS */
1629 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1630 ewrt = _mm256_mul_pd(r01,ewtabscale);
1631 ewitab = _mm256_cvttpd_epi32(ewrt);
1632 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1633 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1634 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1636 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1637 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1641 /* Calculate temporary vectorial force */
1642 tx = _mm256_mul_pd(fscal,dx01);
1643 ty = _mm256_mul_pd(fscal,dy01);
1644 tz = _mm256_mul_pd(fscal,dz01);
1646 /* Update vectorial force */
1647 fix0 = _mm256_add_pd(fix0,tx);
1648 fiy0 = _mm256_add_pd(fiy0,ty);
1649 fiz0 = _mm256_add_pd(fiz0,tz);
1651 fjx1 = _mm256_add_pd(fjx1,tx);
1652 fjy1 = _mm256_add_pd(fjy1,ty);
1653 fjz1 = _mm256_add_pd(fjz1,tz);
1655 /**************************
1656 * CALCULATE INTERACTIONS *
1657 **************************/
1659 r02 = _mm256_mul_pd(rsq02,rinv02);
1661 /* EWALD ELECTROSTATICS */
1663 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1664 ewrt = _mm256_mul_pd(r02,ewtabscale);
1665 ewitab = _mm256_cvttpd_epi32(ewrt);
1666 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1667 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1668 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1670 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1671 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1675 /* Calculate temporary vectorial force */
1676 tx = _mm256_mul_pd(fscal,dx02);
1677 ty = _mm256_mul_pd(fscal,dy02);
1678 tz = _mm256_mul_pd(fscal,dz02);
1680 /* Update vectorial force */
1681 fix0 = _mm256_add_pd(fix0,tx);
1682 fiy0 = _mm256_add_pd(fiy0,ty);
1683 fiz0 = _mm256_add_pd(fiz0,tz);
1685 fjx2 = _mm256_add_pd(fjx2,tx);
1686 fjy2 = _mm256_add_pd(fjy2,ty);
1687 fjz2 = _mm256_add_pd(fjz2,tz);
1689 /**************************
1690 * CALCULATE INTERACTIONS *
1691 **************************/
1693 r10 = _mm256_mul_pd(rsq10,rinv10);
1695 /* EWALD ELECTROSTATICS */
1697 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1698 ewrt = _mm256_mul_pd(r10,ewtabscale);
1699 ewitab = _mm256_cvttpd_epi32(ewrt);
1700 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1701 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1702 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1704 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1705 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1709 /* Calculate temporary vectorial force */
1710 tx = _mm256_mul_pd(fscal,dx10);
1711 ty = _mm256_mul_pd(fscal,dy10);
1712 tz = _mm256_mul_pd(fscal,dz10);
1714 /* Update vectorial force */
1715 fix1 = _mm256_add_pd(fix1,tx);
1716 fiy1 = _mm256_add_pd(fiy1,ty);
1717 fiz1 = _mm256_add_pd(fiz1,tz);
1719 fjx0 = _mm256_add_pd(fjx0,tx);
1720 fjy0 = _mm256_add_pd(fjy0,ty);
1721 fjz0 = _mm256_add_pd(fjz0,tz);
1723 /**************************
1724 * CALCULATE INTERACTIONS *
1725 **************************/
1727 r11 = _mm256_mul_pd(rsq11,rinv11);
1729 /* EWALD ELECTROSTATICS */
1731 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1732 ewrt = _mm256_mul_pd(r11,ewtabscale);
1733 ewitab = _mm256_cvttpd_epi32(ewrt);
1734 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1735 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1736 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1738 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1739 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1743 /* Calculate temporary vectorial force */
1744 tx = _mm256_mul_pd(fscal,dx11);
1745 ty = _mm256_mul_pd(fscal,dy11);
1746 tz = _mm256_mul_pd(fscal,dz11);
1748 /* Update vectorial force */
1749 fix1 = _mm256_add_pd(fix1,tx);
1750 fiy1 = _mm256_add_pd(fiy1,ty);
1751 fiz1 = _mm256_add_pd(fiz1,tz);
1753 fjx1 = _mm256_add_pd(fjx1,tx);
1754 fjy1 = _mm256_add_pd(fjy1,ty);
1755 fjz1 = _mm256_add_pd(fjz1,tz);
1757 /**************************
1758 * CALCULATE INTERACTIONS *
1759 **************************/
1761 r12 = _mm256_mul_pd(rsq12,rinv12);
1763 /* EWALD ELECTROSTATICS */
1765 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1766 ewrt = _mm256_mul_pd(r12,ewtabscale);
1767 ewitab = _mm256_cvttpd_epi32(ewrt);
1768 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1769 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1770 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1772 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1773 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1777 /* Calculate temporary vectorial force */
1778 tx = _mm256_mul_pd(fscal,dx12);
1779 ty = _mm256_mul_pd(fscal,dy12);
1780 tz = _mm256_mul_pd(fscal,dz12);
1782 /* Update vectorial force */
1783 fix1 = _mm256_add_pd(fix1,tx);
1784 fiy1 = _mm256_add_pd(fiy1,ty);
1785 fiz1 = _mm256_add_pd(fiz1,tz);
1787 fjx2 = _mm256_add_pd(fjx2,tx);
1788 fjy2 = _mm256_add_pd(fjy2,ty);
1789 fjz2 = _mm256_add_pd(fjz2,tz);
1791 /**************************
1792 * CALCULATE INTERACTIONS *
1793 **************************/
1795 r20 = _mm256_mul_pd(rsq20,rinv20);
1797 /* EWALD ELECTROSTATICS */
1799 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1800 ewrt = _mm256_mul_pd(r20,ewtabscale);
1801 ewitab = _mm256_cvttpd_epi32(ewrt);
1802 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1803 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1804 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1806 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1807 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1811 /* Calculate temporary vectorial force */
1812 tx = _mm256_mul_pd(fscal,dx20);
1813 ty = _mm256_mul_pd(fscal,dy20);
1814 tz = _mm256_mul_pd(fscal,dz20);
1816 /* Update vectorial force */
1817 fix2 = _mm256_add_pd(fix2,tx);
1818 fiy2 = _mm256_add_pd(fiy2,ty);
1819 fiz2 = _mm256_add_pd(fiz2,tz);
1821 fjx0 = _mm256_add_pd(fjx0,tx);
1822 fjy0 = _mm256_add_pd(fjy0,ty);
1823 fjz0 = _mm256_add_pd(fjz0,tz);
1825 /**************************
1826 * CALCULATE INTERACTIONS *
1827 **************************/
1829 r21 = _mm256_mul_pd(rsq21,rinv21);
1831 /* EWALD ELECTROSTATICS */
1833 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1834 ewrt = _mm256_mul_pd(r21,ewtabscale);
1835 ewitab = _mm256_cvttpd_epi32(ewrt);
1836 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1837 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1838 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1840 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1841 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1845 /* Calculate temporary vectorial force */
1846 tx = _mm256_mul_pd(fscal,dx21);
1847 ty = _mm256_mul_pd(fscal,dy21);
1848 tz = _mm256_mul_pd(fscal,dz21);
1850 /* Update vectorial force */
1851 fix2 = _mm256_add_pd(fix2,tx);
1852 fiy2 = _mm256_add_pd(fiy2,ty);
1853 fiz2 = _mm256_add_pd(fiz2,tz);
1855 fjx1 = _mm256_add_pd(fjx1,tx);
1856 fjy1 = _mm256_add_pd(fjy1,ty);
1857 fjz1 = _mm256_add_pd(fjz1,tz);
1859 /**************************
1860 * CALCULATE INTERACTIONS *
1861 **************************/
1863 r22 = _mm256_mul_pd(rsq22,rinv22);
1865 /* EWALD ELECTROSTATICS */
1867 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1868 ewrt = _mm256_mul_pd(r22,ewtabscale);
1869 ewitab = _mm256_cvttpd_epi32(ewrt);
1870 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1871 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1872 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1874 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1875 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1879 /* Calculate temporary vectorial force */
1880 tx = _mm256_mul_pd(fscal,dx22);
1881 ty = _mm256_mul_pd(fscal,dy22);
1882 tz = _mm256_mul_pd(fscal,dz22);
1884 /* Update vectorial force */
1885 fix2 = _mm256_add_pd(fix2,tx);
1886 fiy2 = _mm256_add_pd(fiy2,ty);
1887 fiz2 = _mm256_add_pd(fiz2,tz);
1889 fjx2 = _mm256_add_pd(fjx2,tx);
1890 fjy2 = _mm256_add_pd(fjy2,ty);
1891 fjz2 = _mm256_add_pd(fjz2,tz);
1893 fjptrA = f+j_coord_offsetA;
1894 fjptrB = f+j_coord_offsetB;
1895 fjptrC = f+j_coord_offsetC;
1896 fjptrD = f+j_coord_offsetD;
1898 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1899 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1901 /* Inner loop uses 347 flops */
1904 if(jidx<j_index_end)
1907 /* Get j neighbor index, and coordinate index */
1908 jnrlistA = jjnr[jidx];
1909 jnrlistB = jjnr[jidx+1];
1910 jnrlistC = jjnr[jidx+2];
1911 jnrlistD = jjnr[jidx+3];
1912 /* Sign of each element will be negative for non-real atoms.
1913 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1914 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1916 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1918 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1919 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1920 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1922 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1923 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1924 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1925 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1926 j_coord_offsetA = DIM*jnrA;
1927 j_coord_offsetB = DIM*jnrB;
1928 j_coord_offsetC = DIM*jnrC;
1929 j_coord_offsetD = DIM*jnrD;
1931 /* load j atom coordinates */
1932 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1933 x+j_coord_offsetC,x+j_coord_offsetD,
1934 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1936 /* Calculate displacement vector */
1937 dx00 = _mm256_sub_pd(ix0,jx0);
1938 dy00 = _mm256_sub_pd(iy0,jy0);
1939 dz00 = _mm256_sub_pd(iz0,jz0);
1940 dx01 = _mm256_sub_pd(ix0,jx1);
1941 dy01 = _mm256_sub_pd(iy0,jy1);
1942 dz01 = _mm256_sub_pd(iz0,jz1);
1943 dx02 = _mm256_sub_pd(ix0,jx2);
1944 dy02 = _mm256_sub_pd(iy0,jy2);
1945 dz02 = _mm256_sub_pd(iz0,jz2);
1946 dx10 = _mm256_sub_pd(ix1,jx0);
1947 dy10 = _mm256_sub_pd(iy1,jy0);
1948 dz10 = _mm256_sub_pd(iz1,jz0);
1949 dx11 = _mm256_sub_pd(ix1,jx1);
1950 dy11 = _mm256_sub_pd(iy1,jy1);
1951 dz11 = _mm256_sub_pd(iz1,jz1);
1952 dx12 = _mm256_sub_pd(ix1,jx2);
1953 dy12 = _mm256_sub_pd(iy1,jy2);
1954 dz12 = _mm256_sub_pd(iz1,jz2);
1955 dx20 = _mm256_sub_pd(ix2,jx0);
1956 dy20 = _mm256_sub_pd(iy2,jy0);
1957 dz20 = _mm256_sub_pd(iz2,jz0);
1958 dx21 = _mm256_sub_pd(ix2,jx1);
1959 dy21 = _mm256_sub_pd(iy2,jy1);
1960 dz21 = _mm256_sub_pd(iz2,jz1);
1961 dx22 = _mm256_sub_pd(ix2,jx2);
1962 dy22 = _mm256_sub_pd(iy2,jy2);
1963 dz22 = _mm256_sub_pd(iz2,jz2);
1965 /* Calculate squared distance and things based on it */
1966 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1967 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1968 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1969 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1970 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1971 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1972 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1973 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1974 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1976 rinv00 = avx256_invsqrt_d(rsq00);
1977 rinv01 = avx256_invsqrt_d(rsq01);
1978 rinv02 = avx256_invsqrt_d(rsq02);
1979 rinv10 = avx256_invsqrt_d(rsq10);
1980 rinv11 = avx256_invsqrt_d(rsq11);
1981 rinv12 = avx256_invsqrt_d(rsq12);
1982 rinv20 = avx256_invsqrt_d(rsq20);
1983 rinv21 = avx256_invsqrt_d(rsq21);
1984 rinv22 = avx256_invsqrt_d(rsq22);
1986 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1987 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1988 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1989 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1990 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1991 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1992 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1993 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1994 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1996 fjx0 = _mm256_setzero_pd();
1997 fjy0 = _mm256_setzero_pd();
1998 fjz0 = _mm256_setzero_pd();
1999 fjx1 = _mm256_setzero_pd();
2000 fjy1 = _mm256_setzero_pd();
2001 fjz1 = _mm256_setzero_pd();
2002 fjx2 = _mm256_setzero_pd();
2003 fjy2 = _mm256_setzero_pd();
2004 fjz2 = _mm256_setzero_pd();
2006 /**************************
2007 * CALCULATE INTERACTIONS *
2008 **************************/
2010 r00 = _mm256_mul_pd(rsq00,rinv00);
2011 r00 = _mm256_andnot_pd(dummy_mask,r00);
2013 /* EWALD ELECTROSTATICS */
2015 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2016 ewrt = _mm256_mul_pd(r00,ewtabscale);
2017 ewitab = _mm256_cvttpd_epi32(ewrt);
2018 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2019 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2020 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2022 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2023 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2025 /* Analytical LJ-PME */
2026 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2027 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
2028 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
2029 exponent = avx256_exp_d(ewcljrsq);
2030 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2031 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2032 /* f6A = 6 * C6grid * (1 - poly) */
2033 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
2034 /* f6B = C6grid * exponent * beta^6 */
2035 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
2036 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2037 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2039 fscal = _mm256_add_pd(felec,fvdw);
2041 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2043 /* Calculate temporary vectorial force */
2044 tx = _mm256_mul_pd(fscal,dx00);
2045 ty = _mm256_mul_pd(fscal,dy00);
2046 tz = _mm256_mul_pd(fscal,dz00);
2048 /* Update vectorial force */
2049 fix0 = _mm256_add_pd(fix0,tx);
2050 fiy0 = _mm256_add_pd(fiy0,ty);
2051 fiz0 = _mm256_add_pd(fiz0,tz);
2053 fjx0 = _mm256_add_pd(fjx0,tx);
2054 fjy0 = _mm256_add_pd(fjy0,ty);
2055 fjz0 = _mm256_add_pd(fjz0,tz);
2057 /**************************
2058 * CALCULATE INTERACTIONS *
2059 **************************/
2061 r01 = _mm256_mul_pd(rsq01,rinv01);
2062 r01 = _mm256_andnot_pd(dummy_mask,r01);
2064 /* EWALD ELECTROSTATICS */
2066 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2067 ewrt = _mm256_mul_pd(r01,ewtabscale);
2068 ewitab = _mm256_cvttpd_epi32(ewrt);
2069 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2070 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2071 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2073 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2074 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2078 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2080 /* Calculate temporary vectorial force */
2081 tx = _mm256_mul_pd(fscal,dx01);
2082 ty = _mm256_mul_pd(fscal,dy01);
2083 tz = _mm256_mul_pd(fscal,dz01);
2085 /* Update vectorial force */
2086 fix0 = _mm256_add_pd(fix0,tx);
2087 fiy0 = _mm256_add_pd(fiy0,ty);
2088 fiz0 = _mm256_add_pd(fiz0,tz);
2090 fjx1 = _mm256_add_pd(fjx1,tx);
2091 fjy1 = _mm256_add_pd(fjy1,ty);
2092 fjz1 = _mm256_add_pd(fjz1,tz);
2094 /**************************
2095 * CALCULATE INTERACTIONS *
2096 **************************/
2098 r02 = _mm256_mul_pd(rsq02,rinv02);
2099 r02 = _mm256_andnot_pd(dummy_mask,r02);
2101 /* EWALD ELECTROSTATICS */
2103 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2104 ewrt = _mm256_mul_pd(r02,ewtabscale);
2105 ewitab = _mm256_cvttpd_epi32(ewrt);
2106 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2107 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2108 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2110 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2111 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2115 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2117 /* Calculate temporary vectorial force */
2118 tx = _mm256_mul_pd(fscal,dx02);
2119 ty = _mm256_mul_pd(fscal,dy02);
2120 tz = _mm256_mul_pd(fscal,dz02);
2122 /* Update vectorial force */
2123 fix0 = _mm256_add_pd(fix0,tx);
2124 fiy0 = _mm256_add_pd(fiy0,ty);
2125 fiz0 = _mm256_add_pd(fiz0,tz);
2127 fjx2 = _mm256_add_pd(fjx2,tx);
2128 fjy2 = _mm256_add_pd(fjy2,ty);
2129 fjz2 = _mm256_add_pd(fjz2,tz);
2131 /**************************
2132 * CALCULATE INTERACTIONS *
2133 **************************/
2135 r10 = _mm256_mul_pd(rsq10,rinv10);
2136 r10 = _mm256_andnot_pd(dummy_mask,r10);
2138 /* EWALD ELECTROSTATICS */
2140 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2141 ewrt = _mm256_mul_pd(r10,ewtabscale);
2142 ewitab = _mm256_cvttpd_epi32(ewrt);
2143 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2144 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2145 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2147 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2148 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2152 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2154 /* Calculate temporary vectorial force */
2155 tx = _mm256_mul_pd(fscal,dx10);
2156 ty = _mm256_mul_pd(fscal,dy10);
2157 tz = _mm256_mul_pd(fscal,dz10);
2159 /* Update vectorial force */
2160 fix1 = _mm256_add_pd(fix1,tx);
2161 fiy1 = _mm256_add_pd(fiy1,ty);
2162 fiz1 = _mm256_add_pd(fiz1,tz);
2164 fjx0 = _mm256_add_pd(fjx0,tx);
2165 fjy0 = _mm256_add_pd(fjy0,ty);
2166 fjz0 = _mm256_add_pd(fjz0,tz);
2168 /**************************
2169 * CALCULATE INTERACTIONS *
2170 **************************/
2172 r11 = _mm256_mul_pd(rsq11,rinv11);
2173 r11 = _mm256_andnot_pd(dummy_mask,r11);
2175 /* EWALD ELECTROSTATICS */
2177 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2178 ewrt = _mm256_mul_pd(r11,ewtabscale);
2179 ewitab = _mm256_cvttpd_epi32(ewrt);
2180 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2181 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2182 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2184 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2185 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2189 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2191 /* Calculate temporary vectorial force */
2192 tx = _mm256_mul_pd(fscal,dx11);
2193 ty = _mm256_mul_pd(fscal,dy11);
2194 tz = _mm256_mul_pd(fscal,dz11);
2196 /* Update vectorial force */
2197 fix1 = _mm256_add_pd(fix1,tx);
2198 fiy1 = _mm256_add_pd(fiy1,ty);
2199 fiz1 = _mm256_add_pd(fiz1,tz);
2201 fjx1 = _mm256_add_pd(fjx1,tx);
2202 fjy1 = _mm256_add_pd(fjy1,ty);
2203 fjz1 = _mm256_add_pd(fjz1,tz);
2205 /**************************
2206 * CALCULATE INTERACTIONS *
2207 **************************/
2209 r12 = _mm256_mul_pd(rsq12,rinv12);
2210 r12 = _mm256_andnot_pd(dummy_mask,r12);
2212 /* EWALD ELECTROSTATICS */
2214 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2215 ewrt = _mm256_mul_pd(r12,ewtabscale);
2216 ewitab = _mm256_cvttpd_epi32(ewrt);
2217 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2218 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2219 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2221 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2222 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2226 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2228 /* Calculate temporary vectorial force */
2229 tx = _mm256_mul_pd(fscal,dx12);
2230 ty = _mm256_mul_pd(fscal,dy12);
2231 tz = _mm256_mul_pd(fscal,dz12);
2233 /* Update vectorial force */
2234 fix1 = _mm256_add_pd(fix1,tx);
2235 fiy1 = _mm256_add_pd(fiy1,ty);
2236 fiz1 = _mm256_add_pd(fiz1,tz);
2238 fjx2 = _mm256_add_pd(fjx2,tx);
2239 fjy2 = _mm256_add_pd(fjy2,ty);
2240 fjz2 = _mm256_add_pd(fjz2,tz);
2242 /**************************
2243 * CALCULATE INTERACTIONS *
2244 **************************/
2246 r20 = _mm256_mul_pd(rsq20,rinv20);
2247 r20 = _mm256_andnot_pd(dummy_mask,r20);
2249 /* EWALD ELECTROSTATICS */
2251 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2252 ewrt = _mm256_mul_pd(r20,ewtabscale);
2253 ewitab = _mm256_cvttpd_epi32(ewrt);
2254 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2255 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2256 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2258 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2259 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2263 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2265 /* Calculate temporary vectorial force */
2266 tx = _mm256_mul_pd(fscal,dx20);
2267 ty = _mm256_mul_pd(fscal,dy20);
2268 tz = _mm256_mul_pd(fscal,dz20);
2270 /* Update vectorial force */
2271 fix2 = _mm256_add_pd(fix2,tx);
2272 fiy2 = _mm256_add_pd(fiy2,ty);
2273 fiz2 = _mm256_add_pd(fiz2,tz);
2275 fjx0 = _mm256_add_pd(fjx0,tx);
2276 fjy0 = _mm256_add_pd(fjy0,ty);
2277 fjz0 = _mm256_add_pd(fjz0,tz);
2279 /**************************
2280 * CALCULATE INTERACTIONS *
2281 **************************/
2283 r21 = _mm256_mul_pd(rsq21,rinv21);
2284 r21 = _mm256_andnot_pd(dummy_mask,r21);
2286 /* EWALD ELECTROSTATICS */
2288 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2289 ewrt = _mm256_mul_pd(r21,ewtabscale);
2290 ewitab = _mm256_cvttpd_epi32(ewrt);
2291 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2292 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2293 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2295 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2296 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2300 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2302 /* Calculate temporary vectorial force */
2303 tx = _mm256_mul_pd(fscal,dx21);
2304 ty = _mm256_mul_pd(fscal,dy21);
2305 tz = _mm256_mul_pd(fscal,dz21);
2307 /* Update vectorial force */
2308 fix2 = _mm256_add_pd(fix2,tx);
2309 fiy2 = _mm256_add_pd(fiy2,ty);
2310 fiz2 = _mm256_add_pd(fiz2,tz);
2312 fjx1 = _mm256_add_pd(fjx1,tx);
2313 fjy1 = _mm256_add_pd(fjy1,ty);
2314 fjz1 = _mm256_add_pd(fjz1,tz);
2316 /**************************
2317 * CALCULATE INTERACTIONS *
2318 **************************/
2320 r22 = _mm256_mul_pd(rsq22,rinv22);
2321 r22 = _mm256_andnot_pd(dummy_mask,r22);
2323 /* EWALD ELECTROSTATICS */
2325 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2326 ewrt = _mm256_mul_pd(r22,ewtabscale);
2327 ewitab = _mm256_cvttpd_epi32(ewrt);
2328 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2329 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2330 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2332 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2333 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2337 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2339 /* Calculate temporary vectorial force */
2340 tx = _mm256_mul_pd(fscal,dx22);
2341 ty = _mm256_mul_pd(fscal,dy22);
2342 tz = _mm256_mul_pd(fscal,dz22);
2344 /* Update vectorial force */
2345 fix2 = _mm256_add_pd(fix2,tx);
2346 fiy2 = _mm256_add_pd(fiy2,ty);
2347 fiz2 = _mm256_add_pd(fiz2,tz);
2349 fjx2 = _mm256_add_pd(fjx2,tx);
2350 fjy2 = _mm256_add_pd(fjy2,ty);
2351 fjz2 = _mm256_add_pd(fjz2,tz);
2353 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2354 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2355 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2356 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2358 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2359 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2361 /* Inner loop uses 356 flops */
2364 /* End of innermost loop */
2366 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2367 f+i_coord_offset,fshift+i_shift_offset);
2369 /* Increment number of inner iterations */
2370 inneriter += j_index_end - j_index_start;
2372 /* Outer loop uses 18 flops */
2375 /* Increment number of outer iterations */
2378 /* Update outer/inner flops */
2380 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*356);