2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 real * vdwioffsetptr1;
86 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
107 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
111 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
113 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
116 __m256d dummy_mask,cutoff_mask;
117 __m128 tmpmask0,tmpmask1;
118 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
119 __m256d one = _mm256_set1_pd(1.0);
120 __m256d two = _mm256_set1_pd(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm256_set1_pd(fr->ic->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
136 vdwtype = mdatoms->typeA;
138 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
139 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
140 beta2 = _mm256_mul_pd(beta,beta);
141 beta3 = _mm256_mul_pd(beta,beta2);
143 ewtab = fr->ic->tabq_coul_FDV0;
144 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
145 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
147 /* Setup water-specific parameters */
148 inr = nlist->iinr[0];
149 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
150 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
151 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
152 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
154 jq0 = _mm256_set1_pd(charge[inr+0]);
155 jq1 = _mm256_set1_pd(charge[inr+1]);
156 jq2 = _mm256_set1_pd(charge[inr+2]);
157 vdwjidx0A = 2*vdwtype[inr+0];
158 qq00 = _mm256_mul_pd(iq0,jq0);
159 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
160 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
161 qq01 = _mm256_mul_pd(iq0,jq1);
162 qq02 = _mm256_mul_pd(iq0,jq2);
163 qq10 = _mm256_mul_pd(iq1,jq0);
164 qq11 = _mm256_mul_pd(iq1,jq1);
165 qq12 = _mm256_mul_pd(iq1,jq2);
166 qq20 = _mm256_mul_pd(iq2,jq0);
167 qq21 = _mm256_mul_pd(iq2,jq1);
168 qq22 = _mm256_mul_pd(iq2,jq2);
170 /* Avoid stupid compiler warnings */
171 jnrA = jnrB = jnrC = jnrD = 0;
180 for(iidx=0;iidx<4*DIM;iidx++)
185 /* Start outer loop over neighborlists */
186 for(iidx=0; iidx<nri; iidx++)
188 /* Load shift vector for this list */
189 i_shift_offset = DIM*shiftidx[iidx];
191 /* Load limits for loop over neighbors */
192 j_index_start = jindex[iidx];
193 j_index_end = jindex[iidx+1];
195 /* Get outer coordinate index */
197 i_coord_offset = DIM*inr;
199 /* Load i particle coords and add shift vector */
200 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
201 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
203 fix0 = _mm256_setzero_pd();
204 fiy0 = _mm256_setzero_pd();
205 fiz0 = _mm256_setzero_pd();
206 fix1 = _mm256_setzero_pd();
207 fiy1 = _mm256_setzero_pd();
208 fiz1 = _mm256_setzero_pd();
209 fix2 = _mm256_setzero_pd();
210 fiy2 = _mm256_setzero_pd();
211 fiz2 = _mm256_setzero_pd();
213 /* Reset potential sums */
214 velecsum = _mm256_setzero_pd();
215 vvdwsum = _mm256_setzero_pd();
217 /* Start inner kernel loop */
218 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
221 /* Get j neighbor index, and coordinate index */
226 j_coord_offsetA = DIM*jnrA;
227 j_coord_offsetB = DIM*jnrB;
228 j_coord_offsetC = DIM*jnrC;
229 j_coord_offsetD = DIM*jnrD;
231 /* load j atom coordinates */
232 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
233 x+j_coord_offsetC,x+j_coord_offsetD,
234 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
236 /* Calculate displacement vector */
237 dx00 = _mm256_sub_pd(ix0,jx0);
238 dy00 = _mm256_sub_pd(iy0,jy0);
239 dz00 = _mm256_sub_pd(iz0,jz0);
240 dx01 = _mm256_sub_pd(ix0,jx1);
241 dy01 = _mm256_sub_pd(iy0,jy1);
242 dz01 = _mm256_sub_pd(iz0,jz1);
243 dx02 = _mm256_sub_pd(ix0,jx2);
244 dy02 = _mm256_sub_pd(iy0,jy2);
245 dz02 = _mm256_sub_pd(iz0,jz2);
246 dx10 = _mm256_sub_pd(ix1,jx0);
247 dy10 = _mm256_sub_pd(iy1,jy0);
248 dz10 = _mm256_sub_pd(iz1,jz0);
249 dx11 = _mm256_sub_pd(ix1,jx1);
250 dy11 = _mm256_sub_pd(iy1,jy1);
251 dz11 = _mm256_sub_pd(iz1,jz1);
252 dx12 = _mm256_sub_pd(ix1,jx2);
253 dy12 = _mm256_sub_pd(iy1,jy2);
254 dz12 = _mm256_sub_pd(iz1,jz2);
255 dx20 = _mm256_sub_pd(ix2,jx0);
256 dy20 = _mm256_sub_pd(iy2,jy0);
257 dz20 = _mm256_sub_pd(iz2,jz0);
258 dx21 = _mm256_sub_pd(ix2,jx1);
259 dy21 = _mm256_sub_pd(iy2,jy1);
260 dz21 = _mm256_sub_pd(iz2,jz1);
261 dx22 = _mm256_sub_pd(ix2,jx2);
262 dy22 = _mm256_sub_pd(iy2,jy2);
263 dz22 = _mm256_sub_pd(iz2,jz2);
265 /* Calculate squared distance and things based on it */
266 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
267 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
268 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
269 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
270 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
271 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
272 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
273 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
274 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
276 rinv00 = avx256_invsqrt_d(rsq00);
277 rinv01 = avx256_invsqrt_d(rsq01);
278 rinv02 = avx256_invsqrt_d(rsq02);
279 rinv10 = avx256_invsqrt_d(rsq10);
280 rinv11 = avx256_invsqrt_d(rsq11);
281 rinv12 = avx256_invsqrt_d(rsq12);
282 rinv20 = avx256_invsqrt_d(rsq20);
283 rinv21 = avx256_invsqrt_d(rsq21);
284 rinv22 = avx256_invsqrt_d(rsq22);
286 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
287 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
288 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
289 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
290 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
291 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
292 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
293 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
294 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
296 fjx0 = _mm256_setzero_pd();
297 fjy0 = _mm256_setzero_pd();
298 fjz0 = _mm256_setzero_pd();
299 fjx1 = _mm256_setzero_pd();
300 fjy1 = _mm256_setzero_pd();
301 fjz1 = _mm256_setzero_pd();
302 fjx2 = _mm256_setzero_pd();
303 fjy2 = _mm256_setzero_pd();
304 fjz2 = _mm256_setzero_pd();
306 /**************************
307 * CALCULATE INTERACTIONS *
308 **************************/
310 r00 = _mm256_mul_pd(rsq00,rinv00);
312 /* EWALD ELECTROSTATICS */
314 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
315 ewrt = _mm256_mul_pd(r00,ewtabscale);
316 ewitab = _mm256_cvttpd_epi32(ewrt);
317 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
318 ewitab = _mm_slli_epi32(ewitab,2);
319 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
320 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
321 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
322 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
323 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
324 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
325 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
326 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
327 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
329 /* LENNARD-JONES DISPERSION/REPULSION */
331 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
332 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
333 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
334 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
335 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
337 /* Update potential sum for this i atom from the interaction with this j atom. */
338 velecsum = _mm256_add_pd(velecsum,velec);
339 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
341 fscal = _mm256_add_pd(felec,fvdw);
343 /* Calculate temporary vectorial force */
344 tx = _mm256_mul_pd(fscal,dx00);
345 ty = _mm256_mul_pd(fscal,dy00);
346 tz = _mm256_mul_pd(fscal,dz00);
348 /* Update vectorial force */
349 fix0 = _mm256_add_pd(fix0,tx);
350 fiy0 = _mm256_add_pd(fiy0,ty);
351 fiz0 = _mm256_add_pd(fiz0,tz);
353 fjx0 = _mm256_add_pd(fjx0,tx);
354 fjy0 = _mm256_add_pd(fjy0,ty);
355 fjz0 = _mm256_add_pd(fjz0,tz);
357 /**************************
358 * CALCULATE INTERACTIONS *
359 **************************/
361 r01 = _mm256_mul_pd(rsq01,rinv01);
363 /* EWALD ELECTROSTATICS */
365 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
366 ewrt = _mm256_mul_pd(r01,ewtabscale);
367 ewitab = _mm256_cvttpd_epi32(ewrt);
368 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
369 ewitab = _mm_slli_epi32(ewitab,2);
370 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
371 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
372 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
373 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
374 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
375 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
376 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
377 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
378 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
380 /* Update potential sum for this i atom from the interaction with this j atom. */
381 velecsum = _mm256_add_pd(velecsum,velec);
385 /* Calculate temporary vectorial force */
386 tx = _mm256_mul_pd(fscal,dx01);
387 ty = _mm256_mul_pd(fscal,dy01);
388 tz = _mm256_mul_pd(fscal,dz01);
390 /* Update vectorial force */
391 fix0 = _mm256_add_pd(fix0,tx);
392 fiy0 = _mm256_add_pd(fiy0,ty);
393 fiz0 = _mm256_add_pd(fiz0,tz);
395 fjx1 = _mm256_add_pd(fjx1,tx);
396 fjy1 = _mm256_add_pd(fjy1,ty);
397 fjz1 = _mm256_add_pd(fjz1,tz);
399 /**************************
400 * CALCULATE INTERACTIONS *
401 **************************/
403 r02 = _mm256_mul_pd(rsq02,rinv02);
405 /* EWALD ELECTROSTATICS */
407 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
408 ewrt = _mm256_mul_pd(r02,ewtabscale);
409 ewitab = _mm256_cvttpd_epi32(ewrt);
410 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
411 ewitab = _mm_slli_epi32(ewitab,2);
412 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
413 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
414 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
415 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
416 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
417 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
418 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
419 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
420 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velecsum = _mm256_add_pd(velecsum,velec);
427 /* Calculate temporary vectorial force */
428 tx = _mm256_mul_pd(fscal,dx02);
429 ty = _mm256_mul_pd(fscal,dy02);
430 tz = _mm256_mul_pd(fscal,dz02);
432 /* Update vectorial force */
433 fix0 = _mm256_add_pd(fix0,tx);
434 fiy0 = _mm256_add_pd(fiy0,ty);
435 fiz0 = _mm256_add_pd(fiz0,tz);
437 fjx2 = _mm256_add_pd(fjx2,tx);
438 fjy2 = _mm256_add_pd(fjy2,ty);
439 fjz2 = _mm256_add_pd(fjz2,tz);
441 /**************************
442 * CALCULATE INTERACTIONS *
443 **************************/
445 r10 = _mm256_mul_pd(rsq10,rinv10);
447 /* EWALD ELECTROSTATICS */
449 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
450 ewrt = _mm256_mul_pd(r10,ewtabscale);
451 ewitab = _mm256_cvttpd_epi32(ewrt);
452 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
453 ewitab = _mm_slli_epi32(ewitab,2);
454 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
455 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
456 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
457 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
458 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
459 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
460 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
461 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
462 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
464 /* Update potential sum for this i atom from the interaction with this j atom. */
465 velecsum = _mm256_add_pd(velecsum,velec);
469 /* Calculate temporary vectorial force */
470 tx = _mm256_mul_pd(fscal,dx10);
471 ty = _mm256_mul_pd(fscal,dy10);
472 tz = _mm256_mul_pd(fscal,dz10);
474 /* Update vectorial force */
475 fix1 = _mm256_add_pd(fix1,tx);
476 fiy1 = _mm256_add_pd(fiy1,ty);
477 fiz1 = _mm256_add_pd(fiz1,tz);
479 fjx0 = _mm256_add_pd(fjx0,tx);
480 fjy0 = _mm256_add_pd(fjy0,ty);
481 fjz0 = _mm256_add_pd(fjz0,tz);
483 /**************************
484 * CALCULATE INTERACTIONS *
485 **************************/
487 r11 = _mm256_mul_pd(rsq11,rinv11);
489 /* EWALD ELECTROSTATICS */
491 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
492 ewrt = _mm256_mul_pd(r11,ewtabscale);
493 ewitab = _mm256_cvttpd_epi32(ewrt);
494 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
495 ewitab = _mm_slli_epi32(ewitab,2);
496 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
497 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
498 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
499 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
500 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
501 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
502 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
503 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
504 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
506 /* Update potential sum for this i atom from the interaction with this j atom. */
507 velecsum = _mm256_add_pd(velecsum,velec);
511 /* Calculate temporary vectorial force */
512 tx = _mm256_mul_pd(fscal,dx11);
513 ty = _mm256_mul_pd(fscal,dy11);
514 tz = _mm256_mul_pd(fscal,dz11);
516 /* Update vectorial force */
517 fix1 = _mm256_add_pd(fix1,tx);
518 fiy1 = _mm256_add_pd(fiy1,ty);
519 fiz1 = _mm256_add_pd(fiz1,tz);
521 fjx1 = _mm256_add_pd(fjx1,tx);
522 fjy1 = _mm256_add_pd(fjy1,ty);
523 fjz1 = _mm256_add_pd(fjz1,tz);
525 /**************************
526 * CALCULATE INTERACTIONS *
527 **************************/
529 r12 = _mm256_mul_pd(rsq12,rinv12);
531 /* EWALD ELECTROSTATICS */
533 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
534 ewrt = _mm256_mul_pd(r12,ewtabscale);
535 ewitab = _mm256_cvttpd_epi32(ewrt);
536 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
537 ewitab = _mm_slli_epi32(ewitab,2);
538 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
539 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
540 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
541 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
542 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
543 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
544 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
545 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
546 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velecsum = _mm256_add_pd(velecsum,velec);
553 /* Calculate temporary vectorial force */
554 tx = _mm256_mul_pd(fscal,dx12);
555 ty = _mm256_mul_pd(fscal,dy12);
556 tz = _mm256_mul_pd(fscal,dz12);
558 /* Update vectorial force */
559 fix1 = _mm256_add_pd(fix1,tx);
560 fiy1 = _mm256_add_pd(fiy1,ty);
561 fiz1 = _mm256_add_pd(fiz1,tz);
563 fjx2 = _mm256_add_pd(fjx2,tx);
564 fjy2 = _mm256_add_pd(fjy2,ty);
565 fjz2 = _mm256_add_pd(fjz2,tz);
567 /**************************
568 * CALCULATE INTERACTIONS *
569 **************************/
571 r20 = _mm256_mul_pd(rsq20,rinv20);
573 /* EWALD ELECTROSTATICS */
575 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
576 ewrt = _mm256_mul_pd(r20,ewtabscale);
577 ewitab = _mm256_cvttpd_epi32(ewrt);
578 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
579 ewitab = _mm_slli_epi32(ewitab,2);
580 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
581 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
582 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
583 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
584 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
585 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
586 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
587 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
588 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
590 /* Update potential sum for this i atom from the interaction with this j atom. */
591 velecsum = _mm256_add_pd(velecsum,velec);
595 /* Calculate temporary vectorial force */
596 tx = _mm256_mul_pd(fscal,dx20);
597 ty = _mm256_mul_pd(fscal,dy20);
598 tz = _mm256_mul_pd(fscal,dz20);
600 /* Update vectorial force */
601 fix2 = _mm256_add_pd(fix2,tx);
602 fiy2 = _mm256_add_pd(fiy2,ty);
603 fiz2 = _mm256_add_pd(fiz2,tz);
605 fjx0 = _mm256_add_pd(fjx0,tx);
606 fjy0 = _mm256_add_pd(fjy0,ty);
607 fjz0 = _mm256_add_pd(fjz0,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 r21 = _mm256_mul_pd(rsq21,rinv21);
615 /* EWALD ELECTROSTATICS */
617 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
618 ewrt = _mm256_mul_pd(r21,ewtabscale);
619 ewitab = _mm256_cvttpd_epi32(ewrt);
620 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
621 ewitab = _mm_slli_epi32(ewitab,2);
622 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
623 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
624 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
625 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
626 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
627 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
628 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
629 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
630 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
632 /* Update potential sum for this i atom from the interaction with this j atom. */
633 velecsum = _mm256_add_pd(velecsum,velec);
637 /* Calculate temporary vectorial force */
638 tx = _mm256_mul_pd(fscal,dx21);
639 ty = _mm256_mul_pd(fscal,dy21);
640 tz = _mm256_mul_pd(fscal,dz21);
642 /* Update vectorial force */
643 fix2 = _mm256_add_pd(fix2,tx);
644 fiy2 = _mm256_add_pd(fiy2,ty);
645 fiz2 = _mm256_add_pd(fiz2,tz);
647 fjx1 = _mm256_add_pd(fjx1,tx);
648 fjy1 = _mm256_add_pd(fjy1,ty);
649 fjz1 = _mm256_add_pd(fjz1,tz);
651 /**************************
652 * CALCULATE INTERACTIONS *
653 **************************/
655 r22 = _mm256_mul_pd(rsq22,rinv22);
657 /* EWALD ELECTROSTATICS */
659 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
660 ewrt = _mm256_mul_pd(r22,ewtabscale);
661 ewitab = _mm256_cvttpd_epi32(ewrt);
662 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
663 ewitab = _mm_slli_epi32(ewitab,2);
664 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
665 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
666 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
667 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
668 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
669 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
670 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
671 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
672 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
674 /* Update potential sum for this i atom from the interaction with this j atom. */
675 velecsum = _mm256_add_pd(velecsum,velec);
679 /* Calculate temporary vectorial force */
680 tx = _mm256_mul_pd(fscal,dx22);
681 ty = _mm256_mul_pd(fscal,dy22);
682 tz = _mm256_mul_pd(fscal,dz22);
684 /* Update vectorial force */
685 fix2 = _mm256_add_pd(fix2,tx);
686 fiy2 = _mm256_add_pd(fiy2,ty);
687 fiz2 = _mm256_add_pd(fiz2,tz);
689 fjx2 = _mm256_add_pd(fjx2,tx);
690 fjy2 = _mm256_add_pd(fjy2,ty);
691 fjz2 = _mm256_add_pd(fjz2,tz);
693 fjptrA = f+j_coord_offsetA;
694 fjptrB = f+j_coord_offsetB;
695 fjptrC = f+j_coord_offsetC;
696 fjptrD = f+j_coord_offsetD;
698 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
699 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
701 /* Inner loop uses 381 flops */
707 /* Get j neighbor index, and coordinate index */
708 jnrlistA = jjnr[jidx];
709 jnrlistB = jjnr[jidx+1];
710 jnrlistC = jjnr[jidx+2];
711 jnrlistD = jjnr[jidx+3];
712 /* Sign of each element will be negative for non-real atoms.
713 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
714 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
716 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
718 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
719 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
720 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
722 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
723 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
724 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
725 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
726 j_coord_offsetA = DIM*jnrA;
727 j_coord_offsetB = DIM*jnrB;
728 j_coord_offsetC = DIM*jnrC;
729 j_coord_offsetD = DIM*jnrD;
731 /* load j atom coordinates */
732 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
733 x+j_coord_offsetC,x+j_coord_offsetD,
734 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
736 /* Calculate displacement vector */
737 dx00 = _mm256_sub_pd(ix0,jx0);
738 dy00 = _mm256_sub_pd(iy0,jy0);
739 dz00 = _mm256_sub_pd(iz0,jz0);
740 dx01 = _mm256_sub_pd(ix0,jx1);
741 dy01 = _mm256_sub_pd(iy0,jy1);
742 dz01 = _mm256_sub_pd(iz0,jz1);
743 dx02 = _mm256_sub_pd(ix0,jx2);
744 dy02 = _mm256_sub_pd(iy0,jy2);
745 dz02 = _mm256_sub_pd(iz0,jz2);
746 dx10 = _mm256_sub_pd(ix1,jx0);
747 dy10 = _mm256_sub_pd(iy1,jy0);
748 dz10 = _mm256_sub_pd(iz1,jz0);
749 dx11 = _mm256_sub_pd(ix1,jx1);
750 dy11 = _mm256_sub_pd(iy1,jy1);
751 dz11 = _mm256_sub_pd(iz1,jz1);
752 dx12 = _mm256_sub_pd(ix1,jx2);
753 dy12 = _mm256_sub_pd(iy1,jy2);
754 dz12 = _mm256_sub_pd(iz1,jz2);
755 dx20 = _mm256_sub_pd(ix2,jx0);
756 dy20 = _mm256_sub_pd(iy2,jy0);
757 dz20 = _mm256_sub_pd(iz2,jz0);
758 dx21 = _mm256_sub_pd(ix2,jx1);
759 dy21 = _mm256_sub_pd(iy2,jy1);
760 dz21 = _mm256_sub_pd(iz2,jz1);
761 dx22 = _mm256_sub_pd(ix2,jx2);
762 dy22 = _mm256_sub_pd(iy2,jy2);
763 dz22 = _mm256_sub_pd(iz2,jz2);
765 /* Calculate squared distance and things based on it */
766 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
767 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
768 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
769 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
770 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
771 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
772 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
773 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
774 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
776 rinv00 = avx256_invsqrt_d(rsq00);
777 rinv01 = avx256_invsqrt_d(rsq01);
778 rinv02 = avx256_invsqrt_d(rsq02);
779 rinv10 = avx256_invsqrt_d(rsq10);
780 rinv11 = avx256_invsqrt_d(rsq11);
781 rinv12 = avx256_invsqrt_d(rsq12);
782 rinv20 = avx256_invsqrt_d(rsq20);
783 rinv21 = avx256_invsqrt_d(rsq21);
784 rinv22 = avx256_invsqrt_d(rsq22);
786 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
787 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
788 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
789 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
790 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
791 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
792 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
793 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
794 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
796 fjx0 = _mm256_setzero_pd();
797 fjy0 = _mm256_setzero_pd();
798 fjz0 = _mm256_setzero_pd();
799 fjx1 = _mm256_setzero_pd();
800 fjy1 = _mm256_setzero_pd();
801 fjz1 = _mm256_setzero_pd();
802 fjx2 = _mm256_setzero_pd();
803 fjy2 = _mm256_setzero_pd();
804 fjz2 = _mm256_setzero_pd();
806 /**************************
807 * CALCULATE INTERACTIONS *
808 **************************/
810 r00 = _mm256_mul_pd(rsq00,rinv00);
811 r00 = _mm256_andnot_pd(dummy_mask,r00);
813 /* EWALD ELECTROSTATICS */
815 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
816 ewrt = _mm256_mul_pd(r00,ewtabscale);
817 ewitab = _mm256_cvttpd_epi32(ewrt);
818 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
819 ewitab = _mm_slli_epi32(ewitab,2);
820 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
821 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
822 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
823 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
824 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
825 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
826 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
827 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
828 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
830 /* LENNARD-JONES DISPERSION/REPULSION */
832 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
833 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
834 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
835 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
836 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
838 /* Update potential sum for this i atom from the interaction with this j atom. */
839 velec = _mm256_andnot_pd(dummy_mask,velec);
840 velecsum = _mm256_add_pd(velecsum,velec);
841 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
842 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
844 fscal = _mm256_add_pd(felec,fvdw);
846 fscal = _mm256_andnot_pd(dummy_mask,fscal);
848 /* Calculate temporary vectorial force */
849 tx = _mm256_mul_pd(fscal,dx00);
850 ty = _mm256_mul_pd(fscal,dy00);
851 tz = _mm256_mul_pd(fscal,dz00);
853 /* Update vectorial force */
854 fix0 = _mm256_add_pd(fix0,tx);
855 fiy0 = _mm256_add_pd(fiy0,ty);
856 fiz0 = _mm256_add_pd(fiz0,tz);
858 fjx0 = _mm256_add_pd(fjx0,tx);
859 fjy0 = _mm256_add_pd(fjy0,ty);
860 fjz0 = _mm256_add_pd(fjz0,tz);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 r01 = _mm256_mul_pd(rsq01,rinv01);
867 r01 = _mm256_andnot_pd(dummy_mask,r01);
869 /* EWALD ELECTROSTATICS */
871 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
872 ewrt = _mm256_mul_pd(r01,ewtabscale);
873 ewitab = _mm256_cvttpd_epi32(ewrt);
874 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
875 ewitab = _mm_slli_epi32(ewitab,2);
876 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
877 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
878 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
879 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
880 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
881 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
882 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
883 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
884 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
886 /* Update potential sum for this i atom from the interaction with this j atom. */
887 velec = _mm256_andnot_pd(dummy_mask,velec);
888 velecsum = _mm256_add_pd(velecsum,velec);
892 fscal = _mm256_andnot_pd(dummy_mask,fscal);
894 /* Calculate temporary vectorial force */
895 tx = _mm256_mul_pd(fscal,dx01);
896 ty = _mm256_mul_pd(fscal,dy01);
897 tz = _mm256_mul_pd(fscal,dz01);
899 /* Update vectorial force */
900 fix0 = _mm256_add_pd(fix0,tx);
901 fiy0 = _mm256_add_pd(fiy0,ty);
902 fiz0 = _mm256_add_pd(fiz0,tz);
904 fjx1 = _mm256_add_pd(fjx1,tx);
905 fjy1 = _mm256_add_pd(fjy1,ty);
906 fjz1 = _mm256_add_pd(fjz1,tz);
908 /**************************
909 * CALCULATE INTERACTIONS *
910 **************************/
912 r02 = _mm256_mul_pd(rsq02,rinv02);
913 r02 = _mm256_andnot_pd(dummy_mask,r02);
915 /* EWALD ELECTROSTATICS */
917 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
918 ewrt = _mm256_mul_pd(r02,ewtabscale);
919 ewitab = _mm256_cvttpd_epi32(ewrt);
920 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
921 ewitab = _mm_slli_epi32(ewitab,2);
922 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
923 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
924 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
925 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
926 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
927 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
928 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
929 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
930 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
932 /* Update potential sum for this i atom from the interaction with this j atom. */
933 velec = _mm256_andnot_pd(dummy_mask,velec);
934 velecsum = _mm256_add_pd(velecsum,velec);
938 fscal = _mm256_andnot_pd(dummy_mask,fscal);
940 /* Calculate temporary vectorial force */
941 tx = _mm256_mul_pd(fscal,dx02);
942 ty = _mm256_mul_pd(fscal,dy02);
943 tz = _mm256_mul_pd(fscal,dz02);
945 /* Update vectorial force */
946 fix0 = _mm256_add_pd(fix0,tx);
947 fiy0 = _mm256_add_pd(fiy0,ty);
948 fiz0 = _mm256_add_pd(fiz0,tz);
950 fjx2 = _mm256_add_pd(fjx2,tx);
951 fjy2 = _mm256_add_pd(fjy2,ty);
952 fjz2 = _mm256_add_pd(fjz2,tz);
954 /**************************
955 * CALCULATE INTERACTIONS *
956 **************************/
958 r10 = _mm256_mul_pd(rsq10,rinv10);
959 r10 = _mm256_andnot_pd(dummy_mask,r10);
961 /* EWALD ELECTROSTATICS */
963 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
964 ewrt = _mm256_mul_pd(r10,ewtabscale);
965 ewitab = _mm256_cvttpd_epi32(ewrt);
966 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
967 ewitab = _mm_slli_epi32(ewitab,2);
968 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
969 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
970 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
971 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
972 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
973 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
974 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
975 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
976 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
978 /* Update potential sum for this i atom from the interaction with this j atom. */
979 velec = _mm256_andnot_pd(dummy_mask,velec);
980 velecsum = _mm256_add_pd(velecsum,velec);
984 fscal = _mm256_andnot_pd(dummy_mask,fscal);
986 /* Calculate temporary vectorial force */
987 tx = _mm256_mul_pd(fscal,dx10);
988 ty = _mm256_mul_pd(fscal,dy10);
989 tz = _mm256_mul_pd(fscal,dz10);
991 /* Update vectorial force */
992 fix1 = _mm256_add_pd(fix1,tx);
993 fiy1 = _mm256_add_pd(fiy1,ty);
994 fiz1 = _mm256_add_pd(fiz1,tz);
996 fjx0 = _mm256_add_pd(fjx0,tx);
997 fjy0 = _mm256_add_pd(fjy0,ty);
998 fjz0 = _mm256_add_pd(fjz0,tz);
1000 /**************************
1001 * CALCULATE INTERACTIONS *
1002 **************************/
1004 r11 = _mm256_mul_pd(rsq11,rinv11);
1005 r11 = _mm256_andnot_pd(dummy_mask,r11);
1007 /* EWALD ELECTROSTATICS */
1009 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1010 ewrt = _mm256_mul_pd(r11,ewtabscale);
1011 ewitab = _mm256_cvttpd_epi32(ewrt);
1012 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1013 ewitab = _mm_slli_epi32(ewitab,2);
1014 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1015 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1016 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1017 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1018 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1019 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1020 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1021 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1022 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1024 /* Update potential sum for this i atom from the interaction with this j atom. */
1025 velec = _mm256_andnot_pd(dummy_mask,velec);
1026 velecsum = _mm256_add_pd(velecsum,velec);
1030 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1032 /* Calculate temporary vectorial force */
1033 tx = _mm256_mul_pd(fscal,dx11);
1034 ty = _mm256_mul_pd(fscal,dy11);
1035 tz = _mm256_mul_pd(fscal,dz11);
1037 /* Update vectorial force */
1038 fix1 = _mm256_add_pd(fix1,tx);
1039 fiy1 = _mm256_add_pd(fiy1,ty);
1040 fiz1 = _mm256_add_pd(fiz1,tz);
1042 fjx1 = _mm256_add_pd(fjx1,tx);
1043 fjy1 = _mm256_add_pd(fjy1,ty);
1044 fjz1 = _mm256_add_pd(fjz1,tz);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 r12 = _mm256_mul_pd(rsq12,rinv12);
1051 r12 = _mm256_andnot_pd(dummy_mask,r12);
1053 /* EWALD ELECTROSTATICS */
1055 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1056 ewrt = _mm256_mul_pd(r12,ewtabscale);
1057 ewitab = _mm256_cvttpd_epi32(ewrt);
1058 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1059 ewitab = _mm_slli_epi32(ewitab,2);
1060 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1061 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1062 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1063 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1064 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1065 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1066 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1067 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1068 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1070 /* Update potential sum for this i atom from the interaction with this j atom. */
1071 velec = _mm256_andnot_pd(dummy_mask,velec);
1072 velecsum = _mm256_add_pd(velecsum,velec);
1076 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1078 /* Calculate temporary vectorial force */
1079 tx = _mm256_mul_pd(fscal,dx12);
1080 ty = _mm256_mul_pd(fscal,dy12);
1081 tz = _mm256_mul_pd(fscal,dz12);
1083 /* Update vectorial force */
1084 fix1 = _mm256_add_pd(fix1,tx);
1085 fiy1 = _mm256_add_pd(fiy1,ty);
1086 fiz1 = _mm256_add_pd(fiz1,tz);
1088 fjx2 = _mm256_add_pd(fjx2,tx);
1089 fjy2 = _mm256_add_pd(fjy2,ty);
1090 fjz2 = _mm256_add_pd(fjz2,tz);
1092 /**************************
1093 * CALCULATE INTERACTIONS *
1094 **************************/
1096 r20 = _mm256_mul_pd(rsq20,rinv20);
1097 r20 = _mm256_andnot_pd(dummy_mask,r20);
1099 /* EWALD ELECTROSTATICS */
1101 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1102 ewrt = _mm256_mul_pd(r20,ewtabscale);
1103 ewitab = _mm256_cvttpd_epi32(ewrt);
1104 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1105 ewitab = _mm_slli_epi32(ewitab,2);
1106 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1107 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1108 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1109 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1110 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1111 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1112 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1113 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1114 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1116 /* Update potential sum for this i atom from the interaction with this j atom. */
1117 velec = _mm256_andnot_pd(dummy_mask,velec);
1118 velecsum = _mm256_add_pd(velecsum,velec);
1122 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1124 /* Calculate temporary vectorial force */
1125 tx = _mm256_mul_pd(fscal,dx20);
1126 ty = _mm256_mul_pd(fscal,dy20);
1127 tz = _mm256_mul_pd(fscal,dz20);
1129 /* Update vectorial force */
1130 fix2 = _mm256_add_pd(fix2,tx);
1131 fiy2 = _mm256_add_pd(fiy2,ty);
1132 fiz2 = _mm256_add_pd(fiz2,tz);
1134 fjx0 = _mm256_add_pd(fjx0,tx);
1135 fjy0 = _mm256_add_pd(fjy0,ty);
1136 fjz0 = _mm256_add_pd(fjz0,tz);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 r21 = _mm256_mul_pd(rsq21,rinv21);
1143 r21 = _mm256_andnot_pd(dummy_mask,r21);
1145 /* EWALD ELECTROSTATICS */
1147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1148 ewrt = _mm256_mul_pd(r21,ewtabscale);
1149 ewitab = _mm256_cvttpd_epi32(ewrt);
1150 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1151 ewitab = _mm_slli_epi32(ewitab,2);
1152 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1153 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1154 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1155 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1156 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1157 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1158 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1159 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1160 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1162 /* Update potential sum for this i atom from the interaction with this j atom. */
1163 velec = _mm256_andnot_pd(dummy_mask,velec);
1164 velecsum = _mm256_add_pd(velecsum,velec);
1168 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1170 /* Calculate temporary vectorial force */
1171 tx = _mm256_mul_pd(fscal,dx21);
1172 ty = _mm256_mul_pd(fscal,dy21);
1173 tz = _mm256_mul_pd(fscal,dz21);
1175 /* Update vectorial force */
1176 fix2 = _mm256_add_pd(fix2,tx);
1177 fiy2 = _mm256_add_pd(fiy2,ty);
1178 fiz2 = _mm256_add_pd(fiz2,tz);
1180 fjx1 = _mm256_add_pd(fjx1,tx);
1181 fjy1 = _mm256_add_pd(fjy1,ty);
1182 fjz1 = _mm256_add_pd(fjz1,tz);
1184 /**************************
1185 * CALCULATE INTERACTIONS *
1186 **************************/
1188 r22 = _mm256_mul_pd(rsq22,rinv22);
1189 r22 = _mm256_andnot_pd(dummy_mask,r22);
1191 /* EWALD ELECTROSTATICS */
1193 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1194 ewrt = _mm256_mul_pd(r22,ewtabscale);
1195 ewitab = _mm256_cvttpd_epi32(ewrt);
1196 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1197 ewitab = _mm_slli_epi32(ewitab,2);
1198 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1199 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1200 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1201 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1202 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1203 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1204 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1205 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1206 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1208 /* Update potential sum for this i atom from the interaction with this j atom. */
1209 velec = _mm256_andnot_pd(dummy_mask,velec);
1210 velecsum = _mm256_add_pd(velecsum,velec);
1214 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1216 /* Calculate temporary vectorial force */
1217 tx = _mm256_mul_pd(fscal,dx22);
1218 ty = _mm256_mul_pd(fscal,dy22);
1219 tz = _mm256_mul_pd(fscal,dz22);
1221 /* Update vectorial force */
1222 fix2 = _mm256_add_pd(fix2,tx);
1223 fiy2 = _mm256_add_pd(fiy2,ty);
1224 fiz2 = _mm256_add_pd(fiz2,tz);
1226 fjx2 = _mm256_add_pd(fjx2,tx);
1227 fjy2 = _mm256_add_pd(fjy2,ty);
1228 fjz2 = _mm256_add_pd(fjz2,tz);
1230 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1231 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1232 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1233 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1235 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1236 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1238 /* Inner loop uses 390 flops */
1241 /* End of innermost loop */
1243 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1244 f+i_coord_offset,fshift+i_shift_offset);
1247 /* Update potential energies */
1248 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1249 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1251 /* Increment number of inner iterations */
1252 inneriter += j_index_end - j_index_start;
1254 /* Outer loop uses 20 flops */
1257 /* Increment number of outer iterations */
1260 /* Update outer/inner flops */
1262 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
1265 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double
1266 * Electrostatics interaction: Ewald
1267 * VdW interaction: LennardJones
1268 * Geometry: Water3-Water3
1269 * Calculate force/pot: Force
1272 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double
1273 (t_nblist * gmx_restrict nlist,
1274 rvec * gmx_restrict xx,
1275 rvec * gmx_restrict ff,
1276 struct t_forcerec * gmx_restrict fr,
1277 t_mdatoms * gmx_restrict mdatoms,
1278 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1279 t_nrnb * gmx_restrict nrnb)
1281 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1282 * just 0 for non-waters.
1283 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1284 * jnr indices corresponding to data put in the four positions in the SIMD register.
1286 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1287 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1288 int jnrA,jnrB,jnrC,jnrD;
1289 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1290 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1291 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1292 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1293 real rcutoff_scalar;
1294 real *shiftvec,*fshift,*x,*f;
1295 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1296 real scratch[4*DIM];
1297 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1298 real * vdwioffsetptr0;
1299 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1300 real * vdwioffsetptr1;
1301 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1302 real * vdwioffsetptr2;
1303 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1304 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1305 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1306 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1307 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1308 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1309 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1310 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1311 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1312 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1313 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1314 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1315 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1316 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1317 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1318 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1319 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1322 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1325 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1326 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1328 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1329 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1331 __m256d dummy_mask,cutoff_mask;
1332 __m128 tmpmask0,tmpmask1;
1333 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1334 __m256d one = _mm256_set1_pd(1.0);
1335 __m256d two = _mm256_set1_pd(2.0);
1341 jindex = nlist->jindex;
1343 shiftidx = nlist->shift;
1345 shiftvec = fr->shift_vec[0];
1346 fshift = fr->fshift[0];
1347 facel = _mm256_set1_pd(fr->ic->epsfac);
1348 charge = mdatoms->chargeA;
1349 nvdwtype = fr->ntype;
1350 vdwparam = fr->nbfp;
1351 vdwtype = mdatoms->typeA;
1353 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1354 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1355 beta2 = _mm256_mul_pd(beta,beta);
1356 beta3 = _mm256_mul_pd(beta,beta2);
1358 ewtab = fr->ic->tabq_coul_F;
1359 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1360 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1362 /* Setup water-specific parameters */
1363 inr = nlist->iinr[0];
1364 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1365 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1366 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1367 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1369 jq0 = _mm256_set1_pd(charge[inr+0]);
1370 jq1 = _mm256_set1_pd(charge[inr+1]);
1371 jq2 = _mm256_set1_pd(charge[inr+2]);
1372 vdwjidx0A = 2*vdwtype[inr+0];
1373 qq00 = _mm256_mul_pd(iq0,jq0);
1374 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1375 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1376 qq01 = _mm256_mul_pd(iq0,jq1);
1377 qq02 = _mm256_mul_pd(iq0,jq2);
1378 qq10 = _mm256_mul_pd(iq1,jq0);
1379 qq11 = _mm256_mul_pd(iq1,jq1);
1380 qq12 = _mm256_mul_pd(iq1,jq2);
1381 qq20 = _mm256_mul_pd(iq2,jq0);
1382 qq21 = _mm256_mul_pd(iq2,jq1);
1383 qq22 = _mm256_mul_pd(iq2,jq2);
1385 /* Avoid stupid compiler warnings */
1386 jnrA = jnrB = jnrC = jnrD = 0;
1387 j_coord_offsetA = 0;
1388 j_coord_offsetB = 0;
1389 j_coord_offsetC = 0;
1390 j_coord_offsetD = 0;
1395 for(iidx=0;iidx<4*DIM;iidx++)
1397 scratch[iidx] = 0.0;
1400 /* Start outer loop over neighborlists */
1401 for(iidx=0; iidx<nri; iidx++)
1403 /* Load shift vector for this list */
1404 i_shift_offset = DIM*shiftidx[iidx];
1406 /* Load limits for loop over neighbors */
1407 j_index_start = jindex[iidx];
1408 j_index_end = jindex[iidx+1];
1410 /* Get outer coordinate index */
1412 i_coord_offset = DIM*inr;
1414 /* Load i particle coords and add shift vector */
1415 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1416 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1418 fix0 = _mm256_setzero_pd();
1419 fiy0 = _mm256_setzero_pd();
1420 fiz0 = _mm256_setzero_pd();
1421 fix1 = _mm256_setzero_pd();
1422 fiy1 = _mm256_setzero_pd();
1423 fiz1 = _mm256_setzero_pd();
1424 fix2 = _mm256_setzero_pd();
1425 fiy2 = _mm256_setzero_pd();
1426 fiz2 = _mm256_setzero_pd();
1428 /* Start inner kernel loop */
1429 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1432 /* Get j neighbor index, and coordinate index */
1434 jnrB = jjnr[jidx+1];
1435 jnrC = jjnr[jidx+2];
1436 jnrD = jjnr[jidx+3];
1437 j_coord_offsetA = DIM*jnrA;
1438 j_coord_offsetB = DIM*jnrB;
1439 j_coord_offsetC = DIM*jnrC;
1440 j_coord_offsetD = DIM*jnrD;
1442 /* load j atom coordinates */
1443 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1444 x+j_coord_offsetC,x+j_coord_offsetD,
1445 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1447 /* Calculate displacement vector */
1448 dx00 = _mm256_sub_pd(ix0,jx0);
1449 dy00 = _mm256_sub_pd(iy0,jy0);
1450 dz00 = _mm256_sub_pd(iz0,jz0);
1451 dx01 = _mm256_sub_pd(ix0,jx1);
1452 dy01 = _mm256_sub_pd(iy0,jy1);
1453 dz01 = _mm256_sub_pd(iz0,jz1);
1454 dx02 = _mm256_sub_pd(ix0,jx2);
1455 dy02 = _mm256_sub_pd(iy0,jy2);
1456 dz02 = _mm256_sub_pd(iz0,jz2);
1457 dx10 = _mm256_sub_pd(ix1,jx0);
1458 dy10 = _mm256_sub_pd(iy1,jy0);
1459 dz10 = _mm256_sub_pd(iz1,jz0);
1460 dx11 = _mm256_sub_pd(ix1,jx1);
1461 dy11 = _mm256_sub_pd(iy1,jy1);
1462 dz11 = _mm256_sub_pd(iz1,jz1);
1463 dx12 = _mm256_sub_pd(ix1,jx2);
1464 dy12 = _mm256_sub_pd(iy1,jy2);
1465 dz12 = _mm256_sub_pd(iz1,jz2);
1466 dx20 = _mm256_sub_pd(ix2,jx0);
1467 dy20 = _mm256_sub_pd(iy2,jy0);
1468 dz20 = _mm256_sub_pd(iz2,jz0);
1469 dx21 = _mm256_sub_pd(ix2,jx1);
1470 dy21 = _mm256_sub_pd(iy2,jy1);
1471 dz21 = _mm256_sub_pd(iz2,jz1);
1472 dx22 = _mm256_sub_pd(ix2,jx2);
1473 dy22 = _mm256_sub_pd(iy2,jy2);
1474 dz22 = _mm256_sub_pd(iz2,jz2);
1476 /* Calculate squared distance and things based on it */
1477 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1478 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1479 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1480 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1481 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1482 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1483 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1484 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1485 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1487 rinv00 = avx256_invsqrt_d(rsq00);
1488 rinv01 = avx256_invsqrt_d(rsq01);
1489 rinv02 = avx256_invsqrt_d(rsq02);
1490 rinv10 = avx256_invsqrt_d(rsq10);
1491 rinv11 = avx256_invsqrt_d(rsq11);
1492 rinv12 = avx256_invsqrt_d(rsq12);
1493 rinv20 = avx256_invsqrt_d(rsq20);
1494 rinv21 = avx256_invsqrt_d(rsq21);
1495 rinv22 = avx256_invsqrt_d(rsq22);
1497 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1498 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1499 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1500 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1501 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1502 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1503 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1504 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1505 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1507 fjx0 = _mm256_setzero_pd();
1508 fjy0 = _mm256_setzero_pd();
1509 fjz0 = _mm256_setzero_pd();
1510 fjx1 = _mm256_setzero_pd();
1511 fjy1 = _mm256_setzero_pd();
1512 fjz1 = _mm256_setzero_pd();
1513 fjx2 = _mm256_setzero_pd();
1514 fjy2 = _mm256_setzero_pd();
1515 fjz2 = _mm256_setzero_pd();
1517 /**************************
1518 * CALCULATE INTERACTIONS *
1519 **************************/
1521 r00 = _mm256_mul_pd(rsq00,rinv00);
1523 /* EWALD ELECTROSTATICS */
1525 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1526 ewrt = _mm256_mul_pd(r00,ewtabscale);
1527 ewitab = _mm256_cvttpd_epi32(ewrt);
1528 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1529 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1530 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1532 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1533 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1535 /* LENNARD-JONES DISPERSION/REPULSION */
1537 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1538 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1540 fscal = _mm256_add_pd(felec,fvdw);
1542 /* Calculate temporary vectorial force */
1543 tx = _mm256_mul_pd(fscal,dx00);
1544 ty = _mm256_mul_pd(fscal,dy00);
1545 tz = _mm256_mul_pd(fscal,dz00);
1547 /* Update vectorial force */
1548 fix0 = _mm256_add_pd(fix0,tx);
1549 fiy0 = _mm256_add_pd(fiy0,ty);
1550 fiz0 = _mm256_add_pd(fiz0,tz);
1552 fjx0 = _mm256_add_pd(fjx0,tx);
1553 fjy0 = _mm256_add_pd(fjy0,ty);
1554 fjz0 = _mm256_add_pd(fjz0,tz);
1556 /**************************
1557 * CALCULATE INTERACTIONS *
1558 **************************/
1560 r01 = _mm256_mul_pd(rsq01,rinv01);
1562 /* EWALD ELECTROSTATICS */
1564 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1565 ewrt = _mm256_mul_pd(r01,ewtabscale);
1566 ewitab = _mm256_cvttpd_epi32(ewrt);
1567 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1568 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1569 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1571 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1572 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1576 /* Calculate temporary vectorial force */
1577 tx = _mm256_mul_pd(fscal,dx01);
1578 ty = _mm256_mul_pd(fscal,dy01);
1579 tz = _mm256_mul_pd(fscal,dz01);
1581 /* Update vectorial force */
1582 fix0 = _mm256_add_pd(fix0,tx);
1583 fiy0 = _mm256_add_pd(fiy0,ty);
1584 fiz0 = _mm256_add_pd(fiz0,tz);
1586 fjx1 = _mm256_add_pd(fjx1,tx);
1587 fjy1 = _mm256_add_pd(fjy1,ty);
1588 fjz1 = _mm256_add_pd(fjz1,tz);
1590 /**************************
1591 * CALCULATE INTERACTIONS *
1592 **************************/
1594 r02 = _mm256_mul_pd(rsq02,rinv02);
1596 /* EWALD ELECTROSTATICS */
1598 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1599 ewrt = _mm256_mul_pd(r02,ewtabscale);
1600 ewitab = _mm256_cvttpd_epi32(ewrt);
1601 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1602 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1603 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1605 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1606 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1610 /* Calculate temporary vectorial force */
1611 tx = _mm256_mul_pd(fscal,dx02);
1612 ty = _mm256_mul_pd(fscal,dy02);
1613 tz = _mm256_mul_pd(fscal,dz02);
1615 /* Update vectorial force */
1616 fix0 = _mm256_add_pd(fix0,tx);
1617 fiy0 = _mm256_add_pd(fiy0,ty);
1618 fiz0 = _mm256_add_pd(fiz0,tz);
1620 fjx2 = _mm256_add_pd(fjx2,tx);
1621 fjy2 = _mm256_add_pd(fjy2,ty);
1622 fjz2 = _mm256_add_pd(fjz2,tz);
1624 /**************************
1625 * CALCULATE INTERACTIONS *
1626 **************************/
1628 r10 = _mm256_mul_pd(rsq10,rinv10);
1630 /* EWALD ELECTROSTATICS */
1632 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1633 ewrt = _mm256_mul_pd(r10,ewtabscale);
1634 ewitab = _mm256_cvttpd_epi32(ewrt);
1635 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1636 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1637 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1639 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1640 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1644 /* Calculate temporary vectorial force */
1645 tx = _mm256_mul_pd(fscal,dx10);
1646 ty = _mm256_mul_pd(fscal,dy10);
1647 tz = _mm256_mul_pd(fscal,dz10);
1649 /* Update vectorial force */
1650 fix1 = _mm256_add_pd(fix1,tx);
1651 fiy1 = _mm256_add_pd(fiy1,ty);
1652 fiz1 = _mm256_add_pd(fiz1,tz);
1654 fjx0 = _mm256_add_pd(fjx0,tx);
1655 fjy0 = _mm256_add_pd(fjy0,ty);
1656 fjz0 = _mm256_add_pd(fjz0,tz);
1658 /**************************
1659 * CALCULATE INTERACTIONS *
1660 **************************/
1662 r11 = _mm256_mul_pd(rsq11,rinv11);
1664 /* EWALD ELECTROSTATICS */
1666 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1667 ewrt = _mm256_mul_pd(r11,ewtabscale);
1668 ewitab = _mm256_cvttpd_epi32(ewrt);
1669 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1670 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1671 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1673 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1674 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1678 /* Calculate temporary vectorial force */
1679 tx = _mm256_mul_pd(fscal,dx11);
1680 ty = _mm256_mul_pd(fscal,dy11);
1681 tz = _mm256_mul_pd(fscal,dz11);
1683 /* Update vectorial force */
1684 fix1 = _mm256_add_pd(fix1,tx);
1685 fiy1 = _mm256_add_pd(fiy1,ty);
1686 fiz1 = _mm256_add_pd(fiz1,tz);
1688 fjx1 = _mm256_add_pd(fjx1,tx);
1689 fjy1 = _mm256_add_pd(fjy1,ty);
1690 fjz1 = _mm256_add_pd(fjz1,tz);
1692 /**************************
1693 * CALCULATE INTERACTIONS *
1694 **************************/
1696 r12 = _mm256_mul_pd(rsq12,rinv12);
1698 /* EWALD ELECTROSTATICS */
1700 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1701 ewrt = _mm256_mul_pd(r12,ewtabscale);
1702 ewitab = _mm256_cvttpd_epi32(ewrt);
1703 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1704 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1705 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1707 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1708 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1712 /* Calculate temporary vectorial force */
1713 tx = _mm256_mul_pd(fscal,dx12);
1714 ty = _mm256_mul_pd(fscal,dy12);
1715 tz = _mm256_mul_pd(fscal,dz12);
1717 /* Update vectorial force */
1718 fix1 = _mm256_add_pd(fix1,tx);
1719 fiy1 = _mm256_add_pd(fiy1,ty);
1720 fiz1 = _mm256_add_pd(fiz1,tz);
1722 fjx2 = _mm256_add_pd(fjx2,tx);
1723 fjy2 = _mm256_add_pd(fjy2,ty);
1724 fjz2 = _mm256_add_pd(fjz2,tz);
1726 /**************************
1727 * CALCULATE INTERACTIONS *
1728 **************************/
1730 r20 = _mm256_mul_pd(rsq20,rinv20);
1732 /* EWALD ELECTROSTATICS */
1734 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1735 ewrt = _mm256_mul_pd(r20,ewtabscale);
1736 ewitab = _mm256_cvttpd_epi32(ewrt);
1737 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1738 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1739 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1741 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1742 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1746 /* Calculate temporary vectorial force */
1747 tx = _mm256_mul_pd(fscal,dx20);
1748 ty = _mm256_mul_pd(fscal,dy20);
1749 tz = _mm256_mul_pd(fscal,dz20);
1751 /* Update vectorial force */
1752 fix2 = _mm256_add_pd(fix2,tx);
1753 fiy2 = _mm256_add_pd(fiy2,ty);
1754 fiz2 = _mm256_add_pd(fiz2,tz);
1756 fjx0 = _mm256_add_pd(fjx0,tx);
1757 fjy0 = _mm256_add_pd(fjy0,ty);
1758 fjz0 = _mm256_add_pd(fjz0,tz);
1760 /**************************
1761 * CALCULATE INTERACTIONS *
1762 **************************/
1764 r21 = _mm256_mul_pd(rsq21,rinv21);
1766 /* EWALD ELECTROSTATICS */
1768 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1769 ewrt = _mm256_mul_pd(r21,ewtabscale);
1770 ewitab = _mm256_cvttpd_epi32(ewrt);
1771 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1772 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1773 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1775 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1776 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1780 /* Calculate temporary vectorial force */
1781 tx = _mm256_mul_pd(fscal,dx21);
1782 ty = _mm256_mul_pd(fscal,dy21);
1783 tz = _mm256_mul_pd(fscal,dz21);
1785 /* Update vectorial force */
1786 fix2 = _mm256_add_pd(fix2,tx);
1787 fiy2 = _mm256_add_pd(fiy2,ty);
1788 fiz2 = _mm256_add_pd(fiz2,tz);
1790 fjx1 = _mm256_add_pd(fjx1,tx);
1791 fjy1 = _mm256_add_pd(fjy1,ty);
1792 fjz1 = _mm256_add_pd(fjz1,tz);
1794 /**************************
1795 * CALCULATE INTERACTIONS *
1796 **************************/
1798 r22 = _mm256_mul_pd(rsq22,rinv22);
1800 /* EWALD ELECTROSTATICS */
1802 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1803 ewrt = _mm256_mul_pd(r22,ewtabscale);
1804 ewitab = _mm256_cvttpd_epi32(ewrt);
1805 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1806 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1807 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1809 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1810 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1814 /* Calculate temporary vectorial force */
1815 tx = _mm256_mul_pd(fscal,dx22);
1816 ty = _mm256_mul_pd(fscal,dy22);
1817 tz = _mm256_mul_pd(fscal,dz22);
1819 /* Update vectorial force */
1820 fix2 = _mm256_add_pd(fix2,tx);
1821 fiy2 = _mm256_add_pd(fiy2,ty);
1822 fiz2 = _mm256_add_pd(fiz2,tz);
1824 fjx2 = _mm256_add_pd(fjx2,tx);
1825 fjy2 = _mm256_add_pd(fjy2,ty);
1826 fjz2 = _mm256_add_pd(fjz2,tz);
1828 fjptrA = f+j_coord_offsetA;
1829 fjptrB = f+j_coord_offsetB;
1830 fjptrC = f+j_coord_offsetC;
1831 fjptrD = f+j_coord_offsetD;
1833 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1834 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1836 /* Inner loop uses 331 flops */
1839 if(jidx<j_index_end)
1842 /* Get j neighbor index, and coordinate index */
1843 jnrlistA = jjnr[jidx];
1844 jnrlistB = jjnr[jidx+1];
1845 jnrlistC = jjnr[jidx+2];
1846 jnrlistD = jjnr[jidx+3];
1847 /* Sign of each element will be negative for non-real atoms.
1848 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1849 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1851 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1853 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1854 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1855 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1857 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1858 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1859 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1860 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1861 j_coord_offsetA = DIM*jnrA;
1862 j_coord_offsetB = DIM*jnrB;
1863 j_coord_offsetC = DIM*jnrC;
1864 j_coord_offsetD = DIM*jnrD;
1866 /* load j atom coordinates */
1867 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1868 x+j_coord_offsetC,x+j_coord_offsetD,
1869 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1871 /* Calculate displacement vector */
1872 dx00 = _mm256_sub_pd(ix0,jx0);
1873 dy00 = _mm256_sub_pd(iy0,jy0);
1874 dz00 = _mm256_sub_pd(iz0,jz0);
1875 dx01 = _mm256_sub_pd(ix0,jx1);
1876 dy01 = _mm256_sub_pd(iy0,jy1);
1877 dz01 = _mm256_sub_pd(iz0,jz1);
1878 dx02 = _mm256_sub_pd(ix0,jx2);
1879 dy02 = _mm256_sub_pd(iy0,jy2);
1880 dz02 = _mm256_sub_pd(iz0,jz2);
1881 dx10 = _mm256_sub_pd(ix1,jx0);
1882 dy10 = _mm256_sub_pd(iy1,jy0);
1883 dz10 = _mm256_sub_pd(iz1,jz0);
1884 dx11 = _mm256_sub_pd(ix1,jx1);
1885 dy11 = _mm256_sub_pd(iy1,jy1);
1886 dz11 = _mm256_sub_pd(iz1,jz1);
1887 dx12 = _mm256_sub_pd(ix1,jx2);
1888 dy12 = _mm256_sub_pd(iy1,jy2);
1889 dz12 = _mm256_sub_pd(iz1,jz2);
1890 dx20 = _mm256_sub_pd(ix2,jx0);
1891 dy20 = _mm256_sub_pd(iy2,jy0);
1892 dz20 = _mm256_sub_pd(iz2,jz0);
1893 dx21 = _mm256_sub_pd(ix2,jx1);
1894 dy21 = _mm256_sub_pd(iy2,jy1);
1895 dz21 = _mm256_sub_pd(iz2,jz1);
1896 dx22 = _mm256_sub_pd(ix2,jx2);
1897 dy22 = _mm256_sub_pd(iy2,jy2);
1898 dz22 = _mm256_sub_pd(iz2,jz2);
1900 /* Calculate squared distance and things based on it */
1901 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1902 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1903 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1904 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1905 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1906 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1907 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1908 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1909 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1911 rinv00 = avx256_invsqrt_d(rsq00);
1912 rinv01 = avx256_invsqrt_d(rsq01);
1913 rinv02 = avx256_invsqrt_d(rsq02);
1914 rinv10 = avx256_invsqrt_d(rsq10);
1915 rinv11 = avx256_invsqrt_d(rsq11);
1916 rinv12 = avx256_invsqrt_d(rsq12);
1917 rinv20 = avx256_invsqrt_d(rsq20);
1918 rinv21 = avx256_invsqrt_d(rsq21);
1919 rinv22 = avx256_invsqrt_d(rsq22);
1921 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1922 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1923 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1924 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1925 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1926 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1927 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1928 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1929 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1931 fjx0 = _mm256_setzero_pd();
1932 fjy0 = _mm256_setzero_pd();
1933 fjz0 = _mm256_setzero_pd();
1934 fjx1 = _mm256_setzero_pd();
1935 fjy1 = _mm256_setzero_pd();
1936 fjz1 = _mm256_setzero_pd();
1937 fjx2 = _mm256_setzero_pd();
1938 fjy2 = _mm256_setzero_pd();
1939 fjz2 = _mm256_setzero_pd();
1941 /**************************
1942 * CALCULATE INTERACTIONS *
1943 **************************/
1945 r00 = _mm256_mul_pd(rsq00,rinv00);
1946 r00 = _mm256_andnot_pd(dummy_mask,r00);
1948 /* EWALD ELECTROSTATICS */
1950 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1951 ewrt = _mm256_mul_pd(r00,ewtabscale);
1952 ewitab = _mm256_cvttpd_epi32(ewrt);
1953 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1954 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1955 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1957 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1958 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1960 /* LENNARD-JONES DISPERSION/REPULSION */
1962 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1963 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1965 fscal = _mm256_add_pd(felec,fvdw);
1967 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1969 /* Calculate temporary vectorial force */
1970 tx = _mm256_mul_pd(fscal,dx00);
1971 ty = _mm256_mul_pd(fscal,dy00);
1972 tz = _mm256_mul_pd(fscal,dz00);
1974 /* Update vectorial force */
1975 fix0 = _mm256_add_pd(fix0,tx);
1976 fiy0 = _mm256_add_pd(fiy0,ty);
1977 fiz0 = _mm256_add_pd(fiz0,tz);
1979 fjx0 = _mm256_add_pd(fjx0,tx);
1980 fjy0 = _mm256_add_pd(fjy0,ty);
1981 fjz0 = _mm256_add_pd(fjz0,tz);
1983 /**************************
1984 * CALCULATE INTERACTIONS *
1985 **************************/
1987 r01 = _mm256_mul_pd(rsq01,rinv01);
1988 r01 = _mm256_andnot_pd(dummy_mask,r01);
1990 /* EWALD ELECTROSTATICS */
1992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993 ewrt = _mm256_mul_pd(r01,ewtabscale);
1994 ewitab = _mm256_cvttpd_epi32(ewrt);
1995 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1996 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1997 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1999 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2000 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2004 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2006 /* Calculate temporary vectorial force */
2007 tx = _mm256_mul_pd(fscal,dx01);
2008 ty = _mm256_mul_pd(fscal,dy01);
2009 tz = _mm256_mul_pd(fscal,dz01);
2011 /* Update vectorial force */
2012 fix0 = _mm256_add_pd(fix0,tx);
2013 fiy0 = _mm256_add_pd(fiy0,ty);
2014 fiz0 = _mm256_add_pd(fiz0,tz);
2016 fjx1 = _mm256_add_pd(fjx1,tx);
2017 fjy1 = _mm256_add_pd(fjy1,ty);
2018 fjz1 = _mm256_add_pd(fjz1,tz);
2020 /**************************
2021 * CALCULATE INTERACTIONS *
2022 **************************/
2024 r02 = _mm256_mul_pd(rsq02,rinv02);
2025 r02 = _mm256_andnot_pd(dummy_mask,r02);
2027 /* EWALD ELECTROSTATICS */
2029 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2030 ewrt = _mm256_mul_pd(r02,ewtabscale);
2031 ewitab = _mm256_cvttpd_epi32(ewrt);
2032 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2033 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2034 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2036 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2037 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2041 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2043 /* Calculate temporary vectorial force */
2044 tx = _mm256_mul_pd(fscal,dx02);
2045 ty = _mm256_mul_pd(fscal,dy02);
2046 tz = _mm256_mul_pd(fscal,dz02);
2048 /* Update vectorial force */
2049 fix0 = _mm256_add_pd(fix0,tx);
2050 fiy0 = _mm256_add_pd(fiy0,ty);
2051 fiz0 = _mm256_add_pd(fiz0,tz);
2053 fjx2 = _mm256_add_pd(fjx2,tx);
2054 fjy2 = _mm256_add_pd(fjy2,ty);
2055 fjz2 = _mm256_add_pd(fjz2,tz);
2057 /**************************
2058 * CALCULATE INTERACTIONS *
2059 **************************/
2061 r10 = _mm256_mul_pd(rsq10,rinv10);
2062 r10 = _mm256_andnot_pd(dummy_mask,r10);
2064 /* EWALD ELECTROSTATICS */
2066 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2067 ewrt = _mm256_mul_pd(r10,ewtabscale);
2068 ewitab = _mm256_cvttpd_epi32(ewrt);
2069 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2070 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2071 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2073 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2074 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2078 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2080 /* Calculate temporary vectorial force */
2081 tx = _mm256_mul_pd(fscal,dx10);
2082 ty = _mm256_mul_pd(fscal,dy10);
2083 tz = _mm256_mul_pd(fscal,dz10);
2085 /* Update vectorial force */
2086 fix1 = _mm256_add_pd(fix1,tx);
2087 fiy1 = _mm256_add_pd(fiy1,ty);
2088 fiz1 = _mm256_add_pd(fiz1,tz);
2090 fjx0 = _mm256_add_pd(fjx0,tx);
2091 fjy0 = _mm256_add_pd(fjy0,ty);
2092 fjz0 = _mm256_add_pd(fjz0,tz);
2094 /**************************
2095 * CALCULATE INTERACTIONS *
2096 **************************/
2098 r11 = _mm256_mul_pd(rsq11,rinv11);
2099 r11 = _mm256_andnot_pd(dummy_mask,r11);
2101 /* EWALD ELECTROSTATICS */
2103 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2104 ewrt = _mm256_mul_pd(r11,ewtabscale);
2105 ewitab = _mm256_cvttpd_epi32(ewrt);
2106 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2107 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2108 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2110 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2111 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2115 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2117 /* Calculate temporary vectorial force */
2118 tx = _mm256_mul_pd(fscal,dx11);
2119 ty = _mm256_mul_pd(fscal,dy11);
2120 tz = _mm256_mul_pd(fscal,dz11);
2122 /* Update vectorial force */
2123 fix1 = _mm256_add_pd(fix1,tx);
2124 fiy1 = _mm256_add_pd(fiy1,ty);
2125 fiz1 = _mm256_add_pd(fiz1,tz);
2127 fjx1 = _mm256_add_pd(fjx1,tx);
2128 fjy1 = _mm256_add_pd(fjy1,ty);
2129 fjz1 = _mm256_add_pd(fjz1,tz);
2131 /**************************
2132 * CALCULATE INTERACTIONS *
2133 **************************/
2135 r12 = _mm256_mul_pd(rsq12,rinv12);
2136 r12 = _mm256_andnot_pd(dummy_mask,r12);
2138 /* EWALD ELECTROSTATICS */
2140 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2141 ewrt = _mm256_mul_pd(r12,ewtabscale);
2142 ewitab = _mm256_cvttpd_epi32(ewrt);
2143 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2144 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2145 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2147 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2148 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2152 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2154 /* Calculate temporary vectorial force */
2155 tx = _mm256_mul_pd(fscal,dx12);
2156 ty = _mm256_mul_pd(fscal,dy12);
2157 tz = _mm256_mul_pd(fscal,dz12);
2159 /* Update vectorial force */
2160 fix1 = _mm256_add_pd(fix1,tx);
2161 fiy1 = _mm256_add_pd(fiy1,ty);
2162 fiz1 = _mm256_add_pd(fiz1,tz);
2164 fjx2 = _mm256_add_pd(fjx2,tx);
2165 fjy2 = _mm256_add_pd(fjy2,ty);
2166 fjz2 = _mm256_add_pd(fjz2,tz);
2168 /**************************
2169 * CALCULATE INTERACTIONS *
2170 **************************/
2172 r20 = _mm256_mul_pd(rsq20,rinv20);
2173 r20 = _mm256_andnot_pd(dummy_mask,r20);
2175 /* EWALD ELECTROSTATICS */
2177 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2178 ewrt = _mm256_mul_pd(r20,ewtabscale);
2179 ewitab = _mm256_cvttpd_epi32(ewrt);
2180 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2181 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2182 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2184 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2185 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2189 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2191 /* Calculate temporary vectorial force */
2192 tx = _mm256_mul_pd(fscal,dx20);
2193 ty = _mm256_mul_pd(fscal,dy20);
2194 tz = _mm256_mul_pd(fscal,dz20);
2196 /* Update vectorial force */
2197 fix2 = _mm256_add_pd(fix2,tx);
2198 fiy2 = _mm256_add_pd(fiy2,ty);
2199 fiz2 = _mm256_add_pd(fiz2,tz);
2201 fjx0 = _mm256_add_pd(fjx0,tx);
2202 fjy0 = _mm256_add_pd(fjy0,ty);
2203 fjz0 = _mm256_add_pd(fjz0,tz);
2205 /**************************
2206 * CALCULATE INTERACTIONS *
2207 **************************/
2209 r21 = _mm256_mul_pd(rsq21,rinv21);
2210 r21 = _mm256_andnot_pd(dummy_mask,r21);
2212 /* EWALD ELECTROSTATICS */
2214 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2215 ewrt = _mm256_mul_pd(r21,ewtabscale);
2216 ewitab = _mm256_cvttpd_epi32(ewrt);
2217 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2218 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2219 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2221 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2222 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2226 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2228 /* Calculate temporary vectorial force */
2229 tx = _mm256_mul_pd(fscal,dx21);
2230 ty = _mm256_mul_pd(fscal,dy21);
2231 tz = _mm256_mul_pd(fscal,dz21);
2233 /* Update vectorial force */
2234 fix2 = _mm256_add_pd(fix2,tx);
2235 fiy2 = _mm256_add_pd(fiy2,ty);
2236 fiz2 = _mm256_add_pd(fiz2,tz);
2238 fjx1 = _mm256_add_pd(fjx1,tx);
2239 fjy1 = _mm256_add_pd(fjy1,ty);
2240 fjz1 = _mm256_add_pd(fjz1,tz);
2242 /**************************
2243 * CALCULATE INTERACTIONS *
2244 **************************/
2246 r22 = _mm256_mul_pd(rsq22,rinv22);
2247 r22 = _mm256_andnot_pd(dummy_mask,r22);
2249 /* EWALD ELECTROSTATICS */
2251 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2252 ewrt = _mm256_mul_pd(r22,ewtabscale);
2253 ewitab = _mm256_cvttpd_epi32(ewrt);
2254 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2255 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2256 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2258 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2259 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2263 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2265 /* Calculate temporary vectorial force */
2266 tx = _mm256_mul_pd(fscal,dx22);
2267 ty = _mm256_mul_pd(fscal,dy22);
2268 tz = _mm256_mul_pd(fscal,dz22);
2270 /* Update vectorial force */
2271 fix2 = _mm256_add_pd(fix2,tx);
2272 fiy2 = _mm256_add_pd(fiy2,ty);
2273 fiz2 = _mm256_add_pd(fiz2,tz);
2275 fjx2 = _mm256_add_pd(fjx2,tx);
2276 fjy2 = _mm256_add_pd(fjy2,ty);
2277 fjz2 = _mm256_add_pd(fjz2,tz);
2279 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2280 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2281 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2282 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2284 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2285 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2287 /* Inner loop uses 340 flops */
2290 /* End of innermost loop */
2292 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2293 f+i_coord_offset,fshift+i_shift_offset);
2295 /* Increment number of inner iterations */
2296 inneriter += j_index_end - j_index_start;
2298 /* Outer loop uses 18 flops */
2301 /* Increment number of outer iterations */
2304 /* Update outer/inner flops */
2306 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);