2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
98 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
100 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
101 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
103 __m256d dummy_mask,cutoff_mask;
104 __m128 tmpmask0,tmpmask1;
105 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
106 __m256d one = _mm256_set1_pd(1.0);
107 __m256d two = _mm256_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm256_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
126 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
127 beta2 = _mm256_mul_pd(beta,beta);
128 beta3 = _mm256_mul_pd(beta,beta2);
130 ewtab = fr->ic->tabq_coul_FDV0;
131 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
132 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
134 /* Setup water-specific parameters */
135 inr = nlist->iinr[0];
136 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
137 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
138 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
139 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
141 jq0 = _mm256_set1_pd(charge[inr+0]);
142 jq1 = _mm256_set1_pd(charge[inr+1]);
143 jq2 = _mm256_set1_pd(charge[inr+2]);
144 vdwjidx0A = 2*vdwtype[inr+0];
145 qq00 = _mm256_mul_pd(iq0,jq0);
146 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
147 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
148 qq01 = _mm256_mul_pd(iq0,jq1);
149 qq02 = _mm256_mul_pd(iq0,jq2);
150 qq10 = _mm256_mul_pd(iq1,jq0);
151 qq11 = _mm256_mul_pd(iq1,jq1);
152 qq12 = _mm256_mul_pd(iq1,jq2);
153 qq20 = _mm256_mul_pd(iq2,jq0);
154 qq21 = _mm256_mul_pd(iq2,jq1);
155 qq22 = _mm256_mul_pd(iq2,jq2);
157 /* Avoid stupid compiler warnings */
158 jnrA = jnrB = jnrC = jnrD = 0;
167 for(iidx=0;iidx<4*DIM;iidx++)
172 /* Start outer loop over neighborlists */
173 for(iidx=0; iidx<nri; iidx++)
175 /* Load shift vector for this list */
176 i_shift_offset = DIM*shiftidx[iidx];
178 /* Load limits for loop over neighbors */
179 j_index_start = jindex[iidx];
180 j_index_end = jindex[iidx+1];
182 /* Get outer coordinate index */
184 i_coord_offset = DIM*inr;
186 /* Load i particle coords and add shift vector */
187 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
188 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
190 fix0 = _mm256_setzero_pd();
191 fiy0 = _mm256_setzero_pd();
192 fiz0 = _mm256_setzero_pd();
193 fix1 = _mm256_setzero_pd();
194 fiy1 = _mm256_setzero_pd();
195 fiz1 = _mm256_setzero_pd();
196 fix2 = _mm256_setzero_pd();
197 fiy2 = _mm256_setzero_pd();
198 fiz2 = _mm256_setzero_pd();
200 /* Reset potential sums */
201 velecsum = _mm256_setzero_pd();
202 vvdwsum = _mm256_setzero_pd();
204 /* Start inner kernel loop */
205 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
208 /* Get j neighbor index, and coordinate index */
213 j_coord_offsetA = DIM*jnrA;
214 j_coord_offsetB = DIM*jnrB;
215 j_coord_offsetC = DIM*jnrC;
216 j_coord_offsetD = DIM*jnrD;
218 /* load j atom coordinates */
219 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
220 x+j_coord_offsetC,x+j_coord_offsetD,
221 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
223 /* Calculate displacement vector */
224 dx00 = _mm256_sub_pd(ix0,jx0);
225 dy00 = _mm256_sub_pd(iy0,jy0);
226 dz00 = _mm256_sub_pd(iz0,jz0);
227 dx01 = _mm256_sub_pd(ix0,jx1);
228 dy01 = _mm256_sub_pd(iy0,jy1);
229 dz01 = _mm256_sub_pd(iz0,jz1);
230 dx02 = _mm256_sub_pd(ix0,jx2);
231 dy02 = _mm256_sub_pd(iy0,jy2);
232 dz02 = _mm256_sub_pd(iz0,jz2);
233 dx10 = _mm256_sub_pd(ix1,jx0);
234 dy10 = _mm256_sub_pd(iy1,jy0);
235 dz10 = _mm256_sub_pd(iz1,jz0);
236 dx11 = _mm256_sub_pd(ix1,jx1);
237 dy11 = _mm256_sub_pd(iy1,jy1);
238 dz11 = _mm256_sub_pd(iz1,jz1);
239 dx12 = _mm256_sub_pd(ix1,jx2);
240 dy12 = _mm256_sub_pd(iy1,jy2);
241 dz12 = _mm256_sub_pd(iz1,jz2);
242 dx20 = _mm256_sub_pd(ix2,jx0);
243 dy20 = _mm256_sub_pd(iy2,jy0);
244 dz20 = _mm256_sub_pd(iz2,jz0);
245 dx21 = _mm256_sub_pd(ix2,jx1);
246 dy21 = _mm256_sub_pd(iy2,jy1);
247 dz21 = _mm256_sub_pd(iz2,jz1);
248 dx22 = _mm256_sub_pd(ix2,jx2);
249 dy22 = _mm256_sub_pd(iy2,jy2);
250 dz22 = _mm256_sub_pd(iz2,jz2);
252 /* Calculate squared distance and things based on it */
253 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
254 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
255 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
256 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
257 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
258 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
259 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
260 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
261 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
263 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
264 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
265 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
266 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
267 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
268 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
269 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
270 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
271 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
273 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
274 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
275 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
276 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
277 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
278 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
279 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
280 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
281 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
283 fjx0 = _mm256_setzero_pd();
284 fjy0 = _mm256_setzero_pd();
285 fjz0 = _mm256_setzero_pd();
286 fjx1 = _mm256_setzero_pd();
287 fjy1 = _mm256_setzero_pd();
288 fjz1 = _mm256_setzero_pd();
289 fjx2 = _mm256_setzero_pd();
290 fjy2 = _mm256_setzero_pd();
291 fjz2 = _mm256_setzero_pd();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 r00 = _mm256_mul_pd(rsq00,rinv00);
299 /* EWALD ELECTROSTATICS */
301 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
302 ewrt = _mm256_mul_pd(r00,ewtabscale);
303 ewitab = _mm256_cvttpd_epi32(ewrt);
304 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
305 ewitab = _mm_slli_epi32(ewitab,2);
306 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
307 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
308 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
309 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
310 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
311 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
312 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
313 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
314 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
316 /* LENNARD-JONES DISPERSION/REPULSION */
318 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
319 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
320 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
321 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
322 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
324 /* Update potential sum for this i atom from the interaction with this j atom. */
325 velecsum = _mm256_add_pd(velecsum,velec);
326 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
328 fscal = _mm256_add_pd(felec,fvdw);
330 /* Calculate temporary vectorial force */
331 tx = _mm256_mul_pd(fscal,dx00);
332 ty = _mm256_mul_pd(fscal,dy00);
333 tz = _mm256_mul_pd(fscal,dz00);
335 /* Update vectorial force */
336 fix0 = _mm256_add_pd(fix0,tx);
337 fiy0 = _mm256_add_pd(fiy0,ty);
338 fiz0 = _mm256_add_pd(fiz0,tz);
340 fjx0 = _mm256_add_pd(fjx0,tx);
341 fjy0 = _mm256_add_pd(fjy0,ty);
342 fjz0 = _mm256_add_pd(fjz0,tz);
344 /**************************
345 * CALCULATE INTERACTIONS *
346 **************************/
348 r01 = _mm256_mul_pd(rsq01,rinv01);
350 /* EWALD ELECTROSTATICS */
352 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
353 ewrt = _mm256_mul_pd(r01,ewtabscale);
354 ewitab = _mm256_cvttpd_epi32(ewrt);
355 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
356 ewitab = _mm_slli_epi32(ewitab,2);
357 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
358 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
359 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
360 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
361 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
362 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
363 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
364 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
365 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
367 /* Update potential sum for this i atom from the interaction with this j atom. */
368 velecsum = _mm256_add_pd(velecsum,velec);
372 /* Calculate temporary vectorial force */
373 tx = _mm256_mul_pd(fscal,dx01);
374 ty = _mm256_mul_pd(fscal,dy01);
375 tz = _mm256_mul_pd(fscal,dz01);
377 /* Update vectorial force */
378 fix0 = _mm256_add_pd(fix0,tx);
379 fiy0 = _mm256_add_pd(fiy0,ty);
380 fiz0 = _mm256_add_pd(fiz0,tz);
382 fjx1 = _mm256_add_pd(fjx1,tx);
383 fjy1 = _mm256_add_pd(fjy1,ty);
384 fjz1 = _mm256_add_pd(fjz1,tz);
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 r02 = _mm256_mul_pd(rsq02,rinv02);
392 /* EWALD ELECTROSTATICS */
394 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
395 ewrt = _mm256_mul_pd(r02,ewtabscale);
396 ewitab = _mm256_cvttpd_epi32(ewrt);
397 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
398 ewitab = _mm_slli_epi32(ewitab,2);
399 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
400 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
401 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
402 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
403 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
404 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
405 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
406 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
407 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velecsum = _mm256_add_pd(velecsum,velec);
414 /* Calculate temporary vectorial force */
415 tx = _mm256_mul_pd(fscal,dx02);
416 ty = _mm256_mul_pd(fscal,dy02);
417 tz = _mm256_mul_pd(fscal,dz02);
419 /* Update vectorial force */
420 fix0 = _mm256_add_pd(fix0,tx);
421 fiy0 = _mm256_add_pd(fiy0,ty);
422 fiz0 = _mm256_add_pd(fiz0,tz);
424 fjx2 = _mm256_add_pd(fjx2,tx);
425 fjy2 = _mm256_add_pd(fjy2,ty);
426 fjz2 = _mm256_add_pd(fjz2,tz);
428 /**************************
429 * CALCULATE INTERACTIONS *
430 **************************/
432 r10 = _mm256_mul_pd(rsq10,rinv10);
434 /* EWALD ELECTROSTATICS */
436 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
437 ewrt = _mm256_mul_pd(r10,ewtabscale);
438 ewitab = _mm256_cvttpd_epi32(ewrt);
439 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
440 ewitab = _mm_slli_epi32(ewitab,2);
441 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
442 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
443 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
444 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
445 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
446 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
447 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
448 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
449 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
451 /* Update potential sum for this i atom from the interaction with this j atom. */
452 velecsum = _mm256_add_pd(velecsum,velec);
456 /* Calculate temporary vectorial force */
457 tx = _mm256_mul_pd(fscal,dx10);
458 ty = _mm256_mul_pd(fscal,dy10);
459 tz = _mm256_mul_pd(fscal,dz10);
461 /* Update vectorial force */
462 fix1 = _mm256_add_pd(fix1,tx);
463 fiy1 = _mm256_add_pd(fiy1,ty);
464 fiz1 = _mm256_add_pd(fiz1,tz);
466 fjx0 = _mm256_add_pd(fjx0,tx);
467 fjy0 = _mm256_add_pd(fjy0,ty);
468 fjz0 = _mm256_add_pd(fjz0,tz);
470 /**************************
471 * CALCULATE INTERACTIONS *
472 **************************/
474 r11 = _mm256_mul_pd(rsq11,rinv11);
476 /* EWALD ELECTROSTATICS */
478 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
479 ewrt = _mm256_mul_pd(r11,ewtabscale);
480 ewitab = _mm256_cvttpd_epi32(ewrt);
481 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
482 ewitab = _mm_slli_epi32(ewitab,2);
483 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
484 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
485 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
486 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
487 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
488 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
489 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
490 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
491 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
493 /* Update potential sum for this i atom from the interaction with this j atom. */
494 velecsum = _mm256_add_pd(velecsum,velec);
498 /* Calculate temporary vectorial force */
499 tx = _mm256_mul_pd(fscal,dx11);
500 ty = _mm256_mul_pd(fscal,dy11);
501 tz = _mm256_mul_pd(fscal,dz11);
503 /* Update vectorial force */
504 fix1 = _mm256_add_pd(fix1,tx);
505 fiy1 = _mm256_add_pd(fiy1,ty);
506 fiz1 = _mm256_add_pd(fiz1,tz);
508 fjx1 = _mm256_add_pd(fjx1,tx);
509 fjy1 = _mm256_add_pd(fjy1,ty);
510 fjz1 = _mm256_add_pd(fjz1,tz);
512 /**************************
513 * CALCULATE INTERACTIONS *
514 **************************/
516 r12 = _mm256_mul_pd(rsq12,rinv12);
518 /* EWALD ELECTROSTATICS */
520 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521 ewrt = _mm256_mul_pd(r12,ewtabscale);
522 ewitab = _mm256_cvttpd_epi32(ewrt);
523 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
524 ewitab = _mm_slli_epi32(ewitab,2);
525 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
526 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
527 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
528 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
529 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
530 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
531 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
532 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
533 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
535 /* Update potential sum for this i atom from the interaction with this j atom. */
536 velecsum = _mm256_add_pd(velecsum,velec);
540 /* Calculate temporary vectorial force */
541 tx = _mm256_mul_pd(fscal,dx12);
542 ty = _mm256_mul_pd(fscal,dy12);
543 tz = _mm256_mul_pd(fscal,dz12);
545 /* Update vectorial force */
546 fix1 = _mm256_add_pd(fix1,tx);
547 fiy1 = _mm256_add_pd(fiy1,ty);
548 fiz1 = _mm256_add_pd(fiz1,tz);
550 fjx2 = _mm256_add_pd(fjx2,tx);
551 fjy2 = _mm256_add_pd(fjy2,ty);
552 fjz2 = _mm256_add_pd(fjz2,tz);
554 /**************************
555 * CALCULATE INTERACTIONS *
556 **************************/
558 r20 = _mm256_mul_pd(rsq20,rinv20);
560 /* EWALD ELECTROSTATICS */
562 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
563 ewrt = _mm256_mul_pd(r20,ewtabscale);
564 ewitab = _mm256_cvttpd_epi32(ewrt);
565 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
566 ewitab = _mm_slli_epi32(ewitab,2);
567 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
568 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
569 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
570 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
571 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
572 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
573 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
574 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
575 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
577 /* Update potential sum for this i atom from the interaction with this j atom. */
578 velecsum = _mm256_add_pd(velecsum,velec);
582 /* Calculate temporary vectorial force */
583 tx = _mm256_mul_pd(fscal,dx20);
584 ty = _mm256_mul_pd(fscal,dy20);
585 tz = _mm256_mul_pd(fscal,dz20);
587 /* Update vectorial force */
588 fix2 = _mm256_add_pd(fix2,tx);
589 fiy2 = _mm256_add_pd(fiy2,ty);
590 fiz2 = _mm256_add_pd(fiz2,tz);
592 fjx0 = _mm256_add_pd(fjx0,tx);
593 fjy0 = _mm256_add_pd(fjy0,ty);
594 fjz0 = _mm256_add_pd(fjz0,tz);
596 /**************************
597 * CALCULATE INTERACTIONS *
598 **************************/
600 r21 = _mm256_mul_pd(rsq21,rinv21);
602 /* EWALD ELECTROSTATICS */
604 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
605 ewrt = _mm256_mul_pd(r21,ewtabscale);
606 ewitab = _mm256_cvttpd_epi32(ewrt);
607 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
608 ewitab = _mm_slli_epi32(ewitab,2);
609 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
610 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
611 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
612 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
613 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
614 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
615 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
616 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
617 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
619 /* Update potential sum for this i atom from the interaction with this j atom. */
620 velecsum = _mm256_add_pd(velecsum,velec);
624 /* Calculate temporary vectorial force */
625 tx = _mm256_mul_pd(fscal,dx21);
626 ty = _mm256_mul_pd(fscal,dy21);
627 tz = _mm256_mul_pd(fscal,dz21);
629 /* Update vectorial force */
630 fix2 = _mm256_add_pd(fix2,tx);
631 fiy2 = _mm256_add_pd(fiy2,ty);
632 fiz2 = _mm256_add_pd(fiz2,tz);
634 fjx1 = _mm256_add_pd(fjx1,tx);
635 fjy1 = _mm256_add_pd(fjy1,ty);
636 fjz1 = _mm256_add_pd(fjz1,tz);
638 /**************************
639 * CALCULATE INTERACTIONS *
640 **************************/
642 r22 = _mm256_mul_pd(rsq22,rinv22);
644 /* EWALD ELECTROSTATICS */
646 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
647 ewrt = _mm256_mul_pd(r22,ewtabscale);
648 ewitab = _mm256_cvttpd_epi32(ewrt);
649 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
650 ewitab = _mm_slli_epi32(ewitab,2);
651 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
652 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
653 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
654 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
655 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
656 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
657 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
658 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
659 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
661 /* Update potential sum for this i atom from the interaction with this j atom. */
662 velecsum = _mm256_add_pd(velecsum,velec);
666 /* Calculate temporary vectorial force */
667 tx = _mm256_mul_pd(fscal,dx22);
668 ty = _mm256_mul_pd(fscal,dy22);
669 tz = _mm256_mul_pd(fscal,dz22);
671 /* Update vectorial force */
672 fix2 = _mm256_add_pd(fix2,tx);
673 fiy2 = _mm256_add_pd(fiy2,ty);
674 fiz2 = _mm256_add_pd(fiz2,tz);
676 fjx2 = _mm256_add_pd(fjx2,tx);
677 fjy2 = _mm256_add_pd(fjy2,ty);
678 fjz2 = _mm256_add_pd(fjz2,tz);
680 fjptrA = f+j_coord_offsetA;
681 fjptrB = f+j_coord_offsetB;
682 fjptrC = f+j_coord_offsetC;
683 fjptrD = f+j_coord_offsetD;
685 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
686 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
688 /* Inner loop uses 381 flops */
694 /* Get j neighbor index, and coordinate index */
695 jnrlistA = jjnr[jidx];
696 jnrlistB = jjnr[jidx+1];
697 jnrlistC = jjnr[jidx+2];
698 jnrlistD = jjnr[jidx+3];
699 /* Sign of each element will be negative for non-real atoms.
700 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
701 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
703 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
705 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
706 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
707 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
709 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
710 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
711 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
712 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
713 j_coord_offsetA = DIM*jnrA;
714 j_coord_offsetB = DIM*jnrB;
715 j_coord_offsetC = DIM*jnrC;
716 j_coord_offsetD = DIM*jnrD;
718 /* load j atom coordinates */
719 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
720 x+j_coord_offsetC,x+j_coord_offsetD,
721 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
723 /* Calculate displacement vector */
724 dx00 = _mm256_sub_pd(ix0,jx0);
725 dy00 = _mm256_sub_pd(iy0,jy0);
726 dz00 = _mm256_sub_pd(iz0,jz0);
727 dx01 = _mm256_sub_pd(ix0,jx1);
728 dy01 = _mm256_sub_pd(iy0,jy1);
729 dz01 = _mm256_sub_pd(iz0,jz1);
730 dx02 = _mm256_sub_pd(ix0,jx2);
731 dy02 = _mm256_sub_pd(iy0,jy2);
732 dz02 = _mm256_sub_pd(iz0,jz2);
733 dx10 = _mm256_sub_pd(ix1,jx0);
734 dy10 = _mm256_sub_pd(iy1,jy0);
735 dz10 = _mm256_sub_pd(iz1,jz0);
736 dx11 = _mm256_sub_pd(ix1,jx1);
737 dy11 = _mm256_sub_pd(iy1,jy1);
738 dz11 = _mm256_sub_pd(iz1,jz1);
739 dx12 = _mm256_sub_pd(ix1,jx2);
740 dy12 = _mm256_sub_pd(iy1,jy2);
741 dz12 = _mm256_sub_pd(iz1,jz2);
742 dx20 = _mm256_sub_pd(ix2,jx0);
743 dy20 = _mm256_sub_pd(iy2,jy0);
744 dz20 = _mm256_sub_pd(iz2,jz0);
745 dx21 = _mm256_sub_pd(ix2,jx1);
746 dy21 = _mm256_sub_pd(iy2,jy1);
747 dz21 = _mm256_sub_pd(iz2,jz1);
748 dx22 = _mm256_sub_pd(ix2,jx2);
749 dy22 = _mm256_sub_pd(iy2,jy2);
750 dz22 = _mm256_sub_pd(iz2,jz2);
752 /* Calculate squared distance and things based on it */
753 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
754 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
755 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
756 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
757 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
758 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
759 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
760 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
761 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
763 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
764 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
765 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
766 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
767 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
768 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
769 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
770 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
771 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
773 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
774 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
775 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
776 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
777 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
778 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
779 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
780 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
781 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
783 fjx0 = _mm256_setzero_pd();
784 fjy0 = _mm256_setzero_pd();
785 fjz0 = _mm256_setzero_pd();
786 fjx1 = _mm256_setzero_pd();
787 fjy1 = _mm256_setzero_pd();
788 fjz1 = _mm256_setzero_pd();
789 fjx2 = _mm256_setzero_pd();
790 fjy2 = _mm256_setzero_pd();
791 fjz2 = _mm256_setzero_pd();
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
797 r00 = _mm256_mul_pd(rsq00,rinv00);
798 r00 = _mm256_andnot_pd(dummy_mask,r00);
800 /* EWALD ELECTROSTATICS */
802 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
803 ewrt = _mm256_mul_pd(r00,ewtabscale);
804 ewitab = _mm256_cvttpd_epi32(ewrt);
805 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
806 ewitab = _mm_slli_epi32(ewitab,2);
807 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
808 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
809 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
810 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
811 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
812 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
813 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
814 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
815 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
817 /* LENNARD-JONES DISPERSION/REPULSION */
819 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
820 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
821 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
822 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
823 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
825 /* Update potential sum for this i atom from the interaction with this j atom. */
826 velec = _mm256_andnot_pd(dummy_mask,velec);
827 velecsum = _mm256_add_pd(velecsum,velec);
828 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
829 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
831 fscal = _mm256_add_pd(felec,fvdw);
833 fscal = _mm256_andnot_pd(dummy_mask,fscal);
835 /* Calculate temporary vectorial force */
836 tx = _mm256_mul_pd(fscal,dx00);
837 ty = _mm256_mul_pd(fscal,dy00);
838 tz = _mm256_mul_pd(fscal,dz00);
840 /* Update vectorial force */
841 fix0 = _mm256_add_pd(fix0,tx);
842 fiy0 = _mm256_add_pd(fiy0,ty);
843 fiz0 = _mm256_add_pd(fiz0,tz);
845 fjx0 = _mm256_add_pd(fjx0,tx);
846 fjy0 = _mm256_add_pd(fjy0,ty);
847 fjz0 = _mm256_add_pd(fjz0,tz);
849 /**************************
850 * CALCULATE INTERACTIONS *
851 **************************/
853 r01 = _mm256_mul_pd(rsq01,rinv01);
854 r01 = _mm256_andnot_pd(dummy_mask,r01);
856 /* EWALD ELECTROSTATICS */
858 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
859 ewrt = _mm256_mul_pd(r01,ewtabscale);
860 ewitab = _mm256_cvttpd_epi32(ewrt);
861 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
862 ewitab = _mm_slli_epi32(ewitab,2);
863 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
864 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
865 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
866 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
867 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
868 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
869 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
870 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
871 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm256_andnot_pd(dummy_mask,velec);
875 velecsum = _mm256_add_pd(velecsum,velec);
879 fscal = _mm256_andnot_pd(dummy_mask,fscal);
881 /* Calculate temporary vectorial force */
882 tx = _mm256_mul_pd(fscal,dx01);
883 ty = _mm256_mul_pd(fscal,dy01);
884 tz = _mm256_mul_pd(fscal,dz01);
886 /* Update vectorial force */
887 fix0 = _mm256_add_pd(fix0,tx);
888 fiy0 = _mm256_add_pd(fiy0,ty);
889 fiz0 = _mm256_add_pd(fiz0,tz);
891 fjx1 = _mm256_add_pd(fjx1,tx);
892 fjy1 = _mm256_add_pd(fjy1,ty);
893 fjz1 = _mm256_add_pd(fjz1,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 r02 = _mm256_mul_pd(rsq02,rinv02);
900 r02 = _mm256_andnot_pd(dummy_mask,r02);
902 /* EWALD ELECTROSTATICS */
904 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
905 ewrt = _mm256_mul_pd(r02,ewtabscale);
906 ewitab = _mm256_cvttpd_epi32(ewrt);
907 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
908 ewitab = _mm_slli_epi32(ewitab,2);
909 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
910 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
911 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
912 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
913 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
914 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
915 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
916 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
917 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
919 /* Update potential sum for this i atom from the interaction with this j atom. */
920 velec = _mm256_andnot_pd(dummy_mask,velec);
921 velecsum = _mm256_add_pd(velecsum,velec);
925 fscal = _mm256_andnot_pd(dummy_mask,fscal);
927 /* Calculate temporary vectorial force */
928 tx = _mm256_mul_pd(fscal,dx02);
929 ty = _mm256_mul_pd(fscal,dy02);
930 tz = _mm256_mul_pd(fscal,dz02);
932 /* Update vectorial force */
933 fix0 = _mm256_add_pd(fix0,tx);
934 fiy0 = _mm256_add_pd(fiy0,ty);
935 fiz0 = _mm256_add_pd(fiz0,tz);
937 fjx2 = _mm256_add_pd(fjx2,tx);
938 fjy2 = _mm256_add_pd(fjy2,ty);
939 fjz2 = _mm256_add_pd(fjz2,tz);
941 /**************************
942 * CALCULATE INTERACTIONS *
943 **************************/
945 r10 = _mm256_mul_pd(rsq10,rinv10);
946 r10 = _mm256_andnot_pd(dummy_mask,r10);
948 /* EWALD ELECTROSTATICS */
950 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
951 ewrt = _mm256_mul_pd(r10,ewtabscale);
952 ewitab = _mm256_cvttpd_epi32(ewrt);
953 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
954 ewitab = _mm_slli_epi32(ewitab,2);
955 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
956 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
957 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
958 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
959 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
960 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
961 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
962 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
963 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
965 /* Update potential sum for this i atom from the interaction with this j atom. */
966 velec = _mm256_andnot_pd(dummy_mask,velec);
967 velecsum = _mm256_add_pd(velecsum,velec);
971 fscal = _mm256_andnot_pd(dummy_mask,fscal);
973 /* Calculate temporary vectorial force */
974 tx = _mm256_mul_pd(fscal,dx10);
975 ty = _mm256_mul_pd(fscal,dy10);
976 tz = _mm256_mul_pd(fscal,dz10);
978 /* Update vectorial force */
979 fix1 = _mm256_add_pd(fix1,tx);
980 fiy1 = _mm256_add_pd(fiy1,ty);
981 fiz1 = _mm256_add_pd(fiz1,tz);
983 fjx0 = _mm256_add_pd(fjx0,tx);
984 fjy0 = _mm256_add_pd(fjy0,ty);
985 fjz0 = _mm256_add_pd(fjz0,tz);
987 /**************************
988 * CALCULATE INTERACTIONS *
989 **************************/
991 r11 = _mm256_mul_pd(rsq11,rinv11);
992 r11 = _mm256_andnot_pd(dummy_mask,r11);
994 /* EWALD ELECTROSTATICS */
996 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
997 ewrt = _mm256_mul_pd(r11,ewtabscale);
998 ewitab = _mm256_cvttpd_epi32(ewrt);
999 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1000 ewitab = _mm_slli_epi32(ewitab,2);
1001 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1002 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1003 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1004 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1005 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1006 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1007 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1008 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1009 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1011 /* Update potential sum for this i atom from the interaction with this j atom. */
1012 velec = _mm256_andnot_pd(dummy_mask,velec);
1013 velecsum = _mm256_add_pd(velecsum,velec);
1017 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1019 /* Calculate temporary vectorial force */
1020 tx = _mm256_mul_pd(fscal,dx11);
1021 ty = _mm256_mul_pd(fscal,dy11);
1022 tz = _mm256_mul_pd(fscal,dz11);
1024 /* Update vectorial force */
1025 fix1 = _mm256_add_pd(fix1,tx);
1026 fiy1 = _mm256_add_pd(fiy1,ty);
1027 fiz1 = _mm256_add_pd(fiz1,tz);
1029 fjx1 = _mm256_add_pd(fjx1,tx);
1030 fjy1 = _mm256_add_pd(fjy1,ty);
1031 fjz1 = _mm256_add_pd(fjz1,tz);
1033 /**************************
1034 * CALCULATE INTERACTIONS *
1035 **************************/
1037 r12 = _mm256_mul_pd(rsq12,rinv12);
1038 r12 = _mm256_andnot_pd(dummy_mask,r12);
1040 /* EWALD ELECTROSTATICS */
1042 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1043 ewrt = _mm256_mul_pd(r12,ewtabscale);
1044 ewitab = _mm256_cvttpd_epi32(ewrt);
1045 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1046 ewitab = _mm_slli_epi32(ewitab,2);
1047 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1048 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1049 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1050 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1051 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1052 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1053 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1054 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1055 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1057 /* Update potential sum for this i atom from the interaction with this j atom. */
1058 velec = _mm256_andnot_pd(dummy_mask,velec);
1059 velecsum = _mm256_add_pd(velecsum,velec);
1063 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1065 /* Calculate temporary vectorial force */
1066 tx = _mm256_mul_pd(fscal,dx12);
1067 ty = _mm256_mul_pd(fscal,dy12);
1068 tz = _mm256_mul_pd(fscal,dz12);
1070 /* Update vectorial force */
1071 fix1 = _mm256_add_pd(fix1,tx);
1072 fiy1 = _mm256_add_pd(fiy1,ty);
1073 fiz1 = _mm256_add_pd(fiz1,tz);
1075 fjx2 = _mm256_add_pd(fjx2,tx);
1076 fjy2 = _mm256_add_pd(fjy2,ty);
1077 fjz2 = _mm256_add_pd(fjz2,tz);
1079 /**************************
1080 * CALCULATE INTERACTIONS *
1081 **************************/
1083 r20 = _mm256_mul_pd(rsq20,rinv20);
1084 r20 = _mm256_andnot_pd(dummy_mask,r20);
1086 /* EWALD ELECTROSTATICS */
1088 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1089 ewrt = _mm256_mul_pd(r20,ewtabscale);
1090 ewitab = _mm256_cvttpd_epi32(ewrt);
1091 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1092 ewitab = _mm_slli_epi32(ewitab,2);
1093 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1094 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1095 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1096 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1097 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1098 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1099 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1100 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1101 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1103 /* Update potential sum for this i atom from the interaction with this j atom. */
1104 velec = _mm256_andnot_pd(dummy_mask,velec);
1105 velecsum = _mm256_add_pd(velecsum,velec);
1109 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1111 /* Calculate temporary vectorial force */
1112 tx = _mm256_mul_pd(fscal,dx20);
1113 ty = _mm256_mul_pd(fscal,dy20);
1114 tz = _mm256_mul_pd(fscal,dz20);
1116 /* Update vectorial force */
1117 fix2 = _mm256_add_pd(fix2,tx);
1118 fiy2 = _mm256_add_pd(fiy2,ty);
1119 fiz2 = _mm256_add_pd(fiz2,tz);
1121 fjx0 = _mm256_add_pd(fjx0,tx);
1122 fjy0 = _mm256_add_pd(fjy0,ty);
1123 fjz0 = _mm256_add_pd(fjz0,tz);
1125 /**************************
1126 * CALCULATE INTERACTIONS *
1127 **************************/
1129 r21 = _mm256_mul_pd(rsq21,rinv21);
1130 r21 = _mm256_andnot_pd(dummy_mask,r21);
1132 /* EWALD ELECTROSTATICS */
1134 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1135 ewrt = _mm256_mul_pd(r21,ewtabscale);
1136 ewitab = _mm256_cvttpd_epi32(ewrt);
1137 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1138 ewitab = _mm_slli_epi32(ewitab,2);
1139 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1140 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1141 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1142 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1143 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1144 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1145 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1146 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1147 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1149 /* Update potential sum for this i atom from the interaction with this j atom. */
1150 velec = _mm256_andnot_pd(dummy_mask,velec);
1151 velecsum = _mm256_add_pd(velecsum,velec);
1155 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1157 /* Calculate temporary vectorial force */
1158 tx = _mm256_mul_pd(fscal,dx21);
1159 ty = _mm256_mul_pd(fscal,dy21);
1160 tz = _mm256_mul_pd(fscal,dz21);
1162 /* Update vectorial force */
1163 fix2 = _mm256_add_pd(fix2,tx);
1164 fiy2 = _mm256_add_pd(fiy2,ty);
1165 fiz2 = _mm256_add_pd(fiz2,tz);
1167 fjx1 = _mm256_add_pd(fjx1,tx);
1168 fjy1 = _mm256_add_pd(fjy1,ty);
1169 fjz1 = _mm256_add_pd(fjz1,tz);
1171 /**************************
1172 * CALCULATE INTERACTIONS *
1173 **************************/
1175 r22 = _mm256_mul_pd(rsq22,rinv22);
1176 r22 = _mm256_andnot_pd(dummy_mask,r22);
1178 /* EWALD ELECTROSTATICS */
1180 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1181 ewrt = _mm256_mul_pd(r22,ewtabscale);
1182 ewitab = _mm256_cvttpd_epi32(ewrt);
1183 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1184 ewitab = _mm_slli_epi32(ewitab,2);
1185 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1186 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1187 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1188 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1189 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1190 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1191 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1192 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1193 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1195 /* Update potential sum for this i atom from the interaction with this j atom. */
1196 velec = _mm256_andnot_pd(dummy_mask,velec);
1197 velecsum = _mm256_add_pd(velecsum,velec);
1201 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1203 /* Calculate temporary vectorial force */
1204 tx = _mm256_mul_pd(fscal,dx22);
1205 ty = _mm256_mul_pd(fscal,dy22);
1206 tz = _mm256_mul_pd(fscal,dz22);
1208 /* Update vectorial force */
1209 fix2 = _mm256_add_pd(fix2,tx);
1210 fiy2 = _mm256_add_pd(fiy2,ty);
1211 fiz2 = _mm256_add_pd(fiz2,tz);
1213 fjx2 = _mm256_add_pd(fjx2,tx);
1214 fjy2 = _mm256_add_pd(fjy2,ty);
1215 fjz2 = _mm256_add_pd(fjz2,tz);
1217 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1218 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1219 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1220 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1222 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1223 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1225 /* Inner loop uses 390 flops */
1228 /* End of innermost loop */
1230 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1231 f+i_coord_offset,fshift+i_shift_offset);
1234 /* Update potential energies */
1235 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1236 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1238 /* Increment number of inner iterations */
1239 inneriter += j_index_end - j_index_start;
1241 /* Outer loop uses 20 flops */
1244 /* Increment number of outer iterations */
1247 /* Update outer/inner flops */
1249 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
1252 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double
1253 * Electrostatics interaction: Ewald
1254 * VdW interaction: LennardJones
1255 * Geometry: Water3-Water3
1256 * Calculate force/pot: Force
1259 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double
1260 (t_nblist * gmx_restrict nlist,
1261 rvec * gmx_restrict xx,
1262 rvec * gmx_restrict ff,
1263 t_forcerec * gmx_restrict fr,
1264 t_mdatoms * gmx_restrict mdatoms,
1265 nb_kernel_data_t * gmx_restrict kernel_data,
1266 t_nrnb * gmx_restrict nrnb)
1268 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1269 * just 0 for non-waters.
1270 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1271 * jnr indices corresponding to data put in the four positions in the SIMD register.
1273 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1274 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1275 int jnrA,jnrB,jnrC,jnrD;
1276 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1277 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1278 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1279 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1280 real rcutoff_scalar;
1281 real *shiftvec,*fshift,*x,*f;
1282 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1283 real scratch[4*DIM];
1284 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1285 real * vdwioffsetptr0;
1286 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1287 real * vdwioffsetptr1;
1288 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1289 real * vdwioffsetptr2;
1290 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1291 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1292 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1293 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1294 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1295 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1296 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1297 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1298 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1299 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1300 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1301 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1302 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1303 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1304 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1305 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1306 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1309 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1312 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1313 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1315 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1316 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1318 __m256d dummy_mask,cutoff_mask;
1319 __m128 tmpmask0,tmpmask1;
1320 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1321 __m256d one = _mm256_set1_pd(1.0);
1322 __m256d two = _mm256_set1_pd(2.0);
1328 jindex = nlist->jindex;
1330 shiftidx = nlist->shift;
1332 shiftvec = fr->shift_vec[0];
1333 fshift = fr->fshift[0];
1334 facel = _mm256_set1_pd(fr->epsfac);
1335 charge = mdatoms->chargeA;
1336 nvdwtype = fr->ntype;
1337 vdwparam = fr->nbfp;
1338 vdwtype = mdatoms->typeA;
1340 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1341 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1342 beta2 = _mm256_mul_pd(beta,beta);
1343 beta3 = _mm256_mul_pd(beta,beta2);
1345 ewtab = fr->ic->tabq_coul_F;
1346 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1347 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1349 /* Setup water-specific parameters */
1350 inr = nlist->iinr[0];
1351 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1352 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1353 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1354 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1356 jq0 = _mm256_set1_pd(charge[inr+0]);
1357 jq1 = _mm256_set1_pd(charge[inr+1]);
1358 jq2 = _mm256_set1_pd(charge[inr+2]);
1359 vdwjidx0A = 2*vdwtype[inr+0];
1360 qq00 = _mm256_mul_pd(iq0,jq0);
1361 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1362 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1363 qq01 = _mm256_mul_pd(iq0,jq1);
1364 qq02 = _mm256_mul_pd(iq0,jq2);
1365 qq10 = _mm256_mul_pd(iq1,jq0);
1366 qq11 = _mm256_mul_pd(iq1,jq1);
1367 qq12 = _mm256_mul_pd(iq1,jq2);
1368 qq20 = _mm256_mul_pd(iq2,jq0);
1369 qq21 = _mm256_mul_pd(iq2,jq1);
1370 qq22 = _mm256_mul_pd(iq2,jq2);
1372 /* Avoid stupid compiler warnings */
1373 jnrA = jnrB = jnrC = jnrD = 0;
1374 j_coord_offsetA = 0;
1375 j_coord_offsetB = 0;
1376 j_coord_offsetC = 0;
1377 j_coord_offsetD = 0;
1382 for(iidx=0;iidx<4*DIM;iidx++)
1384 scratch[iidx] = 0.0;
1387 /* Start outer loop over neighborlists */
1388 for(iidx=0; iidx<nri; iidx++)
1390 /* Load shift vector for this list */
1391 i_shift_offset = DIM*shiftidx[iidx];
1393 /* Load limits for loop over neighbors */
1394 j_index_start = jindex[iidx];
1395 j_index_end = jindex[iidx+1];
1397 /* Get outer coordinate index */
1399 i_coord_offset = DIM*inr;
1401 /* Load i particle coords and add shift vector */
1402 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1403 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1405 fix0 = _mm256_setzero_pd();
1406 fiy0 = _mm256_setzero_pd();
1407 fiz0 = _mm256_setzero_pd();
1408 fix1 = _mm256_setzero_pd();
1409 fiy1 = _mm256_setzero_pd();
1410 fiz1 = _mm256_setzero_pd();
1411 fix2 = _mm256_setzero_pd();
1412 fiy2 = _mm256_setzero_pd();
1413 fiz2 = _mm256_setzero_pd();
1415 /* Start inner kernel loop */
1416 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1419 /* Get j neighbor index, and coordinate index */
1421 jnrB = jjnr[jidx+1];
1422 jnrC = jjnr[jidx+2];
1423 jnrD = jjnr[jidx+3];
1424 j_coord_offsetA = DIM*jnrA;
1425 j_coord_offsetB = DIM*jnrB;
1426 j_coord_offsetC = DIM*jnrC;
1427 j_coord_offsetD = DIM*jnrD;
1429 /* load j atom coordinates */
1430 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1431 x+j_coord_offsetC,x+j_coord_offsetD,
1432 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1434 /* Calculate displacement vector */
1435 dx00 = _mm256_sub_pd(ix0,jx0);
1436 dy00 = _mm256_sub_pd(iy0,jy0);
1437 dz00 = _mm256_sub_pd(iz0,jz0);
1438 dx01 = _mm256_sub_pd(ix0,jx1);
1439 dy01 = _mm256_sub_pd(iy0,jy1);
1440 dz01 = _mm256_sub_pd(iz0,jz1);
1441 dx02 = _mm256_sub_pd(ix0,jx2);
1442 dy02 = _mm256_sub_pd(iy0,jy2);
1443 dz02 = _mm256_sub_pd(iz0,jz2);
1444 dx10 = _mm256_sub_pd(ix1,jx0);
1445 dy10 = _mm256_sub_pd(iy1,jy0);
1446 dz10 = _mm256_sub_pd(iz1,jz0);
1447 dx11 = _mm256_sub_pd(ix1,jx1);
1448 dy11 = _mm256_sub_pd(iy1,jy1);
1449 dz11 = _mm256_sub_pd(iz1,jz1);
1450 dx12 = _mm256_sub_pd(ix1,jx2);
1451 dy12 = _mm256_sub_pd(iy1,jy2);
1452 dz12 = _mm256_sub_pd(iz1,jz2);
1453 dx20 = _mm256_sub_pd(ix2,jx0);
1454 dy20 = _mm256_sub_pd(iy2,jy0);
1455 dz20 = _mm256_sub_pd(iz2,jz0);
1456 dx21 = _mm256_sub_pd(ix2,jx1);
1457 dy21 = _mm256_sub_pd(iy2,jy1);
1458 dz21 = _mm256_sub_pd(iz2,jz1);
1459 dx22 = _mm256_sub_pd(ix2,jx2);
1460 dy22 = _mm256_sub_pd(iy2,jy2);
1461 dz22 = _mm256_sub_pd(iz2,jz2);
1463 /* Calculate squared distance and things based on it */
1464 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1465 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1466 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1467 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1468 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1469 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1470 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1471 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1472 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1474 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1475 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1476 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1477 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1478 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1479 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1480 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1481 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1482 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1484 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1485 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1486 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1487 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1488 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1489 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1490 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1491 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1492 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1494 fjx0 = _mm256_setzero_pd();
1495 fjy0 = _mm256_setzero_pd();
1496 fjz0 = _mm256_setzero_pd();
1497 fjx1 = _mm256_setzero_pd();
1498 fjy1 = _mm256_setzero_pd();
1499 fjz1 = _mm256_setzero_pd();
1500 fjx2 = _mm256_setzero_pd();
1501 fjy2 = _mm256_setzero_pd();
1502 fjz2 = _mm256_setzero_pd();
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 r00 = _mm256_mul_pd(rsq00,rinv00);
1510 /* EWALD ELECTROSTATICS */
1512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1513 ewrt = _mm256_mul_pd(r00,ewtabscale);
1514 ewitab = _mm256_cvttpd_epi32(ewrt);
1515 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1516 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1517 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1519 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1520 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1522 /* LENNARD-JONES DISPERSION/REPULSION */
1524 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1525 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1527 fscal = _mm256_add_pd(felec,fvdw);
1529 /* Calculate temporary vectorial force */
1530 tx = _mm256_mul_pd(fscal,dx00);
1531 ty = _mm256_mul_pd(fscal,dy00);
1532 tz = _mm256_mul_pd(fscal,dz00);
1534 /* Update vectorial force */
1535 fix0 = _mm256_add_pd(fix0,tx);
1536 fiy0 = _mm256_add_pd(fiy0,ty);
1537 fiz0 = _mm256_add_pd(fiz0,tz);
1539 fjx0 = _mm256_add_pd(fjx0,tx);
1540 fjy0 = _mm256_add_pd(fjy0,ty);
1541 fjz0 = _mm256_add_pd(fjz0,tz);
1543 /**************************
1544 * CALCULATE INTERACTIONS *
1545 **************************/
1547 r01 = _mm256_mul_pd(rsq01,rinv01);
1549 /* EWALD ELECTROSTATICS */
1551 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1552 ewrt = _mm256_mul_pd(r01,ewtabscale);
1553 ewitab = _mm256_cvttpd_epi32(ewrt);
1554 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1555 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1556 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1558 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1559 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1563 /* Calculate temporary vectorial force */
1564 tx = _mm256_mul_pd(fscal,dx01);
1565 ty = _mm256_mul_pd(fscal,dy01);
1566 tz = _mm256_mul_pd(fscal,dz01);
1568 /* Update vectorial force */
1569 fix0 = _mm256_add_pd(fix0,tx);
1570 fiy0 = _mm256_add_pd(fiy0,ty);
1571 fiz0 = _mm256_add_pd(fiz0,tz);
1573 fjx1 = _mm256_add_pd(fjx1,tx);
1574 fjy1 = _mm256_add_pd(fjy1,ty);
1575 fjz1 = _mm256_add_pd(fjz1,tz);
1577 /**************************
1578 * CALCULATE INTERACTIONS *
1579 **************************/
1581 r02 = _mm256_mul_pd(rsq02,rinv02);
1583 /* EWALD ELECTROSTATICS */
1585 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1586 ewrt = _mm256_mul_pd(r02,ewtabscale);
1587 ewitab = _mm256_cvttpd_epi32(ewrt);
1588 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1589 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1590 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1592 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1593 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1597 /* Calculate temporary vectorial force */
1598 tx = _mm256_mul_pd(fscal,dx02);
1599 ty = _mm256_mul_pd(fscal,dy02);
1600 tz = _mm256_mul_pd(fscal,dz02);
1602 /* Update vectorial force */
1603 fix0 = _mm256_add_pd(fix0,tx);
1604 fiy0 = _mm256_add_pd(fiy0,ty);
1605 fiz0 = _mm256_add_pd(fiz0,tz);
1607 fjx2 = _mm256_add_pd(fjx2,tx);
1608 fjy2 = _mm256_add_pd(fjy2,ty);
1609 fjz2 = _mm256_add_pd(fjz2,tz);
1611 /**************************
1612 * CALCULATE INTERACTIONS *
1613 **************************/
1615 r10 = _mm256_mul_pd(rsq10,rinv10);
1617 /* EWALD ELECTROSTATICS */
1619 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1620 ewrt = _mm256_mul_pd(r10,ewtabscale);
1621 ewitab = _mm256_cvttpd_epi32(ewrt);
1622 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1623 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1624 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1626 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1627 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1631 /* Calculate temporary vectorial force */
1632 tx = _mm256_mul_pd(fscal,dx10);
1633 ty = _mm256_mul_pd(fscal,dy10);
1634 tz = _mm256_mul_pd(fscal,dz10);
1636 /* Update vectorial force */
1637 fix1 = _mm256_add_pd(fix1,tx);
1638 fiy1 = _mm256_add_pd(fiy1,ty);
1639 fiz1 = _mm256_add_pd(fiz1,tz);
1641 fjx0 = _mm256_add_pd(fjx0,tx);
1642 fjy0 = _mm256_add_pd(fjy0,ty);
1643 fjz0 = _mm256_add_pd(fjz0,tz);
1645 /**************************
1646 * CALCULATE INTERACTIONS *
1647 **************************/
1649 r11 = _mm256_mul_pd(rsq11,rinv11);
1651 /* EWALD ELECTROSTATICS */
1653 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1654 ewrt = _mm256_mul_pd(r11,ewtabscale);
1655 ewitab = _mm256_cvttpd_epi32(ewrt);
1656 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1657 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1658 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1660 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1661 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1665 /* Calculate temporary vectorial force */
1666 tx = _mm256_mul_pd(fscal,dx11);
1667 ty = _mm256_mul_pd(fscal,dy11);
1668 tz = _mm256_mul_pd(fscal,dz11);
1670 /* Update vectorial force */
1671 fix1 = _mm256_add_pd(fix1,tx);
1672 fiy1 = _mm256_add_pd(fiy1,ty);
1673 fiz1 = _mm256_add_pd(fiz1,tz);
1675 fjx1 = _mm256_add_pd(fjx1,tx);
1676 fjy1 = _mm256_add_pd(fjy1,ty);
1677 fjz1 = _mm256_add_pd(fjz1,tz);
1679 /**************************
1680 * CALCULATE INTERACTIONS *
1681 **************************/
1683 r12 = _mm256_mul_pd(rsq12,rinv12);
1685 /* EWALD ELECTROSTATICS */
1687 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1688 ewrt = _mm256_mul_pd(r12,ewtabscale);
1689 ewitab = _mm256_cvttpd_epi32(ewrt);
1690 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1691 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1692 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1694 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1695 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1699 /* Calculate temporary vectorial force */
1700 tx = _mm256_mul_pd(fscal,dx12);
1701 ty = _mm256_mul_pd(fscal,dy12);
1702 tz = _mm256_mul_pd(fscal,dz12);
1704 /* Update vectorial force */
1705 fix1 = _mm256_add_pd(fix1,tx);
1706 fiy1 = _mm256_add_pd(fiy1,ty);
1707 fiz1 = _mm256_add_pd(fiz1,tz);
1709 fjx2 = _mm256_add_pd(fjx2,tx);
1710 fjy2 = _mm256_add_pd(fjy2,ty);
1711 fjz2 = _mm256_add_pd(fjz2,tz);
1713 /**************************
1714 * CALCULATE INTERACTIONS *
1715 **************************/
1717 r20 = _mm256_mul_pd(rsq20,rinv20);
1719 /* EWALD ELECTROSTATICS */
1721 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1722 ewrt = _mm256_mul_pd(r20,ewtabscale);
1723 ewitab = _mm256_cvttpd_epi32(ewrt);
1724 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1725 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1726 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1728 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1729 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1733 /* Calculate temporary vectorial force */
1734 tx = _mm256_mul_pd(fscal,dx20);
1735 ty = _mm256_mul_pd(fscal,dy20);
1736 tz = _mm256_mul_pd(fscal,dz20);
1738 /* Update vectorial force */
1739 fix2 = _mm256_add_pd(fix2,tx);
1740 fiy2 = _mm256_add_pd(fiy2,ty);
1741 fiz2 = _mm256_add_pd(fiz2,tz);
1743 fjx0 = _mm256_add_pd(fjx0,tx);
1744 fjy0 = _mm256_add_pd(fjy0,ty);
1745 fjz0 = _mm256_add_pd(fjz0,tz);
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1751 r21 = _mm256_mul_pd(rsq21,rinv21);
1753 /* EWALD ELECTROSTATICS */
1755 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1756 ewrt = _mm256_mul_pd(r21,ewtabscale);
1757 ewitab = _mm256_cvttpd_epi32(ewrt);
1758 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1759 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1760 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1762 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1763 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1767 /* Calculate temporary vectorial force */
1768 tx = _mm256_mul_pd(fscal,dx21);
1769 ty = _mm256_mul_pd(fscal,dy21);
1770 tz = _mm256_mul_pd(fscal,dz21);
1772 /* Update vectorial force */
1773 fix2 = _mm256_add_pd(fix2,tx);
1774 fiy2 = _mm256_add_pd(fiy2,ty);
1775 fiz2 = _mm256_add_pd(fiz2,tz);
1777 fjx1 = _mm256_add_pd(fjx1,tx);
1778 fjy1 = _mm256_add_pd(fjy1,ty);
1779 fjz1 = _mm256_add_pd(fjz1,tz);
1781 /**************************
1782 * CALCULATE INTERACTIONS *
1783 **************************/
1785 r22 = _mm256_mul_pd(rsq22,rinv22);
1787 /* EWALD ELECTROSTATICS */
1789 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1790 ewrt = _mm256_mul_pd(r22,ewtabscale);
1791 ewitab = _mm256_cvttpd_epi32(ewrt);
1792 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1793 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1794 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1796 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1797 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1801 /* Calculate temporary vectorial force */
1802 tx = _mm256_mul_pd(fscal,dx22);
1803 ty = _mm256_mul_pd(fscal,dy22);
1804 tz = _mm256_mul_pd(fscal,dz22);
1806 /* Update vectorial force */
1807 fix2 = _mm256_add_pd(fix2,tx);
1808 fiy2 = _mm256_add_pd(fiy2,ty);
1809 fiz2 = _mm256_add_pd(fiz2,tz);
1811 fjx2 = _mm256_add_pd(fjx2,tx);
1812 fjy2 = _mm256_add_pd(fjy2,ty);
1813 fjz2 = _mm256_add_pd(fjz2,tz);
1815 fjptrA = f+j_coord_offsetA;
1816 fjptrB = f+j_coord_offsetB;
1817 fjptrC = f+j_coord_offsetC;
1818 fjptrD = f+j_coord_offsetD;
1820 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1821 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1823 /* Inner loop uses 331 flops */
1826 if(jidx<j_index_end)
1829 /* Get j neighbor index, and coordinate index */
1830 jnrlistA = jjnr[jidx];
1831 jnrlistB = jjnr[jidx+1];
1832 jnrlistC = jjnr[jidx+2];
1833 jnrlistD = jjnr[jidx+3];
1834 /* Sign of each element will be negative for non-real atoms.
1835 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1836 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1838 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1840 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1841 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1842 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1844 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1845 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1846 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1847 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1848 j_coord_offsetA = DIM*jnrA;
1849 j_coord_offsetB = DIM*jnrB;
1850 j_coord_offsetC = DIM*jnrC;
1851 j_coord_offsetD = DIM*jnrD;
1853 /* load j atom coordinates */
1854 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1855 x+j_coord_offsetC,x+j_coord_offsetD,
1856 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1858 /* Calculate displacement vector */
1859 dx00 = _mm256_sub_pd(ix0,jx0);
1860 dy00 = _mm256_sub_pd(iy0,jy0);
1861 dz00 = _mm256_sub_pd(iz0,jz0);
1862 dx01 = _mm256_sub_pd(ix0,jx1);
1863 dy01 = _mm256_sub_pd(iy0,jy1);
1864 dz01 = _mm256_sub_pd(iz0,jz1);
1865 dx02 = _mm256_sub_pd(ix0,jx2);
1866 dy02 = _mm256_sub_pd(iy0,jy2);
1867 dz02 = _mm256_sub_pd(iz0,jz2);
1868 dx10 = _mm256_sub_pd(ix1,jx0);
1869 dy10 = _mm256_sub_pd(iy1,jy0);
1870 dz10 = _mm256_sub_pd(iz1,jz0);
1871 dx11 = _mm256_sub_pd(ix1,jx1);
1872 dy11 = _mm256_sub_pd(iy1,jy1);
1873 dz11 = _mm256_sub_pd(iz1,jz1);
1874 dx12 = _mm256_sub_pd(ix1,jx2);
1875 dy12 = _mm256_sub_pd(iy1,jy2);
1876 dz12 = _mm256_sub_pd(iz1,jz2);
1877 dx20 = _mm256_sub_pd(ix2,jx0);
1878 dy20 = _mm256_sub_pd(iy2,jy0);
1879 dz20 = _mm256_sub_pd(iz2,jz0);
1880 dx21 = _mm256_sub_pd(ix2,jx1);
1881 dy21 = _mm256_sub_pd(iy2,jy1);
1882 dz21 = _mm256_sub_pd(iz2,jz1);
1883 dx22 = _mm256_sub_pd(ix2,jx2);
1884 dy22 = _mm256_sub_pd(iy2,jy2);
1885 dz22 = _mm256_sub_pd(iz2,jz2);
1887 /* Calculate squared distance and things based on it */
1888 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1889 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1890 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1891 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1892 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1893 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1894 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1895 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1896 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1898 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1899 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1900 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1901 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1902 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1903 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1904 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1905 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1906 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1908 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1909 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1910 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1911 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1912 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1913 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1914 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1915 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1916 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1918 fjx0 = _mm256_setzero_pd();
1919 fjy0 = _mm256_setzero_pd();
1920 fjz0 = _mm256_setzero_pd();
1921 fjx1 = _mm256_setzero_pd();
1922 fjy1 = _mm256_setzero_pd();
1923 fjz1 = _mm256_setzero_pd();
1924 fjx2 = _mm256_setzero_pd();
1925 fjy2 = _mm256_setzero_pd();
1926 fjz2 = _mm256_setzero_pd();
1928 /**************************
1929 * CALCULATE INTERACTIONS *
1930 **************************/
1932 r00 = _mm256_mul_pd(rsq00,rinv00);
1933 r00 = _mm256_andnot_pd(dummy_mask,r00);
1935 /* EWALD ELECTROSTATICS */
1937 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1938 ewrt = _mm256_mul_pd(r00,ewtabscale);
1939 ewitab = _mm256_cvttpd_epi32(ewrt);
1940 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1941 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1942 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1944 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1945 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1947 /* LENNARD-JONES DISPERSION/REPULSION */
1949 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1950 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1952 fscal = _mm256_add_pd(felec,fvdw);
1954 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1956 /* Calculate temporary vectorial force */
1957 tx = _mm256_mul_pd(fscal,dx00);
1958 ty = _mm256_mul_pd(fscal,dy00);
1959 tz = _mm256_mul_pd(fscal,dz00);
1961 /* Update vectorial force */
1962 fix0 = _mm256_add_pd(fix0,tx);
1963 fiy0 = _mm256_add_pd(fiy0,ty);
1964 fiz0 = _mm256_add_pd(fiz0,tz);
1966 fjx0 = _mm256_add_pd(fjx0,tx);
1967 fjy0 = _mm256_add_pd(fjy0,ty);
1968 fjz0 = _mm256_add_pd(fjz0,tz);
1970 /**************************
1971 * CALCULATE INTERACTIONS *
1972 **************************/
1974 r01 = _mm256_mul_pd(rsq01,rinv01);
1975 r01 = _mm256_andnot_pd(dummy_mask,r01);
1977 /* EWALD ELECTROSTATICS */
1979 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1980 ewrt = _mm256_mul_pd(r01,ewtabscale);
1981 ewitab = _mm256_cvttpd_epi32(ewrt);
1982 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1983 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1984 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1986 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1987 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1991 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1993 /* Calculate temporary vectorial force */
1994 tx = _mm256_mul_pd(fscal,dx01);
1995 ty = _mm256_mul_pd(fscal,dy01);
1996 tz = _mm256_mul_pd(fscal,dz01);
1998 /* Update vectorial force */
1999 fix0 = _mm256_add_pd(fix0,tx);
2000 fiy0 = _mm256_add_pd(fiy0,ty);
2001 fiz0 = _mm256_add_pd(fiz0,tz);
2003 fjx1 = _mm256_add_pd(fjx1,tx);
2004 fjy1 = _mm256_add_pd(fjy1,ty);
2005 fjz1 = _mm256_add_pd(fjz1,tz);
2007 /**************************
2008 * CALCULATE INTERACTIONS *
2009 **************************/
2011 r02 = _mm256_mul_pd(rsq02,rinv02);
2012 r02 = _mm256_andnot_pd(dummy_mask,r02);
2014 /* EWALD ELECTROSTATICS */
2016 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2017 ewrt = _mm256_mul_pd(r02,ewtabscale);
2018 ewitab = _mm256_cvttpd_epi32(ewrt);
2019 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2020 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2021 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2023 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2024 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2028 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2030 /* Calculate temporary vectorial force */
2031 tx = _mm256_mul_pd(fscal,dx02);
2032 ty = _mm256_mul_pd(fscal,dy02);
2033 tz = _mm256_mul_pd(fscal,dz02);
2035 /* Update vectorial force */
2036 fix0 = _mm256_add_pd(fix0,tx);
2037 fiy0 = _mm256_add_pd(fiy0,ty);
2038 fiz0 = _mm256_add_pd(fiz0,tz);
2040 fjx2 = _mm256_add_pd(fjx2,tx);
2041 fjy2 = _mm256_add_pd(fjy2,ty);
2042 fjz2 = _mm256_add_pd(fjz2,tz);
2044 /**************************
2045 * CALCULATE INTERACTIONS *
2046 **************************/
2048 r10 = _mm256_mul_pd(rsq10,rinv10);
2049 r10 = _mm256_andnot_pd(dummy_mask,r10);
2051 /* EWALD ELECTROSTATICS */
2053 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2054 ewrt = _mm256_mul_pd(r10,ewtabscale);
2055 ewitab = _mm256_cvttpd_epi32(ewrt);
2056 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2057 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2058 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2060 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2061 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2065 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2067 /* Calculate temporary vectorial force */
2068 tx = _mm256_mul_pd(fscal,dx10);
2069 ty = _mm256_mul_pd(fscal,dy10);
2070 tz = _mm256_mul_pd(fscal,dz10);
2072 /* Update vectorial force */
2073 fix1 = _mm256_add_pd(fix1,tx);
2074 fiy1 = _mm256_add_pd(fiy1,ty);
2075 fiz1 = _mm256_add_pd(fiz1,tz);
2077 fjx0 = _mm256_add_pd(fjx0,tx);
2078 fjy0 = _mm256_add_pd(fjy0,ty);
2079 fjz0 = _mm256_add_pd(fjz0,tz);
2081 /**************************
2082 * CALCULATE INTERACTIONS *
2083 **************************/
2085 r11 = _mm256_mul_pd(rsq11,rinv11);
2086 r11 = _mm256_andnot_pd(dummy_mask,r11);
2088 /* EWALD ELECTROSTATICS */
2090 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2091 ewrt = _mm256_mul_pd(r11,ewtabscale);
2092 ewitab = _mm256_cvttpd_epi32(ewrt);
2093 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2094 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2095 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2097 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2098 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2102 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2104 /* Calculate temporary vectorial force */
2105 tx = _mm256_mul_pd(fscal,dx11);
2106 ty = _mm256_mul_pd(fscal,dy11);
2107 tz = _mm256_mul_pd(fscal,dz11);
2109 /* Update vectorial force */
2110 fix1 = _mm256_add_pd(fix1,tx);
2111 fiy1 = _mm256_add_pd(fiy1,ty);
2112 fiz1 = _mm256_add_pd(fiz1,tz);
2114 fjx1 = _mm256_add_pd(fjx1,tx);
2115 fjy1 = _mm256_add_pd(fjy1,ty);
2116 fjz1 = _mm256_add_pd(fjz1,tz);
2118 /**************************
2119 * CALCULATE INTERACTIONS *
2120 **************************/
2122 r12 = _mm256_mul_pd(rsq12,rinv12);
2123 r12 = _mm256_andnot_pd(dummy_mask,r12);
2125 /* EWALD ELECTROSTATICS */
2127 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2128 ewrt = _mm256_mul_pd(r12,ewtabscale);
2129 ewitab = _mm256_cvttpd_epi32(ewrt);
2130 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2131 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2132 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2134 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2135 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2139 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2141 /* Calculate temporary vectorial force */
2142 tx = _mm256_mul_pd(fscal,dx12);
2143 ty = _mm256_mul_pd(fscal,dy12);
2144 tz = _mm256_mul_pd(fscal,dz12);
2146 /* Update vectorial force */
2147 fix1 = _mm256_add_pd(fix1,tx);
2148 fiy1 = _mm256_add_pd(fiy1,ty);
2149 fiz1 = _mm256_add_pd(fiz1,tz);
2151 fjx2 = _mm256_add_pd(fjx2,tx);
2152 fjy2 = _mm256_add_pd(fjy2,ty);
2153 fjz2 = _mm256_add_pd(fjz2,tz);
2155 /**************************
2156 * CALCULATE INTERACTIONS *
2157 **************************/
2159 r20 = _mm256_mul_pd(rsq20,rinv20);
2160 r20 = _mm256_andnot_pd(dummy_mask,r20);
2162 /* EWALD ELECTROSTATICS */
2164 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2165 ewrt = _mm256_mul_pd(r20,ewtabscale);
2166 ewitab = _mm256_cvttpd_epi32(ewrt);
2167 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2168 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2169 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2171 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2172 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2176 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2178 /* Calculate temporary vectorial force */
2179 tx = _mm256_mul_pd(fscal,dx20);
2180 ty = _mm256_mul_pd(fscal,dy20);
2181 tz = _mm256_mul_pd(fscal,dz20);
2183 /* Update vectorial force */
2184 fix2 = _mm256_add_pd(fix2,tx);
2185 fiy2 = _mm256_add_pd(fiy2,ty);
2186 fiz2 = _mm256_add_pd(fiz2,tz);
2188 fjx0 = _mm256_add_pd(fjx0,tx);
2189 fjy0 = _mm256_add_pd(fjy0,ty);
2190 fjz0 = _mm256_add_pd(fjz0,tz);
2192 /**************************
2193 * CALCULATE INTERACTIONS *
2194 **************************/
2196 r21 = _mm256_mul_pd(rsq21,rinv21);
2197 r21 = _mm256_andnot_pd(dummy_mask,r21);
2199 /* EWALD ELECTROSTATICS */
2201 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2202 ewrt = _mm256_mul_pd(r21,ewtabscale);
2203 ewitab = _mm256_cvttpd_epi32(ewrt);
2204 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2205 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2206 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2208 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2209 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2213 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2215 /* Calculate temporary vectorial force */
2216 tx = _mm256_mul_pd(fscal,dx21);
2217 ty = _mm256_mul_pd(fscal,dy21);
2218 tz = _mm256_mul_pd(fscal,dz21);
2220 /* Update vectorial force */
2221 fix2 = _mm256_add_pd(fix2,tx);
2222 fiy2 = _mm256_add_pd(fiy2,ty);
2223 fiz2 = _mm256_add_pd(fiz2,tz);
2225 fjx1 = _mm256_add_pd(fjx1,tx);
2226 fjy1 = _mm256_add_pd(fjy1,ty);
2227 fjz1 = _mm256_add_pd(fjz1,tz);
2229 /**************************
2230 * CALCULATE INTERACTIONS *
2231 **************************/
2233 r22 = _mm256_mul_pd(rsq22,rinv22);
2234 r22 = _mm256_andnot_pd(dummy_mask,r22);
2236 /* EWALD ELECTROSTATICS */
2238 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2239 ewrt = _mm256_mul_pd(r22,ewtabscale);
2240 ewitab = _mm256_cvttpd_epi32(ewrt);
2241 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2242 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2243 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2245 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2246 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2250 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2252 /* Calculate temporary vectorial force */
2253 tx = _mm256_mul_pd(fscal,dx22);
2254 ty = _mm256_mul_pd(fscal,dy22);
2255 tz = _mm256_mul_pd(fscal,dz22);
2257 /* Update vectorial force */
2258 fix2 = _mm256_add_pd(fix2,tx);
2259 fiy2 = _mm256_add_pd(fiy2,ty);
2260 fiz2 = _mm256_add_pd(fiz2,tz);
2262 fjx2 = _mm256_add_pd(fjx2,tx);
2263 fjy2 = _mm256_add_pd(fjy2,ty);
2264 fjz2 = _mm256_add_pd(fjz2,tz);
2266 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2267 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2268 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2269 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2271 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2272 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2274 /* Inner loop uses 340 flops */
2277 /* End of innermost loop */
2279 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2280 f+i_coord_offset,fshift+i_shift_offset);
2282 /* Increment number of inner iterations */
2283 inneriter += j_index_end - j_index_start;
2285 /* Outer loop uses 18 flops */
2288 /* Increment number of outer iterations */
2291 /* Update outer/inner flops */
2293 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);