2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 real * vdwioffsetptr1;
86 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
107 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
110 __m256d dummy_mask,cutoff_mask;
111 __m128 tmpmask0,tmpmask1;
112 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
113 __m256d one = _mm256_set1_pd(1.0);
114 __m256d two = _mm256_set1_pd(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm256_set1_pd(fr->ic->epsfac);
127 charge = mdatoms->chargeA;
129 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
130 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
131 beta2 = _mm256_mul_pd(beta,beta);
132 beta3 = _mm256_mul_pd(beta,beta2);
134 ewtab = fr->ic->tabq_coul_FDV0;
135 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
136 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
138 /* Setup water-specific parameters */
139 inr = nlist->iinr[0];
140 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
141 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
142 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
144 jq0 = _mm256_set1_pd(charge[inr+0]);
145 jq1 = _mm256_set1_pd(charge[inr+1]);
146 jq2 = _mm256_set1_pd(charge[inr+2]);
147 qq00 = _mm256_mul_pd(iq0,jq0);
148 qq01 = _mm256_mul_pd(iq0,jq1);
149 qq02 = _mm256_mul_pd(iq0,jq2);
150 qq10 = _mm256_mul_pd(iq1,jq0);
151 qq11 = _mm256_mul_pd(iq1,jq1);
152 qq12 = _mm256_mul_pd(iq1,jq2);
153 qq20 = _mm256_mul_pd(iq2,jq0);
154 qq21 = _mm256_mul_pd(iq2,jq1);
155 qq22 = _mm256_mul_pd(iq2,jq2);
157 /* Avoid stupid compiler warnings */
158 jnrA = jnrB = jnrC = jnrD = 0;
167 for(iidx=0;iidx<4*DIM;iidx++)
172 /* Start outer loop over neighborlists */
173 for(iidx=0; iidx<nri; iidx++)
175 /* Load shift vector for this list */
176 i_shift_offset = DIM*shiftidx[iidx];
178 /* Load limits for loop over neighbors */
179 j_index_start = jindex[iidx];
180 j_index_end = jindex[iidx+1];
182 /* Get outer coordinate index */
184 i_coord_offset = DIM*inr;
186 /* Load i particle coords and add shift vector */
187 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
188 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
190 fix0 = _mm256_setzero_pd();
191 fiy0 = _mm256_setzero_pd();
192 fiz0 = _mm256_setzero_pd();
193 fix1 = _mm256_setzero_pd();
194 fiy1 = _mm256_setzero_pd();
195 fiz1 = _mm256_setzero_pd();
196 fix2 = _mm256_setzero_pd();
197 fiy2 = _mm256_setzero_pd();
198 fiz2 = _mm256_setzero_pd();
200 /* Reset potential sums */
201 velecsum = _mm256_setzero_pd();
203 /* Start inner kernel loop */
204 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
207 /* Get j neighbor index, and coordinate index */
212 j_coord_offsetA = DIM*jnrA;
213 j_coord_offsetB = DIM*jnrB;
214 j_coord_offsetC = DIM*jnrC;
215 j_coord_offsetD = DIM*jnrD;
217 /* load j atom coordinates */
218 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
219 x+j_coord_offsetC,x+j_coord_offsetD,
220 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
222 /* Calculate displacement vector */
223 dx00 = _mm256_sub_pd(ix0,jx0);
224 dy00 = _mm256_sub_pd(iy0,jy0);
225 dz00 = _mm256_sub_pd(iz0,jz0);
226 dx01 = _mm256_sub_pd(ix0,jx1);
227 dy01 = _mm256_sub_pd(iy0,jy1);
228 dz01 = _mm256_sub_pd(iz0,jz1);
229 dx02 = _mm256_sub_pd(ix0,jx2);
230 dy02 = _mm256_sub_pd(iy0,jy2);
231 dz02 = _mm256_sub_pd(iz0,jz2);
232 dx10 = _mm256_sub_pd(ix1,jx0);
233 dy10 = _mm256_sub_pd(iy1,jy0);
234 dz10 = _mm256_sub_pd(iz1,jz0);
235 dx11 = _mm256_sub_pd(ix1,jx1);
236 dy11 = _mm256_sub_pd(iy1,jy1);
237 dz11 = _mm256_sub_pd(iz1,jz1);
238 dx12 = _mm256_sub_pd(ix1,jx2);
239 dy12 = _mm256_sub_pd(iy1,jy2);
240 dz12 = _mm256_sub_pd(iz1,jz2);
241 dx20 = _mm256_sub_pd(ix2,jx0);
242 dy20 = _mm256_sub_pd(iy2,jy0);
243 dz20 = _mm256_sub_pd(iz2,jz0);
244 dx21 = _mm256_sub_pd(ix2,jx1);
245 dy21 = _mm256_sub_pd(iy2,jy1);
246 dz21 = _mm256_sub_pd(iz2,jz1);
247 dx22 = _mm256_sub_pd(ix2,jx2);
248 dy22 = _mm256_sub_pd(iy2,jy2);
249 dz22 = _mm256_sub_pd(iz2,jz2);
251 /* Calculate squared distance and things based on it */
252 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
253 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
254 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
255 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
256 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
257 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
258 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
259 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
260 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
262 rinv00 = avx256_invsqrt_d(rsq00);
263 rinv01 = avx256_invsqrt_d(rsq01);
264 rinv02 = avx256_invsqrt_d(rsq02);
265 rinv10 = avx256_invsqrt_d(rsq10);
266 rinv11 = avx256_invsqrt_d(rsq11);
267 rinv12 = avx256_invsqrt_d(rsq12);
268 rinv20 = avx256_invsqrt_d(rsq20);
269 rinv21 = avx256_invsqrt_d(rsq21);
270 rinv22 = avx256_invsqrt_d(rsq22);
272 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
273 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
274 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
275 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
276 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
277 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
278 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
279 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
280 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
282 fjx0 = _mm256_setzero_pd();
283 fjy0 = _mm256_setzero_pd();
284 fjz0 = _mm256_setzero_pd();
285 fjx1 = _mm256_setzero_pd();
286 fjy1 = _mm256_setzero_pd();
287 fjz1 = _mm256_setzero_pd();
288 fjx2 = _mm256_setzero_pd();
289 fjy2 = _mm256_setzero_pd();
290 fjz2 = _mm256_setzero_pd();
292 /**************************
293 * CALCULATE INTERACTIONS *
294 **************************/
296 r00 = _mm256_mul_pd(rsq00,rinv00);
298 /* EWALD ELECTROSTATICS */
300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
301 ewrt = _mm256_mul_pd(r00,ewtabscale);
302 ewitab = _mm256_cvttpd_epi32(ewrt);
303 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
304 ewitab = _mm_slli_epi32(ewitab,2);
305 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
306 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
307 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
308 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
309 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
310 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
311 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
312 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
313 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
315 /* Update potential sum for this i atom from the interaction with this j atom. */
316 velecsum = _mm256_add_pd(velecsum,velec);
320 /* Calculate temporary vectorial force */
321 tx = _mm256_mul_pd(fscal,dx00);
322 ty = _mm256_mul_pd(fscal,dy00);
323 tz = _mm256_mul_pd(fscal,dz00);
325 /* Update vectorial force */
326 fix0 = _mm256_add_pd(fix0,tx);
327 fiy0 = _mm256_add_pd(fiy0,ty);
328 fiz0 = _mm256_add_pd(fiz0,tz);
330 fjx0 = _mm256_add_pd(fjx0,tx);
331 fjy0 = _mm256_add_pd(fjy0,ty);
332 fjz0 = _mm256_add_pd(fjz0,tz);
334 /**************************
335 * CALCULATE INTERACTIONS *
336 **************************/
338 r01 = _mm256_mul_pd(rsq01,rinv01);
340 /* EWALD ELECTROSTATICS */
342 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
343 ewrt = _mm256_mul_pd(r01,ewtabscale);
344 ewitab = _mm256_cvttpd_epi32(ewrt);
345 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
346 ewitab = _mm_slli_epi32(ewitab,2);
347 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
348 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
349 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
350 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
351 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
352 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
353 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
354 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
355 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velecsum = _mm256_add_pd(velecsum,velec);
362 /* Calculate temporary vectorial force */
363 tx = _mm256_mul_pd(fscal,dx01);
364 ty = _mm256_mul_pd(fscal,dy01);
365 tz = _mm256_mul_pd(fscal,dz01);
367 /* Update vectorial force */
368 fix0 = _mm256_add_pd(fix0,tx);
369 fiy0 = _mm256_add_pd(fiy0,ty);
370 fiz0 = _mm256_add_pd(fiz0,tz);
372 fjx1 = _mm256_add_pd(fjx1,tx);
373 fjy1 = _mm256_add_pd(fjy1,ty);
374 fjz1 = _mm256_add_pd(fjz1,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r02 = _mm256_mul_pd(rsq02,rinv02);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt = _mm256_mul_pd(r02,ewtabscale);
386 ewitab = _mm256_cvttpd_epi32(ewrt);
387 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
388 ewitab = _mm_slli_epi32(ewitab,2);
389 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
390 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
391 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
392 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
393 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
394 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
395 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
396 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
397 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm256_add_pd(velecsum,velec);
404 /* Calculate temporary vectorial force */
405 tx = _mm256_mul_pd(fscal,dx02);
406 ty = _mm256_mul_pd(fscal,dy02);
407 tz = _mm256_mul_pd(fscal,dz02);
409 /* Update vectorial force */
410 fix0 = _mm256_add_pd(fix0,tx);
411 fiy0 = _mm256_add_pd(fiy0,ty);
412 fiz0 = _mm256_add_pd(fiz0,tz);
414 fjx2 = _mm256_add_pd(fjx2,tx);
415 fjy2 = _mm256_add_pd(fjy2,ty);
416 fjz2 = _mm256_add_pd(fjz2,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r10 = _mm256_mul_pd(rsq10,rinv10);
424 /* EWALD ELECTROSTATICS */
426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
427 ewrt = _mm256_mul_pd(r10,ewtabscale);
428 ewitab = _mm256_cvttpd_epi32(ewrt);
429 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
430 ewitab = _mm_slli_epi32(ewitab,2);
431 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
432 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
433 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
434 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
435 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
436 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
437 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
438 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
439 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum = _mm256_add_pd(velecsum,velec);
446 /* Calculate temporary vectorial force */
447 tx = _mm256_mul_pd(fscal,dx10);
448 ty = _mm256_mul_pd(fscal,dy10);
449 tz = _mm256_mul_pd(fscal,dz10);
451 /* Update vectorial force */
452 fix1 = _mm256_add_pd(fix1,tx);
453 fiy1 = _mm256_add_pd(fiy1,ty);
454 fiz1 = _mm256_add_pd(fiz1,tz);
456 fjx0 = _mm256_add_pd(fjx0,tx);
457 fjy0 = _mm256_add_pd(fjy0,ty);
458 fjz0 = _mm256_add_pd(fjz0,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 r11 = _mm256_mul_pd(rsq11,rinv11);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt = _mm256_mul_pd(r11,ewtabscale);
470 ewitab = _mm256_cvttpd_epi32(ewrt);
471 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
475 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
476 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
477 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
478 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
479 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
480 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
481 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm256_add_pd(velecsum,velec);
488 /* Calculate temporary vectorial force */
489 tx = _mm256_mul_pd(fscal,dx11);
490 ty = _mm256_mul_pd(fscal,dy11);
491 tz = _mm256_mul_pd(fscal,dz11);
493 /* Update vectorial force */
494 fix1 = _mm256_add_pd(fix1,tx);
495 fiy1 = _mm256_add_pd(fiy1,ty);
496 fiz1 = _mm256_add_pd(fiz1,tz);
498 fjx1 = _mm256_add_pd(fjx1,tx);
499 fjy1 = _mm256_add_pd(fjy1,ty);
500 fjz1 = _mm256_add_pd(fjz1,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 r12 = _mm256_mul_pd(rsq12,rinv12);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt = _mm256_mul_pd(r12,ewtabscale);
512 ewitab = _mm256_cvttpd_epi32(ewrt);
513 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
514 ewitab = _mm_slli_epi32(ewitab,2);
515 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
516 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
517 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
518 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
519 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
520 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
521 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
522 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
523 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm256_add_pd(velecsum,velec);
530 /* Calculate temporary vectorial force */
531 tx = _mm256_mul_pd(fscal,dx12);
532 ty = _mm256_mul_pd(fscal,dy12);
533 tz = _mm256_mul_pd(fscal,dz12);
535 /* Update vectorial force */
536 fix1 = _mm256_add_pd(fix1,tx);
537 fiy1 = _mm256_add_pd(fiy1,ty);
538 fiz1 = _mm256_add_pd(fiz1,tz);
540 fjx2 = _mm256_add_pd(fjx2,tx);
541 fjy2 = _mm256_add_pd(fjy2,ty);
542 fjz2 = _mm256_add_pd(fjz2,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r20 = _mm256_mul_pd(rsq20,rinv20);
550 /* EWALD ELECTROSTATICS */
552 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
553 ewrt = _mm256_mul_pd(r20,ewtabscale);
554 ewitab = _mm256_cvttpd_epi32(ewrt);
555 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
556 ewitab = _mm_slli_epi32(ewitab,2);
557 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
558 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
559 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
560 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
561 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
562 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
563 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
564 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
565 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum = _mm256_add_pd(velecsum,velec);
572 /* Calculate temporary vectorial force */
573 tx = _mm256_mul_pd(fscal,dx20);
574 ty = _mm256_mul_pd(fscal,dy20);
575 tz = _mm256_mul_pd(fscal,dz20);
577 /* Update vectorial force */
578 fix2 = _mm256_add_pd(fix2,tx);
579 fiy2 = _mm256_add_pd(fiy2,ty);
580 fiz2 = _mm256_add_pd(fiz2,tz);
582 fjx0 = _mm256_add_pd(fjx0,tx);
583 fjy0 = _mm256_add_pd(fjy0,ty);
584 fjz0 = _mm256_add_pd(fjz0,tz);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 r21 = _mm256_mul_pd(rsq21,rinv21);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt = _mm256_mul_pd(r21,ewtabscale);
596 ewitab = _mm256_cvttpd_epi32(ewrt);
597 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
598 ewitab = _mm_slli_epi32(ewitab,2);
599 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
600 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
601 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
602 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
603 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
604 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
605 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
606 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
607 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velecsum = _mm256_add_pd(velecsum,velec);
614 /* Calculate temporary vectorial force */
615 tx = _mm256_mul_pd(fscal,dx21);
616 ty = _mm256_mul_pd(fscal,dy21);
617 tz = _mm256_mul_pd(fscal,dz21);
619 /* Update vectorial force */
620 fix2 = _mm256_add_pd(fix2,tx);
621 fiy2 = _mm256_add_pd(fiy2,ty);
622 fiz2 = _mm256_add_pd(fiz2,tz);
624 fjx1 = _mm256_add_pd(fjx1,tx);
625 fjy1 = _mm256_add_pd(fjy1,ty);
626 fjz1 = _mm256_add_pd(fjz1,tz);
628 /**************************
629 * CALCULATE INTERACTIONS *
630 **************************/
632 r22 = _mm256_mul_pd(rsq22,rinv22);
634 /* EWALD ELECTROSTATICS */
636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
637 ewrt = _mm256_mul_pd(r22,ewtabscale);
638 ewitab = _mm256_cvttpd_epi32(ewrt);
639 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
640 ewitab = _mm_slli_epi32(ewitab,2);
641 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
642 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
643 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
644 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
645 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
646 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
647 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
648 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
649 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum = _mm256_add_pd(velecsum,velec);
656 /* Calculate temporary vectorial force */
657 tx = _mm256_mul_pd(fscal,dx22);
658 ty = _mm256_mul_pd(fscal,dy22);
659 tz = _mm256_mul_pd(fscal,dz22);
661 /* Update vectorial force */
662 fix2 = _mm256_add_pd(fix2,tx);
663 fiy2 = _mm256_add_pd(fiy2,ty);
664 fiz2 = _mm256_add_pd(fiz2,tz);
666 fjx2 = _mm256_add_pd(fjx2,tx);
667 fjy2 = _mm256_add_pd(fjy2,ty);
668 fjz2 = _mm256_add_pd(fjz2,tz);
670 fjptrA = f+j_coord_offsetA;
671 fjptrB = f+j_coord_offsetB;
672 fjptrC = f+j_coord_offsetC;
673 fjptrD = f+j_coord_offsetD;
675 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
676 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
678 /* Inner loop uses 369 flops */
684 /* Get j neighbor index, and coordinate index */
685 jnrlistA = jjnr[jidx];
686 jnrlistB = jjnr[jidx+1];
687 jnrlistC = jjnr[jidx+2];
688 jnrlistD = jjnr[jidx+3];
689 /* Sign of each element will be negative for non-real atoms.
690 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
691 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
693 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
695 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
696 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
697 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
699 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
700 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
701 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
702 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
703 j_coord_offsetA = DIM*jnrA;
704 j_coord_offsetB = DIM*jnrB;
705 j_coord_offsetC = DIM*jnrC;
706 j_coord_offsetD = DIM*jnrD;
708 /* load j atom coordinates */
709 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
710 x+j_coord_offsetC,x+j_coord_offsetD,
711 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
713 /* Calculate displacement vector */
714 dx00 = _mm256_sub_pd(ix0,jx0);
715 dy00 = _mm256_sub_pd(iy0,jy0);
716 dz00 = _mm256_sub_pd(iz0,jz0);
717 dx01 = _mm256_sub_pd(ix0,jx1);
718 dy01 = _mm256_sub_pd(iy0,jy1);
719 dz01 = _mm256_sub_pd(iz0,jz1);
720 dx02 = _mm256_sub_pd(ix0,jx2);
721 dy02 = _mm256_sub_pd(iy0,jy2);
722 dz02 = _mm256_sub_pd(iz0,jz2);
723 dx10 = _mm256_sub_pd(ix1,jx0);
724 dy10 = _mm256_sub_pd(iy1,jy0);
725 dz10 = _mm256_sub_pd(iz1,jz0);
726 dx11 = _mm256_sub_pd(ix1,jx1);
727 dy11 = _mm256_sub_pd(iy1,jy1);
728 dz11 = _mm256_sub_pd(iz1,jz1);
729 dx12 = _mm256_sub_pd(ix1,jx2);
730 dy12 = _mm256_sub_pd(iy1,jy2);
731 dz12 = _mm256_sub_pd(iz1,jz2);
732 dx20 = _mm256_sub_pd(ix2,jx0);
733 dy20 = _mm256_sub_pd(iy2,jy0);
734 dz20 = _mm256_sub_pd(iz2,jz0);
735 dx21 = _mm256_sub_pd(ix2,jx1);
736 dy21 = _mm256_sub_pd(iy2,jy1);
737 dz21 = _mm256_sub_pd(iz2,jz1);
738 dx22 = _mm256_sub_pd(ix2,jx2);
739 dy22 = _mm256_sub_pd(iy2,jy2);
740 dz22 = _mm256_sub_pd(iz2,jz2);
742 /* Calculate squared distance and things based on it */
743 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
744 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
745 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
746 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
747 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
748 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
749 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
750 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
751 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
753 rinv00 = avx256_invsqrt_d(rsq00);
754 rinv01 = avx256_invsqrt_d(rsq01);
755 rinv02 = avx256_invsqrt_d(rsq02);
756 rinv10 = avx256_invsqrt_d(rsq10);
757 rinv11 = avx256_invsqrt_d(rsq11);
758 rinv12 = avx256_invsqrt_d(rsq12);
759 rinv20 = avx256_invsqrt_d(rsq20);
760 rinv21 = avx256_invsqrt_d(rsq21);
761 rinv22 = avx256_invsqrt_d(rsq22);
763 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
764 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
765 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
766 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
767 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
768 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
769 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
770 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
771 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
773 fjx0 = _mm256_setzero_pd();
774 fjy0 = _mm256_setzero_pd();
775 fjz0 = _mm256_setzero_pd();
776 fjx1 = _mm256_setzero_pd();
777 fjy1 = _mm256_setzero_pd();
778 fjz1 = _mm256_setzero_pd();
779 fjx2 = _mm256_setzero_pd();
780 fjy2 = _mm256_setzero_pd();
781 fjz2 = _mm256_setzero_pd();
783 /**************************
784 * CALCULATE INTERACTIONS *
785 **************************/
787 r00 = _mm256_mul_pd(rsq00,rinv00);
788 r00 = _mm256_andnot_pd(dummy_mask,r00);
790 /* EWALD ELECTROSTATICS */
792 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
793 ewrt = _mm256_mul_pd(r00,ewtabscale);
794 ewitab = _mm256_cvttpd_epi32(ewrt);
795 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
796 ewitab = _mm_slli_epi32(ewitab,2);
797 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
798 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
799 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
800 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
801 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
802 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
803 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
804 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
805 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
807 /* Update potential sum for this i atom from the interaction with this j atom. */
808 velec = _mm256_andnot_pd(dummy_mask,velec);
809 velecsum = _mm256_add_pd(velecsum,velec);
813 fscal = _mm256_andnot_pd(dummy_mask,fscal);
815 /* Calculate temporary vectorial force */
816 tx = _mm256_mul_pd(fscal,dx00);
817 ty = _mm256_mul_pd(fscal,dy00);
818 tz = _mm256_mul_pd(fscal,dz00);
820 /* Update vectorial force */
821 fix0 = _mm256_add_pd(fix0,tx);
822 fiy0 = _mm256_add_pd(fiy0,ty);
823 fiz0 = _mm256_add_pd(fiz0,tz);
825 fjx0 = _mm256_add_pd(fjx0,tx);
826 fjy0 = _mm256_add_pd(fjy0,ty);
827 fjz0 = _mm256_add_pd(fjz0,tz);
829 /**************************
830 * CALCULATE INTERACTIONS *
831 **************************/
833 r01 = _mm256_mul_pd(rsq01,rinv01);
834 r01 = _mm256_andnot_pd(dummy_mask,r01);
836 /* EWALD ELECTROSTATICS */
838 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
839 ewrt = _mm256_mul_pd(r01,ewtabscale);
840 ewitab = _mm256_cvttpd_epi32(ewrt);
841 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
842 ewitab = _mm_slli_epi32(ewitab,2);
843 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
844 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
845 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
846 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
847 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
848 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
849 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
850 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
851 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
853 /* Update potential sum for this i atom from the interaction with this j atom. */
854 velec = _mm256_andnot_pd(dummy_mask,velec);
855 velecsum = _mm256_add_pd(velecsum,velec);
859 fscal = _mm256_andnot_pd(dummy_mask,fscal);
861 /* Calculate temporary vectorial force */
862 tx = _mm256_mul_pd(fscal,dx01);
863 ty = _mm256_mul_pd(fscal,dy01);
864 tz = _mm256_mul_pd(fscal,dz01);
866 /* Update vectorial force */
867 fix0 = _mm256_add_pd(fix0,tx);
868 fiy0 = _mm256_add_pd(fiy0,ty);
869 fiz0 = _mm256_add_pd(fiz0,tz);
871 fjx1 = _mm256_add_pd(fjx1,tx);
872 fjy1 = _mm256_add_pd(fjy1,ty);
873 fjz1 = _mm256_add_pd(fjz1,tz);
875 /**************************
876 * CALCULATE INTERACTIONS *
877 **************************/
879 r02 = _mm256_mul_pd(rsq02,rinv02);
880 r02 = _mm256_andnot_pd(dummy_mask,r02);
882 /* EWALD ELECTROSTATICS */
884 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
885 ewrt = _mm256_mul_pd(r02,ewtabscale);
886 ewitab = _mm256_cvttpd_epi32(ewrt);
887 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
888 ewitab = _mm_slli_epi32(ewitab,2);
889 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
890 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
891 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
892 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
893 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
894 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
895 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
896 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
897 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
899 /* Update potential sum for this i atom from the interaction with this j atom. */
900 velec = _mm256_andnot_pd(dummy_mask,velec);
901 velecsum = _mm256_add_pd(velecsum,velec);
905 fscal = _mm256_andnot_pd(dummy_mask,fscal);
907 /* Calculate temporary vectorial force */
908 tx = _mm256_mul_pd(fscal,dx02);
909 ty = _mm256_mul_pd(fscal,dy02);
910 tz = _mm256_mul_pd(fscal,dz02);
912 /* Update vectorial force */
913 fix0 = _mm256_add_pd(fix0,tx);
914 fiy0 = _mm256_add_pd(fiy0,ty);
915 fiz0 = _mm256_add_pd(fiz0,tz);
917 fjx2 = _mm256_add_pd(fjx2,tx);
918 fjy2 = _mm256_add_pd(fjy2,ty);
919 fjz2 = _mm256_add_pd(fjz2,tz);
921 /**************************
922 * CALCULATE INTERACTIONS *
923 **************************/
925 r10 = _mm256_mul_pd(rsq10,rinv10);
926 r10 = _mm256_andnot_pd(dummy_mask,r10);
928 /* EWALD ELECTROSTATICS */
930 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
931 ewrt = _mm256_mul_pd(r10,ewtabscale);
932 ewitab = _mm256_cvttpd_epi32(ewrt);
933 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
934 ewitab = _mm_slli_epi32(ewitab,2);
935 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
936 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
937 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
938 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
939 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
940 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
941 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
942 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
943 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
945 /* Update potential sum for this i atom from the interaction with this j atom. */
946 velec = _mm256_andnot_pd(dummy_mask,velec);
947 velecsum = _mm256_add_pd(velecsum,velec);
951 fscal = _mm256_andnot_pd(dummy_mask,fscal);
953 /* Calculate temporary vectorial force */
954 tx = _mm256_mul_pd(fscal,dx10);
955 ty = _mm256_mul_pd(fscal,dy10);
956 tz = _mm256_mul_pd(fscal,dz10);
958 /* Update vectorial force */
959 fix1 = _mm256_add_pd(fix1,tx);
960 fiy1 = _mm256_add_pd(fiy1,ty);
961 fiz1 = _mm256_add_pd(fiz1,tz);
963 fjx0 = _mm256_add_pd(fjx0,tx);
964 fjy0 = _mm256_add_pd(fjy0,ty);
965 fjz0 = _mm256_add_pd(fjz0,tz);
967 /**************************
968 * CALCULATE INTERACTIONS *
969 **************************/
971 r11 = _mm256_mul_pd(rsq11,rinv11);
972 r11 = _mm256_andnot_pd(dummy_mask,r11);
974 /* EWALD ELECTROSTATICS */
976 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
977 ewrt = _mm256_mul_pd(r11,ewtabscale);
978 ewitab = _mm256_cvttpd_epi32(ewrt);
979 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
980 ewitab = _mm_slli_epi32(ewitab,2);
981 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
982 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
983 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
984 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
985 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
986 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
987 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
988 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
989 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
991 /* Update potential sum for this i atom from the interaction with this j atom. */
992 velec = _mm256_andnot_pd(dummy_mask,velec);
993 velecsum = _mm256_add_pd(velecsum,velec);
997 fscal = _mm256_andnot_pd(dummy_mask,fscal);
999 /* Calculate temporary vectorial force */
1000 tx = _mm256_mul_pd(fscal,dx11);
1001 ty = _mm256_mul_pd(fscal,dy11);
1002 tz = _mm256_mul_pd(fscal,dz11);
1004 /* Update vectorial force */
1005 fix1 = _mm256_add_pd(fix1,tx);
1006 fiy1 = _mm256_add_pd(fiy1,ty);
1007 fiz1 = _mm256_add_pd(fiz1,tz);
1009 fjx1 = _mm256_add_pd(fjx1,tx);
1010 fjy1 = _mm256_add_pd(fjy1,ty);
1011 fjz1 = _mm256_add_pd(fjz1,tz);
1013 /**************************
1014 * CALCULATE INTERACTIONS *
1015 **************************/
1017 r12 = _mm256_mul_pd(rsq12,rinv12);
1018 r12 = _mm256_andnot_pd(dummy_mask,r12);
1020 /* EWALD ELECTROSTATICS */
1022 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1023 ewrt = _mm256_mul_pd(r12,ewtabscale);
1024 ewitab = _mm256_cvttpd_epi32(ewrt);
1025 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1026 ewitab = _mm_slli_epi32(ewitab,2);
1027 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1028 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1029 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1030 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1031 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1032 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1033 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1034 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1035 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1037 /* Update potential sum for this i atom from the interaction with this j atom. */
1038 velec = _mm256_andnot_pd(dummy_mask,velec);
1039 velecsum = _mm256_add_pd(velecsum,velec);
1043 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1045 /* Calculate temporary vectorial force */
1046 tx = _mm256_mul_pd(fscal,dx12);
1047 ty = _mm256_mul_pd(fscal,dy12);
1048 tz = _mm256_mul_pd(fscal,dz12);
1050 /* Update vectorial force */
1051 fix1 = _mm256_add_pd(fix1,tx);
1052 fiy1 = _mm256_add_pd(fiy1,ty);
1053 fiz1 = _mm256_add_pd(fiz1,tz);
1055 fjx2 = _mm256_add_pd(fjx2,tx);
1056 fjy2 = _mm256_add_pd(fjy2,ty);
1057 fjz2 = _mm256_add_pd(fjz2,tz);
1059 /**************************
1060 * CALCULATE INTERACTIONS *
1061 **************************/
1063 r20 = _mm256_mul_pd(rsq20,rinv20);
1064 r20 = _mm256_andnot_pd(dummy_mask,r20);
1066 /* EWALD ELECTROSTATICS */
1068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1069 ewrt = _mm256_mul_pd(r20,ewtabscale);
1070 ewitab = _mm256_cvttpd_epi32(ewrt);
1071 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1072 ewitab = _mm_slli_epi32(ewitab,2);
1073 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1074 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1075 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1076 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1077 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1078 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1079 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1080 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1081 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1083 /* Update potential sum for this i atom from the interaction with this j atom. */
1084 velec = _mm256_andnot_pd(dummy_mask,velec);
1085 velecsum = _mm256_add_pd(velecsum,velec);
1089 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1091 /* Calculate temporary vectorial force */
1092 tx = _mm256_mul_pd(fscal,dx20);
1093 ty = _mm256_mul_pd(fscal,dy20);
1094 tz = _mm256_mul_pd(fscal,dz20);
1096 /* Update vectorial force */
1097 fix2 = _mm256_add_pd(fix2,tx);
1098 fiy2 = _mm256_add_pd(fiy2,ty);
1099 fiz2 = _mm256_add_pd(fiz2,tz);
1101 fjx0 = _mm256_add_pd(fjx0,tx);
1102 fjy0 = _mm256_add_pd(fjy0,ty);
1103 fjz0 = _mm256_add_pd(fjz0,tz);
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 r21 = _mm256_mul_pd(rsq21,rinv21);
1110 r21 = _mm256_andnot_pd(dummy_mask,r21);
1112 /* EWALD ELECTROSTATICS */
1114 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1115 ewrt = _mm256_mul_pd(r21,ewtabscale);
1116 ewitab = _mm256_cvttpd_epi32(ewrt);
1117 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1118 ewitab = _mm_slli_epi32(ewitab,2);
1119 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1120 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1121 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1122 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1123 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1124 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1125 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1126 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1127 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1129 /* Update potential sum for this i atom from the interaction with this j atom. */
1130 velec = _mm256_andnot_pd(dummy_mask,velec);
1131 velecsum = _mm256_add_pd(velecsum,velec);
1135 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1137 /* Calculate temporary vectorial force */
1138 tx = _mm256_mul_pd(fscal,dx21);
1139 ty = _mm256_mul_pd(fscal,dy21);
1140 tz = _mm256_mul_pd(fscal,dz21);
1142 /* Update vectorial force */
1143 fix2 = _mm256_add_pd(fix2,tx);
1144 fiy2 = _mm256_add_pd(fiy2,ty);
1145 fiz2 = _mm256_add_pd(fiz2,tz);
1147 fjx1 = _mm256_add_pd(fjx1,tx);
1148 fjy1 = _mm256_add_pd(fjy1,ty);
1149 fjz1 = _mm256_add_pd(fjz1,tz);
1151 /**************************
1152 * CALCULATE INTERACTIONS *
1153 **************************/
1155 r22 = _mm256_mul_pd(rsq22,rinv22);
1156 r22 = _mm256_andnot_pd(dummy_mask,r22);
1158 /* EWALD ELECTROSTATICS */
1160 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1161 ewrt = _mm256_mul_pd(r22,ewtabscale);
1162 ewitab = _mm256_cvttpd_epi32(ewrt);
1163 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1164 ewitab = _mm_slli_epi32(ewitab,2);
1165 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1166 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1167 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1168 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1169 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1170 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1171 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1172 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1173 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1175 /* Update potential sum for this i atom from the interaction with this j atom. */
1176 velec = _mm256_andnot_pd(dummy_mask,velec);
1177 velecsum = _mm256_add_pd(velecsum,velec);
1181 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1183 /* Calculate temporary vectorial force */
1184 tx = _mm256_mul_pd(fscal,dx22);
1185 ty = _mm256_mul_pd(fscal,dy22);
1186 tz = _mm256_mul_pd(fscal,dz22);
1188 /* Update vectorial force */
1189 fix2 = _mm256_add_pd(fix2,tx);
1190 fiy2 = _mm256_add_pd(fiy2,ty);
1191 fiz2 = _mm256_add_pd(fiz2,tz);
1193 fjx2 = _mm256_add_pd(fjx2,tx);
1194 fjy2 = _mm256_add_pd(fjy2,ty);
1195 fjz2 = _mm256_add_pd(fjz2,tz);
1197 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1198 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1199 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1200 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1202 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1203 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1205 /* Inner loop uses 378 flops */
1208 /* End of innermost loop */
1210 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1211 f+i_coord_offset,fshift+i_shift_offset);
1214 /* Update potential energies */
1215 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1217 /* Increment number of inner iterations */
1218 inneriter += j_index_end - j_index_start;
1220 /* Outer loop uses 19 flops */
1223 /* Increment number of outer iterations */
1226 /* Update outer/inner flops */
1228 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*378);
1231 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double
1232 * Electrostatics interaction: Ewald
1233 * VdW interaction: None
1234 * Geometry: Water3-Water3
1235 * Calculate force/pot: Force
1238 nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double
1239 (t_nblist * gmx_restrict nlist,
1240 rvec * gmx_restrict xx,
1241 rvec * gmx_restrict ff,
1242 struct t_forcerec * gmx_restrict fr,
1243 t_mdatoms * gmx_restrict mdatoms,
1244 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1245 t_nrnb * gmx_restrict nrnb)
1247 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1248 * just 0 for non-waters.
1249 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1250 * jnr indices corresponding to data put in the four positions in the SIMD register.
1252 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1253 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1254 int jnrA,jnrB,jnrC,jnrD;
1255 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1256 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1257 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1258 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1259 real rcutoff_scalar;
1260 real *shiftvec,*fshift,*x,*f;
1261 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1262 real scratch[4*DIM];
1263 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1264 real * vdwioffsetptr0;
1265 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1266 real * vdwioffsetptr1;
1267 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1268 real * vdwioffsetptr2;
1269 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1270 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1271 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1272 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1273 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1274 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1275 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1276 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1277 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1278 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1279 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1280 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1281 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1282 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1283 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1284 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1285 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1288 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1289 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1291 __m256d dummy_mask,cutoff_mask;
1292 __m128 tmpmask0,tmpmask1;
1293 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1294 __m256d one = _mm256_set1_pd(1.0);
1295 __m256d two = _mm256_set1_pd(2.0);
1301 jindex = nlist->jindex;
1303 shiftidx = nlist->shift;
1305 shiftvec = fr->shift_vec[0];
1306 fshift = fr->fshift[0];
1307 facel = _mm256_set1_pd(fr->ic->epsfac);
1308 charge = mdatoms->chargeA;
1310 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1311 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1312 beta2 = _mm256_mul_pd(beta,beta);
1313 beta3 = _mm256_mul_pd(beta,beta2);
1315 ewtab = fr->ic->tabq_coul_F;
1316 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1317 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1319 /* Setup water-specific parameters */
1320 inr = nlist->iinr[0];
1321 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1322 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1323 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1325 jq0 = _mm256_set1_pd(charge[inr+0]);
1326 jq1 = _mm256_set1_pd(charge[inr+1]);
1327 jq2 = _mm256_set1_pd(charge[inr+2]);
1328 qq00 = _mm256_mul_pd(iq0,jq0);
1329 qq01 = _mm256_mul_pd(iq0,jq1);
1330 qq02 = _mm256_mul_pd(iq0,jq2);
1331 qq10 = _mm256_mul_pd(iq1,jq0);
1332 qq11 = _mm256_mul_pd(iq1,jq1);
1333 qq12 = _mm256_mul_pd(iq1,jq2);
1334 qq20 = _mm256_mul_pd(iq2,jq0);
1335 qq21 = _mm256_mul_pd(iq2,jq1);
1336 qq22 = _mm256_mul_pd(iq2,jq2);
1338 /* Zero-initialize the j indices and coordinate offsets to silence compiler warnings */
1339 jnrA = jnrB = jnrC = jnrD = 0;
1340 j_coord_offsetA = 0;
1341 j_coord_offsetB = 0;
1342 j_coord_offsetC = 0;
1343 j_coord_offsetD = 0;
1348 for(iidx=0;iidx<4*DIM;iidx++)
1350 scratch[iidx] = 0.0;
1353 /* Start outer loop over neighborlists */
1354 for(iidx=0; iidx<nri; iidx++)
1356 /* Load shift vector for this list */
1357 i_shift_offset = DIM*shiftidx[iidx];
1359 /* Load limits for loop over neighbors */
1360 j_index_start = jindex[iidx];
1361 j_index_end = jindex[iidx+1];
1363 /* Get outer coordinate index */
1365 i_coord_offset = DIM*inr;
1367 /* Load i particle coords and add shift vector */
1368 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1369 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1371 fix0 = _mm256_setzero_pd();
1372 fiy0 = _mm256_setzero_pd();
1373 fiz0 = _mm256_setzero_pd();
1374 fix1 = _mm256_setzero_pd();
1375 fiy1 = _mm256_setzero_pd();
1376 fiz1 = _mm256_setzero_pd();
1377 fix2 = _mm256_setzero_pd();
1378 fiy2 = _mm256_setzero_pd();
1379 fiz2 = _mm256_setzero_pd();
1381 /* Start inner kernel loop */
1382 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1385 /* Get j neighbor index, and coordinate index */
1387 jnrB = jjnr[jidx+1];
1388 jnrC = jjnr[jidx+2];
1389 jnrD = jjnr[jidx+3];
1390 j_coord_offsetA = DIM*jnrA;
1391 j_coord_offsetB = DIM*jnrB;
1392 j_coord_offsetC = DIM*jnrC;
1393 j_coord_offsetD = DIM*jnrD;
1395 /* load j atom coordinates */
1396 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1397 x+j_coord_offsetC,x+j_coord_offsetD,
1398 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1400 /* Calculate displacement vector */
1401 dx00 = _mm256_sub_pd(ix0,jx0);
1402 dy00 = _mm256_sub_pd(iy0,jy0);
1403 dz00 = _mm256_sub_pd(iz0,jz0);
1404 dx01 = _mm256_sub_pd(ix0,jx1);
1405 dy01 = _mm256_sub_pd(iy0,jy1);
1406 dz01 = _mm256_sub_pd(iz0,jz1);
1407 dx02 = _mm256_sub_pd(ix0,jx2);
1408 dy02 = _mm256_sub_pd(iy0,jy2);
1409 dz02 = _mm256_sub_pd(iz0,jz2);
1410 dx10 = _mm256_sub_pd(ix1,jx0);
1411 dy10 = _mm256_sub_pd(iy1,jy0);
1412 dz10 = _mm256_sub_pd(iz1,jz0);
1413 dx11 = _mm256_sub_pd(ix1,jx1);
1414 dy11 = _mm256_sub_pd(iy1,jy1);
1415 dz11 = _mm256_sub_pd(iz1,jz1);
1416 dx12 = _mm256_sub_pd(ix1,jx2);
1417 dy12 = _mm256_sub_pd(iy1,jy2);
1418 dz12 = _mm256_sub_pd(iz1,jz2);
1419 dx20 = _mm256_sub_pd(ix2,jx0);
1420 dy20 = _mm256_sub_pd(iy2,jy0);
1421 dz20 = _mm256_sub_pd(iz2,jz0);
1422 dx21 = _mm256_sub_pd(ix2,jx1);
1423 dy21 = _mm256_sub_pd(iy2,jy1);
1424 dz21 = _mm256_sub_pd(iz2,jz1);
1425 dx22 = _mm256_sub_pd(ix2,jx2);
1426 dy22 = _mm256_sub_pd(iy2,jy2);
1427 dz22 = _mm256_sub_pd(iz2,jz2);
1429 /* Calculate squared distance and things based on it */
1430 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1431 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1432 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1433 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1434 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1435 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1436 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1437 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1438 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1440 rinv00 = avx256_invsqrt_d(rsq00);
1441 rinv01 = avx256_invsqrt_d(rsq01);
1442 rinv02 = avx256_invsqrt_d(rsq02);
1443 rinv10 = avx256_invsqrt_d(rsq10);
1444 rinv11 = avx256_invsqrt_d(rsq11);
1445 rinv12 = avx256_invsqrt_d(rsq12);
1446 rinv20 = avx256_invsqrt_d(rsq20);
1447 rinv21 = avx256_invsqrt_d(rsq21);
1448 rinv22 = avx256_invsqrt_d(rsq22);
1450 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1451 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1452 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1453 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1454 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1455 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1456 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1457 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1458 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1460 fjx0 = _mm256_setzero_pd();
1461 fjy0 = _mm256_setzero_pd();
1462 fjz0 = _mm256_setzero_pd();
1463 fjx1 = _mm256_setzero_pd();
1464 fjy1 = _mm256_setzero_pd();
1465 fjz1 = _mm256_setzero_pd();
1466 fjx2 = _mm256_setzero_pd();
1467 fjy2 = _mm256_setzero_pd();
1468 fjz2 = _mm256_setzero_pd();
1470 /**************************
1471 * CALCULATE INTERACTIONS *
1472 **************************/
1474 r00 = _mm256_mul_pd(rsq00,rinv00);
1476 /* EWALD ELECTROSTATICS */
1478 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1479 ewrt = _mm256_mul_pd(r00,ewtabscale);
1480 ewitab = _mm256_cvttpd_epi32(ewrt);
1481 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1482 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1483 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1485 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1486 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1490 /* Calculate temporary vectorial force */
1491 tx = _mm256_mul_pd(fscal,dx00);
1492 ty = _mm256_mul_pd(fscal,dy00);
1493 tz = _mm256_mul_pd(fscal,dz00);
1495 /* Update vectorial force */
1496 fix0 = _mm256_add_pd(fix0,tx);
1497 fiy0 = _mm256_add_pd(fiy0,ty);
1498 fiz0 = _mm256_add_pd(fiz0,tz);
1500 fjx0 = _mm256_add_pd(fjx0,tx);
1501 fjy0 = _mm256_add_pd(fjy0,ty);
1502 fjz0 = _mm256_add_pd(fjz0,tz);
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 r01 = _mm256_mul_pd(rsq01,rinv01);
1510 /* EWALD ELECTROSTATICS */
1512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1513 ewrt = _mm256_mul_pd(r01,ewtabscale);
1514 ewitab = _mm256_cvttpd_epi32(ewrt);
1515 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1516 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1517 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1519 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1520 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1524 /* Calculate temporary vectorial force */
1525 tx = _mm256_mul_pd(fscal,dx01);
1526 ty = _mm256_mul_pd(fscal,dy01);
1527 tz = _mm256_mul_pd(fscal,dz01);
1529 /* Update vectorial force */
1530 fix0 = _mm256_add_pd(fix0,tx);
1531 fiy0 = _mm256_add_pd(fiy0,ty);
1532 fiz0 = _mm256_add_pd(fiz0,tz);
1534 fjx1 = _mm256_add_pd(fjx1,tx);
1535 fjy1 = _mm256_add_pd(fjy1,ty);
1536 fjz1 = _mm256_add_pd(fjz1,tz);
1538 /**************************
1539 * CALCULATE INTERACTIONS *
1540 **************************/
1542 r02 = _mm256_mul_pd(rsq02,rinv02);
1544 /* EWALD ELECTROSTATICS */
1546 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1547 ewrt = _mm256_mul_pd(r02,ewtabscale);
1548 ewitab = _mm256_cvttpd_epi32(ewrt);
1549 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1550 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1551 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1553 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1554 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1558 /* Calculate temporary vectorial force */
1559 tx = _mm256_mul_pd(fscal,dx02);
1560 ty = _mm256_mul_pd(fscal,dy02);
1561 tz = _mm256_mul_pd(fscal,dz02);
1563 /* Update vectorial force */
1564 fix0 = _mm256_add_pd(fix0,tx);
1565 fiy0 = _mm256_add_pd(fiy0,ty);
1566 fiz0 = _mm256_add_pd(fiz0,tz);
1568 fjx2 = _mm256_add_pd(fjx2,tx);
1569 fjy2 = _mm256_add_pd(fjy2,ty);
1570 fjz2 = _mm256_add_pd(fjz2,tz);
1572 /**************************
1573 * CALCULATE INTERACTIONS *
1574 **************************/
1576 r10 = _mm256_mul_pd(rsq10,rinv10);
1578 /* EWALD ELECTROSTATICS */
1580 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1581 ewrt = _mm256_mul_pd(r10,ewtabscale);
1582 ewitab = _mm256_cvttpd_epi32(ewrt);
1583 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1584 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1585 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1587 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1588 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1592 /* Calculate temporary vectorial force */
1593 tx = _mm256_mul_pd(fscal,dx10);
1594 ty = _mm256_mul_pd(fscal,dy10);
1595 tz = _mm256_mul_pd(fscal,dz10);
1597 /* Update vectorial force */
1598 fix1 = _mm256_add_pd(fix1,tx);
1599 fiy1 = _mm256_add_pd(fiy1,ty);
1600 fiz1 = _mm256_add_pd(fiz1,tz);
1602 fjx0 = _mm256_add_pd(fjx0,tx);
1603 fjy0 = _mm256_add_pd(fjy0,ty);
1604 fjz0 = _mm256_add_pd(fjz0,tz);
1606 /**************************
1607 * CALCULATE INTERACTIONS *
1608 **************************/
1610 r11 = _mm256_mul_pd(rsq11,rinv11);
1612 /* EWALD ELECTROSTATICS */
1614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1615 ewrt = _mm256_mul_pd(r11,ewtabscale);
1616 ewitab = _mm256_cvttpd_epi32(ewrt);
1617 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1618 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1619 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1621 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1622 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1626 /* Calculate temporary vectorial force */
1627 tx = _mm256_mul_pd(fscal,dx11);
1628 ty = _mm256_mul_pd(fscal,dy11);
1629 tz = _mm256_mul_pd(fscal,dz11);
1631 /* Update vectorial force */
1632 fix1 = _mm256_add_pd(fix1,tx);
1633 fiy1 = _mm256_add_pd(fiy1,ty);
1634 fiz1 = _mm256_add_pd(fiz1,tz);
1636 fjx1 = _mm256_add_pd(fjx1,tx);
1637 fjy1 = _mm256_add_pd(fjy1,ty);
1638 fjz1 = _mm256_add_pd(fjz1,tz);
1640 /**************************
1641 * CALCULATE INTERACTIONS *
1642 **************************/
1644 r12 = _mm256_mul_pd(rsq12,rinv12);
1646 /* EWALD ELECTROSTATICS */
1648 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1649 ewrt = _mm256_mul_pd(r12,ewtabscale);
1650 ewitab = _mm256_cvttpd_epi32(ewrt);
1651 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1652 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1653 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1655 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1656 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1660 /* Calculate temporary vectorial force */
1661 tx = _mm256_mul_pd(fscal,dx12);
1662 ty = _mm256_mul_pd(fscal,dy12);
1663 tz = _mm256_mul_pd(fscal,dz12);
1665 /* Update vectorial force */
1666 fix1 = _mm256_add_pd(fix1,tx);
1667 fiy1 = _mm256_add_pd(fiy1,ty);
1668 fiz1 = _mm256_add_pd(fiz1,tz);
1670 fjx2 = _mm256_add_pd(fjx2,tx);
1671 fjy2 = _mm256_add_pd(fjy2,ty);
1672 fjz2 = _mm256_add_pd(fjz2,tz);
1674 /**************************
1675 * CALCULATE INTERACTIONS *
1676 **************************/
1678 r20 = _mm256_mul_pd(rsq20,rinv20);
1680 /* EWALD ELECTROSTATICS */
1682 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683 ewrt = _mm256_mul_pd(r20,ewtabscale);
1684 ewitab = _mm256_cvttpd_epi32(ewrt);
1685 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1686 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1687 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1689 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1690 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1694 /* Calculate temporary vectorial force */
1695 tx = _mm256_mul_pd(fscal,dx20);
1696 ty = _mm256_mul_pd(fscal,dy20);
1697 tz = _mm256_mul_pd(fscal,dz20);
1699 /* Update vectorial force */
1700 fix2 = _mm256_add_pd(fix2,tx);
1701 fiy2 = _mm256_add_pd(fiy2,ty);
1702 fiz2 = _mm256_add_pd(fiz2,tz);
1704 fjx0 = _mm256_add_pd(fjx0,tx);
1705 fjy0 = _mm256_add_pd(fjy0,ty);
1706 fjz0 = _mm256_add_pd(fjz0,tz);
1708 /**************************
1709 * CALCULATE INTERACTIONS *
1710 **************************/
1712 r21 = _mm256_mul_pd(rsq21,rinv21);
1714 /* EWALD ELECTROSTATICS */
1716 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1717 ewrt = _mm256_mul_pd(r21,ewtabscale);
1718 ewitab = _mm256_cvttpd_epi32(ewrt);
1719 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1720 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1721 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1723 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1724 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1728 /* Calculate temporary vectorial force */
1729 tx = _mm256_mul_pd(fscal,dx21);
1730 ty = _mm256_mul_pd(fscal,dy21);
1731 tz = _mm256_mul_pd(fscal,dz21);
1733 /* Update vectorial force */
1734 fix2 = _mm256_add_pd(fix2,tx);
1735 fiy2 = _mm256_add_pd(fiy2,ty);
1736 fiz2 = _mm256_add_pd(fiz2,tz);
1738 fjx1 = _mm256_add_pd(fjx1,tx);
1739 fjy1 = _mm256_add_pd(fjy1,ty);
1740 fjz1 = _mm256_add_pd(fjz1,tz);
1742 /**************************
1743 * CALCULATE INTERACTIONS *
1744 **************************/
1746 r22 = _mm256_mul_pd(rsq22,rinv22);
1748 /* EWALD ELECTROSTATICS */
1750 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1751 ewrt = _mm256_mul_pd(r22,ewtabscale);
1752 ewitab = _mm256_cvttpd_epi32(ewrt);
1753 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1754 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1755 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1757 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1758 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1762 /* Calculate temporary vectorial force */
1763 tx = _mm256_mul_pd(fscal,dx22);
1764 ty = _mm256_mul_pd(fscal,dy22);
1765 tz = _mm256_mul_pd(fscal,dz22);
1767 /* Update vectorial force */
1768 fix2 = _mm256_add_pd(fix2,tx);
1769 fiy2 = _mm256_add_pd(fiy2,ty);
1770 fiz2 = _mm256_add_pd(fiz2,tz);
1772 fjx2 = _mm256_add_pd(fjx2,tx);
1773 fjy2 = _mm256_add_pd(fjy2,ty);
1774 fjz2 = _mm256_add_pd(fjz2,tz);
1776 fjptrA = f+j_coord_offsetA;
1777 fjptrB = f+j_coord_offsetB;
1778 fjptrC = f+j_coord_offsetC;
1779 fjptrD = f+j_coord_offsetD;
1781 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1782 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1784 /* Inner loop uses 324 flops */
1787 if(jidx<j_index_end)
1790 /* Get j neighbor index, and coordinate index */
1791 jnrlistA = jjnr[jidx];
1792 jnrlistB = jjnr[jidx+1];
1793 jnrlistC = jjnr[jidx+2];
1794 jnrlistD = jjnr[jidx+3];
1795 /* Sign of each element will be negative for non-real atoms.
1796 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1797 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1799 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1801 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1802 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1803 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1805 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1806 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1807 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1808 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1809 j_coord_offsetA = DIM*jnrA;
1810 j_coord_offsetB = DIM*jnrB;
1811 j_coord_offsetC = DIM*jnrC;
1812 j_coord_offsetD = DIM*jnrD;
1814 /* load j atom coordinates */
1815 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1816 x+j_coord_offsetC,x+j_coord_offsetD,
1817 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1819 /* Calculate displacement vector */
1820 dx00 = _mm256_sub_pd(ix0,jx0);
1821 dy00 = _mm256_sub_pd(iy0,jy0);
1822 dz00 = _mm256_sub_pd(iz0,jz0);
1823 dx01 = _mm256_sub_pd(ix0,jx1);
1824 dy01 = _mm256_sub_pd(iy0,jy1);
1825 dz01 = _mm256_sub_pd(iz0,jz1);
1826 dx02 = _mm256_sub_pd(ix0,jx2);
1827 dy02 = _mm256_sub_pd(iy0,jy2);
1828 dz02 = _mm256_sub_pd(iz0,jz2);
1829 dx10 = _mm256_sub_pd(ix1,jx0);
1830 dy10 = _mm256_sub_pd(iy1,jy0);
1831 dz10 = _mm256_sub_pd(iz1,jz0);
1832 dx11 = _mm256_sub_pd(ix1,jx1);
1833 dy11 = _mm256_sub_pd(iy1,jy1);
1834 dz11 = _mm256_sub_pd(iz1,jz1);
1835 dx12 = _mm256_sub_pd(ix1,jx2);
1836 dy12 = _mm256_sub_pd(iy1,jy2);
1837 dz12 = _mm256_sub_pd(iz1,jz2);
1838 dx20 = _mm256_sub_pd(ix2,jx0);
1839 dy20 = _mm256_sub_pd(iy2,jy0);
1840 dz20 = _mm256_sub_pd(iz2,jz0);
1841 dx21 = _mm256_sub_pd(ix2,jx1);
1842 dy21 = _mm256_sub_pd(iy2,jy1);
1843 dz21 = _mm256_sub_pd(iz2,jz1);
1844 dx22 = _mm256_sub_pd(ix2,jx2);
1845 dy22 = _mm256_sub_pd(iy2,jy2);
1846 dz22 = _mm256_sub_pd(iz2,jz2);
1848 /* Calculate squared distance and things based on it */
1849 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1850 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1851 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1852 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1853 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1854 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1855 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1856 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1857 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1859 rinv00 = avx256_invsqrt_d(rsq00);
1860 rinv01 = avx256_invsqrt_d(rsq01);
1861 rinv02 = avx256_invsqrt_d(rsq02);
1862 rinv10 = avx256_invsqrt_d(rsq10);
1863 rinv11 = avx256_invsqrt_d(rsq11);
1864 rinv12 = avx256_invsqrt_d(rsq12);
1865 rinv20 = avx256_invsqrt_d(rsq20);
1866 rinv21 = avx256_invsqrt_d(rsq21);
1867 rinv22 = avx256_invsqrt_d(rsq22);
1869 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1870 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1871 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1872 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1873 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1874 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1875 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1876 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1877 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1879 fjx0 = _mm256_setzero_pd();
1880 fjy0 = _mm256_setzero_pd();
1881 fjz0 = _mm256_setzero_pd();
1882 fjx1 = _mm256_setzero_pd();
1883 fjy1 = _mm256_setzero_pd();
1884 fjz1 = _mm256_setzero_pd();
1885 fjx2 = _mm256_setzero_pd();
1886 fjy2 = _mm256_setzero_pd();
1887 fjz2 = _mm256_setzero_pd();
1889 /**************************
1890 * CALCULATE INTERACTIONS *
1891 **************************/
1893 r00 = _mm256_mul_pd(rsq00,rinv00);
1894 r00 = _mm256_andnot_pd(dummy_mask,r00);
1896 /* EWALD ELECTROSTATICS */
1898 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1899 ewrt = _mm256_mul_pd(r00,ewtabscale);
1900 ewitab = _mm256_cvttpd_epi32(ewrt);
1901 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1902 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1903 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1905 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1906 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1910 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1912 /* Calculate temporary vectorial force */
1913 tx = _mm256_mul_pd(fscal,dx00);
1914 ty = _mm256_mul_pd(fscal,dy00);
1915 tz = _mm256_mul_pd(fscal,dz00);
1917 /* Update vectorial force */
1918 fix0 = _mm256_add_pd(fix0,tx);
1919 fiy0 = _mm256_add_pd(fiy0,ty);
1920 fiz0 = _mm256_add_pd(fiz0,tz);
1922 fjx0 = _mm256_add_pd(fjx0,tx);
1923 fjy0 = _mm256_add_pd(fjy0,ty);
1924 fjz0 = _mm256_add_pd(fjz0,tz);
1926 /**************************
1927 * CALCULATE INTERACTIONS *
1928 **************************/
1930 r01 = _mm256_mul_pd(rsq01,rinv01);
1931 r01 = _mm256_andnot_pd(dummy_mask,r01);
1933 /* EWALD ELECTROSTATICS */
1935 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1936 ewrt = _mm256_mul_pd(r01,ewtabscale);
1937 ewitab = _mm256_cvttpd_epi32(ewrt);
1938 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1939 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1940 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1942 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1943 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1947 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1949 /* Calculate temporary vectorial force */
1950 tx = _mm256_mul_pd(fscal,dx01);
1951 ty = _mm256_mul_pd(fscal,dy01);
1952 tz = _mm256_mul_pd(fscal,dz01);
1954 /* Update vectorial force */
1955 fix0 = _mm256_add_pd(fix0,tx);
1956 fiy0 = _mm256_add_pd(fiy0,ty);
1957 fiz0 = _mm256_add_pd(fiz0,tz);
1959 fjx1 = _mm256_add_pd(fjx1,tx);
1960 fjy1 = _mm256_add_pd(fjy1,ty);
1961 fjz1 = _mm256_add_pd(fjz1,tz);
1963 /**************************
1964 * CALCULATE INTERACTIONS *
1965 **************************/
1967 r02 = _mm256_mul_pd(rsq02,rinv02);
1968 r02 = _mm256_andnot_pd(dummy_mask,r02);
1970 /* EWALD ELECTROSTATICS */
1972 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1973 ewrt = _mm256_mul_pd(r02,ewtabscale);
1974 ewitab = _mm256_cvttpd_epi32(ewrt);
1975 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1976 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1977 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1979 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1980 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1984 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1986 /* Calculate temporary vectorial force */
1987 tx = _mm256_mul_pd(fscal,dx02);
1988 ty = _mm256_mul_pd(fscal,dy02);
1989 tz = _mm256_mul_pd(fscal,dz02);
1991 /* Update vectorial force */
1992 fix0 = _mm256_add_pd(fix0,tx);
1993 fiy0 = _mm256_add_pd(fiy0,ty);
1994 fiz0 = _mm256_add_pd(fiz0,tz);
1996 fjx2 = _mm256_add_pd(fjx2,tx);
1997 fjy2 = _mm256_add_pd(fjy2,ty);
1998 fjz2 = _mm256_add_pd(fjz2,tz);
2000 /**************************
2001 * CALCULATE INTERACTIONS *
2002 **************************/
2004 r10 = _mm256_mul_pd(rsq10,rinv10);
2005 r10 = _mm256_andnot_pd(dummy_mask,r10);
2007 /* EWALD ELECTROSTATICS */
2009 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2010 ewrt = _mm256_mul_pd(r10,ewtabscale);
2011 ewitab = _mm256_cvttpd_epi32(ewrt);
2012 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2013 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2014 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2016 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2017 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2021 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2023 /* Calculate temporary vectorial force */
2024 tx = _mm256_mul_pd(fscal,dx10);
2025 ty = _mm256_mul_pd(fscal,dy10);
2026 tz = _mm256_mul_pd(fscal,dz10);
2028 /* Update vectorial force */
2029 fix1 = _mm256_add_pd(fix1,tx);
2030 fiy1 = _mm256_add_pd(fiy1,ty);
2031 fiz1 = _mm256_add_pd(fiz1,tz);
2033 fjx0 = _mm256_add_pd(fjx0,tx);
2034 fjy0 = _mm256_add_pd(fjy0,ty);
2035 fjz0 = _mm256_add_pd(fjz0,tz);
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 r11 = _mm256_mul_pd(rsq11,rinv11);
2042 r11 = _mm256_andnot_pd(dummy_mask,r11);
2044 /* EWALD ELECTROSTATICS */
2046 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2047 ewrt = _mm256_mul_pd(r11,ewtabscale);
2048 ewitab = _mm256_cvttpd_epi32(ewrt);
2049 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2050 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2051 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2053 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2054 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2058 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2060 /* Calculate temporary vectorial force */
2061 tx = _mm256_mul_pd(fscal,dx11);
2062 ty = _mm256_mul_pd(fscal,dy11);
2063 tz = _mm256_mul_pd(fscal,dz11);
2065 /* Update vectorial force */
2066 fix1 = _mm256_add_pd(fix1,tx);
2067 fiy1 = _mm256_add_pd(fiy1,ty);
2068 fiz1 = _mm256_add_pd(fiz1,tz);
2070 fjx1 = _mm256_add_pd(fjx1,tx);
2071 fjy1 = _mm256_add_pd(fjy1,ty);
2072 fjz1 = _mm256_add_pd(fjz1,tz);
2074 /**************************
2075 * CALCULATE INTERACTIONS *
2076 **************************/
2078 r12 = _mm256_mul_pd(rsq12,rinv12);
2079 r12 = _mm256_andnot_pd(dummy_mask,r12);
2081 /* EWALD ELECTROSTATICS */
2083 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2084 ewrt = _mm256_mul_pd(r12,ewtabscale);
2085 ewitab = _mm256_cvttpd_epi32(ewrt);
2086 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2087 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2088 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2090 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2091 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2095 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2097 /* Calculate temporary vectorial force */
2098 tx = _mm256_mul_pd(fscal,dx12);
2099 ty = _mm256_mul_pd(fscal,dy12);
2100 tz = _mm256_mul_pd(fscal,dz12);
2102 /* Update vectorial force */
2103 fix1 = _mm256_add_pd(fix1,tx);
2104 fiy1 = _mm256_add_pd(fiy1,ty);
2105 fiz1 = _mm256_add_pd(fiz1,tz);
2107 fjx2 = _mm256_add_pd(fjx2,tx);
2108 fjy2 = _mm256_add_pd(fjy2,ty);
2109 fjz2 = _mm256_add_pd(fjz2,tz);
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2115 r20 = _mm256_mul_pd(rsq20,rinv20);
2116 r20 = _mm256_andnot_pd(dummy_mask,r20);
2118 /* EWALD ELECTROSTATICS */
2120 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2121 ewrt = _mm256_mul_pd(r20,ewtabscale);
2122 ewitab = _mm256_cvttpd_epi32(ewrt);
2123 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2124 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2125 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2127 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2128 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2132 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2134 /* Calculate temporary vectorial force */
2135 tx = _mm256_mul_pd(fscal,dx20);
2136 ty = _mm256_mul_pd(fscal,dy20);
2137 tz = _mm256_mul_pd(fscal,dz20);
2139 /* Update vectorial force */
2140 fix2 = _mm256_add_pd(fix2,tx);
2141 fiy2 = _mm256_add_pd(fiy2,ty);
2142 fiz2 = _mm256_add_pd(fiz2,tz);
2144 fjx0 = _mm256_add_pd(fjx0,tx);
2145 fjy0 = _mm256_add_pd(fjy0,ty);
2146 fjz0 = _mm256_add_pd(fjz0,tz);
2148 /**************************
2149 * CALCULATE INTERACTIONS *
2150 **************************/
2152 r21 = _mm256_mul_pd(rsq21,rinv21);
2153 r21 = _mm256_andnot_pd(dummy_mask,r21);
2155 /* EWALD ELECTROSTATICS */
2157 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2158 ewrt = _mm256_mul_pd(r21,ewtabscale);
2159 ewitab = _mm256_cvttpd_epi32(ewrt);
2160 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2161 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2162 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2164 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2165 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2169 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2171 /* Calculate temporary vectorial force */
2172 tx = _mm256_mul_pd(fscal,dx21);
2173 ty = _mm256_mul_pd(fscal,dy21);
2174 tz = _mm256_mul_pd(fscal,dz21);
2176 /* Update vectorial force */
2177 fix2 = _mm256_add_pd(fix2,tx);
2178 fiy2 = _mm256_add_pd(fiy2,ty);
2179 fiz2 = _mm256_add_pd(fiz2,tz);
2181 fjx1 = _mm256_add_pd(fjx1,tx);
2182 fjy1 = _mm256_add_pd(fjy1,ty);
2183 fjz1 = _mm256_add_pd(fjz1,tz);
2185 /**************************
2186 * CALCULATE INTERACTIONS *
2187 **************************/
2189 r22 = _mm256_mul_pd(rsq22,rinv22);
2190 r22 = _mm256_andnot_pd(dummy_mask,r22);
2192 /* EWALD ELECTROSTATICS */
2194 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2195 ewrt = _mm256_mul_pd(r22,ewtabscale);
2196 ewitab = _mm256_cvttpd_epi32(ewrt);
2197 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2198 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2199 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2201 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2202 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2206 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2208 /* Calculate temporary vectorial force */
2209 tx = _mm256_mul_pd(fscal,dx22);
2210 ty = _mm256_mul_pd(fscal,dy22);
2211 tz = _mm256_mul_pd(fscal,dz22);
2213 /* Update vectorial force */
2214 fix2 = _mm256_add_pd(fix2,tx);
2215 fiy2 = _mm256_add_pd(fiy2,ty);
2216 fiz2 = _mm256_add_pd(fiz2,tz);
2218 fjx2 = _mm256_add_pd(fjx2,tx);
2219 fjy2 = _mm256_add_pd(fjy2,ty);
2220 fjz2 = _mm256_add_pd(fjz2,tz);
2222 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2223 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2224 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2225 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2227 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2228 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2230 /* Inner loop uses 333 flops */
2233 /* End of innermost loop */
2235 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2236 f+i_coord_offset,fshift+i_shift_offset);
2238 /* Increment number of inner iterations */
2239 inneriter += j_index_end - j_index_start;
2241 /* Outer loop uses 18 flops */
2244 /* Increment number of outer iterations */
2247 /* Update outer/inner flops */
2249 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*333);