2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr1;
71 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
72 real * vdwioffsetptr2;
73 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
74 real * vdwioffsetptr3;
75 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
85 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
88 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
89 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
90 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
95 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
97 __m256d dummy_mask,cutoff_mask;
98 __m128 tmpmask0,tmpmask1;
99 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
100 __m256d one = _mm256_set1_pd(1.0);
101 __m256d two = _mm256_set1_pd(2.0);
107 jindex = nlist->jindex;
109 shiftidx = nlist->shift;
111 shiftvec = fr->shift_vec[0];
112 fshift = fr->fshift[0];
113 facel = _mm256_set1_pd(fr->epsfac);
114 charge = mdatoms->chargeA;
116 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
117 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
118 beta2 = _mm256_mul_pd(beta,beta);
119 beta3 = _mm256_mul_pd(beta,beta2);
121 ewtab = fr->ic->tabq_coul_FDV0;
122 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
123 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
125 /* Setup water-specific parameters */
126 inr = nlist->iinr[0];
127 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
128 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
129 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
131 jq1 = _mm256_set1_pd(charge[inr+1]);
132 jq2 = _mm256_set1_pd(charge[inr+2]);
133 jq3 = _mm256_set1_pd(charge[inr+3]);
134 qq11 = _mm256_mul_pd(iq1,jq1);
135 qq12 = _mm256_mul_pd(iq1,jq2);
136 qq13 = _mm256_mul_pd(iq1,jq3);
137 qq21 = _mm256_mul_pd(iq2,jq1);
138 qq22 = _mm256_mul_pd(iq2,jq2);
139 qq23 = _mm256_mul_pd(iq2,jq3);
140 qq31 = _mm256_mul_pd(iq3,jq1);
141 qq32 = _mm256_mul_pd(iq3,jq2);
142 qq33 = _mm256_mul_pd(iq3,jq3);
144 /* Zero-initialize the j indices so the compiler does not warn about possibly uninitialized use */
145 jnrA = jnrB = jnrC = jnrD = 0;
154 for(iidx=0;iidx<4*DIM;iidx++)
159 /* Start outer loop over neighborlists */
160 for(iidx=0; iidx<nri; iidx++)
162 /* Load shift vector for this list */
163 i_shift_offset = DIM*shiftidx[iidx];
165 /* Load limits for loop over neighbors */
166 j_index_start = jindex[iidx];
167 j_index_end = jindex[iidx+1];
169 /* Get outer coordinate index */
171 i_coord_offset = DIM*inr;
173 /* Load i particle coords and add shift vector */
174 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
175 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
177 fix1 = _mm256_setzero_pd();
178 fiy1 = _mm256_setzero_pd();
179 fiz1 = _mm256_setzero_pd();
180 fix2 = _mm256_setzero_pd();
181 fiy2 = _mm256_setzero_pd();
182 fiz2 = _mm256_setzero_pd();
183 fix3 = _mm256_setzero_pd();
184 fiy3 = _mm256_setzero_pd();
185 fiz3 = _mm256_setzero_pd();
187 /* Reset potential sums */
188 velecsum = _mm256_setzero_pd();
190 /* Start inner kernel loop */
191 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
194 /* Get j neighbor index, and coordinate index */
199 j_coord_offsetA = DIM*jnrA;
200 j_coord_offsetB = DIM*jnrB;
201 j_coord_offsetC = DIM*jnrC;
202 j_coord_offsetD = DIM*jnrD;
204 /* load j atom coordinates */
205 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
206 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
207 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
209 /* Calculate displacement vector */
210 dx11 = _mm256_sub_pd(ix1,jx1);
211 dy11 = _mm256_sub_pd(iy1,jy1);
212 dz11 = _mm256_sub_pd(iz1,jz1);
213 dx12 = _mm256_sub_pd(ix1,jx2);
214 dy12 = _mm256_sub_pd(iy1,jy2);
215 dz12 = _mm256_sub_pd(iz1,jz2);
216 dx13 = _mm256_sub_pd(ix1,jx3);
217 dy13 = _mm256_sub_pd(iy1,jy3);
218 dz13 = _mm256_sub_pd(iz1,jz3);
219 dx21 = _mm256_sub_pd(ix2,jx1);
220 dy21 = _mm256_sub_pd(iy2,jy1);
221 dz21 = _mm256_sub_pd(iz2,jz1);
222 dx22 = _mm256_sub_pd(ix2,jx2);
223 dy22 = _mm256_sub_pd(iy2,jy2);
224 dz22 = _mm256_sub_pd(iz2,jz2);
225 dx23 = _mm256_sub_pd(ix2,jx3);
226 dy23 = _mm256_sub_pd(iy2,jy3);
227 dz23 = _mm256_sub_pd(iz2,jz3);
228 dx31 = _mm256_sub_pd(ix3,jx1);
229 dy31 = _mm256_sub_pd(iy3,jy1);
230 dz31 = _mm256_sub_pd(iz3,jz1);
231 dx32 = _mm256_sub_pd(ix3,jx2);
232 dy32 = _mm256_sub_pd(iy3,jy2);
233 dz32 = _mm256_sub_pd(iz3,jz2);
234 dx33 = _mm256_sub_pd(ix3,jx3);
235 dy33 = _mm256_sub_pd(iy3,jy3);
236 dz33 = _mm256_sub_pd(iz3,jz3);
238 /* Calculate squared distance and things based on it */
239 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
240 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
241 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
242 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
243 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
244 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
245 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
246 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
247 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
249 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
250 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
251 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
252 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
253 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
254 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
255 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
256 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
257 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
259 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
260 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
261 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
262 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
263 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
264 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
265 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
266 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
267 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
269 fjx1 = _mm256_setzero_pd();
270 fjy1 = _mm256_setzero_pd();
271 fjz1 = _mm256_setzero_pd();
272 fjx2 = _mm256_setzero_pd();
273 fjy2 = _mm256_setzero_pd();
274 fjz2 = _mm256_setzero_pd();
275 fjx3 = _mm256_setzero_pd();
276 fjy3 = _mm256_setzero_pd();
277 fjz3 = _mm256_setzero_pd();
279 /**************************
280 * CALCULATE INTERACTIONS *
281 **************************/
283 r11 = _mm256_mul_pd(rsq11,rinv11);
285 /* EWALD ELECTROSTATICS */
287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
288 ewrt = _mm256_mul_pd(r11,ewtabscale);
289 ewitab = _mm256_cvttpd_epi32(ewrt);
290 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
291 ewitab = _mm_slli_epi32(ewitab,2);
292 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
293 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
294 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
295 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
296 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
297 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
298 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
299 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
300 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
302 /* Update potential sum for this i atom from the interaction with this j atom. */
303 velecsum = _mm256_add_pd(velecsum,velec);
307 /* Calculate temporary vectorial force */
308 tx = _mm256_mul_pd(fscal,dx11);
309 ty = _mm256_mul_pd(fscal,dy11);
310 tz = _mm256_mul_pd(fscal,dz11);
312 /* Update vectorial force */
313 fix1 = _mm256_add_pd(fix1,tx);
314 fiy1 = _mm256_add_pd(fiy1,ty);
315 fiz1 = _mm256_add_pd(fiz1,tz);
317 fjx1 = _mm256_add_pd(fjx1,tx);
318 fjy1 = _mm256_add_pd(fjy1,ty);
319 fjz1 = _mm256_add_pd(fjz1,tz);
321 /**************************
322 * CALCULATE INTERACTIONS *
323 **************************/
325 r12 = _mm256_mul_pd(rsq12,rinv12);
327 /* EWALD ELECTROSTATICS */
329 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
330 ewrt = _mm256_mul_pd(r12,ewtabscale);
331 ewitab = _mm256_cvttpd_epi32(ewrt);
332 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
333 ewitab = _mm_slli_epi32(ewitab,2);
334 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
335 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
336 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
337 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
338 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
339 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
340 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
341 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
342 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
344 /* Update potential sum for this i atom from the interaction with this j atom. */
345 velecsum = _mm256_add_pd(velecsum,velec);
349 /* Calculate temporary vectorial force */
350 tx = _mm256_mul_pd(fscal,dx12);
351 ty = _mm256_mul_pd(fscal,dy12);
352 tz = _mm256_mul_pd(fscal,dz12);
354 /* Update vectorial force */
355 fix1 = _mm256_add_pd(fix1,tx);
356 fiy1 = _mm256_add_pd(fiy1,ty);
357 fiz1 = _mm256_add_pd(fiz1,tz);
359 fjx2 = _mm256_add_pd(fjx2,tx);
360 fjy2 = _mm256_add_pd(fjy2,ty);
361 fjz2 = _mm256_add_pd(fjz2,tz);
363 /**************************
364 * CALCULATE INTERACTIONS *
365 **************************/
367 r13 = _mm256_mul_pd(rsq13,rinv13);
369 /* EWALD ELECTROSTATICS */
371 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
372 ewrt = _mm256_mul_pd(r13,ewtabscale);
373 ewitab = _mm256_cvttpd_epi32(ewrt);
374 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
375 ewitab = _mm_slli_epi32(ewitab,2);
376 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
377 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
378 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
379 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
380 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
381 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
382 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
383 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
384 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
386 /* Update potential sum for this i atom from the interaction with this j atom. */
387 velecsum = _mm256_add_pd(velecsum,velec);
391 /* Calculate temporary vectorial force */
392 tx = _mm256_mul_pd(fscal,dx13);
393 ty = _mm256_mul_pd(fscal,dy13);
394 tz = _mm256_mul_pd(fscal,dz13);
396 /* Update vectorial force */
397 fix1 = _mm256_add_pd(fix1,tx);
398 fiy1 = _mm256_add_pd(fiy1,ty);
399 fiz1 = _mm256_add_pd(fiz1,tz);
401 fjx3 = _mm256_add_pd(fjx3,tx);
402 fjy3 = _mm256_add_pd(fjy3,ty);
403 fjz3 = _mm256_add_pd(fjz3,tz);
405 /**************************
406 * CALCULATE INTERACTIONS *
407 **************************/
409 r21 = _mm256_mul_pd(rsq21,rinv21);
411 /* EWALD ELECTROSTATICS */
413 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
414 ewrt = _mm256_mul_pd(r21,ewtabscale);
415 ewitab = _mm256_cvttpd_epi32(ewrt);
416 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
417 ewitab = _mm_slli_epi32(ewitab,2);
418 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
419 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
420 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
421 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
422 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
423 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
424 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
425 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
426 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
428 /* Update potential sum for this i atom from the interaction with this j atom. */
429 velecsum = _mm256_add_pd(velecsum,velec);
433 /* Calculate temporary vectorial force */
434 tx = _mm256_mul_pd(fscal,dx21);
435 ty = _mm256_mul_pd(fscal,dy21);
436 tz = _mm256_mul_pd(fscal,dz21);
438 /* Update vectorial force */
439 fix2 = _mm256_add_pd(fix2,tx);
440 fiy2 = _mm256_add_pd(fiy2,ty);
441 fiz2 = _mm256_add_pd(fiz2,tz);
443 fjx1 = _mm256_add_pd(fjx1,tx);
444 fjy1 = _mm256_add_pd(fjy1,ty);
445 fjz1 = _mm256_add_pd(fjz1,tz);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 r22 = _mm256_mul_pd(rsq22,rinv22);
453 /* EWALD ELECTROSTATICS */
455 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
456 ewrt = _mm256_mul_pd(r22,ewtabscale);
457 ewitab = _mm256_cvttpd_epi32(ewrt);
458 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
459 ewitab = _mm_slli_epi32(ewitab,2);
460 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
461 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
462 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
463 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
464 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
465 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
466 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
467 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
468 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velecsum = _mm256_add_pd(velecsum,velec);
475 /* Calculate temporary vectorial force */
476 tx = _mm256_mul_pd(fscal,dx22);
477 ty = _mm256_mul_pd(fscal,dy22);
478 tz = _mm256_mul_pd(fscal,dz22);
480 /* Update vectorial force */
481 fix2 = _mm256_add_pd(fix2,tx);
482 fiy2 = _mm256_add_pd(fiy2,ty);
483 fiz2 = _mm256_add_pd(fiz2,tz);
485 fjx2 = _mm256_add_pd(fjx2,tx);
486 fjy2 = _mm256_add_pd(fjy2,ty);
487 fjz2 = _mm256_add_pd(fjz2,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 r23 = _mm256_mul_pd(rsq23,rinv23);
495 /* EWALD ELECTROSTATICS */
497 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
498 ewrt = _mm256_mul_pd(r23,ewtabscale);
499 ewitab = _mm256_cvttpd_epi32(ewrt);
500 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
501 ewitab = _mm_slli_epi32(ewitab,2);
502 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
503 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
504 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
505 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
506 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
507 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
508 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
509 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
510 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
512 /* Update potential sum for this i atom from the interaction with this j atom. */
513 velecsum = _mm256_add_pd(velecsum,velec);
517 /* Calculate temporary vectorial force */
518 tx = _mm256_mul_pd(fscal,dx23);
519 ty = _mm256_mul_pd(fscal,dy23);
520 tz = _mm256_mul_pd(fscal,dz23);
522 /* Update vectorial force */
523 fix2 = _mm256_add_pd(fix2,tx);
524 fiy2 = _mm256_add_pd(fiy2,ty);
525 fiz2 = _mm256_add_pd(fiz2,tz);
527 fjx3 = _mm256_add_pd(fjx3,tx);
528 fjy3 = _mm256_add_pd(fjy3,ty);
529 fjz3 = _mm256_add_pd(fjz3,tz);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 r31 = _mm256_mul_pd(rsq31,rinv31);
537 /* EWALD ELECTROSTATICS */
539 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
540 ewrt = _mm256_mul_pd(r31,ewtabscale);
541 ewitab = _mm256_cvttpd_epi32(ewrt);
542 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
543 ewitab = _mm_slli_epi32(ewitab,2);
544 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
545 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
546 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
547 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
548 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
549 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
550 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
551 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
552 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
554 /* Update potential sum for this i atom from the interaction with this j atom. */
555 velecsum = _mm256_add_pd(velecsum,velec);
559 /* Calculate temporary vectorial force */
560 tx = _mm256_mul_pd(fscal,dx31);
561 ty = _mm256_mul_pd(fscal,dy31);
562 tz = _mm256_mul_pd(fscal,dz31);
564 /* Update vectorial force */
565 fix3 = _mm256_add_pd(fix3,tx);
566 fiy3 = _mm256_add_pd(fiy3,ty);
567 fiz3 = _mm256_add_pd(fiz3,tz);
569 fjx1 = _mm256_add_pd(fjx1,tx);
570 fjy1 = _mm256_add_pd(fjy1,ty);
571 fjz1 = _mm256_add_pd(fjz1,tz);
573 /**************************
574 * CALCULATE INTERACTIONS *
575 **************************/
577 r32 = _mm256_mul_pd(rsq32,rinv32);
579 /* EWALD ELECTROSTATICS */
581 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
582 ewrt = _mm256_mul_pd(r32,ewtabscale);
583 ewitab = _mm256_cvttpd_epi32(ewrt);
584 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
585 ewitab = _mm_slli_epi32(ewitab,2);
586 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
587 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
588 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
589 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
590 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
591 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
592 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
593 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
594 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
596 /* Update potential sum for this i atom from the interaction with this j atom. */
597 velecsum = _mm256_add_pd(velecsum,velec);
601 /* Calculate temporary vectorial force */
602 tx = _mm256_mul_pd(fscal,dx32);
603 ty = _mm256_mul_pd(fscal,dy32);
604 tz = _mm256_mul_pd(fscal,dz32);
606 /* Update vectorial force */
607 fix3 = _mm256_add_pd(fix3,tx);
608 fiy3 = _mm256_add_pd(fiy3,ty);
609 fiz3 = _mm256_add_pd(fiz3,tz);
611 fjx2 = _mm256_add_pd(fjx2,tx);
612 fjy2 = _mm256_add_pd(fjy2,ty);
613 fjz2 = _mm256_add_pd(fjz2,tz);
615 /**************************
616 * CALCULATE INTERACTIONS *
617 **************************/
619 r33 = _mm256_mul_pd(rsq33,rinv33);
621 /* EWALD ELECTROSTATICS */
623 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
624 ewrt = _mm256_mul_pd(r33,ewtabscale);
625 ewitab = _mm256_cvttpd_epi32(ewrt);
626 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
627 ewitab = _mm_slli_epi32(ewitab,2);
628 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
629 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
630 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
631 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
632 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
633 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
634 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
635 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
636 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velecsum = _mm256_add_pd(velecsum,velec);
643 /* Calculate temporary vectorial force */
644 tx = _mm256_mul_pd(fscal,dx33);
645 ty = _mm256_mul_pd(fscal,dy33);
646 tz = _mm256_mul_pd(fscal,dz33);
648 /* Update vectorial force */
649 fix3 = _mm256_add_pd(fix3,tx);
650 fiy3 = _mm256_add_pd(fiy3,ty);
651 fiz3 = _mm256_add_pd(fiz3,tz);
653 fjx3 = _mm256_add_pd(fjx3,tx);
654 fjy3 = _mm256_add_pd(fjy3,ty);
655 fjz3 = _mm256_add_pd(fjz3,tz);
657 fjptrA = f+j_coord_offsetA;
658 fjptrB = f+j_coord_offsetB;
659 fjptrC = f+j_coord_offsetC;
660 fjptrD = f+j_coord_offsetD;
662 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
663 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
665 /* Inner loop uses 369 flops */
671 /* Get j neighbor index, and coordinate index */
672 jnrlistA = jjnr[jidx];
673 jnrlistB = jjnr[jidx+1];
674 jnrlistC = jjnr[jidx+2];
675 jnrlistD = jjnr[jidx+3];
676 /* Sign of each element will be negative for non-real atoms.
677 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
678 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
680 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
682 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
683 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
684 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
686 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
687 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
688 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
689 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
690 j_coord_offsetA = DIM*jnrA;
691 j_coord_offsetB = DIM*jnrB;
692 j_coord_offsetC = DIM*jnrC;
693 j_coord_offsetD = DIM*jnrD;
695 /* load j atom coordinates */
696 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
697 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
698 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
700 /* Calculate displacement vector */
701 dx11 = _mm256_sub_pd(ix1,jx1);
702 dy11 = _mm256_sub_pd(iy1,jy1);
703 dz11 = _mm256_sub_pd(iz1,jz1);
704 dx12 = _mm256_sub_pd(ix1,jx2);
705 dy12 = _mm256_sub_pd(iy1,jy2);
706 dz12 = _mm256_sub_pd(iz1,jz2);
707 dx13 = _mm256_sub_pd(ix1,jx3);
708 dy13 = _mm256_sub_pd(iy1,jy3);
709 dz13 = _mm256_sub_pd(iz1,jz3);
710 dx21 = _mm256_sub_pd(ix2,jx1);
711 dy21 = _mm256_sub_pd(iy2,jy1);
712 dz21 = _mm256_sub_pd(iz2,jz1);
713 dx22 = _mm256_sub_pd(ix2,jx2);
714 dy22 = _mm256_sub_pd(iy2,jy2);
715 dz22 = _mm256_sub_pd(iz2,jz2);
716 dx23 = _mm256_sub_pd(ix2,jx3);
717 dy23 = _mm256_sub_pd(iy2,jy3);
718 dz23 = _mm256_sub_pd(iz2,jz3);
719 dx31 = _mm256_sub_pd(ix3,jx1);
720 dy31 = _mm256_sub_pd(iy3,jy1);
721 dz31 = _mm256_sub_pd(iz3,jz1);
722 dx32 = _mm256_sub_pd(ix3,jx2);
723 dy32 = _mm256_sub_pd(iy3,jy2);
724 dz32 = _mm256_sub_pd(iz3,jz2);
725 dx33 = _mm256_sub_pd(ix3,jx3);
726 dy33 = _mm256_sub_pd(iy3,jy3);
727 dz33 = _mm256_sub_pd(iz3,jz3);
729 /* Calculate squared distance and things based on it */
730 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
731 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
732 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
733 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
734 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
735 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
736 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
737 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
738 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
740 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
741 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
742 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
743 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
744 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
745 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
746 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
747 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
748 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
750 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
751 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
752 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
753 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
754 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
755 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
756 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
757 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
758 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
760 fjx1 = _mm256_setzero_pd();
761 fjy1 = _mm256_setzero_pd();
762 fjz1 = _mm256_setzero_pd();
763 fjx2 = _mm256_setzero_pd();
764 fjy2 = _mm256_setzero_pd();
765 fjz2 = _mm256_setzero_pd();
766 fjx3 = _mm256_setzero_pd();
767 fjy3 = _mm256_setzero_pd();
768 fjz3 = _mm256_setzero_pd();
770 /**************************
771 * CALCULATE INTERACTIONS *
772 **************************/
774 r11 = _mm256_mul_pd(rsq11,rinv11);
775 r11 = _mm256_andnot_pd(dummy_mask,r11);
777 /* EWALD ELECTROSTATICS */
779 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
780 ewrt = _mm256_mul_pd(r11,ewtabscale);
781 ewitab = _mm256_cvttpd_epi32(ewrt);
782 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
783 ewitab = _mm_slli_epi32(ewitab,2);
784 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
785 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
786 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
787 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
788 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
789 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
790 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
791 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
792 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
794 /* Update potential sum for this i atom from the interaction with this j atom. */
795 velec = _mm256_andnot_pd(dummy_mask,velec);
796 velecsum = _mm256_add_pd(velecsum,velec);
800 fscal = _mm256_andnot_pd(dummy_mask,fscal);
802 /* Calculate temporary vectorial force */
803 tx = _mm256_mul_pd(fscal,dx11);
804 ty = _mm256_mul_pd(fscal,dy11);
805 tz = _mm256_mul_pd(fscal,dz11);
807 /* Update vectorial force */
808 fix1 = _mm256_add_pd(fix1,tx);
809 fiy1 = _mm256_add_pd(fiy1,ty);
810 fiz1 = _mm256_add_pd(fiz1,tz);
812 fjx1 = _mm256_add_pd(fjx1,tx);
813 fjy1 = _mm256_add_pd(fjy1,ty);
814 fjz1 = _mm256_add_pd(fjz1,tz);
816 /**************************
817 * CALCULATE INTERACTIONS *
818 **************************/
820 r12 = _mm256_mul_pd(rsq12,rinv12);
821 r12 = _mm256_andnot_pd(dummy_mask,r12);
823 /* EWALD ELECTROSTATICS */
825 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
826 ewrt = _mm256_mul_pd(r12,ewtabscale);
827 ewitab = _mm256_cvttpd_epi32(ewrt);
828 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
829 ewitab = _mm_slli_epi32(ewitab,2);
830 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
831 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
832 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
833 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
834 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
835 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
836 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
837 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
838 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
840 /* Update potential sum for this i atom from the interaction with this j atom. */
841 velec = _mm256_andnot_pd(dummy_mask,velec);
842 velecsum = _mm256_add_pd(velecsum,velec);
846 fscal = _mm256_andnot_pd(dummy_mask,fscal);
848 /* Calculate temporary vectorial force */
849 tx = _mm256_mul_pd(fscal,dx12);
850 ty = _mm256_mul_pd(fscal,dy12);
851 tz = _mm256_mul_pd(fscal,dz12);
853 /* Update vectorial force */
854 fix1 = _mm256_add_pd(fix1,tx);
855 fiy1 = _mm256_add_pd(fiy1,ty);
856 fiz1 = _mm256_add_pd(fiz1,tz);
858 fjx2 = _mm256_add_pd(fjx2,tx);
859 fjy2 = _mm256_add_pd(fjy2,ty);
860 fjz2 = _mm256_add_pd(fjz2,tz);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 r13 = _mm256_mul_pd(rsq13,rinv13);
867 r13 = _mm256_andnot_pd(dummy_mask,r13);
869 /* EWALD ELECTROSTATICS */
871 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
872 ewrt = _mm256_mul_pd(r13,ewtabscale);
873 ewitab = _mm256_cvttpd_epi32(ewrt);
874 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
875 ewitab = _mm_slli_epi32(ewitab,2);
876 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
877 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
878 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
879 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
880 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
881 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
882 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
883 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
884 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
886 /* Update potential sum for this i atom from the interaction with this j atom. */
887 velec = _mm256_andnot_pd(dummy_mask,velec);
888 velecsum = _mm256_add_pd(velecsum,velec);
892 fscal = _mm256_andnot_pd(dummy_mask,fscal);
894 /* Calculate temporary vectorial force */
895 tx = _mm256_mul_pd(fscal,dx13);
896 ty = _mm256_mul_pd(fscal,dy13);
897 tz = _mm256_mul_pd(fscal,dz13);
899 /* Update vectorial force */
900 fix1 = _mm256_add_pd(fix1,tx);
901 fiy1 = _mm256_add_pd(fiy1,ty);
902 fiz1 = _mm256_add_pd(fiz1,tz);
904 fjx3 = _mm256_add_pd(fjx3,tx);
905 fjy3 = _mm256_add_pd(fjy3,ty);
906 fjz3 = _mm256_add_pd(fjz3,tz);
908 /**************************
909 * CALCULATE INTERACTIONS *
910 **************************/
912 r21 = _mm256_mul_pd(rsq21,rinv21);
913 r21 = _mm256_andnot_pd(dummy_mask,r21);
915 /* EWALD ELECTROSTATICS */
917 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
918 ewrt = _mm256_mul_pd(r21,ewtabscale);
919 ewitab = _mm256_cvttpd_epi32(ewrt);
920 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
921 ewitab = _mm_slli_epi32(ewitab,2);
922 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
923 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
924 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
925 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
926 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
927 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
928 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
929 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
930 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
932 /* Update potential sum for this i atom from the interaction with this j atom. */
933 velec = _mm256_andnot_pd(dummy_mask,velec);
934 velecsum = _mm256_add_pd(velecsum,velec);
938 fscal = _mm256_andnot_pd(dummy_mask,fscal);
940 /* Calculate temporary vectorial force */
941 tx = _mm256_mul_pd(fscal,dx21);
942 ty = _mm256_mul_pd(fscal,dy21);
943 tz = _mm256_mul_pd(fscal,dz21);
945 /* Update vectorial force */
946 fix2 = _mm256_add_pd(fix2,tx);
947 fiy2 = _mm256_add_pd(fiy2,ty);
948 fiz2 = _mm256_add_pd(fiz2,tz);
950 fjx1 = _mm256_add_pd(fjx1,tx);
951 fjy1 = _mm256_add_pd(fjy1,ty);
952 fjz1 = _mm256_add_pd(fjz1,tz);
954 /**************************
955 * CALCULATE INTERACTIONS *
956 **************************/
958 r22 = _mm256_mul_pd(rsq22,rinv22);
959 r22 = _mm256_andnot_pd(dummy_mask,r22);
961 /* EWALD ELECTROSTATICS */
963 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
964 ewrt = _mm256_mul_pd(r22,ewtabscale);
965 ewitab = _mm256_cvttpd_epi32(ewrt);
966 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
967 ewitab = _mm_slli_epi32(ewitab,2);
968 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
969 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
970 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
971 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
972 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
973 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
974 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
975 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
976 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
978 /* Update potential sum for this i atom from the interaction with this j atom. */
979 velec = _mm256_andnot_pd(dummy_mask,velec);
980 velecsum = _mm256_add_pd(velecsum,velec);
984 fscal = _mm256_andnot_pd(dummy_mask,fscal);
986 /* Calculate temporary vectorial force */
987 tx = _mm256_mul_pd(fscal,dx22);
988 ty = _mm256_mul_pd(fscal,dy22);
989 tz = _mm256_mul_pd(fscal,dz22);
991 /* Update vectorial force */
992 fix2 = _mm256_add_pd(fix2,tx);
993 fiy2 = _mm256_add_pd(fiy2,ty);
994 fiz2 = _mm256_add_pd(fiz2,tz);
996 fjx2 = _mm256_add_pd(fjx2,tx);
997 fjy2 = _mm256_add_pd(fjy2,ty);
998 fjz2 = _mm256_add_pd(fjz2,tz);
1000 /**************************
1001 * CALCULATE INTERACTIONS *
1002 **************************/
1004 r23 = _mm256_mul_pd(rsq23,rinv23);
1005 r23 = _mm256_andnot_pd(dummy_mask,r23);
1007 /* EWALD ELECTROSTATICS */
1009 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1010 ewrt = _mm256_mul_pd(r23,ewtabscale);
1011 ewitab = _mm256_cvttpd_epi32(ewrt);
1012 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1013 ewitab = _mm_slli_epi32(ewitab,2);
1014 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1015 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1016 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1017 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1018 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1019 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1020 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1021 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1022 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1024 /* Update potential sum for this i atom from the interaction with this j atom. */
1025 velec = _mm256_andnot_pd(dummy_mask,velec);
1026 velecsum = _mm256_add_pd(velecsum,velec);
1030 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1032 /* Calculate temporary vectorial force */
1033 tx = _mm256_mul_pd(fscal,dx23);
1034 ty = _mm256_mul_pd(fscal,dy23);
1035 tz = _mm256_mul_pd(fscal,dz23);
1037 /* Update vectorial force */
1038 fix2 = _mm256_add_pd(fix2,tx);
1039 fiy2 = _mm256_add_pd(fiy2,ty);
1040 fiz2 = _mm256_add_pd(fiz2,tz);
1042 fjx3 = _mm256_add_pd(fjx3,tx);
1043 fjy3 = _mm256_add_pd(fjy3,ty);
1044 fjz3 = _mm256_add_pd(fjz3,tz);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 r31 = _mm256_mul_pd(rsq31,rinv31);
1051 r31 = _mm256_andnot_pd(dummy_mask,r31);
1053 /* EWALD ELECTROSTATICS */
1055 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1056 ewrt = _mm256_mul_pd(r31,ewtabscale);
1057 ewitab = _mm256_cvttpd_epi32(ewrt);
1058 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1059 ewitab = _mm_slli_epi32(ewitab,2);
1060 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1061 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1062 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1063 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1064 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1065 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1066 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1067 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1068 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1070 /* Update potential sum for this i atom from the interaction with this j atom. */
1071 velec = _mm256_andnot_pd(dummy_mask,velec);
1072 velecsum = _mm256_add_pd(velecsum,velec);
1076 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1078 /* Calculate temporary vectorial force */
1079 tx = _mm256_mul_pd(fscal,dx31);
1080 ty = _mm256_mul_pd(fscal,dy31);
1081 tz = _mm256_mul_pd(fscal,dz31);
1083 /* Update vectorial force */
1084 fix3 = _mm256_add_pd(fix3,tx);
1085 fiy3 = _mm256_add_pd(fiy3,ty);
1086 fiz3 = _mm256_add_pd(fiz3,tz);
1088 fjx1 = _mm256_add_pd(fjx1,tx);
1089 fjy1 = _mm256_add_pd(fjy1,ty);
1090 fjz1 = _mm256_add_pd(fjz1,tz);
1092 /**************************
1093 * CALCULATE INTERACTIONS *
1094 **************************/
1096 r32 = _mm256_mul_pd(rsq32,rinv32);
1097 r32 = _mm256_andnot_pd(dummy_mask,r32);
1099 /* EWALD ELECTROSTATICS */
1101 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1102 ewrt = _mm256_mul_pd(r32,ewtabscale);
1103 ewitab = _mm256_cvttpd_epi32(ewrt);
1104 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1105 ewitab = _mm_slli_epi32(ewitab,2);
1106 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1107 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1108 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1109 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1110 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1111 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1112 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1113 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1114 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1116 /* Update potential sum for this i atom from the interaction with this j atom. */
1117 velec = _mm256_andnot_pd(dummy_mask,velec);
1118 velecsum = _mm256_add_pd(velecsum,velec);
1122 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1124 /* Calculate temporary vectorial force */
1125 tx = _mm256_mul_pd(fscal,dx32);
1126 ty = _mm256_mul_pd(fscal,dy32);
1127 tz = _mm256_mul_pd(fscal,dz32);
1129 /* Update vectorial force */
1130 fix3 = _mm256_add_pd(fix3,tx);
1131 fiy3 = _mm256_add_pd(fiy3,ty);
1132 fiz3 = _mm256_add_pd(fiz3,tz);
1134 fjx2 = _mm256_add_pd(fjx2,tx);
1135 fjy2 = _mm256_add_pd(fjy2,ty);
1136 fjz2 = _mm256_add_pd(fjz2,tz);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 r33 = _mm256_mul_pd(rsq33,rinv33);
1143 r33 = _mm256_andnot_pd(dummy_mask,r33);
1145 /* EWALD ELECTROSTATICS */
1147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1148 ewrt = _mm256_mul_pd(r33,ewtabscale);
1149 ewitab = _mm256_cvttpd_epi32(ewrt);
1150 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1151 ewitab = _mm_slli_epi32(ewitab,2);
1152 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1153 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1154 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1155 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1156 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1157 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1158 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1159 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1160 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1162 /* Update potential sum for this i atom from the interaction with this j atom. */
1163 velec = _mm256_andnot_pd(dummy_mask,velec);
1164 velecsum = _mm256_add_pd(velecsum,velec);
1168 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1170 /* Calculate temporary vectorial force */
1171 tx = _mm256_mul_pd(fscal,dx33);
1172 ty = _mm256_mul_pd(fscal,dy33);
1173 tz = _mm256_mul_pd(fscal,dz33);
1175 /* Update vectorial force */
1176 fix3 = _mm256_add_pd(fix3,tx);
1177 fiy3 = _mm256_add_pd(fiy3,ty);
1178 fiz3 = _mm256_add_pd(fiz3,tz);
1180 fjx3 = _mm256_add_pd(fjx3,tx);
1181 fjy3 = _mm256_add_pd(fjy3,ty);
1182 fjz3 = _mm256_add_pd(fjz3,tz);
1184 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1185 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1186 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1187 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1189 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1190 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1192 /* Inner loop uses 378 flops */
1195 /* End of innermost loop */
1197 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1198 f+i_coord_offset+DIM,fshift+i_shift_offset);
1201 /* Update potential energies */
1202 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1204 /* Increment number of inner iterations */
1205 inneriter += j_index_end - j_index_start;
1207 /* Outer loop uses 19 flops */
1210 /* Increment number of outer iterations */
1213 /* Update outer/inner flops */
1215 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*378);
1218 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_double
1219 * Electrostatics interaction: Ewald
1220 * VdW interaction: None
1221 * Geometry: Water4-Water4
1222 * Calculate force/pot: Force
1225 nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_double
1226 (t_nblist * gmx_restrict nlist,
1227 rvec * gmx_restrict xx,
1228 rvec * gmx_restrict ff,
1229 t_forcerec * gmx_restrict fr,
1230 t_mdatoms * gmx_restrict mdatoms,
1231 nb_kernel_data_t * gmx_restrict kernel_data,
1232 t_nrnb * gmx_restrict nrnb)
1234 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1235 * just 0 for non-waters.
1236 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1237 * jnr indices corresponding to data put in the four positions in the SIMD register.
1239 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1240 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1241 int jnrA,jnrB,jnrC,jnrD;
1242 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1243 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1244 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1245 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1246 real rcutoff_scalar;
1247 real *shiftvec,*fshift,*x,*f;
1248 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1249 real scratch[4*DIM];
1250 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1251 real * vdwioffsetptr1;
1252 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1253 real * vdwioffsetptr2;
1254 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1255 real * vdwioffsetptr3;
1256 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1257 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1258 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1259 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1260 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1261 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1262 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1263 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1264 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1265 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1266 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1267 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1268 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1269 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1270 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1271 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1272 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1275 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1276 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1278 __m256d dummy_mask,cutoff_mask;
1279 __m128 tmpmask0,tmpmask1;
1280 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1281 __m256d one = _mm256_set1_pd(1.0);
1282 __m256d two = _mm256_set1_pd(2.0);
1288 jindex = nlist->jindex;
1290 shiftidx = nlist->shift;
1292 shiftvec = fr->shift_vec[0];
1293 fshift = fr->fshift[0];
1294 facel = _mm256_set1_pd(fr->epsfac);
1295 charge = mdatoms->chargeA;
1297 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1298 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1299 beta2 = _mm256_mul_pd(beta,beta);
1300 beta3 = _mm256_mul_pd(beta,beta2);
1302 ewtab = fr->ic->tabq_coul_F;
1303 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1304 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1306 /* Setup water-specific parameters */
1307 inr = nlist->iinr[0];
1308 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1309 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1310 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1312 jq1 = _mm256_set1_pd(charge[inr+1]);
1313 jq2 = _mm256_set1_pd(charge[inr+2]);
1314 jq3 = _mm256_set1_pd(charge[inr+3]);
1315 qq11 = _mm256_mul_pd(iq1,jq1);
1316 qq12 = _mm256_mul_pd(iq1,jq2);
1317 qq13 = _mm256_mul_pd(iq1,jq3);
1318 qq21 = _mm256_mul_pd(iq2,jq1);
1319 qq22 = _mm256_mul_pd(iq2,jq2);
1320 qq23 = _mm256_mul_pd(iq2,jq3);
1321 qq31 = _mm256_mul_pd(iq3,jq1);
1322 qq32 = _mm256_mul_pd(iq3,jq2);
1323 qq33 = _mm256_mul_pd(iq3,jq3);
1325 /* Zero the j indices and coordinate offsets up front to silence compiler warnings */
1326 jnrA = jnrB = jnrC = jnrD = 0;
1327 j_coord_offsetA = 0;
1328 j_coord_offsetB = 0;
1329 j_coord_offsetC = 0;
1330 j_coord_offsetD = 0;
1335 for(iidx=0;iidx<4*DIM;iidx++)
1337 scratch[iidx] = 0.0;
1340 /* Start outer loop over neighborlists */
1341 for(iidx=0; iidx<nri; iidx++)
1343 /* Load shift vector for this list */
1344 i_shift_offset = DIM*shiftidx[iidx];
1346 /* Load limits for loop over neighbors */
1347 j_index_start = jindex[iidx];
1348 j_index_end = jindex[iidx+1];
1350 /* Get outer coordinate index */
1352 i_coord_offset = DIM*inr;
1354 /* Load i particle coords and add shift vector */
1355 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1356 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1358 fix1 = _mm256_setzero_pd();
1359 fiy1 = _mm256_setzero_pd();
1360 fiz1 = _mm256_setzero_pd();
1361 fix2 = _mm256_setzero_pd();
1362 fiy2 = _mm256_setzero_pd();
1363 fiz2 = _mm256_setzero_pd();
1364 fix3 = _mm256_setzero_pd();
1365 fiy3 = _mm256_setzero_pd();
1366 fiz3 = _mm256_setzero_pd();
1368 /* Start inner kernel loop */
1369 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1372 /* Get j neighbor index, and coordinate index */
1374 jnrB = jjnr[jidx+1];
1375 jnrC = jjnr[jidx+2];
1376 jnrD = jjnr[jidx+3];
1377 j_coord_offsetA = DIM*jnrA;
1378 j_coord_offsetB = DIM*jnrB;
1379 j_coord_offsetC = DIM*jnrC;
1380 j_coord_offsetD = DIM*jnrD;
1382 /* load j atom coordinates */
1383 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1384 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1385 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1387 /* Calculate displacement vector */
1388 dx11 = _mm256_sub_pd(ix1,jx1);
1389 dy11 = _mm256_sub_pd(iy1,jy1);
1390 dz11 = _mm256_sub_pd(iz1,jz1);
1391 dx12 = _mm256_sub_pd(ix1,jx2);
1392 dy12 = _mm256_sub_pd(iy1,jy2);
1393 dz12 = _mm256_sub_pd(iz1,jz2);
1394 dx13 = _mm256_sub_pd(ix1,jx3);
1395 dy13 = _mm256_sub_pd(iy1,jy3);
1396 dz13 = _mm256_sub_pd(iz1,jz3);
1397 dx21 = _mm256_sub_pd(ix2,jx1);
1398 dy21 = _mm256_sub_pd(iy2,jy1);
1399 dz21 = _mm256_sub_pd(iz2,jz1);
1400 dx22 = _mm256_sub_pd(ix2,jx2);
1401 dy22 = _mm256_sub_pd(iy2,jy2);
1402 dz22 = _mm256_sub_pd(iz2,jz2);
1403 dx23 = _mm256_sub_pd(ix2,jx3);
1404 dy23 = _mm256_sub_pd(iy2,jy3);
1405 dz23 = _mm256_sub_pd(iz2,jz3);
1406 dx31 = _mm256_sub_pd(ix3,jx1);
1407 dy31 = _mm256_sub_pd(iy3,jy1);
1408 dz31 = _mm256_sub_pd(iz3,jz1);
1409 dx32 = _mm256_sub_pd(ix3,jx2);
1410 dy32 = _mm256_sub_pd(iy3,jy2);
1411 dz32 = _mm256_sub_pd(iz3,jz2);
1412 dx33 = _mm256_sub_pd(ix3,jx3);
1413 dy33 = _mm256_sub_pd(iy3,jy3);
1414 dz33 = _mm256_sub_pd(iz3,jz3);
1416 /* Calculate squared distance and things based on it */
1417 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1418 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1419 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1420 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1421 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1422 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1423 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1424 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1425 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1427 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1428 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1429 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1430 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1431 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1432 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1433 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1434 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1435 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1437 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1438 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1439 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1440 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1441 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1442 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1443 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1444 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1445 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1447 fjx1 = _mm256_setzero_pd();
1448 fjy1 = _mm256_setzero_pd();
1449 fjz1 = _mm256_setzero_pd();
1450 fjx2 = _mm256_setzero_pd();
1451 fjy2 = _mm256_setzero_pd();
1452 fjz2 = _mm256_setzero_pd();
1453 fjx3 = _mm256_setzero_pd();
1454 fjy3 = _mm256_setzero_pd();
1455 fjz3 = _mm256_setzero_pd();
1457 /**************************
1458 * CALCULATE INTERACTIONS *
1459 **************************/
1461 r11 = _mm256_mul_pd(rsq11,rinv11);
1463 /* EWALD ELECTROSTATICS */
1465 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1466 ewrt = _mm256_mul_pd(r11,ewtabscale);
1467 ewitab = _mm256_cvttpd_epi32(ewrt);
1468 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1469 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1470 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1472 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1473 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1477 /* Calculate temporary vectorial force */
1478 tx = _mm256_mul_pd(fscal,dx11);
1479 ty = _mm256_mul_pd(fscal,dy11);
1480 tz = _mm256_mul_pd(fscal,dz11);
1482 /* Update vectorial force */
1483 fix1 = _mm256_add_pd(fix1,tx);
1484 fiy1 = _mm256_add_pd(fiy1,ty);
1485 fiz1 = _mm256_add_pd(fiz1,tz);
1487 fjx1 = _mm256_add_pd(fjx1,tx);
1488 fjy1 = _mm256_add_pd(fjy1,ty);
1489 fjz1 = _mm256_add_pd(fjz1,tz);
1491 /**************************
1492 * CALCULATE INTERACTIONS *
1493 **************************/
1495 r12 = _mm256_mul_pd(rsq12,rinv12);
1497 /* EWALD ELECTROSTATICS */
1499 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1500 ewrt = _mm256_mul_pd(r12,ewtabscale);
1501 ewitab = _mm256_cvttpd_epi32(ewrt);
1502 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1503 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1504 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1506 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1507 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1511 /* Calculate temporary vectorial force */
1512 tx = _mm256_mul_pd(fscal,dx12);
1513 ty = _mm256_mul_pd(fscal,dy12);
1514 tz = _mm256_mul_pd(fscal,dz12);
1516 /* Update vectorial force */
1517 fix1 = _mm256_add_pd(fix1,tx);
1518 fiy1 = _mm256_add_pd(fiy1,ty);
1519 fiz1 = _mm256_add_pd(fiz1,tz);
1521 fjx2 = _mm256_add_pd(fjx2,tx);
1522 fjy2 = _mm256_add_pd(fjy2,ty);
1523 fjz2 = _mm256_add_pd(fjz2,tz);
1525 /**************************
1526 * CALCULATE INTERACTIONS *
1527 **************************/
1529 r13 = _mm256_mul_pd(rsq13,rinv13);
1531 /* EWALD ELECTROSTATICS */
1533 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1534 ewrt = _mm256_mul_pd(r13,ewtabscale);
1535 ewitab = _mm256_cvttpd_epi32(ewrt);
1536 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1537 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1538 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1540 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1541 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1545 /* Calculate temporary vectorial force */
1546 tx = _mm256_mul_pd(fscal,dx13);
1547 ty = _mm256_mul_pd(fscal,dy13);
1548 tz = _mm256_mul_pd(fscal,dz13);
1550 /* Update vectorial force */
1551 fix1 = _mm256_add_pd(fix1,tx);
1552 fiy1 = _mm256_add_pd(fiy1,ty);
1553 fiz1 = _mm256_add_pd(fiz1,tz);
1555 fjx3 = _mm256_add_pd(fjx3,tx);
1556 fjy3 = _mm256_add_pd(fjy3,ty);
1557 fjz3 = _mm256_add_pd(fjz3,tz);
1559 /**************************
1560 * CALCULATE INTERACTIONS *
1561 **************************/
1563 r21 = _mm256_mul_pd(rsq21,rinv21);
1565 /* EWALD ELECTROSTATICS */
1567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1568 ewrt = _mm256_mul_pd(r21,ewtabscale);
1569 ewitab = _mm256_cvttpd_epi32(ewrt);
1570 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1571 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1572 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1574 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1575 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1579 /* Calculate temporary vectorial force */
1580 tx = _mm256_mul_pd(fscal,dx21);
1581 ty = _mm256_mul_pd(fscal,dy21);
1582 tz = _mm256_mul_pd(fscal,dz21);
1584 /* Update vectorial force */
1585 fix2 = _mm256_add_pd(fix2,tx);
1586 fiy2 = _mm256_add_pd(fiy2,ty);
1587 fiz2 = _mm256_add_pd(fiz2,tz);
1589 fjx1 = _mm256_add_pd(fjx1,tx);
1590 fjy1 = _mm256_add_pd(fjy1,ty);
1591 fjz1 = _mm256_add_pd(fjz1,tz);
1593 /**************************
1594 * CALCULATE INTERACTIONS *
1595 **************************/
1597 r22 = _mm256_mul_pd(rsq22,rinv22);
1599 /* EWALD ELECTROSTATICS */
1601 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1602 ewrt = _mm256_mul_pd(r22,ewtabscale);
1603 ewitab = _mm256_cvttpd_epi32(ewrt);
1604 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1605 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1606 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1608 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1609 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1613 /* Calculate temporary vectorial force */
1614 tx = _mm256_mul_pd(fscal,dx22);
1615 ty = _mm256_mul_pd(fscal,dy22);
1616 tz = _mm256_mul_pd(fscal,dz22);
1618 /* Update vectorial force */
1619 fix2 = _mm256_add_pd(fix2,tx);
1620 fiy2 = _mm256_add_pd(fiy2,ty);
1621 fiz2 = _mm256_add_pd(fiz2,tz);
1623 fjx2 = _mm256_add_pd(fjx2,tx);
1624 fjy2 = _mm256_add_pd(fjy2,ty);
1625 fjz2 = _mm256_add_pd(fjz2,tz);
1627 /**************************
1628 * CALCULATE INTERACTIONS *
1629 **************************/
1631 r23 = _mm256_mul_pd(rsq23,rinv23);
1633 /* EWALD ELECTROSTATICS */
1635 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1636 ewrt = _mm256_mul_pd(r23,ewtabscale);
1637 ewitab = _mm256_cvttpd_epi32(ewrt);
1638 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1639 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1640 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1642 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1643 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1647 /* Calculate temporary vectorial force */
1648 tx = _mm256_mul_pd(fscal,dx23);
1649 ty = _mm256_mul_pd(fscal,dy23);
1650 tz = _mm256_mul_pd(fscal,dz23);
1652 /* Update vectorial force */
1653 fix2 = _mm256_add_pd(fix2,tx);
1654 fiy2 = _mm256_add_pd(fiy2,ty);
1655 fiz2 = _mm256_add_pd(fiz2,tz);
1657 fjx3 = _mm256_add_pd(fjx3,tx);
1658 fjy3 = _mm256_add_pd(fjy3,ty);
1659 fjz3 = _mm256_add_pd(fjz3,tz);
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 r31 = _mm256_mul_pd(rsq31,rinv31);
1667 /* EWALD ELECTROSTATICS */
1669 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1670 ewrt = _mm256_mul_pd(r31,ewtabscale);
1671 ewitab = _mm256_cvttpd_epi32(ewrt);
1672 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1673 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1674 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1676 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1677 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1681 /* Calculate temporary vectorial force */
1682 tx = _mm256_mul_pd(fscal,dx31);
1683 ty = _mm256_mul_pd(fscal,dy31);
1684 tz = _mm256_mul_pd(fscal,dz31);
1686 /* Update vectorial force */
1687 fix3 = _mm256_add_pd(fix3,tx);
1688 fiy3 = _mm256_add_pd(fiy3,ty);
1689 fiz3 = _mm256_add_pd(fiz3,tz);
1691 fjx1 = _mm256_add_pd(fjx1,tx);
1692 fjy1 = _mm256_add_pd(fjy1,ty);
1693 fjz1 = _mm256_add_pd(fjz1,tz);
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 r32 = _mm256_mul_pd(rsq32,rinv32);
1701 /* EWALD ELECTROSTATICS */
1703 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1704 ewrt = _mm256_mul_pd(r32,ewtabscale);
1705 ewitab = _mm256_cvttpd_epi32(ewrt);
1706 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1707 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1708 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1710 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1711 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1715 /* Calculate temporary vectorial force */
1716 tx = _mm256_mul_pd(fscal,dx32);
1717 ty = _mm256_mul_pd(fscal,dy32);
1718 tz = _mm256_mul_pd(fscal,dz32);
1720 /* Update vectorial force */
1721 fix3 = _mm256_add_pd(fix3,tx);
1722 fiy3 = _mm256_add_pd(fiy3,ty);
1723 fiz3 = _mm256_add_pd(fiz3,tz);
1725 fjx2 = _mm256_add_pd(fjx2,tx);
1726 fjy2 = _mm256_add_pd(fjy2,ty);
1727 fjz2 = _mm256_add_pd(fjz2,tz);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 r33 = _mm256_mul_pd(rsq33,rinv33);
1735 /* EWALD ELECTROSTATICS */
1737 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1738 ewrt = _mm256_mul_pd(r33,ewtabscale);
1739 ewitab = _mm256_cvttpd_epi32(ewrt);
1740 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1741 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1742 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1744 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1745 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1749 /* Calculate temporary vectorial force */
1750 tx = _mm256_mul_pd(fscal,dx33);
1751 ty = _mm256_mul_pd(fscal,dy33);
1752 tz = _mm256_mul_pd(fscal,dz33);
1754 /* Update vectorial force */
1755 fix3 = _mm256_add_pd(fix3,tx);
1756 fiy3 = _mm256_add_pd(fiy3,ty);
1757 fiz3 = _mm256_add_pd(fiz3,tz);
1759 fjx3 = _mm256_add_pd(fjx3,tx);
1760 fjy3 = _mm256_add_pd(fjy3,ty);
1761 fjz3 = _mm256_add_pd(fjz3,tz);
1763 fjptrA = f+j_coord_offsetA;
1764 fjptrB = f+j_coord_offsetB;
1765 fjptrC = f+j_coord_offsetC;
1766 fjptrD = f+j_coord_offsetD;
1768 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1769 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1771 /* Inner loop uses 324 flops */
1774 if(jidx<j_index_end)
1777 /* Get j neighbor index, and coordinate index */
1778 jnrlistA = jjnr[jidx];
1779 jnrlistB = jjnr[jidx+1];
1780 jnrlistC = jjnr[jidx+2];
1781 jnrlistD = jjnr[jidx+3];
1782 /* Sign of each element will be negative for non-real atoms.
1783 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1784 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1786 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1788 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1789 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1790 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1792 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1793 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1794 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1795 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1796 j_coord_offsetA = DIM*jnrA;
1797 j_coord_offsetB = DIM*jnrB;
1798 j_coord_offsetC = DIM*jnrC;
1799 j_coord_offsetD = DIM*jnrD;
1801 /* load j atom coordinates */
1802 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1803 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1804 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1806 /* Calculate displacement vector */
1807 dx11 = _mm256_sub_pd(ix1,jx1);
1808 dy11 = _mm256_sub_pd(iy1,jy1);
1809 dz11 = _mm256_sub_pd(iz1,jz1);
1810 dx12 = _mm256_sub_pd(ix1,jx2);
1811 dy12 = _mm256_sub_pd(iy1,jy2);
1812 dz12 = _mm256_sub_pd(iz1,jz2);
1813 dx13 = _mm256_sub_pd(ix1,jx3);
1814 dy13 = _mm256_sub_pd(iy1,jy3);
1815 dz13 = _mm256_sub_pd(iz1,jz3);
1816 dx21 = _mm256_sub_pd(ix2,jx1);
1817 dy21 = _mm256_sub_pd(iy2,jy1);
1818 dz21 = _mm256_sub_pd(iz2,jz1);
1819 dx22 = _mm256_sub_pd(ix2,jx2);
1820 dy22 = _mm256_sub_pd(iy2,jy2);
1821 dz22 = _mm256_sub_pd(iz2,jz2);
1822 dx23 = _mm256_sub_pd(ix2,jx3);
1823 dy23 = _mm256_sub_pd(iy2,jy3);
1824 dz23 = _mm256_sub_pd(iz2,jz3);
1825 dx31 = _mm256_sub_pd(ix3,jx1);
1826 dy31 = _mm256_sub_pd(iy3,jy1);
1827 dz31 = _mm256_sub_pd(iz3,jz1);
1828 dx32 = _mm256_sub_pd(ix3,jx2);
1829 dy32 = _mm256_sub_pd(iy3,jy2);
1830 dz32 = _mm256_sub_pd(iz3,jz2);
1831 dx33 = _mm256_sub_pd(ix3,jx3);
1832 dy33 = _mm256_sub_pd(iy3,jy3);
1833 dz33 = _mm256_sub_pd(iz3,jz3);
1835 /* Calculate squared distance and things based on it */
1836 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1837 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1838 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1839 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1840 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1841 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1842 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1843 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1844 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1846 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1847 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1848 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1849 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1850 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1851 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1852 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1853 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1854 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1856 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1857 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1858 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1859 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1860 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1861 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1862 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1863 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1864 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1866 fjx1 = _mm256_setzero_pd();
1867 fjy1 = _mm256_setzero_pd();
1868 fjz1 = _mm256_setzero_pd();
1869 fjx2 = _mm256_setzero_pd();
1870 fjy2 = _mm256_setzero_pd();
1871 fjz2 = _mm256_setzero_pd();
1872 fjx3 = _mm256_setzero_pd();
1873 fjy3 = _mm256_setzero_pd();
1874 fjz3 = _mm256_setzero_pd();
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 r11 = _mm256_mul_pd(rsq11,rinv11);
1881 r11 = _mm256_andnot_pd(dummy_mask,r11);
1883 /* EWALD ELECTROSTATICS */
1885 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1886 ewrt = _mm256_mul_pd(r11,ewtabscale);
1887 ewitab = _mm256_cvttpd_epi32(ewrt);
1888 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1889 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1890 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1892 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1893 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1897 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1899 /* Calculate temporary vectorial force */
1900 tx = _mm256_mul_pd(fscal,dx11);
1901 ty = _mm256_mul_pd(fscal,dy11);
1902 tz = _mm256_mul_pd(fscal,dz11);
1904 /* Update vectorial force */
1905 fix1 = _mm256_add_pd(fix1,tx);
1906 fiy1 = _mm256_add_pd(fiy1,ty);
1907 fiz1 = _mm256_add_pd(fiz1,tz);
1909 fjx1 = _mm256_add_pd(fjx1,tx);
1910 fjy1 = _mm256_add_pd(fjy1,ty);
1911 fjz1 = _mm256_add_pd(fjz1,tz);
1913 /**************************
1914 * CALCULATE INTERACTIONS *
1915 **************************/
1917 r12 = _mm256_mul_pd(rsq12,rinv12);
1918 r12 = _mm256_andnot_pd(dummy_mask,r12);
1920 /* EWALD ELECTROSTATICS */
1922 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1923 ewrt = _mm256_mul_pd(r12,ewtabscale);
1924 ewitab = _mm256_cvttpd_epi32(ewrt);
1925 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1926 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1927 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1929 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1930 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1934 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1936 /* Calculate temporary vectorial force */
1937 tx = _mm256_mul_pd(fscal,dx12);
1938 ty = _mm256_mul_pd(fscal,dy12);
1939 tz = _mm256_mul_pd(fscal,dz12);
1941 /* Update vectorial force */
1942 fix1 = _mm256_add_pd(fix1,tx);
1943 fiy1 = _mm256_add_pd(fiy1,ty);
1944 fiz1 = _mm256_add_pd(fiz1,tz);
1946 fjx2 = _mm256_add_pd(fjx2,tx);
1947 fjy2 = _mm256_add_pd(fjy2,ty);
1948 fjz2 = _mm256_add_pd(fjz2,tz);
1950 /**************************
1951 * CALCULATE INTERACTIONS *
1952 **************************/
1954 r13 = _mm256_mul_pd(rsq13,rinv13);
1955 r13 = _mm256_andnot_pd(dummy_mask,r13);
1957 /* EWALD ELECTROSTATICS */
1959 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1960 ewrt = _mm256_mul_pd(r13,ewtabscale);
1961 ewitab = _mm256_cvttpd_epi32(ewrt);
1962 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1963 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1964 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1966 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1967 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1971 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1973 /* Calculate temporary vectorial force */
1974 tx = _mm256_mul_pd(fscal,dx13);
1975 ty = _mm256_mul_pd(fscal,dy13);
1976 tz = _mm256_mul_pd(fscal,dz13);
1978 /* Update vectorial force */
1979 fix1 = _mm256_add_pd(fix1,tx);
1980 fiy1 = _mm256_add_pd(fiy1,ty);
1981 fiz1 = _mm256_add_pd(fiz1,tz);
1983 fjx3 = _mm256_add_pd(fjx3,tx);
1984 fjy3 = _mm256_add_pd(fjy3,ty);
1985 fjz3 = _mm256_add_pd(fjz3,tz);
1987 /**************************
1988 * CALCULATE INTERACTIONS *
1989 **************************/
1991 r21 = _mm256_mul_pd(rsq21,rinv21);
1992 r21 = _mm256_andnot_pd(dummy_mask,r21);
1994 /* EWALD ELECTROSTATICS */
1996 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1997 ewrt = _mm256_mul_pd(r21,ewtabscale);
1998 ewitab = _mm256_cvttpd_epi32(ewrt);
1999 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2000 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2001 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2003 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2004 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2008 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2010 /* Calculate temporary vectorial force */
2011 tx = _mm256_mul_pd(fscal,dx21);
2012 ty = _mm256_mul_pd(fscal,dy21);
2013 tz = _mm256_mul_pd(fscal,dz21);
2015 /* Update vectorial force */
2016 fix2 = _mm256_add_pd(fix2,tx);
2017 fiy2 = _mm256_add_pd(fiy2,ty);
2018 fiz2 = _mm256_add_pd(fiz2,tz);
2020 fjx1 = _mm256_add_pd(fjx1,tx);
2021 fjy1 = _mm256_add_pd(fjy1,ty);
2022 fjz1 = _mm256_add_pd(fjz1,tz);
2024 /**************************
2025 * CALCULATE INTERACTIONS *
2026 **************************/
2028 r22 = _mm256_mul_pd(rsq22,rinv22);
2029 r22 = _mm256_andnot_pd(dummy_mask,r22);
2031 /* EWALD ELECTROSTATICS */
2033 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2034 ewrt = _mm256_mul_pd(r22,ewtabscale);
2035 ewitab = _mm256_cvttpd_epi32(ewrt);
2036 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2037 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2038 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2040 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2041 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2045 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2047 /* Calculate temporary vectorial force */
2048 tx = _mm256_mul_pd(fscal,dx22);
2049 ty = _mm256_mul_pd(fscal,dy22);
2050 tz = _mm256_mul_pd(fscal,dz22);
2052 /* Update vectorial force */
2053 fix2 = _mm256_add_pd(fix2,tx);
2054 fiy2 = _mm256_add_pd(fiy2,ty);
2055 fiz2 = _mm256_add_pd(fiz2,tz);
2057 fjx2 = _mm256_add_pd(fjx2,tx);
2058 fjy2 = _mm256_add_pd(fjy2,ty);
2059 fjz2 = _mm256_add_pd(fjz2,tz);
2061 /**************************
2062 * CALCULATE INTERACTIONS *
2063 **************************/
2065 r23 = _mm256_mul_pd(rsq23,rinv23);
2066 r23 = _mm256_andnot_pd(dummy_mask,r23);
2068 /* EWALD ELECTROSTATICS */
2070 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2071 ewrt = _mm256_mul_pd(r23,ewtabscale);
2072 ewitab = _mm256_cvttpd_epi32(ewrt);
2073 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2074 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2075 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2077 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2078 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2082 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2084 /* Calculate temporary vectorial force */
2085 tx = _mm256_mul_pd(fscal,dx23);
2086 ty = _mm256_mul_pd(fscal,dy23);
2087 tz = _mm256_mul_pd(fscal,dz23);
2089 /* Update vectorial force */
2090 fix2 = _mm256_add_pd(fix2,tx);
2091 fiy2 = _mm256_add_pd(fiy2,ty);
2092 fiz2 = _mm256_add_pd(fiz2,tz);
2094 fjx3 = _mm256_add_pd(fjx3,tx);
2095 fjy3 = _mm256_add_pd(fjy3,ty);
2096 fjz3 = _mm256_add_pd(fjz3,tz);
2098 /**************************
2099 * CALCULATE INTERACTIONS *
2100 **************************/
2102 r31 = _mm256_mul_pd(rsq31,rinv31);
2103 r31 = _mm256_andnot_pd(dummy_mask,r31);
2105 /* EWALD ELECTROSTATICS */
2107 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2108 ewrt = _mm256_mul_pd(r31,ewtabscale);
2109 ewitab = _mm256_cvttpd_epi32(ewrt);
2110 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2111 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2112 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2114 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2115 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2119 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2121 /* Calculate temporary vectorial force */
2122 tx = _mm256_mul_pd(fscal,dx31);
2123 ty = _mm256_mul_pd(fscal,dy31);
2124 tz = _mm256_mul_pd(fscal,dz31);
2126 /* Update vectorial force */
2127 fix3 = _mm256_add_pd(fix3,tx);
2128 fiy3 = _mm256_add_pd(fiy3,ty);
2129 fiz3 = _mm256_add_pd(fiz3,tz);
2131 fjx1 = _mm256_add_pd(fjx1,tx);
2132 fjy1 = _mm256_add_pd(fjy1,ty);
2133 fjz1 = _mm256_add_pd(fjz1,tz);
2135 /**************************
2136 * CALCULATE INTERACTIONS *
2137 **************************/
2139 r32 = _mm256_mul_pd(rsq32,rinv32);
2140 r32 = _mm256_andnot_pd(dummy_mask,r32);
2142 /* EWALD ELECTROSTATICS */
2144 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2145 ewrt = _mm256_mul_pd(r32,ewtabscale);
2146 ewitab = _mm256_cvttpd_epi32(ewrt);
2147 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2148 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2149 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2151 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2152 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2156 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2158 /* Calculate temporary vectorial force */
2159 tx = _mm256_mul_pd(fscal,dx32);
2160 ty = _mm256_mul_pd(fscal,dy32);
2161 tz = _mm256_mul_pd(fscal,dz32);
2163 /* Update vectorial force */
2164 fix3 = _mm256_add_pd(fix3,tx);
2165 fiy3 = _mm256_add_pd(fiy3,ty);
2166 fiz3 = _mm256_add_pd(fiz3,tz);
2168 fjx2 = _mm256_add_pd(fjx2,tx);
2169 fjy2 = _mm256_add_pd(fjy2,ty);
2170 fjz2 = _mm256_add_pd(fjz2,tz);
2172 /**************************
2173 * CALCULATE INTERACTIONS *
2174 **************************/
2176 r33 = _mm256_mul_pd(rsq33,rinv33);
2177 r33 = _mm256_andnot_pd(dummy_mask,r33);
2179 /* EWALD ELECTROSTATICS */
2181 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2182 ewrt = _mm256_mul_pd(r33,ewtabscale);
2183 ewitab = _mm256_cvttpd_epi32(ewrt);
2184 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2185 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2186 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2188 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2189 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2193 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2195 /* Calculate temporary vectorial force */
2196 tx = _mm256_mul_pd(fscal,dx33);
2197 ty = _mm256_mul_pd(fscal,dy33);
2198 tz = _mm256_mul_pd(fscal,dz33);
2200 /* Update vectorial force */
2201 fix3 = _mm256_add_pd(fix3,tx);
2202 fiy3 = _mm256_add_pd(fiy3,ty);
2203 fiz3 = _mm256_add_pd(fiz3,tz);
2205 fjx3 = _mm256_add_pd(fjx3,tx);
2206 fjy3 = _mm256_add_pd(fjy3,ty);
2207 fjz3 = _mm256_add_pd(fjz3,tz);
2209 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2210 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2211 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2212 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2214 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2215 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2217 /* Inner loop uses 333 flops */
2220 /* End of innermost loop */
2222 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2223 f+i_coord_offset+DIM,fshift+i_shift_offset);
2225 /* Increment number of inner iterations */
2226 inneriter += j_index_end - j_index_start;
2228 /* Outer loop uses 18 flops */
2231 /* Increment number of outer iterations */
2234 /* Update outer/inner flops */
2236 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*333);