2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr1;
84 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 real * vdwioffsetptr2;
86 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 real * vdwioffsetptr3;
88 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
90 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
92 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
94 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
98 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
101 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
102 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
103 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
104 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
107 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
110 __m256d dummy_mask,cutoff_mask;
111 __m128 tmpmask0,tmpmask1;
112 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
113 __m256d one = _mm256_set1_pd(1.0);
114 __m256d two = _mm256_set1_pd(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm256_set1_pd(fr->ic->epsfac);
127 charge = mdatoms->chargeA;
129 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
130 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
131 beta2 = _mm256_mul_pd(beta,beta);
132 beta3 = _mm256_mul_pd(beta,beta2);
134 ewtab = fr->ic->tabq_coul_FDV0;
135 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
136 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
138 /* Setup water-specific parameters */
139 inr = nlist->iinr[0];
140 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
141 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
142 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
144 jq1 = _mm256_set1_pd(charge[inr+1]);
145 jq2 = _mm256_set1_pd(charge[inr+2]);
146 jq3 = _mm256_set1_pd(charge[inr+3]);
147 qq11 = _mm256_mul_pd(iq1,jq1);
148 qq12 = _mm256_mul_pd(iq1,jq2);
149 qq13 = _mm256_mul_pd(iq1,jq3);
150 qq21 = _mm256_mul_pd(iq2,jq1);
151 qq22 = _mm256_mul_pd(iq2,jq2);
152 qq23 = _mm256_mul_pd(iq2,jq3);
153 qq31 = _mm256_mul_pd(iq3,jq1);
154 qq32 = _mm256_mul_pd(iq3,jq2);
155 qq33 = _mm256_mul_pd(iq3,jq3);
157 /* Avoid stupid compiler warnings */
158 jnrA = jnrB = jnrC = jnrD = 0;
167 for(iidx=0;iidx<4*DIM;iidx++)
172 /* Start outer loop over neighborlists */
173 for(iidx=0; iidx<nri; iidx++)
175 /* Load shift vector for this list */
176 i_shift_offset = DIM*shiftidx[iidx];
178 /* Load limits for loop over neighbors */
179 j_index_start = jindex[iidx];
180 j_index_end = jindex[iidx+1];
182 /* Get outer coordinate index */
184 i_coord_offset = DIM*inr;
186 /* Load i particle coords and add shift vector */
187 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
188 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
190 fix1 = _mm256_setzero_pd();
191 fiy1 = _mm256_setzero_pd();
192 fiz1 = _mm256_setzero_pd();
193 fix2 = _mm256_setzero_pd();
194 fiy2 = _mm256_setzero_pd();
195 fiz2 = _mm256_setzero_pd();
196 fix3 = _mm256_setzero_pd();
197 fiy3 = _mm256_setzero_pd();
198 fiz3 = _mm256_setzero_pd();
200 /* Reset potential sums */
201 velecsum = _mm256_setzero_pd();
203 /* Start inner kernel loop */
204 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
207 /* Get j neighbor index, and coordinate index */
212 j_coord_offsetA = DIM*jnrA;
213 j_coord_offsetB = DIM*jnrB;
214 j_coord_offsetC = DIM*jnrC;
215 j_coord_offsetD = DIM*jnrD;
217 /* load j atom coordinates */
218 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
219 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
220 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
222 /* Calculate displacement vector */
223 dx11 = _mm256_sub_pd(ix1,jx1);
224 dy11 = _mm256_sub_pd(iy1,jy1);
225 dz11 = _mm256_sub_pd(iz1,jz1);
226 dx12 = _mm256_sub_pd(ix1,jx2);
227 dy12 = _mm256_sub_pd(iy1,jy2);
228 dz12 = _mm256_sub_pd(iz1,jz2);
229 dx13 = _mm256_sub_pd(ix1,jx3);
230 dy13 = _mm256_sub_pd(iy1,jy3);
231 dz13 = _mm256_sub_pd(iz1,jz3);
232 dx21 = _mm256_sub_pd(ix2,jx1);
233 dy21 = _mm256_sub_pd(iy2,jy1);
234 dz21 = _mm256_sub_pd(iz2,jz1);
235 dx22 = _mm256_sub_pd(ix2,jx2);
236 dy22 = _mm256_sub_pd(iy2,jy2);
237 dz22 = _mm256_sub_pd(iz2,jz2);
238 dx23 = _mm256_sub_pd(ix2,jx3);
239 dy23 = _mm256_sub_pd(iy2,jy3);
240 dz23 = _mm256_sub_pd(iz2,jz3);
241 dx31 = _mm256_sub_pd(ix3,jx1);
242 dy31 = _mm256_sub_pd(iy3,jy1);
243 dz31 = _mm256_sub_pd(iz3,jz1);
244 dx32 = _mm256_sub_pd(ix3,jx2);
245 dy32 = _mm256_sub_pd(iy3,jy2);
246 dz32 = _mm256_sub_pd(iz3,jz2);
247 dx33 = _mm256_sub_pd(ix3,jx3);
248 dy33 = _mm256_sub_pd(iy3,jy3);
249 dz33 = _mm256_sub_pd(iz3,jz3);
251 /* Calculate squared distance and things based on it */
252 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
253 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
254 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
255 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
256 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
257 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
258 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
259 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
260 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
262 rinv11 = avx256_invsqrt_d(rsq11);
263 rinv12 = avx256_invsqrt_d(rsq12);
264 rinv13 = avx256_invsqrt_d(rsq13);
265 rinv21 = avx256_invsqrt_d(rsq21);
266 rinv22 = avx256_invsqrt_d(rsq22);
267 rinv23 = avx256_invsqrt_d(rsq23);
268 rinv31 = avx256_invsqrt_d(rsq31);
269 rinv32 = avx256_invsqrt_d(rsq32);
270 rinv33 = avx256_invsqrt_d(rsq33);
272 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
273 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
274 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
275 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
276 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
277 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
278 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
279 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
280 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
282 fjx1 = _mm256_setzero_pd();
283 fjy1 = _mm256_setzero_pd();
284 fjz1 = _mm256_setzero_pd();
285 fjx2 = _mm256_setzero_pd();
286 fjy2 = _mm256_setzero_pd();
287 fjz2 = _mm256_setzero_pd();
288 fjx3 = _mm256_setzero_pd();
289 fjy3 = _mm256_setzero_pd();
290 fjz3 = _mm256_setzero_pd();
292 /**************************
293 * CALCULATE INTERACTIONS *
294 **************************/
296 r11 = _mm256_mul_pd(rsq11,rinv11);
298 /* EWALD ELECTROSTATICS */
300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
301 ewrt = _mm256_mul_pd(r11,ewtabscale);
302 ewitab = _mm256_cvttpd_epi32(ewrt);
303 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
304 ewitab = _mm_slli_epi32(ewitab,2);
305 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
306 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
307 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
308 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
309 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
310 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
311 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
312 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
313 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
315 /* Update potential sum for this i atom from the interaction with this j atom. */
316 velecsum = _mm256_add_pd(velecsum,velec);
320 /* Calculate temporary vectorial force */
321 tx = _mm256_mul_pd(fscal,dx11);
322 ty = _mm256_mul_pd(fscal,dy11);
323 tz = _mm256_mul_pd(fscal,dz11);
325 /* Update vectorial force */
326 fix1 = _mm256_add_pd(fix1,tx);
327 fiy1 = _mm256_add_pd(fiy1,ty);
328 fiz1 = _mm256_add_pd(fiz1,tz);
330 fjx1 = _mm256_add_pd(fjx1,tx);
331 fjy1 = _mm256_add_pd(fjy1,ty);
332 fjz1 = _mm256_add_pd(fjz1,tz);
334 /**************************
335 * CALCULATE INTERACTIONS *
336 **************************/
338 r12 = _mm256_mul_pd(rsq12,rinv12);
340 /* EWALD ELECTROSTATICS */
342 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
343 ewrt = _mm256_mul_pd(r12,ewtabscale);
344 ewitab = _mm256_cvttpd_epi32(ewrt);
345 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
346 ewitab = _mm_slli_epi32(ewitab,2);
347 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
348 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
349 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
350 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
351 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
352 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
353 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
354 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
355 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velecsum = _mm256_add_pd(velecsum,velec);
362 /* Calculate temporary vectorial force */
363 tx = _mm256_mul_pd(fscal,dx12);
364 ty = _mm256_mul_pd(fscal,dy12);
365 tz = _mm256_mul_pd(fscal,dz12);
367 /* Update vectorial force */
368 fix1 = _mm256_add_pd(fix1,tx);
369 fiy1 = _mm256_add_pd(fiy1,ty);
370 fiz1 = _mm256_add_pd(fiz1,tz);
372 fjx2 = _mm256_add_pd(fjx2,tx);
373 fjy2 = _mm256_add_pd(fjy2,ty);
374 fjz2 = _mm256_add_pd(fjz2,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r13 = _mm256_mul_pd(rsq13,rinv13);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt = _mm256_mul_pd(r13,ewtabscale);
386 ewitab = _mm256_cvttpd_epi32(ewrt);
387 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
388 ewitab = _mm_slli_epi32(ewitab,2);
389 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
390 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
391 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
392 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
393 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
394 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
395 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
396 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
397 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm256_add_pd(velecsum,velec);
404 /* Calculate temporary vectorial force */
405 tx = _mm256_mul_pd(fscal,dx13);
406 ty = _mm256_mul_pd(fscal,dy13);
407 tz = _mm256_mul_pd(fscal,dz13);
409 /* Update vectorial force */
410 fix1 = _mm256_add_pd(fix1,tx);
411 fiy1 = _mm256_add_pd(fiy1,ty);
412 fiz1 = _mm256_add_pd(fiz1,tz);
414 fjx3 = _mm256_add_pd(fjx3,tx);
415 fjy3 = _mm256_add_pd(fjy3,ty);
416 fjz3 = _mm256_add_pd(fjz3,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r21 = _mm256_mul_pd(rsq21,rinv21);
424 /* EWALD ELECTROSTATICS */
426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
427 ewrt = _mm256_mul_pd(r21,ewtabscale);
428 ewitab = _mm256_cvttpd_epi32(ewrt);
429 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
430 ewitab = _mm_slli_epi32(ewitab,2);
431 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
432 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
433 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
434 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
435 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
436 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
437 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
438 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
439 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum = _mm256_add_pd(velecsum,velec);
446 /* Calculate temporary vectorial force */
447 tx = _mm256_mul_pd(fscal,dx21);
448 ty = _mm256_mul_pd(fscal,dy21);
449 tz = _mm256_mul_pd(fscal,dz21);
451 /* Update vectorial force */
452 fix2 = _mm256_add_pd(fix2,tx);
453 fiy2 = _mm256_add_pd(fiy2,ty);
454 fiz2 = _mm256_add_pd(fiz2,tz);
456 fjx1 = _mm256_add_pd(fjx1,tx);
457 fjy1 = _mm256_add_pd(fjy1,ty);
458 fjz1 = _mm256_add_pd(fjz1,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 r22 = _mm256_mul_pd(rsq22,rinv22);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt = _mm256_mul_pd(r22,ewtabscale);
470 ewitab = _mm256_cvttpd_epi32(ewrt);
471 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
475 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
476 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
477 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
478 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
479 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
480 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
481 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm256_add_pd(velecsum,velec);
488 /* Calculate temporary vectorial force */
489 tx = _mm256_mul_pd(fscal,dx22);
490 ty = _mm256_mul_pd(fscal,dy22);
491 tz = _mm256_mul_pd(fscal,dz22);
493 /* Update vectorial force */
494 fix2 = _mm256_add_pd(fix2,tx);
495 fiy2 = _mm256_add_pd(fiy2,ty);
496 fiz2 = _mm256_add_pd(fiz2,tz);
498 fjx2 = _mm256_add_pd(fjx2,tx);
499 fjy2 = _mm256_add_pd(fjy2,ty);
500 fjz2 = _mm256_add_pd(fjz2,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 r23 = _mm256_mul_pd(rsq23,rinv23);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt = _mm256_mul_pd(r23,ewtabscale);
512 ewitab = _mm256_cvttpd_epi32(ewrt);
513 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
514 ewitab = _mm_slli_epi32(ewitab,2);
515 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
516 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
517 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
518 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
519 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
520 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
521 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
522 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
523 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm256_add_pd(velecsum,velec);
530 /* Calculate temporary vectorial force */
531 tx = _mm256_mul_pd(fscal,dx23);
532 ty = _mm256_mul_pd(fscal,dy23);
533 tz = _mm256_mul_pd(fscal,dz23);
535 /* Update vectorial force */
536 fix2 = _mm256_add_pd(fix2,tx);
537 fiy2 = _mm256_add_pd(fiy2,ty);
538 fiz2 = _mm256_add_pd(fiz2,tz);
540 fjx3 = _mm256_add_pd(fjx3,tx);
541 fjy3 = _mm256_add_pd(fjy3,ty);
542 fjz3 = _mm256_add_pd(fjz3,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r31 = _mm256_mul_pd(rsq31,rinv31);
550 /* EWALD ELECTROSTATICS */
552 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
553 ewrt = _mm256_mul_pd(r31,ewtabscale);
554 ewitab = _mm256_cvttpd_epi32(ewrt);
555 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
556 ewitab = _mm_slli_epi32(ewitab,2);
557 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
558 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
559 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
560 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
561 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
562 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
563 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
564 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
565 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum = _mm256_add_pd(velecsum,velec);
572 /* Calculate temporary vectorial force */
573 tx = _mm256_mul_pd(fscal,dx31);
574 ty = _mm256_mul_pd(fscal,dy31);
575 tz = _mm256_mul_pd(fscal,dz31);
577 /* Update vectorial force */
578 fix3 = _mm256_add_pd(fix3,tx);
579 fiy3 = _mm256_add_pd(fiy3,ty);
580 fiz3 = _mm256_add_pd(fiz3,tz);
582 fjx1 = _mm256_add_pd(fjx1,tx);
583 fjy1 = _mm256_add_pd(fjy1,ty);
584 fjz1 = _mm256_add_pd(fjz1,tz);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 r32 = _mm256_mul_pd(rsq32,rinv32);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt = _mm256_mul_pd(r32,ewtabscale);
596 ewitab = _mm256_cvttpd_epi32(ewrt);
597 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
598 ewitab = _mm_slli_epi32(ewitab,2);
599 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
600 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
601 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
602 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
603 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
604 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
605 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
606 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
607 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velecsum = _mm256_add_pd(velecsum,velec);
614 /* Calculate temporary vectorial force */
615 tx = _mm256_mul_pd(fscal,dx32);
616 ty = _mm256_mul_pd(fscal,dy32);
617 tz = _mm256_mul_pd(fscal,dz32);
619 /* Update vectorial force */
620 fix3 = _mm256_add_pd(fix3,tx);
621 fiy3 = _mm256_add_pd(fiy3,ty);
622 fiz3 = _mm256_add_pd(fiz3,tz);
624 fjx2 = _mm256_add_pd(fjx2,tx);
625 fjy2 = _mm256_add_pd(fjy2,ty);
626 fjz2 = _mm256_add_pd(fjz2,tz);
628 /**************************
629 * CALCULATE INTERACTIONS *
630 **************************/
632 r33 = _mm256_mul_pd(rsq33,rinv33);
634 /* EWALD ELECTROSTATICS */
636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
637 ewrt = _mm256_mul_pd(r33,ewtabscale);
638 ewitab = _mm256_cvttpd_epi32(ewrt);
639 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
640 ewitab = _mm_slli_epi32(ewitab,2);
641 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
642 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
643 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
644 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
645 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
646 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
647 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
648 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
649 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum = _mm256_add_pd(velecsum,velec);
656 /* Calculate temporary vectorial force */
657 tx = _mm256_mul_pd(fscal,dx33);
658 ty = _mm256_mul_pd(fscal,dy33);
659 tz = _mm256_mul_pd(fscal,dz33);
661 /* Update vectorial force */
662 fix3 = _mm256_add_pd(fix3,tx);
663 fiy3 = _mm256_add_pd(fiy3,ty);
664 fiz3 = _mm256_add_pd(fiz3,tz);
666 fjx3 = _mm256_add_pd(fjx3,tx);
667 fjy3 = _mm256_add_pd(fjy3,ty);
668 fjz3 = _mm256_add_pd(fjz3,tz);
670 fjptrA = f+j_coord_offsetA;
671 fjptrB = f+j_coord_offsetB;
672 fjptrC = f+j_coord_offsetC;
673 fjptrD = f+j_coord_offsetD;
675 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
676 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
678 /* Inner loop uses 369 flops */
684 /* Get j neighbor index, and coordinate index */
685 jnrlistA = jjnr[jidx];
686 jnrlistB = jjnr[jidx+1];
687 jnrlistC = jjnr[jidx+2];
688 jnrlistD = jjnr[jidx+3];
689 /* Sign of each element will be negative for non-real atoms.
690 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
691 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
693 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
695 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
696 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
697 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
699 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
700 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
701 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
702 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
703 j_coord_offsetA = DIM*jnrA;
704 j_coord_offsetB = DIM*jnrB;
705 j_coord_offsetC = DIM*jnrC;
706 j_coord_offsetD = DIM*jnrD;
708 /* load j atom coordinates */
709 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
710 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
711 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
713 /* Calculate displacement vector */
714 dx11 = _mm256_sub_pd(ix1,jx1);
715 dy11 = _mm256_sub_pd(iy1,jy1);
716 dz11 = _mm256_sub_pd(iz1,jz1);
717 dx12 = _mm256_sub_pd(ix1,jx2);
718 dy12 = _mm256_sub_pd(iy1,jy2);
719 dz12 = _mm256_sub_pd(iz1,jz2);
720 dx13 = _mm256_sub_pd(ix1,jx3);
721 dy13 = _mm256_sub_pd(iy1,jy3);
722 dz13 = _mm256_sub_pd(iz1,jz3);
723 dx21 = _mm256_sub_pd(ix2,jx1);
724 dy21 = _mm256_sub_pd(iy2,jy1);
725 dz21 = _mm256_sub_pd(iz2,jz1);
726 dx22 = _mm256_sub_pd(ix2,jx2);
727 dy22 = _mm256_sub_pd(iy2,jy2);
728 dz22 = _mm256_sub_pd(iz2,jz2);
729 dx23 = _mm256_sub_pd(ix2,jx3);
730 dy23 = _mm256_sub_pd(iy2,jy3);
731 dz23 = _mm256_sub_pd(iz2,jz3);
732 dx31 = _mm256_sub_pd(ix3,jx1);
733 dy31 = _mm256_sub_pd(iy3,jy1);
734 dz31 = _mm256_sub_pd(iz3,jz1);
735 dx32 = _mm256_sub_pd(ix3,jx2);
736 dy32 = _mm256_sub_pd(iy3,jy2);
737 dz32 = _mm256_sub_pd(iz3,jz2);
738 dx33 = _mm256_sub_pd(ix3,jx3);
739 dy33 = _mm256_sub_pd(iy3,jy3);
740 dz33 = _mm256_sub_pd(iz3,jz3);
742 /* Calculate squared distance and things based on it */
743 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
744 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
745 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
746 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
747 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
748 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
749 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
750 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
751 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
753 rinv11 = avx256_invsqrt_d(rsq11);
754 rinv12 = avx256_invsqrt_d(rsq12);
755 rinv13 = avx256_invsqrt_d(rsq13);
756 rinv21 = avx256_invsqrt_d(rsq21);
757 rinv22 = avx256_invsqrt_d(rsq22);
758 rinv23 = avx256_invsqrt_d(rsq23);
759 rinv31 = avx256_invsqrt_d(rsq31);
760 rinv32 = avx256_invsqrt_d(rsq32);
761 rinv33 = avx256_invsqrt_d(rsq33);
763 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
764 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
765 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
766 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
767 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
768 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
769 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
770 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
771 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
773 fjx1 = _mm256_setzero_pd();
774 fjy1 = _mm256_setzero_pd();
775 fjz1 = _mm256_setzero_pd();
776 fjx2 = _mm256_setzero_pd();
777 fjy2 = _mm256_setzero_pd();
778 fjz2 = _mm256_setzero_pd();
779 fjx3 = _mm256_setzero_pd();
780 fjy3 = _mm256_setzero_pd();
781 fjz3 = _mm256_setzero_pd();
783 /**************************
784 * CALCULATE INTERACTIONS *
785 **************************/
787 r11 = _mm256_mul_pd(rsq11,rinv11);
788 r11 = _mm256_andnot_pd(dummy_mask,r11);
790 /* EWALD ELECTROSTATICS */
792 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
793 ewrt = _mm256_mul_pd(r11,ewtabscale);
794 ewitab = _mm256_cvttpd_epi32(ewrt);
795 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
796 ewitab = _mm_slli_epi32(ewitab,2);
797 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
798 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
799 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
800 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
801 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
802 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
803 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
804 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
805 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
807 /* Update potential sum for this i atom from the interaction with this j atom. */
808 velec = _mm256_andnot_pd(dummy_mask,velec);
809 velecsum = _mm256_add_pd(velecsum,velec);
813 fscal = _mm256_andnot_pd(dummy_mask,fscal);
815 /* Calculate temporary vectorial force */
816 tx = _mm256_mul_pd(fscal,dx11);
817 ty = _mm256_mul_pd(fscal,dy11);
818 tz = _mm256_mul_pd(fscal,dz11);
820 /* Update vectorial force */
821 fix1 = _mm256_add_pd(fix1,tx);
822 fiy1 = _mm256_add_pd(fiy1,ty);
823 fiz1 = _mm256_add_pd(fiz1,tz);
825 fjx1 = _mm256_add_pd(fjx1,tx);
826 fjy1 = _mm256_add_pd(fjy1,ty);
827 fjz1 = _mm256_add_pd(fjz1,tz);
829 /**************************
830 * CALCULATE INTERACTIONS *
831 **************************/
833 r12 = _mm256_mul_pd(rsq12,rinv12);
834 r12 = _mm256_andnot_pd(dummy_mask,r12);
836 /* EWALD ELECTROSTATICS */
838 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
839 ewrt = _mm256_mul_pd(r12,ewtabscale);
840 ewitab = _mm256_cvttpd_epi32(ewrt);
841 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
842 ewitab = _mm_slli_epi32(ewitab,2);
843 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
844 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
845 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
846 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
847 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
848 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
849 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
850 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
851 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
853 /* Update potential sum for this i atom from the interaction with this j atom. */
854 velec = _mm256_andnot_pd(dummy_mask,velec);
855 velecsum = _mm256_add_pd(velecsum,velec);
859 fscal = _mm256_andnot_pd(dummy_mask,fscal);
861 /* Calculate temporary vectorial force */
862 tx = _mm256_mul_pd(fscal,dx12);
863 ty = _mm256_mul_pd(fscal,dy12);
864 tz = _mm256_mul_pd(fscal,dz12);
866 /* Update vectorial force */
867 fix1 = _mm256_add_pd(fix1,tx);
868 fiy1 = _mm256_add_pd(fiy1,ty);
869 fiz1 = _mm256_add_pd(fiz1,tz);
871 fjx2 = _mm256_add_pd(fjx2,tx);
872 fjy2 = _mm256_add_pd(fjy2,ty);
873 fjz2 = _mm256_add_pd(fjz2,tz);
875 /**************************
876 * CALCULATE INTERACTIONS *
877 **************************/
879 r13 = _mm256_mul_pd(rsq13,rinv13);
880 r13 = _mm256_andnot_pd(dummy_mask,r13);
882 /* EWALD ELECTROSTATICS */
884 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
885 ewrt = _mm256_mul_pd(r13,ewtabscale);
886 ewitab = _mm256_cvttpd_epi32(ewrt);
887 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
888 ewitab = _mm_slli_epi32(ewitab,2);
889 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
890 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
891 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
892 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
893 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
894 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
895 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
896 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
897 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
899 /* Update potential sum for this i atom from the interaction with this j atom. */
900 velec = _mm256_andnot_pd(dummy_mask,velec);
901 velecsum = _mm256_add_pd(velecsum,velec);
905 fscal = _mm256_andnot_pd(dummy_mask,fscal);
907 /* Calculate temporary vectorial force */
908 tx = _mm256_mul_pd(fscal,dx13);
909 ty = _mm256_mul_pd(fscal,dy13);
910 tz = _mm256_mul_pd(fscal,dz13);
912 /* Update vectorial force */
913 fix1 = _mm256_add_pd(fix1,tx);
914 fiy1 = _mm256_add_pd(fiy1,ty);
915 fiz1 = _mm256_add_pd(fiz1,tz);
917 fjx3 = _mm256_add_pd(fjx3,tx);
918 fjy3 = _mm256_add_pd(fjy3,ty);
919 fjz3 = _mm256_add_pd(fjz3,tz);
921 /**************************
922 * CALCULATE INTERACTIONS *
923 **************************/
925 r21 = _mm256_mul_pd(rsq21,rinv21);
926 r21 = _mm256_andnot_pd(dummy_mask,r21);
928 /* EWALD ELECTROSTATICS */
930 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
931 ewrt = _mm256_mul_pd(r21,ewtabscale);
932 ewitab = _mm256_cvttpd_epi32(ewrt);
933 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
934 ewitab = _mm_slli_epi32(ewitab,2);
935 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
936 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
937 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
938 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
939 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
940 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
941 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
942 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
943 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
945 /* Update potential sum for this i atom from the interaction with this j atom. */
946 velec = _mm256_andnot_pd(dummy_mask,velec);
947 velecsum = _mm256_add_pd(velecsum,velec);
951 fscal = _mm256_andnot_pd(dummy_mask,fscal);
953 /* Calculate temporary vectorial force */
954 tx = _mm256_mul_pd(fscal,dx21);
955 ty = _mm256_mul_pd(fscal,dy21);
956 tz = _mm256_mul_pd(fscal,dz21);
958 /* Update vectorial force */
959 fix2 = _mm256_add_pd(fix2,tx);
960 fiy2 = _mm256_add_pd(fiy2,ty);
961 fiz2 = _mm256_add_pd(fiz2,tz);
963 fjx1 = _mm256_add_pd(fjx1,tx);
964 fjy1 = _mm256_add_pd(fjy1,ty);
965 fjz1 = _mm256_add_pd(fjz1,tz);
967 /**************************
968 * CALCULATE INTERACTIONS *
969 **************************/
971 r22 = _mm256_mul_pd(rsq22,rinv22);
972 r22 = _mm256_andnot_pd(dummy_mask,r22);
974 /* EWALD ELECTROSTATICS */
976 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
977 ewrt = _mm256_mul_pd(r22,ewtabscale);
978 ewitab = _mm256_cvttpd_epi32(ewrt);
979 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
980 ewitab = _mm_slli_epi32(ewitab,2);
981 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
982 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
983 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
984 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
985 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
986 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
987 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
988 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
989 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
991 /* Update potential sum for this i atom from the interaction with this j atom. */
992 velec = _mm256_andnot_pd(dummy_mask,velec);
993 velecsum = _mm256_add_pd(velecsum,velec);
997 fscal = _mm256_andnot_pd(dummy_mask,fscal);
999 /* Calculate temporary vectorial force */
1000 tx = _mm256_mul_pd(fscal,dx22);
1001 ty = _mm256_mul_pd(fscal,dy22);
1002 tz = _mm256_mul_pd(fscal,dz22);
1004 /* Update vectorial force */
1005 fix2 = _mm256_add_pd(fix2,tx);
1006 fiy2 = _mm256_add_pd(fiy2,ty);
1007 fiz2 = _mm256_add_pd(fiz2,tz);
1009 fjx2 = _mm256_add_pd(fjx2,tx);
1010 fjy2 = _mm256_add_pd(fjy2,ty);
1011 fjz2 = _mm256_add_pd(fjz2,tz);
1013 /**************************
1014 * CALCULATE INTERACTIONS *
1015 **************************/
1017 r23 = _mm256_mul_pd(rsq23,rinv23);
1018 r23 = _mm256_andnot_pd(dummy_mask,r23);
1020 /* EWALD ELECTROSTATICS */
1022 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1023 ewrt = _mm256_mul_pd(r23,ewtabscale);
1024 ewitab = _mm256_cvttpd_epi32(ewrt);
1025 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1026 ewitab = _mm_slli_epi32(ewitab,2);
1027 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1028 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1029 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1030 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1031 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1032 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1033 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1034 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1035 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1037 /* Update potential sum for this i atom from the interaction with this j atom. */
1038 velec = _mm256_andnot_pd(dummy_mask,velec);
1039 velecsum = _mm256_add_pd(velecsum,velec);
1043 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1045 /* Calculate temporary vectorial force */
1046 tx = _mm256_mul_pd(fscal,dx23);
1047 ty = _mm256_mul_pd(fscal,dy23);
1048 tz = _mm256_mul_pd(fscal,dz23);
1050 /* Update vectorial force */
1051 fix2 = _mm256_add_pd(fix2,tx);
1052 fiy2 = _mm256_add_pd(fiy2,ty);
1053 fiz2 = _mm256_add_pd(fiz2,tz);
1055 fjx3 = _mm256_add_pd(fjx3,tx);
1056 fjy3 = _mm256_add_pd(fjy3,ty);
1057 fjz3 = _mm256_add_pd(fjz3,tz);
1059 /**************************
1060 * CALCULATE INTERACTIONS *
1061 **************************/
1063 r31 = _mm256_mul_pd(rsq31,rinv31);
1064 r31 = _mm256_andnot_pd(dummy_mask,r31);
1066 /* EWALD ELECTROSTATICS */
1068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1069 ewrt = _mm256_mul_pd(r31,ewtabscale);
1070 ewitab = _mm256_cvttpd_epi32(ewrt);
1071 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1072 ewitab = _mm_slli_epi32(ewitab,2);
1073 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1074 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1075 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1076 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1077 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1078 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1079 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1080 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1081 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1083 /* Update potential sum for this i atom from the interaction with this j atom. */
1084 velec = _mm256_andnot_pd(dummy_mask,velec);
1085 velecsum = _mm256_add_pd(velecsum,velec);
1089 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1091 /* Calculate temporary vectorial force */
1092 tx = _mm256_mul_pd(fscal,dx31);
1093 ty = _mm256_mul_pd(fscal,dy31);
1094 tz = _mm256_mul_pd(fscal,dz31);
1096 /* Update vectorial force */
1097 fix3 = _mm256_add_pd(fix3,tx);
1098 fiy3 = _mm256_add_pd(fiy3,ty);
1099 fiz3 = _mm256_add_pd(fiz3,tz);
1101 fjx1 = _mm256_add_pd(fjx1,tx);
1102 fjy1 = _mm256_add_pd(fjy1,ty);
1103 fjz1 = _mm256_add_pd(fjz1,tz);
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 r32 = _mm256_mul_pd(rsq32,rinv32);
1110 r32 = _mm256_andnot_pd(dummy_mask,r32);
1112 /* EWALD ELECTROSTATICS */
1114 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1115 ewrt = _mm256_mul_pd(r32,ewtabscale);
1116 ewitab = _mm256_cvttpd_epi32(ewrt);
1117 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1118 ewitab = _mm_slli_epi32(ewitab,2);
1119 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1120 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1121 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1122 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1123 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1124 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1125 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1126 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1127 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1129 /* Update potential sum for this i atom from the interaction with this j atom. */
1130 velec = _mm256_andnot_pd(dummy_mask,velec);
1131 velecsum = _mm256_add_pd(velecsum,velec);
1135 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1137 /* Calculate temporary vectorial force */
1138 tx = _mm256_mul_pd(fscal,dx32);
1139 ty = _mm256_mul_pd(fscal,dy32);
1140 tz = _mm256_mul_pd(fscal,dz32);
1142 /* Update vectorial force */
1143 fix3 = _mm256_add_pd(fix3,tx);
1144 fiy3 = _mm256_add_pd(fiy3,ty);
1145 fiz3 = _mm256_add_pd(fiz3,tz);
1147 fjx2 = _mm256_add_pd(fjx2,tx);
1148 fjy2 = _mm256_add_pd(fjy2,ty);
1149 fjz2 = _mm256_add_pd(fjz2,tz);
1151 /**************************
1152 * CALCULATE INTERACTIONS *
1153 **************************/
1155 r33 = _mm256_mul_pd(rsq33,rinv33);
1156 r33 = _mm256_andnot_pd(dummy_mask,r33);
1158 /* EWALD ELECTROSTATICS */
1160 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1161 ewrt = _mm256_mul_pd(r33,ewtabscale);
1162 ewitab = _mm256_cvttpd_epi32(ewrt);
1163 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1164 ewitab = _mm_slli_epi32(ewitab,2);
1165 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1166 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1167 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1168 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1169 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1170 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1171 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1172 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1173 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1175 /* Update potential sum for this i atom from the interaction with this j atom. */
1176 velec = _mm256_andnot_pd(dummy_mask,velec);
1177 velecsum = _mm256_add_pd(velecsum,velec);
1181 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1183 /* Calculate temporary vectorial force */
1184 tx = _mm256_mul_pd(fscal,dx33);
1185 ty = _mm256_mul_pd(fscal,dy33);
1186 tz = _mm256_mul_pd(fscal,dz33);
1188 /* Update vectorial force */
1189 fix3 = _mm256_add_pd(fix3,tx);
1190 fiy3 = _mm256_add_pd(fiy3,ty);
1191 fiz3 = _mm256_add_pd(fiz3,tz);
1193 fjx3 = _mm256_add_pd(fjx3,tx);
1194 fjy3 = _mm256_add_pd(fjy3,ty);
1195 fjz3 = _mm256_add_pd(fjz3,tz);
1197 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1198 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1199 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1200 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1202 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1203 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1205 /* Inner loop uses 378 flops */
1208 /* End of innermost loop */
1210 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1211 f+i_coord_offset+DIM,fshift+i_shift_offset);
1214 /* Update potential energies */
1215 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1217 /* Increment number of inner iterations */
1218 inneriter += j_index_end - j_index_start;
1220 /* Outer loop uses 19 flops */
1223 /* Increment number of outer iterations */
1226 /* Update outer/inner flops */
1228 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*378);
1231 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_double
1232 * Electrostatics interaction: Ewald
1233 * VdW interaction: None
1234 * Geometry: Water4-Water4
1235 * Calculate force/pot: Force
1238 nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_double
1239 (t_nblist * gmx_restrict nlist,
1240 rvec * gmx_restrict xx,
1241 rvec * gmx_restrict ff,
1242 struct t_forcerec * gmx_restrict fr,
1243 t_mdatoms * gmx_restrict mdatoms,
1244 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1245 t_nrnb * gmx_restrict nrnb)
1247 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1248 * just 0 for non-waters.
1249 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1250 * jnr indices corresponding to data put in the four positions in the SIMD register.
1252 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1253 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1254 int jnrA,jnrB,jnrC,jnrD;
1255 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1256 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1257 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1258 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1259 real rcutoff_scalar;
1260 real *shiftvec,*fshift,*x,*f;
1261 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1262 real scratch[4*DIM];
1263 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1264 real * vdwioffsetptr1;
1265 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1266 real * vdwioffsetptr2;
1267 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1268 real * vdwioffsetptr3;
1269 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1270 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1271 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1272 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1273 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1274 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1275 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1276 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1277 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1278 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1279 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1280 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1281 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1282 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1283 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1284 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1285 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1288 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1289 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1291 __m256d dummy_mask,cutoff_mask;
1292 __m128 tmpmask0,tmpmask1;
1293 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1294 __m256d one = _mm256_set1_pd(1.0);
1295 __m256d two = _mm256_set1_pd(2.0);
1301 jindex = nlist->jindex;
1303 shiftidx = nlist->shift;
1305 shiftvec = fr->shift_vec[0];
1306 fshift = fr->fshift[0];
1307 facel = _mm256_set1_pd(fr->ic->epsfac);
1308 charge = mdatoms->chargeA;
1310 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1311 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1312 beta2 = _mm256_mul_pd(beta,beta);
1313 beta3 = _mm256_mul_pd(beta,beta2);
1315 ewtab = fr->ic->tabq_coul_F;
1316 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1317 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1319 /* Setup water-specific parameters */
1320 inr = nlist->iinr[0];
1321 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1322 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1323 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1325 jq1 = _mm256_set1_pd(charge[inr+1]);
1326 jq2 = _mm256_set1_pd(charge[inr+2]);
1327 jq3 = _mm256_set1_pd(charge[inr+3]);
1328 qq11 = _mm256_mul_pd(iq1,jq1);
1329 qq12 = _mm256_mul_pd(iq1,jq2);
1330 qq13 = _mm256_mul_pd(iq1,jq3);
1331 qq21 = _mm256_mul_pd(iq2,jq1);
1332 qq22 = _mm256_mul_pd(iq2,jq2);
1333 qq23 = _mm256_mul_pd(iq2,jq3);
1334 qq31 = _mm256_mul_pd(iq3,jq1);
1335 qq32 = _mm256_mul_pd(iq3,jq2);
1336 qq33 = _mm256_mul_pd(iq3,jq3);
1338 /* Avoid stupid compiler warnings */
1339 jnrA = jnrB = jnrC = jnrD = 0;
1340 j_coord_offsetA = 0;
1341 j_coord_offsetB = 0;
1342 j_coord_offsetC = 0;
1343 j_coord_offsetD = 0;
1348 for(iidx=0;iidx<4*DIM;iidx++)
1350 scratch[iidx] = 0.0;
1353 /* Start outer loop over neighborlists */
1354 for(iidx=0; iidx<nri; iidx++)
1356 /* Load shift vector for this list */
1357 i_shift_offset = DIM*shiftidx[iidx];
1359 /* Load limits for loop over neighbors */
1360 j_index_start = jindex[iidx];
1361 j_index_end = jindex[iidx+1];
1363 /* Get outer coordinate index */
1365 i_coord_offset = DIM*inr;
1367 /* Load i particle coords and add shift vector */
1368 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1369 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1371 fix1 = _mm256_setzero_pd();
1372 fiy1 = _mm256_setzero_pd();
1373 fiz1 = _mm256_setzero_pd();
1374 fix2 = _mm256_setzero_pd();
1375 fiy2 = _mm256_setzero_pd();
1376 fiz2 = _mm256_setzero_pd();
1377 fix3 = _mm256_setzero_pd();
1378 fiy3 = _mm256_setzero_pd();
1379 fiz3 = _mm256_setzero_pd();
1381 /* Start inner kernel loop */
1382 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1385 /* Get j neighbor index, and coordinate index */
1387 jnrB = jjnr[jidx+1];
1388 jnrC = jjnr[jidx+2];
1389 jnrD = jjnr[jidx+3];
1390 j_coord_offsetA = DIM*jnrA;
1391 j_coord_offsetB = DIM*jnrB;
1392 j_coord_offsetC = DIM*jnrC;
1393 j_coord_offsetD = DIM*jnrD;
1395 /* load j atom coordinates */
1396 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1397 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1398 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1400 /* Calculate displacement vector */
1401 dx11 = _mm256_sub_pd(ix1,jx1);
1402 dy11 = _mm256_sub_pd(iy1,jy1);
1403 dz11 = _mm256_sub_pd(iz1,jz1);
1404 dx12 = _mm256_sub_pd(ix1,jx2);
1405 dy12 = _mm256_sub_pd(iy1,jy2);
1406 dz12 = _mm256_sub_pd(iz1,jz2);
1407 dx13 = _mm256_sub_pd(ix1,jx3);
1408 dy13 = _mm256_sub_pd(iy1,jy3);
1409 dz13 = _mm256_sub_pd(iz1,jz3);
1410 dx21 = _mm256_sub_pd(ix2,jx1);
1411 dy21 = _mm256_sub_pd(iy2,jy1);
1412 dz21 = _mm256_sub_pd(iz2,jz1);
1413 dx22 = _mm256_sub_pd(ix2,jx2);
1414 dy22 = _mm256_sub_pd(iy2,jy2);
1415 dz22 = _mm256_sub_pd(iz2,jz2);
1416 dx23 = _mm256_sub_pd(ix2,jx3);
1417 dy23 = _mm256_sub_pd(iy2,jy3);
1418 dz23 = _mm256_sub_pd(iz2,jz3);
1419 dx31 = _mm256_sub_pd(ix3,jx1);
1420 dy31 = _mm256_sub_pd(iy3,jy1);
1421 dz31 = _mm256_sub_pd(iz3,jz1);
1422 dx32 = _mm256_sub_pd(ix3,jx2);
1423 dy32 = _mm256_sub_pd(iy3,jy2);
1424 dz32 = _mm256_sub_pd(iz3,jz2);
1425 dx33 = _mm256_sub_pd(ix3,jx3);
1426 dy33 = _mm256_sub_pd(iy3,jy3);
1427 dz33 = _mm256_sub_pd(iz3,jz3);
1429 /* Calculate squared distance and things based on it */
1430 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1431 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1432 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1433 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1434 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1435 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1436 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1437 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1438 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1440 rinv11 = avx256_invsqrt_d(rsq11);
1441 rinv12 = avx256_invsqrt_d(rsq12);
1442 rinv13 = avx256_invsqrt_d(rsq13);
1443 rinv21 = avx256_invsqrt_d(rsq21);
1444 rinv22 = avx256_invsqrt_d(rsq22);
1445 rinv23 = avx256_invsqrt_d(rsq23);
1446 rinv31 = avx256_invsqrt_d(rsq31);
1447 rinv32 = avx256_invsqrt_d(rsq32);
1448 rinv33 = avx256_invsqrt_d(rsq33);
1450 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1451 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1452 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1453 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1454 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1455 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1456 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1457 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1458 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1460 fjx1 = _mm256_setzero_pd();
1461 fjy1 = _mm256_setzero_pd();
1462 fjz1 = _mm256_setzero_pd();
1463 fjx2 = _mm256_setzero_pd();
1464 fjy2 = _mm256_setzero_pd();
1465 fjz2 = _mm256_setzero_pd();
1466 fjx3 = _mm256_setzero_pd();
1467 fjy3 = _mm256_setzero_pd();
1468 fjz3 = _mm256_setzero_pd();
1470 /**************************
1471 * CALCULATE INTERACTIONS *
1472 **************************/
1474 r11 = _mm256_mul_pd(rsq11,rinv11);
1476 /* EWALD ELECTROSTATICS */
1478 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1479 ewrt = _mm256_mul_pd(r11,ewtabscale);
1480 ewitab = _mm256_cvttpd_epi32(ewrt);
1481 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1482 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1483 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1485 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1486 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1490 /* Calculate temporary vectorial force */
1491 tx = _mm256_mul_pd(fscal,dx11);
1492 ty = _mm256_mul_pd(fscal,dy11);
1493 tz = _mm256_mul_pd(fscal,dz11);
1495 /* Update vectorial force */
1496 fix1 = _mm256_add_pd(fix1,tx);
1497 fiy1 = _mm256_add_pd(fiy1,ty);
1498 fiz1 = _mm256_add_pd(fiz1,tz);
1500 fjx1 = _mm256_add_pd(fjx1,tx);
1501 fjy1 = _mm256_add_pd(fjy1,ty);
1502 fjz1 = _mm256_add_pd(fjz1,tz);
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 r12 = _mm256_mul_pd(rsq12,rinv12);
1510 /* EWALD ELECTROSTATICS */
1512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1513 ewrt = _mm256_mul_pd(r12,ewtabscale);
1514 ewitab = _mm256_cvttpd_epi32(ewrt);
1515 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1516 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1517 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1519 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1520 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1524 /* Calculate temporary vectorial force */
1525 tx = _mm256_mul_pd(fscal,dx12);
1526 ty = _mm256_mul_pd(fscal,dy12);
1527 tz = _mm256_mul_pd(fscal,dz12);
1529 /* Update vectorial force */
1530 fix1 = _mm256_add_pd(fix1,tx);
1531 fiy1 = _mm256_add_pd(fiy1,ty);
1532 fiz1 = _mm256_add_pd(fiz1,tz);
1534 fjx2 = _mm256_add_pd(fjx2,tx);
1535 fjy2 = _mm256_add_pd(fjy2,ty);
1536 fjz2 = _mm256_add_pd(fjz2,tz);
1538 /**************************
1539 * CALCULATE INTERACTIONS *
1540 **************************/
1542 r13 = _mm256_mul_pd(rsq13,rinv13);
1544 /* EWALD ELECTROSTATICS */
1546 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1547 ewrt = _mm256_mul_pd(r13,ewtabscale);
1548 ewitab = _mm256_cvttpd_epi32(ewrt);
1549 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1550 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1551 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1553 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1554 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1558 /* Calculate temporary vectorial force */
1559 tx = _mm256_mul_pd(fscal,dx13);
1560 ty = _mm256_mul_pd(fscal,dy13);
1561 tz = _mm256_mul_pd(fscal,dz13);
1563 /* Update vectorial force */
1564 fix1 = _mm256_add_pd(fix1,tx);
1565 fiy1 = _mm256_add_pd(fiy1,ty);
1566 fiz1 = _mm256_add_pd(fiz1,tz);
1568 fjx3 = _mm256_add_pd(fjx3,tx);
1569 fjy3 = _mm256_add_pd(fjy3,ty);
1570 fjz3 = _mm256_add_pd(fjz3,tz);
1572 /**************************
1573 * CALCULATE INTERACTIONS *
1574 **************************/
1576 r21 = _mm256_mul_pd(rsq21,rinv21);
1578 /* EWALD ELECTROSTATICS */
1580 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1581 ewrt = _mm256_mul_pd(r21,ewtabscale);
1582 ewitab = _mm256_cvttpd_epi32(ewrt);
1583 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1584 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1585 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1587 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1588 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1592 /* Calculate temporary vectorial force */
1593 tx = _mm256_mul_pd(fscal,dx21);
1594 ty = _mm256_mul_pd(fscal,dy21);
1595 tz = _mm256_mul_pd(fscal,dz21);
1597 /* Update vectorial force */
1598 fix2 = _mm256_add_pd(fix2,tx);
1599 fiy2 = _mm256_add_pd(fiy2,ty);
1600 fiz2 = _mm256_add_pd(fiz2,tz);
1602 fjx1 = _mm256_add_pd(fjx1,tx);
1603 fjy1 = _mm256_add_pd(fjy1,ty);
1604 fjz1 = _mm256_add_pd(fjz1,tz);
1606 /**************************
1607 * CALCULATE INTERACTIONS *
1608 **************************/
1610 r22 = _mm256_mul_pd(rsq22,rinv22);
1612 /* EWALD ELECTROSTATICS */
1614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1615 ewrt = _mm256_mul_pd(r22,ewtabscale);
1616 ewitab = _mm256_cvttpd_epi32(ewrt);
1617 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1618 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1619 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1621 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1622 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1626 /* Calculate temporary vectorial force */
1627 tx = _mm256_mul_pd(fscal,dx22);
1628 ty = _mm256_mul_pd(fscal,dy22);
1629 tz = _mm256_mul_pd(fscal,dz22);
1631 /* Update vectorial force */
1632 fix2 = _mm256_add_pd(fix2,tx);
1633 fiy2 = _mm256_add_pd(fiy2,ty);
1634 fiz2 = _mm256_add_pd(fiz2,tz);
1636 fjx2 = _mm256_add_pd(fjx2,tx);
1637 fjy2 = _mm256_add_pd(fjy2,ty);
1638 fjz2 = _mm256_add_pd(fjz2,tz);
1640 /**************************
1641 * CALCULATE INTERACTIONS *
1642 **************************/
1644 r23 = _mm256_mul_pd(rsq23,rinv23);
1646 /* EWALD ELECTROSTATICS */
1648 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1649 ewrt = _mm256_mul_pd(r23,ewtabscale);
1650 ewitab = _mm256_cvttpd_epi32(ewrt);
1651 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1652 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1653 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1655 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1656 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1660 /* Calculate temporary vectorial force */
1661 tx = _mm256_mul_pd(fscal,dx23);
1662 ty = _mm256_mul_pd(fscal,dy23);
1663 tz = _mm256_mul_pd(fscal,dz23);
1665 /* Update vectorial force */
1666 fix2 = _mm256_add_pd(fix2,tx);
1667 fiy2 = _mm256_add_pd(fiy2,ty);
1668 fiz2 = _mm256_add_pd(fiz2,tz);
1670 fjx3 = _mm256_add_pd(fjx3,tx);
1671 fjy3 = _mm256_add_pd(fjy3,ty);
1672 fjz3 = _mm256_add_pd(fjz3,tz);
1674 /**************************
1675 * CALCULATE INTERACTIONS *
1676 **************************/
1678 r31 = _mm256_mul_pd(rsq31,rinv31);
1680 /* EWALD ELECTROSTATICS */
1682 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683 ewrt = _mm256_mul_pd(r31,ewtabscale);
1684 ewitab = _mm256_cvttpd_epi32(ewrt);
1685 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1686 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1687 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1689 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1690 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1694 /* Calculate temporary vectorial force */
1695 tx = _mm256_mul_pd(fscal,dx31);
1696 ty = _mm256_mul_pd(fscal,dy31);
1697 tz = _mm256_mul_pd(fscal,dz31);
1699 /* Update vectorial force */
1700 fix3 = _mm256_add_pd(fix3,tx);
1701 fiy3 = _mm256_add_pd(fiy3,ty);
1702 fiz3 = _mm256_add_pd(fiz3,tz);
1704 fjx1 = _mm256_add_pd(fjx1,tx);
1705 fjy1 = _mm256_add_pd(fjy1,ty);
1706 fjz1 = _mm256_add_pd(fjz1,tz);
1708 /**************************
1709 * CALCULATE INTERACTIONS *
1710 **************************/
1712 r32 = _mm256_mul_pd(rsq32,rinv32);
1714 /* EWALD ELECTROSTATICS */
1716 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1717 ewrt = _mm256_mul_pd(r32,ewtabscale);
1718 ewitab = _mm256_cvttpd_epi32(ewrt);
1719 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1720 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1721 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1723 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1724 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1728 /* Calculate temporary vectorial force */
1729 tx = _mm256_mul_pd(fscal,dx32);
1730 ty = _mm256_mul_pd(fscal,dy32);
1731 tz = _mm256_mul_pd(fscal,dz32);
1733 /* Update vectorial force */
1734 fix3 = _mm256_add_pd(fix3,tx);
1735 fiy3 = _mm256_add_pd(fiy3,ty);
1736 fiz3 = _mm256_add_pd(fiz3,tz);
1738 fjx2 = _mm256_add_pd(fjx2,tx);
1739 fjy2 = _mm256_add_pd(fjy2,ty);
1740 fjz2 = _mm256_add_pd(fjz2,tz);
1742 /**************************
1743 * CALCULATE INTERACTIONS *
1744 **************************/
1746 r33 = _mm256_mul_pd(rsq33,rinv33);
1748 /* EWALD ELECTROSTATICS */
1750 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1751 ewrt = _mm256_mul_pd(r33,ewtabscale);
1752 ewitab = _mm256_cvttpd_epi32(ewrt);
1753 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1754 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1755 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1757 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1758 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1762 /* Calculate temporary vectorial force */
1763 tx = _mm256_mul_pd(fscal,dx33);
1764 ty = _mm256_mul_pd(fscal,dy33);
1765 tz = _mm256_mul_pd(fscal,dz33);
1767 /* Update vectorial force */
1768 fix3 = _mm256_add_pd(fix3,tx);
1769 fiy3 = _mm256_add_pd(fiy3,ty);
1770 fiz3 = _mm256_add_pd(fiz3,tz);
1772 fjx3 = _mm256_add_pd(fjx3,tx);
1773 fjy3 = _mm256_add_pd(fjy3,ty);
1774 fjz3 = _mm256_add_pd(fjz3,tz);
1776 fjptrA = f+j_coord_offsetA;
1777 fjptrB = f+j_coord_offsetB;
1778 fjptrC = f+j_coord_offsetC;
1779 fjptrD = f+j_coord_offsetD;
1781 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1782 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1784 /* Inner loop uses 324 flops */
1787 if(jidx<j_index_end)
1790 /* Get j neighbor index, and coordinate index */
1791 jnrlistA = jjnr[jidx];
1792 jnrlistB = jjnr[jidx+1];
1793 jnrlistC = jjnr[jidx+2];
1794 jnrlistD = jjnr[jidx+3];
1795 /* Sign of each element will be negative for non-real atoms.
1796 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1797 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1799 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1801 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1802 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1803 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1805 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1806 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1807 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1808 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1809 j_coord_offsetA = DIM*jnrA;
1810 j_coord_offsetB = DIM*jnrB;
1811 j_coord_offsetC = DIM*jnrC;
1812 j_coord_offsetD = DIM*jnrD;
1814 /* load j atom coordinates */
1815 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1816 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1817 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1819 /* Calculate displacement vector */
1820 dx11 = _mm256_sub_pd(ix1,jx1);
1821 dy11 = _mm256_sub_pd(iy1,jy1);
1822 dz11 = _mm256_sub_pd(iz1,jz1);
1823 dx12 = _mm256_sub_pd(ix1,jx2);
1824 dy12 = _mm256_sub_pd(iy1,jy2);
1825 dz12 = _mm256_sub_pd(iz1,jz2);
1826 dx13 = _mm256_sub_pd(ix1,jx3);
1827 dy13 = _mm256_sub_pd(iy1,jy3);
1828 dz13 = _mm256_sub_pd(iz1,jz3);
1829 dx21 = _mm256_sub_pd(ix2,jx1);
1830 dy21 = _mm256_sub_pd(iy2,jy1);
1831 dz21 = _mm256_sub_pd(iz2,jz1);
1832 dx22 = _mm256_sub_pd(ix2,jx2);
1833 dy22 = _mm256_sub_pd(iy2,jy2);
1834 dz22 = _mm256_sub_pd(iz2,jz2);
1835 dx23 = _mm256_sub_pd(ix2,jx3);
1836 dy23 = _mm256_sub_pd(iy2,jy3);
1837 dz23 = _mm256_sub_pd(iz2,jz3);
1838 dx31 = _mm256_sub_pd(ix3,jx1);
1839 dy31 = _mm256_sub_pd(iy3,jy1);
1840 dz31 = _mm256_sub_pd(iz3,jz1);
1841 dx32 = _mm256_sub_pd(ix3,jx2);
1842 dy32 = _mm256_sub_pd(iy3,jy2);
1843 dz32 = _mm256_sub_pd(iz3,jz2);
1844 dx33 = _mm256_sub_pd(ix3,jx3);
1845 dy33 = _mm256_sub_pd(iy3,jy3);
1846 dz33 = _mm256_sub_pd(iz3,jz3);
1848 /* Calculate squared distance and things based on it */
1849 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1850 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1851 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1852 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1853 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1854 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1855 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1856 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1857 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1859 rinv11 = avx256_invsqrt_d(rsq11);
1860 rinv12 = avx256_invsqrt_d(rsq12);
1861 rinv13 = avx256_invsqrt_d(rsq13);
1862 rinv21 = avx256_invsqrt_d(rsq21);
1863 rinv22 = avx256_invsqrt_d(rsq22);
1864 rinv23 = avx256_invsqrt_d(rsq23);
1865 rinv31 = avx256_invsqrt_d(rsq31);
1866 rinv32 = avx256_invsqrt_d(rsq32);
1867 rinv33 = avx256_invsqrt_d(rsq33);
1869 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1870 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1871 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1872 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1873 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1874 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1875 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1876 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1877 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1879 fjx1 = _mm256_setzero_pd();
1880 fjy1 = _mm256_setzero_pd();
1881 fjz1 = _mm256_setzero_pd();
1882 fjx2 = _mm256_setzero_pd();
1883 fjy2 = _mm256_setzero_pd();
1884 fjz2 = _mm256_setzero_pd();
1885 fjx3 = _mm256_setzero_pd();
1886 fjy3 = _mm256_setzero_pd();
1887 fjz3 = _mm256_setzero_pd();
1889 /**************************
1890 * CALCULATE INTERACTIONS *
1891 **************************/
1893 r11 = _mm256_mul_pd(rsq11,rinv11);
1894 r11 = _mm256_andnot_pd(dummy_mask,r11);
1896 /* EWALD ELECTROSTATICS */
1898 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1899 ewrt = _mm256_mul_pd(r11,ewtabscale);
1900 ewitab = _mm256_cvttpd_epi32(ewrt);
1901 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1902 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1903 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1905 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1906 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1910 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1912 /* Calculate temporary vectorial force */
1913 tx = _mm256_mul_pd(fscal,dx11);
1914 ty = _mm256_mul_pd(fscal,dy11);
1915 tz = _mm256_mul_pd(fscal,dz11);
1917 /* Update vectorial force */
1918 fix1 = _mm256_add_pd(fix1,tx);
1919 fiy1 = _mm256_add_pd(fiy1,ty);
1920 fiz1 = _mm256_add_pd(fiz1,tz);
1922 fjx1 = _mm256_add_pd(fjx1,tx);
1923 fjy1 = _mm256_add_pd(fjy1,ty);
1924 fjz1 = _mm256_add_pd(fjz1,tz);
1926 /**************************
1927 * CALCULATE INTERACTIONS *
1928 **************************/
1930 r12 = _mm256_mul_pd(rsq12,rinv12);
1931 r12 = _mm256_andnot_pd(dummy_mask,r12);
1933 /* EWALD ELECTROSTATICS */
1935 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1936 ewrt = _mm256_mul_pd(r12,ewtabscale);
1937 ewitab = _mm256_cvttpd_epi32(ewrt);
1938 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1939 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1940 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1942 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1943 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1947 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1949 /* Calculate temporary vectorial force */
1950 tx = _mm256_mul_pd(fscal,dx12);
1951 ty = _mm256_mul_pd(fscal,dy12);
1952 tz = _mm256_mul_pd(fscal,dz12);
1954 /* Update vectorial force */
1955 fix1 = _mm256_add_pd(fix1,tx);
1956 fiy1 = _mm256_add_pd(fiy1,ty);
1957 fiz1 = _mm256_add_pd(fiz1,tz);
1959 fjx2 = _mm256_add_pd(fjx2,tx);
1960 fjy2 = _mm256_add_pd(fjy2,ty);
1961 fjz2 = _mm256_add_pd(fjz2,tz);
1963 /**************************
1964 * CALCULATE INTERACTIONS *
1965 **************************/
1967 r13 = _mm256_mul_pd(rsq13,rinv13);
1968 r13 = _mm256_andnot_pd(dummy_mask,r13);
1970 /* EWALD ELECTROSTATICS */
1972 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1973 ewrt = _mm256_mul_pd(r13,ewtabscale);
1974 ewitab = _mm256_cvttpd_epi32(ewrt);
1975 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1976 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1977 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1979 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1980 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1984 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1986 /* Calculate temporary vectorial force */
1987 tx = _mm256_mul_pd(fscal,dx13);
1988 ty = _mm256_mul_pd(fscal,dy13);
1989 tz = _mm256_mul_pd(fscal,dz13);
1991 /* Update vectorial force */
1992 fix1 = _mm256_add_pd(fix1,tx);
1993 fiy1 = _mm256_add_pd(fiy1,ty);
1994 fiz1 = _mm256_add_pd(fiz1,tz);
1996 fjx3 = _mm256_add_pd(fjx3,tx);
1997 fjy3 = _mm256_add_pd(fjy3,ty);
1998 fjz3 = _mm256_add_pd(fjz3,tz);
2000 /**************************
2001 * CALCULATE INTERACTIONS *
2002 **************************/
2004 r21 = _mm256_mul_pd(rsq21,rinv21);
2005 r21 = _mm256_andnot_pd(dummy_mask,r21);
2007 /* EWALD ELECTROSTATICS */
2009 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2010 ewrt = _mm256_mul_pd(r21,ewtabscale);
2011 ewitab = _mm256_cvttpd_epi32(ewrt);
2012 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2013 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2014 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2016 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2017 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2021 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2023 /* Calculate temporary vectorial force */
2024 tx = _mm256_mul_pd(fscal,dx21);
2025 ty = _mm256_mul_pd(fscal,dy21);
2026 tz = _mm256_mul_pd(fscal,dz21);
2028 /* Update vectorial force */
2029 fix2 = _mm256_add_pd(fix2,tx);
2030 fiy2 = _mm256_add_pd(fiy2,ty);
2031 fiz2 = _mm256_add_pd(fiz2,tz);
2033 fjx1 = _mm256_add_pd(fjx1,tx);
2034 fjy1 = _mm256_add_pd(fjy1,ty);
2035 fjz1 = _mm256_add_pd(fjz1,tz);
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 r22 = _mm256_mul_pd(rsq22,rinv22);
2042 r22 = _mm256_andnot_pd(dummy_mask,r22);
2044 /* EWALD ELECTROSTATICS */
2046 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2047 ewrt = _mm256_mul_pd(r22,ewtabscale);
2048 ewitab = _mm256_cvttpd_epi32(ewrt);
2049 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2050 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2051 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2053 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2054 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2058 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2060 /* Calculate temporary vectorial force */
2061 tx = _mm256_mul_pd(fscal,dx22);
2062 ty = _mm256_mul_pd(fscal,dy22);
2063 tz = _mm256_mul_pd(fscal,dz22);
2065 /* Update vectorial force */
2066 fix2 = _mm256_add_pd(fix2,tx);
2067 fiy2 = _mm256_add_pd(fiy2,ty);
2068 fiz2 = _mm256_add_pd(fiz2,tz);
2070 fjx2 = _mm256_add_pd(fjx2,tx);
2071 fjy2 = _mm256_add_pd(fjy2,ty);
2072 fjz2 = _mm256_add_pd(fjz2,tz);
2074 /**************************
2075 * CALCULATE INTERACTIONS *
2076 **************************/
2078 r23 = _mm256_mul_pd(rsq23,rinv23);
2079 r23 = _mm256_andnot_pd(dummy_mask,r23);
2081 /* EWALD ELECTROSTATICS */
2083 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2084 ewrt = _mm256_mul_pd(r23,ewtabscale);
2085 ewitab = _mm256_cvttpd_epi32(ewrt);
2086 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2087 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2088 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2090 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2091 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2095 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2097 /* Calculate temporary vectorial force */
2098 tx = _mm256_mul_pd(fscal,dx23);
2099 ty = _mm256_mul_pd(fscal,dy23);
2100 tz = _mm256_mul_pd(fscal,dz23);
2102 /* Update vectorial force */
2103 fix2 = _mm256_add_pd(fix2,tx);
2104 fiy2 = _mm256_add_pd(fiy2,ty);
2105 fiz2 = _mm256_add_pd(fiz2,tz);
2107 fjx3 = _mm256_add_pd(fjx3,tx);
2108 fjy3 = _mm256_add_pd(fjy3,ty);
2109 fjz3 = _mm256_add_pd(fjz3,tz);
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2115 r31 = _mm256_mul_pd(rsq31,rinv31);
2116 r31 = _mm256_andnot_pd(dummy_mask,r31);
2118 /* EWALD ELECTROSTATICS */
2120 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2121 ewrt = _mm256_mul_pd(r31,ewtabscale);
2122 ewitab = _mm256_cvttpd_epi32(ewrt);
2123 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2124 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2125 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2127 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2128 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2132 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2134 /* Calculate temporary vectorial force */
2135 tx = _mm256_mul_pd(fscal,dx31);
2136 ty = _mm256_mul_pd(fscal,dy31);
2137 tz = _mm256_mul_pd(fscal,dz31);
2139 /* Update vectorial force */
2140 fix3 = _mm256_add_pd(fix3,tx);
2141 fiy3 = _mm256_add_pd(fiy3,ty);
2142 fiz3 = _mm256_add_pd(fiz3,tz);
2144 fjx1 = _mm256_add_pd(fjx1,tx);
2145 fjy1 = _mm256_add_pd(fjy1,ty);
2146 fjz1 = _mm256_add_pd(fjz1,tz);
2148 /**************************
2149 * CALCULATE INTERACTIONS *
2150 **************************/
2152 r32 = _mm256_mul_pd(rsq32,rinv32);
2153 r32 = _mm256_andnot_pd(dummy_mask,r32);
2155 /* EWALD ELECTROSTATICS */
2157 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2158 ewrt = _mm256_mul_pd(r32,ewtabscale);
2159 ewitab = _mm256_cvttpd_epi32(ewrt);
2160 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2161 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2162 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2164 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2165 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2169 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2171 /* Calculate temporary vectorial force */
2172 tx = _mm256_mul_pd(fscal,dx32);
2173 ty = _mm256_mul_pd(fscal,dy32);
2174 tz = _mm256_mul_pd(fscal,dz32);
2176 /* Update vectorial force */
2177 fix3 = _mm256_add_pd(fix3,tx);
2178 fiy3 = _mm256_add_pd(fiy3,ty);
2179 fiz3 = _mm256_add_pd(fiz3,tz);
2181 fjx2 = _mm256_add_pd(fjx2,tx);
2182 fjy2 = _mm256_add_pd(fjy2,ty);
2183 fjz2 = _mm256_add_pd(fjz2,tz);
2185 /**************************
2186 * CALCULATE INTERACTIONS *
2187 **************************/
2189 r33 = _mm256_mul_pd(rsq33,rinv33);
2190 r33 = _mm256_andnot_pd(dummy_mask,r33);
2192 /* EWALD ELECTROSTATICS */
2194 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2195 ewrt = _mm256_mul_pd(r33,ewtabscale);
2196 ewitab = _mm256_cvttpd_epi32(ewrt);
2197 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2198 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2199 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2201 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2202 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2206 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2208 /* Calculate temporary vectorial force */
2209 tx = _mm256_mul_pd(fscal,dx33);
2210 ty = _mm256_mul_pd(fscal,dy33);
2211 tz = _mm256_mul_pd(fscal,dz33);
2213 /* Update vectorial force */
2214 fix3 = _mm256_add_pd(fix3,tx);
2215 fiy3 = _mm256_add_pd(fiy3,ty);
2216 fiz3 = _mm256_add_pd(fiz3,tz);
2218 fjx3 = _mm256_add_pd(fjx3,tx);
2219 fjy3 = _mm256_add_pd(fjy3,ty);
2220 fjz3 = _mm256_add_pd(fjz3,tz);
2222 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2223 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2224 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2225 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2227 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2228 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2230 /* Inner loop uses 333 flops */
2233 /* End of innermost loop */
2235 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2236 f+i_coord_offset+DIM,fshift+i_shift_offset);
2238 /* Increment number of inner iterations */
2239 inneriter += j_index_end - j_index_start;
2241 /* Outer loop uses 18 flops */
2244 /* Increment number of outer iterations */
2247 /* Update outer/inner flops */
2249 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*333);