2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
105 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
106 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
108 __m256d dummy_mask,cutoff_mask;
109 __m128 tmpmask0,tmpmask1;
110 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
111 __m256d one = _mm256_set1_pd(1.0);
112 __m256d two = _mm256_set1_pd(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm256_set1_pd(fr->epsfac);
125 charge = mdatoms->chargeA;
126 nvdwtype = fr->ntype;
128 vdwtype = mdatoms->typeA;
130 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
131 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
132 beta2 = _mm256_mul_pd(beta,beta);
133 beta3 = _mm256_mul_pd(beta,beta2);
135 ewtab = fr->ic->tabq_coul_FDV0;
136 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
137 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
142 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
143 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
144 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
146 jq1 = _mm256_set1_pd(charge[inr+1]);
147 jq2 = _mm256_set1_pd(charge[inr+2]);
148 jq3 = _mm256_set1_pd(charge[inr+3]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
151 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
152 qq11 = _mm256_mul_pd(iq1,jq1);
153 qq12 = _mm256_mul_pd(iq1,jq2);
154 qq13 = _mm256_mul_pd(iq1,jq3);
155 qq21 = _mm256_mul_pd(iq2,jq1);
156 qq22 = _mm256_mul_pd(iq2,jq2);
157 qq23 = _mm256_mul_pd(iq2,jq3);
158 qq31 = _mm256_mul_pd(iq3,jq1);
159 qq32 = _mm256_mul_pd(iq3,jq2);
160 qq33 = _mm256_mul_pd(iq3,jq3);
162 /* Avoid stupid compiler warnings */
163 jnrA = jnrB = jnrC = jnrD = 0;
172 for(iidx=0;iidx<4*DIM;iidx++)
177 /* Start outer loop over neighborlists */
178 for(iidx=0; iidx<nri; iidx++)
180 /* Load shift vector for this list */
181 i_shift_offset = DIM*shiftidx[iidx];
183 /* Load limits for loop over neighbors */
184 j_index_start = jindex[iidx];
185 j_index_end = jindex[iidx+1];
187 /* Get outer coordinate index */
189 i_coord_offset = DIM*inr;
191 /* Load i particle coords and add shift vector */
192 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
193 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
195 fix0 = _mm256_setzero_pd();
196 fiy0 = _mm256_setzero_pd();
197 fiz0 = _mm256_setzero_pd();
198 fix1 = _mm256_setzero_pd();
199 fiy1 = _mm256_setzero_pd();
200 fiz1 = _mm256_setzero_pd();
201 fix2 = _mm256_setzero_pd();
202 fiy2 = _mm256_setzero_pd();
203 fiz2 = _mm256_setzero_pd();
204 fix3 = _mm256_setzero_pd();
205 fiy3 = _mm256_setzero_pd();
206 fiz3 = _mm256_setzero_pd();
208 /* Reset potential sums */
209 velecsum = _mm256_setzero_pd();
210 vvdwsum = _mm256_setzero_pd();
212 /* Start inner kernel loop */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
216 /* Get j neighbor index, and coordinate index */
221 j_coord_offsetA = DIM*jnrA;
222 j_coord_offsetB = DIM*jnrB;
223 j_coord_offsetC = DIM*jnrC;
224 j_coord_offsetD = DIM*jnrD;
226 /* load j atom coordinates */
227 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
228 x+j_coord_offsetC,x+j_coord_offsetD,
229 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
230 &jy2,&jz2,&jx3,&jy3,&jz3);
232 /* Calculate displacement vector */
233 dx00 = _mm256_sub_pd(ix0,jx0);
234 dy00 = _mm256_sub_pd(iy0,jy0);
235 dz00 = _mm256_sub_pd(iz0,jz0);
236 dx11 = _mm256_sub_pd(ix1,jx1);
237 dy11 = _mm256_sub_pd(iy1,jy1);
238 dz11 = _mm256_sub_pd(iz1,jz1);
239 dx12 = _mm256_sub_pd(ix1,jx2);
240 dy12 = _mm256_sub_pd(iy1,jy2);
241 dz12 = _mm256_sub_pd(iz1,jz2);
242 dx13 = _mm256_sub_pd(ix1,jx3);
243 dy13 = _mm256_sub_pd(iy1,jy3);
244 dz13 = _mm256_sub_pd(iz1,jz3);
245 dx21 = _mm256_sub_pd(ix2,jx1);
246 dy21 = _mm256_sub_pd(iy2,jy1);
247 dz21 = _mm256_sub_pd(iz2,jz1);
248 dx22 = _mm256_sub_pd(ix2,jx2);
249 dy22 = _mm256_sub_pd(iy2,jy2);
250 dz22 = _mm256_sub_pd(iz2,jz2);
251 dx23 = _mm256_sub_pd(ix2,jx3);
252 dy23 = _mm256_sub_pd(iy2,jy3);
253 dz23 = _mm256_sub_pd(iz2,jz3);
254 dx31 = _mm256_sub_pd(ix3,jx1);
255 dy31 = _mm256_sub_pd(iy3,jy1);
256 dz31 = _mm256_sub_pd(iz3,jz1);
257 dx32 = _mm256_sub_pd(ix3,jx2);
258 dy32 = _mm256_sub_pd(iy3,jy2);
259 dz32 = _mm256_sub_pd(iz3,jz2);
260 dx33 = _mm256_sub_pd(ix3,jx3);
261 dy33 = _mm256_sub_pd(iy3,jy3);
262 dz33 = _mm256_sub_pd(iz3,jz3);
264 /* Calculate squared distance and things based on it */
265 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
266 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
267 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
268 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
269 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
270 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
271 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
272 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
273 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
274 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
276 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
277 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
278 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
279 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
280 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
281 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
282 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
283 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
284 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
286 rinvsq00 = gmx_mm256_inv_pd(rsq00);
287 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
288 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
289 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
290 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
291 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
292 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
293 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
294 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
295 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
297 fjx0 = _mm256_setzero_pd();
298 fjy0 = _mm256_setzero_pd();
299 fjz0 = _mm256_setzero_pd();
300 fjx1 = _mm256_setzero_pd();
301 fjy1 = _mm256_setzero_pd();
302 fjz1 = _mm256_setzero_pd();
303 fjx2 = _mm256_setzero_pd();
304 fjy2 = _mm256_setzero_pd();
305 fjz2 = _mm256_setzero_pd();
306 fjx3 = _mm256_setzero_pd();
307 fjy3 = _mm256_setzero_pd();
308 fjz3 = _mm256_setzero_pd();
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 /* LENNARD-JONES DISPERSION/REPULSION */
316 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
317 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
318 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
319 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
320 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
322 /* Update potential sum for this i atom from the interaction with this j atom. */
323 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
327 /* Calculate temporary vectorial force */
328 tx = _mm256_mul_pd(fscal,dx00);
329 ty = _mm256_mul_pd(fscal,dy00);
330 tz = _mm256_mul_pd(fscal,dz00);
332 /* Update vectorial force */
333 fix0 = _mm256_add_pd(fix0,tx);
334 fiy0 = _mm256_add_pd(fiy0,ty);
335 fiz0 = _mm256_add_pd(fiz0,tz);
337 fjx0 = _mm256_add_pd(fjx0,tx);
338 fjy0 = _mm256_add_pd(fjy0,ty);
339 fjz0 = _mm256_add_pd(fjz0,tz);
341 /**************************
342 * CALCULATE INTERACTIONS *
343 **************************/
345 r11 = _mm256_mul_pd(rsq11,rinv11);
347 /* EWALD ELECTROSTATICS */
349 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
350 ewrt = _mm256_mul_pd(r11,ewtabscale);
351 ewitab = _mm256_cvttpd_epi32(ewrt);
352 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
353 ewitab = _mm_slli_epi32(ewitab,2);
354 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
355 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
356 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
357 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
358 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
359 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
360 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
361 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
362 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
364 /* Update potential sum for this i atom from the interaction with this j atom. */
365 velecsum = _mm256_add_pd(velecsum,velec);
369 /* Calculate temporary vectorial force */
370 tx = _mm256_mul_pd(fscal,dx11);
371 ty = _mm256_mul_pd(fscal,dy11);
372 tz = _mm256_mul_pd(fscal,dz11);
374 /* Update vectorial force */
375 fix1 = _mm256_add_pd(fix1,tx);
376 fiy1 = _mm256_add_pd(fiy1,ty);
377 fiz1 = _mm256_add_pd(fiz1,tz);
379 fjx1 = _mm256_add_pd(fjx1,tx);
380 fjy1 = _mm256_add_pd(fjy1,ty);
381 fjz1 = _mm256_add_pd(fjz1,tz);
383 /**************************
384 * CALCULATE INTERACTIONS *
385 **************************/
387 r12 = _mm256_mul_pd(rsq12,rinv12);
389 /* EWALD ELECTROSTATICS */
391 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
392 ewrt = _mm256_mul_pd(r12,ewtabscale);
393 ewitab = _mm256_cvttpd_epi32(ewrt);
394 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
395 ewitab = _mm_slli_epi32(ewitab,2);
396 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
397 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
398 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
399 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
400 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
401 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
402 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
403 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
404 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
406 /* Update potential sum for this i atom from the interaction with this j atom. */
407 velecsum = _mm256_add_pd(velecsum,velec);
411 /* Calculate temporary vectorial force */
412 tx = _mm256_mul_pd(fscal,dx12);
413 ty = _mm256_mul_pd(fscal,dy12);
414 tz = _mm256_mul_pd(fscal,dz12);
416 /* Update vectorial force */
417 fix1 = _mm256_add_pd(fix1,tx);
418 fiy1 = _mm256_add_pd(fiy1,ty);
419 fiz1 = _mm256_add_pd(fiz1,tz);
421 fjx2 = _mm256_add_pd(fjx2,tx);
422 fjy2 = _mm256_add_pd(fjy2,ty);
423 fjz2 = _mm256_add_pd(fjz2,tz);
425 /**************************
426 * CALCULATE INTERACTIONS *
427 **************************/
429 r13 = _mm256_mul_pd(rsq13,rinv13);
431 /* EWALD ELECTROSTATICS */
433 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
434 ewrt = _mm256_mul_pd(r13,ewtabscale);
435 ewitab = _mm256_cvttpd_epi32(ewrt);
436 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
437 ewitab = _mm_slli_epi32(ewitab,2);
438 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
439 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
440 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
441 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
442 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
443 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
444 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
445 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
446 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
448 /* Update potential sum for this i atom from the interaction with this j atom. */
449 velecsum = _mm256_add_pd(velecsum,velec);
453 /* Calculate temporary vectorial force */
454 tx = _mm256_mul_pd(fscal,dx13);
455 ty = _mm256_mul_pd(fscal,dy13);
456 tz = _mm256_mul_pd(fscal,dz13);
458 /* Update vectorial force */
459 fix1 = _mm256_add_pd(fix1,tx);
460 fiy1 = _mm256_add_pd(fiy1,ty);
461 fiz1 = _mm256_add_pd(fiz1,tz);
463 fjx3 = _mm256_add_pd(fjx3,tx);
464 fjy3 = _mm256_add_pd(fjy3,ty);
465 fjz3 = _mm256_add_pd(fjz3,tz);
467 /**************************
468 * CALCULATE INTERACTIONS *
469 **************************/
471 r21 = _mm256_mul_pd(rsq21,rinv21);
473 /* EWALD ELECTROSTATICS */
475 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
476 ewrt = _mm256_mul_pd(r21,ewtabscale);
477 ewitab = _mm256_cvttpd_epi32(ewrt);
478 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
479 ewitab = _mm_slli_epi32(ewitab,2);
480 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
481 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
482 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
483 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
484 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
485 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
486 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
487 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
488 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
490 /* Update potential sum for this i atom from the interaction with this j atom. */
491 velecsum = _mm256_add_pd(velecsum,velec);
495 /* Calculate temporary vectorial force */
496 tx = _mm256_mul_pd(fscal,dx21);
497 ty = _mm256_mul_pd(fscal,dy21);
498 tz = _mm256_mul_pd(fscal,dz21);
500 /* Update vectorial force */
501 fix2 = _mm256_add_pd(fix2,tx);
502 fiy2 = _mm256_add_pd(fiy2,ty);
503 fiz2 = _mm256_add_pd(fiz2,tz);
505 fjx1 = _mm256_add_pd(fjx1,tx);
506 fjy1 = _mm256_add_pd(fjy1,ty);
507 fjz1 = _mm256_add_pd(fjz1,tz);
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 r22 = _mm256_mul_pd(rsq22,rinv22);
515 /* EWALD ELECTROSTATICS */
517 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
518 ewrt = _mm256_mul_pd(r22,ewtabscale);
519 ewitab = _mm256_cvttpd_epi32(ewrt);
520 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
521 ewitab = _mm_slli_epi32(ewitab,2);
522 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
523 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
524 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
525 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
526 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
527 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
528 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
529 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
530 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
532 /* Update potential sum for this i atom from the interaction with this j atom. */
533 velecsum = _mm256_add_pd(velecsum,velec);
537 /* Calculate temporary vectorial force */
538 tx = _mm256_mul_pd(fscal,dx22);
539 ty = _mm256_mul_pd(fscal,dy22);
540 tz = _mm256_mul_pd(fscal,dz22);
542 /* Update vectorial force */
543 fix2 = _mm256_add_pd(fix2,tx);
544 fiy2 = _mm256_add_pd(fiy2,ty);
545 fiz2 = _mm256_add_pd(fiz2,tz);
547 fjx2 = _mm256_add_pd(fjx2,tx);
548 fjy2 = _mm256_add_pd(fjy2,ty);
549 fjz2 = _mm256_add_pd(fjz2,tz);
551 /**************************
552 * CALCULATE INTERACTIONS *
553 **************************/
555 r23 = _mm256_mul_pd(rsq23,rinv23);
557 /* EWALD ELECTROSTATICS */
559 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
560 ewrt = _mm256_mul_pd(r23,ewtabscale);
561 ewitab = _mm256_cvttpd_epi32(ewrt);
562 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
563 ewitab = _mm_slli_epi32(ewitab,2);
564 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
565 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
566 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
567 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
568 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
569 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
570 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
571 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
572 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
574 /* Update potential sum for this i atom from the interaction with this j atom. */
575 velecsum = _mm256_add_pd(velecsum,velec);
579 /* Calculate temporary vectorial force */
580 tx = _mm256_mul_pd(fscal,dx23);
581 ty = _mm256_mul_pd(fscal,dy23);
582 tz = _mm256_mul_pd(fscal,dz23);
584 /* Update vectorial force */
585 fix2 = _mm256_add_pd(fix2,tx);
586 fiy2 = _mm256_add_pd(fiy2,ty);
587 fiz2 = _mm256_add_pd(fiz2,tz);
589 fjx3 = _mm256_add_pd(fjx3,tx);
590 fjy3 = _mm256_add_pd(fjy3,ty);
591 fjz3 = _mm256_add_pd(fjz3,tz);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 r31 = _mm256_mul_pd(rsq31,rinv31);
599 /* EWALD ELECTROSTATICS */
601 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
602 ewrt = _mm256_mul_pd(r31,ewtabscale);
603 ewitab = _mm256_cvttpd_epi32(ewrt);
604 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
605 ewitab = _mm_slli_epi32(ewitab,2);
606 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
607 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
608 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
609 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
610 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
611 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
612 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
613 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
614 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
616 /* Update potential sum for this i atom from the interaction with this j atom. */
617 velecsum = _mm256_add_pd(velecsum,velec);
621 /* Calculate temporary vectorial force */
622 tx = _mm256_mul_pd(fscal,dx31);
623 ty = _mm256_mul_pd(fscal,dy31);
624 tz = _mm256_mul_pd(fscal,dz31);
626 /* Update vectorial force */
627 fix3 = _mm256_add_pd(fix3,tx);
628 fiy3 = _mm256_add_pd(fiy3,ty);
629 fiz3 = _mm256_add_pd(fiz3,tz);
631 fjx1 = _mm256_add_pd(fjx1,tx);
632 fjy1 = _mm256_add_pd(fjy1,ty);
633 fjz1 = _mm256_add_pd(fjz1,tz);
635 /**************************
636 * CALCULATE INTERACTIONS *
637 **************************/
639 r32 = _mm256_mul_pd(rsq32,rinv32);
641 /* EWALD ELECTROSTATICS */
643 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
644 ewrt = _mm256_mul_pd(r32,ewtabscale);
645 ewitab = _mm256_cvttpd_epi32(ewrt);
646 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
647 ewitab = _mm_slli_epi32(ewitab,2);
648 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
649 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
650 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
651 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
652 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
653 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
654 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
655 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
656 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velecsum = _mm256_add_pd(velecsum,velec);
663 /* Calculate temporary vectorial force */
664 tx = _mm256_mul_pd(fscal,dx32);
665 ty = _mm256_mul_pd(fscal,dy32);
666 tz = _mm256_mul_pd(fscal,dz32);
668 /* Update vectorial force */
669 fix3 = _mm256_add_pd(fix3,tx);
670 fiy3 = _mm256_add_pd(fiy3,ty);
671 fiz3 = _mm256_add_pd(fiz3,tz);
673 fjx2 = _mm256_add_pd(fjx2,tx);
674 fjy2 = _mm256_add_pd(fjy2,ty);
675 fjz2 = _mm256_add_pd(fjz2,tz);
677 /**************************
678 * CALCULATE INTERACTIONS *
679 **************************/
681 r33 = _mm256_mul_pd(rsq33,rinv33);
683 /* EWALD ELECTROSTATICS */
685 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
686 ewrt = _mm256_mul_pd(r33,ewtabscale);
687 ewitab = _mm256_cvttpd_epi32(ewrt);
688 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
689 ewitab = _mm_slli_epi32(ewitab,2);
690 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
691 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
692 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
693 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
694 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
695 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
696 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
697 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
698 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
700 /* Update potential sum for this i atom from the interaction with this j atom. */
701 velecsum = _mm256_add_pd(velecsum,velec);
705 /* Calculate temporary vectorial force */
706 tx = _mm256_mul_pd(fscal,dx33);
707 ty = _mm256_mul_pd(fscal,dy33);
708 tz = _mm256_mul_pd(fscal,dz33);
710 /* Update vectorial force */
711 fix3 = _mm256_add_pd(fix3,tx);
712 fiy3 = _mm256_add_pd(fiy3,ty);
713 fiz3 = _mm256_add_pd(fiz3,tz);
715 fjx3 = _mm256_add_pd(fjx3,tx);
716 fjy3 = _mm256_add_pd(fjy3,ty);
717 fjz3 = _mm256_add_pd(fjz3,tz);
719 fjptrA = f+j_coord_offsetA;
720 fjptrB = f+j_coord_offsetB;
721 fjptrC = f+j_coord_offsetC;
722 fjptrD = f+j_coord_offsetD;
724 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
725 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
726 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
728 /* Inner loop uses 404 flops */
734 /* Get j neighbor index, and coordinate index */
735 jnrlistA = jjnr[jidx];
736 jnrlistB = jjnr[jidx+1];
737 jnrlistC = jjnr[jidx+2];
738 jnrlistD = jjnr[jidx+3];
739 /* Sign of each element will be negative for non-real atoms.
740 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
741 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
743 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
745 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
746 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
747 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
749 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
750 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
751 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
752 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
753 j_coord_offsetA = DIM*jnrA;
754 j_coord_offsetB = DIM*jnrB;
755 j_coord_offsetC = DIM*jnrC;
756 j_coord_offsetD = DIM*jnrD;
758 /* load j atom coordinates */
759 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
760 x+j_coord_offsetC,x+j_coord_offsetD,
761 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
762 &jy2,&jz2,&jx3,&jy3,&jz3);
764 /* Calculate displacement vector */
765 dx00 = _mm256_sub_pd(ix0,jx0);
766 dy00 = _mm256_sub_pd(iy0,jy0);
767 dz00 = _mm256_sub_pd(iz0,jz0);
768 dx11 = _mm256_sub_pd(ix1,jx1);
769 dy11 = _mm256_sub_pd(iy1,jy1);
770 dz11 = _mm256_sub_pd(iz1,jz1);
771 dx12 = _mm256_sub_pd(ix1,jx2);
772 dy12 = _mm256_sub_pd(iy1,jy2);
773 dz12 = _mm256_sub_pd(iz1,jz2);
774 dx13 = _mm256_sub_pd(ix1,jx3);
775 dy13 = _mm256_sub_pd(iy1,jy3);
776 dz13 = _mm256_sub_pd(iz1,jz3);
777 dx21 = _mm256_sub_pd(ix2,jx1);
778 dy21 = _mm256_sub_pd(iy2,jy1);
779 dz21 = _mm256_sub_pd(iz2,jz1);
780 dx22 = _mm256_sub_pd(ix2,jx2);
781 dy22 = _mm256_sub_pd(iy2,jy2);
782 dz22 = _mm256_sub_pd(iz2,jz2);
783 dx23 = _mm256_sub_pd(ix2,jx3);
784 dy23 = _mm256_sub_pd(iy2,jy3);
785 dz23 = _mm256_sub_pd(iz2,jz3);
786 dx31 = _mm256_sub_pd(ix3,jx1);
787 dy31 = _mm256_sub_pd(iy3,jy1);
788 dz31 = _mm256_sub_pd(iz3,jz1);
789 dx32 = _mm256_sub_pd(ix3,jx2);
790 dy32 = _mm256_sub_pd(iy3,jy2);
791 dz32 = _mm256_sub_pd(iz3,jz2);
792 dx33 = _mm256_sub_pd(ix3,jx3);
793 dy33 = _mm256_sub_pd(iy3,jy3);
794 dz33 = _mm256_sub_pd(iz3,jz3);
796 /* Calculate squared distance and things based on it */
797 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
798 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
799 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
800 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
801 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
802 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
803 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
804 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
805 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
806 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
808 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
809 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
810 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
811 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
812 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
813 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
814 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
815 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
816 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
818 rinvsq00 = gmx_mm256_inv_pd(rsq00);
819 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
820 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
821 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
822 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
823 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
824 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
825 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
826 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
827 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
829 fjx0 = _mm256_setzero_pd();
830 fjy0 = _mm256_setzero_pd();
831 fjz0 = _mm256_setzero_pd();
832 fjx1 = _mm256_setzero_pd();
833 fjy1 = _mm256_setzero_pd();
834 fjz1 = _mm256_setzero_pd();
835 fjx2 = _mm256_setzero_pd();
836 fjy2 = _mm256_setzero_pd();
837 fjz2 = _mm256_setzero_pd();
838 fjx3 = _mm256_setzero_pd();
839 fjy3 = _mm256_setzero_pd();
840 fjz3 = _mm256_setzero_pd();
842 /**************************
843 * CALCULATE INTERACTIONS *
844 **************************/
846 /* LENNARD-JONES DISPERSION/REPULSION */
848 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
849 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
850 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
851 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
852 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
854 /* Update potential sum for this i atom from the interaction with this j atom. */
855 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
856 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
860 fscal = _mm256_andnot_pd(dummy_mask,fscal);
862 /* Calculate temporary vectorial force */
863 tx = _mm256_mul_pd(fscal,dx00);
864 ty = _mm256_mul_pd(fscal,dy00);
865 tz = _mm256_mul_pd(fscal,dz00);
867 /* Update vectorial force */
868 fix0 = _mm256_add_pd(fix0,tx);
869 fiy0 = _mm256_add_pd(fiy0,ty);
870 fiz0 = _mm256_add_pd(fiz0,tz);
872 fjx0 = _mm256_add_pd(fjx0,tx);
873 fjy0 = _mm256_add_pd(fjy0,ty);
874 fjz0 = _mm256_add_pd(fjz0,tz);
876 /**************************
877 * CALCULATE INTERACTIONS *
878 **************************/
880 r11 = _mm256_mul_pd(rsq11,rinv11);
881 r11 = _mm256_andnot_pd(dummy_mask,r11);
883 /* EWALD ELECTROSTATICS */
885 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
886 ewrt = _mm256_mul_pd(r11,ewtabscale);
887 ewitab = _mm256_cvttpd_epi32(ewrt);
888 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
889 ewitab = _mm_slli_epi32(ewitab,2);
890 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
891 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
892 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
893 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
894 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
895 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
896 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
897 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
898 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec = _mm256_andnot_pd(dummy_mask,velec);
902 velecsum = _mm256_add_pd(velecsum,velec);
906 fscal = _mm256_andnot_pd(dummy_mask,fscal);
908 /* Calculate temporary vectorial force */
909 tx = _mm256_mul_pd(fscal,dx11);
910 ty = _mm256_mul_pd(fscal,dy11);
911 tz = _mm256_mul_pd(fscal,dz11);
913 /* Update vectorial force */
914 fix1 = _mm256_add_pd(fix1,tx);
915 fiy1 = _mm256_add_pd(fiy1,ty);
916 fiz1 = _mm256_add_pd(fiz1,tz);
918 fjx1 = _mm256_add_pd(fjx1,tx);
919 fjy1 = _mm256_add_pd(fjy1,ty);
920 fjz1 = _mm256_add_pd(fjz1,tz);
922 /**************************
923 * CALCULATE INTERACTIONS *
924 **************************/
926 r12 = _mm256_mul_pd(rsq12,rinv12);
927 r12 = _mm256_andnot_pd(dummy_mask,r12);
929 /* EWALD ELECTROSTATICS */
931 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
932 ewrt = _mm256_mul_pd(r12,ewtabscale);
933 ewitab = _mm256_cvttpd_epi32(ewrt);
934 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
935 ewitab = _mm_slli_epi32(ewitab,2);
936 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
937 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
938 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
939 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
940 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
941 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
942 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
943 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
944 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
946 /* Update potential sum for this i atom from the interaction with this j atom. */
947 velec = _mm256_andnot_pd(dummy_mask,velec);
948 velecsum = _mm256_add_pd(velecsum,velec);
952 fscal = _mm256_andnot_pd(dummy_mask,fscal);
954 /* Calculate temporary vectorial force */
955 tx = _mm256_mul_pd(fscal,dx12);
956 ty = _mm256_mul_pd(fscal,dy12);
957 tz = _mm256_mul_pd(fscal,dz12);
959 /* Update vectorial force */
960 fix1 = _mm256_add_pd(fix1,tx);
961 fiy1 = _mm256_add_pd(fiy1,ty);
962 fiz1 = _mm256_add_pd(fiz1,tz);
964 fjx2 = _mm256_add_pd(fjx2,tx);
965 fjy2 = _mm256_add_pd(fjy2,ty);
966 fjz2 = _mm256_add_pd(fjz2,tz);
968 /**************************
969 * CALCULATE INTERACTIONS *
970 **************************/
972 r13 = _mm256_mul_pd(rsq13,rinv13);
973 r13 = _mm256_andnot_pd(dummy_mask,r13);
975 /* EWALD ELECTROSTATICS */
977 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
978 ewrt = _mm256_mul_pd(r13,ewtabscale);
979 ewitab = _mm256_cvttpd_epi32(ewrt);
980 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
981 ewitab = _mm_slli_epi32(ewitab,2);
982 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
983 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
984 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
985 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
986 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
987 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
988 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
989 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
990 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec = _mm256_andnot_pd(dummy_mask,velec);
994 velecsum = _mm256_add_pd(velecsum,velec);
998 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1000 /* Calculate temporary vectorial force */
1001 tx = _mm256_mul_pd(fscal,dx13);
1002 ty = _mm256_mul_pd(fscal,dy13);
1003 tz = _mm256_mul_pd(fscal,dz13);
1005 /* Update vectorial force */
1006 fix1 = _mm256_add_pd(fix1,tx);
1007 fiy1 = _mm256_add_pd(fiy1,ty);
1008 fiz1 = _mm256_add_pd(fiz1,tz);
1010 fjx3 = _mm256_add_pd(fjx3,tx);
1011 fjy3 = _mm256_add_pd(fjy3,ty);
1012 fjz3 = _mm256_add_pd(fjz3,tz);
1014 /**************************
1015 * CALCULATE INTERACTIONS *
1016 **************************/
1018 r21 = _mm256_mul_pd(rsq21,rinv21);
1019 r21 = _mm256_andnot_pd(dummy_mask,r21);
1021 /* EWALD ELECTROSTATICS */
1023 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1024 ewrt = _mm256_mul_pd(r21,ewtabscale);
1025 ewitab = _mm256_cvttpd_epi32(ewrt);
1026 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1027 ewitab = _mm_slli_epi32(ewitab,2);
1028 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1029 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1030 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1031 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1032 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1033 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1034 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1035 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1036 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1038 /* Update potential sum for this i atom from the interaction with this j atom. */
1039 velec = _mm256_andnot_pd(dummy_mask,velec);
1040 velecsum = _mm256_add_pd(velecsum,velec);
1044 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1046 /* Calculate temporary vectorial force */
1047 tx = _mm256_mul_pd(fscal,dx21);
1048 ty = _mm256_mul_pd(fscal,dy21);
1049 tz = _mm256_mul_pd(fscal,dz21);
1051 /* Update vectorial force */
1052 fix2 = _mm256_add_pd(fix2,tx);
1053 fiy2 = _mm256_add_pd(fiy2,ty);
1054 fiz2 = _mm256_add_pd(fiz2,tz);
1056 fjx1 = _mm256_add_pd(fjx1,tx);
1057 fjy1 = _mm256_add_pd(fjy1,ty);
1058 fjz1 = _mm256_add_pd(fjz1,tz);
1060 /**************************
1061 * CALCULATE INTERACTIONS *
1062 **************************/
1064 r22 = _mm256_mul_pd(rsq22,rinv22);
1065 r22 = _mm256_andnot_pd(dummy_mask,r22);
1067 /* EWALD ELECTROSTATICS */
1069 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1070 ewrt = _mm256_mul_pd(r22,ewtabscale);
1071 ewitab = _mm256_cvttpd_epi32(ewrt);
1072 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1073 ewitab = _mm_slli_epi32(ewitab,2);
1074 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1075 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1076 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1077 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1078 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1079 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1080 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1081 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1082 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1084 /* Update potential sum for this i atom from the interaction with this j atom. */
1085 velec = _mm256_andnot_pd(dummy_mask,velec);
1086 velecsum = _mm256_add_pd(velecsum,velec);
1090 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1092 /* Calculate temporary vectorial force */
1093 tx = _mm256_mul_pd(fscal,dx22);
1094 ty = _mm256_mul_pd(fscal,dy22);
1095 tz = _mm256_mul_pd(fscal,dz22);
1097 /* Update vectorial force */
1098 fix2 = _mm256_add_pd(fix2,tx);
1099 fiy2 = _mm256_add_pd(fiy2,ty);
1100 fiz2 = _mm256_add_pd(fiz2,tz);
1102 fjx2 = _mm256_add_pd(fjx2,tx);
1103 fjy2 = _mm256_add_pd(fjy2,ty);
1104 fjz2 = _mm256_add_pd(fjz2,tz);
1106 /**************************
1107 * CALCULATE INTERACTIONS *
1108 **************************/
1110 r23 = _mm256_mul_pd(rsq23,rinv23);
1111 r23 = _mm256_andnot_pd(dummy_mask,r23);
1113 /* EWALD ELECTROSTATICS */
1115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1116 ewrt = _mm256_mul_pd(r23,ewtabscale);
1117 ewitab = _mm256_cvttpd_epi32(ewrt);
1118 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1119 ewitab = _mm_slli_epi32(ewitab,2);
1120 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1121 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1122 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1123 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1124 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1125 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1126 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1127 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1128 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1130 /* Update potential sum for this i atom from the interaction with this j atom. */
1131 velec = _mm256_andnot_pd(dummy_mask,velec);
1132 velecsum = _mm256_add_pd(velecsum,velec);
1136 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1138 /* Calculate temporary vectorial force */
1139 tx = _mm256_mul_pd(fscal,dx23);
1140 ty = _mm256_mul_pd(fscal,dy23);
1141 tz = _mm256_mul_pd(fscal,dz23);
1143 /* Update vectorial force */
1144 fix2 = _mm256_add_pd(fix2,tx);
1145 fiy2 = _mm256_add_pd(fiy2,ty);
1146 fiz2 = _mm256_add_pd(fiz2,tz);
1148 fjx3 = _mm256_add_pd(fjx3,tx);
1149 fjy3 = _mm256_add_pd(fjy3,ty);
1150 fjz3 = _mm256_add_pd(fjz3,tz);
1152 /**************************
1153 * CALCULATE INTERACTIONS *
1154 **************************/
1156 r31 = _mm256_mul_pd(rsq31,rinv31);
1157 r31 = _mm256_andnot_pd(dummy_mask,r31);
1159 /* EWALD ELECTROSTATICS */
1161 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1162 ewrt = _mm256_mul_pd(r31,ewtabscale);
1163 ewitab = _mm256_cvttpd_epi32(ewrt);
1164 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1165 ewitab = _mm_slli_epi32(ewitab,2);
1166 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1167 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1168 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1169 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1170 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1171 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1172 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1173 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1174 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1176 /* Update potential sum for this i atom from the interaction with this j atom. */
1177 velec = _mm256_andnot_pd(dummy_mask,velec);
1178 velecsum = _mm256_add_pd(velecsum,velec);
1182 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1184 /* Calculate temporary vectorial force */
1185 tx = _mm256_mul_pd(fscal,dx31);
1186 ty = _mm256_mul_pd(fscal,dy31);
1187 tz = _mm256_mul_pd(fscal,dz31);
1189 /* Update vectorial force */
1190 fix3 = _mm256_add_pd(fix3,tx);
1191 fiy3 = _mm256_add_pd(fiy3,ty);
1192 fiz3 = _mm256_add_pd(fiz3,tz);
1194 fjx1 = _mm256_add_pd(fjx1,tx);
1195 fjy1 = _mm256_add_pd(fjy1,ty);
1196 fjz1 = _mm256_add_pd(fjz1,tz);
1198 /**************************
1199 * CALCULATE INTERACTIONS *
1200 **************************/
1202 r32 = _mm256_mul_pd(rsq32,rinv32);
1203 r32 = _mm256_andnot_pd(dummy_mask,r32);
1205 /* EWALD ELECTROSTATICS */
1207 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1208 ewrt = _mm256_mul_pd(r32,ewtabscale);
1209 ewitab = _mm256_cvttpd_epi32(ewrt);
1210 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1211 ewitab = _mm_slli_epi32(ewitab,2);
1212 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1213 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1214 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1215 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1216 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1217 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1218 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1219 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1220 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1222 /* Update potential sum for this i atom from the interaction with this j atom. */
1223 velec = _mm256_andnot_pd(dummy_mask,velec);
1224 velecsum = _mm256_add_pd(velecsum,velec);
1228 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1230 /* Calculate temporary vectorial force */
1231 tx = _mm256_mul_pd(fscal,dx32);
1232 ty = _mm256_mul_pd(fscal,dy32);
1233 tz = _mm256_mul_pd(fscal,dz32);
1235 /* Update vectorial force */
1236 fix3 = _mm256_add_pd(fix3,tx);
1237 fiy3 = _mm256_add_pd(fiy3,ty);
1238 fiz3 = _mm256_add_pd(fiz3,tz);
1240 fjx2 = _mm256_add_pd(fjx2,tx);
1241 fjy2 = _mm256_add_pd(fjy2,ty);
1242 fjz2 = _mm256_add_pd(fjz2,tz);
1244 /**************************
1245 * CALCULATE INTERACTIONS *
1246 **************************/
1248 r33 = _mm256_mul_pd(rsq33,rinv33);
1249 r33 = _mm256_andnot_pd(dummy_mask,r33);
1251 /* EWALD ELECTROSTATICS */
1253 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1254 ewrt = _mm256_mul_pd(r33,ewtabscale);
1255 ewitab = _mm256_cvttpd_epi32(ewrt);
1256 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1257 ewitab = _mm_slli_epi32(ewitab,2);
1258 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1259 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1260 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1261 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1262 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1263 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1264 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1265 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1266 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1268 /* Update potential sum for this i atom from the interaction with this j atom. */
1269 velec = _mm256_andnot_pd(dummy_mask,velec);
1270 velecsum = _mm256_add_pd(velecsum,velec);
1274 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1276 /* Calculate temporary vectorial force */
1277 tx = _mm256_mul_pd(fscal,dx33);
1278 ty = _mm256_mul_pd(fscal,dy33);
1279 tz = _mm256_mul_pd(fscal,dz33);
1281 /* Update vectorial force */
1282 fix3 = _mm256_add_pd(fix3,tx);
1283 fiy3 = _mm256_add_pd(fiy3,ty);
1284 fiz3 = _mm256_add_pd(fiz3,tz);
1286 fjx3 = _mm256_add_pd(fjx3,tx);
1287 fjy3 = _mm256_add_pd(fjy3,ty);
1288 fjz3 = _mm256_add_pd(fjz3,tz);
1290 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1291 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1292 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1293 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1295 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1296 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1297 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1299 /* Inner loop uses 413 flops */
1302 /* End of innermost loop */
1304 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1305 f+i_coord_offset,fshift+i_shift_offset);
1308 /* Update potential energies */
1309 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1310 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1312 /* Increment number of inner iterations */
1313 inneriter += j_index_end - j_index_start;
1315 /* Outer loop uses 26 flops */
1318 /* Increment number of outer iterations */
1321 /* Update outer/inner flops */
1323 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
1326 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double
1327 * Electrostatics interaction: Ewald
1328 * VdW interaction: LennardJones
1329 * Geometry: Water4-Water4
1330 * Calculate force/pot: Force
1333 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double
1334 (t_nblist * gmx_restrict nlist,
1335 rvec * gmx_restrict xx,
1336 rvec * gmx_restrict ff,
1337 t_forcerec * gmx_restrict fr,
1338 t_mdatoms * gmx_restrict mdatoms,
1339 nb_kernel_data_t * gmx_restrict kernel_data,
1340 t_nrnb * gmx_restrict nrnb)
1342 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1343 * just 0 for non-waters.
1344 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1345 * jnr indices corresponding to data put in the four positions in the SIMD register.
1347 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1348 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1349 int jnrA,jnrB,jnrC,jnrD;
1350 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1351 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1352 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1353 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1354 real rcutoff_scalar;
1355 real *shiftvec,*fshift,*x,*f;
1356 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1357 real scratch[4*DIM];
1358 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1359 real * vdwioffsetptr0;
1360 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1361 real * vdwioffsetptr1;
1362 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1363 real * vdwioffsetptr2;
1364 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1365 real * vdwioffsetptr3;
1366 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1367 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1368 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1369 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1370 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1371 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1372 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1373 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1374 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1375 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1376 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1377 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1378 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1379 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1380 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1381 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1382 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1383 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1384 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1385 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1388 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1391 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1392 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1394 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1395 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1397 __m256d dummy_mask,cutoff_mask;
1398 __m128 tmpmask0,tmpmask1;
1399 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1400 __m256d one = _mm256_set1_pd(1.0);
1401 __m256d two = _mm256_set1_pd(2.0);
1407 jindex = nlist->jindex;
1409 shiftidx = nlist->shift;
1411 shiftvec = fr->shift_vec[0];
1412 fshift = fr->fshift[0];
1413 facel = _mm256_set1_pd(fr->epsfac);
1414 charge = mdatoms->chargeA;
1415 nvdwtype = fr->ntype;
1416 vdwparam = fr->nbfp;
1417 vdwtype = mdatoms->typeA;
1419 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1420 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1421 beta2 = _mm256_mul_pd(beta,beta);
1422 beta3 = _mm256_mul_pd(beta,beta2);
1424 ewtab = fr->ic->tabq_coul_F;
1425 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1426 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1428 /* Setup water-specific parameters */
1429 inr = nlist->iinr[0];
1430 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1431 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1432 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1433 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1435 jq1 = _mm256_set1_pd(charge[inr+1]);
1436 jq2 = _mm256_set1_pd(charge[inr+2]);
1437 jq3 = _mm256_set1_pd(charge[inr+3]);
1438 vdwjidx0A = 2*vdwtype[inr+0];
1439 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1440 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1441 qq11 = _mm256_mul_pd(iq1,jq1);
1442 qq12 = _mm256_mul_pd(iq1,jq2);
1443 qq13 = _mm256_mul_pd(iq1,jq3);
1444 qq21 = _mm256_mul_pd(iq2,jq1);
1445 qq22 = _mm256_mul_pd(iq2,jq2);
1446 qq23 = _mm256_mul_pd(iq2,jq3);
1447 qq31 = _mm256_mul_pd(iq3,jq1);
1448 qq32 = _mm256_mul_pd(iq3,jq2);
1449 qq33 = _mm256_mul_pd(iq3,jq3);
1451 /* Avoid stupid compiler warnings */
1452 jnrA = jnrB = jnrC = jnrD = 0;
1453 j_coord_offsetA = 0;
1454 j_coord_offsetB = 0;
1455 j_coord_offsetC = 0;
1456 j_coord_offsetD = 0;
1461 for(iidx=0;iidx<4*DIM;iidx++)
1463 scratch[iidx] = 0.0;
1466 /* Start outer loop over neighborlists */
1467 for(iidx=0; iidx<nri; iidx++)
1469 /* Load shift vector for this list */
1470 i_shift_offset = DIM*shiftidx[iidx];
1472 /* Load limits for loop over neighbors */
1473 j_index_start = jindex[iidx];
1474 j_index_end = jindex[iidx+1];
1476 /* Get outer coordinate index */
1478 i_coord_offset = DIM*inr;
1480 /* Load i particle coords and add shift vector */
1481 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1482 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1484 fix0 = _mm256_setzero_pd();
1485 fiy0 = _mm256_setzero_pd();
1486 fiz0 = _mm256_setzero_pd();
1487 fix1 = _mm256_setzero_pd();
1488 fiy1 = _mm256_setzero_pd();
1489 fiz1 = _mm256_setzero_pd();
1490 fix2 = _mm256_setzero_pd();
1491 fiy2 = _mm256_setzero_pd();
1492 fiz2 = _mm256_setzero_pd();
1493 fix3 = _mm256_setzero_pd();
1494 fiy3 = _mm256_setzero_pd();
1495 fiz3 = _mm256_setzero_pd();
1497 /* Start inner kernel loop */
1498 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1501 /* Get j neighbor index, and coordinate index */
1503 jnrB = jjnr[jidx+1];
1504 jnrC = jjnr[jidx+2];
1505 jnrD = jjnr[jidx+3];
1506 j_coord_offsetA = DIM*jnrA;
1507 j_coord_offsetB = DIM*jnrB;
1508 j_coord_offsetC = DIM*jnrC;
1509 j_coord_offsetD = DIM*jnrD;
1511 /* load j atom coordinates */
1512 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1513 x+j_coord_offsetC,x+j_coord_offsetD,
1514 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1515 &jy2,&jz2,&jx3,&jy3,&jz3);
1517 /* Calculate displacement vector */
1518 dx00 = _mm256_sub_pd(ix0,jx0);
1519 dy00 = _mm256_sub_pd(iy0,jy0);
1520 dz00 = _mm256_sub_pd(iz0,jz0);
1521 dx11 = _mm256_sub_pd(ix1,jx1);
1522 dy11 = _mm256_sub_pd(iy1,jy1);
1523 dz11 = _mm256_sub_pd(iz1,jz1);
1524 dx12 = _mm256_sub_pd(ix1,jx2);
1525 dy12 = _mm256_sub_pd(iy1,jy2);
1526 dz12 = _mm256_sub_pd(iz1,jz2);
1527 dx13 = _mm256_sub_pd(ix1,jx3);
1528 dy13 = _mm256_sub_pd(iy1,jy3);
1529 dz13 = _mm256_sub_pd(iz1,jz3);
1530 dx21 = _mm256_sub_pd(ix2,jx1);
1531 dy21 = _mm256_sub_pd(iy2,jy1);
1532 dz21 = _mm256_sub_pd(iz2,jz1);
1533 dx22 = _mm256_sub_pd(ix2,jx2);
1534 dy22 = _mm256_sub_pd(iy2,jy2);
1535 dz22 = _mm256_sub_pd(iz2,jz2);
1536 dx23 = _mm256_sub_pd(ix2,jx3);
1537 dy23 = _mm256_sub_pd(iy2,jy3);
1538 dz23 = _mm256_sub_pd(iz2,jz3);
1539 dx31 = _mm256_sub_pd(ix3,jx1);
1540 dy31 = _mm256_sub_pd(iy3,jy1);
1541 dz31 = _mm256_sub_pd(iz3,jz1);
1542 dx32 = _mm256_sub_pd(ix3,jx2);
1543 dy32 = _mm256_sub_pd(iy3,jy2);
1544 dz32 = _mm256_sub_pd(iz3,jz2);
1545 dx33 = _mm256_sub_pd(ix3,jx3);
1546 dy33 = _mm256_sub_pd(iy3,jy3);
1547 dz33 = _mm256_sub_pd(iz3,jz3);
1549 /* Calculate squared distance and things based on it */
1550 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1551 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1552 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1553 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1554 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1555 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1556 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1557 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1558 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1559 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1561 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1562 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1563 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1564 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1565 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1566 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1567 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1568 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1569 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1571 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1572 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1573 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1574 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1575 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1576 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1577 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1578 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1579 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1580 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1582 fjx0 = _mm256_setzero_pd();
1583 fjy0 = _mm256_setzero_pd();
1584 fjz0 = _mm256_setzero_pd();
1585 fjx1 = _mm256_setzero_pd();
1586 fjy1 = _mm256_setzero_pd();
1587 fjz1 = _mm256_setzero_pd();
1588 fjx2 = _mm256_setzero_pd();
1589 fjy2 = _mm256_setzero_pd();
1590 fjz2 = _mm256_setzero_pd();
1591 fjx3 = _mm256_setzero_pd();
1592 fjy3 = _mm256_setzero_pd();
1593 fjz3 = _mm256_setzero_pd();
1595 /**************************
1596 * CALCULATE INTERACTIONS *
1597 **************************/
1599 /* LENNARD-JONES DISPERSION/REPULSION */
1601 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1602 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1606 /* Calculate temporary vectorial force */
1607 tx = _mm256_mul_pd(fscal,dx00);
1608 ty = _mm256_mul_pd(fscal,dy00);
1609 tz = _mm256_mul_pd(fscal,dz00);
1611 /* Update vectorial force */
1612 fix0 = _mm256_add_pd(fix0,tx);
1613 fiy0 = _mm256_add_pd(fiy0,ty);
1614 fiz0 = _mm256_add_pd(fiz0,tz);
1616 fjx0 = _mm256_add_pd(fjx0,tx);
1617 fjy0 = _mm256_add_pd(fjy0,ty);
1618 fjz0 = _mm256_add_pd(fjz0,tz);
1620 /**************************
1621 * CALCULATE INTERACTIONS *
1622 **************************/
1624 r11 = _mm256_mul_pd(rsq11,rinv11);
1626 /* EWALD ELECTROSTATICS */
1628 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1629 ewrt = _mm256_mul_pd(r11,ewtabscale);
1630 ewitab = _mm256_cvttpd_epi32(ewrt);
1631 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1632 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1633 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1635 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1636 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1640 /* Calculate temporary vectorial force */
1641 tx = _mm256_mul_pd(fscal,dx11);
1642 ty = _mm256_mul_pd(fscal,dy11);
1643 tz = _mm256_mul_pd(fscal,dz11);
1645 /* Update vectorial force */
1646 fix1 = _mm256_add_pd(fix1,tx);
1647 fiy1 = _mm256_add_pd(fiy1,ty);
1648 fiz1 = _mm256_add_pd(fiz1,tz);
1650 fjx1 = _mm256_add_pd(fjx1,tx);
1651 fjy1 = _mm256_add_pd(fjy1,ty);
1652 fjz1 = _mm256_add_pd(fjz1,tz);
1654 /**************************
1655 * CALCULATE INTERACTIONS *
1656 **************************/
1658 r12 = _mm256_mul_pd(rsq12,rinv12);
1660 /* EWALD ELECTROSTATICS */
1662 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1663 ewrt = _mm256_mul_pd(r12,ewtabscale);
1664 ewitab = _mm256_cvttpd_epi32(ewrt);
1665 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1666 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1667 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1669 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1670 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1674 /* Calculate temporary vectorial force */
1675 tx = _mm256_mul_pd(fscal,dx12);
1676 ty = _mm256_mul_pd(fscal,dy12);
1677 tz = _mm256_mul_pd(fscal,dz12);
1679 /* Update vectorial force */
1680 fix1 = _mm256_add_pd(fix1,tx);
1681 fiy1 = _mm256_add_pd(fiy1,ty);
1682 fiz1 = _mm256_add_pd(fiz1,tz);
1684 fjx2 = _mm256_add_pd(fjx2,tx);
1685 fjy2 = _mm256_add_pd(fjy2,ty);
1686 fjz2 = _mm256_add_pd(fjz2,tz);
1688 /**************************
1689 * CALCULATE INTERACTIONS *
1690 **************************/
1692 r13 = _mm256_mul_pd(rsq13,rinv13);
1694 /* EWALD ELECTROSTATICS */
1696 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1697 ewrt = _mm256_mul_pd(r13,ewtabscale);
1698 ewitab = _mm256_cvttpd_epi32(ewrt);
1699 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1700 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1701 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1703 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1704 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1708 /* Calculate temporary vectorial force */
1709 tx = _mm256_mul_pd(fscal,dx13);
1710 ty = _mm256_mul_pd(fscal,dy13);
1711 tz = _mm256_mul_pd(fscal,dz13);
1713 /* Update vectorial force */
1714 fix1 = _mm256_add_pd(fix1,tx);
1715 fiy1 = _mm256_add_pd(fiy1,ty);
1716 fiz1 = _mm256_add_pd(fiz1,tz);
1718 fjx3 = _mm256_add_pd(fjx3,tx);
1719 fjy3 = _mm256_add_pd(fjy3,ty);
1720 fjz3 = _mm256_add_pd(fjz3,tz);
1722 /**************************
1723 * CALCULATE INTERACTIONS *
1724 **************************/
1726 r21 = _mm256_mul_pd(rsq21,rinv21);
1728 /* EWALD ELECTROSTATICS */
1730 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1731 ewrt = _mm256_mul_pd(r21,ewtabscale);
1732 ewitab = _mm256_cvttpd_epi32(ewrt);
1733 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1734 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1735 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1737 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1738 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1742 /* Calculate temporary vectorial force */
1743 tx = _mm256_mul_pd(fscal,dx21);
1744 ty = _mm256_mul_pd(fscal,dy21);
1745 tz = _mm256_mul_pd(fscal,dz21);
1747 /* Update vectorial force */
1748 fix2 = _mm256_add_pd(fix2,tx);
1749 fiy2 = _mm256_add_pd(fiy2,ty);
1750 fiz2 = _mm256_add_pd(fiz2,tz);
1752 fjx1 = _mm256_add_pd(fjx1,tx);
1753 fjy1 = _mm256_add_pd(fjy1,ty);
1754 fjz1 = _mm256_add_pd(fjz1,tz);
1756 /**************************
1757 * CALCULATE INTERACTIONS *
1758 **************************/
1760 r22 = _mm256_mul_pd(rsq22,rinv22);
1762 /* EWALD ELECTROSTATICS */
1764 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1765 ewrt = _mm256_mul_pd(r22,ewtabscale);
1766 ewitab = _mm256_cvttpd_epi32(ewrt);
1767 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1768 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1769 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1771 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1772 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1776 /* Calculate temporary vectorial force */
1777 tx = _mm256_mul_pd(fscal,dx22);
1778 ty = _mm256_mul_pd(fscal,dy22);
1779 tz = _mm256_mul_pd(fscal,dz22);
1781 /* Update vectorial force */
1782 fix2 = _mm256_add_pd(fix2,tx);
1783 fiy2 = _mm256_add_pd(fiy2,ty);
1784 fiz2 = _mm256_add_pd(fiz2,tz);
1786 fjx2 = _mm256_add_pd(fjx2,tx);
1787 fjy2 = _mm256_add_pd(fjy2,ty);
1788 fjz2 = _mm256_add_pd(fjz2,tz);
1790 /**************************
1791 * CALCULATE INTERACTIONS *
1792 **************************/
1794 r23 = _mm256_mul_pd(rsq23,rinv23);
1796 /* EWALD ELECTROSTATICS */
1798 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1799 ewrt = _mm256_mul_pd(r23,ewtabscale);
1800 ewitab = _mm256_cvttpd_epi32(ewrt);
1801 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1802 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1803 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1805 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1806 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1810 /* Calculate temporary vectorial force */
1811 tx = _mm256_mul_pd(fscal,dx23);
1812 ty = _mm256_mul_pd(fscal,dy23);
1813 tz = _mm256_mul_pd(fscal,dz23);
1815 /* Update vectorial force */
1816 fix2 = _mm256_add_pd(fix2,tx);
1817 fiy2 = _mm256_add_pd(fiy2,ty);
1818 fiz2 = _mm256_add_pd(fiz2,tz);
1820 fjx3 = _mm256_add_pd(fjx3,tx);
1821 fjy3 = _mm256_add_pd(fjy3,ty);
1822 fjz3 = _mm256_add_pd(fjz3,tz);
1824 /**************************
1825 * CALCULATE INTERACTIONS *
1826 **************************/
1828 r31 = _mm256_mul_pd(rsq31,rinv31);
1830 /* EWALD ELECTROSTATICS */
1832 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1833 ewrt = _mm256_mul_pd(r31,ewtabscale);
1834 ewitab = _mm256_cvttpd_epi32(ewrt);
1835 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1836 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1837 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1839 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1840 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1844 /* Calculate temporary vectorial force */
1845 tx = _mm256_mul_pd(fscal,dx31);
1846 ty = _mm256_mul_pd(fscal,dy31);
1847 tz = _mm256_mul_pd(fscal,dz31);
1849 /* Update vectorial force */
1850 fix3 = _mm256_add_pd(fix3,tx);
1851 fiy3 = _mm256_add_pd(fiy3,ty);
1852 fiz3 = _mm256_add_pd(fiz3,tz);
1854 fjx1 = _mm256_add_pd(fjx1,tx);
1855 fjy1 = _mm256_add_pd(fjy1,ty);
1856 fjz1 = _mm256_add_pd(fjz1,tz);
1858 /**************************
1859 * CALCULATE INTERACTIONS *
1860 **************************/
1862 r32 = _mm256_mul_pd(rsq32,rinv32);
1864 /* EWALD ELECTROSTATICS */
1866 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1867 ewrt = _mm256_mul_pd(r32,ewtabscale);
1868 ewitab = _mm256_cvttpd_epi32(ewrt);
1869 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1870 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1871 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1873 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1874 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1878 /* Calculate temporary vectorial force */
1879 tx = _mm256_mul_pd(fscal,dx32);
1880 ty = _mm256_mul_pd(fscal,dy32);
1881 tz = _mm256_mul_pd(fscal,dz32);
1883 /* Update vectorial force */
1884 fix3 = _mm256_add_pd(fix3,tx);
1885 fiy3 = _mm256_add_pd(fiy3,ty);
1886 fiz3 = _mm256_add_pd(fiz3,tz);
1888 fjx2 = _mm256_add_pd(fjx2,tx);
1889 fjy2 = _mm256_add_pd(fjy2,ty);
1890 fjz2 = _mm256_add_pd(fjz2,tz);
1892 /**************************
1893 * CALCULATE INTERACTIONS *
1894 **************************/
1896 r33 = _mm256_mul_pd(rsq33,rinv33);
1898 /* EWALD ELECTROSTATICS */
1900 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1901 ewrt = _mm256_mul_pd(r33,ewtabscale);
1902 ewitab = _mm256_cvttpd_epi32(ewrt);
1903 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1904 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1905 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1907 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1908 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1912 /* Calculate temporary vectorial force */
1913 tx = _mm256_mul_pd(fscal,dx33);
1914 ty = _mm256_mul_pd(fscal,dy33);
1915 tz = _mm256_mul_pd(fscal,dz33);
1917 /* Update vectorial force */
1918 fix3 = _mm256_add_pd(fix3,tx);
1919 fiy3 = _mm256_add_pd(fiy3,ty);
1920 fiz3 = _mm256_add_pd(fiz3,tz);
1922 fjx3 = _mm256_add_pd(fjx3,tx);
1923 fjy3 = _mm256_add_pd(fjy3,ty);
1924 fjz3 = _mm256_add_pd(fjz3,tz);
1926 fjptrA = f+j_coord_offsetA;
1927 fjptrB = f+j_coord_offsetB;
1928 fjptrC = f+j_coord_offsetC;
1929 fjptrD = f+j_coord_offsetD;
1931 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1932 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1933 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1935 /* Inner loop uses 354 flops */
1938 if(jidx<j_index_end)
1941 /* Get j neighbor index, and coordinate index */
1942 jnrlistA = jjnr[jidx];
1943 jnrlistB = jjnr[jidx+1];
1944 jnrlistC = jjnr[jidx+2];
1945 jnrlistD = jjnr[jidx+3];
1946 /* Sign of each element will be negative for non-real atoms.
1947 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1948 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1950 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1952 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1953 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1954 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1956 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1957 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1958 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1959 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1960 j_coord_offsetA = DIM*jnrA;
1961 j_coord_offsetB = DIM*jnrB;
1962 j_coord_offsetC = DIM*jnrC;
1963 j_coord_offsetD = DIM*jnrD;
1965 /* load j atom coordinates */
1966 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1967 x+j_coord_offsetC,x+j_coord_offsetD,
1968 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1969 &jy2,&jz2,&jx3,&jy3,&jz3);
1971 /* Calculate displacement vector */
1972 dx00 = _mm256_sub_pd(ix0,jx0);
1973 dy00 = _mm256_sub_pd(iy0,jy0);
1974 dz00 = _mm256_sub_pd(iz0,jz0);
1975 dx11 = _mm256_sub_pd(ix1,jx1);
1976 dy11 = _mm256_sub_pd(iy1,jy1);
1977 dz11 = _mm256_sub_pd(iz1,jz1);
1978 dx12 = _mm256_sub_pd(ix1,jx2);
1979 dy12 = _mm256_sub_pd(iy1,jy2);
1980 dz12 = _mm256_sub_pd(iz1,jz2);
1981 dx13 = _mm256_sub_pd(ix1,jx3);
1982 dy13 = _mm256_sub_pd(iy1,jy3);
1983 dz13 = _mm256_sub_pd(iz1,jz3);
1984 dx21 = _mm256_sub_pd(ix2,jx1);
1985 dy21 = _mm256_sub_pd(iy2,jy1);
1986 dz21 = _mm256_sub_pd(iz2,jz1);
1987 dx22 = _mm256_sub_pd(ix2,jx2);
1988 dy22 = _mm256_sub_pd(iy2,jy2);
1989 dz22 = _mm256_sub_pd(iz2,jz2);
1990 dx23 = _mm256_sub_pd(ix2,jx3);
1991 dy23 = _mm256_sub_pd(iy2,jy3);
1992 dz23 = _mm256_sub_pd(iz2,jz3);
1993 dx31 = _mm256_sub_pd(ix3,jx1);
1994 dy31 = _mm256_sub_pd(iy3,jy1);
1995 dz31 = _mm256_sub_pd(iz3,jz1);
1996 dx32 = _mm256_sub_pd(ix3,jx2);
1997 dy32 = _mm256_sub_pd(iy3,jy2);
1998 dz32 = _mm256_sub_pd(iz3,jz2);
1999 dx33 = _mm256_sub_pd(ix3,jx3);
2000 dy33 = _mm256_sub_pd(iy3,jy3);
2001 dz33 = _mm256_sub_pd(iz3,jz3);
2003 /* Calculate squared distance and things based on it */
2004 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2005 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2006 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2007 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2008 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2009 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2010 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2011 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2012 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2013 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2015 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2016 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2017 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2018 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2019 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2020 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2021 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2022 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2023 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2025 rinvsq00 = gmx_mm256_inv_pd(rsq00);
2026 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2027 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2028 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2029 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2030 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2031 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2032 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2033 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2034 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2036 fjx0 = _mm256_setzero_pd();
2037 fjy0 = _mm256_setzero_pd();
2038 fjz0 = _mm256_setzero_pd();
2039 fjx1 = _mm256_setzero_pd();
2040 fjy1 = _mm256_setzero_pd();
2041 fjz1 = _mm256_setzero_pd();
2042 fjx2 = _mm256_setzero_pd();
2043 fjy2 = _mm256_setzero_pd();
2044 fjz2 = _mm256_setzero_pd();
2045 fjx3 = _mm256_setzero_pd();
2046 fjy3 = _mm256_setzero_pd();
2047 fjz3 = _mm256_setzero_pd();
2049 /**************************
2050 * CALCULATE INTERACTIONS *
2051 **************************/
2053 /* LENNARD-JONES DISPERSION/REPULSION */
2055 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2056 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2060 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2062 /* Calculate temporary vectorial force */
2063 tx = _mm256_mul_pd(fscal,dx00);
2064 ty = _mm256_mul_pd(fscal,dy00);
2065 tz = _mm256_mul_pd(fscal,dz00);
2067 /* Update vectorial force */
2068 fix0 = _mm256_add_pd(fix0,tx);
2069 fiy0 = _mm256_add_pd(fiy0,ty);
2070 fiz0 = _mm256_add_pd(fiz0,tz);
2072 fjx0 = _mm256_add_pd(fjx0,tx);
2073 fjy0 = _mm256_add_pd(fjy0,ty);
2074 fjz0 = _mm256_add_pd(fjz0,tz);
2076 /**************************
2077 * CALCULATE INTERACTIONS *
2078 **************************/
2080 r11 = _mm256_mul_pd(rsq11,rinv11);
2081 r11 = _mm256_andnot_pd(dummy_mask,r11);
2083 /* EWALD ELECTROSTATICS */
2085 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2086 ewrt = _mm256_mul_pd(r11,ewtabscale);
2087 ewitab = _mm256_cvttpd_epi32(ewrt);
2088 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2089 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2090 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2092 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2093 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2097 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2099 /* Calculate temporary vectorial force */
2100 tx = _mm256_mul_pd(fscal,dx11);
2101 ty = _mm256_mul_pd(fscal,dy11);
2102 tz = _mm256_mul_pd(fscal,dz11);
2104 /* Update vectorial force */
2105 fix1 = _mm256_add_pd(fix1,tx);
2106 fiy1 = _mm256_add_pd(fiy1,ty);
2107 fiz1 = _mm256_add_pd(fiz1,tz);
2109 fjx1 = _mm256_add_pd(fjx1,tx);
2110 fjy1 = _mm256_add_pd(fjy1,ty);
2111 fjz1 = _mm256_add_pd(fjz1,tz);
2113 /**************************
2114 * CALCULATE INTERACTIONS *
2115 **************************/
2117 r12 = _mm256_mul_pd(rsq12,rinv12);
2118 r12 = _mm256_andnot_pd(dummy_mask,r12);
2120 /* EWALD ELECTROSTATICS */
2122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2123 ewrt = _mm256_mul_pd(r12,ewtabscale);
2124 ewitab = _mm256_cvttpd_epi32(ewrt);
2125 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2126 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2127 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2129 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2130 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2134 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2136 /* Calculate temporary vectorial force */
2137 tx = _mm256_mul_pd(fscal,dx12);
2138 ty = _mm256_mul_pd(fscal,dy12);
2139 tz = _mm256_mul_pd(fscal,dz12);
2141 /* Update vectorial force */
2142 fix1 = _mm256_add_pd(fix1,tx);
2143 fiy1 = _mm256_add_pd(fiy1,ty);
2144 fiz1 = _mm256_add_pd(fiz1,tz);
2146 fjx2 = _mm256_add_pd(fjx2,tx);
2147 fjy2 = _mm256_add_pd(fjy2,ty);
2148 fjz2 = _mm256_add_pd(fjz2,tz);
2150 /**************************
2151 * CALCULATE INTERACTIONS *
2152 **************************/
2154 r13 = _mm256_mul_pd(rsq13,rinv13);
2155 r13 = _mm256_andnot_pd(dummy_mask,r13);
2157 /* EWALD ELECTROSTATICS */
2159 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2160 ewrt = _mm256_mul_pd(r13,ewtabscale);
2161 ewitab = _mm256_cvttpd_epi32(ewrt);
2162 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2163 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2164 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2166 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2167 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2171 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2173 /* Calculate temporary vectorial force */
2174 tx = _mm256_mul_pd(fscal,dx13);
2175 ty = _mm256_mul_pd(fscal,dy13);
2176 tz = _mm256_mul_pd(fscal,dz13);
2178 /* Update vectorial force */
2179 fix1 = _mm256_add_pd(fix1,tx);
2180 fiy1 = _mm256_add_pd(fiy1,ty);
2181 fiz1 = _mm256_add_pd(fiz1,tz);
2183 fjx3 = _mm256_add_pd(fjx3,tx);
2184 fjy3 = _mm256_add_pd(fjy3,ty);
2185 fjz3 = _mm256_add_pd(fjz3,tz);
2187 /**************************
2188 * CALCULATE INTERACTIONS *
2189 **************************/
2191 r21 = _mm256_mul_pd(rsq21,rinv21);
2192 r21 = _mm256_andnot_pd(dummy_mask,r21);
2194 /* EWALD ELECTROSTATICS */
2196 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2197 ewrt = _mm256_mul_pd(r21,ewtabscale);
2198 ewitab = _mm256_cvttpd_epi32(ewrt);
2199 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2200 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2201 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2203 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2204 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2208 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2210 /* Calculate temporary vectorial force */
2211 tx = _mm256_mul_pd(fscal,dx21);
2212 ty = _mm256_mul_pd(fscal,dy21);
2213 tz = _mm256_mul_pd(fscal,dz21);
2215 /* Update vectorial force */
2216 fix2 = _mm256_add_pd(fix2,tx);
2217 fiy2 = _mm256_add_pd(fiy2,ty);
2218 fiz2 = _mm256_add_pd(fiz2,tz);
2220 fjx1 = _mm256_add_pd(fjx1,tx);
2221 fjy1 = _mm256_add_pd(fjy1,ty);
2222 fjz1 = _mm256_add_pd(fjz1,tz);
2224 /**************************
2225 * CALCULATE INTERACTIONS *
2226 **************************/
2228 r22 = _mm256_mul_pd(rsq22,rinv22);
2229 r22 = _mm256_andnot_pd(dummy_mask,r22);
2231 /* EWALD ELECTROSTATICS */
2233 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2234 ewrt = _mm256_mul_pd(r22,ewtabscale);
2235 ewitab = _mm256_cvttpd_epi32(ewrt);
2236 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2237 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2238 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2240 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2241 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2245 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2247 /* Calculate temporary vectorial force */
2248 tx = _mm256_mul_pd(fscal,dx22);
2249 ty = _mm256_mul_pd(fscal,dy22);
2250 tz = _mm256_mul_pd(fscal,dz22);
2252 /* Update vectorial force */
2253 fix2 = _mm256_add_pd(fix2,tx);
2254 fiy2 = _mm256_add_pd(fiy2,ty);
2255 fiz2 = _mm256_add_pd(fiz2,tz);
2257 fjx2 = _mm256_add_pd(fjx2,tx);
2258 fjy2 = _mm256_add_pd(fjy2,ty);
2259 fjz2 = _mm256_add_pd(fjz2,tz);
2261 /**************************
2262 * CALCULATE INTERACTIONS *
2263 **************************/
2265 r23 = _mm256_mul_pd(rsq23,rinv23);
2266 r23 = _mm256_andnot_pd(dummy_mask,r23);
2268 /* EWALD ELECTROSTATICS */
2270 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2271 ewrt = _mm256_mul_pd(r23,ewtabscale);
2272 ewitab = _mm256_cvttpd_epi32(ewrt);
2273 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2274 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2275 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2277 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2278 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2282 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2284 /* Calculate temporary vectorial force */
2285 tx = _mm256_mul_pd(fscal,dx23);
2286 ty = _mm256_mul_pd(fscal,dy23);
2287 tz = _mm256_mul_pd(fscal,dz23);
2289 /* Update vectorial force */
2290 fix2 = _mm256_add_pd(fix2,tx);
2291 fiy2 = _mm256_add_pd(fiy2,ty);
2292 fiz2 = _mm256_add_pd(fiz2,tz);
2294 fjx3 = _mm256_add_pd(fjx3,tx);
2295 fjy3 = _mm256_add_pd(fjy3,ty);
2296 fjz3 = _mm256_add_pd(fjz3,tz);
2298 /**************************
2299 * CALCULATE INTERACTIONS *
2300 **************************/
2302 r31 = _mm256_mul_pd(rsq31,rinv31);
2303 r31 = _mm256_andnot_pd(dummy_mask,r31);
2305 /* EWALD ELECTROSTATICS */
2307 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2308 ewrt = _mm256_mul_pd(r31,ewtabscale);
2309 ewitab = _mm256_cvttpd_epi32(ewrt);
2310 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2311 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2312 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2314 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2315 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2319 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2321 /* Calculate temporary vectorial force */
2322 tx = _mm256_mul_pd(fscal,dx31);
2323 ty = _mm256_mul_pd(fscal,dy31);
2324 tz = _mm256_mul_pd(fscal,dz31);
2326 /* Update vectorial force */
2327 fix3 = _mm256_add_pd(fix3,tx);
2328 fiy3 = _mm256_add_pd(fiy3,ty);
2329 fiz3 = _mm256_add_pd(fiz3,tz);
2331 fjx1 = _mm256_add_pd(fjx1,tx);
2332 fjy1 = _mm256_add_pd(fjy1,ty);
2333 fjz1 = _mm256_add_pd(fjz1,tz);
2335 /**************************
2336 * CALCULATE INTERACTIONS *
2337 **************************/
2339 r32 = _mm256_mul_pd(rsq32,rinv32);
2340 r32 = _mm256_andnot_pd(dummy_mask,r32);
2342 /* EWALD ELECTROSTATICS */
2344 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2345 ewrt = _mm256_mul_pd(r32,ewtabscale);
2346 ewitab = _mm256_cvttpd_epi32(ewrt);
2347 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2348 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2349 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2351 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2352 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2356 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2358 /* Calculate temporary vectorial force */
2359 tx = _mm256_mul_pd(fscal,dx32);
2360 ty = _mm256_mul_pd(fscal,dy32);
2361 tz = _mm256_mul_pd(fscal,dz32);
2363 /* Update vectorial force */
2364 fix3 = _mm256_add_pd(fix3,tx);
2365 fiy3 = _mm256_add_pd(fiy3,ty);
2366 fiz3 = _mm256_add_pd(fiz3,tz);
2368 fjx2 = _mm256_add_pd(fjx2,tx);
2369 fjy2 = _mm256_add_pd(fjy2,ty);
2370 fjz2 = _mm256_add_pd(fjz2,tz);
2372 /**************************
2373 * CALCULATE INTERACTIONS *
2374 **************************/
2376 r33 = _mm256_mul_pd(rsq33,rinv33);
2377 r33 = _mm256_andnot_pd(dummy_mask,r33);
2379 /* EWALD ELECTROSTATICS */
2381 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2382 ewrt = _mm256_mul_pd(r33,ewtabscale);
2383 ewitab = _mm256_cvttpd_epi32(ewrt);
2384 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2385 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2386 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2388 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2389 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2393 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2395 /* Calculate temporary vectorial force */
2396 tx = _mm256_mul_pd(fscal,dx33);
2397 ty = _mm256_mul_pd(fscal,dy33);
2398 tz = _mm256_mul_pd(fscal,dz33);
2400 /* Update vectorial force */
2401 fix3 = _mm256_add_pd(fix3,tx);
2402 fiy3 = _mm256_add_pd(fiy3,ty);
2403 fiz3 = _mm256_add_pd(fiz3,tz);
2405 fjx3 = _mm256_add_pd(fjx3,tx);
2406 fjy3 = _mm256_add_pd(fjy3,ty);
2407 fjz3 = _mm256_add_pd(fjz3,tz);
2409 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2410 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2411 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2412 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2414 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2415 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2416 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2418 /* Inner loop uses 363 flops */
2421 /* End of innermost loop */
2423 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2424 f+i_coord_offset,fshift+i_shift_offset);
2426 /* Increment number of inner iterations */
2427 inneriter += j_index_end - j_index_start;
2429 /* Outer loop uses 24 flops */
2432 /* Increment number of outer iterations */
2435 /* Update outer/inner flops */
2437 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*363);