2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
98 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
100 __m128i ifour = _mm_set1_epi32(4);
101 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
104 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
105 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
107 __m256d dummy_mask,cutoff_mask;
108 __m128 tmpmask0,tmpmask1;
109 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
110 __m256d one = _mm256_set1_pd(1.0);
111 __m256d two = _mm256_set1_pd(2.0);
117 jindex = nlist->jindex;
119 shiftidx = nlist->shift;
121 shiftvec = fr->shift_vec[0];
122 fshift = fr->fshift[0];
123 facel = _mm256_set1_pd(fr->epsfac);
124 charge = mdatoms->chargeA;
125 nvdwtype = fr->ntype;
127 vdwtype = mdatoms->typeA;
129 vftab = kernel_data->table_vdw->data;
130 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
132 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
133 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
134 beta2 = _mm256_mul_pd(beta,beta);
135 beta3 = _mm256_mul_pd(beta,beta2);
137 ewtab = fr->ic->tabq_coul_FDV0;
138 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
139 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
144 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
145 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
146 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
148 jq0 = _mm256_set1_pd(charge[inr+0]);
149 jq1 = _mm256_set1_pd(charge[inr+1]);
150 jq2 = _mm256_set1_pd(charge[inr+2]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 qq00 = _mm256_mul_pd(iq0,jq0);
153 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
154 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
155 qq01 = _mm256_mul_pd(iq0,jq1);
156 qq02 = _mm256_mul_pd(iq0,jq2);
157 qq10 = _mm256_mul_pd(iq1,jq0);
158 qq11 = _mm256_mul_pd(iq1,jq1);
159 qq12 = _mm256_mul_pd(iq1,jq2);
160 qq20 = _mm256_mul_pd(iq2,jq0);
161 qq21 = _mm256_mul_pd(iq2,jq1);
162 qq22 = _mm256_mul_pd(iq2,jq2);
164 /* Avoid stupid compiler warnings */
165 jnrA = jnrB = jnrC = jnrD = 0;
174 for(iidx=0;iidx<4*DIM;iidx++)
179 /* Start outer loop over neighborlists */
180 for(iidx=0; iidx<nri; iidx++)
182 /* Load shift vector for this list */
183 i_shift_offset = DIM*shiftidx[iidx];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
195 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
197 fix0 = _mm256_setzero_pd();
198 fiy0 = _mm256_setzero_pd();
199 fiz0 = _mm256_setzero_pd();
200 fix1 = _mm256_setzero_pd();
201 fiy1 = _mm256_setzero_pd();
202 fiz1 = _mm256_setzero_pd();
203 fix2 = _mm256_setzero_pd();
204 fiy2 = _mm256_setzero_pd();
205 fiz2 = _mm256_setzero_pd();
207 /* Reset potential sums */
208 velecsum = _mm256_setzero_pd();
209 vvdwsum = _mm256_setzero_pd();
211 /* Start inner kernel loop */
212 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215 /* Get j neighbor index, and coordinate index */
220 j_coord_offsetA = DIM*jnrA;
221 j_coord_offsetB = DIM*jnrB;
222 j_coord_offsetC = DIM*jnrC;
223 j_coord_offsetD = DIM*jnrD;
225 /* load j atom coordinates */
226 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
227 x+j_coord_offsetC,x+j_coord_offsetD,
228 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
230 /* Calculate displacement vector */
231 dx00 = _mm256_sub_pd(ix0,jx0);
232 dy00 = _mm256_sub_pd(iy0,jy0);
233 dz00 = _mm256_sub_pd(iz0,jz0);
234 dx01 = _mm256_sub_pd(ix0,jx1);
235 dy01 = _mm256_sub_pd(iy0,jy1);
236 dz01 = _mm256_sub_pd(iz0,jz1);
237 dx02 = _mm256_sub_pd(ix0,jx2);
238 dy02 = _mm256_sub_pd(iy0,jy2);
239 dz02 = _mm256_sub_pd(iz0,jz2);
240 dx10 = _mm256_sub_pd(ix1,jx0);
241 dy10 = _mm256_sub_pd(iy1,jy0);
242 dz10 = _mm256_sub_pd(iz1,jz0);
243 dx11 = _mm256_sub_pd(ix1,jx1);
244 dy11 = _mm256_sub_pd(iy1,jy1);
245 dz11 = _mm256_sub_pd(iz1,jz1);
246 dx12 = _mm256_sub_pd(ix1,jx2);
247 dy12 = _mm256_sub_pd(iy1,jy2);
248 dz12 = _mm256_sub_pd(iz1,jz2);
249 dx20 = _mm256_sub_pd(ix2,jx0);
250 dy20 = _mm256_sub_pd(iy2,jy0);
251 dz20 = _mm256_sub_pd(iz2,jz0);
252 dx21 = _mm256_sub_pd(ix2,jx1);
253 dy21 = _mm256_sub_pd(iy2,jy1);
254 dz21 = _mm256_sub_pd(iz2,jz1);
255 dx22 = _mm256_sub_pd(ix2,jx2);
256 dy22 = _mm256_sub_pd(iy2,jy2);
257 dz22 = _mm256_sub_pd(iz2,jz2);
259 /* Calculate squared distance and things based on it */
260 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
261 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
262 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
263 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
264 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
265 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
266 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
267 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
268 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
270 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
271 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
272 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
273 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
274 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
275 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
276 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
277 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
278 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
280 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
281 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
282 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
283 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
284 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
285 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
286 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
287 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
288 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
290 fjx0 = _mm256_setzero_pd();
291 fjy0 = _mm256_setzero_pd();
292 fjz0 = _mm256_setzero_pd();
293 fjx1 = _mm256_setzero_pd();
294 fjy1 = _mm256_setzero_pd();
295 fjz1 = _mm256_setzero_pd();
296 fjx2 = _mm256_setzero_pd();
297 fjy2 = _mm256_setzero_pd();
298 fjz2 = _mm256_setzero_pd();
300 /**************************
301 * CALCULATE INTERACTIONS *
302 **************************/
304 r00 = _mm256_mul_pd(rsq00,rinv00);
306 /* Calculate table index by multiplying r with table scale and truncate to integer */
307 rt = _mm256_mul_pd(r00,vftabscale);
308 vfitab = _mm256_cvttpd_epi32(rt);
309 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
310 vfitab = _mm_slli_epi32(vfitab,3);
312 /* EWALD ELECTROSTATICS */
314 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
315 ewrt = _mm256_mul_pd(r00,ewtabscale);
316 ewitab = _mm256_cvttpd_epi32(ewrt);
317 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
318 ewitab = _mm_slli_epi32(ewitab,2);
319 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
320 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
321 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
322 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
323 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
324 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
325 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
326 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
327 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
329 /* CUBIC SPLINE TABLE DISPERSION */
330 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
331 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
332 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
333 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
334 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
335 Heps = _mm256_mul_pd(vfeps,H);
336 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
337 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
338 vvdw6 = _mm256_mul_pd(c6_00,VV);
339 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
340 fvdw6 = _mm256_mul_pd(c6_00,FF);
342 /* CUBIC SPLINE TABLE REPULSION */
343 vfitab = _mm_add_epi32(vfitab,ifour);
344 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
345 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
346 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
347 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
348 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
349 Heps = _mm256_mul_pd(vfeps,H);
350 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
351 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
352 vvdw12 = _mm256_mul_pd(c12_00,VV);
353 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
354 fvdw12 = _mm256_mul_pd(c12_00,FF);
355 vvdw = _mm256_add_pd(vvdw12,vvdw6);
356 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
358 /* Update potential sum for this i atom from the interaction with this j atom. */
359 velecsum = _mm256_add_pd(velecsum,velec);
360 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
362 fscal = _mm256_add_pd(felec,fvdw);
364 /* Calculate temporary vectorial force */
365 tx = _mm256_mul_pd(fscal,dx00);
366 ty = _mm256_mul_pd(fscal,dy00);
367 tz = _mm256_mul_pd(fscal,dz00);
369 /* Update vectorial force */
370 fix0 = _mm256_add_pd(fix0,tx);
371 fiy0 = _mm256_add_pd(fiy0,ty);
372 fiz0 = _mm256_add_pd(fiz0,tz);
374 fjx0 = _mm256_add_pd(fjx0,tx);
375 fjy0 = _mm256_add_pd(fjy0,ty);
376 fjz0 = _mm256_add_pd(fjz0,tz);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 r01 = _mm256_mul_pd(rsq01,rinv01);
384 /* EWALD ELECTROSTATICS */
386 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
387 ewrt = _mm256_mul_pd(r01,ewtabscale);
388 ewitab = _mm256_cvttpd_epi32(ewrt);
389 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
390 ewitab = _mm_slli_epi32(ewitab,2);
391 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
392 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
393 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
394 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
395 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
396 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
397 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
398 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
399 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velecsum = _mm256_add_pd(velecsum,velec);
406 /* Calculate temporary vectorial force */
407 tx = _mm256_mul_pd(fscal,dx01);
408 ty = _mm256_mul_pd(fscal,dy01);
409 tz = _mm256_mul_pd(fscal,dz01);
411 /* Update vectorial force */
412 fix0 = _mm256_add_pd(fix0,tx);
413 fiy0 = _mm256_add_pd(fiy0,ty);
414 fiz0 = _mm256_add_pd(fiz0,tz);
416 fjx1 = _mm256_add_pd(fjx1,tx);
417 fjy1 = _mm256_add_pd(fjy1,ty);
418 fjz1 = _mm256_add_pd(fjz1,tz);
420 /**************************
421 * CALCULATE INTERACTIONS *
422 **************************/
424 r02 = _mm256_mul_pd(rsq02,rinv02);
426 /* EWALD ELECTROSTATICS */
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt = _mm256_mul_pd(r02,ewtabscale);
430 ewitab = _mm256_cvttpd_epi32(ewrt);
431 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
432 ewitab = _mm_slli_epi32(ewitab,2);
433 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
434 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
435 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
436 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
437 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
438 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
439 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
440 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
441 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velecsum = _mm256_add_pd(velecsum,velec);
448 /* Calculate temporary vectorial force */
449 tx = _mm256_mul_pd(fscal,dx02);
450 ty = _mm256_mul_pd(fscal,dy02);
451 tz = _mm256_mul_pd(fscal,dz02);
453 /* Update vectorial force */
454 fix0 = _mm256_add_pd(fix0,tx);
455 fiy0 = _mm256_add_pd(fiy0,ty);
456 fiz0 = _mm256_add_pd(fiz0,tz);
458 fjx2 = _mm256_add_pd(fjx2,tx);
459 fjy2 = _mm256_add_pd(fjy2,ty);
460 fjz2 = _mm256_add_pd(fjz2,tz);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 r10 = _mm256_mul_pd(rsq10,rinv10);
468 /* EWALD ELECTROSTATICS */
470 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
471 ewrt = _mm256_mul_pd(r10,ewtabscale);
472 ewitab = _mm256_cvttpd_epi32(ewrt);
473 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
474 ewitab = _mm_slli_epi32(ewitab,2);
475 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
476 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
477 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
478 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
479 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
480 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
481 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
482 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
483 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velecsum = _mm256_add_pd(velecsum,velec);
490 /* Calculate temporary vectorial force */
491 tx = _mm256_mul_pd(fscal,dx10);
492 ty = _mm256_mul_pd(fscal,dy10);
493 tz = _mm256_mul_pd(fscal,dz10);
495 /* Update vectorial force */
496 fix1 = _mm256_add_pd(fix1,tx);
497 fiy1 = _mm256_add_pd(fiy1,ty);
498 fiz1 = _mm256_add_pd(fiz1,tz);
500 fjx0 = _mm256_add_pd(fjx0,tx);
501 fjy0 = _mm256_add_pd(fjy0,ty);
502 fjz0 = _mm256_add_pd(fjz0,tz);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 r11 = _mm256_mul_pd(rsq11,rinv11);
510 /* EWALD ELECTROSTATICS */
512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
513 ewrt = _mm256_mul_pd(r11,ewtabscale);
514 ewitab = _mm256_cvttpd_epi32(ewrt);
515 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
516 ewitab = _mm_slli_epi32(ewitab,2);
517 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
518 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
519 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
520 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
521 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
522 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
523 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
524 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
525 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velecsum = _mm256_add_pd(velecsum,velec);
532 /* Calculate temporary vectorial force */
533 tx = _mm256_mul_pd(fscal,dx11);
534 ty = _mm256_mul_pd(fscal,dy11);
535 tz = _mm256_mul_pd(fscal,dz11);
537 /* Update vectorial force */
538 fix1 = _mm256_add_pd(fix1,tx);
539 fiy1 = _mm256_add_pd(fiy1,ty);
540 fiz1 = _mm256_add_pd(fiz1,tz);
542 fjx1 = _mm256_add_pd(fjx1,tx);
543 fjy1 = _mm256_add_pd(fjy1,ty);
544 fjz1 = _mm256_add_pd(fjz1,tz);
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
550 r12 = _mm256_mul_pd(rsq12,rinv12);
552 /* EWALD ELECTROSTATICS */
554 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
555 ewrt = _mm256_mul_pd(r12,ewtabscale);
556 ewitab = _mm256_cvttpd_epi32(ewrt);
557 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
558 ewitab = _mm_slli_epi32(ewitab,2);
559 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
560 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
561 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
562 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
563 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
564 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
565 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
566 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
567 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velecsum = _mm256_add_pd(velecsum,velec);
574 /* Calculate temporary vectorial force */
575 tx = _mm256_mul_pd(fscal,dx12);
576 ty = _mm256_mul_pd(fscal,dy12);
577 tz = _mm256_mul_pd(fscal,dz12);
579 /* Update vectorial force */
580 fix1 = _mm256_add_pd(fix1,tx);
581 fiy1 = _mm256_add_pd(fiy1,ty);
582 fiz1 = _mm256_add_pd(fiz1,tz);
584 fjx2 = _mm256_add_pd(fjx2,tx);
585 fjy2 = _mm256_add_pd(fjy2,ty);
586 fjz2 = _mm256_add_pd(fjz2,tz);
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
592 r20 = _mm256_mul_pd(rsq20,rinv20);
594 /* EWALD ELECTROSTATICS */
596 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
597 ewrt = _mm256_mul_pd(r20,ewtabscale);
598 ewitab = _mm256_cvttpd_epi32(ewrt);
599 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
600 ewitab = _mm_slli_epi32(ewitab,2);
601 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
602 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
603 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
604 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
605 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
606 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
607 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
608 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
609 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velecsum = _mm256_add_pd(velecsum,velec);
616 /* Calculate temporary vectorial force */
617 tx = _mm256_mul_pd(fscal,dx20);
618 ty = _mm256_mul_pd(fscal,dy20);
619 tz = _mm256_mul_pd(fscal,dz20);
621 /* Update vectorial force */
622 fix2 = _mm256_add_pd(fix2,tx);
623 fiy2 = _mm256_add_pd(fiy2,ty);
624 fiz2 = _mm256_add_pd(fiz2,tz);
626 fjx0 = _mm256_add_pd(fjx0,tx);
627 fjy0 = _mm256_add_pd(fjy0,ty);
628 fjz0 = _mm256_add_pd(fjz0,tz);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 r21 = _mm256_mul_pd(rsq21,rinv21);
636 /* EWALD ELECTROSTATICS */
638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
639 ewrt = _mm256_mul_pd(r21,ewtabscale);
640 ewitab = _mm256_cvttpd_epi32(ewrt);
641 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
642 ewitab = _mm_slli_epi32(ewitab,2);
643 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
644 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
645 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
646 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
647 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
648 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
649 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
650 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
651 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
653 /* Update potential sum for this i atom from the interaction with this j atom. */
654 velecsum = _mm256_add_pd(velecsum,velec);
658 /* Calculate temporary vectorial force */
659 tx = _mm256_mul_pd(fscal,dx21);
660 ty = _mm256_mul_pd(fscal,dy21);
661 tz = _mm256_mul_pd(fscal,dz21);
663 /* Update vectorial force */
664 fix2 = _mm256_add_pd(fix2,tx);
665 fiy2 = _mm256_add_pd(fiy2,ty);
666 fiz2 = _mm256_add_pd(fiz2,tz);
668 fjx1 = _mm256_add_pd(fjx1,tx);
669 fjy1 = _mm256_add_pd(fjy1,ty);
670 fjz1 = _mm256_add_pd(fjz1,tz);
672 /**************************
673 * CALCULATE INTERACTIONS *
674 **************************/
676 r22 = _mm256_mul_pd(rsq22,rinv22);
678 /* EWALD ELECTROSTATICS */
680 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
681 ewrt = _mm256_mul_pd(r22,ewtabscale);
682 ewitab = _mm256_cvttpd_epi32(ewrt);
683 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
684 ewitab = _mm_slli_epi32(ewitab,2);
685 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
686 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
687 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
688 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
689 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
690 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
691 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
692 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
693 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
695 /* Update potential sum for this i atom from the interaction with this j atom. */
696 velecsum = _mm256_add_pd(velecsum,velec);
700 /* Calculate temporary vectorial force */
701 tx = _mm256_mul_pd(fscal,dx22);
702 ty = _mm256_mul_pd(fscal,dy22);
703 tz = _mm256_mul_pd(fscal,dz22);
705 /* Update vectorial force */
706 fix2 = _mm256_add_pd(fix2,tx);
707 fiy2 = _mm256_add_pd(fiy2,ty);
708 fiz2 = _mm256_add_pd(fiz2,tz);
710 fjx2 = _mm256_add_pd(fjx2,tx);
711 fjy2 = _mm256_add_pd(fjy2,ty);
712 fjz2 = _mm256_add_pd(fjz2,tz);
714 fjptrA = f+j_coord_offsetA;
715 fjptrB = f+j_coord_offsetB;
716 fjptrC = f+j_coord_offsetC;
717 fjptrD = f+j_coord_offsetD;
719 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
720 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
722 /* Inner loop uses 403 flops */
728 /* Get j neighbor index, and coordinate index */
729 jnrlistA = jjnr[jidx];
730 jnrlistB = jjnr[jidx+1];
731 jnrlistC = jjnr[jidx+2];
732 jnrlistD = jjnr[jidx+3];
733 /* Sign of each element will be negative for non-real atoms.
734 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
735 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
737 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
739 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
740 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
741 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
743 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
744 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
745 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
746 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
747 j_coord_offsetA = DIM*jnrA;
748 j_coord_offsetB = DIM*jnrB;
749 j_coord_offsetC = DIM*jnrC;
750 j_coord_offsetD = DIM*jnrD;
752 /* load j atom coordinates */
753 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
754 x+j_coord_offsetC,x+j_coord_offsetD,
755 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
757 /* Calculate displacement vector */
758 dx00 = _mm256_sub_pd(ix0,jx0);
759 dy00 = _mm256_sub_pd(iy0,jy0);
760 dz00 = _mm256_sub_pd(iz0,jz0);
761 dx01 = _mm256_sub_pd(ix0,jx1);
762 dy01 = _mm256_sub_pd(iy0,jy1);
763 dz01 = _mm256_sub_pd(iz0,jz1);
764 dx02 = _mm256_sub_pd(ix0,jx2);
765 dy02 = _mm256_sub_pd(iy0,jy2);
766 dz02 = _mm256_sub_pd(iz0,jz2);
767 dx10 = _mm256_sub_pd(ix1,jx0);
768 dy10 = _mm256_sub_pd(iy1,jy0);
769 dz10 = _mm256_sub_pd(iz1,jz0);
770 dx11 = _mm256_sub_pd(ix1,jx1);
771 dy11 = _mm256_sub_pd(iy1,jy1);
772 dz11 = _mm256_sub_pd(iz1,jz1);
773 dx12 = _mm256_sub_pd(ix1,jx2);
774 dy12 = _mm256_sub_pd(iy1,jy2);
775 dz12 = _mm256_sub_pd(iz1,jz2);
776 dx20 = _mm256_sub_pd(ix2,jx0);
777 dy20 = _mm256_sub_pd(iy2,jy0);
778 dz20 = _mm256_sub_pd(iz2,jz0);
779 dx21 = _mm256_sub_pd(ix2,jx1);
780 dy21 = _mm256_sub_pd(iy2,jy1);
781 dz21 = _mm256_sub_pd(iz2,jz1);
782 dx22 = _mm256_sub_pd(ix2,jx2);
783 dy22 = _mm256_sub_pd(iy2,jy2);
784 dz22 = _mm256_sub_pd(iz2,jz2);
786 /* Calculate squared distance and things based on it */
787 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
788 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
789 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
790 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
791 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
792 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
793 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
794 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
795 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
797 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
798 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
799 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
800 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
801 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
802 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
803 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
804 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
805 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
807 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
808 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
809 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
810 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
811 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
812 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
813 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
814 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
815 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
817 fjx0 = _mm256_setzero_pd();
818 fjy0 = _mm256_setzero_pd();
819 fjz0 = _mm256_setzero_pd();
820 fjx1 = _mm256_setzero_pd();
821 fjy1 = _mm256_setzero_pd();
822 fjz1 = _mm256_setzero_pd();
823 fjx2 = _mm256_setzero_pd();
824 fjy2 = _mm256_setzero_pd();
825 fjz2 = _mm256_setzero_pd();
827 /**************************
828 * CALCULATE INTERACTIONS *
829 **************************/
831 r00 = _mm256_mul_pd(rsq00,rinv00);
832 r00 = _mm256_andnot_pd(dummy_mask,r00);
834 /* Calculate table index by multiplying r with table scale and truncate to integer */
835 rt = _mm256_mul_pd(r00,vftabscale);
836 vfitab = _mm256_cvttpd_epi32(rt);
837 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
838 vfitab = _mm_slli_epi32(vfitab,3);
840 /* EWALD ELECTROSTATICS */
842 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
843 ewrt = _mm256_mul_pd(r00,ewtabscale);
844 ewitab = _mm256_cvttpd_epi32(ewrt);
845 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
846 ewitab = _mm_slli_epi32(ewitab,2);
847 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
848 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
849 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
850 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
851 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
852 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
853 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
854 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
855 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
857 /* CUBIC SPLINE TABLE DISPERSION */
858 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
859 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
860 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
861 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
862 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
863 Heps = _mm256_mul_pd(vfeps,H);
864 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
865 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
866 vvdw6 = _mm256_mul_pd(c6_00,VV);
867 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
868 fvdw6 = _mm256_mul_pd(c6_00,FF);
870 /* CUBIC SPLINE TABLE REPULSION */
871 vfitab = _mm_add_epi32(vfitab,ifour);
872 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
873 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
874 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
875 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
876 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
877 Heps = _mm256_mul_pd(vfeps,H);
878 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
879 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
880 vvdw12 = _mm256_mul_pd(c12_00,VV);
881 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
882 fvdw12 = _mm256_mul_pd(c12_00,FF);
883 vvdw = _mm256_add_pd(vvdw12,vvdw6);
884 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
886 /* Update potential sum for this i atom from the interaction with this j atom. */
887 velec = _mm256_andnot_pd(dummy_mask,velec);
888 velecsum = _mm256_add_pd(velecsum,velec);
889 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
890 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
892 fscal = _mm256_add_pd(felec,fvdw);
894 fscal = _mm256_andnot_pd(dummy_mask,fscal);
896 /* Calculate temporary vectorial force */
897 tx = _mm256_mul_pd(fscal,dx00);
898 ty = _mm256_mul_pd(fscal,dy00);
899 tz = _mm256_mul_pd(fscal,dz00);
901 /* Update vectorial force */
902 fix0 = _mm256_add_pd(fix0,tx);
903 fiy0 = _mm256_add_pd(fiy0,ty);
904 fiz0 = _mm256_add_pd(fiz0,tz);
906 fjx0 = _mm256_add_pd(fjx0,tx);
907 fjy0 = _mm256_add_pd(fjy0,ty);
908 fjz0 = _mm256_add_pd(fjz0,tz);
910 /**************************
911 * CALCULATE INTERACTIONS *
912 **************************/
914 r01 = _mm256_mul_pd(rsq01,rinv01);
915 r01 = _mm256_andnot_pd(dummy_mask,r01);
917 /* EWALD ELECTROSTATICS */
919 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
920 ewrt = _mm256_mul_pd(r01,ewtabscale);
921 ewitab = _mm256_cvttpd_epi32(ewrt);
922 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
923 ewitab = _mm_slli_epi32(ewitab,2);
924 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
925 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
926 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
927 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
928 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
929 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
930 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
931 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
932 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
934 /* Update potential sum for this i atom from the interaction with this j atom. */
935 velec = _mm256_andnot_pd(dummy_mask,velec);
936 velecsum = _mm256_add_pd(velecsum,velec);
940 fscal = _mm256_andnot_pd(dummy_mask,fscal);
942 /* Calculate temporary vectorial force */
943 tx = _mm256_mul_pd(fscal,dx01);
944 ty = _mm256_mul_pd(fscal,dy01);
945 tz = _mm256_mul_pd(fscal,dz01);
947 /* Update vectorial force */
948 fix0 = _mm256_add_pd(fix0,tx);
949 fiy0 = _mm256_add_pd(fiy0,ty);
950 fiz0 = _mm256_add_pd(fiz0,tz);
952 fjx1 = _mm256_add_pd(fjx1,tx);
953 fjy1 = _mm256_add_pd(fjy1,ty);
954 fjz1 = _mm256_add_pd(fjz1,tz);
956 /**************************
957 * CALCULATE INTERACTIONS *
958 **************************/
960 r02 = _mm256_mul_pd(rsq02,rinv02);
961 r02 = _mm256_andnot_pd(dummy_mask,r02);
963 /* EWALD ELECTROSTATICS */
965 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
966 ewrt = _mm256_mul_pd(r02,ewtabscale);
967 ewitab = _mm256_cvttpd_epi32(ewrt);
968 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
969 ewitab = _mm_slli_epi32(ewitab,2);
970 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
971 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
972 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
973 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
974 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
975 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
976 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
977 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
978 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
980 /* Update potential sum for this i atom from the interaction with this j atom. */
981 velec = _mm256_andnot_pd(dummy_mask,velec);
982 velecsum = _mm256_add_pd(velecsum,velec);
986 fscal = _mm256_andnot_pd(dummy_mask,fscal);
988 /* Calculate temporary vectorial force */
989 tx = _mm256_mul_pd(fscal,dx02);
990 ty = _mm256_mul_pd(fscal,dy02);
991 tz = _mm256_mul_pd(fscal,dz02);
993 /* Update vectorial force */
994 fix0 = _mm256_add_pd(fix0,tx);
995 fiy0 = _mm256_add_pd(fiy0,ty);
996 fiz0 = _mm256_add_pd(fiz0,tz);
998 fjx2 = _mm256_add_pd(fjx2,tx);
999 fjy2 = _mm256_add_pd(fjy2,ty);
1000 fjz2 = _mm256_add_pd(fjz2,tz);
1002 /**************************
1003 * CALCULATE INTERACTIONS *
1004 **************************/
1006 r10 = _mm256_mul_pd(rsq10,rinv10);
1007 r10 = _mm256_andnot_pd(dummy_mask,r10);
1009 /* EWALD ELECTROSTATICS */
1011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012 ewrt = _mm256_mul_pd(r10,ewtabscale);
1013 ewitab = _mm256_cvttpd_epi32(ewrt);
1014 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1015 ewitab = _mm_slli_epi32(ewitab,2);
1016 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1017 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1018 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1019 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1020 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1021 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1022 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1023 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1024 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1026 /* Update potential sum for this i atom from the interaction with this j atom. */
1027 velec = _mm256_andnot_pd(dummy_mask,velec);
1028 velecsum = _mm256_add_pd(velecsum,velec);
1032 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1034 /* Calculate temporary vectorial force */
1035 tx = _mm256_mul_pd(fscal,dx10);
1036 ty = _mm256_mul_pd(fscal,dy10);
1037 tz = _mm256_mul_pd(fscal,dz10);
1039 /* Update vectorial force */
1040 fix1 = _mm256_add_pd(fix1,tx);
1041 fiy1 = _mm256_add_pd(fiy1,ty);
1042 fiz1 = _mm256_add_pd(fiz1,tz);
1044 fjx0 = _mm256_add_pd(fjx0,tx);
1045 fjy0 = _mm256_add_pd(fjy0,ty);
1046 fjz0 = _mm256_add_pd(fjz0,tz);
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 r11 = _mm256_mul_pd(rsq11,rinv11);
1053 r11 = _mm256_andnot_pd(dummy_mask,r11);
1055 /* EWALD ELECTROSTATICS */
1057 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1058 ewrt = _mm256_mul_pd(r11,ewtabscale);
1059 ewitab = _mm256_cvttpd_epi32(ewrt);
1060 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1061 ewitab = _mm_slli_epi32(ewitab,2);
1062 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1063 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1064 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1065 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1066 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1067 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1068 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1069 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1070 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1072 /* Update potential sum for this i atom from the interaction with this j atom. */
1073 velec = _mm256_andnot_pd(dummy_mask,velec);
1074 velecsum = _mm256_add_pd(velecsum,velec);
1078 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1080 /* Calculate temporary vectorial force */
1081 tx = _mm256_mul_pd(fscal,dx11);
1082 ty = _mm256_mul_pd(fscal,dy11);
1083 tz = _mm256_mul_pd(fscal,dz11);
1085 /* Update vectorial force */
1086 fix1 = _mm256_add_pd(fix1,tx);
1087 fiy1 = _mm256_add_pd(fiy1,ty);
1088 fiz1 = _mm256_add_pd(fiz1,tz);
1090 fjx1 = _mm256_add_pd(fjx1,tx);
1091 fjy1 = _mm256_add_pd(fjy1,ty);
1092 fjz1 = _mm256_add_pd(fjz1,tz);
1094 /**************************
1095 * CALCULATE INTERACTIONS *
1096 **************************/
1098 r12 = _mm256_mul_pd(rsq12,rinv12);
1099 r12 = _mm256_andnot_pd(dummy_mask,r12);
1101 /* EWALD ELECTROSTATICS */
1103 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1104 ewrt = _mm256_mul_pd(r12,ewtabscale);
1105 ewitab = _mm256_cvttpd_epi32(ewrt);
1106 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1107 ewitab = _mm_slli_epi32(ewitab,2);
1108 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1109 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1110 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1111 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1112 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1113 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1114 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1115 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1116 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1118 /* Update potential sum for this i atom from the interaction with this j atom. */
1119 velec = _mm256_andnot_pd(dummy_mask,velec);
1120 velecsum = _mm256_add_pd(velecsum,velec);
1124 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1126 /* Calculate temporary vectorial force */
1127 tx = _mm256_mul_pd(fscal,dx12);
1128 ty = _mm256_mul_pd(fscal,dy12);
1129 tz = _mm256_mul_pd(fscal,dz12);
1131 /* Update vectorial force */
1132 fix1 = _mm256_add_pd(fix1,tx);
1133 fiy1 = _mm256_add_pd(fiy1,ty);
1134 fiz1 = _mm256_add_pd(fiz1,tz);
1136 fjx2 = _mm256_add_pd(fjx2,tx);
1137 fjy2 = _mm256_add_pd(fjy2,ty);
1138 fjz2 = _mm256_add_pd(fjz2,tz);
1140 /**************************
1141 * CALCULATE INTERACTIONS *
1142 **************************/
1144 r20 = _mm256_mul_pd(rsq20,rinv20);
1145 r20 = _mm256_andnot_pd(dummy_mask,r20);
1147 /* EWALD ELECTROSTATICS */
1149 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150 ewrt = _mm256_mul_pd(r20,ewtabscale);
1151 ewitab = _mm256_cvttpd_epi32(ewrt);
1152 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1153 ewitab = _mm_slli_epi32(ewitab,2);
1154 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1155 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1156 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1157 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1158 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1159 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1160 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1161 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1162 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1164 /* Update potential sum for this i atom from the interaction with this j atom. */
1165 velec = _mm256_andnot_pd(dummy_mask,velec);
1166 velecsum = _mm256_add_pd(velecsum,velec);
1170 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1172 /* Calculate temporary vectorial force */
1173 tx = _mm256_mul_pd(fscal,dx20);
1174 ty = _mm256_mul_pd(fscal,dy20);
1175 tz = _mm256_mul_pd(fscal,dz20);
1177 /* Update vectorial force */
1178 fix2 = _mm256_add_pd(fix2,tx);
1179 fiy2 = _mm256_add_pd(fiy2,ty);
1180 fiz2 = _mm256_add_pd(fiz2,tz);
1182 fjx0 = _mm256_add_pd(fjx0,tx);
1183 fjy0 = _mm256_add_pd(fjy0,ty);
1184 fjz0 = _mm256_add_pd(fjz0,tz);
1186 /**************************
1187 * CALCULATE INTERACTIONS *
1188 **************************/
1190 r21 = _mm256_mul_pd(rsq21,rinv21);
1191 r21 = _mm256_andnot_pd(dummy_mask,r21);
1193 /* EWALD ELECTROSTATICS */
1195 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1196 ewrt = _mm256_mul_pd(r21,ewtabscale);
1197 ewitab = _mm256_cvttpd_epi32(ewrt);
1198 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1199 ewitab = _mm_slli_epi32(ewitab,2);
1200 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1201 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1202 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1203 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1204 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1205 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1206 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1207 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1208 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1210 /* Update potential sum for this i atom from the interaction with this j atom. */
1211 velec = _mm256_andnot_pd(dummy_mask,velec);
1212 velecsum = _mm256_add_pd(velecsum,velec);
1216 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1218 /* Calculate temporary vectorial force */
1219 tx = _mm256_mul_pd(fscal,dx21);
1220 ty = _mm256_mul_pd(fscal,dy21);
1221 tz = _mm256_mul_pd(fscal,dz21);
1223 /* Update vectorial force */
1224 fix2 = _mm256_add_pd(fix2,tx);
1225 fiy2 = _mm256_add_pd(fiy2,ty);
1226 fiz2 = _mm256_add_pd(fiz2,tz);
1228 fjx1 = _mm256_add_pd(fjx1,tx);
1229 fjy1 = _mm256_add_pd(fjy1,ty);
1230 fjz1 = _mm256_add_pd(fjz1,tz);
1232 /**************************
1233 * CALCULATE INTERACTIONS *
1234 **************************/
1236 r22 = _mm256_mul_pd(rsq22,rinv22);
1237 r22 = _mm256_andnot_pd(dummy_mask,r22);
1239 /* EWALD ELECTROSTATICS */
1241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242 ewrt = _mm256_mul_pd(r22,ewtabscale);
1243 ewitab = _mm256_cvttpd_epi32(ewrt);
1244 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1245 ewitab = _mm_slli_epi32(ewitab,2);
1246 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1247 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1248 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1249 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1250 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1251 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1252 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1253 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1254 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1256 /* Update potential sum for this i atom from the interaction with this j atom. */
1257 velec = _mm256_andnot_pd(dummy_mask,velec);
1258 velecsum = _mm256_add_pd(velecsum,velec);
1262 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1264 /* Calculate temporary vectorial force */
1265 tx = _mm256_mul_pd(fscal,dx22);
1266 ty = _mm256_mul_pd(fscal,dy22);
1267 tz = _mm256_mul_pd(fscal,dz22);
1269 /* Update vectorial force */
1270 fix2 = _mm256_add_pd(fix2,tx);
1271 fiy2 = _mm256_add_pd(fiy2,ty);
1272 fiz2 = _mm256_add_pd(fiz2,tz);
1274 fjx2 = _mm256_add_pd(fjx2,tx);
1275 fjy2 = _mm256_add_pd(fjy2,ty);
1276 fjz2 = _mm256_add_pd(fjz2,tz);
1278 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1279 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1280 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1281 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1283 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1284 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1286 /* Inner loop uses 412 flops */
1289 /* End of innermost loop */
1291 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1292 f+i_coord_offset,fshift+i_shift_offset);
1295 /* Update potential energies */
1296 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1297 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1299 /* Increment number of inner iterations */
1300 inneriter += j_index_end - j_index_start;
1302 /* Outer loop uses 20 flops */
1305 /* Increment number of outer iterations */
1308 /* Update outer/inner flops */
1310 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*412);
1313 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
1314 * Electrostatics interaction: Ewald
1315 * VdW interaction: CubicSplineTable
1316 * Geometry: Water3-Water3
1317 * Calculate force/pot: Force
1320 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
1321 (t_nblist * gmx_restrict nlist,
1322 rvec * gmx_restrict xx,
1323 rvec * gmx_restrict ff,
1324 t_forcerec * gmx_restrict fr,
1325 t_mdatoms * gmx_restrict mdatoms,
1326 nb_kernel_data_t * gmx_restrict kernel_data,
1327 t_nrnb * gmx_restrict nrnb)
1329 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1330 * just 0 for non-waters.
1331 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1332 * jnr indices corresponding to data put in the four positions in the SIMD register.
1334 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1335 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1336 int jnrA,jnrB,jnrC,jnrD;
1337 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1338 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1339 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1340 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1341 real rcutoff_scalar;
1342 real *shiftvec,*fshift,*x,*f;
1343 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1344 real scratch[4*DIM];
1345 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1346 real * vdwioffsetptr0;
1347 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1348 real * vdwioffsetptr1;
1349 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1350 real * vdwioffsetptr2;
1351 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1352 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1353 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1354 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1355 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1356 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1357 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1358 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1359 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1360 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1361 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1362 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1363 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1364 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1365 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1366 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1367 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1370 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1373 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1374 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1376 __m128i ifour = _mm_set1_epi32(4);
1377 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1380 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1381 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1383 __m256d dummy_mask,cutoff_mask;
1384 __m128 tmpmask0,tmpmask1;
1385 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1386 __m256d one = _mm256_set1_pd(1.0);
1387 __m256d two = _mm256_set1_pd(2.0);
1393 jindex = nlist->jindex;
1395 shiftidx = nlist->shift;
1397 shiftvec = fr->shift_vec[0];
1398 fshift = fr->fshift[0];
1399 facel = _mm256_set1_pd(fr->epsfac);
1400 charge = mdatoms->chargeA;
1401 nvdwtype = fr->ntype;
1402 vdwparam = fr->nbfp;
1403 vdwtype = mdatoms->typeA;
1405 vftab = kernel_data->table_vdw->data;
1406 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
1408 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1409 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1410 beta2 = _mm256_mul_pd(beta,beta);
1411 beta3 = _mm256_mul_pd(beta,beta2);
1413 ewtab = fr->ic->tabq_coul_F;
1414 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1415 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1417 /* Setup water-specific parameters */
1418 inr = nlist->iinr[0];
1419 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1420 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1421 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1422 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1424 jq0 = _mm256_set1_pd(charge[inr+0]);
1425 jq1 = _mm256_set1_pd(charge[inr+1]);
1426 jq2 = _mm256_set1_pd(charge[inr+2]);
1427 vdwjidx0A = 2*vdwtype[inr+0];
1428 qq00 = _mm256_mul_pd(iq0,jq0);
1429 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1430 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1431 qq01 = _mm256_mul_pd(iq0,jq1);
1432 qq02 = _mm256_mul_pd(iq0,jq2);
1433 qq10 = _mm256_mul_pd(iq1,jq0);
1434 qq11 = _mm256_mul_pd(iq1,jq1);
1435 qq12 = _mm256_mul_pd(iq1,jq2);
1436 qq20 = _mm256_mul_pd(iq2,jq0);
1437 qq21 = _mm256_mul_pd(iq2,jq1);
1438 qq22 = _mm256_mul_pd(iq2,jq2);
1440 /* Avoid stupid compiler warnings */
1441 jnrA = jnrB = jnrC = jnrD = 0;
1442 j_coord_offsetA = 0;
1443 j_coord_offsetB = 0;
1444 j_coord_offsetC = 0;
1445 j_coord_offsetD = 0;
1450 for(iidx=0;iidx<4*DIM;iidx++)
1452 scratch[iidx] = 0.0;
1455 /* Start outer loop over neighborlists */
1456 for(iidx=0; iidx<nri; iidx++)
1458 /* Load shift vector for this list */
1459 i_shift_offset = DIM*shiftidx[iidx];
1461 /* Load limits for loop over neighbors */
1462 j_index_start = jindex[iidx];
1463 j_index_end = jindex[iidx+1];
1465 /* Get outer coordinate index */
1467 i_coord_offset = DIM*inr;
1469 /* Load i particle coords and add shift vector */
1470 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1471 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1473 fix0 = _mm256_setzero_pd();
1474 fiy0 = _mm256_setzero_pd();
1475 fiz0 = _mm256_setzero_pd();
1476 fix1 = _mm256_setzero_pd();
1477 fiy1 = _mm256_setzero_pd();
1478 fiz1 = _mm256_setzero_pd();
1479 fix2 = _mm256_setzero_pd();
1480 fiy2 = _mm256_setzero_pd();
1481 fiz2 = _mm256_setzero_pd();
1483 /* Start inner kernel loop */
1484 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1487 /* Get j neighbor index, and coordinate index */
1489 jnrB = jjnr[jidx+1];
1490 jnrC = jjnr[jidx+2];
1491 jnrD = jjnr[jidx+3];
1492 j_coord_offsetA = DIM*jnrA;
1493 j_coord_offsetB = DIM*jnrB;
1494 j_coord_offsetC = DIM*jnrC;
1495 j_coord_offsetD = DIM*jnrD;
1497 /* load j atom coordinates */
1498 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1499 x+j_coord_offsetC,x+j_coord_offsetD,
1500 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1502 /* Calculate displacement vector */
1503 dx00 = _mm256_sub_pd(ix0,jx0);
1504 dy00 = _mm256_sub_pd(iy0,jy0);
1505 dz00 = _mm256_sub_pd(iz0,jz0);
1506 dx01 = _mm256_sub_pd(ix0,jx1);
1507 dy01 = _mm256_sub_pd(iy0,jy1);
1508 dz01 = _mm256_sub_pd(iz0,jz1);
1509 dx02 = _mm256_sub_pd(ix0,jx2);
1510 dy02 = _mm256_sub_pd(iy0,jy2);
1511 dz02 = _mm256_sub_pd(iz0,jz2);
1512 dx10 = _mm256_sub_pd(ix1,jx0);
1513 dy10 = _mm256_sub_pd(iy1,jy0);
1514 dz10 = _mm256_sub_pd(iz1,jz0);
1515 dx11 = _mm256_sub_pd(ix1,jx1);
1516 dy11 = _mm256_sub_pd(iy1,jy1);
1517 dz11 = _mm256_sub_pd(iz1,jz1);
1518 dx12 = _mm256_sub_pd(ix1,jx2);
1519 dy12 = _mm256_sub_pd(iy1,jy2);
1520 dz12 = _mm256_sub_pd(iz1,jz2);
1521 dx20 = _mm256_sub_pd(ix2,jx0);
1522 dy20 = _mm256_sub_pd(iy2,jy0);
1523 dz20 = _mm256_sub_pd(iz2,jz0);
1524 dx21 = _mm256_sub_pd(ix2,jx1);
1525 dy21 = _mm256_sub_pd(iy2,jy1);
1526 dz21 = _mm256_sub_pd(iz2,jz1);
1527 dx22 = _mm256_sub_pd(ix2,jx2);
1528 dy22 = _mm256_sub_pd(iy2,jy2);
1529 dz22 = _mm256_sub_pd(iz2,jz2);
1531 /* Calculate squared distance and things based on it */
1532 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1533 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1534 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1535 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1536 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1537 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1538 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1539 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1540 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1542 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1543 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1544 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1545 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1546 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1547 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1548 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1549 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1550 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1552 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1553 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1554 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1555 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1556 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1557 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1558 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1559 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1560 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1562 fjx0 = _mm256_setzero_pd();
1563 fjy0 = _mm256_setzero_pd();
1564 fjz0 = _mm256_setzero_pd();
1565 fjx1 = _mm256_setzero_pd();
1566 fjy1 = _mm256_setzero_pd();
1567 fjz1 = _mm256_setzero_pd();
1568 fjx2 = _mm256_setzero_pd();
1569 fjy2 = _mm256_setzero_pd();
1570 fjz2 = _mm256_setzero_pd();
1572 /**************************
1573 * CALCULATE INTERACTIONS *
1574 **************************/
1576 r00 = _mm256_mul_pd(rsq00,rinv00);
1578 /* Calculate table index by multiplying r with table scale and truncate to integer */
1579 rt = _mm256_mul_pd(r00,vftabscale);
1580 vfitab = _mm256_cvttpd_epi32(rt);
1581 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1582 vfitab = _mm_slli_epi32(vfitab,3);
1584 /* EWALD ELECTROSTATICS */
1586 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1587 ewrt = _mm256_mul_pd(r00,ewtabscale);
1588 ewitab = _mm256_cvttpd_epi32(ewrt);
1589 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1590 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1591 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1593 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1594 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1596 /* CUBIC SPLINE TABLE DISPERSION */
1597 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1598 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1599 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1600 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1601 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1602 Heps = _mm256_mul_pd(vfeps,H);
1603 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1604 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1605 fvdw6 = _mm256_mul_pd(c6_00,FF);
1607 /* CUBIC SPLINE TABLE REPULSION */
1608 vfitab = _mm_add_epi32(vfitab,ifour);
1609 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1610 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1611 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1612 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1613 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1614 Heps = _mm256_mul_pd(vfeps,H);
1615 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1616 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1617 fvdw12 = _mm256_mul_pd(c12_00,FF);
1618 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1620 fscal = _mm256_add_pd(felec,fvdw);
1622 /* Calculate temporary vectorial force */
1623 tx = _mm256_mul_pd(fscal,dx00);
1624 ty = _mm256_mul_pd(fscal,dy00);
1625 tz = _mm256_mul_pd(fscal,dz00);
1627 /* Update vectorial force */
1628 fix0 = _mm256_add_pd(fix0,tx);
1629 fiy0 = _mm256_add_pd(fiy0,ty);
1630 fiz0 = _mm256_add_pd(fiz0,tz);
1632 fjx0 = _mm256_add_pd(fjx0,tx);
1633 fjy0 = _mm256_add_pd(fjy0,ty);
1634 fjz0 = _mm256_add_pd(fjz0,tz);
1636 /**************************
1637 * CALCULATE INTERACTIONS *
1638 **************************/
1640 r01 = _mm256_mul_pd(rsq01,rinv01);
1642 /* EWALD ELECTROSTATICS */
1644 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1645 ewrt = _mm256_mul_pd(r01,ewtabscale);
1646 ewitab = _mm256_cvttpd_epi32(ewrt);
1647 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1648 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1649 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1651 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1652 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1656 /* Calculate temporary vectorial force */
1657 tx = _mm256_mul_pd(fscal,dx01);
1658 ty = _mm256_mul_pd(fscal,dy01);
1659 tz = _mm256_mul_pd(fscal,dz01);
1661 /* Update vectorial force */
1662 fix0 = _mm256_add_pd(fix0,tx);
1663 fiy0 = _mm256_add_pd(fiy0,ty);
1664 fiz0 = _mm256_add_pd(fiz0,tz);
1666 fjx1 = _mm256_add_pd(fjx1,tx);
1667 fjy1 = _mm256_add_pd(fjy1,ty);
1668 fjz1 = _mm256_add_pd(fjz1,tz);
1670 /**************************
1671 * CALCULATE INTERACTIONS *
1672 **************************/
1674 r02 = _mm256_mul_pd(rsq02,rinv02);
1676 /* EWALD ELECTROSTATICS */
1678 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1679 ewrt = _mm256_mul_pd(r02,ewtabscale);
1680 ewitab = _mm256_cvttpd_epi32(ewrt);
1681 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1682 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1683 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1685 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1686 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1690 /* Calculate temporary vectorial force */
1691 tx = _mm256_mul_pd(fscal,dx02);
1692 ty = _mm256_mul_pd(fscal,dy02);
1693 tz = _mm256_mul_pd(fscal,dz02);
1695 /* Update vectorial force */
1696 fix0 = _mm256_add_pd(fix0,tx);
1697 fiy0 = _mm256_add_pd(fiy0,ty);
1698 fiz0 = _mm256_add_pd(fiz0,tz);
1700 fjx2 = _mm256_add_pd(fjx2,tx);
1701 fjy2 = _mm256_add_pd(fjy2,ty);
1702 fjz2 = _mm256_add_pd(fjz2,tz);
1704 /**************************
1705 * CALCULATE INTERACTIONS *
1706 **************************/
1708 r10 = _mm256_mul_pd(rsq10,rinv10);
1710 /* EWALD ELECTROSTATICS */
1712 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1713 ewrt = _mm256_mul_pd(r10,ewtabscale);
1714 ewitab = _mm256_cvttpd_epi32(ewrt);
1715 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1716 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1717 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1719 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1720 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1724 /* Calculate temporary vectorial force */
1725 tx = _mm256_mul_pd(fscal,dx10);
1726 ty = _mm256_mul_pd(fscal,dy10);
1727 tz = _mm256_mul_pd(fscal,dz10);
1729 /* Update vectorial force */
1730 fix1 = _mm256_add_pd(fix1,tx);
1731 fiy1 = _mm256_add_pd(fiy1,ty);
1732 fiz1 = _mm256_add_pd(fiz1,tz);
1734 fjx0 = _mm256_add_pd(fjx0,tx);
1735 fjy0 = _mm256_add_pd(fjy0,ty);
1736 fjz0 = _mm256_add_pd(fjz0,tz);
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 r11 = _mm256_mul_pd(rsq11,rinv11);
1744 /* EWALD ELECTROSTATICS */
1746 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1747 ewrt = _mm256_mul_pd(r11,ewtabscale);
1748 ewitab = _mm256_cvttpd_epi32(ewrt);
1749 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1750 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1751 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1753 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1754 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1758 /* Calculate temporary vectorial force */
1759 tx = _mm256_mul_pd(fscal,dx11);
1760 ty = _mm256_mul_pd(fscal,dy11);
1761 tz = _mm256_mul_pd(fscal,dz11);
1763 /* Update vectorial force */
1764 fix1 = _mm256_add_pd(fix1,tx);
1765 fiy1 = _mm256_add_pd(fiy1,ty);
1766 fiz1 = _mm256_add_pd(fiz1,tz);
1768 fjx1 = _mm256_add_pd(fjx1,tx);
1769 fjy1 = _mm256_add_pd(fjy1,ty);
1770 fjz1 = _mm256_add_pd(fjz1,tz);
1772 /**************************
1773 * CALCULATE INTERACTIONS *
1774 **************************/
1776 r12 = _mm256_mul_pd(rsq12,rinv12);
1778 /* EWALD ELECTROSTATICS */
1780 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1781 ewrt = _mm256_mul_pd(r12,ewtabscale);
1782 ewitab = _mm256_cvttpd_epi32(ewrt);
1783 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1784 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1785 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1787 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1788 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1792 /* Calculate temporary vectorial force */
1793 tx = _mm256_mul_pd(fscal,dx12);
1794 ty = _mm256_mul_pd(fscal,dy12);
1795 tz = _mm256_mul_pd(fscal,dz12);
1797 /* Update vectorial force */
1798 fix1 = _mm256_add_pd(fix1,tx);
1799 fiy1 = _mm256_add_pd(fiy1,ty);
1800 fiz1 = _mm256_add_pd(fiz1,tz);
1802 fjx2 = _mm256_add_pd(fjx2,tx);
1803 fjy2 = _mm256_add_pd(fjy2,ty);
1804 fjz2 = _mm256_add_pd(fjz2,tz);
1806 /**************************
1807 * CALCULATE INTERACTIONS *
1808 **************************/
1810 r20 = _mm256_mul_pd(rsq20,rinv20);
1812 /* EWALD ELECTROSTATICS */
1814 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1815 ewrt = _mm256_mul_pd(r20,ewtabscale);
1816 ewitab = _mm256_cvttpd_epi32(ewrt);
1817 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1818 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1819 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1821 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1822 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1826 /* Calculate temporary vectorial force */
1827 tx = _mm256_mul_pd(fscal,dx20);
1828 ty = _mm256_mul_pd(fscal,dy20);
1829 tz = _mm256_mul_pd(fscal,dz20);
1831 /* Update vectorial force */
1832 fix2 = _mm256_add_pd(fix2,tx);
1833 fiy2 = _mm256_add_pd(fiy2,ty);
1834 fiz2 = _mm256_add_pd(fiz2,tz);
1836 fjx0 = _mm256_add_pd(fjx0,tx);
1837 fjy0 = _mm256_add_pd(fjy0,ty);
1838 fjz0 = _mm256_add_pd(fjz0,tz);
1840 /**************************
1841 * CALCULATE INTERACTIONS *
1842 **************************/
1844 r21 = _mm256_mul_pd(rsq21,rinv21);
1846 /* EWALD ELECTROSTATICS */
1848 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1849 ewrt = _mm256_mul_pd(r21,ewtabscale);
1850 ewitab = _mm256_cvttpd_epi32(ewrt);
1851 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1852 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1853 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1855 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1856 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1860 /* Calculate temporary vectorial force */
1861 tx = _mm256_mul_pd(fscal,dx21);
1862 ty = _mm256_mul_pd(fscal,dy21);
1863 tz = _mm256_mul_pd(fscal,dz21);
1865 /* Update vectorial force */
1866 fix2 = _mm256_add_pd(fix2,tx);
1867 fiy2 = _mm256_add_pd(fiy2,ty);
1868 fiz2 = _mm256_add_pd(fiz2,tz);
1870 fjx1 = _mm256_add_pd(fjx1,tx);
1871 fjy1 = _mm256_add_pd(fjy1,ty);
1872 fjz1 = _mm256_add_pd(fjz1,tz);
1874 /**************************
1875 * CALCULATE INTERACTIONS *
1876 **************************/
1878 r22 = _mm256_mul_pd(rsq22,rinv22);
1880 /* EWALD ELECTROSTATICS */
1882 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1883 ewrt = _mm256_mul_pd(r22,ewtabscale);
1884 ewitab = _mm256_cvttpd_epi32(ewrt);
1885 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1886 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1887 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1889 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1890 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1894 /* Calculate temporary vectorial force */
1895 tx = _mm256_mul_pd(fscal,dx22);
1896 ty = _mm256_mul_pd(fscal,dy22);
1897 tz = _mm256_mul_pd(fscal,dz22);
1899 /* Update vectorial force */
1900 fix2 = _mm256_add_pd(fix2,tx);
1901 fiy2 = _mm256_add_pd(fiy2,ty);
1902 fiz2 = _mm256_add_pd(fiz2,tz);
1904 fjx2 = _mm256_add_pd(fjx2,tx);
1905 fjy2 = _mm256_add_pd(fjy2,ty);
1906 fjz2 = _mm256_add_pd(fjz2,tz);
1908 fjptrA = f+j_coord_offsetA;
1909 fjptrB = f+j_coord_offsetB;
1910 fjptrC = f+j_coord_offsetC;
1911 fjptrD = f+j_coord_offsetD;
1913 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1914 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1916 /* Inner loop uses 350 flops */
1919 if(jidx<j_index_end)
1922 /* Get j neighbor index, and coordinate index */
1923 jnrlistA = jjnr[jidx];
1924 jnrlistB = jjnr[jidx+1];
1925 jnrlistC = jjnr[jidx+2];
1926 jnrlistD = jjnr[jidx+3];
1927 /* Sign of each element will be negative for non-real atoms.
1928 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1929 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1931 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1933 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1934 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1935 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1937 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1938 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1939 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1940 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1941 j_coord_offsetA = DIM*jnrA;
1942 j_coord_offsetB = DIM*jnrB;
1943 j_coord_offsetC = DIM*jnrC;
1944 j_coord_offsetD = DIM*jnrD;
1946 /* load j atom coordinates */
1947 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1948 x+j_coord_offsetC,x+j_coord_offsetD,
1949 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1951 /* Calculate displacement vector */
1952 dx00 = _mm256_sub_pd(ix0,jx0);
1953 dy00 = _mm256_sub_pd(iy0,jy0);
1954 dz00 = _mm256_sub_pd(iz0,jz0);
1955 dx01 = _mm256_sub_pd(ix0,jx1);
1956 dy01 = _mm256_sub_pd(iy0,jy1);
1957 dz01 = _mm256_sub_pd(iz0,jz1);
1958 dx02 = _mm256_sub_pd(ix0,jx2);
1959 dy02 = _mm256_sub_pd(iy0,jy2);
1960 dz02 = _mm256_sub_pd(iz0,jz2);
1961 dx10 = _mm256_sub_pd(ix1,jx0);
1962 dy10 = _mm256_sub_pd(iy1,jy0);
1963 dz10 = _mm256_sub_pd(iz1,jz0);
1964 dx11 = _mm256_sub_pd(ix1,jx1);
1965 dy11 = _mm256_sub_pd(iy1,jy1);
1966 dz11 = _mm256_sub_pd(iz1,jz1);
1967 dx12 = _mm256_sub_pd(ix1,jx2);
1968 dy12 = _mm256_sub_pd(iy1,jy2);
1969 dz12 = _mm256_sub_pd(iz1,jz2);
1970 dx20 = _mm256_sub_pd(ix2,jx0);
1971 dy20 = _mm256_sub_pd(iy2,jy0);
1972 dz20 = _mm256_sub_pd(iz2,jz0);
1973 dx21 = _mm256_sub_pd(ix2,jx1);
1974 dy21 = _mm256_sub_pd(iy2,jy1);
1975 dz21 = _mm256_sub_pd(iz2,jz1);
1976 dx22 = _mm256_sub_pd(ix2,jx2);
1977 dy22 = _mm256_sub_pd(iy2,jy2);
1978 dz22 = _mm256_sub_pd(iz2,jz2);
1980 /* Calculate squared distance and things based on it */
1981 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1982 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1983 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1984 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1985 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1986 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1987 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1988 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1989 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1991 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1992 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1993 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1994 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1995 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1996 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1997 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1998 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1999 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2001 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2002 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2003 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2004 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2005 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2006 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2007 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2008 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2009 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2011 fjx0 = _mm256_setzero_pd();
2012 fjy0 = _mm256_setzero_pd();
2013 fjz0 = _mm256_setzero_pd();
2014 fjx1 = _mm256_setzero_pd();
2015 fjy1 = _mm256_setzero_pd();
2016 fjz1 = _mm256_setzero_pd();
2017 fjx2 = _mm256_setzero_pd();
2018 fjy2 = _mm256_setzero_pd();
2019 fjz2 = _mm256_setzero_pd();
2021 /**************************
2022 * CALCULATE INTERACTIONS *
2023 **************************/
2025 r00 = _mm256_mul_pd(rsq00,rinv00);
2026 r00 = _mm256_andnot_pd(dummy_mask,r00);
2028 /* Calculate table index by multiplying r with table scale and truncate to integer */
2029 rt = _mm256_mul_pd(r00,vftabscale);
2030 vfitab = _mm256_cvttpd_epi32(rt);
2031 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2032 vfitab = _mm_slli_epi32(vfitab,3);
2034 /* EWALD ELECTROSTATICS */
2036 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2037 ewrt = _mm256_mul_pd(r00,ewtabscale);
2038 ewitab = _mm256_cvttpd_epi32(ewrt);
2039 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2040 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2041 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2043 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2044 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2046 /* CUBIC SPLINE TABLE DISPERSION */
2047 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2048 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2049 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2050 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2051 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2052 Heps = _mm256_mul_pd(vfeps,H);
2053 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2054 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2055 fvdw6 = _mm256_mul_pd(c6_00,FF);
2057 /* CUBIC SPLINE TABLE REPULSION */
2058 vfitab = _mm_add_epi32(vfitab,ifour);
2059 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2060 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2061 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2062 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2063 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2064 Heps = _mm256_mul_pd(vfeps,H);
2065 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2066 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2067 fvdw12 = _mm256_mul_pd(c12_00,FF);
2068 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2070 fscal = _mm256_add_pd(felec,fvdw);
2072 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2074 /* Calculate temporary vectorial force */
2075 tx = _mm256_mul_pd(fscal,dx00);
2076 ty = _mm256_mul_pd(fscal,dy00);
2077 tz = _mm256_mul_pd(fscal,dz00);
2079 /* Update vectorial force */
2080 fix0 = _mm256_add_pd(fix0,tx);
2081 fiy0 = _mm256_add_pd(fiy0,ty);
2082 fiz0 = _mm256_add_pd(fiz0,tz);
2084 fjx0 = _mm256_add_pd(fjx0,tx);
2085 fjy0 = _mm256_add_pd(fjy0,ty);
2086 fjz0 = _mm256_add_pd(fjz0,tz);
2088 /**************************
2089 * CALCULATE INTERACTIONS *
2090 **************************/
2092 r01 = _mm256_mul_pd(rsq01,rinv01);
2093 r01 = _mm256_andnot_pd(dummy_mask,r01);
2095 /* EWALD ELECTROSTATICS */
2097 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2098 ewrt = _mm256_mul_pd(r01,ewtabscale);
2099 ewitab = _mm256_cvttpd_epi32(ewrt);
2100 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2101 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2102 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2104 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2105 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2109 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2111 /* Calculate temporary vectorial force */
2112 tx = _mm256_mul_pd(fscal,dx01);
2113 ty = _mm256_mul_pd(fscal,dy01);
2114 tz = _mm256_mul_pd(fscal,dz01);
2116 /* Update vectorial force */
2117 fix0 = _mm256_add_pd(fix0,tx);
2118 fiy0 = _mm256_add_pd(fiy0,ty);
2119 fiz0 = _mm256_add_pd(fiz0,tz);
2121 fjx1 = _mm256_add_pd(fjx1,tx);
2122 fjy1 = _mm256_add_pd(fjy1,ty);
2123 fjz1 = _mm256_add_pd(fjz1,tz);
2125 /**************************
2126 * CALCULATE INTERACTIONS *
2127 **************************/
2129 r02 = _mm256_mul_pd(rsq02,rinv02);
2130 r02 = _mm256_andnot_pd(dummy_mask,r02);
2132 /* EWALD ELECTROSTATICS */
2134 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2135 ewrt = _mm256_mul_pd(r02,ewtabscale);
2136 ewitab = _mm256_cvttpd_epi32(ewrt);
2137 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2138 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2139 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2141 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2142 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2146 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2148 /* Calculate temporary vectorial force */
2149 tx = _mm256_mul_pd(fscal,dx02);
2150 ty = _mm256_mul_pd(fscal,dy02);
2151 tz = _mm256_mul_pd(fscal,dz02);
2153 /* Update vectorial force */
2154 fix0 = _mm256_add_pd(fix0,tx);
2155 fiy0 = _mm256_add_pd(fiy0,ty);
2156 fiz0 = _mm256_add_pd(fiz0,tz);
2158 fjx2 = _mm256_add_pd(fjx2,tx);
2159 fjy2 = _mm256_add_pd(fjy2,ty);
2160 fjz2 = _mm256_add_pd(fjz2,tz);
2162 /**************************
2163 * CALCULATE INTERACTIONS *
2164 **************************/
2166 r10 = _mm256_mul_pd(rsq10,rinv10);
2167 r10 = _mm256_andnot_pd(dummy_mask,r10);
2169 /* EWALD ELECTROSTATICS */
2171 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2172 ewrt = _mm256_mul_pd(r10,ewtabscale);
2173 ewitab = _mm256_cvttpd_epi32(ewrt);
2174 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2175 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2176 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2178 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2179 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2183 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2185 /* Calculate temporary vectorial force */
2186 tx = _mm256_mul_pd(fscal,dx10);
2187 ty = _mm256_mul_pd(fscal,dy10);
2188 tz = _mm256_mul_pd(fscal,dz10);
2190 /* Update vectorial force */
2191 fix1 = _mm256_add_pd(fix1,tx);
2192 fiy1 = _mm256_add_pd(fiy1,ty);
2193 fiz1 = _mm256_add_pd(fiz1,tz);
2195 fjx0 = _mm256_add_pd(fjx0,tx);
2196 fjy0 = _mm256_add_pd(fjy0,ty);
2197 fjz0 = _mm256_add_pd(fjz0,tz);
2199 /**************************
2200 * CALCULATE INTERACTIONS *
2201 **************************/
2203 r11 = _mm256_mul_pd(rsq11,rinv11);
2204 r11 = _mm256_andnot_pd(dummy_mask,r11);
2206 /* EWALD ELECTROSTATICS */
2208 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2209 ewrt = _mm256_mul_pd(r11,ewtabscale);
2210 ewitab = _mm256_cvttpd_epi32(ewrt);
2211 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2212 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2213 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2215 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2216 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2220 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2222 /* Calculate temporary vectorial force */
2223 tx = _mm256_mul_pd(fscal,dx11);
2224 ty = _mm256_mul_pd(fscal,dy11);
2225 tz = _mm256_mul_pd(fscal,dz11);
2227 /* Update vectorial force */
2228 fix1 = _mm256_add_pd(fix1,tx);
2229 fiy1 = _mm256_add_pd(fiy1,ty);
2230 fiz1 = _mm256_add_pd(fiz1,tz);
2232 fjx1 = _mm256_add_pd(fjx1,tx);
2233 fjy1 = _mm256_add_pd(fjy1,ty);
2234 fjz1 = _mm256_add_pd(fjz1,tz);
2236 /**************************
2237 * CALCULATE INTERACTIONS *
2238 **************************/
2240 r12 = _mm256_mul_pd(rsq12,rinv12);
2241 r12 = _mm256_andnot_pd(dummy_mask,r12);
2243 /* EWALD ELECTROSTATICS */
2245 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2246 ewrt = _mm256_mul_pd(r12,ewtabscale);
2247 ewitab = _mm256_cvttpd_epi32(ewrt);
2248 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2249 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2250 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2252 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2253 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2257 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2259 /* Calculate temporary vectorial force */
2260 tx = _mm256_mul_pd(fscal,dx12);
2261 ty = _mm256_mul_pd(fscal,dy12);
2262 tz = _mm256_mul_pd(fscal,dz12);
2264 /* Update vectorial force */
2265 fix1 = _mm256_add_pd(fix1,tx);
2266 fiy1 = _mm256_add_pd(fiy1,ty);
2267 fiz1 = _mm256_add_pd(fiz1,tz);
2269 fjx2 = _mm256_add_pd(fjx2,tx);
2270 fjy2 = _mm256_add_pd(fjy2,ty);
2271 fjz2 = _mm256_add_pd(fjz2,tz);
2273 /**************************
2274 * CALCULATE INTERACTIONS *
2275 **************************/
2277 r20 = _mm256_mul_pd(rsq20,rinv20);
2278 r20 = _mm256_andnot_pd(dummy_mask,r20);
2280 /* EWALD ELECTROSTATICS */
2282 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2283 ewrt = _mm256_mul_pd(r20,ewtabscale);
2284 ewitab = _mm256_cvttpd_epi32(ewrt);
2285 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2286 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2287 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2289 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2290 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2294 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2296 /* Calculate temporary vectorial force */
2297 tx = _mm256_mul_pd(fscal,dx20);
2298 ty = _mm256_mul_pd(fscal,dy20);
2299 tz = _mm256_mul_pd(fscal,dz20);
2301 /* Update vectorial force */
2302 fix2 = _mm256_add_pd(fix2,tx);
2303 fiy2 = _mm256_add_pd(fiy2,ty);
2304 fiz2 = _mm256_add_pd(fiz2,tz);
2306 fjx0 = _mm256_add_pd(fjx0,tx);
2307 fjy0 = _mm256_add_pd(fjy0,ty);
2308 fjz0 = _mm256_add_pd(fjz0,tz);
2310 /**************************
2311 * CALCULATE INTERACTIONS *
2312 **************************/
2314 r21 = _mm256_mul_pd(rsq21,rinv21);
2315 r21 = _mm256_andnot_pd(dummy_mask,r21);
2317 /* EWALD ELECTROSTATICS */
2319 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2320 ewrt = _mm256_mul_pd(r21,ewtabscale);
2321 ewitab = _mm256_cvttpd_epi32(ewrt);
2322 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2323 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2324 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2326 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2327 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2331 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2333 /* Calculate temporary vectorial force */
2334 tx = _mm256_mul_pd(fscal,dx21);
2335 ty = _mm256_mul_pd(fscal,dy21);
2336 tz = _mm256_mul_pd(fscal,dz21);
2338 /* Update vectorial force */
2339 fix2 = _mm256_add_pd(fix2,tx);
2340 fiy2 = _mm256_add_pd(fiy2,ty);
2341 fiz2 = _mm256_add_pd(fiz2,tz);
2343 fjx1 = _mm256_add_pd(fjx1,tx);
2344 fjy1 = _mm256_add_pd(fjy1,ty);
2345 fjz1 = _mm256_add_pd(fjz1,tz);
2347 /**************************
2348 * CALCULATE INTERACTIONS *
2349 **************************/
2351 r22 = _mm256_mul_pd(rsq22,rinv22);
2352 r22 = _mm256_andnot_pd(dummy_mask,r22);
2354 /* EWALD ELECTROSTATICS */
2356 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2357 ewrt = _mm256_mul_pd(r22,ewtabscale);
2358 ewitab = _mm256_cvttpd_epi32(ewrt);
2359 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2360 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2361 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2363 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2364 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2368 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2370 /* Calculate temporary vectorial force */
2371 tx = _mm256_mul_pd(fscal,dx22);
2372 ty = _mm256_mul_pd(fscal,dy22);
2373 tz = _mm256_mul_pd(fscal,dz22);
2375 /* Update vectorial force */
2376 fix2 = _mm256_add_pd(fix2,tx);
2377 fiy2 = _mm256_add_pd(fiy2,ty);
2378 fiz2 = _mm256_add_pd(fiz2,tz);
2380 fjx2 = _mm256_add_pd(fjx2,tx);
2381 fjy2 = _mm256_add_pd(fjy2,ty);
2382 fjz2 = _mm256_add_pd(fjz2,tz);
2384 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2385 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2386 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2387 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2389 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2390 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2392 /* Inner loop uses 359 flops */
2395 /* End of innermost loop */
2397 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2398 f+i_coord_offset,fshift+i_shift_offset);
2400 /* Increment number of inner iterations */
2401 inneriter += j_index_end - j_index_start;
2403 /* Outer loop uses 18 flops */
2406 /* Increment number of outer iterations */
2409 /* Update outer/inner flops */
2411 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);