2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 real * vdwioffsetptr3;
91 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
99 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
100 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
104 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
107 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
108 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
109 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
110 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
113 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
117 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
119 __m128i ifour = _mm_set1_epi32(4);
120 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
123 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
124 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
126 __m256d dummy_mask,cutoff_mask;
127 __m128 tmpmask0,tmpmask1;
128 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
129 __m256d one = _mm256_set1_pd(1.0);
130 __m256d two = _mm256_set1_pd(2.0);
136 jindex = nlist->jindex;
138 shiftidx = nlist->shift;
140 shiftvec = fr->shift_vec[0];
141 fshift = fr->fshift[0];
142 facel = _mm256_set1_pd(fr->epsfac);
143 charge = mdatoms->chargeA;
144 nvdwtype = fr->ntype;
146 vdwtype = mdatoms->typeA;
148 vftab = kernel_data->table_vdw->data;
149 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
151 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
152 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
153 beta2 = _mm256_mul_pd(beta,beta);
154 beta3 = _mm256_mul_pd(beta,beta2);
156 ewtab = fr->ic->tabq_coul_FDV0;
157 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
158 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
160 /* Setup water-specific parameters */
161 inr = nlist->iinr[0];
162 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
163 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
164 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
165 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
167 jq1 = _mm256_set1_pd(charge[inr+1]);
168 jq2 = _mm256_set1_pd(charge[inr+2]);
169 jq3 = _mm256_set1_pd(charge[inr+3]);
170 vdwjidx0A = 2*vdwtype[inr+0];
171 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
172 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
173 qq11 = _mm256_mul_pd(iq1,jq1);
174 qq12 = _mm256_mul_pd(iq1,jq2);
175 qq13 = _mm256_mul_pd(iq1,jq3);
176 qq21 = _mm256_mul_pd(iq2,jq1);
177 qq22 = _mm256_mul_pd(iq2,jq2);
178 qq23 = _mm256_mul_pd(iq2,jq3);
179 qq31 = _mm256_mul_pd(iq3,jq1);
180 qq32 = _mm256_mul_pd(iq3,jq2);
181 qq33 = _mm256_mul_pd(iq3,jq3);
183 /* Avoid stupid compiler warnings */
184 jnrA = jnrB = jnrC = jnrD = 0;
193 for(iidx=0;iidx<4*DIM;iidx++)
198 /* Start outer loop over neighborlists */
199 for(iidx=0; iidx<nri; iidx++)
201 /* Load shift vector for this list */
202 i_shift_offset = DIM*shiftidx[iidx];
204 /* Load limits for loop over neighbors */
205 j_index_start = jindex[iidx];
206 j_index_end = jindex[iidx+1];
208 /* Get outer coordinate index */
210 i_coord_offset = DIM*inr;
212 /* Load i particle coords and add shift vector */
213 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
214 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
216 fix0 = _mm256_setzero_pd();
217 fiy0 = _mm256_setzero_pd();
218 fiz0 = _mm256_setzero_pd();
219 fix1 = _mm256_setzero_pd();
220 fiy1 = _mm256_setzero_pd();
221 fiz1 = _mm256_setzero_pd();
222 fix2 = _mm256_setzero_pd();
223 fiy2 = _mm256_setzero_pd();
224 fiz2 = _mm256_setzero_pd();
225 fix3 = _mm256_setzero_pd();
226 fiy3 = _mm256_setzero_pd();
227 fiz3 = _mm256_setzero_pd();
229 /* Reset potential sums */
230 velecsum = _mm256_setzero_pd();
231 vvdwsum = _mm256_setzero_pd();
233 /* Start inner kernel loop */
234 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
237 /* Get j neighbor index, and coordinate index */
242 j_coord_offsetA = DIM*jnrA;
243 j_coord_offsetB = DIM*jnrB;
244 j_coord_offsetC = DIM*jnrC;
245 j_coord_offsetD = DIM*jnrD;
247 /* load j atom coordinates */
248 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
249 x+j_coord_offsetC,x+j_coord_offsetD,
250 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
251 &jy2,&jz2,&jx3,&jy3,&jz3);
253 /* Calculate displacement vector */
254 dx00 = _mm256_sub_pd(ix0,jx0);
255 dy00 = _mm256_sub_pd(iy0,jy0);
256 dz00 = _mm256_sub_pd(iz0,jz0);
257 dx11 = _mm256_sub_pd(ix1,jx1);
258 dy11 = _mm256_sub_pd(iy1,jy1);
259 dz11 = _mm256_sub_pd(iz1,jz1);
260 dx12 = _mm256_sub_pd(ix1,jx2);
261 dy12 = _mm256_sub_pd(iy1,jy2);
262 dz12 = _mm256_sub_pd(iz1,jz2);
263 dx13 = _mm256_sub_pd(ix1,jx3);
264 dy13 = _mm256_sub_pd(iy1,jy3);
265 dz13 = _mm256_sub_pd(iz1,jz3);
266 dx21 = _mm256_sub_pd(ix2,jx1);
267 dy21 = _mm256_sub_pd(iy2,jy1);
268 dz21 = _mm256_sub_pd(iz2,jz1);
269 dx22 = _mm256_sub_pd(ix2,jx2);
270 dy22 = _mm256_sub_pd(iy2,jy2);
271 dz22 = _mm256_sub_pd(iz2,jz2);
272 dx23 = _mm256_sub_pd(ix2,jx3);
273 dy23 = _mm256_sub_pd(iy2,jy3);
274 dz23 = _mm256_sub_pd(iz2,jz3);
275 dx31 = _mm256_sub_pd(ix3,jx1);
276 dy31 = _mm256_sub_pd(iy3,jy1);
277 dz31 = _mm256_sub_pd(iz3,jz1);
278 dx32 = _mm256_sub_pd(ix3,jx2);
279 dy32 = _mm256_sub_pd(iy3,jy2);
280 dz32 = _mm256_sub_pd(iz3,jz2);
281 dx33 = _mm256_sub_pd(ix3,jx3);
282 dy33 = _mm256_sub_pd(iy3,jy3);
283 dz33 = _mm256_sub_pd(iz3,jz3);
285 /* Calculate squared distance and things based on it */
286 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
287 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
288 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
289 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
290 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
291 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
292 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
293 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
294 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
295 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
297 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
298 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
299 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
300 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
301 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
302 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
303 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
304 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
305 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
306 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
308 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
309 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
310 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
311 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
312 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
313 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
314 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
315 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
316 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
318 fjx0 = _mm256_setzero_pd();
319 fjy0 = _mm256_setzero_pd();
320 fjz0 = _mm256_setzero_pd();
321 fjx1 = _mm256_setzero_pd();
322 fjy1 = _mm256_setzero_pd();
323 fjz1 = _mm256_setzero_pd();
324 fjx2 = _mm256_setzero_pd();
325 fjy2 = _mm256_setzero_pd();
326 fjz2 = _mm256_setzero_pd();
327 fjx3 = _mm256_setzero_pd();
328 fjy3 = _mm256_setzero_pd();
329 fjz3 = _mm256_setzero_pd();
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
335 r00 = _mm256_mul_pd(rsq00,rinv00);
337 /* Calculate table index by multiplying r with table scale and truncate to integer */
338 rt = _mm256_mul_pd(r00,vftabscale);
339 vfitab = _mm256_cvttpd_epi32(rt);
340 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
341 vfitab = _mm_slli_epi32(vfitab,3);
343 /* CUBIC SPLINE TABLE DISPERSION */
344 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
345 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
346 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
347 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
348 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
349 Heps = _mm256_mul_pd(vfeps,H);
350 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
351 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
352 vvdw6 = _mm256_mul_pd(c6_00,VV);
353 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
354 fvdw6 = _mm256_mul_pd(c6_00,FF);
356 /* CUBIC SPLINE TABLE REPULSION */
357 vfitab = _mm_add_epi32(vfitab,ifour);
358 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
359 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
360 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
361 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
362 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
363 Heps = _mm256_mul_pd(vfeps,H);
364 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
365 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
366 vvdw12 = _mm256_mul_pd(c12_00,VV);
367 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
368 fvdw12 = _mm256_mul_pd(c12_00,FF);
369 vvdw = _mm256_add_pd(vvdw12,vvdw6);
370 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
377 /* Calculate temporary vectorial force */
378 tx = _mm256_mul_pd(fscal,dx00);
379 ty = _mm256_mul_pd(fscal,dy00);
380 tz = _mm256_mul_pd(fscal,dz00);
382 /* Update vectorial force */
383 fix0 = _mm256_add_pd(fix0,tx);
384 fiy0 = _mm256_add_pd(fiy0,ty);
385 fiz0 = _mm256_add_pd(fiz0,tz);
387 fjx0 = _mm256_add_pd(fjx0,tx);
388 fjy0 = _mm256_add_pd(fjy0,ty);
389 fjz0 = _mm256_add_pd(fjz0,tz);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 r11 = _mm256_mul_pd(rsq11,rinv11);
397 /* EWALD ELECTROSTATICS */
399 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
400 ewrt = _mm256_mul_pd(r11,ewtabscale);
401 ewitab = _mm256_cvttpd_epi32(ewrt);
402 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
403 ewitab = _mm_slli_epi32(ewitab,2);
404 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
405 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
406 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
407 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
408 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
409 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
410 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
411 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
412 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
414 /* Update potential sum for this i atom from the interaction with this j atom. */
415 velecsum = _mm256_add_pd(velecsum,velec);
419 /* Calculate temporary vectorial force */
420 tx = _mm256_mul_pd(fscal,dx11);
421 ty = _mm256_mul_pd(fscal,dy11);
422 tz = _mm256_mul_pd(fscal,dz11);
424 /* Update vectorial force */
425 fix1 = _mm256_add_pd(fix1,tx);
426 fiy1 = _mm256_add_pd(fiy1,ty);
427 fiz1 = _mm256_add_pd(fiz1,tz);
429 fjx1 = _mm256_add_pd(fjx1,tx);
430 fjy1 = _mm256_add_pd(fjy1,ty);
431 fjz1 = _mm256_add_pd(fjz1,tz);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 r12 = _mm256_mul_pd(rsq12,rinv12);
439 /* EWALD ELECTROSTATICS */
441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442 ewrt = _mm256_mul_pd(r12,ewtabscale);
443 ewitab = _mm256_cvttpd_epi32(ewrt);
444 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
445 ewitab = _mm_slli_epi32(ewitab,2);
446 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
447 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
448 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
449 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
450 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
451 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
452 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
453 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
454 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velecsum = _mm256_add_pd(velecsum,velec);
461 /* Calculate temporary vectorial force */
462 tx = _mm256_mul_pd(fscal,dx12);
463 ty = _mm256_mul_pd(fscal,dy12);
464 tz = _mm256_mul_pd(fscal,dz12);
466 /* Update vectorial force */
467 fix1 = _mm256_add_pd(fix1,tx);
468 fiy1 = _mm256_add_pd(fiy1,ty);
469 fiz1 = _mm256_add_pd(fiz1,tz);
471 fjx2 = _mm256_add_pd(fjx2,tx);
472 fjy2 = _mm256_add_pd(fjy2,ty);
473 fjz2 = _mm256_add_pd(fjz2,tz);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 r13 = _mm256_mul_pd(rsq13,rinv13);
481 /* EWALD ELECTROSTATICS */
483 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
484 ewrt = _mm256_mul_pd(r13,ewtabscale);
485 ewitab = _mm256_cvttpd_epi32(ewrt);
486 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
487 ewitab = _mm_slli_epi32(ewitab,2);
488 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
489 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
490 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
491 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
492 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
493 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
494 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
495 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
496 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
498 /* Update potential sum for this i atom from the interaction with this j atom. */
499 velecsum = _mm256_add_pd(velecsum,velec);
503 /* Calculate temporary vectorial force */
504 tx = _mm256_mul_pd(fscal,dx13);
505 ty = _mm256_mul_pd(fscal,dy13);
506 tz = _mm256_mul_pd(fscal,dz13);
508 /* Update vectorial force */
509 fix1 = _mm256_add_pd(fix1,tx);
510 fiy1 = _mm256_add_pd(fiy1,ty);
511 fiz1 = _mm256_add_pd(fiz1,tz);
513 fjx3 = _mm256_add_pd(fjx3,tx);
514 fjy3 = _mm256_add_pd(fjy3,ty);
515 fjz3 = _mm256_add_pd(fjz3,tz);
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 r21 = _mm256_mul_pd(rsq21,rinv21);
523 /* EWALD ELECTROSTATICS */
525 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
526 ewrt = _mm256_mul_pd(r21,ewtabscale);
527 ewitab = _mm256_cvttpd_epi32(ewrt);
528 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
529 ewitab = _mm_slli_epi32(ewitab,2);
530 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
531 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
532 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
533 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
534 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
535 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
536 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
537 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
538 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
540 /* Update potential sum for this i atom from the interaction with this j atom. */
541 velecsum = _mm256_add_pd(velecsum,velec);
545 /* Calculate temporary vectorial force */
546 tx = _mm256_mul_pd(fscal,dx21);
547 ty = _mm256_mul_pd(fscal,dy21);
548 tz = _mm256_mul_pd(fscal,dz21);
550 /* Update vectorial force */
551 fix2 = _mm256_add_pd(fix2,tx);
552 fiy2 = _mm256_add_pd(fiy2,ty);
553 fiz2 = _mm256_add_pd(fiz2,tz);
555 fjx1 = _mm256_add_pd(fjx1,tx);
556 fjy1 = _mm256_add_pd(fjy1,ty);
557 fjz1 = _mm256_add_pd(fjz1,tz);
559 /**************************
560 * CALCULATE INTERACTIONS *
561 **************************/
563 r22 = _mm256_mul_pd(rsq22,rinv22);
565 /* EWALD ELECTROSTATICS */
567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
568 ewrt = _mm256_mul_pd(r22,ewtabscale);
569 ewitab = _mm256_cvttpd_epi32(ewrt);
570 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
571 ewitab = _mm_slli_epi32(ewitab,2);
572 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
573 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
574 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
575 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
576 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
577 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
578 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
579 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
580 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 velecsum = _mm256_add_pd(velecsum,velec);
587 /* Calculate temporary vectorial force */
588 tx = _mm256_mul_pd(fscal,dx22);
589 ty = _mm256_mul_pd(fscal,dy22);
590 tz = _mm256_mul_pd(fscal,dz22);
592 /* Update vectorial force */
593 fix2 = _mm256_add_pd(fix2,tx);
594 fiy2 = _mm256_add_pd(fiy2,ty);
595 fiz2 = _mm256_add_pd(fiz2,tz);
597 fjx2 = _mm256_add_pd(fjx2,tx);
598 fjy2 = _mm256_add_pd(fjy2,ty);
599 fjz2 = _mm256_add_pd(fjz2,tz);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r23 = _mm256_mul_pd(rsq23,rinv23);
607 /* EWALD ELECTROSTATICS */
609 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
610 ewrt = _mm256_mul_pd(r23,ewtabscale);
611 ewitab = _mm256_cvttpd_epi32(ewrt);
612 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
613 ewitab = _mm_slli_epi32(ewitab,2);
614 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
615 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
616 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
617 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
618 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
619 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
620 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
621 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
622 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
624 /* Update potential sum for this i atom from the interaction with this j atom. */
625 velecsum = _mm256_add_pd(velecsum,velec);
629 /* Calculate temporary vectorial force */
630 tx = _mm256_mul_pd(fscal,dx23);
631 ty = _mm256_mul_pd(fscal,dy23);
632 tz = _mm256_mul_pd(fscal,dz23);
634 /* Update vectorial force */
635 fix2 = _mm256_add_pd(fix2,tx);
636 fiy2 = _mm256_add_pd(fiy2,ty);
637 fiz2 = _mm256_add_pd(fiz2,tz);
639 fjx3 = _mm256_add_pd(fjx3,tx);
640 fjy3 = _mm256_add_pd(fjy3,ty);
641 fjz3 = _mm256_add_pd(fjz3,tz);
643 /**************************
644 * CALCULATE INTERACTIONS *
645 **************************/
647 r31 = _mm256_mul_pd(rsq31,rinv31);
649 /* EWALD ELECTROSTATICS */
651 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
652 ewrt = _mm256_mul_pd(r31,ewtabscale);
653 ewitab = _mm256_cvttpd_epi32(ewrt);
654 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
655 ewitab = _mm_slli_epi32(ewitab,2);
656 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
657 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
658 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
659 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
660 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
661 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
662 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
663 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
664 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
666 /* Update potential sum for this i atom from the interaction with this j atom. */
667 velecsum = _mm256_add_pd(velecsum,velec);
671 /* Calculate temporary vectorial force */
672 tx = _mm256_mul_pd(fscal,dx31);
673 ty = _mm256_mul_pd(fscal,dy31);
674 tz = _mm256_mul_pd(fscal,dz31);
676 /* Update vectorial force */
677 fix3 = _mm256_add_pd(fix3,tx);
678 fiy3 = _mm256_add_pd(fiy3,ty);
679 fiz3 = _mm256_add_pd(fiz3,tz);
681 fjx1 = _mm256_add_pd(fjx1,tx);
682 fjy1 = _mm256_add_pd(fjy1,ty);
683 fjz1 = _mm256_add_pd(fjz1,tz);
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
689 r32 = _mm256_mul_pd(rsq32,rinv32);
691 /* EWALD ELECTROSTATICS */
693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
694 ewrt = _mm256_mul_pd(r32,ewtabscale);
695 ewitab = _mm256_cvttpd_epi32(ewrt);
696 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
697 ewitab = _mm_slli_epi32(ewitab,2);
698 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
699 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
700 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
701 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
702 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
703 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
704 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
705 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
706 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
708 /* Update potential sum for this i atom from the interaction with this j atom. */
709 velecsum = _mm256_add_pd(velecsum,velec);
713 /* Calculate temporary vectorial force */
714 tx = _mm256_mul_pd(fscal,dx32);
715 ty = _mm256_mul_pd(fscal,dy32);
716 tz = _mm256_mul_pd(fscal,dz32);
718 /* Update vectorial force */
719 fix3 = _mm256_add_pd(fix3,tx);
720 fiy3 = _mm256_add_pd(fiy3,ty);
721 fiz3 = _mm256_add_pd(fiz3,tz);
723 fjx2 = _mm256_add_pd(fjx2,tx);
724 fjy2 = _mm256_add_pd(fjy2,ty);
725 fjz2 = _mm256_add_pd(fjz2,tz);
727 /**************************
728 * CALCULATE INTERACTIONS *
729 **************************/
731 r33 = _mm256_mul_pd(rsq33,rinv33);
733 /* EWALD ELECTROSTATICS */
735 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
736 ewrt = _mm256_mul_pd(r33,ewtabscale);
737 ewitab = _mm256_cvttpd_epi32(ewrt);
738 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
739 ewitab = _mm_slli_epi32(ewitab,2);
740 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
741 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
742 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
743 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
744 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
745 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
746 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
747 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
748 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
750 /* Update potential sum for this i atom from the interaction with this j atom. */
751 velecsum = _mm256_add_pd(velecsum,velec);
755 /* Calculate temporary vectorial force */
756 tx = _mm256_mul_pd(fscal,dx33);
757 ty = _mm256_mul_pd(fscal,dy33);
758 tz = _mm256_mul_pd(fscal,dz33);
760 /* Update vectorial force */
761 fix3 = _mm256_add_pd(fix3,tx);
762 fiy3 = _mm256_add_pd(fiy3,ty);
763 fiz3 = _mm256_add_pd(fiz3,tz);
765 fjx3 = _mm256_add_pd(fjx3,tx);
766 fjy3 = _mm256_add_pd(fjy3,ty);
767 fjz3 = _mm256_add_pd(fjz3,tz);
769 fjptrA = f+j_coord_offsetA;
770 fjptrB = f+j_coord_offsetB;
771 fjptrC = f+j_coord_offsetC;
772 fjptrD = f+j_coord_offsetD;
774 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
775 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
776 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
778 /* Inner loop uses 428 flops */
784 /* Get j neighbor index, and coordinate index */
785 jnrlistA = jjnr[jidx];
786 jnrlistB = jjnr[jidx+1];
787 jnrlistC = jjnr[jidx+2];
788 jnrlistD = jjnr[jidx+3];
789 /* Sign of each element will be negative for non-real atoms.
790 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
791 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
793 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
795 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
796 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
797 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
799 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
800 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
801 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
802 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
803 j_coord_offsetA = DIM*jnrA;
804 j_coord_offsetB = DIM*jnrB;
805 j_coord_offsetC = DIM*jnrC;
806 j_coord_offsetD = DIM*jnrD;
808 /* load j atom coordinates */
809 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
810 x+j_coord_offsetC,x+j_coord_offsetD,
811 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
812 &jy2,&jz2,&jx3,&jy3,&jz3);
814 /* Calculate displacement vector */
815 dx00 = _mm256_sub_pd(ix0,jx0);
816 dy00 = _mm256_sub_pd(iy0,jy0);
817 dz00 = _mm256_sub_pd(iz0,jz0);
818 dx11 = _mm256_sub_pd(ix1,jx1);
819 dy11 = _mm256_sub_pd(iy1,jy1);
820 dz11 = _mm256_sub_pd(iz1,jz1);
821 dx12 = _mm256_sub_pd(ix1,jx2);
822 dy12 = _mm256_sub_pd(iy1,jy2);
823 dz12 = _mm256_sub_pd(iz1,jz2);
824 dx13 = _mm256_sub_pd(ix1,jx3);
825 dy13 = _mm256_sub_pd(iy1,jy3);
826 dz13 = _mm256_sub_pd(iz1,jz3);
827 dx21 = _mm256_sub_pd(ix2,jx1);
828 dy21 = _mm256_sub_pd(iy2,jy1);
829 dz21 = _mm256_sub_pd(iz2,jz1);
830 dx22 = _mm256_sub_pd(ix2,jx2);
831 dy22 = _mm256_sub_pd(iy2,jy2);
832 dz22 = _mm256_sub_pd(iz2,jz2);
833 dx23 = _mm256_sub_pd(ix2,jx3);
834 dy23 = _mm256_sub_pd(iy2,jy3);
835 dz23 = _mm256_sub_pd(iz2,jz3);
836 dx31 = _mm256_sub_pd(ix3,jx1);
837 dy31 = _mm256_sub_pd(iy3,jy1);
838 dz31 = _mm256_sub_pd(iz3,jz1);
839 dx32 = _mm256_sub_pd(ix3,jx2);
840 dy32 = _mm256_sub_pd(iy3,jy2);
841 dz32 = _mm256_sub_pd(iz3,jz2);
842 dx33 = _mm256_sub_pd(ix3,jx3);
843 dy33 = _mm256_sub_pd(iy3,jy3);
844 dz33 = _mm256_sub_pd(iz3,jz3);
846 /* Calculate squared distance and things based on it */
847 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
848 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
849 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
850 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
851 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
852 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
853 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
854 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
855 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
856 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
858 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
859 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
860 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
861 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
862 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
863 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
864 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
865 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
866 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
867 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
869 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
870 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
871 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
872 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
873 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
874 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
875 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
876 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
877 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
879 fjx0 = _mm256_setzero_pd();
880 fjy0 = _mm256_setzero_pd();
881 fjz0 = _mm256_setzero_pd();
882 fjx1 = _mm256_setzero_pd();
883 fjy1 = _mm256_setzero_pd();
884 fjz1 = _mm256_setzero_pd();
885 fjx2 = _mm256_setzero_pd();
886 fjy2 = _mm256_setzero_pd();
887 fjz2 = _mm256_setzero_pd();
888 fjx3 = _mm256_setzero_pd();
889 fjy3 = _mm256_setzero_pd();
890 fjz3 = _mm256_setzero_pd();
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 r00 = _mm256_mul_pd(rsq00,rinv00);
897 r00 = _mm256_andnot_pd(dummy_mask,r00);
899 /* Calculate table index by multiplying r with table scale and truncate to integer */
900 rt = _mm256_mul_pd(r00,vftabscale);
901 vfitab = _mm256_cvttpd_epi32(rt);
902 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
903 vfitab = _mm_slli_epi32(vfitab,3);
905 /* CUBIC SPLINE TABLE DISPERSION */
906 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
907 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
908 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
909 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
910 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
911 Heps = _mm256_mul_pd(vfeps,H);
912 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
913 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
914 vvdw6 = _mm256_mul_pd(c6_00,VV);
915 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
916 fvdw6 = _mm256_mul_pd(c6_00,FF);
918 /* CUBIC SPLINE TABLE REPULSION */
919 vfitab = _mm_add_epi32(vfitab,ifour);
920 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
921 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
922 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
923 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
924 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
925 Heps = _mm256_mul_pd(vfeps,H);
926 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
927 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
928 vvdw12 = _mm256_mul_pd(c12_00,VV);
929 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
930 fvdw12 = _mm256_mul_pd(c12_00,FF);
931 vvdw = _mm256_add_pd(vvdw12,vvdw6);
932 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
934 /* Update potential sum for this i atom from the interaction with this j atom. */
935 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
936 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
940 fscal = _mm256_andnot_pd(dummy_mask,fscal);
942 /* Calculate temporary vectorial force */
943 tx = _mm256_mul_pd(fscal,dx00);
944 ty = _mm256_mul_pd(fscal,dy00);
945 tz = _mm256_mul_pd(fscal,dz00);
947 /* Update vectorial force */
948 fix0 = _mm256_add_pd(fix0,tx);
949 fiy0 = _mm256_add_pd(fiy0,ty);
950 fiz0 = _mm256_add_pd(fiz0,tz);
952 fjx0 = _mm256_add_pd(fjx0,tx);
953 fjy0 = _mm256_add_pd(fjy0,ty);
954 fjz0 = _mm256_add_pd(fjz0,tz);
956 /**************************
957 * CALCULATE INTERACTIONS *
958 **************************/
960 r11 = _mm256_mul_pd(rsq11,rinv11);
961 r11 = _mm256_andnot_pd(dummy_mask,r11);
963 /* EWALD ELECTROSTATICS */
965 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
966 ewrt = _mm256_mul_pd(r11,ewtabscale);
967 ewitab = _mm256_cvttpd_epi32(ewrt);
968 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
969 ewitab = _mm_slli_epi32(ewitab,2);
970 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
971 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
972 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
973 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
974 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
975 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
976 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
977 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
978 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
980 /* Update potential sum for this i atom from the interaction with this j atom. */
981 velec = _mm256_andnot_pd(dummy_mask,velec);
982 velecsum = _mm256_add_pd(velecsum,velec);
986 fscal = _mm256_andnot_pd(dummy_mask,fscal);
988 /* Calculate temporary vectorial force */
989 tx = _mm256_mul_pd(fscal,dx11);
990 ty = _mm256_mul_pd(fscal,dy11);
991 tz = _mm256_mul_pd(fscal,dz11);
993 /* Update vectorial force */
994 fix1 = _mm256_add_pd(fix1,tx);
995 fiy1 = _mm256_add_pd(fiy1,ty);
996 fiz1 = _mm256_add_pd(fiz1,tz);
998 fjx1 = _mm256_add_pd(fjx1,tx);
999 fjy1 = _mm256_add_pd(fjy1,ty);
1000 fjz1 = _mm256_add_pd(fjz1,tz);
1002 /**************************
1003 * CALCULATE INTERACTIONS *
1004 **************************/
1006 r12 = _mm256_mul_pd(rsq12,rinv12);
1007 r12 = _mm256_andnot_pd(dummy_mask,r12);
1009 /* EWALD ELECTROSTATICS */
1011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012 ewrt = _mm256_mul_pd(r12,ewtabscale);
1013 ewitab = _mm256_cvttpd_epi32(ewrt);
1014 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1015 ewitab = _mm_slli_epi32(ewitab,2);
1016 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1017 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1018 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1019 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1020 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1021 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1022 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1023 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1024 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1026 /* Update potential sum for this i atom from the interaction with this j atom. */
1027 velec = _mm256_andnot_pd(dummy_mask,velec);
1028 velecsum = _mm256_add_pd(velecsum,velec);
1032 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1034 /* Calculate temporary vectorial force */
1035 tx = _mm256_mul_pd(fscal,dx12);
1036 ty = _mm256_mul_pd(fscal,dy12);
1037 tz = _mm256_mul_pd(fscal,dz12);
1039 /* Update vectorial force */
1040 fix1 = _mm256_add_pd(fix1,tx);
1041 fiy1 = _mm256_add_pd(fiy1,ty);
1042 fiz1 = _mm256_add_pd(fiz1,tz);
1044 fjx2 = _mm256_add_pd(fjx2,tx);
1045 fjy2 = _mm256_add_pd(fjy2,ty);
1046 fjz2 = _mm256_add_pd(fjz2,tz);
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 r13 = _mm256_mul_pd(rsq13,rinv13);
1053 r13 = _mm256_andnot_pd(dummy_mask,r13);
1055 /* EWALD ELECTROSTATICS */
1057 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1058 ewrt = _mm256_mul_pd(r13,ewtabscale);
1059 ewitab = _mm256_cvttpd_epi32(ewrt);
1060 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1061 ewitab = _mm_slli_epi32(ewitab,2);
1062 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1063 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1064 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1065 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1066 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1067 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1068 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1069 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1070 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1072 /* Update potential sum for this i atom from the interaction with this j atom. */
1073 velec = _mm256_andnot_pd(dummy_mask,velec);
1074 velecsum = _mm256_add_pd(velecsum,velec);
1078 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1080 /* Calculate temporary vectorial force */
1081 tx = _mm256_mul_pd(fscal,dx13);
1082 ty = _mm256_mul_pd(fscal,dy13);
1083 tz = _mm256_mul_pd(fscal,dz13);
1085 /* Update vectorial force */
1086 fix1 = _mm256_add_pd(fix1,tx);
1087 fiy1 = _mm256_add_pd(fiy1,ty);
1088 fiz1 = _mm256_add_pd(fiz1,tz);
1090 fjx3 = _mm256_add_pd(fjx3,tx);
1091 fjy3 = _mm256_add_pd(fjy3,ty);
1092 fjz3 = _mm256_add_pd(fjz3,tz);
1094 /**************************
1095 * CALCULATE INTERACTIONS *
1096 **************************/
1098 r21 = _mm256_mul_pd(rsq21,rinv21);
1099 r21 = _mm256_andnot_pd(dummy_mask,r21);
1101 /* EWALD ELECTROSTATICS */
1103 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1104 ewrt = _mm256_mul_pd(r21,ewtabscale);
1105 ewitab = _mm256_cvttpd_epi32(ewrt);
1106 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1107 ewitab = _mm_slli_epi32(ewitab,2);
1108 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1109 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1110 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1111 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1112 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1113 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1114 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1115 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1116 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1118 /* Update potential sum for this i atom from the interaction with this j atom. */
1119 velec = _mm256_andnot_pd(dummy_mask,velec);
1120 velecsum = _mm256_add_pd(velecsum,velec);
1124 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1126 /* Calculate temporary vectorial force */
1127 tx = _mm256_mul_pd(fscal,dx21);
1128 ty = _mm256_mul_pd(fscal,dy21);
1129 tz = _mm256_mul_pd(fscal,dz21);
1131 /* Update vectorial force */
1132 fix2 = _mm256_add_pd(fix2,tx);
1133 fiy2 = _mm256_add_pd(fiy2,ty);
1134 fiz2 = _mm256_add_pd(fiz2,tz);
1136 fjx1 = _mm256_add_pd(fjx1,tx);
1137 fjy1 = _mm256_add_pd(fjy1,ty);
1138 fjz1 = _mm256_add_pd(fjz1,tz);
1140 /**************************
1141 * CALCULATE INTERACTIONS *
1142 **************************/
1144 r22 = _mm256_mul_pd(rsq22,rinv22);
1145 r22 = _mm256_andnot_pd(dummy_mask,r22);
1147 /* EWALD ELECTROSTATICS */
1149 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150 ewrt = _mm256_mul_pd(r22,ewtabscale);
1151 ewitab = _mm256_cvttpd_epi32(ewrt);
1152 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1153 ewitab = _mm_slli_epi32(ewitab,2);
1154 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1155 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1156 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1157 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1158 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1159 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1160 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1161 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1162 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1164 /* Update potential sum for this i atom from the interaction with this j atom. */
1165 velec = _mm256_andnot_pd(dummy_mask,velec);
1166 velecsum = _mm256_add_pd(velecsum,velec);
1170 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1172 /* Calculate temporary vectorial force */
1173 tx = _mm256_mul_pd(fscal,dx22);
1174 ty = _mm256_mul_pd(fscal,dy22);
1175 tz = _mm256_mul_pd(fscal,dz22);
1177 /* Update vectorial force */
1178 fix2 = _mm256_add_pd(fix2,tx);
1179 fiy2 = _mm256_add_pd(fiy2,ty);
1180 fiz2 = _mm256_add_pd(fiz2,tz);
1182 fjx2 = _mm256_add_pd(fjx2,tx);
1183 fjy2 = _mm256_add_pd(fjy2,ty);
1184 fjz2 = _mm256_add_pd(fjz2,tz);
1186 /**************************
1187 * CALCULATE INTERACTIONS *
1188 **************************/
1190 r23 = _mm256_mul_pd(rsq23,rinv23);
1191 r23 = _mm256_andnot_pd(dummy_mask,r23);
1193 /* EWALD ELECTROSTATICS */
1195 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1196 ewrt = _mm256_mul_pd(r23,ewtabscale);
1197 ewitab = _mm256_cvttpd_epi32(ewrt);
1198 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1199 ewitab = _mm_slli_epi32(ewitab,2);
1200 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1201 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1202 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1203 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1204 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1205 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1206 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1207 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1208 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1210 /* Update potential sum for this i atom from the interaction with this j atom. */
1211 velec = _mm256_andnot_pd(dummy_mask,velec);
1212 velecsum = _mm256_add_pd(velecsum,velec);
1216 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1218 /* Calculate temporary vectorial force */
1219 tx = _mm256_mul_pd(fscal,dx23);
1220 ty = _mm256_mul_pd(fscal,dy23);
1221 tz = _mm256_mul_pd(fscal,dz23);
1223 /* Update vectorial force */
1224 fix2 = _mm256_add_pd(fix2,tx);
1225 fiy2 = _mm256_add_pd(fiy2,ty);
1226 fiz2 = _mm256_add_pd(fiz2,tz);
1228 fjx3 = _mm256_add_pd(fjx3,tx);
1229 fjy3 = _mm256_add_pd(fjy3,ty);
1230 fjz3 = _mm256_add_pd(fjz3,tz);
1232 /**************************
1233 * CALCULATE INTERACTIONS *
1234 **************************/
1236 r31 = _mm256_mul_pd(rsq31,rinv31);
1237 r31 = _mm256_andnot_pd(dummy_mask,r31);
1239 /* EWALD ELECTROSTATICS */
1241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242 ewrt = _mm256_mul_pd(r31,ewtabscale);
1243 ewitab = _mm256_cvttpd_epi32(ewrt);
1244 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1245 ewitab = _mm_slli_epi32(ewitab,2);
1246 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1247 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1248 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1249 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1250 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1251 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1252 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1253 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1254 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1256 /* Update potential sum for this i atom from the interaction with this j atom. */
1257 velec = _mm256_andnot_pd(dummy_mask,velec);
1258 velecsum = _mm256_add_pd(velecsum,velec);
1262 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1264 /* Calculate temporary vectorial force */
1265 tx = _mm256_mul_pd(fscal,dx31);
1266 ty = _mm256_mul_pd(fscal,dy31);
1267 tz = _mm256_mul_pd(fscal,dz31);
1269 /* Update vectorial force */
1270 fix3 = _mm256_add_pd(fix3,tx);
1271 fiy3 = _mm256_add_pd(fiy3,ty);
1272 fiz3 = _mm256_add_pd(fiz3,tz);
1274 fjx1 = _mm256_add_pd(fjx1,tx);
1275 fjy1 = _mm256_add_pd(fjy1,ty);
1276 fjz1 = _mm256_add_pd(fjz1,tz);
1278 /**************************
1279 * CALCULATE INTERACTIONS *
1280 **************************/
1282 r32 = _mm256_mul_pd(rsq32,rinv32);
1283 r32 = _mm256_andnot_pd(dummy_mask,r32);
1285 /* EWALD ELECTROSTATICS */
1287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1288 ewrt = _mm256_mul_pd(r32,ewtabscale);
1289 ewitab = _mm256_cvttpd_epi32(ewrt);
1290 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1291 ewitab = _mm_slli_epi32(ewitab,2);
1292 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1293 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1294 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1295 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1296 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1297 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1298 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1299 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1300 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1302 /* Update potential sum for this i atom from the interaction with this j atom. */
1303 velec = _mm256_andnot_pd(dummy_mask,velec);
1304 velecsum = _mm256_add_pd(velecsum,velec);
1308 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1310 /* Calculate temporary vectorial force */
1311 tx = _mm256_mul_pd(fscal,dx32);
1312 ty = _mm256_mul_pd(fscal,dy32);
1313 tz = _mm256_mul_pd(fscal,dz32);
1315 /* Update vectorial force */
1316 fix3 = _mm256_add_pd(fix3,tx);
1317 fiy3 = _mm256_add_pd(fiy3,ty);
1318 fiz3 = _mm256_add_pd(fiz3,tz);
1320 fjx2 = _mm256_add_pd(fjx2,tx);
1321 fjy2 = _mm256_add_pd(fjy2,ty);
1322 fjz2 = _mm256_add_pd(fjz2,tz);
1324 /**************************
1325 * CALCULATE INTERACTIONS *
1326 **************************/
1328 r33 = _mm256_mul_pd(rsq33,rinv33);
1329 r33 = _mm256_andnot_pd(dummy_mask,r33);
1331 /* EWALD ELECTROSTATICS */
1333 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1334 ewrt = _mm256_mul_pd(r33,ewtabscale);
1335 ewitab = _mm256_cvttpd_epi32(ewrt);
1336 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1337 ewitab = _mm_slli_epi32(ewitab,2);
1338 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1339 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1340 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1341 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1342 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1343 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1344 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1345 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1346 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1348 /* Update potential sum for this i atom from the interaction with this j atom. */
1349 velec = _mm256_andnot_pd(dummy_mask,velec);
1350 velecsum = _mm256_add_pd(velecsum,velec);
1354 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1356 /* Calculate temporary vectorial force */
1357 tx = _mm256_mul_pd(fscal,dx33);
1358 ty = _mm256_mul_pd(fscal,dy33);
1359 tz = _mm256_mul_pd(fscal,dz33);
1361 /* Update vectorial force */
1362 fix3 = _mm256_add_pd(fix3,tx);
1363 fiy3 = _mm256_add_pd(fiy3,ty);
1364 fiz3 = _mm256_add_pd(fiz3,tz);
1366 fjx3 = _mm256_add_pd(fjx3,tx);
1367 fjy3 = _mm256_add_pd(fjy3,ty);
1368 fjz3 = _mm256_add_pd(fjz3,tz);
1370 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1371 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1372 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1373 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1375 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1376 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1377 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1379 /* Inner loop uses 438 flops */
1382 /* End of innermost loop */
1384 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1385 f+i_coord_offset,fshift+i_shift_offset);
1388 /* Update potential energies */
1389 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1390 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1392 /* Increment number of inner iterations */
1393 inneriter += j_index_end - j_index_start;
1395 /* Outer loop uses 26 flops */
1398 /* Increment number of outer iterations */
1401 /* Update outer/inner flops */
1403 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*438);
1406 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double
1407 * Electrostatics interaction: Ewald
1408 * VdW interaction: CubicSplineTable
1409 * Geometry: Water4-Water4
1410 * Calculate force/pot: Force
1413 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double
1414 (t_nblist * gmx_restrict nlist,
1415 rvec * gmx_restrict xx,
1416 rvec * gmx_restrict ff,
1417 t_forcerec * gmx_restrict fr,
1418 t_mdatoms * gmx_restrict mdatoms,
1419 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1420 t_nrnb * gmx_restrict nrnb)
1422 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1423 * just 0 for non-waters.
1424 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1425 * jnr indices corresponding to data put in the four positions in the SIMD register.
1427 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1428 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1429 int jnrA,jnrB,jnrC,jnrD;
1430 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1431 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1432 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1433 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1434 real rcutoff_scalar;
1435 real *shiftvec,*fshift,*x,*f;
1436 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1437 real scratch[4*DIM];
1438 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1439 real * vdwioffsetptr0;
1440 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1441 real * vdwioffsetptr1;
1442 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1443 real * vdwioffsetptr2;
1444 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1445 real * vdwioffsetptr3;
1446 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1447 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1448 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1449 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1450 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1451 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1452 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1453 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1454 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1455 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1456 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1457 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1458 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1459 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1460 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1461 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1462 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1463 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1464 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1465 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1468 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1471 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1472 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1474 __m128i ifour = _mm_set1_epi32(4);
1475 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1478 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1479 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1481 __m256d dummy_mask,cutoff_mask;
1482 __m128 tmpmask0,tmpmask1;
1483 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1484 __m256d one = _mm256_set1_pd(1.0);
1485 __m256d two = _mm256_set1_pd(2.0);
1491 jindex = nlist->jindex;
1493 shiftidx = nlist->shift;
1495 shiftvec = fr->shift_vec[0];
1496 fshift = fr->fshift[0];
1497 facel = _mm256_set1_pd(fr->epsfac);
1498 charge = mdatoms->chargeA;
1499 nvdwtype = fr->ntype;
1500 vdwparam = fr->nbfp;
1501 vdwtype = mdatoms->typeA;
1503 vftab = kernel_data->table_vdw->data;
1504 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
1506 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1507 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1508 beta2 = _mm256_mul_pd(beta,beta);
1509 beta3 = _mm256_mul_pd(beta,beta2);
1511 ewtab = fr->ic->tabq_coul_F;
1512 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1513 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1515 /* Setup water-specific parameters */
1516 inr = nlist->iinr[0];
1517 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1518 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1519 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1520 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1522 jq1 = _mm256_set1_pd(charge[inr+1]);
1523 jq2 = _mm256_set1_pd(charge[inr+2]);
1524 jq3 = _mm256_set1_pd(charge[inr+3]);
1525 vdwjidx0A = 2*vdwtype[inr+0];
1526 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1527 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1528 qq11 = _mm256_mul_pd(iq1,jq1);
1529 qq12 = _mm256_mul_pd(iq1,jq2);
1530 qq13 = _mm256_mul_pd(iq1,jq3);
1531 qq21 = _mm256_mul_pd(iq2,jq1);
1532 qq22 = _mm256_mul_pd(iq2,jq2);
1533 qq23 = _mm256_mul_pd(iq2,jq3);
1534 qq31 = _mm256_mul_pd(iq3,jq1);
1535 qq32 = _mm256_mul_pd(iq3,jq2);
1536 qq33 = _mm256_mul_pd(iq3,jq3);
1538 /* Avoid stupid compiler warnings */
1539 jnrA = jnrB = jnrC = jnrD = 0;
1540 j_coord_offsetA = 0;
1541 j_coord_offsetB = 0;
1542 j_coord_offsetC = 0;
1543 j_coord_offsetD = 0;
1548 for(iidx=0;iidx<4*DIM;iidx++)
1550 scratch[iidx] = 0.0;
1553 /* Start outer loop over neighborlists */
1554 for(iidx=0; iidx<nri; iidx++)
1556 /* Load shift vector for this list */
1557 i_shift_offset = DIM*shiftidx[iidx];
1559 /* Load limits for loop over neighbors */
1560 j_index_start = jindex[iidx];
1561 j_index_end = jindex[iidx+1];
1563 /* Get outer coordinate index */
1565 i_coord_offset = DIM*inr;
1567 /* Load i particle coords and add shift vector */
1568 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1569 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1571 fix0 = _mm256_setzero_pd();
1572 fiy0 = _mm256_setzero_pd();
1573 fiz0 = _mm256_setzero_pd();
1574 fix1 = _mm256_setzero_pd();
1575 fiy1 = _mm256_setzero_pd();
1576 fiz1 = _mm256_setzero_pd();
1577 fix2 = _mm256_setzero_pd();
1578 fiy2 = _mm256_setzero_pd();
1579 fiz2 = _mm256_setzero_pd();
1580 fix3 = _mm256_setzero_pd();
1581 fiy3 = _mm256_setzero_pd();
1582 fiz3 = _mm256_setzero_pd();
1584 /* Start inner kernel loop */
1585 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1588 /* Get j neighbor index, and coordinate index */
1590 jnrB = jjnr[jidx+1];
1591 jnrC = jjnr[jidx+2];
1592 jnrD = jjnr[jidx+3];
1593 j_coord_offsetA = DIM*jnrA;
1594 j_coord_offsetB = DIM*jnrB;
1595 j_coord_offsetC = DIM*jnrC;
1596 j_coord_offsetD = DIM*jnrD;
1598 /* load j atom coordinates */
1599 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1600 x+j_coord_offsetC,x+j_coord_offsetD,
1601 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1602 &jy2,&jz2,&jx3,&jy3,&jz3);
1604 /* Calculate displacement vector */
1605 dx00 = _mm256_sub_pd(ix0,jx0);
1606 dy00 = _mm256_sub_pd(iy0,jy0);
1607 dz00 = _mm256_sub_pd(iz0,jz0);
1608 dx11 = _mm256_sub_pd(ix1,jx1);
1609 dy11 = _mm256_sub_pd(iy1,jy1);
1610 dz11 = _mm256_sub_pd(iz1,jz1);
1611 dx12 = _mm256_sub_pd(ix1,jx2);
1612 dy12 = _mm256_sub_pd(iy1,jy2);
1613 dz12 = _mm256_sub_pd(iz1,jz2);
1614 dx13 = _mm256_sub_pd(ix1,jx3);
1615 dy13 = _mm256_sub_pd(iy1,jy3);
1616 dz13 = _mm256_sub_pd(iz1,jz3);
1617 dx21 = _mm256_sub_pd(ix2,jx1);
1618 dy21 = _mm256_sub_pd(iy2,jy1);
1619 dz21 = _mm256_sub_pd(iz2,jz1);
1620 dx22 = _mm256_sub_pd(ix2,jx2);
1621 dy22 = _mm256_sub_pd(iy2,jy2);
1622 dz22 = _mm256_sub_pd(iz2,jz2);
1623 dx23 = _mm256_sub_pd(ix2,jx3);
1624 dy23 = _mm256_sub_pd(iy2,jy3);
1625 dz23 = _mm256_sub_pd(iz2,jz3);
1626 dx31 = _mm256_sub_pd(ix3,jx1);
1627 dy31 = _mm256_sub_pd(iy3,jy1);
1628 dz31 = _mm256_sub_pd(iz3,jz1);
1629 dx32 = _mm256_sub_pd(ix3,jx2);
1630 dy32 = _mm256_sub_pd(iy3,jy2);
1631 dz32 = _mm256_sub_pd(iz3,jz2);
1632 dx33 = _mm256_sub_pd(ix3,jx3);
1633 dy33 = _mm256_sub_pd(iy3,jy3);
1634 dz33 = _mm256_sub_pd(iz3,jz3);
1636 /* Calculate squared distance and things based on it */
1637 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1638 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1639 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1640 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1641 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1642 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1643 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1644 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1645 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1646 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1648 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1649 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1650 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1651 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1652 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1653 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1654 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1655 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1656 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1657 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1659 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1660 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1661 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1662 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1663 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1664 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1665 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1666 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1667 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1669 fjx0 = _mm256_setzero_pd();
1670 fjy0 = _mm256_setzero_pd();
1671 fjz0 = _mm256_setzero_pd();
1672 fjx1 = _mm256_setzero_pd();
1673 fjy1 = _mm256_setzero_pd();
1674 fjz1 = _mm256_setzero_pd();
1675 fjx2 = _mm256_setzero_pd();
1676 fjy2 = _mm256_setzero_pd();
1677 fjz2 = _mm256_setzero_pd();
1678 fjx3 = _mm256_setzero_pd();
1679 fjy3 = _mm256_setzero_pd();
1680 fjz3 = _mm256_setzero_pd();
1682 /**************************
1683 * CALCULATE INTERACTIONS *
1684 **************************/
1686 r00 = _mm256_mul_pd(rsq00,rinv00);
1688 /* Calculate table index by multiplying r with table scale and truncate to integer */
1689 rt = _mm256_mul_pd(r00,vftabscale);
1690 vfitab = _mm256_cvttpd_epi32(rt);
1691 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1692 vfitab = _mm_slli_epi32(vfitab,3);
1694 /* CUBIC SPLINE TABLE DISPERSION */
1695 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1696 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1697 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1698 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1699 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1700 Heps = _mm256_mul_pd(vfeps,H);
1701 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1702 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1703 fvdw6 = _mm256_mul_pd(c6_00,FF);
1705 /* CUBIC SPLINE TABLE REPULSION */
1706 vfitab = _mm_add_epi32(vfitab,ifour);
1707 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1708 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1709 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1710 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1711 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1712 Heps = _mm256_mul_pd(vfeps,H);
1713 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1714 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1715 fvdw12 = _mm256_mul_pd(c12_00,FF);
1716 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1720 /* Calculate temporary vectorial force */
1721 tx = _mm256_mul_pd(fscal,dx00);
1722 ty = _mm256_mul_pd(fscal,dy00);
1723 tz = _mm256_mul_pd(fscal,dz00);
1725 /* Update vectorial force */
1726 fix0 = _mm256_add_pd(fix0,tx);
1727 fiy0 = _mm256_add_pd(fiy0,ty);
1728 fiz0 = _mm256_add_pd(fiz0,tz);
1730 fjx0 = _mm256_add_pd(fjx0,tx);
1731 fjy0 = _mm256_add_pd(fjy0,ty);
1732 fjz0 = _mm256_add_pd(fjz0,tz);
1734 /**************************
1735 * CALCULATE INTERACTIONS *
1736 **************************/
1738 r11 = _mm256_mul_pd(rsq11,rinv11);
1740 /* EWALD ELECTROSTATICS */
1742 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1743 ewrt = _mm256_mul_pd(r11,ewtabscale);
1744 ewitab = _mm256_cvttpd_epi32(ewrt);
1745 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1746 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1747 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1749 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1750 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1754 /* Calculate temporary vectorial force */
1755 tx = _mm256_mul_pd(fscal,dx11);
1756 ty = _mm256_mul_pd(fscal,dy11);
1757 tz = _mm256_mul_pd(fscal,dz11);
1759 /* Update vectorial force */
1760 fix1 = _mm256_add_pd(fix1,tx);
1761 fiy1 = _mm256_add_pd(fiy1,ty);
1762 fiz1 = _mm256_add_pd(fiz1,tz);
1764 fjx1 = _mm256_add_pd(fjx1,tx);
1765 fjy1 = _mm256_add_pd(fjy1,ty);
1766 fjz1 = _mm256_add_pd(fjz1,tz);
1768 /**************************
1769 * CALCULATE INTERACTIONS *
1770 **************************/
1772 r12 = _mm256_mul_pd(rsq12,rinv12);
1774 /* EWALD ELECTROSTATICS */
1776 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1777 ewrt = _mm256_mul_pd(r12,ewtabscale);
1778 ewitab = _mm256_cvttpd_epi32(ewrt);
1779 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1780 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1781 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1783 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1784 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1788 /* Calculate temporary vectorial force */
1789 tx = _mm256_mul_pd(fscal,dx12);
1790 ty = _mm256_mul_pd(fscal,dy12);
1791 tz = _mm256_mul_pd(fscal,dz12);
1793 /* Update vectorial force */
1794 fix1 = _mm256_add_pd(fix1,tx);
1795 fiy1 = _mm256_add_pd(fiy1,ty);
1796 fiz1 = _mm256_add_pd(fiz1,tz);
1798 fjx2 = _mm256_add_pd(fjx2,tx);
1799 fjy2 = _mm256_add_pd(fjy2,ty);
1800 fjz2 = _mm256_add_pd(fjz2,tz);
1802 /**************************
1803 * CALCULATE INTERACTIONS *
1804 **************************/
1806 r13 = _mm256_mul_pd(rsq13,rinv13);
1808 /* EWALD ELECTROSTATICS */
1810 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1811 ewrt = _mm256_mul_pd(r13,ewtabscale);
1812 ewitab = _mm256_cvttpd_epi32(ewrt);
1813 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1814 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1815 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1817 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1818 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1822 /* Calculate temporary vectorial force */
1823 tx = _mm256_mul_pd(fscal,dx13);
1824 ty = _mm256_mul_pd(fscal,dy13);
1825 tz = _mm256_mul_pd(fscal,dz13);
1827 /* Update vectorial force */
1828 fix1 = _mm256_add_pd(fix1,tx);
1829 fiy1 = _mm256_add_pd(fiy1,ty);
1830 fiz1 = _mm256_add_pd(fiz1,tz);
1832 fjx3 = _mm256_add_pd(fjx3,tx);
1833 fjy3 = _mm256_add_pd(fjy3,ty);
1834 fjz3 = _mm256_add_pd(fjz3,tz);
1836 /**************************
1837 * CALCULATE INTERACTIONS *
1838 **************************/
1840 r21 = _mm256_mul_pd(rsq21,rinv21);
1842 /* EWALD ELECTROSTATICS */
1844 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1845 ewrt = _mm256_mul_pd(r21,ewtabscale);
1846 ewitab = _mm256_cvttpd_epi32(ewrt);
1847 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1848 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1849 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1851 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1852 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1856 /* Calculate temporary vectorial force */
1857 tx = _mm256_mul_pd(fscal,dx21);
1858 ty = _mm256_mul_pd(fscal,dy21);
1859 tz = _mm256_mul_pd(fscal,dz21);
1861 /* Update vectorial force */
1862 fix2 = _mm256_add_pd(fix2,tx);
1863 fiy2 = _mm256_add_pd(fiy2,ty);
1864 fiz2 = _mm256_add_pd(fiz2,tz);
1866 fjx1 = _mm256_add_pd(fjx1,tx);
1867 fjy1 = _mm256_add_pd(fjy1,ty);
1868 fjz1 = _mm256_add_pd(fjz1,tz);
1870 /**************************
1871 * CALCULATE INTERACTIONS *
1872 **************************/
1874 r22 = _mm256_mul_pd(rsq22,rinv22);
1876 /* EWALD ELECTROSTATICS */
1878 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1879 ewrt = _mm256_mul_pd(r22,ewtabscale);
1880 ewitab = _mm256_cvttpd_epi32(ewrt);
1881 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1882 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1883 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1885 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1886 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1890 /* Calculate temporary vectorial force */
1891 tx = _mm256_mul_pd(fscal,dx22);
1892 ty = _mm256_mul_pd(fscal,dy22);
1893 tz = _mm256_mul_pd(fscal,dz22);
1895 /* Update vectorial force */
1896 fix2 = _mm256_add_pd(fix2,tx);
1897 fiy2 = _mm256_add_pd(fiy2,ty);
1898 fiz2 = _mm256_add_pd(fiz2,tz);
1900 fjx2 = _mm256_add_pd(fjx2,tx);
1901 fjy2 = _mm256_add_pd(fjy2,ty);
1902 fjz2 = _mm256_add_pd(fjz2,tz);
1904 /**************************
1905 * CALCULATE INTERACTIONS *
1906 **************************/
1908 r23 = _mm256_mul_pd(rsq23,rinv23);
1910 /* EWALD ELECTROSTATICS */
1912 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1913 ewrt = _mm256_mul_pd(r23,ewtabscale);
1914 ewitab = _mm256_cvttpd_epi32(ewrt);
1915 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1916 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1917 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1919 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1920 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1924 /* Calculate temporary vectorial force */
1925 tx = _mm256_mul_pd(fscal,dx23);
1926 ty = _mm256_mul_pd(fscal,dy23);
1927 tz = _mm256_mul_pd(fscal,dz23);
1929 /* Update vectorial force */
1930 fix2 = _mm256_add_pd(fix2,tx);
1931 fiy2 = _mm256_add_pd(fiy2,ty);
1932 fiz2 = _mm256_add_pd(fiz2,tz);
1934 fjx3 = _mm256_add_pd(fjx3,tx);
1935 fjy3 = _mm256_add_pd(fjy3,ty);
1936 fjz3 = _mm256_add_pd(fjz3,tz);
1938 /**************************
1939 * CALCULATE INTERACTIONS *
1940 **************************/
1942 r31 = _mm256_mul_pd(rsq31,rinv31);
1944 /* EWALD ELECTROSTATICS */
1946 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1947 ewrt = _mm256_mul_pd(r31,ewtabscale);
1948 ewitab = _mm256_cvttpd_epi32(ewrt);
1949 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1950 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1951 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1953 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1954 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1958 /* Calculate temporary vectorial force */
1959 tx = _mm256_mul_pd(fscal,dx31);
1960 ty = _mm256_mul_pd(fscal,dy31);
1961 tz = _mm256_mul_pd(fscal,dz31);
1963 /* Update vectorial force */
1964 fix3 = _mm256_add_pd(fix3,tx);
1965 fiy3 = _mm256_add_pd(fiy3,ty);
1966 fiz3 = _mm256_add_pd(fiz3,tz);
1968 fjx1 = _mm256_add_pd(fjx1,tx);
1969 fjy1 = _mm256_add_pd(fjy1,ty);
1970 fjz1 = _mm256_add_pd(fjz1,tz);
1972 /**************************
1973 * CALCULATE INTERACTIONS *
1974 **************************/
1976 r32 = _mm256_mul_pd(rsq32,rinv32);
1978 /* EWALD ELECTROSTATICS */
1980 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1981 ewrt = _mm256_mul_pd(r32,ewtabscale);
1982 ewitab = _mm256_cvttpd_epi32(ewrt);
1983 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1984 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1985 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1987 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1988 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1992 /* Calculate temporary vectorial force */
1993 tx = _mm256_mul_pd(fscal,dx32);
1994 ty = _mm256_mul_pd(fscal,dy32);
1995 tz = _mm256_mul_pd(fscal,dz32);
1997 /* Update vectorial force */
1998 fix3 = _mm256_add_pd(fix3,tx);
1999 fiy3 = _mm256_add_pd(fiy3,ty);
2000 fiz3 = _mm256_add_pd(fiz3,tz);
2002 fjx2 = _mm256_add_pd(fjx2,tx);
2003 fjy2 = _mm256_add_pd(fjy2,ty);
2004 fjz2 = _mm256_add_pd(fjz2,tz);
2006 /**************************
2007 * CALCULATE INTERACTIONS *
2008 **************************/
2010 r33 = _mm256_mul_pd(rsq33,rinv33);
2012 /* EWALD ELECTROSTATICS */
2014 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2015 ewrt = _mm256_mul_pd(r33,ewtabscale);
2016 ewitab = _mm256_cvttpd_epi32(ewrt);
2017 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2018 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2019 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2021 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2022 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2026 /* Calculate temporary vectorial force */
2027 tx = _mm256_mul_pd(fscal,dx33);
2028 ty = _mm256_mul_pd(fscal,dy33);
2029 tz = _mm256_mul_pd(fscal,dz33);
2031 /* Update vectorial force */
2032 fix3 = _mm256_add_pd(fix3,tx);
2033 fiy3 = _mm256_add_pd(fiy3,ty);
2034 fiz3 = _mm256_add_pd(fiz3,tz);
2036 fjx3 = _mm256_add_pd(fjx3,tx);
2037 fjy3 = _mm256_add_pd(fjy3,ty);
2038 fjz3 = _mm256_add_pd(fjz3,tz);
2040 fjptrA = f+j_coord_offsetA;
2041 fjptrB = f+j_coord_offsetB;
2042 fjptrC = f+j_coord_offsetC;
2043 fjptrD = f+j_coord_offsetD;
2045 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2046 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2047 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2049 /* Inner loop uses 375 flops */
2052 if(jidx<j_index_end)
2055 /* Get j neighbor index, and coordinate index */
2056 jnrlistA = jjnr[jidx];
2057 jnrlistB = jjnr[jidx+1];
2058 jnrlistC = jjnr[jidx+2];
2059 jnrlistD = jjnr[jidx+3];
2060 /* Sign of each element will be negative for non-real atoms.
2061 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2062 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2064 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2066 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2067 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2068 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2070 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2071 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2072 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2073 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2074 j_coord_offsetA = DIM*jnrA;
2075 j_coord_offsetB = DIM*jnrB;
2076 j_coord_offsetC = DIM*jnrC;
2077 j_coord_offsetD = DIM*jnrD;
2079 /* load j atom coordinates */
2080 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2081 x+j_coord_offsetC,x+j_coord_offsetD,
2082 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2083 &jy2,&jz2,&jx3,&jy3,&jz3);
2085 /* Calculate displacement vector */
2086 dx00 = _mm256_sub_pd(ix0,jx0);
2087 dy00 = _mm256_sub_pd(iy0,jy0);
2088 dz00 = _mm256_sub_pd(iz0,jz0);
2089 dx11 = _mm256_sub_pd(ix1,jx1);
2090 dy11 = _mm256_sub_pd(iy1,jy1);
2091 dz11 = _mm256_sub_pd(iz1,jz1);
2092 dx12 = _mm256_sub_pd(ix1,jx2);
2093 dy12 = _mm256_sub_pd(iy1,jy2);
2094 dz12 = _mm256_sub_pd(iz1,jz2);
2095 dx13 = _mm256_sub_pd(ix1,jx3);
2096 dy13 = _mm256_sub_pd(iy1,jy3);
2097 dz13 = _mm256_sub_pd(iz1,jz3);
2098 dx21 = _mm256_sub_pd(ix2,jx1);
2099 dy21 = _mm256_sub_pd(iy2,jy1);
2100 dz21 = _mm256_sub_pd(iz2,jz1);
2101 dx22 = _mm256_sub_pd(ix2,jx2);
2102 dy22 = _mm256_sub_pd(iy2,jy2);
2103 dz22 = _mm256_sub_pd(iz2,jz2);
2104 dx23 = _mm256_sub_pd(ix2,jx3);
2105 dy23 = _mm256_sub_pd(iy2,jy3);
2106 dz23 = _mm256_sub_pd(iz2,jz3);
2107 dx31 = _mm256_sub_pd(ix3,jx1);
2108 dy31 = _mm256_sub_pd(iy3,jy1);
2109 dz31 = _mm256_sub_pd(iz3,jz1);
2110 dx32 = _mm256_sub_pd(ix3,jx2);
2111 dy32 = _mm256_sub_pd(iy3,jy2);
2112 dz32 = _mm256_sub_pd(iz3,jz2);
2113 dx33 = _mm256_sub_pd(ix3,jx3);
2114 dy33 = _mm256_sub_pd(iy3,jy3);
2115 dz33 = _mm256_sub_pd(iz3,jz3);
2117 /* Calculate squared distance and things based on it */
2118 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2119 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2120 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2121 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2122 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2123 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2124 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2125 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2126 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2127 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2129 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2130 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2131 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2132 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2133 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2134 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2135 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2136 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2137 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2138 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2140 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2141 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2142 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2143 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2144 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2145 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2146 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2147 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2148 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2150 fjx0 = _mm256_setzero_pd();
2151 fjy0 = _mm256_setzero_pd();
2152 fjz0 = _mm256_setzero_pd();
2153 fjx1 = _mm256_setzero_pd();
2154 fjy1 = _mm256_setzero_pd();
2155 fjz1 = _mm256_setzero_pd();
2156 fjx2 = _mm256_setzero_pd();
2157 fjy2 = _mm256_setzero_pd();
2158 fjz2 = _mm256_setzero_pd();
2159 fjx3 = _mm256_setzero_pd();
2160 fjy3 = _mm256_setzero_pd();
2161 fjz3 = _mm256_setzero_pd();
2163 /**************************
2164 * CALCULATE INTERACTIONS *
2165 **************************/
2167 r00 = _mm256_mul_pd(rsq00,rinv00);
2168 r00 = _mm256_andnot_pd(dummy_mask,r00);
2170 /* Calculate table index by multiplying r with table scale and truncate to integer */
2171 rt = _mm256_mul_pd(r00,vftabscale);
2172 vfitab = _mm256_cvttpd_epi32(rt);
2173 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2174 vfitab = _mm_slli_epi32(vfitab,3);
2176 /* CUBIC SPLINE TABLE DISPERSION */
2177 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2178 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2179 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2180 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2181 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2182 Heps = _mm256_mul_pd(vfeps,H);
2183 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2184 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2185 fvdw6 = _mm256_mul_pd(c6_00,FF);
2187 /* CUBIC SPLINE TABLE REPULSION */
2188 vfitab = _mm_add_epi32(vfitab,ifour);
2189 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2190 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2191 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2192 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2193 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2194 Heps = _mm256_mul_pd(vfeps,H);
2195 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2196 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2197 fvdw12 = _mm256_mul_pd(c12_00,FF);
2198 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2202 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2204 /* Calculate temporary vectorial force */
2205 tx = _mm256_mul_pd(fscal,dx00);
2206 ty = _mm256_mul_pd(fscal,dy00);
2207 tz = _mm256_mul_pd(fscal,dz00);
2209 /* Update vectorial force */
2210 fix0 = _mm256_add_pd(fix0,tx);
2211 fiy0 = _mm256_add_pd(fiy0,ty);
2212 fiz0 = _mm256_add_pd(fiz0,tz);
2214 fjx0 = _mm256_add_pd(fjx0,tx);
2215 fjy0 = _mm256_add_pd(fjy0,ty);
2216 fjz0 = _mm256_add_pd(fjz0,tz);
2218 /**************************
2219 * CALCULATE INTERACTIONS *
2220 **************************/
2222 r11 = _mm256_mul_pd(rsq11,rinv11);
2223 r11 = _mm256_andnot_pd(dummy_mask,r11);
2225 /* EWALD ELECTROSTATICS */
2227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2228 ewrt = _mm256_mul_pd(r11,ewtabscale);
2229 ewitab = _mm256_cvttpd_epi32(ewrt);
2230 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2231 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2232 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2234 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2235 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2239 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2241 /* Calculate temporary vectorial force */
2242 tx = _mm256_mul_pd(fscal,dx11);
2243 ty = _mm256_mul_pd(fscal,dy11);
2244 tz = _mm256_mul_pd(fscal,dz11);
2246 /* Update vectorial force */
2247 fix1 = _mm256_add_pd(fix1,tx);
2248 fiy1 = _mm256_add_pd(fiy1,ty);
2249 fiz1 = _mm256_add_pd(fiz1,tz);
2251 fjx1 = _mm256_add_pd(fjx1,tx);
2252 fjy1 = _mm256_add_pd(fjy1,ty);
2253 fjz1 = _mm256_add_pd(fjz1,tz);
2255 /**************************
2256 * CALCULATE INTERACTIONS *
2257 **************************/
2259 r12 = _mm256_mul_pd(rsq12,rinv12);
2260 r12 = _mm256_andnot_pd(dummy_mask,r12);
2262 /* EWALD ELECTROSTATICS */
2264 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2265 ewrt = _mm256_mul_pd(r12,ewtabscale);
2266 ewitab = _mm256_cvttpd_epi32(ewrt);
2267 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2268 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2269 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2271 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2272 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2276 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2278 /* Calculate temporary vectorial force */
2279 tx = _mm256_mul_pd(fscal,dx12);
2280 ty = _mm256_mul_pd(fscal,dy12);
2281 tz = _mm256_mul_pd(fscal,dz12);
2283 /* Update vectorial force */
2284 fix1 = _mm256_add_pd(fix1,tx);
2285 fiy1 = _mm256_add_pd(fiy1,ty);
2286 fiz1 = _mm256_add_pd(fiz1,tz);
2288 fjx2 = _mm256_add_pd(fjx2,tx);
2289 fjy2 = _mm256_add_pd(fjy2,ty);
2290 fjz2 = _mm256_add_pd(fjz2,tz);
2292 /**************************
2293 * CALCULATE INTERACTIONS *
2294 **************************/
2296 r13 = _mm256_mul_pd(rsq13,rinv13);
2297 r13 = _mm256_andnot_pd(dummy_mask,r13);
2299 /* EWALD ELECTROSTATICS */
2301 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2302 ewrt = _mm256_mul_pd(r13,ewtabscale);
2303 ewitab = _mm256_cvttpd_epi32(ewrt);
2304 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2305 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2306 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2308 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2309 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2313 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2315 /* Calculate temporary vectorial force */
2316 tx = _mm256_mul_pd(fscal,dx13);
2317 ty = _mm256_mul_pd(fscal,dy13);
2318 tz = _mm256_mul_pd(fscal,dz13);
2320 /* Update vectorial force */
2321 fix1 = _mm256_add_pd(fix1,tx);
2322 fiy1 = _mm256_add_pd(fiy1,ty);
2323 fiz1 = _mm256_add_pd(fiz1,tz);
2325 fjx3 = _mm256_add_pd(fjx3,tx);
2326 fjy3 = _mm256_add_pd(fjy3,ty);
2327 fjz3 = _mm256_add_pd(fjz3,tz);
2329 /**************************
2330 * CALCULATE INTERACTIONS *
2331 **************************/
2333 r21 = _mm256_mul_pd(rsq21,rinv21);
2334 r21 = _mm256_andnot_pd(dummy_mask,r21);
2336 /* EWALD ELECTROSTATICS */
2338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2339 ewrt = _mm256_mul_pd(r21,ewtabscale);
2340 ewitab = _mm256_cvttpd_epi32(ewrt);
2341 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2342 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2343 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2345 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2346 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2350 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2352 /* Calculate temporary vectorial force */
2353 tx = _mm256_mul_pd(fscal,dx21);
2354 ty = _mm256_mul_pd(fscal,dy21);
2355 tz = _mm256_mul_pd(fscal,dz21);
2357 /* Update vectorial force */
2358 fix2 = _mm256_add_pd(fix2,tx);
2359 fiy2 = _mm256_add_pd(fiy2,ty);
2360 fiz2 = _mm256_add_pd(fiz2,tz);
2362 fjx1 = _mm256_add_pd(fjx1,tx);
2363 fjy1 = _mm256_add_pd(fjy1,ty);
2364 fjz1 = _mm256_add_pd(fjz1,tz);
2366 /**************************
2367 * CALCULATE INTERACTIONS *
2368 **************************/
2370 r22 = _mm256_mul_pd(rsq22,rinv22);
2371 r22 = _mm256_andnot_pd(dummy_mask,r22);
2373 /* EWALD ELECTROSTATICS */
2375 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2376 ewrt = _mm256_mul_pd(r22,ewtabscale);
2377 ewitab = _mm256_cvttpd_epi32(ewrt);
2378 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2379 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2380 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2382 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2383 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2387 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2389 /* Calculate temporary vectorial force */
2390 tx = _mm256_mul_pd(fscal,dx22);
2391 ty = _mm256_mul_pd(fscal,dy22);
2392 tz = _mm256_mul_pd(fscal,dz22);
2394 /* Update vectorial force */
2395 fix2 = _mm256_add_pd(fix2,tx);
2396 fiy2 = _mm256_add_pd(fiy2,ty);
2397 fiz2 = _mm256_add_pd(fiz2,tz);
2399 fjx2 = _mm256_add_pd(fjx2,tx);
2400 fjy2 = _mm256_add_pd(fjy2,ty);
2401 fjz2 = _mm256_add_pd(fjz2,tz);
2403 /**************************
2404 * CALCULATE INTERACTIONS *
2405 **************************/
2407 r23 = _mm256_mul_pd(rsq23,rinv23);
2408 r23 = _mm256_andnot_pd(dummy_mask,r23);
2410 /* EWALD ELECTROSTATICS */
2412 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2413 ewrt = _mm256_mul_pd(r23,ewtabscale);
2414 ewitab = _mm256_cvttpd_epi32(ewrt);
2415 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2416 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2417 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2419 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2420 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2424 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2426 /* Calculate temporary vectorial force */
2427 tx = _mm256_mul_pd(fscal,dx23);
2428 ty = _mm256_mul_pd(fscal,dy23);
2429 tz = _mm256_mul_pd(fscal,dz23);
2431 /* Update vectorial force */
2432 fix2 = _mm256_add_pd(fix2,tx);
2433 fiy2 = _mm256_add_pd(fiy2,ty);
2434 fiz2 = _mm256_add_pd(fiz2,tz);
2436 fjx3 = _mm256_add_pd(fjx3,tx);
2437 fjy3 = _mm256_add_pd(fjy3,ty);
2438 fjz3 = _mm256_add_pd(fjz3,tz);
2440 /**************************
2441 * CALCULATE INTERACTIONS *
2442 **************************/
2444 r31 = _mm256_mul_pd(rsq31,rinv31);
2445 r31 = _mm256_andnot_pd(dummy_mask,r31);
2447 /* EWALD ELECTROSTATICS */
2449 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2450 ewrt = _mm256_mul_pd(r31,ewtabscale);
2451 ewitab = _mm256_cvttpd_epi32(ewrt);
2452 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2453 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2454 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2456 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2457 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2461 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2463 /* Calculate temporary vectorial force */
2464 tx = _mm256_mul_pd(fscal,dx31);
2465 ty = _mm256_mul_pd(fscal,dy31);
2466 tz = _mm256_mul_pd(fscal,dz31);
2468 /* Update vectorial force */
2469 fix3 = _mm256_add_pd(fix3,tx);
2470 fiy3 = _mm256_add_pd(fiy3,ty);
2471 fiz3 = _mm256_add_pd(fiz3,tz);
2473 fjx1 = _mm256_add_pd(fjx1,tx);
2474 fjy1 = _mm256_add_pd(fjy1,ty);
2475 fjz1 = _mm256_add_pd(fjz1,tz);
2477 /**************************
2478 * CALCULATE INTERACTIONS *
2479 **************************/
2481 r32 = _mm256_mul_pd(rsq32,rinv32);
2482 r32 = _mm256_andnot_pd(dummy_mask,r32);
2484 /* EWALD ELECTROSTATICS */
2486 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2487 ewrt = _mm256_mul_pd(r32,ewtabscale);
2488 ewitab = _mm256_cvttpd_epi32(ewrt);
2489 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2490 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2491 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2493 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2494 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2498 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2500 /* Calculate temporary vectorial force */
2501 tx = _mm256_mul_pd(fscal,dx32);
2502 ty = _mm256_mul_pd(fscal,dy32);
2503 tz = _mm256_mul_pd(fscal,dz32);
2505 /* Update vectorial force */
2506 fix3 = _mm256_add_pd(fix3,tx);
2507 fiy3 = _mm256_add_pd(fiy3,ty);
2508 fiz3 = _mm256_add_pd(fiz3,tz);
2510 fjx2 = _mm256_add_pd(fjx2,tx);
2511 fjy2 = _mm256_add_pd(fjy2,ty);
2512 fjz2 = _mm256_add_pd(fjz2,tz);
2514 /**************************
2515 * CALCULATE INTERACTIONS *
2516 **************************/
2518 r33 = _mm256_mul_pd(rsq33,rinv33);
2519 r33 = _mm256_andnot_pd(dummy_mask,r33);
2521 /* EWALD ELECTROSTATICS */
2523 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2524 ewrt = _mm256_mul_pd(r33,ewtabscale);
2525 ewitab = _mm256_cvttpd_epi32(ewrt);
2526 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2527 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2528 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2530 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2531 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2535 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2537 /* Calculate temporary vectorial force */
2538 tx = _mm256_mul_pd(fscal,dx33);
2539 ty = _mm256_mul_pd(fscal,dy33);
2540 tz = _mm256_mul_pd(fscal,dz33);
2542 /* Update vectorial force */
2543 fix3 = _mm256_add_pd(fix3,tx);
2544 fiy3 = _mm256_add_pd(fiy3,ty);
2545 fiz3 = _mm256_add_pd(fiz3,tz);
2547 fjx3 = _mm256_add_pd(fjx3,tx);
2548 fjy3 = _mm256_add_pd(fjy3,ty);
2549 fjz3 = _mm256_add_pd(fjz3,tz);
2551 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2552 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2553 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2554 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2556 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2557 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2558 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2560 /* Inner loop uses 385 flops */
2563 /* End of innermost loop */
2565 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2566 f+i_coord_offset,fshift+i_shift_offset);
2568 /* Increment number of inner iterations */
2569 inneriter += j_index_end - j_index_start;
2571 /* Outer loop uses 24 flops */
2574 /* Increment number of outer iterations */
2577 /* Update outer/inner flops */
2579 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*385);