2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 real * vdwioffsetptr3;
93 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
95 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
97 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
99 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
101 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
115 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
118 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
119 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
121 __m128i ifour = _mm_set1_epi32(4);
122 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
125 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
126 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
128 __m256d dummy_mask,cutoff_mask;
129 __m128 tmpmask0,tmpmask1;
130 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
131 __m256d one = _mm256_set1_pd(1.0);
132 __m256d two = _mm256_set1_pd(2.0);
138 jindex = nlist->jindex;
140 shiftidx = nlist->shift;
142 shiftvec = fr->shift_vec[0];
143 fshift = fr->fshift[0];
144 facel = _mm256_set1_pd(fr->epsfac);
145 charge = mdatoms->chargeA;
146 nvdwtype = fr->ntype;
148 vdwtype = mdatoms->typeA;
150 vftab = kernel_data->table_vdw->data;
151 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
153 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
154 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
155 beta2 = _mm256_mul_pd(beta,beta);
156 beta3 = _mm256_mul_pd(beta,beta2);
158 ewtab = fr->ic->tabq_coul_FDV0;
159 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
160 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
162 /* Setup water-specific parameters */
163 inr = nlist->iinr[0];
164 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
165 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
166 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
167 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
169 jq1 = _mm256_set1_pd(charge[inr+1]);
170 jq2 = _mm256_set1_pd(charge[inr+2]);
171 jq3 = _mm256_set1_pd(charge[inr+3]);
172 vdwjidx0A = 2*vdwtype[inr+0];
173 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
174 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
175 qq11 = _mm256_mul_pd(iq1,jq1);
176 qq12 = _mm256_mul_pd(iq1,jq2);
177 qq13 = _mm256_mul_pd(iq1,jq3);
178 qq21 = _mm256_mul_pd(iq2,jq1);
179 qq22 = _mm256_mul_pd(iq2,jq2);
180 qq23 = _mm256_mul_pd(iq2,jq3);
181 qq31 = _mm256_mul_pd(iq3,jq1);
182 qq32 = _mm256_mul_pd(iq3,jq2);
183 qq33 = _mm256_mul_pd(iq3,jq3);
185 /* Avoid stupid compiler warnings */
186 jnrA = jnrB = jnrC = jnrD = 0;
195 for(iidx=0;iidx<4*DIM;iidx++)
200 /* Start outer loop over neighborlists */
201 for(iidx=0; iidx<nri; iidx++)
203 /* Load shift vector for this list */
204 i_shift_offset = DIM*shiftidx[iidx];
206 /* Load limits for loop over neighbors */
207 j_index_start = jindex[iidx];
208 j_index_end = jindex[iidx+1];
210 /* Get outer coordinate index */
212 i_coord_offset = DIM*inr;
214 /* Load i particle coords and add shift vector */
215 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
216 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
218 fix0 = _mm256_setzero_pd();
219 fiy0 = _mm256_setzero_pd();
220 fiz0 = _mm256_setzero_pd();
221 fix1 = _mm256_setzero_pd();
222 fiy1 = _mm256_setzero_pd();
223 fiz1 = _mm256_setzero_pd();
224 fix2 = _mm256_setzero_pd();
225 fiy2 = _mm256_setzero_pd();
226 fiz2 = _mm256_setzero_pd();
227 fix3 = _mm256_setzero_pd();
228 fiy3 = _mm256_setzero_pd();
229 fiz3 = _mm256_setzero_pd();
231 /* Reset potential sums */
232 velecsum = _mm256_setzero_pd();
233 vvdwsum = _mm256_setzero_pd();
235 /* Start inner kernel loop */
236 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
239 /* Get j neighbor index, and coordinate index */
244 j_coord_offsetA = DIM*jnrA;
245 j_coord_offsetB = DIM*jnrB;
246 j_coord_offsetC = DIM*jnrC;
247 j_coord_offsetD = DIM*jnrD;
249 /* load j atom coordinates */
250 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
251 x+j_coord_offsetC,x+j_coord_offsetD,
252 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
253 &jy2,&jz2,&jx3,&jy3,&jz3);
255 /* Calculate displacement vector */
256 dx00 = _mm256_sub_pd(ix0,jx0);
257 dy00 = _mm256_sub_pd(iy0,jy0);
258 dz00 = _mm256_sub_pd(iz0,jz0);
259 dx11 = _mm256_sub_pd(ix1,jx1);
260 dy11 = _mm256_sub_pd(iy1,jy1);
261 dz11 = _mm256_sub_pd(iz1,jz1);
262 dx12 = _mm256_sub_pd(ix1,jx2);
263 dy12 = _mm256_sub_pd(iy1,jy2);
264 dz12 = _mm256_sub_pd(iz1,jz2);
265 dx13 = _mm256_sub_pd(ix1,jx3);
266 dy13 = _mm256_sub_pd(iy1,jy3);
267 dz13 = _mm256_sub_pd(iz1,jz3);
268 dx21 = _mm256_sub_pd(ix2,jx1);
269 dy21 = _mm256_sub_pd(iy2,jy1);
270 dz21 = _mm256_sub_pd(iz2,jz1);
271 dx22 = _mm256_sub_pd(ix2,jx2);
272 dy22 = _mm256_sub_pd(iy2,jy2);
273 dz22 = _mm256_sub_pd(iz2,jz2);
274 dx23 = _mm256_sub_pd(ix2,jx3);
275 dy23 = _mm256_sub_pd(iy2,jy3);
276 dz23 = _mm256_sub_pd(iz2,jz3);
277 dx31 = _mm256_sub_pd(ix3,jx1);
278 dy31 = _mm256_sub_pd(iy3,jy1);
279 dz31 = _mm256_sub_pd(iz3,jz1);
280 dx32 = _mm256_sub_pd(ix3,jx2);
281 dy32 = _mm256_sub_pd(iy3,jy2);
282 dz32 = _mm256_sub_pd(iz3,jz2);
283 dx33 = _mm256_sub_pd(ix3,jx3);
284 dy33 = _mm256_sub_pd(iy3,jy3);
285 dz33 = _mm256_sub_pd(iz3,jz3);
287 /* Calculate squared distance and things based on it */
288 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
289 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
290 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
291 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
292 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
293 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
294 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
295 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
296 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
297 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
299 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
300 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
301 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
302 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
303 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
304 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
305 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
306 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
307 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
308 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
310 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
311 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
312 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
313 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
314 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
315 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
316 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
317 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
318 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
320 fjx0 = _mm256_setzero_pd();
321 fjy0 = _mm256_setzero_pd();
322 fjz0 = _mm256_setzero_pd();
323 fjx1 = _mm256_setzero_pd();
324 fjy1 = _mm256_setzero_pd();
325 fjz1 = _mm256_setzero_pd();
326 fjx2 = _mm256_setzero_pd();
327 fjy2 = _mm256_setzero_pd();
328 fjz2 = _mm256_setzero_pd();
329 fjx3 = _mm256_setzero_pd();
330 fjy3 = _mm256_setzero_pd();
331 fjz3 = _mm256_setzero_pd();
333 /**************************
334 * CALCULATE INTERACTIONS *
335 **************************/
337 r00 = _mm256_mul_pd(rsq00,rinv00);
339 /* Calculate table index by multiplying r with table scale and truncate to integer */
340 rt = _mm256_mul_pd(r00,vftabscale);
341 vfitab = _mm256_cvttpd_epi32(rt);
342 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
343 vfitab = _mm_slli_epi32(vfitab,3);
345 /* CUBIC SPLINE TABLE DISPERSION */
346 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
347 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
348 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
349 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
350 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
351 Heps = _mm256_mul_pd(vfeps,H);
352 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
353 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
354 vvdw6 = _mm256_mul_pd(c6_00,VV);
355 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
356 fvdw6 = _mm256_mul_pd(c6_00,FF);
358 /* CUBIC SPLINE TABLE REPULSION */
359 vfitab = _mm_add_epi32(vfitab,ifour);
360 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
361 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
362 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
363 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
364 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
365 Heps = _mm256_mul_pd(vfeps,H);
366 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
367 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
368 vvdw12 = _mm256_mul_pd(c12_00,VV);
369 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
370 fvdw12 = _mm256_mul_pd(c12_00,FF);
371 vvdw = _mm256_add_pd(vvdw12,vvdw6);
372 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
374 /* Update potential sum for this i atom from the interaction with this j atom. */
375 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
379 /* Calculate temporary vectorial force */
380 tx = _mm256_mul_pd(fscal,dx00);
381 ty = _mm256_mul_pd(fscal,dy00);
382 tz = _mm256_mul_pd(fscal,dz00);
384 /* Update vectorial force */
385 fix0 = _mm256_add_pd(fix0,tx);
386 fiy0 = _mm256_add_pd(fiy0,ty);
387 fiz0 = _mm256_add_pd(fiz0,tz);
389 fjx0 = _mm256_add_pd(fjx0,tx);
390 fjy0 = _mm256_add_pd(fjy0,ty);
391 fjz0 = _mm256_add_pd(fjz0,tz);
393 /**************************
394 * CALCULATE INTERACTIONS *
395 **************************/
397 r11 = _mm256_mul_pd(rsq11,rinv11);
399 /* EWALD ELECTROSTATICS */
401 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
402 ewrt = _mm256_mul_pd(r11,ewtabscale);
403 ewitab = _mm256_cvttpd_epi32(ewrt);
404 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
405 ewitab = _mm_slli_epi32(ewitab,2);
406 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
407 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
408 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
409 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
410 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
411 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
412 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
413 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
414 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
416 /* Update potential sum for this i atom from the interaction with this j atom. */
417 velecsum = _mm256_add_pd(velecsum,velec);
421 /* Calculate temporary vectorial force */
422 tx = _mm256_mul_pd(fscal,dx11);
423 ty = _mm256_mul_pd(fscal,dy11);
424 tz = _mm256_mul_pd(fscal,dz11);
426 /* Update vectorial force */
427 fix1 = _mm256_add_pd(fix1,tx);
428 fiy1 = _mm256_add_pd(fiy1,ty);
429 fiz1 = _mm256_add_pd(fiz1,tz);
431 fjx1 = _mm256_add_pd(fjx1,tx);
432 fjy1 = _mm256_add_pd(fjy1,ty);
433 fjz1 = _mm256_add_pd(fjz1,tz);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 r12 = _mm256_mul_pd(rsq12,rinv12);
441 /* EWALD ELECTROSTATICS */
443 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
444 ewrt = _mm256_mul_pd(r12,ewtabscale);
445 ewitab = _mm256_cvttpd_epi32(ewrt);
446 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
447 ewitab = _mm_slli_epi32(ewitab,2);
448 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
449 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
450 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
451 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
452 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
453 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
454 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
455 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
456 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velecsum = _mm256_add_pd(velecsum,velec);
463 /* Calculate temporary vectorial force */
464 tx = _mm256_mul_pd(fscal,dx12);
465 ty = _mm256_mul_pd(fscal,dy12);
466 tz = _mm256_mul_pd(fscal,dz12);
468 /* Update vectorial force */
469 fix1 = _mm256_add_pd(fix1,tx);
470 fiy1 = _mm256_add_pd(fiy1,ty);
471 fiz1 = _mm256_add_pd(fiz1,tz);
473 fjx2 = _mm256_add_pd(fjx2,tx);
474 fjy2 = _mm256_add_pd(fjy2,ty);
475 fjz2 = _mm256_add_pd(fjz2,tz);
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 r13 = _mm256_mul_pd(rsq13,rinv13);
483 /* EWALD ELECTROSTATICS */
485 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
486 ewrt = _mm256_mul_pd(r13,ewtabscale);
487 ewitab = _mm256_cvttpd_epi32(ewrt);
488 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
489 ewitab = _mm_slli_epi32(ewitab,2);
490 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
491 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
492 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
493 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
494 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
495 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
496 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
497 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
498 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
500 /* Update potential sum for this i atom from the interaction with this j atom. */
501 velecsum = _mm256_add_pd(velecsum,velec);
505 /* Calculate temporary vectorial force */
506 tx = _mm256_mul_pd(fscal,dx13);
507 ty = _mm256_mul_pd(fscal,dy13);
508 tz = _mm256_mul_pd(fscal,dz13);
510 /* Update vectorial force */
511 fix1 = _mm256_add_pd(fix1,tx);
512 fiy1 = _mm256_add_pd(fiy1,ty);
513 fiz1 = _mm256_add_pd(fiz1,tz);
515 fjx3 = _mm256_add_pd(fjx3,tx);
516 fjy3 = _mm256_add_pd(fjy3,ty);
517 fjz3 = _mm256_add_pd(fjz3,tz);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 r21 = _mm256_mul_pd(rsq21,rinv21);
525 /* EWALD ELECTROSTATICS */
527 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
528 ewrt = _mm256_mul_pd(r21,ewtabscale);
529 ewitab = _mm256_cvttpd_epi32(ewrt);
530 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
531 ewitab = _mm_slli_epi32(ewitab,2);
532 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
533 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
534 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
535 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
536 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
537 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
538 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
539 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
540 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velecsum = _mm256_add_pd(velecsum,velec);
547 /* Calculate temporary vectorial force */
548 tx = _mm256_mul_pd(fscal,dx21);
549 ty = _mm256_mul_pd(fscal,dy21);
550 tz = _mm256_mul_pd(fscal,dz21);
552 /* Update vectorial force */
553 fix2 = _mm256_add_pd(fix2,tx);
554 fiy2 = _mm256_add_pd(fiy2,ty);
555 fiz2 = _mm256_add_pd(fiz2,tz);
557 fjx1 = _mm256_add_pd(fjx1,tx);
558 fjy1 = _mm256_add_pd(fjy1,ty);
559 fjz1 = _mm256_add_pd(fjz1,tz);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 r22 = _mm256_mul_pd(rsq22,rinv22);
567 /* EWALD ELECTROSTATICS */
569 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
570 ewrt = _mm256_mul_pd(r22,ewtabscale);
571 ewitab = _mm256_cvttpd_epi32(ewrt);
572 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
573 ewitab = _mm_slli_epi32(ewitab,2);
574 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
575 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
576 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
577 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
578 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
579 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
580 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
581 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
582 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 velecsum = _mm256_add_pd(velecsum,velec);
589 /* Calculate temporary vectorial force */
590 tx = _mm256_mul_pd(fscal,dx22);
591 ty = _mm256_mul_pd(fscal,dy22);
592 tz = _mm256_mul_pd(fscal,dz22);
594 /* Update vectorial force */
595 fix2 = _mm256_add_pd(fix2,tx);
596 fiy2 = _mm256_add_pd(fiy2,ty);
597 fiz2 = _mm256_add_pd(fiz2,tz);
599 fjx2 = _mm256_add_pd(fjx2,tx);
600 fjy2 = _mm256_add_pd(fjy2,ty);
601 fjz2 = _mm256_add_pd(fjz2,tz);
603 /**************************
604 * CALCULATE INTERACTIONS *
605 **************************/
607 r23 = _mm256_mul_pd(rsq23,rinv23);
609 /* EWALD ELECTROSTATICS */
611 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
612 ewrt = _mm256_mul_pd(r23,ewtabscale);
613 ewitab = _mm256_cvttpd_epi32(ewrt);
614 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
615 ewitab = _mm_slli_epi32(ewitab,2);
616 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
617 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
618 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
619 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
620 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
621 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
622 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
623 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
624 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velecsum = _mm256_add_pd(velecsum,velec);
631 /* Calculate temporary vectorial force */
632 tx = _mm256_mul_pd(fscal,dx23);
633 ty = _mm256_mul_pd(fscal,dy23);
634 tz = _mm256_mul_pd(fscal,dz23);
636 /* Update vectorial force */
637 fix2 = _mm256_add_pd(fix2,tx);
638 fiy2 = _mm256_add_pd(fiy2,ty);
639 fiz2 = _mm256_add_pd(fiz2,tz);
641 fjx3 = _mm256_add_pd(fjx3,tx);
642 fjy3 = _mm256_add_pd(fjy3,ty);
643 fjz3 = _mm256_add_pd(fjz3,tz);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 r31 = _mm256_mul_pd(rsq31,rinv31);
651 /* EWALD ELECTROSTATICS */
653 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
654 ewrt = _mm256_mul_pd(r31,ewtabscale);
655 ewitab = _mm256_cvttpd_epi32(ewrt);
656 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
657 ewitab = _mm_slli_epi32(ewitab,2);
658 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
659 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
660 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
661 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
662 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
663 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
664 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
665 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
666 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
668 /* Update potential sum for this i atom from the interaction with this j atom. */
669 velecsum = _mm256_add_pd(velecsum,velec);
673 /* Calculate temporary vectorial force */
674 tx = _mm256_mul_pd(fscal,dx31);
675 ty = _mm256_mul_pd(fscal,dy31);
676 tz = _mm256_mul_pd(fscal,dz31);
678 /* Update vectorial force */
679 fix3 = _mm256_add_pd(fix3,tx);
680 fiy3 = _mm256_add_pd(fiy3,ty);
681 fiz3 = _mm256_add_pd(fiz3,tz);
683 fjx1 = _mm256_add_pd(fjx1,tx);
684 fjy1 = _mm256_add_pd(fjy1,ty);
685 fjz1 = _mm256_add_pd(fjz1,tz);
687 /**************************
688 * CALCULATE INTERACTIONS *
689 **************************/
691 r32 = _mm256_mul_pd(rsq32,rinv32);
693 /* EWALD ELECTROSTATICS */
695 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
696 ewrt = _mm256_mul_pd(r32,ewtabscale);
697 ewitab = _mm256_cvttpd_epi32(ewrt);
698 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
699 ewitab = _mm_slli_epi32(ewitab,2);
700 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
701 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
702 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
703 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
704 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
705 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
706 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
707 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
708 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
710 /* Update potential sum for this i atom from the interaction with this j atom. */
711 velecsum = _mm256_add_pd(velecsum,velec);
715 /* Calculate temporary vectorial force */
716 tx = _mm256_mul_pd(fscal,dx32);
717 ty = _mm256_mul_pd(fscal,dy32);
718 tz = _mm256_mul_pd(fscal,dz32);
720 /* Update vectorial force */
721 fix3 = _mm256_add_pd(fix3,tx);
722 fiy3 = _mm256_add_pd(fiy3,ty);
723 fiz3 = _mm256_add_pd(fiz3,tz);
725 fjx2 = _mm256_add_pd(fjx2,tx);
726 fjy2 = _mm256_add_pd(fjy2,ty);
727 fjz2 = _mm256_add_pd(fjz2,tz);
729 /**************************
730 * CALCULATE INTERACTIONS *
731 **************************/
733 r33 = _mm256_mul_pd(rsq33,rinv33);
735 /* EWALD ELECTROSTATICS */
737 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
738 ewrt = _mm256_mul_pd(r33,ewtabscale);
739 ewitab = _mm256_cvttpd_epi32(ewrt);
740 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
741 ewitab = _mm_slli_epi32(ewitab,2);
742 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
743 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
744 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
745 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
746 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
747 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
748 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
749 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
750 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
752 /* Update potential sum for this i atom from the interaction with this j atom. */
753 velecsum = _mm256_add_pd(velecsum,velec);
757 /* Calculate temporary vectorial force */
758 tx = _mm256_mul_pd(fscal,dx33);
759 ty = _mm256_mul_pd(fscal,dy33);
760 tz = _mm256_mul_pd(fscal,dz33);
762 /* Update vectorial force */
763 fix3 = _mm256_add_pd(fix3,tx);
764 fiy3 = _mm256_add_pd(fiy3,ty);
765 fiz3 = _mm256_add_pd(fiz3,tz);
767 fjx3 = _mm256_add_pd(fjx3,tx);
768 fjy3 = _mm256_add_pd(fjy3,ty);
769 fjz3 = _mm256_add_pd(fjz3,tz);
771 fjptrA = f+j_coord_offsetA;
772 fjptrB = f+j_coord_offsetB;
773 fjptrC = f+j_coord_offsetC;
774 fjptrD = f+j_coord_offsetD;
776 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
777 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
778 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
780 /* Inner loop uses 428 flops */
786 /* Get j neighbor index, and coordinate index */
787 jnrlistA = jjnr[jidx];
788 jnrlistB = jjnr[jidx+1];
789 jnrlistC = jjnr[jidx+2];
790 jnrlistD = jjnr[jidx+3];
791 /* Sign of each element will be negative for non-real atoms.
792 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
793 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
795 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
797 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
798 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
799 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
801 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
802 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
803 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
804 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
805 j_coord_offsetA = DIM*jnrA;
806 j_coord_offsetB = DIM*jnrB;
807 j_coord_offsetC = DIM*jnrC;
808 j_coord_offsetD = DIM*jnrD;
810 /* load j atom coordinates */
811 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
812 x+j_coord_offsetC,x+j_coord_offsetD,
813 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
814 &jy2,&jz2,&jx3,&jy3,&jz3);
816 /* Calculate displacement vector */
817 dx00 = _mm256_sub_pd(ix0,jx0);
818 dy00 = _mm256_sub_pd(iy0,jy0);
819 dz00 = _mm256_sub_pd(iz0,jz0);
820 dx11 = _mm256_sub_pd(ix1,jx1);
821 dy11 = _mm256_sub_pd(iy1,jy1);
822 dz11 = _mm256_sub_pd(iz1,jz1);
823 dx12 = _mm256_sub_pd(ix1,jx2);
824 dy12 = _mm256_sub_pd(iy1,jy2);
825 dz12 = _mm256_sub_pd(iz1,jz2);
826 dx13 = _mm256_sub_pd(ix1,jx3);
827 dy13 = _mm256_sub_pd(iy1,jy3);
828 dz13 = _mm256_sub_pd(iz1,jz3);
829 dx21 = _mm256_sub_pd(ix2,jx1);
830 dy21 = _mm256_sub_pd(iy2,jy1);
831 dz21 = _mm256_sub_pd(iz2,jz1);
832 dx22 = _mm256_sub_pd(ix2,jx2);
833 dy22 = _mm256_sub_pd(iy2,jy2);
834 dz22 = _mm256_sub_pd(iz2,jz2);
835 dx23 = _mm256_sub_pd(ix2,jx3);
836 dy23 = _mm256_sub_pd(iy2,jy3);
837 dz23 = _mm256_sub_pd(iz2,jz3);
838 dx31 = _mm256_sub_pd(ix3,jx1);
839 dy31 = _mm256_sub_pd(iy3,jy1);
840 dz31 = _mm256_sub_pd(iz3,jz1);
841 dx32 = _mm256_sub_pd(ix3,jx2);
842 dy32 = _mm256_sub_pd(iy3,jy2);
843 dz32 = _mm256_sub_pd(iz3,jz2);
844 dx33 = _mm256_sub_pd(ix3,jx3);
845 dy33 = _mm256_sub_pd(iy3,jy3);
846 dz33 = _mm256_sub_pd(iz3,jz3);
848 /* Calculate squared distance and things based on it */
849 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
850 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
851 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
852 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
853 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
854 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
855 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
856 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
857 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
858 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
860 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
861 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
862 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
863 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
864 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
865 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
866 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
867 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
868 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
869 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
871 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
872 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
873 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
874 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
875 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
876 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
877 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
878 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
879 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
881 fjx0 = _mm256_setzero_pd();
882 fjy0 = _mm256_setzero_pd();
883 fjz0 = _mm256_setzero_pd();
884 fjx1 = _mm256_setzero_pd();
885 fjy1 = _mm256_setzero_pd();
886 fjz1 = _mm256_setzero_pd();
887 fjx2 = _mm256_setzero_pd();
888 fjy2 = _mm256_setzero_pd();
889 fjz2 = _mm256_setzero_pd();
890 fjx3 = _mm256_setzero_pd();
891 fjy3 = _mm256_setzero_pd();
892 fjz3 = _mm256_setzero_pd();
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 r00 = _mm256_mul_pd(rsq00,rinv00);
899 r00 = _mm256_andnot_pd(dummy_mask,r00);
901 /* Calculate table index by multiplying r with table scale and truncate to integer */
902 rt = _mm256_mul_pd(r00,vftabscale);
903 vfitab = _mm256_cvttpd_epi32(rt);
904 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
905 vfitab = _mm_slli_epi32(vfitab,3);
907 /* CUBIC SPLINE TABLE DISPERSION */
908 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
909 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
910 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
911 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
912 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
913 Heps = _mm256_mul_pd(vfeps,H);
914 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
915 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
916 vvdw6 = _mm256_mul_pd(c6_00,VV);
917 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
918 fvdw6 = _mm256_mul_pd(c6_00,FF);
920 /* CUBIC SPLINE TABLE REPULSION */
921 vfitab = _mm_add_epi32(vfitab,ifour);
922 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
923 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
924 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
925 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
926 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
927 Heps = _mm256_mul_pd(vfeps,H);
928 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
929 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
930 vvdw12 = _mm256_mul_pd(c12_00,VV);
931 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
932 fvdw12 = _mm256_mul_pd(c12_00,FF);
933 vvdw = _mm256_add_pd(vvdw12,vvdw6);
934 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
936 /* Update potential sum for this i atom from the interaction with this j atom. */
937 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
938 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
942 fscal = _mm256_andnot_pd(dummy_mask,fscal);
944 /* Calculate temporary vectorial force */
945 tx = _mm256_mul_pd(fscal,dx00);
946 ty = _mm256_mul_pd(fscal,dy00);
947 tz = _mm256_mul_pd(fscal,dz00);
949 /* Update vectorial force */
950 fix0 = _mm256_add_pd(fix0,tx);
951 fiy0 = _mm256_add_pd(fiy0,ty);
952 fiz0 = _mm256_add_pd(fiz0,tz);
954 fjx0 = _mm256_add_pd(fjx0,tx);
955 fjy0 = _mm256_add_pd(fjy0,ty);
956 fjz0 = _mm256_add_pd(fjz0,tz);
958 /**************************
959 * CALCULATE INTERACTIONS *
960 **************************/
962 r11 = _mm256_mul_pd(rsq11,rinv11);
963 r11 = _mm256_andnot_pd(dummy_mask,r11);
965 /* EWALD ELECTROSTATICS */
967 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
968 ewrt = _mm256_mul_pd(r11,ewtabscale);
969 ewitab = _mm256_cvttpd_epi32(ewrt);
970 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
971 ewitab = _mm_slli_epi32(ewitab,2);
972 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
973 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
974 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
975 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
976 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
977 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
978 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
979 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
980 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
982 /* Update potential sum for this i atom from the interaction with this j atom. */
983 velec = _mm256_andnot_pd(dummy_mask,velec);
984 velecsum = _mm256_add_pd(velecsum,velec);
988 fscal = _mm256_andnot_pd(dummy_mask,fscal);
990 /* Calculate temporary vectorial force */
991 tx = _mm256_mul_pd(fscal,dx11);
992 ty = _mm256_mul_pd(fscal,dy11);
993 tz = _mm256_mul_pd(fscal,dz11);
995 /* Update vectorial force */
996 fix1 = _mm256_add_pd(fix1,tx);
997 fiy1 = _mm256_add_pd(fiy1,ty);
998 fiz1 = _mm256_add_pd(fiz1,tz);
1000 fjx1 = _mm256_add_pd(fjx1,tx);
1001 fjy1 = _mm256_add_pd(fjy1,ty);
1002 fjz1 = _mm256_add_pd(fjz1,tz);
1004 /**************************
1005 * CALCULATE INTERACTIONS *
1006 **************************/
1008 r12 = _mm256_mul_pd(rsq12,rinv12);
1009 r12 = _mm256_andnot_pd(dummy_mask,r12);
1011 /* EWALD ELECTROSTATICS */
1013 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1014 ewrt = _mm256_mul_pd(r12,ewtabscale);
1015 ewitab = _mm256_cvttpd_epi32(ewrt);
1016 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1017 ewitab = _mm_slli_epi32(ewitab,2);
1018 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1019 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1020 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1021 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1022 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1023 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1024 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1025 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1026 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1028 /* Update potential sum for this i atom from the interaction with this j atom. */
1029 velec = _mm256_andnot_pd(dummy_mask,velec);
1030 velecsum = _mm256_add_pd(velecsum,velec);
1034 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1036 /* Calculate temporary vectorial force */
1037 tx = _mm256_mul_pd(fscal,dx12);
1038 ty = _mm256_mul_pd(fscal,dy12);
1039 tz = _mm256_mul_pd(fscal,dz12);
1041 /* Update vectorial force */
1042 fix1 = _mm256_add_pd(fix1,tx);
1043 fiy1 = _mm256_add_pd(fiy1,ty);
1044 fiz1 = _mm256_add_pd(fiz1,tz);
1046 fjx2 = _mm256_add_pd(fjx2,tx);
1047 fjy2 = _mm256_add_pd(fjy2,ty);
1048 fjz2 = _mm256_add_pd(fjz2,tz);
1050 /**************************
1051 * CALCULATE INTERACTIONS *
1052 **************************/
1054 r13 = _mm256_mul_pd(rsq13,rinv13);
1055 r13 = _mm256_andnot_pd(dummy_mask,r13);
1057 /* EWALD ELECTROSTATICS */
1059 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1060 ewrt = _mm256_mul_pd(r13,ewtabscale);
1061 ewitab = _mm256_cvttpd_epi32(ewrt);
1062 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1063 ewitab = _mm_slli_epi32(ewitab,2);
1064 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1065 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1066 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1067 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1068 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1069 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1070 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1071 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1072 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1074 /* Update potential sum for this i atom from the interaction with this j atom. */
1075 velec = _mm256_andnot_pd(dummy_mask,velec);
1076 velecsum = _mm256_add_pd(velecsum,velec);
1080 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1082 /* Calculate temporary vectorial force */
1083 tx = _mm256_mul_pd(fscal,dx13);
1084 ty = _mm256_mul_pd(fscal,dy13);
1085 tz = _mm256_mul_pd(fscal,dz13);
1087 /* Update vectorial force */
1088 fix1 = _mm256_add_pd(fix1,tx);
1089 fiy1 = _mm256_add_pd(fiy1,ty);
1090 fiz1 = _mm256_add_pd(fiz1,tz);
1092 fjx3 = _mm256_add_pd(fjx3,tx);
1093 fjy3 = _mm256_add_pd(fjy3,ty);
1094 fjz3 = _mm256_add_pd(fjz3,tz);
1096 /**************************
1097 * CALCULATE INTERACTIONS *
1098 **************************/
1100 r21 = _mm256_mul_pd(rsq21,rinv21);
1101 r21 = _mm256_andnot_pd(dummy_mask,r21);
1103 /* EWALD ELECTROSTATICS */
1105 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1106 ewrt = _mm256_mul_pd(r21,ewtabscale);
1107 ewitab = _mm256_cvttpd_epi32(ewrt);
1108 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1109 ewitab = _mm_slli_epi32(ewitab,2);
1110 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1111 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1112 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1113 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1114 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1115 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1116 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1117 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1118 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1120 /* Update potential sum for this i atom from the interaction with this j atom. */
1121 velec = _mm256_andnot_pd(dummy_mask,velec);
1122 velecsum = _mm256_add_pd(velecsum,velec);
1126 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1128 /* Calculate temporary vectorial force */
1129 tx = _mm256_mul_pd(fscal,dx21);
1130 ty = _mm256_mul_pd(fscal,dy21);
1131 tz = _mm256_mul_pd(fscal,dz21);
1133 /* Update vectorial force */
1134 fix2 = _mm256_add_pd(fix2,tx);
1135 fiy2 = _mm256_add_pd(fiy2,ty);
1136 fiz2 = _mm256_add_pd(fiz2,tz);
1138 fjx1 = _mm256_add_pd(fjx1,tx);
1139 fjy1 = _mm256_add_pd(fjy1,ty);
1140 fjz1 = _mm256_add_pd(fjz1,tz);
1142 /**************************
1143 * CALCULATE INTERACTIONS *
1144 **************************/
1146 r22 = _mm256_mul_pd(rsq22,rinv22);
1147 r22 = _mm256_andnot_pd(dummy_mask,r22);
1149 /* EWALD ELECTROSTATICS */
1151 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1152 ewrt = _mm256_mul_pd(r22,ewtabscale);
1153 ewitab = _mm256_cvttpd_epi32(ewrt);
1154 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1155 ewitab = _mm_slli_epi32(ewitab,2);
1156 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1157 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1158 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1159 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1160 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1161 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1162 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1163 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1164 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1166 /* Update potential sum for this i atom from the interaction with this j atom. */
1167 velec = _mm256_andnot_pd(dummy_mask,velec);
1168 velecsum = _mm256_add_pd(velecsum,velec);
1172 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1174 /* Calculate temporary vectorial force */
1175 tx = _mm256_mul_pd(fscal,dx22);
1176 ty = _mm256_mul_pd(fscal,dy22);
1177 tz = _mm256_mul_pd(fscal,dz22);
1179 /* Update vectorial force */
1180 fix2 = _mm256_add_pd(fix2,tx);
1181 fiy2 = _mm256_add_pd(fiy2,ty);
1182 fiz2 = _mm256_add_pd(fiz2,tz);
1184 fjx2 = _mm256_add_pd(fjx2,tx);
1185 fjy2 = _mm256_add_pd(fjy2,ty);
1186 fjz2 = _mm256_add_pd(fjz2,tz);
1188 /**************************
1189 * CALCULATE INTERACTIONS *
1190 **************************/
1192 r23 = _mm256_mul_pd(rsq23,rinv23);
1193 r23 = _mm256_andnot_pd(dummy_mask,r23);
1195 /* EWALD ELECTROSTATICS */
1197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1198 ewrt = _mm256_mul_pd(r23,ewtabscale);
1199 ewitab = _mm256_cvttpd_epi32(ewrt);
1200 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1201 ewitab = _mm_slli_epi32(ewitab,2);
1202 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1203 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1204 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1205 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1206 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1207 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1208 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1209 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1210 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1212 /* Update potential sum for this i atom from the interaction with this j atom. */
1213 velec = _mm256_andnot_pd(dummy_mask,velec);
1214 velecsum = _mm256_add_pd(velecsum,velec);
1218 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1220 /* Calculate temporary vectorial force */
1221 tx = _mm256_mul_pd(fscal,dx23);
1222 ty = _mm256_mul_pd(fscal,dy23);
1223 tz = _mm256_mul_pd(fscal,dz23);
1225 /* Update vectorial force */
1226 fix2 = _mm256_add_pd(fix2,tx);
1227 fiy2 = _mm256_add_pd(fiy2,ty);
1228 fiz2 = _mm256_add_pd(fiz2,tz);
1230 fjx3 = _mm256_add_pd(fjx3,tx);
1231 fjy3 = _mm256_add_pd(fjy3,ty);
1232 fjz3 = _mm256_add_pd(fjz3,tz);
1234 /**************************
1235 * CALCULATE INTERACTIONS *
1236 **************************/
1238 r31 = _mm256_mul_pd(rsq31,rinv31);
1239 r31 = _mm256_andnot_pd(dummy_mask,r31);
1241 /* EWALD ELECTROSTATICS */
1243 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1244 ewrt = _mm256_mul_pd(r31,ewtabscale);
1245 ewitab = _mm256_cvttpd_epi32(ewrt);
1246 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1247 ewitab = _mm_slli_epi32(ewitab,2);
1248 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1249 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1250 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1251 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1252 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1253 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1254 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1255 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1256 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1258 /* Update potential sum for this i atom from the interaction with this j atom. */
1259 velec = _mm256_andnot_pd(dummy_mask,velec);
1260 velecsum = _mm256_add_pd(velecsum,velec);
1264 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1266 /* Calculate temporary vectorial force */
1267 tx = _mm256_mul_pd(fscal,dx31);
1268 ty = _mm256_mul_pd(fscal,dy31);
1269 tz = _mm256_mul_pd(fscal,dz31);
1271 /* Update vectorial force */
1272 fix3 = _mm256_add_pd(fix3,tx);
1273 fiy3 = _mm256_add_pd(fiy3,ty);
1274 fiz3 = _mm256_add_pd(fiz3,tz);
1276 fjx1 = _mm256_add_pd(fjx1,tx);
1277 fjy1 = _mm256_add_pd(fjy1,ty);
1278 fjz1 = _mm256_add_pd(fjz1,tz);
1280 /**************************
1281 * CALCULATE INTERACTIONS *
1282 **************************/
1284 r32 = _mm256_mul_pd(rsq32,rinv32);
1285 r32 = _mm256_andnot_pd(dummy_mask,r32);
1287 /* EWALD ELECTROSTATICS */
1289 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1290 ewrt = _mm256_mul_pd(r32,ewtabscale);
1291 ewitab = _mm256_cvttpd_epi32(ewrt);
1292 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1293 ewitab = _mm_slli_epi32(ewitab,2);
1294 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1295 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1296 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1297 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1298 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1299 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1300 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1301 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1302 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1304 /* Update potential sum for this i atom from the interaction with this j atom. */
1305 velec = _mm256_andnot_pd(dummy_mask,velec);
1306 velecsum = _mm256_add_pd(velecsum,velec);
1310 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1312 /* Calculate temporary vectorial force */
1313 tx = _mm256_mul_pd(fscal,dx32);
1314 ty = _mm256_mul_pd(fscal,dy32);
1315 tz = _mm256_mul_pd(fscal,dz32);
1317 /* Update vectorial force */
1318 fix3 = _mm256_add_pd(fix3,tx);
1319 fiy3 = _mm256_add_pd(fiy3,ty);
1320 fiz3 = _mm256_add_pd(fiz3,tz);
1322 fjx2 = _mm256_add_pd(fjx2,tx);
1323 fjy2 = _mm256_add_pd(fjy2,ty);
1324 fjz2 = _mm256_add_pd(fjz2,tz);
1326 /**************************
1327 * CALCULATE INTERACTIONS *
1328 **************************/
1330 r33 = _mm256_mul_pd(rsq33,rinv33);
1331 r33 = _mm256_andnot_pd(dummy_mask,r33);
1333 /* EWALD ELECTROSTATICS */
1335 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1336 ewrt = _mm256_mul_pd(r33,ewtabscale);
1337 ewitab = _mm256_cvttpd_epi32(ewrt);
1338 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1339 ewitab = _mm_slli_epi32(ewitab,2);
1340 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1341 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1342 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1343 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1344 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1345 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1346 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1347 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1348 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1350 /* Update potential sum for this i atom from the interaction with this j atom. */
1351 velec = _mm256_andnot_pd(dummy_mask,velec);
1352 velecsum = _mm256_add_pd(velecsum,velec);
1356 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1358 /* Calculate temporary vectorial force */
1359 tx = _mm256_mul_pd(fscal,dx33);
1360 ty = _mm256_mul_pd(fscal,dy33);
1361 tz = _mm256_mul_pd(fscal,dz33);
1363 /* Update vectorial force */
1364 fix3 = _mm256_add_pd(fix3,tx);
1365 fiy3 = _mm256_add_pd(fiy3,ty);
1366 fiz3 = _mm256_add_pd(fiz3,tz);
1368 fjx3 = _mm256_add_pd(fjx3,tx);
1369 fjy3 = _mm256_add_pd(fjy3,ty);
1370 fjz3 = _mm256_add_pd(fjz3,tz);
1372 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1373 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1374 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1375 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1377 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1378 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1379 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1381 /* Inner loop uses 438 flops */
1384 /* End of innermost loop */
1386 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1387 f+i_coord_offset,fshift+i_shift_offset);
1390 /* Update potential energies */
1391 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1392 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1394 /* Increment number of inner iterations */
1395 inneriter += j_index_end - j_index_start;
1397 /* Outer loop uses 26 flops */
1400 /* Increment number of outer iterations */
1403 /* Update outer/inner flops */
1405 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*438);
1408 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double
1409 * Electrostatics interaction: Ewald
1410 * VdW interaction: CubicSplineTable
1411 * Geometry: Water4-Water4
1412 * Calculate force/pot: Force
1415 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double
1416 (t_nblist * gmx_restrict nlist,
1417 rvec * gmx_restrict xx,
1418 rvec * gmx_restrict ff,
1419 t_forcerec * gmx_restrict fr,
1420 t_mdatoms * gmx_restrict mdatoms,
1421 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1422 t_nrnb * gmx_restrict nrnb)
1424 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1425 * just 0 for non-waters.
1426 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1427 * jnr indices corresponding to data put in the four positions in the SIMD register.
1429 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1430 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1431 int jnrA,jnrB,jnrC,jnrD;
1432 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1433 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1434 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1435 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1436 real rcutoff_scalar;
1437 real *shiftvec,*fshift,*x,*f;
1438 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1439 real scratch[4*DIM];
1440 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1441 real * vdwioffsetptr0;
1442 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1443 real * vdwioffsetptr1;
1444 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1445 real * vdwioffsetptr2;
1446 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1447 real * vdwioffsetptr3;
1448 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1449 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1450 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1451 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1452 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1453 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1454 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1455 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1456 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1457 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1458 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1459 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1460 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1461 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1462 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1463 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1464 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1465 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1466 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1467 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1470 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1473 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1474 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1476 __m128i ifour = _mm_set1_epi32(4);
1477 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1480 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1481 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1483 __m256d dummy_mask,cutoff_mask;
1484 __m128 tmpmask0,tmpmask1;
1485 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1486 __m256d one = _mm256_set1_pd(1.0);
1487 __m256d two = _mm256_set1_pd(2.0);
1493 jindex = nlist->jindex;
1495 shiftidx = nlist->shift;
1497 shiftvec = fr->shift_vec[0];
1498 fshift = fr->fshift[0];
1499 facel = _mm256_set1_pd(fr->epsfac);
1500 charge = mdatoms->chargeA;
1501 nvdwtype = fr->ntype;
1502 vdwparam = fr->nbfp;
1503 vdwtype = mdatoms->typeA;
1505 vftab = kernel_data->table_vdw->data;
1506 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
1508 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1509 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1510 beta2 = _mm256_mul_pd(beta,beta);
1511 beta3 = _mm256_mul_pd(beta,beta2);
1513 ewtab = fr->ic->tabq_coul_F;
1514 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1515 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1517 /* Setup water-specific parameters */
1518 inr = nlist->iinr[0];
1519 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1520 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1521 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1522 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1524 jq1 = _mm256_set1_pd(charge[inr+1]);
1525 jq2 = _mm256_set1_pd(charge[inr+2]);
1526 jq3 = _mm256_set1_pd(charge[inr+3]);
1527 vdwjidx0A = 2*vdwtype[inr+0];
1528 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1529 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1530 qq11 = _mm256_mul_pd(iq1,jq1);
1531 qq12 = _mm256_mul_pd(iq1,jq2);
1532 qq13 = _mm256_mul_pd(iq1,jq3);
1533 qq21 = _mm256_mul_pd(iq2,jq1);
1534 qq22 = _mm256_mul_pd(iq2,jq2);
1535 qq23 = _mm256_mul_pd(iq2,jq3);
1536 qq31 = _mm256_mul_pd(iq3,jq1);
1537 qq32 = _mm256_mul_pd(iq3,jq2);
1538 qq33 = _mm256_mul_pd(iq3,jq3);
1540 /* Avoid stupid compiler warnings */
1541 jnrA = jnrB = jnrC = jnrD = 0;
1542 j_coord_offsetA = 0;
1543 j_coord_offsetB = 0;
1544 j_coord_offsetC = 0;
1545 j_coord_offsetD = 0;
1550 for(iidx=0;iidx<4*DIM;iidx++)
1552 scratch[iidx] = 0.0;
1555 /* Start outer loop over neighborlists */
1556 for(iidx=0; iidx<nri; iidx++)
1558 /* Load shift vector for this list */
1559 i_shift_offset = DIM*shiftidx[iidx];
1561 /* Load limits for loop over neighbors */
1562 j_index_start = jindex[iidx];
1563 j_index_end = jindex[iidx+1];
1565 /* Get outer coordinate index */
1567 i_coord_offset = DIM*inr;
1569 /* Load i particle coords and add shift vector */
1570 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1571 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1573 fix0 = _mm256_setzero_pd();
1574 fiy0 = _mm256_setzero_pd();
1575 fiz0 = _mm256_setzero_pd();
1576 fix1 = _mm256_setzero_pd();
1577 fiy1 = _mm256_setzero_pd();
1578 fiz1 = _mm256_setzero_pd();
1579 fix2 = _mm256_setzero_pd();
1580 fiy2 = _mm256_setzero_pd();
1581 fiz2 = _mm256_setzero_pd();
1582 fix3 = _mm256_setzero_pd();
1583 fiy3 = _mm256_setzero_pd();
1584 fiz3 = _mm256_setzero_pd();
1586 /* Start inner kernel loop */
1587 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1590 /* Get j neighbor index, and coordinate index */
1592 jnrB = jjnr[jidx+1];
1593 jnrC = jjnr[jidx+2];
1594 jnrD = jjnr[jidx+3];
1595 j_coord_offsetA = DIM*jnrA;
1596 j_coord_offsetB = DIM*jnrB;
1597 j_coord_offsetC = DIM*jnrC;
1598 j_coord_offsetD = DIM*jnrD;
1600 /* load j atom coordinates */
1601 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1602 x+j_coord_offsetC,x+j_coord_offsetD,
1603 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1604 &jy2,&jz2,&jx3,&jy3,&jz3);
1606 /* Calculate displacement vector */
1607 dx00 = _mm256_sub_pd(ix0,jx0);
1608 dy00 = _mm256_sub_pd(iy0,jy0);
1609 dz00 = _mm256_sub_pd(iz0,jz0);
1610 dx11 = _mm256_sub_pd(ix1,jx1);
1611 dy11 = _mm256_sub_pd(iy1,jy1);
1612 dz11 = _mm256_sub_pd(iz1,jz1);
1613 dx12 = _mm256_sub_pd(ix1,jx2);
1614 dy12 = _mm256_sub_pd(iy1,jy2);
1615 dz12 = _mm256_sub_pd(iz1,jz2);
1616 dx13 = _mm256_sub_pd(ix1,jx3);
1617 dy13 = _mm256_sub_pd(iy1,jy3);
1618 dz13 = _mm256_sub_pd(iz1,jz3);
1619 dx21 = _mm256_sub_pd(ix2,jx1);
1620 dy21 = _mm256_sub_pd(iy2,jy1);
1621 dz21 = _mm256_sub_pd(iz2,jz1);
1622 dx22 = _mm256_sub_pd(ix2,jx2);
1623 dy22 = _mm256_sub_pd(iy2,jy2);
1624 dz22 = _mm256_sub_pd(iz2,jz2);
1625 dx23 = _mm256_sub_pd(ix2,jx3);
1626 dy23 = _mm256_sub_pd(iy2,jy3);
1627 dz23 = _mm256_sub_pd(iz2,jz3);
1628 dx31 = _mm256_sub_pd(ix3,jx1);
1629 dy31 = _mm256_sub_pd(iy3,jy1);
1630 dz31 = _mm256_sub_pd(iz3,jz1);
1631 dx32 = _mm256_sub_pd(ix3,jx2);
1632 dy32 = _mm256_sub_pd(iy3,jy2);
1633 dz32 = _mm256_sub_pd(iz3,jz2);
1634 dx33 = _mm256_sub_pd(ix3,jx3);
1635 dy33 = _mm256_sub_pd(iy3,jy3);
1636 dz33 = _mm256_sub_pd(iz3,jz3);
1638 /* Calculate squared distance and things based on it */
1639 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1640 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1641 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1642 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1643 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1644 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1645 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1646 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1647 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1648 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1650 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1651 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1652 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1653 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1654 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1655 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1656 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1657 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1658 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1659 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1661 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1662 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1663 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1664 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1665 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1666 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1667 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1668 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1669 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1671 fjx0 = _mm256_setzero_pd();
1672 fjy0 = _mm256_setzero_pd();
1673 fjz0 = _mm256_setzero_pd();
1674 fjx1 = _mm256_setzero_pd();
1675 fjy1 = _mm256_setzero_pd();
1676 fjz1 = _mm256_setzero_pd();
1677 fjx2 = _mm256_setzero_pd();
1678 fjy2 = _mm256_setzero_pd();
1679 fjz2 = _mm256_setzero_pd();
1680 fjx3 = _mm256_setzero_pd();
1681 fjy3 = _mm256_setzero_pd();
1682 fjz3 = _mm256_setzero_pd();
1684 /**************************
1685 * CALCULATE INTERACTIONS *
1686 **************************/
1688 r00 = _mm256_mul_pd(rsq00,rinv00);
1690 /* Calculate table index by multiplying r with table scale and truncate to integer */
1691 rt = _mm256_mul_pd(r00,vftabscale);
1692 vfitab = _mm256_cvttpd_epi32(rt);
1693 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1694 vfitab = _mm_slli_epi32(vfitab,3);
1696 /* CUBIC SPLINE TABLE DISPERSION */
1697 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1698 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1699 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1700 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1701 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1702 Heps = _mm256_mul_pd(vfeps,H);
1703 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1704 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1705 fvdw6 = _mm256_mul_pd(c6_00,FF);
1707 /* CUBIC SPLINE TABLE REPULSION */
1708 vfitab = _mm_add_epi32(vfitab,ifour);
1709 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1710 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1711 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1712 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1713 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1714 Heps = _mm256_mul_pd(vfeps,H);
1715 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1716 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1717 fvdw12 = _mm256_mul_pd(c12_00,FF);
1718 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1722 /* Calculate temporary vectorial force */
1723 tx = _mm256_mul_pd(fscal,dx00);
1724 ty = _mm256_mul_pd(fscal,dy00);
1725 tz = _mm256_mul_pd(fscal,dz00);
1727 /* Update vectorial force */
1728 fix0 = _mm256_add_pd(fix0,tx);
1729 fiy0 = _mm256_add_pd(fiy0,ty);
1730 fiz0 = _mm256_add_pd(fiz0,tz);
1732 fjx0 = _mm256_add_pd(fjx0,tx);
1733 fjy0 = _mm256_add_pd(fjy0,ty);
1734 fjz0 = _mm256_add_pd(fjz0,tz);
1736 /**************************
1737 * CALCULATE INTERACTIONS *
1738 **************************/
1740 r11 = _mm256_mul_pd(rsq11,rinv11);
1742 /* EWALD ELECTROSTATICS */
1744 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1745 ewrt = _mm256_mul_pd(r11,ewtabscale);
1746 ewitab = _mm256_cvttpd_epi32(ewrt);
1747 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1748 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1749 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1751 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1752 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1756 /* Calculate temporary vectorial force */
1757 tx = _mm256_mul_pd(fscal,dx11);
1758 ty = _mm256_mul_pd(fscal,dy11);
1759 tz = _mm256_mul_pd(fscal,dz11);
1761 /* Update vectorial force */
1762 fix1 = _mm256_add_pd(fix1,tx);
1763 fiy1 = _mm256_add_pd(fiy1,ty);
1764 fiz1 = _mm256_add_pd(fiz1,tz);
1766 fjx1 = _mm256_add_pd(fjx1,tx);
1767 fjy1 = _mm256_add_pd(fjy1,ty);
1768 fjz1 = _mm256_add_pd(fjz1,tz);
1770 /**************************
1771 * CALCULATE INTERACTIONS *
1772 **************************/
1774 r12 = _mm256_mul_pd(rsq12,rinv12);
1776 /* EWALD ELECTROSTATICS */
1778 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1779 ewrt = _mm256_mul_pd(r12,ewtabscale);
1780 ewitab = _mm256_cvttpd_epi32(ewrt);
1781 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1782 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1783 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1785 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1786 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1790 /* Calculate temporary vectorial force */
1791 tx = _mm256_mul_pd(fscal,dx12);
1792 ty = _mm256_mul_pd(fscal,dy12);
1793 tz = _mm256_mul_pd(fscal,dz12);
1795 /* Update vectorial force */
1796 fix1 = _mm256_add_pd(fix1,tx);
1797 fiy1 = _mm256_add_pd(fiy1,ty);
1798 fiz1 = _mm256_add_pd(fiz1,tz);
1800 fjx2 = _mm256_add_pd(fjx2,tx);
1801 fjy2 = _mm256_add_pd(fjy2,ty);
1802 fjz2 = _mm256_add_pd(fjz2,tz);
1804 /**************************
1805 * CALCULATE INTERACTIONS *
1806 **************************/
1808 r13 = _mm256_mul_pd(rsq13,rinv13);
1810 /* EWALD ELECTROSTATICS */
1812 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1813 ewrt = _mm256_mul_pd(r13,ewtabscale);
1814 ewitab = _mm256_cvttpd_epi32(ewrt);
1815 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1816 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1817 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1819 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1820 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1824 /* Calculate temporary vectorial force */
1825 tx = _mm256_mul_pd(fscal,dx13);
1826 ty = _mm256_mul_pd(fscal,dy13);
1827 tz = _mm256_mul_pd(fscal,dz13);
1829 /* Update vectorial force */
1830 fix1 = _mm256_add_pd(fix1,tx);
1831 fiy1 = _mm256_add_pd(fiy1,ty);
1832 fiz1 = _mm256_add_pd(fiz1,tz);
1834 fjx3 = _mm256_add_pd(fjx3,tx);
1835 fjy3 = _mm256_add_pd(fjy3,ty);
1836 fjz3 = _mm256_add_pd(fjz3,tz);
1838 /**************************
1839 * CALCULATE INTERACTIONS *
1840 **************************/
1842 r21 = _mm256_mul_pd(rsq21,rinv21);
1844 /* EWALD ELECTROSTATICS */
1846 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1847 ewrt = _mm256_mul_pd(r21,ewtabscale);
1848 ewitab = _mm256_cvttpd_epi32(ewrt);
1849 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1850 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1851 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1853 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1854 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1858 /* Calculate temporary vectorial force */
1859 tx = _mm256_mul_pd(fscal,dx21);
1860 ty = _mm256_mul_pd(fscal,dy21);
1861 tz = _mm256_mul_pd(fscal,dz21);
1863 /* Update vectorial force */
1864 fix2 = _mm256_add_pd(fix2,tx);
1865 fiy2 = _mm256_add_pd(fiy2,ty);
1866 fiz2 = _mm256_add_pd(fiz2,tz);
1868 fjx1 = _mm256_add_pd(fjx1,tx);
1869 fjy1 = _mm256_add_pd(fjy1,ty);
1870 fjz1 = _mm256_add_pd(fjz1,tz);
1872 /**************************
1873 * CALCULATE INTERACTIONS *
1874 **************************/
1876 r22 = _mm256_mul_pd(rsq22,rinv22);
1878 /* EWALD ELECTROSTATICS */
1880 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1881 ewrt = _mm256_mul_pd(r22,ewtabscale);
1882 ewitab = _mm256_cvttpd_epi32(ewrt);
1883 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1884 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1885 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1887 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1888 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1892 /* Calculate temporary vectorial force */
1893 tx = _mm256_mul_pd(fscal,dx22);
1894 ty = _mm256_mul_pd(fscal,dy22);
1895 tz = _mm256_mul_pd(fscal,dz22);
1897 /* Update vectorial force */
1898 fix2 = _mm256_add_pd(fix2,tx);
1899 fiy2 = _mm256_add_pd(fiy2,ty);
1900 fiz2 = _mm256_add_pd(fiz2,tz);
1902 fjx2 = _mm256_add_pd(fjx2,tx);
1903 fjy2 = _mm256_add_pd(fjy2,ty);
1904 fjz2 = _mm256_add_pd(fjz2,tz);
1906 /**************************
1907 * CALCULATE INTERACTIONS *
1908 **************************/
1910 r23 = _mm256_mul_pd(rsq23,rinv23);
1912 /* EWALD ELECTROSTATICS */
1914 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1915 ewrt = _mm256_mul_pd(r23,ewtabscale);
1916 ewitab = _mm256_cvttpd_epi32(ewrt);
1917 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1918 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1919 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1921 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1922 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1926 /* Calculate temporary vectorial force */
1927 tx = _mm256_mul_pd(fscal,dx23);
1928 ty = _mm256_mul_pd(fscal,dy23);
1929 tz = _mm256_mul_pd(fscal,dz23);
1931 /* Update vectorial force */
1932 fix2 = _mm256_add_pd(fix2,tx);
1933 fiy2 = _mm256_add_pd(fiy2,ty);
1934 fiz2 = _mm256_add_pd(fiz2,tz);
1936 fjx3 = _mm256_add_pd(fjx3,tx);
1937 fjy3 = _mm256_add_pd(fjy3,ty);
1938 fjz3 = _mm256_add_pd(fjz3,tz);
1940 /**************************
1941 * CALCULATE INTERACTIONS *
1942 **************************/
1944 r31 = _mm256_mul_pd(rsq31,rinv31);
1946 /* EWALD ELECTROSTATICS */
1948 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1949 ewrt = _mm256_mul_pd(r31,ewtabscale);
1950 ewitab = _mm256_cvttpd_epi32(ewrt);
1951 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1952 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1953 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1955 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1956 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1960 /* Calculate temporary vectorial force */
1961 tx = _mm256_mul_pd(fscal,dx31);
1962 ty = _mm256_mul_pd(fscal,dy31);
1963 tz = _mm256_mul_pd(fscal,dz31);
1965 /* Update vectorial force */
1966 fix3 = _mm256_add_pd(fix3,tx);
1967 fiy3 = _mm256_add_pd(fiy3,ty);
1968 fiz3 = _mm256_add_pd(fiz3,tz);
1970 fjx1 = _mm256_add_pd(fjx1,tx);
1971 fjy1 = _mm256_add_pd(fjy1,ty);
1972 fjz1 = _mm256_add_pd(fjz1,tz);
1974 /**************************
1975 * CALCULATE INTERACTIONS *
1976 **************************/
1978 r32 = _mm256_mul_pd(rsq32,rinv32);
1980 /* EWALD ELECTROSTATICS */
1982 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1983 ewrt = _mm256_mul_pd(r32,ewtabscale);
1984 ewitab = _mm256_cvttpd_epi32(ewrt);
1985 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1986 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1987 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1989 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1990 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1994 /* Calculate temporary vectorial force */
1995 tx = _mm256_mul_pd(fscal,dx32);
1996 ty = _mm256_mul_pd(fscal,dy32);
1997 tz = _mm256_mul_pd(fscal,dz32);
1999 /* Update vectorial force */
2000 fix3 = _mm256_add_pd(fix3,tx);
2001 fiy3 = _mm256_add_pd(fiy3,ty);
2002 fiz3 = _mm256_add_pd(fiz3,tz);
2004 fjx2 = _mm256_add_pd(fjx2,tx);
2005 fjy2 = _mm256_add_pd(fjy2,ty);
2006 fjz2 = _mm256_add_pd(fjz2,tz);
2008 /**************************
2009 * CALCULATE INTERACTIONS *
2010 **************************/
2012 r33 = _mm256_mul_pd(rsq33,rinv33);
2014 /* EWALD ELECTROSTATICS */
2016 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2017 ewrt = _mm256_mul_pd(r33,ewtabscale);
2018 ewitab = _mm256_cvttpd_epi32(ewrt);
2019 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2020 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2021 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2023 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2024 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2028 /* Calculate temporary vectorial force */
2029 tx = _mm256_mul_pd(fscal,dx33);
2030 ty = _mm256_mul_pd(fscal,dy33);
2031 tz = _mm256_mul_pd(fscal,dz33);
2033 /* Update vectorial force */
2034 fix3 = _mm256_add_pd(fix3,tx);
2035 fiy3 = _mm256_add_pd(fiy3,ty);
2036 fiz3 = _mm256_add_pd(fiz3,tz);
2038 fjx3 = _mm256_add_pd(fjx3,tx);
2039 fjy3 = _mm256_add_pd(fjy3,ty);
2040 fjz3 = _mm256_add_pd(fjz3,tz);
2042 fjptrA = f+j_coord_offsetA;
2043 fjptrB = f+j_coord_offsetB;
2044 fjptrC = f+j_coord_offsetC;
2045 fjptrD = f+j_coord_offsetD;
2047 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2048 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2049 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2051 /* Inner loop uses 375 flops */
2054 if(jidx<j_index_end)
2057 /* Get j neighbor index, and coordinate index */
2058 jnrlistA = jjnr[jidx];
2059 jnrlistB = jjnr[jidx+1];
2060 jnrlistC = jjnr[jidx+2];
2061 jnrlistD = jjnr[jidx+3];
2062 /* Sign of each element will be negative for non-real atoms.
2063 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2064 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2066 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2068 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2069 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2070 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2072 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2073 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2074 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2075 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2076 j_coord_offsetA = DIM*jnrA;
2077 j_coord_offsetB = DIM*jnrB;
2078 j_coord_offsetC = DIM*jnrC;
2079 j_coord_offsetD = DIM*jnrD;
2081 /* load j atom coordinates */
2082 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2083 x+j_coord_offsetC,x+j_coord_offsetD,
2084 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2085 &jy2,&jz2,&jx3,&jy3,&jz3);
2087 /* Calculate displacement vector */
2088 dx00 = _mm256_sub_pd(ix0,jx0);
2089 dy00 = _mm256_sub_pd(iy0,jy0);
2090 dz00 = _mm256_sub_pd(iz0,jz0);
2091 dx11 = _mm256_sub_pd(ix1,jx1);
2092 dy11 = _mm256_sub_pd(iy1,jy1);
2093 dz11 = _mm256_sub_pd(iz1,jz1);
2094 dx12 = _mm256_sub_pd(ix1,jx2);
2095 dy12 = _mm256_sub_pd(iy1,jy2);
2096 dz12 = _mm256_sub_pd(iz1,jz2);
2097 dx13 = _mm256_sub_pd(ix1,jx3);
2098 dy13 = _mm256_sub_pd(iy1,jy3);
2099 dz13 = _mm256_sub_pd(iz1,jz3);
2100 dx21 = _mm256_sub_pd(ix2,jx1);
2101 dy21 = _mm256_sub_pd(iy2,jy1);
2102 dz21 = _mm256_sub_pd(iz2,jz1);
2103 dx22 = _mm256_sub_pd(ix2,jx2);
2104 dy22 = _mm256_sub_pd(iy2,jy2);
2105 dz22 = _mm256_sub_pd(iz2,jz2);
2106 dx23 = _mm256_sub_pd(ix2,jx3);
2107 dy23 = _mm256_sub_pd(iy2,jy3);
2108 dz23 = _mm256_sub_pd(iz2,jz3);
2109 dx31 = _mm256_sub_pd(ix3,jx1);
2110 dy31 = _mm256_sub_pd(iy3,jy1);
2111 dz31 = _mm256_sub_pd(iz3,jz1);
2112 dx32 = _mm256_sub_pd(ix3,jx2);
2113 dy32 = _mm256_sub_pd(iy3,jy2);
2114 dz32 = _mm256_sub_pd(iz3,jz2);
2115 dx33 = _mm256_sub_pd(ix3,jx3);
2116 dy33 = _mm256_sub_pd(iy3,jy3);
2117 dz33 = _mm256_sub_pd(iz3,jz3);
2119 /* Calculate squared distance and things based on it */
2120 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2121 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2122 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2123 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2124 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2125 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2126 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2127 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2128 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2129 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2131 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2132 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2133 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2134 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2135 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2136 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2137 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2138 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2139 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2140 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2142 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2143 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2144 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2145 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2146 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2147 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2148 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2149 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2150 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2152 fjx0 = _mm256_setzero_pd();
2153 fjy0 = _mm256_setzero_pd();
2154 fjz0 = _mm256_setzero_pd();
2155 fjx1 = _mm256_setzero_pd();
2156 fjy1 = _mm256_setzero_pd();
2157 fjz1 = _mm256_setzero_pd();
2158 fjx2 = _mm256_setzero_pd();
2159 fjy2 = _mm256_setzero_pd();
2160 fjz2 = _mm256_setzero_pd();
2161 fjx3 = _mm256_setzero_pd();
2162 fjy3 = _mm256_setzero_pd();
2163 fjz3 = _mm256_setzero_pd();
2165 /**************************
2166 * CALCULATE INTERACTIONS *
2167 **************************/
2169 r00 = _mm256_mul_pd(rsq00,rinv00);
2170 r00 = _mm256_andnot_pd(dummy_mask,r00);
2172 /* Calculate table index by multiplying r with table scale and truncate to integer */
2173 rt = _mm256_mul_pd(r00,vftabscale);
2174 vfitab = _mm256_cvttpd_epi32(rt);
2175 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2176 vfitab = _mm_slli_epi32(vfitab,3);
2178 /* CUBIC SPLINE TABLE DISPERSION */
2179 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2180 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2181 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2182 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2183 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2184 Heps = _mm256_mul_pd(vfeps,H);
2185 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2186 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2187 fvdw6 = _mm256_mul_pd(c6_00,FF);
2189 /* CUBIC SPLINE TABLE REPULSION */
2190 vfitab = _mm_add_epi32(vfitab,ifour);
2191 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2192 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2193 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2194 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2195 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2196 Heps = _mm256_mul_pd(vfeps,H);
2197 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2198 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2199 fvdw12 = _mm256_mul_pd(c12_00,FF);
2200 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2204 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2206 /* Calculate temporary vectorial force */
2207 tx = _mm256_mul_pd(fscal,dx00);
2208 ty = _mm256_mul_pd(fscal,dy00);
2209 tz = _mm256_mul_pd(fscal,dz00);
2211 /* Update vectorial force */
2212 fix0 = _mm256_add_pd(fix0,tx);
2213 fiy0 = _mm256_add_pd(fiy0,ty);
2214 fiz0 = _mm256_add_pd(fiz0,tz);
2216 fjx0 = _mm256_add_pd(fjx0,tx);
2217 fjy0 = _mm256_add_pd(fjy0,ty);
2218 fjz0 = _mm256_add_pd(fjz0,tz);
2220 /**************************
2221 * CALCULATE INTERACTIONS *
2222 **************************/
2224 r11 = _mm256_mul_pd(rsq11,rinv11);
2225 r11 = _mm256_andnot_pd(dummy_mask,r11);
2227 /* EWALD ELECTROSTATICS */
2229 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2230 ewrt = _mm256_mul_pd(r11,ewtabscale);
2231 ewitab = _mm256_cvttpd_epi32(ewrt);
2232 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2233 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2234 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2236 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2237 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2241 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2243 /* Calculate temporary vectorial force */
2244 tx = _mm256_mul_pd(fscal,dx11);
2245 ty = _mm256_mul_pd(fscal,dy11);
2246 tz = _mm256_mul_pd(fscal,dz11);
2248 /* Update vectorial force */
2249 fix1 = _mm256_add_pd(fix1,tx);
2250 fiy1 = _mm256_add_pd(fiy1,ty);
2251 fiz1 = _mm256_add_pd(fiz1,tz);
2253 fjx1 = _mm256_add_pd(fjx1,tx);
2254 fjy1 = _mm256_add_pd(fjy1,ty);
2255 fjz1 = _mm256_add_pd(fjz1,tz);
2257 /**************************
2258 * CALCULATE INTERACTIONS *
2259 **************************/
2261 r12 = _mm256_mul_pd(rsq12,rinv12);
2262 r12 = _mm256_andnot_pd(dummy_mask,r12);
2264 /* EWALD ELECTROSTATICS */
2266 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2267 ewrt = _mm256_mul_pd(r12,ewtabscale);
2268 ewitab = _mm256_cvttpd_epi32(ewrt);
2269 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2270 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2271 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2273 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2274 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2278 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2280 /* Calculate temporary vectorial force */
2281 tx = _mm256_mul_pd(fscal,dx12);
2282 ty = _mm256_mul_pd(fscal,dy12);
2283 tz = _mm256_mul_pd(fscal,dz12);
2285 /* Update vectorial force */
2286 fix1 = _mm256_add_pd(fix1,tx);
2287 fiy1 = _mm256_add_pd(fiy1,ty);
2288 fiz1 = _mm256_add_pd(fiz1,tz);
2290 fjx2 = _mm256_add_pd(fjx2,tx);
2291 fjy2 = _mm256_add_pd(fjy2,ty);
2292 fjz2 = _mm256_add_pd(fjz2,tz);
2294 /**************************
2295 * CALCULATE INTERACTIONS *
2296 **************************/
2298 r13 = _mm256_mul_pd(rsq13,rinv13);
2299 r13 = _mm256_andnot_pd(dummy_mask,r13);
2301 /* EWALD ELECTROSTATICS */
2303 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2304 ewrt = _mm256_mul_pd(r13,ewtabscale);
2305 ewitab = _mm256_cvttpd_epi32(ewrt);
2306 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2307 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2308 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2310 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2311 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2315 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2317 /* Calculate temporary vectorial force */
2318 tx = _mm256_mul_pd(fscal,dx13);
2319 ty = _mm256_mul_pd(fscal,dy13);
2320 tz = _mm256_mul_pd(fscal,dz13);
2322 /* Update vectorial force */
2323 fix1 = _mm256_add_pd(fix1,tx);
2324 fiy1 = _mm256_add_pd(fiy1,ty);
2325 fiz1 = _mm256_add_pd(fiz1,tz);
2327 fjx3 = _mm256_add_pd(fjx3,tx);
2328 fjy3 = _mm256_add_pd(fjy3,ty);
2329 fjz3 = _mm256_add_pd(fjz3,tz);
2331 /**************************
2332 * CALCULATE INTERACTIONS *
2333 **************************/
2335 r21 = _mm256_mul_pd(rsq21,rinv21);
2336 r21 = _mm256_andnot_pd(dummy_mask,r21);
2338 /* EWALD ELECTROSTATICS */
2340 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2341 ewrt = _mm256_mul_pd(r21,ewtabscale);
2342 ewitab = _mm256_cvttpd_epi32(ewrt);
2343 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2344 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2345 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2347 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2348 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2352 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2354 /* Calculate temporary vectorial force */
2355 tx = _mm256_mul_pd(fscal,dx21);
2356 ty = _mm256_mul_pd(fscal,dy21);
2357 tz = _mm256_mul_pd(fscal,dz21);
2359 /* Update vectorial force */
2360 fix2 = _mm256_add_pd(fix2,tx);
2361 fiy2 = _mm256_add_pd(fiy2,ty);
2362 fiz2 = _mm256_add_pd(fiz2,tz);
2364 fjx1 = _mm256_add_pd(fjx1,tx);
2365 fjy1 = _mm256_add_pd(fjy1,ty);
2366 fjz1 = _mm256_add_pd(fjz1,tz);
2368 /**************************
2369 * CALCULATE INTERACTIONS *
2370 **************************/
2372 r22 = _mm256_mul_pd(rsq22,rinv22);
2373 r22 = _mm256_andnot_pd(dummy_mask,r22);
2375 /* EWALD ELECTROSTATICS */
2377 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2378 ewrt = _mm256_mul_pd(r22,ewtabscale);
2379 ewitab = _mm256_cvttpd_epi32(ewrt);
2380 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2381 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2382 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2384 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2385 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2389 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2391 /* Calculate temporary vectorial force */
2392 tx = _mm256_mul_pd(fscal,dx22);
2393 ty = _mm256_mul_pd(fscal,dy22);
2394 tz = _mm256_mul_pd(fscal,dz22);
2396 /* Update vectorial force */
2397 fix2 = _mm256_add_pd(fix2,tx);
2398 fiy2 = _mm256_add_pd(fiy2,ty);
2399 fiz2 = _mm256_add_pd(fiz2,tz);
2401 fjx2 = _mm256_add_pd(fjx2,tx);
2402 fjy2 = _mm256_add_pd(fjy2,ty);
2403 fjz2 = _mm256_add_pd(fjz2,tz);
2405 /**************************
2406 * CALCULATE INTERACTIONS *
2407 **************************/
2409 r23 = _mm256_mul_pd(rsq23,rinv23);
2410 r23 = _mm256_andnot_pd(dummy_mask,r23);
2412 /* EWALD ELECTROSTATICS */
2414 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2415 ewrt = _mm256_mul_pd(r23,ewtabscale);
2416 ewitab = _mm256_cvttpd_epi32(ewrt);
2417 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2418 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2419 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2421 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2422 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2426 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2428 /* Calculate temporary vectorial force */
2429 tx = _mm256_mul_pd(fscal,dx23);
2430 ty = _mm256_mul_pd(fscal,dy23);
2431 tz = _mm256_mul_pd(fscal,dz23);
2433 /* Update vectorial force */
2434 fix2 = _mm256_add_pd(fix2,tx);
2435 fiy2 = _mm256_add_pd(fiy2,ty);
2436 fiz2 = _mm256_add_pd(fiz2,tz);
2438 fjx3 = _mm256_add_pd(fjx3,tx);
2439 fjy3 = _mm256_add_pd(fjy3,ty);
2440 fjz3 = _mm256_add_pd(fjz3,tz);
2442 /**************************
2443 * CALCULATE INTERACTIONS *
2444 **************************/
2446 r31 = _mm256_mul_pd(rsq31,rinv31);
2447 r31 = _mm256_andnot_pd(dummy_mask,r31);
2449 /* EWALD ELECTROSTATICS */
2451 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2452 ewrt = _mm256_mul_pd(r31,ewtabscale);
2453 ewitab = _mm256_cvttpd_epi32(ewrt);
2454 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2455 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2456 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2458 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2459 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2463 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2465 /* Calculate temporary vectorial force */
2466 tx = _mm256_mul_pd(fscal,dx31);
2467 ty = _mm256_mul_pd(fscal,dy31);
2468 tz = _mm256_mul_pd(fscal,dz31);
2470 /* Update vectorial force */
2471 fix3 = _mm256_add_pd(fix3,tx);
2472 fiy3 = _mm256_add_pd(fiy3,ty);
2473 fiz3 = _mm256_add_pd(fiz3,tz);
2475 fjx1 = _mm256_add_pd(fjx1,tx);
2476 fjy1 = _mm256_add_pd(fjy1,ty);
2477 fjz1 = _mm256_add_pd(fjz1,tz);
2479 /**************************
2480 * CALCULATE INTERACTIONS *
2481 **************************/
2483 r32 = _mm256_mul_pd(rsq32,rinv32);
2484 r32 = _mm256_andnot_pd(dummy_mask,r32);
2486 /* EWALD ELECTROSTATICS */
2488 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2489 ewrt = _mm256_mul_pd(r32,ewtabscale);
2490 ewitab = _mm256_cvttpd_epi32(ewrt);
2491 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2492 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2493 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2495 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2496 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2500 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2502 /* Calculate temporary vectorial force */
2503 tx = _mm256_mul_pd(fscal,dx32);
2504 ty = _mm256_mul_pd(fscal,dy32);
2505 tz = _mm256_mul_pd(fscal,dz32);
2507 /* Update vectorial force */
2508 fix3 = _mm256_add_pd(fix3,tx);
2509 fiy3 = _mm256_add_pd(fiy3,ty);
2510 fiz3 = _mm256_add_pd(fiz3,tz);
2512 fjx2 = _mm256_add_pd(fjx2,tx);
2513 fjy2 = _mm256_add_pd(fjy2,ty);
2514 fjz2 = _mm256_add_pd(fjz2,tz);
2516 /**************************
2517 * CALCULATE INTERACTIONS *
2518 **************************/
2520 r33 = _mm256_mul_pd(rsq33,rinv33);
2521 r33 = _mm256_andnot_pd(dummy_mask,r33);
2523 /* EWALD ELECTROSTATICS */
2525 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2526 ewrt = _mm256_mul_pd(r33,ewtabscale);
2527 ewitab = _mm256_cvttpd_epi32(ewrt);
2528 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2529 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2530 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2532 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2533 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2537 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2539 /* Calculate temporary vectorial force */
2540 tx = _mm256_mul_pd(fscal,dx33);
2541 ty = _mm256_mul_pd(fscal,dy33);
2542 tz = _mm256_mul_pd(fscal,dz33);
2544 /* Update vectorial force */
2545 fix3 = _mm256_add_pd(fix3,tx);
2546 fiy3 = _mm256_add_pd(fiy3,ty);
2547 fiz3 = _mm256_add_pd(fiz3,tz);
2549 fjx3 = _mm256_add_pd(fjx3,tx);
2550 fjy3 = _mm256_add_pd(fjy3,ty);
2551 fjz3 = _mm256_add_pd(fjz3,tz);
2553 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2554 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2555 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2556 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2558 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2559 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2560 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2562 /* Inner loop uses 385 flops */
2565 /* End of innermost loop */
2567 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2568 f+i_coord_offset,fshift+i_shift_offset);
2570 /* Increment number of inner iterations */
2571 inneriter += j_index_end - j_index_start;
2573 /* Outer loop uses 24 flops */
2576 /* Increment number of outer iterations */
2579 /* Update outer/inner flops */
2581 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*385);