2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 real * vdwioffsetptr1;
86 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 real * vdwioffsetptr3;
90 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
112 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
116 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
118 __m128i ifour = _mm_set1_epi32(4);
119 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
121 __m256d dummy_mask,cutoff_mask;
122 __m128 tmpmask0,tmpmask1;
123 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
124 __m256d one = _mm256_set1_pd(1.0);
125 __m256d two = _mm256_set1_pd(2.0);
131 jindex = nlist->jindex;
133 shiftidx = nlist->shift;
135 shiftvec = fr->shift_vec[0];
136 fshift = fr->fshift[0];
137 facel = _mm256_set1_pd(fr->ic->epsfac);
138 charge = mdatoms->chargeA;
139 nvdwtype = fr->ntype;
141 vdwtype = mdatoms->typeA;
143 vftab = kernel_data->table_elec_vdw->data;
144 vftabscale = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
146 /* Setup water-specific parameters */
147 inr = nlist->iinr[0];
148 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
149 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
150 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
151 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
153 jq1 = _mm256_set1_pd(charge[inr+1]);
154 jq2 = _mm256_set1_pd(charge[inr+2]);
155 jq3 = _mm256_set1_pd(charge[inr+3]);
156 vdwjidx0A = 2*vdwtype[inr+0];
157 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
158 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
159 qq11 = _mm256_mul_pd(iq1,jq1);
160 qq12 = _mm256_mul_pd(iq1,jq2);
161 qq13 = _mm256_mul_pd(iq1,jq3);
162 qq21 = _mm256_mul_pd(iq2,jq1);
163 qq22 = _mm256_mul_pd(iq2,jq2);
164 qq23 = _mm256_mul_pd(iq2,jq3);
165 qq31 = _mm256_mul_pd(iq3,jq1);
166 qq32 = _mm256_mul_pd(iq3,jq2);
167 qq33 = _mm256_mul_pd(iq3,jq3);
169 /* Avoid stupid compiler warnings */
170 jnrA = jnrB = jnrC = jnrD = 0;
179 for(iidx=0;iidx<4*DIM;iidx++)
184 /* Start outer loop over neighborlists */
185 for(iidx=0; iidx<nri; iidx++)
187 /* Load shift vector for this list */
188 i_shift_offset = DIM*shiftidx[iidx];
190 /* Load limits for loop over neighbors */
191 j_index_start = jindex[iidx];
192 j_index_end = jindex[iidx+1];
194 /* Get outer coordinate index */
196 i_coord_offset = DIM*inr;
198 /* Load i particle coords and add shift vector */
199 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
200 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
202 fix0 = _mm256_setzero_pd();
203 fiy0 = _mm256_setzero_pd();
204 fiz0 = _mm256_setzero_pd();
205 fix1 = _mm256_setzero_pd();
206 fiy1 = _mm256_setzero_pd();
207 fiz1 = _mm256_setzero_pd();
208 fix2 = _mm256_setzero_pd();
209 fiy2 = _mm256_setzero_pd();
210 fiz2 = _mm256_setzero_pd();
211 fix3 = _mm256_setzero_pd();
212 fiy3 = _mm256_setzero_pd();
213 fiz3 = _mm256_setzero_pd();
215 /* Reset potential sums */
216 velecsum = _mm256_setzero_pd();
217 vvdwsum = _mm256_setzero_pd();
219 /* Start inner kernel loop */
220 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
223 /* Get j neighbor index, and coordinate index */
228 j_coord_offsetA = DIM*jnrA;
229 j_coord_offsetB = DIM*jnrB;
230 j_coord_offsetC = DIM*jnrC;
231 j_coord_offsetD = DIM*jnrD;
233 /* load j atom coordinates */
234 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
235 x+j_coord_offsetC,x+j_coord_offsetD,
236 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
237 &jy2,&jz2,&jx3,&jy3,&jz3);
239 /* Calculate displacement vector */
240 dx00 = _mm256_sub_pd(ix0,jx0);
241 dy00 = _mm256_sub_pd(iy0,jy0);
242 dz00 = _mm256_sub_pd(iz0,jz0);
243 dx11 = _mm256_sub_pd(ix1,jx1);
244 dy11 = _mm256_sub_pd(iy1,jy1);
245 dz11 = _mm256_sub_pd(iz1,jz1);
246 dx12 = _mm256_sub_pd(ix1,jx2);
247 dy12 = _mm256_sub_pd(iy1,jy2);
248 dz12 = _mm256_sub_pd(iz1,jz2);
249 dx13 = _mm256_sub_pd(ix1,jx3);
250 dy13 = _mm256_sub_pd(iy1,jy3);
251 dz13 = _mm256_sub_pd(iz1,jz3);
252 dx21 = _mm256_sub_pd(ix2,jx1);
253 dy21 = _mm256_sub_pd(iy2,jy1);
254 dz21 = _mm256_sub_pd(iz2,jz1);
255 dx22 = _mm256_sub_pd(ix2,jx2);
256 dy22 = _mm256_sub_pd(iy2,jy2);
257 dz22 = _mm256_sub_pd(iz2,jz2);
258 dx23 = _mm256_sub_pd(ix2,jx3);
259 dy23 = _mm256_sub_pd(iy2,jy3);
260 dz23 = _mm256_sub_pd(iz2,jz3);
261 dx31 = _mm256_sub_pd(ix3,jx1);
262 dy31 = _mm256_sub_pd(iy3,jy1);
263 dz31 = _mm256_sub_pd(iz3,jz1);
264 dx32 = _mm256_sub_pd(ix3,jx2);
265 dy32 = _mm256_sub_pd(iy3,jy2);
266 dz32 = _mm256_sub_pd(iz3,jz2);
267 dx33 = _mm256_sub_pd(ix3,jx3);
268 dy33 = _mm256_sub_pd(iy3,jy3);
269 dz33 = _mm256_sub_pd(iz3,jz3);
271 /* Calculate squared distance and things based on it */
272 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
273 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
274 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
275 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
276 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
277 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
278 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
279 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
280 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
281 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
283 rinv00 = avx256_invsqrt_d(rsq00);
284 rinv11 = avx256_invsqrt_d(rsq11);
285 rinv12 = avx256_invsqrt_d(rsq12);
286 rinv13 = avx256_invsqrt_d(rsq13);
287 rinv21 = avx256_invsqrt_d(rsq21);
288 rinv22 = avx256_invsqrt_d(rsq22);
289 rinv23 = avx256_invsqrt_d(rsq23);
290 rinv31 = avx256_invsqrt_d(rsq31);
291 rinv32 = avx256_invsqrt_d(rsq32);
292 rinv33 = avx256_invsqrt_d(rsq33);
294 fjx0 = _mm256_setzero_pd();
295 fjy0 = _mm256_setzero_pd();
296 fjz0 = _mm256_setzero_pd();
297 fjx1 = _mm256_setzero_pd();
298 fjy1 = _mm256_setzero_pd();
299 fjz1 = _mm256_setzero_pd();
300 fjx2 = _mm256_setzero_pd();
301 fjy2 = _mm256_setzero_pd();
302 fjz2 = _mm256_setzero_pd();
303 fjx3 = _mm256_setzero_pd();
304 fjy3 = _mm256_setzero_pd();
305 fjz3 = _mm256_setzero_pd();
307 /**************************
308 * CALCULATE INTERACTIONS *
309 **************************/
311 r00 = _mm256_mul_pd(rsq00,rinv00);
313 /* Calculate table index by multiplying r with table scale and truncate to integer */
314 rt = _mm256_mul_pd(r00,vftabscale);
315 vfitab = _mm256_cvttpd_epi32(rt);
316 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
317 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
319 /* CUBIC SPLINE TABLE DISPERSION */
320 vfitab = _mm_add_epi32(vfitab,ifour);
321 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
322 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
323 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
324 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
325 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
326 Heps = _mm256_mul_pd(vfeps,H);
327 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
328 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
329 vvdw6 = _mm256_mul_pd(c6_00,VV);
330 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
331 fvdw6 = _mm256_mul_pd(c6_00,FF);
333 /* CUBIC SPLINE TABLE REPULSION */
334 vfitab = _mm_add_epi32(vfitab,ifour);
335 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
336 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
337 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
338 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
339 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
340 Heps = _mm256_mul_pd(vfeps,H);
341 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
342 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
343 vvdw12 = _mm256_mul_pd(c12_00,VV);
344 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
345 fvdw12 = _mm256_mul_pd(c12_00,FF);
346 vvdw = _mm256_add_pd(vvdw12,vvdw6);
347 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
354 /* Calculate temporary vectorial force */
355 tx = _mm256_mul_pd(fscal,dx00);
356 ty = _mm256_mul_pd(fscal,dy00);
357 tz = _mm256_mul_pd(fscal,dz00);
359 /* Update vectorial force */
360 fix0 = _mm256_add_pd(fix0,tx);
361 fiy0 = _mm256_add_pd(fiy0,ty);
362 fiz0 = _mm256_add_pd(fiz0,tz);
364 fjx0 = _mm256_add_pd(fjx0,tx);
365 fjy0 = _mm256_add_pd(fjy0,ty);
366 fjz0 = _mm256_add_pd(fjz0,tz);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 r11 = _mm256_mul_pd(rsq11,rinv11);
374 /* Calculate table index by multiplying r with table scale and truncate to integer */
375 rt = _mm256_mul_pd(r11,vftabscale);
376 vfitab = _mm256_cvttpd_epi32(rt);
377 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
378 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
380 /* CUBIC SPLINE TABLE ELECTROSTATICS */
381 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
382 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
383 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
384 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
385 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
386 Heps = _mm256_mul_pd(vfeps,H);
387 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
388 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
389 velec = _mm256_mul_pd(qq11,VV);
390 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
391 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velecsum = _mm256_add_pd(velecsum,velec);
398 /* Calculate temporary vectorial force */
399 tx = _mm256_mul_pd(fscal,dx11);
400 ty = _mm256_mul_pd(fscal,dy11);
401 tz = _mm256_mul_pd(fscal,dz11);
403 /* Update vectorial force */
404 fix1 = _mm256_add_pd(fix1,tx);
405 fiy1 = _mm256_add_pd(fiy1,ty);
406 fiz1 = _mm256_add_pd(fiz1,tz);
408 fjx1 = _mm256_add_pd(fjx1,tx);
409 fjy1 = _mm256_add_pd(fjy1,ty);
410 fjz1 = _mm256_add_pd(fjz1,tz);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 r12 = _mm256_mul_pd(rsq12,rinv12);
418 /* Calculate table index by multiplying r with table scale and truncate to integer */
419 rt = _mm256_mul_pd(r12,vftabscale);
420 vfitab = _mm256_cvttpd_epi32(rt);
421 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
422 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
424 /* CUBIC SPLINE TABLE ELECTROSTATICS */
425 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
426 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
427 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
428 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
429 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
430 Heps = _mm256_mul_pd(vfeps,H);
431 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
432 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
433 velec = _mm256_mul_pd(qq12,VV);
434 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
435 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
437 /* Update potential sum for this i atom from the interaction with this j atom. */
438 velecsum = _mm256_add_pd(velecsum,velec);
442 /* Calculate temporary vectorial force */
443 tx = _mm256_mul_pd(fscal,dx12);
444 ty = _mm256_mul_pd(fscal,dy12);
445 tz = _mm256_mul_pd(fscal,dz12);
447 /* Update vectorial force */
448 fix1 = _mm256_add_pd(fix1,tx);
449 fiy1 = _mm256_add_pd(fiy1,ty);
450 fiz1 = _mm256_add_pd(fiz1,tz);
452 fjx2 = _mm256_add_pd(fjx2,tx);
453 fjy2 = _mm256_add_pd(fjy2,ty);
454 fjz2 = _mm256_add_pd(fjz2,tz);
456 /**************************
457 * CALCULATE INTERACTIONS *
458 **************************/
460 r13 = _mm256_mul_pd(rsq13,rinv13);
462 /* Calculate table index by multiplying r with table scale and truncate to integer */
463 rt = _mm256_mul_pd(r13,vftabscale);
464 vfitab = _mm256_cvttpd_epi32(rt);
465 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
466 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
468 /* CUBIC SPLINE TABLE ELECTROSTATICS */
469 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
470 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
471 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
472 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
473 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
474 Heps = _mm256_mul_pd(vfeps,H);
475 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
476 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
477 velec = _mm256_mul_pd(qq13,VV);
478 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
479 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velecsum = _mm256_add_pd(velecsum,velec);
486 /* Calculate temporary vectorial force */
487 tx = _mm256_mul_pd(fscal,dx13);
488 ty = _mm256_mul_pd(fscal,dy13);
489 tz = _mm256_mul_pd(fscal,dz13);
491 /* Update vectorial force */
492 fix1 = _mm256_add_pd(fix1,tx);
493 fiy1 = _mm256_add_pd(fiy1,ty);
494 fiz1 = _mm256_add_pd(fiz1,tz);
496 fjx3 = _mm256_add_pd(fjx3,tx);
497 fjy3 = _mm256_add_pd(fjy3,ty);
498 fjz3 = _mm256_add_pd(fjz3,tz);
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 r21 = _mm256_mul_pd(rsq21,rinv21);
506 /* Calculate table index by multiplying r with table scale and truncate to integer */
507 rt = _mm256_mul_pd(r21,vftabscale);
508 vfitab = _mm256_cvttpd_epi32(rt);
509 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
510 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
512 /* CUBIC SPLINE TABLE ELECTROSTATICS */
513 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
514 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
515 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
516 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
517 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
518 Heps = _mm256_mul_pd(vfeps,H);
519 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
520 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
521 velec = _mm256_mul_pd(qq21,VV);
522 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
523 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm256_add_pd(velecsum,velec);
530 /* Calculate temporary vectorial force */
531 tx = _mm256_mul_pd(fscal,dx21);
532 ty = _mm256_mul_pd(fscal,dy21);
533 tz = _mm256_mul_pd(fscal,dz21);
535 /* Update vectorial force */
536 fix2 = _mm256_add_pd(fix2,tx);
537 fiy2 = _mm256_add_pd(fiy2,ty);
538 fiz2 = _mm256_add_pd(fiz2,tz);
540 fjx1 = _mm256_add_pd(fjx1,tx);
541 fjy1 = _mm256_add_pd(fjy1,ty);
542 fjz1 = _mm256_add_pd(fjz1,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r22 = _mm256_mul_pd(rsq22,rinv22);
550 /* Calculate table index by multiplying r with table scale and truncate to integer */
551 rt = _mm256_mul_pd(r22,vftabscale);
552 vfitab = _mm256_cvttpd_epi32(rt);
553 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
554 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
556 /* CUBIC SPLINE TABLE ELECTROSTATICS */
557 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
558 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
559 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
560 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
561 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
562 Heps = _mm256_mul_pd(vfeps,H);
563 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
564 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
565 velec = _mm256_mul_pd(qq22,VV);
566 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
567 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velecsum = _mm256_add_pd(velecsum,velec);
574 /* Calculate temporary vectorial force */
575 tx = _mm256_mul_pd(fscal,dx22);
576 ty = _mm256_mul_pd(fscal,dy22);
577 tz = _mm256_mul_pd(fscal,dz22);
579 /* Update vectorial force */
580 fix2 = _mm256_add_pd(fix2,tx);
581 fiy2 = _mm256_add_pd(fiy2,ty);
582 fiz2 = _mm256_add_pd(fiz2,tz);
584 fjx2 = _mm256_add_pd(fjx2,tx);
585 fjy2 = _mm256_add_pd(fjy2,ty);
586 fjz2 = _mm256_add_pd(fjz2,tz);
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
592 r23 = _mm256_mul_pd(rsq23,rinv23);
594 /* Calculate table index by multiplying r with table scale and truncate to integer */
595 rt = _mm256_mul_pd(r23,vftabscale);
596 vfitab = _mm256_cvttpd_epi32(rt);
597 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
598 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
600 /* CUBIC SPLINE TABLE ELECTROSTATICS */
601 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
602 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
603 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
604 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
605 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
606 Heps = _mm256_mul_pd(vfeps,H);
607 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
608 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
609 velec = _mm256_mul_pd(qq23,VV);
610 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
611 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
613 /* Update potential sum for this i atom from the interaction with this j atom. */
614 velecsum = _mm256_add_pd(velecsum,velec);
618 /* Calculate temporary vectorial force */
619 tx = _mm256_mul_pd(fscal,dx23);
620 ty = _mm256_mul_pd(fscal,dy23);
621 tz = _mm256_mul_pd(fscal,dz23);
623 /* Update vectorial force */
624 fix2 = _mm256_add_pd(fix2,tx);
625 fiy2 = _mm256_add_pd(fiy2,ty);
626 fiz2 = _mm256_add_pd(fiz2,tz);
628 fjx3 = _mm256_add_pd(fjx3,tx);
629 fjy3 = _mm256_add_pd(fjy3,ty);
630 fjz3 = _mm256_add_pd(fjz3,tz);
632 /**************************
633 * CALCULATE INTERACTIONS *
634 **************************/
636 r31 = _mm256_mul_pd(rsq31,rinv31);
638 /* Calculate table index by multiplying r with table scale and truncate to integer */
639 rt = _mm256_mul_pd(r31,vftabscale);
640 vfitab = _mm256_cvttpd_epi32(rt);
641 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
642 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
644 /* CUBIC SPLINE TABLE ELECTROSTATICS */
645 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
646 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
647 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
648 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
649 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
650 Heps = _mm256_mul_pd(vfeps,H);
651 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
652 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
653 velec = _mm256_mul_pd(qq31,VV);
654 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
655 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velecsum = _mm256_add_pd(velecsum,velec);
662 /* Calculate temporary vectorial force */
663 tx = _mm256_mul_pd(fscal,dx31);
664 ty = _mm256_mul_pd(fscal,dy31);
665 tz = _mm256_mul_pd(fscal,dz31);
667 /* Update vectorial force */
668 fix3 = _mm256_add_pd(fix3,tx);
669 fiy3 = _mm256_add_pd(fiy3,ty);
670 fiz3 = _mm256_add_pd(fiz3,tz);
672 fjx1 = _mm256_add_pd(fjx1,tx);
673 fjy1 = _mm256_add_pd(fjy1,ty);
674 fjz1 = _mm256_add_pd(fjz1,tz);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 r32 = _mm256_mul_pd(rsq32,rinv32);
682 /* Calculate table index by multiplying r with table scale and truncate to integer */
683 rt = _mm256_mul_pd(r32,vftabscale);
684 vfitab = _mm256_cvttpd_epi32(rt);
685 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
686 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
688 /* CUBIC SPLINE TABLE ELECTROSTATICS */
689 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
690 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
691 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
692 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
693 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
694 Heps = _mm256_mul_pd(vfeps,H);
695 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
696 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
697 velec = _mm256_mul_pd(qq32,VV);
698 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
699 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
701 /* Update potential sum for this i atom from the interaction with this j atom. */
702 velecsum = _mm256_add_pd(velecsum,velec);
706 /* Calculate temporary vectorial force */
707 tx = _mm256_mul_pd(fscal,dx32);
708 ty = _mm256_mul_pd(fscal,dy32);
709 tz = _mm256_mul_pd(fscal,dz32);
711 /* Update vectorial force */
712 fix3 = _mm256_add_pd(fix3,tx);
713 fiy3 = _mm256_add_pd(fiy3,ty);
714 fiz3 = _mm256_add_pd(fiz3,tz);
716 fjx2 = _mm256_add_pd(fjx2,tx);
717 fjy2 = _mm256_add_pd(fjy2,ty);
718 fjz2 = _mm256_add_pd(fjz2,tz);
720 /**************************
721 * CALCULATE INTERACTIONS *
722 **************************/
724 r33 = _mm256_mul_pd(rsq33,rinv33);
726 /* Calculate table index by multiplying r with table scale and truncate to integer */
727 rt = _mm256_mul_pd(r33,vftabscale);
728 vfitab = _mm256_cvttpd_epi32(rt);
729 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
730 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
732 /* CUBIC SPLINE TABLE ELECTROSTATICS */
733 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
734 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
735 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
736 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
737 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
738 Heps = _mm256_mul_pd(vfeps,H);
739 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
740 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
741 velec = _mm256_mul_pd(qq33,VV);
742 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
743 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
745 /* Update potential sum for this i atom from the interaction with this j atom. */
746 velecsum = _mm256_add_pd(velecsum,velec);
750 /* Calculate temporary vectorial force */
751 tx = _mm256_mul_pd(fscal,dx33);
752 ty = _mm256_mul_pd(fscal,dy33);
753 tz = _mm256_mul_pd(fscal,dz33);
755 /* Update vectorial force */
756 fix3 = _mm256_add_pd(fix3,tx);
757 fiy3 = _mm256_add_pd(fiy3,ty);
758 fiz3 = _mm256_add_pd(fiz3,tz);
760 fjx3 = _mm256_add_pd(fjx3,tx);
761 fjy3 = _mm256_add_pd(fjy3,ty);
762 fjz3 = _mm256_add_pd(fjz3,tz);
764 fjptrA = f+j_coord_offsetA;
765 fjptrB = f+j_coord_offsetB;
766 fjptrC = f+j_coord_offsetC;
767 fjptrD = f+j_coord_offsetD;
769 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
770 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
771 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
773 /* Inner loop uses 446 flops */
779 /* Get j neighbor index, and coordinate index */
780 jnrlistA = jjnr[jidx];
781 jnrlistB = jjnr[jidx+1];
782 jnrlistC = jjnr[jidx+2];
783 jnrlistD = jjnr[jidx+3];
784 /* Sign of each element will be negative for non-real atoms.
785 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
786 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
788 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
790 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
791 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
792 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
794 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
795 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
796 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
797 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
798 j_coord_offsetA = DIM*jnrA;
799 j_coord_offsetB = DIM*jnrB;
800 j_coord_offsetC = DIM*jnrC;
801 j_coord_offsetD = DIM*jnrD;
803 /* load j atom coordinates */
804 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
805 x+j_coord_offsetC,x+j_coord_offsetD,
806 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
807 &jy2,&jz2,&jx3,&jy3,&jz3);
809 /* Calculate displacement vector */
810 dx00 = _mm256_sub_pd(ix0,jx0);
811 dy00 = _mm256_sub_pd(iy0,jy0);
812 dz00 = _mm256_sub_pd(iz0,jz0);
813 dx11 = _mm256_sub_pd(ix1,jx1);
814 dy11 = _mm256_sub_pd(iy1,jy1);
815 dz11 = _mm256_sub_pd(iz1,jz1);
816 dx12 = _mm256_sub_pd(ix1,jx2);
817 dy12 = _mm256_sub_pd(iy1,jy2);
818 dz12 = _mm256_sub_pd(iz1,jz2);
819 dx13 = _mm256_sub_pd(ix1,jx3);
820 dy13 = _mm256_sub_pd(iy1,jy3);
821 dz13 = _mm256_sub_pd(iz1,jz3);
822 dx21 = _mm256_sub_pd(ix2,jx1);
823 dy21 = _mm256_sub_pd(iy2,jy1);
824 dz21 = _mm256_sub_pd(iz2,jz1);
825 dx22 = _mm256_sub_pd(ix2,jx2);
826 dy22 = _mm256_sub_pd(iy2,jy2);
827 dz22 = _mm256_sub_pd(iz2,jz2);
828 dx23 = _mm256_sub_pd(ix2,jx3);
829 dy23 = _mm256_sub_pd(iy2,jy3);
830 dz23 = _mm256_sub_pd(iz2,jz3);
831 dx31 = _mm256_sub_pd(ix3,jx1);
832 dy31 = _mm256_sub_pd(iy3,jy1);
833 dz31 = _mm256_sub_pd(iz3,jz1);
834 dx32 = _mm256_sub_pd(ix3,jx2);
835 dy32 = _mm256_sub_pd(iy3,jy2);
836 dz32 = _mm256_sub_pd(iz3,jz2);
837 dx33 = _mm256_sub_pd(ix3,jx3);
838 dy33 = _mm256_sub_pd(iy3,jy3);
839 dz33 = _mm256_sub_pd(iz3,jz3);
841 /* Calculate squared distance and things based on it */
842 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
843 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
844 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
845 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
846 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
847 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
848 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
849 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
850 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
851 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
853 rinv00 = avx256_invsqrt_d(rsq00);
854 rinv11 = avx256_invsqrt_d(rsq11);
855 rinv12 = avx256_invsqrt_d(rsq12);
856 rinv13 = avx256_invsqrt_d(rsq13);
857 rinv21 = avx256_invsqrt_d(rsq21);
858 rinv22 = avx256_invsqrt_d(rsq22);
859 rinv23 = avx256_invsqrt_d(rsq23);
860 rinv31 = avx256_invsqrt_d(rsq31);
861 rinv32 = avx256_invsqrt_d(rsq32);
862 rinv33 = avx256_invsqrt_d(rsq33);
864 fjx0 = _mm256_setzero_pd();
865 fjy0 = _mm256_setzero_pd();
866 fjz0 = _mm256_setzero_pd();
867 fjx1 = _mm256_setzero_pd();
868 fjy1 = _mm256_setzero_pd();
869 fjz1 = _mm256_setzero_pd();
870 fjx2 = _mm256_setzero_pd();
871 fjy2 = _mm256_setzero_pd();
872 fjz2 = _mm256_setzero_pd();
873 fjx3 = _mm256_setzero_pd();
874 fjy3 = _mm256_setzero_pd();
875 fjz3 = _mm256_setzero_pd();
877 /**************************
878 * CALCULATE INTERACTIONS *
879 **************************/
881 r00 = _mm256_mul_pd(rsq00,rinv00);
882 r00 = _mm256_andnot_pd(dummy_mask,r00);
884 /* Calculate table index by multiplying r with table scale and truncate to integer */
885 rt = _mm256_mul_pd(r00,vftabscale);
886 vfitab = _mm256_cvttpd_epi32(rt);
887 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
888 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
890 /* CUBIC SPLINE TABLE DISPERSION */
891 vfitab = _mm_add_epi32(vfitab,ifour);
892 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
893 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
894 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
895 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
896 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
897 Heps = _mm256_mul_pd(vfeps,H);
898 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
899 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
900 vvdw6 = _mm256_mul_pd(c6_00,VV);
901 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
902 fvdw6 = _mm256_mul_pd(c6_00,FF);
904 /* CUBIC SPLINE TABLE REPULSION */
905 vfitab = _mm_add_epi32(vfitab,ifour);
906 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
907 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
908 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
909 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
910 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
911 Heps = _mm256_mul_pd(vfeps,H);
912 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
913 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
914 vvdw12 = _mm256_mul_pd(c12_00,VV);
915 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
916 fvdw12 = _mm256_mul_pd(c12_00,FF);
917 vvdw = _mm256_add_pd(vvdw12,vvdw6);
918 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
920 /* Update potential sum for this i atom from the interaction with this j atom. */
921 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
922 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
926 fscal = _mm256_andnot_pd(dummy_mask,fscal);
928 /* Calculate temporary vectorial force */
929 tx = _mm256_mul_pd(fscal,dx00);
930 ty = _mm256_mul_pd(fscal,dy00);
931 tz = _mm256_mul_pd(fscal,dz00);
933 /* Update vectorial force */
934 fix0 = _mm256_add_pd(fix0,tx);
935 fiy0 = _mm256_add_pd(fiy0,ty);
936 fiz0 = _mm256_add_pd(fiz0,tz);
938 fjx0 = _mm256_add_pd(fjx0,tx);
939 fjy0 = _mm256_add_pd(fjy0,ty);
940 fjz0 = _mm256_add_pd(fjz0,tz);
942 /**************************
943 * CALCULATE INTERACTIONS *
944 **************************/
946 r11 = _mm256_mul_pd(rsq11,rinv11);
947 r11 = _mm256_andnot_pd(dummy_mask,r11);
949 /* Calculate table index by multiplying r with table scale and truncate to integer */
950 rt = _mm256_mul_pd(r11,vftabscale);
951 vfitab = _mm256_cvttpd_epi32(rt);
952 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
953 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
955 /* CUBIC SPLINE TABLE ELECTROSTATICS */
956 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
957 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
958 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
959 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
960 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
961 Heps = _mm256_mul_pd(vfeps,H);
962 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
963 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
964 velec = _mm256_mul_pd(qq11,VV);
965 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
966 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
968 /* Update potential sum for this i atom from the interaction with this j atom. */
969 velec = _mm256_andnot_pd(dummy_mask,velec);
970 velecsum = _mm256_add_pd(velecsum,velec);
974 fscal = _mm256_andnot_pd(dummy_mask,fscal);
976 /* Calculate temporary vectorial force */
977 tx = _mm256_mul_pd(fscal,dx11);
978 ty = _mm256_mul_pd(fscal,dy11);
979 tz = _mm256_mul_pd(fscal,dz11);
981 /* Update vectorial force */
982 fix1 = _mm256_add_pd(fix1,tx);
983 fiy1 = _mm256_add_pd(fiy1,ty);
984 fiz1 = _mm256_add_pd(fiz1,tz);
986 fjx1 = _mm256_add_pd(fjx1,tx);
987 fjy1 = _mm256_add_pd(fjy1,ty);
988 fjz1 = _mm256_add_pd(fjz1,tz);
990 /**************************
991 * CALCULATE INTERACTIONS *
992 **************************/
994 r12 = _mm256_mul_pd(rsq12,rinv12);
995 r12 = _mm256_andnot_pd(dummy_mask,r12);
997 /* Calculate table index by multiplying r with table scale and truncate to integer */
998 rt = _mm256_mul_pd(r12,vftabscale);
999 vfitab = _mm256_cvttpd_epi32(rt);
1000 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1001 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1003 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1004 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1005 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1006 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1007 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1008 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1009 Heps = _mm256_mul_pd(vfeps,H);
1010 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1011 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1012 velec = _mm256_mul_pd(qq12,VV);
1013 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1014 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1016 /* Update potential sum for this i atom from the interaction with this j atom. */
1017 velec = _mm256_andnot_pd(dummy_mask,velec);
1018 velecsum = _mm256_add_pd(velecsum,velec);
1022 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1024 /* Calculate temporary vectorial force */
1025 tx = _mm256_mul_pd(fscal,dx12);
1026 ty = _mm256_mul_pd(fscal,dy12);
1027 tz = _mm256_mul_pd(fscal,dz12);
1029 /* Update vectorial force */
1030 fix1 = _mm256_add_pd(fix1,tx);
1031 fiy1 = _mm256_add_pd(fiy1,ty);
1032 fiz1 = _mm256_add_pd(fiz1,tz);
1034 fjx2 = _mm256_add_pd(fjx2,tx);
1035 fjy2 = _mm256_add_pd(fjy2,ty);
1036 fjz2 = _mm256_add_pd(fjz2,tz);
1038 /**************************
1039 * CALCULATE INTERACTIONS *
1040 **************************/
1042 r13 = _mm256_mul_pd(rsq13,rinv13);
1043 r13 = _mm256_andnot_pd(dummy_mask,r13);
1045 /* Calculate table index by multiplying r with table scale and truncate to integer */
1046 rt = _mm256_mul_pd(r13,vftabscale);
1047 vfitab = _mm256_cvttpd_epi32(rt);
1048 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1049 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1051 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1052 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1053 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1054 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1055 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1056 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1057 Heps = _mm256_mul_pd(vfeps,H);
1058 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1059 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1060 velec = _mm256_mul_pd(qq13,VV);
1061 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1062 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
1064 /* Update potential sum for this i atom from the interaction with this j atom. */
1065 velec = _mm256_andnot_pd(dummy_mask,velec);
1066 velecsum = _mm256_add_pd(velecsum,velec);
1070 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1072 /* Calculate temporary vectorial force */
1073 tx = _mm256_mul_pd(fscal,dx13);
1074 ty = _mm256_mul_pd(fscal,dy13);
1075 tz = _mm256_mul_pd(fscal,dz13);
1077 /* Update vectorial force */
1078 fix1 = _mm256_add_pd(fix1,tx);
1079 fiy1 = _mm256_add_pd(fiy1,ty);
1080 fiz1 = _mm256_add_pd(fiz1,tz);
1082 fjx3 = _mm256_add_pd(fjx3,tx);
1083 fjy3 = _mm256_add_pd(fjy3,ty);
1084 fjz3 = _mm256_add_pd(fjz3,tz);
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 r21 = _mm256_mul_pd(rsq21,rinv21);
1091 r21 = _mm256_andnot_pd(dummy_mask,r21);
1093 /* Calculate table index by multiplying r with table scale and truncate to integer */
1094 rt = _mm256_mul_pd(r21,vftabscale);
1095 vfitab = _mm256_cvttpd_epi32(rt);
1096 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1097 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1099 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1100 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1101 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1102 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1103 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1104 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1105 Heps = _mm256_mul_pd(vfeps,H);
1106 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1107 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1108 velec = _mm256_mul_pd(qq21,VV);
1109 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1110 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1112 /* Update potential sum for this i atom from the interaction with this j atom. */
1113 velec = _mm256_andnot_pd(dummy_mask,velec);
1114 velecsum = _mm256_add_pd(velecsum,velec);
1118 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1120 /* Calculate temporary vectorial force */
1121 tx = _mm256_mul_pd(fscal,dx21);
1122 ty = _mm256_mul_pd(fscal,dy21);
1123 tz = _mm256_mul_pd(fscal,dz21);
1125 /* Update vectorial force */
1126 fix2 = _mm256_add_pd(fix2,tx);
1127 fiy2 = _mm256_add_pd(fiy2,ty);
1128 fiz2 = _mm256_add_pd(fiz2,tz);
1130 fjx1 = _mm256_add_pd(fjx1,tx);
1131 fjy1 = _mm256_add_pd(fjy1,ty);
1132 fjz1 = _mm256_add_pd(fjz1,tz);
1134 /**************************
1135 * CALCULATE INTERACTIONS *
1136 **************************/
1138 r22 = _mm256_mul_pd(rsq22,rinv22);
1139 r22 = _mm256_andnot_pd(dummy_mask,r22);
1141 /* Calculate table index by multiplying r with table scale and truncate to integer */
1142 rt = _mm256_mul_pd(r22,vftabscale);
1143 vfitab = _mm256_cvttpd_epi32(rt);
1144 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1145 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1147 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1149 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1150 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1151 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1152 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1153 Heps = _mm256_mul_pd(vfeps,H);
1154 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1155 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1156 velec = _mm256_mul_pd(qq22,VV);
1157 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1158 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1160 /* Update potential sum for this i atom from the interaction with this j atom. */
1161 velec = _mm256_andnot_pd(dummy_mask,velec);
1162 velecsum = _mm256_add_pd(velecsum,velec);
1166 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1168 /* Calculate temporary vectorial force */
1169 tx = _mm256_mul_pd(fscal,dx22);
1170 ty = _mm256_mul_pd(fscal,dy22);
1171 tz = _mm256_mul_pd(fscal,dz22);
1173 /* Update vectorial force */
1174 fix2 = _mm256_add_pd(fix2,tx);
1175 fiy2 = _mm256_add_pd(fiy2,ty);
1176 fiz2 = _mm256_add_pd(fiz2,tz);
1178 fjx2 = _mm256_add_pd(fjx2,tx);
1179 fjy2 = _mm256_add_pd(fjy2,ty);
1180 fjz2 = _mm256_add_pd(fjz2,tz);
1182 /**************************
1183 * CALCULATE INTERACTIONS *
1184 **************************/
1186 r23 = _mm256_mul_pd(rsq23,rinv23);
1187 r23 = _mm256_andnot_pd(dummy_mask,r23);
1189 /* Calculate table index by multiplying r with table scale and truncate to integer */
1190 rt = _mm256_mul_pd(r23,vftabscale);
1191 vfitab = _mm256_cvttpd_epi32(rt);
1192 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1193 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1195 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1196 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1197 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1198 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1199 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1200 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1201 Heps = _mm256_mul_pd(vfeps,H);
1202 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1203 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1204 velec = _mm256_mul_pd(qq23,VV);
1205 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1206 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1208 /* Update potential sum for this i atom from the interaction with this j atom. */
1209 velec = _mm256_andnot_pd(dummy_mask,velec);
1210 velecsum = _mm256_add_pd(velecsum,velec);
1214 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1216 /* Calculate temporary vectorial force */
1217 tx = _mm256_mul_pd(fscal,dx23);
1218 ty = _mm256_mul_pd(fscal,dy23);
1219 tz = _mm256_mul_pd(fscal,dz23);
1221 /* Update vectorial force */
1222 fix2 = _mm256_add_pd(fix2,tx);
1223 fiy2 = _mm256_add_pd(fiy2,ty);
1224 fiz2 = _mm256_add_pd(fiz2,tz);
1226 fjx3 = _mm256_add_pd(fjx3,tx);
1227 fjy3 = _mm256_add_pd(fjy3,ty);
1228 fjz3 = _mm256_add_pd(fjz3,tz);
1230 /**************************
1231 * CALCULATE INTERACTIONS *
1232 **************************/
1234 r31 = _mm256_mul_pd(rsq31,rinv31);
1235 r31 = _mm256_andnot_pd(dummy_mask,r31);
1237 /* Calculate table index by multiplying r with table scale and truncate to integer */
1238 rt = _mm256_mul_pd(r31,vftabscale);
1239 vfitab = _mm256_cvttpd_epi32(rt);
1240 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1241 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1243 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1244 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1245 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1246 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1247 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1248 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1249 Heps = _mm256_mul_pd(vfeps,H);
1250 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1251 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1252 velec = _mm256_mul_pd(qq31,VV);
1253 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1254 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1256 /* Update potential sum for this i atom from the interaction with this j atom. */
1257 velec = _mm256_andnot_pd(dummy_mask,velec);
1258 velecsum = _mm256_add_pd(velecsum,velec);
1262 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1264 /* Calculate temporary vectorial force */
1265 tx = _mm256_mul_pd(fscal,dx31);
1266 ty = _mm256_mul_pd(fscal,dy31);
1267 tz = _mm256_mul_pd(fscal,dz31);
1269 /* Update vectorial force */
1270 fix3 = _mm256_add_pd(fix3,tx);
1271 fiy3 = _mm256_add_pd(fiy3,ty);
1272 fiz3 = _mm256_add_pd(fiz3,tz);
1274 fjx1 = _mm256_add_pd(fjx1,tx);
1275 fjy1 = _mm256_add_pd(fjy1,ty);
1276 fjz1 = _mm256_add_pd(fjz1,tz);
1278 /**************************
1279 * CALCULATE INTERACTIONS *
1280 **************************/
1282 r32 = _mm256_mul_pd(rsq32,rinv32);
1283 r32 = _mm256_andnot_pd(dummy_mask,r32);
1285 /* Calculate table index by multiplying r with table scale and truncate to integer */
1286 rt = _mm256_mul_pd(r32,vftabscale);
1287 vfitab = _mm256_cvttpd_epi32(rt);
1288 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1289 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1291 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1292 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1293 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1294 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1295 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1296 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1297 Heps = _mm256_mul_pd(vfeps,H);
1298 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1299 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1300 velec = _mm256_mul_pd(qq32,VV);
1301 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1302 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
1304 /* Update potential sum for this i atom from the interaction with this j atom. */
1305 velec = _mm256_andnot_pd(dummy_mask,velec);
1306 velecsum = _mm256_add_pd(velecsum,velec);
1310 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1312 /* Calculate temporary vectorial force */
1313 tx = _mm256_mul_pd(fscal,dx32);
1314 ty = _mm256_mul_pd(fscal,dy32);
1315 tz = _mm256_mul_pd(fscal,dz32);
1317 /* Update vectorial force */
1318 fix3 = _mm256_add_pd(fix3,tx);
1319 fiy3 = _mm256_add_pd(fiy3,ty);
1320 fiz3 = _mm256_add_pd(fiz3,tz);
1322 fjx2 = _mm256_add_pd(fjx2,tx);
1323 fjy2 = _mm256_add_pd(fjy2,ty);
1324 fjz2 = _mm256_add_pd(fjz2,tz);
1326 /**************************
1327 * CALCULATE INTERACTIONS *
1328 **************************/
1330 r33 = _mm256_mul_pd(rsq33,rinv33);
1331 r33 = _mm256_andnot_pd(dummy_mask,r33);
1333 /* Calculate table index by multiplying r with table scale and truncate to integer */
1334 rt = _mm256_mul_pd(r33,vftabscale);
1335 vfitab = _mm256_cvttpd_epi32(rt);
1336 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1337 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1339 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1340 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1341 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1342 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1343 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1344 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1345 Heps = _mm256_mul_pd(vfeps,H);
1346 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1347 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1348 velec = _mm256_mul_pd(qq33,VV);
1349 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1350 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
1352 /* Update potential sum for this i atom from the interaction with this j atom. */
1353 velec = _mm256_andnot_pd(dummy_mask,velec);
1354 velecsum = _mm256_add_pd(velecsum,velec);
1358 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1360 /* Calculate temporary vectorial force */
1361 tx = _mm256_mul_pd(fscal,dx33);
1362 ty = _mm256_mul_pd(fscal,dy33);
1363 tz = _mm256_mul_pd(fscal,dz33);
1365 /* Update vectorial force */
1366 fix3 = _mm256_add_pd(fix3,tx);
1367 fiy3 = _mm256_add_pd(fiy3,ty);
1368 fiz3 = _mm256_add_pd(fiz3,tz);
1370 fjx3 = _mm256_add_pd(fjx3,tx);
1371 fjy3 = _mm256_add_pd(fjy3,ty);
1372 fjz3 = _mm256_add_pd(fjz3,tz);
1374 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1375 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1376 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1377 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1379 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1380 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1381 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1383 /* Inner loop uses 456 flops */
1386 /* End of innermost loop */
1388 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1389 f+i_coord_offset,fshift+i_shift_offset);
1392 /* Update potential energies */
1393 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1394 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1396 /* Increment number of inner iterations */
1397 inneriter += j_index_end - j_index_start;
1399 /* Outer loop uses 26 flops */
1402 /* Increment number of outer iterations */
1405 /* Update outer/inner flops */
1407 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
1410 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
1411 * Electrostatics interaction: CubicSplineTable
1412 * VdW interaction: CubicSplineTable
1413 * Geometry: Water4-Water4
1414 * Calculate force/pot: Force
1417 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
1418 (t_nblist * gmx_restrict nlist,
1419 rvec * gmx_restrict xx,
1420 rvec * gmx_restrict ff,
1421 struct t_forcerec * gmx_restrict fr,
1422 t_mdatoms * gmx_restrict mdatoms,
1423 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1424 t_nrnb * gmx_restrict nrnb)
1426 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1427 * just 0 for non-waters.
1428 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1429 * jnr indices corresponding to data put in the four positions in the SIMD register.
1431 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1432 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1433 int jnrA,jnrB,jnrC,jnrD;
1434 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1435 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1436 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1437 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1438 real rcutoff_scalar;
1439 real *shiftvec,*fshift,*x,*f;
1440 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1441 real scratch[4*DIM];
1442 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1443 real * vdwioffsetptr0;
1444 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1445 real * vdwioffsetptr1;
1446 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1447 real * vdwioffsetptr2;
1448 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1449 real * vdwioffsetptr3;
1450 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1451 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1452 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1453 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1454 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1455 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1456 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1457 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1458 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1459 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1460 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1461 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1462 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1463 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1464 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1465 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1466 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1467 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1468 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1469 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1472 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1475 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1476 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1478 __m128i ifour = _mm_set1_epi32(4);
1479 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1481 __m256d dummy_mask,cutoff_mask;
1482 __m128 tmpmask0,tmpmask1;
1483 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1484 __m256d one = _mm256_set1_pd(1.0);
1485 __m256d two = _mm256_set1_pd(2.0);
1491 jindex = nlist->jindex;
1493 shiftidx = nlist->shift;
1495 shiftvec = fr->shift_vec[0];
1496 fshift = fr->fshift[0];
1497 facel = _mm256_set1_pd(fr->ic->epsfac);
1498 charge = mdatoms->chargeA;
1499 nvdwtype = fr->ntype;
1500 vdwparam = fr->nbfp;
1501 vdwtype = mdatoms->typeA;
1503 vftab = kernel_data->table_elec_vdw->data;
1504 vftabscale = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
1506 /* Setup water-specific parameters */
1507 inr = nlist->iinr[0];
1508 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1509 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1510 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1511 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1513 jq1 = _mm256_set1_pd(charge[inr+1]);
1514 jq2 = _mm256_set1_pd(charge[inr+2]);
1515 jq3 = _mm256_set1_pd(charge[inr+3]);
1516 vdwjidx0A = 2*vdwtype[inr+0];
1517 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1518 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1519 qq11 = _mm256_mul_pd(iq1,jq1);
1520 qq12 = _mm256_mul_pd(iq1,jq2);
1521 qq13 = _mm256_mul_pd(iq1,jq3);
1522 qq21 = _mm256_mul_pd(iq2,jq1);
1523 qq22 = _mm256_mul_pd(iq2,jq2);
1524 qq23 = _mm256_mul_pd(iq2,jq3);
1525 qq31 = _mm256_mul_pd(iq3,jq1);
1526 qq32 = _mm256_mul_pd(iq3,jq2);
1527 qq33 = _mm256_mul_pd(iq3,jq3);
1529 /* Avoid stupid compiler warnings */
1530 jnrA = jnrB = jnrC = jnrD = 0;
1531 j_coord_offsetA = 0;
1532 j_coord_offsetB = 0;
1533 j_coord_offsetC = 0;
1534 j_coord_offsetD = 0;
1539 for(iidx=0;iidx<4*DIM;iidx++)
1541 scratch[iidx] = 0.0;
1544 /* Start outer loop over neighborlists */
1545 for(iidx=0; iidx<nri; iidx++)
1547 /* Load shift vector for this list */
1548 i_shift_offset = DIM*shiftidx[iidx];
1550 /* Load limits for loop over neighbors */
1551 j_index_start = jindex[iidx];
1552 j_index_end = jindex[iidx+1];
1554 /* Get outer coordinate index */
1556 i_coord_offset = DIM*inr;
1558 /* Load i particle coords and add shift vector */
1559 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1560 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1562 fix0 = _mm256_setzero_pd();
1563 fiy0 = _mm256_setzero_pd();
1564 fiz0 = _mm256_setzero_pd();
1565 fix1 = _mm256_setzero_pd();
1566 fiy1 = _mm256_setzero_pd();
1567 fiz1 = _mm256_setzero_pd();
1568 fix2 = _mm256_setzero_pd();
1569 fiy2 = _mm256_setzero_pd();
1570 fiz2 = _mm256_setzero_pd();
1571 fix3 = _mm256_setzero_pd();
1572 fiy3 = _mm256_setzero_pd();
1573 fiz3 = _mm256_setzero_pd();
1575 /* Start inner kernel loop */
1576 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1579 /* Get j neighbor index, and coordinate index */
1581 jnrB = jjnr[jidx+1];
1582 jnrC = jjnr[jidx+2];
1583 jnrD = jjnr[jidx+3];
1584 j_coord_offsetA = DIM*jnrA;
1585 j_coord_offsetB = DIM*jnrB;
1586 j_coord_offsetC = DIM*jnrC;
1587 j_coord_offsetD = DIM*jnrD;
1589 /* load j atom coordinates */
1590 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1591 x+j_coord_offsetC,x+j_coord_offsetD,
1592 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1593 &jy2,&jz2,&jx3,&jy3,&jz3);
1595 /* Calculate displacement vector */
1596 dx00 = _mm256_sub_pd(ix0,jx0);
1597 dy00 = _mm256_sub_pd(iy0,jy0);
1598 dz00 = _mm256_sub_pd(iz0,jz0);
1599 dx11 = _mm256_sub_pd(ix1,jx1);
1600 dy11 = _mm256_sub_pd(iy1,jy1);
1601 dz11 = _mm256_sub_pd(iz1,jz1);
1602 dx12 = _mm256_sub_pd(ix1,jx2);
1603 dy12 = _mm256_sub_pd(iy1,jy2);
1604 dz12 = _mm256_sub_pd(iz1,jz2);
1605 dx13 = _mm256_sub_pd(ix1,jx3);
1606 dy13 = _mm256_sub_pd(iy1,jy3);
1607 dz13 = _mm256_sub_pd(iz1,jz3);
1608 dx21 = _mm256_sub_pd(ix2,jx1);
1609 dy21 = _mm256_sub_pd(iy2,jy1);
1610 dz21 = _mm256_sub_pd(iz2,jz1);
1611 dx22 = _mm256_sub_pd(ix2,jx2);
1612 dy22 = _mm256_sub_pd(iy2,jy2);
1613 dz22 = _mm256_sub_pd(iz2,jz2);
1614 dx23 = _mm256_sub_pd(ix2,jx3);
1615 dy23 = _mm256_sub_pd(iy2,jy3);
1616 dz23 = _mm256_sub_pd(iz2,jz3);
1617 dx31 = _mm256_sub_pd(ix3,jx1);
1618 dy31 = _mm256_sub_pd(iy3,jy1);
1619 dz31 = _mm256_sub_pd(iz3,jz1);
1620 dx32 = _mm256_sub_pd(ix3,jx2);
1621 dy32 = _mm256_sub_pd(iy3,jy2);
1622 dz32 = _mm256_sub_pd(iz3,jz2);
1623 dx33 = _mm256_sub_pd(ix3,jx3);
1624 dy33 = _mm256_sub_pd(iy3,jy3);
1625 dz33 = _mm256_sub_pd(iz3,jz3);
1627 /* Calculate squared distance and things based on it */
1628 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1629 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1630 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1631 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1632 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1633 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1634 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1635 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1636 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1637 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1639 rinv00 = avx256_invsqrt_d(rsq00);
1640 rinv11 = avx256_invsqrt_d(rsq11);
1641 rinv12 = avx256_invsqrt_d(rsq12);
1642 rinv13 = avx256_invsqrt_d(rsq13);
1643 rinv21 = avx256_invsqrt_d(rsq21);
1644 rinv22 = avx256_invsqrt_d(rsq22);
1645 rinv23 = avx256_invsqrt_d(rsq23);
1646 rinv31 = avx256_invsqrt_d(rsq31);
1647 rinv32 = avx256_invsqrt_d(rsq32);
1648 rinv33 = avx256_invsqrt_d(rsq33);
1650 fjx0 = _mm256_setzero_pd();
1651 fjy0 = _mm256_setzero_pd();
1652 fjz0 = _mm256_setzero_pd();
1653 fjx1 = _mm256_setzero_pd();
1654 fjy1 = _mm256_setzero_pd();
1655 fjz1 = _mm256_setzero_pd();
1656 fjx2 = _mm256_setzero_pd();
1657 fjy2 = _mm256_setzero_pd();
1658 fjz2 = _mm256_setzero_pd();
1659 fjx3 = _mm256_setzero_pd();
1660 fjy3 = _mm256_setzero_pd();
1661 fjz3 = _mm256_setzero_pd();
1663 /**************************
1664 * CALCULATE INTERACTIONS *
1665 **************************/
1667 r00 = _mm256_mul_pd(rsq00,rinv00);
1669 /* Calculate table index by multiplying r with table scale and truncate to integer */
1670 rt = _mm256_mul_pd(r00,vftabscale);
1671 vfitab = _mm256_cvttpd_epi32(rt);
1672 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1673 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1675 /* CUBIC SPLINE TABLE DISPERSION */
1676 vfitab = _mm_add_epi32(vfitab,ifour);
1677 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1678 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1679 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1680 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1681 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1682 Heps = _mm256_mul_pd(vfeps,H);
1683 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1684 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1685 fvdw6 = _mm256_mul_pd(c6_00,FF);
1687 /* CUBIC SPLINE TABLE REPULSION */
1688 vfitab = _mm_add_epi32(vfitab,ifour);
1689 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1690 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1691 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1692 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1693 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1694 Heps = _mm256_mul_pd(vfeps,H);
1695 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1696 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1697 fvdw12 = _mm256_mul_pd(c12_00,FF);
1698 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1702 /* Calculate temporary vectorial force */
1703 tx = _mm256_mul_pd(fscal,dx00);
1704 ty = _mm256_mul_pd(fscal,dy00);
1705 tz = _mm256_mul_pd(fscal,dz00);
1707 /* Update vectorial force */
1708 fix0 = _mm256_add_pd(fix0,tx);
1709 fiy0 = _mm256_add_pd(fiy0,ty);
1710 fiz0 = _mm256_add_pd(fiz0,tz);
1712 fjx0 = _mm256_add_pd(fjx0,tx);
1713 fjy0 = _mm256_add_pd(fjy0,ty);
1714 fjz0 = _mm256_add_pd(fjz0,tz);
1716 /**************************
1717 * CALCULATE INTERACTIONS *
1718 **************************/
1720 r11 = _mm256_mul_pd(rsq11,rinv11);
1722 /* Calculate table index by multiplying r with table scale and truncate to integer */
1723 rt = _mm256_mul_pd(r11,vftabscale);
1724 vfitab = _mm256_cvttpd_epi32(rt);
1725 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1726 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1728 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1729 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1730 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1731 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1732 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1733 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1734 Heps = _mm256_mul_pd(vfeps,H);
1735 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1736 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1737 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1741 /* Calculate temporary vectorial force */
1742 tx = _mm256_mul_pd(fscal,dx11);
1743 ty = _mm256_mul_pd(fscal,dy11);
1744 tz = _mm256_mul_pd(fscal,dz11);
1746 /* Update vectorial force */
1747 fix1 = _mm256_add_pd(fix1,tx);
1748 fiy1 = _mm256_add_pd(fiy1,ty);
1749 fiz1 = _mm256_add_pd(fiz1,tz);
1751 fjx1 = _mm256_add_pd(fjx1,tx);
1752 fjy1 = _mm256_add_pd(fjy1,ty);
1753 fjz1 = _mm256_add_pd(fjz1,tz);
1755 /**************************
1756 * CALCULATE INTERACTIONS *
1757 **************************/
1759 r12 = _mm256_mul_pd(rsq12,rinv12);
1761 /* Calculate table index by multiplying r with table scale and truncate to integer */
1762 rt = _mm256_mul_pd(r12,vftabscale);
1763 vfitab = _mm256_cvttpd_epi32(rt);
1764 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1765 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1767 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1768 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1769 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1770 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1771 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1772 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1773 Heps = _mm256_mul_pd(vfeps,H);
1774 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1775 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1776 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1780 /* Calculate temporary vectorial force */
1781 tx = _mm256_mul_pd(fscal,dx12);
1782 ty = _mm256_mul_pd(fscal,dy12);
1783 tz = _mm256_mul_pd(fscal,dz12);
1785 /* Update vectorial force */
1786 fix1 = _mm256_add_pd(fix1,tx);
1787 fiy1 = _mm256_add_pd(fiy1,ty);
1788 fiz1 = _mm256_add_pd(fiz1,tz);
1790 fjx2 = _mm256_add_pd(fjx2,tx);
1791 fjy2 = _mm256_add_pd(fjy2,ty);
1792 fjz2 = _mm256_add_pd(fjz2,tz);
1794 /**************************
1795 * CALCULATE INTERACTIONS *
1796 **************************/
1798 r13 = _mm256_mul_pd(rsq13,rinv13);
1800 /* Calculate table index by multiplying r with table scale and truncate to integer */
1801 rt = _mm256_mul_pd(r13,vftabscale);
1802 vfitab = _mm256_cvttpd_epi32(rt);
1803 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1804 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1806 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1807 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1808 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1809 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1810 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1811 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1812 Heps = _mm256_mul_pd(vfeps,H);
1813 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1814 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1815 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
1819 /* Calculate temporary vectorial force */
1820 tx = _mm256_mul_pd(fscal,dx13);
1821 ty = _mm256_mul_pd(fscal,dy13);
1822 tz = _mm256_mul_pd(fscal,dz13);
1824 /* Update vectorial force */
1825 fix1 = _mm256_add_pd(fix1,tx);
1826 fiy1 = _mm256_add_pd(fiy1,ty);
1827 fiz1 = _mm256_add_pd(fiz1,tz);
1829 fjx3 = _mm256_add_pd(fjx3,tx);
1830 fjy3 = _mm256_add_pd(fjy3,ty);
1831 fjz3 = _mm256_add_pd(fjz3,tz);
1833 /**************************
1834 * CALCULATE INTERACTIONS *
1835 **************************/
1837 r21 = _mm256_mul_pd(rsq21,rinv21);
1839 /* Calculate table index by multiplying r with table scale and truncate to integer */
1840 rt = _mm256_mul_pd(r21,vftabscale);
1841 vfitab = _mm256_cvttpd_epi32(rt);
1842 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1843 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1845 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1846 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1847 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1848 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1849 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1850 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1851 Heps = _mm256_mul_pd(vfeps,H);
1852 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1853 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1854 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1858 /* Calculate temporary vectorial force */
1859 tx = _mm256_mul_pd(fscal,dx21);
1860 ty = _mm256_mul_pd(fscal,dy21);
1861 tz = _mm256_mul_pd(fscal,dz21);
1863 /* Update vectorial force */
1864 fix2 = _mm256_add_pd(fix2,tx);
1865 fiy2 = _mm256_add_pd(fiy2,ty);
1866 fiz2 = _mm256_add_pd(fiz2,tz);
1868 fjx1 = _mm256_add_pd(fjx1,tx);
1869 fjy1 = _mm256_add_pd(fjy1,ty);
1870 fjz1 = _mm256_add_pd(fjz1,tz);
1872 /**************************
1873 * CALCULATE INTERACTIONS *
1874 **************************/
1876 r22 = _mm256_mul_pd(rsq22,rinv22);
1878 /* Calculate table index by multiplying r with table scale and truncate to integer */
1879 rt = _mm256_mul_pd(r22,vftabscale);
1880 vfitab = _mm256_cvttpd_epi32(rt);
1881 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1882 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1884 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1885 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1886 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1887 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1888 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1889 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1890 Heps = _mm256_mul_pd(vfeps,H);
1891 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1892 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1893 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1897 /* Calculate temporary vectorial force */
1898 tx = _mm256_mul_pd(fscal,dx22);
1899 ty = _mm256_mul_pd(fscal,dy22);
1900 tz = _mm256_mul_pd(fscal,dz22);
1902 /* Update vectorial force */
1903 fix2 = _mm256_add_pd(fix2,tx);
1904 fiy2 = _mm256_add_pd(fiy2,ty);
1905 fiz2 = _mm256_add_pd(fiz2,tz);
1907 fjx2 = _mm256_add_pd(fjx2,tx);
1908 fjy2 = _mm256_add_pd(fjy2,ty);
1909 fjz2 = _mm256_add_pd(fjz2,tz);
1911 /**************************
1912 * CALCULATE INTERACTIONS *
1913 **************************/
1915 r23 = _mm256_mul_pd(rsq23,rinv23);
1917 /* Calculate table index by multiplying r with table scale and truncate to integer */
1918 rt = _mm256_mul_pd(r23,vftabscale);
1919 vfitab = _mm256_cvttpd_epi32(rt);
1920 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1921 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1923 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1924 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1925 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1926 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1927 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1928 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1929 Heps = _mm256_mul_pd(vfeps,H);
1930 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1931 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1932 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1936 /* Calculate temporary vectorial force */
1937 tx = _mm256_mul_pd(fscal,dx23);
1938 ty = _mm256_mul_pd(fscal,dy23);
1939 tz = _mm256_mul_pd(fscal,dz23);
1941 /* Update vectorial force */
1942 fix2 = _mm256_add_pd(fix2,tx);
1943 fiy2 = _mm256_add_pd(fiy2,ty);
1944 fiz2 = _mm256_add_pd(fiz2,tz);
1946 fjx3 = _mm256_add_pd(fjx3,tx);
1947 fjy3 = _mm256_add_pd(fjy3,ty);
1948 fjz3 = _mm256_add_pd(fjz3,tz);
1950 /**************************
1951 * CALCULATE INTERACTIONS *
1952 **************************/
1954 r31 = _mm256_mul_pd(rsq31,rinv31);
1956 /* Calculate table index by multiplying r with table scale and truncate to integer */
1957 rt = _mm256_mul_pd(r31,vftabscale);
1958 vfitab = _mm256_cvttpd_epi32(rt);
1959 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1960 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1962 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1963 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1964 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1965 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1966 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1967 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1968 Heps = _mm256_mul_pd(vfeps,H);
1969 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1970 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1971 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1975 /* Calculate temporary vectorial force */
1976 tx = _mm256_mul_pd(fscal,dx31);
1977 ty = _mm256_mul_pd(fscal,dy31);
1978 tz = _mm256_mul_pd(fscal,dz31);
1980 /* Update vectorial force */
1981 fix3 = _mm256_add_pd(fix3,tx);
1982 fiy3 = _mm256_add_pd(fiy3,ty);
1983 fiz3 = _mm256_add_pd(fiz3,tz);
1985 fjx1 = _mm256_add_pd(fjx1,tx);
1986 fjy1 = _mm256_add_pd(fjy1,ty);
1987 fjz1 = _mm256_add_pd(fjz1,tz);
1989 /**************************
1990 * CALCULATE INTERACTIONS *
1991 **************************/
1993 r32 = _mm256_mul_pd(rsq32,rinv32);
1995 /* Calculate table index by multiplying r with table scale and truncate to integer */
1996 rt = _mm256_mul_pd(r32,vftabscale);
1997 vfitab = _mm256_cvttpd_epi32(rt);
1998 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1999 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2001 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2002 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2003 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2004 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2005 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2006 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2007 Heps = _mm256_mul_pd(vfeps,H);
2008 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2009 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2010 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
2014 /* Calculate temporary vectorial force */
2015 tx = _mm256_mul_pd(fscal,dx32);
2016 ty = _mm256_mul_pd(fscal,dy32);
2017 tz = _mm256_mul_pd(fscal,dz32);
2019 /* Update vectorial force */
2020 fix3 = _mm256_add_pd(fix3,tx);
2021 fiy3 = _mm256_add_pd(fiy3,ty);
2022 fiz3 = _mm256_add_pd(fiz3,tz);
2024 fjx2 = _mm256_add_pd(fjx2,tx);
2025 fjy2 = _mm256_add_pd(fjy2,ty);
2026 fjz2 = _mm256_add_pd(fjz2,tz);
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2032 r33 = _mm256_mul_pd(rsq33,rinv33);
2034 /* Calculate table index by multiplying r with table scale and truncate to integer */
2035 rt = _mm256_mul_pd(r33,vftabscale);
2036 vfitab = _mm256_cvttpd_epi32(rt);
2037 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2038 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2040 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2041 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2042 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2043 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2044 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2045 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2046 Heps = _mm256_mul_pd(vfeps,H);
2047 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2048 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2049 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
2053 /* Calculate temporary vectorial force */
2054 tx = _mm256_mul_pd(fscal,dx33);
2055 ty = _mm256_mul_pd(fscal,dy33);
2056 tz = _mm256_mul_pd(fscal,dz33);
2058 /* Update vectorial force */
2059 fix3 = _mm256_add_pd(fix3,tx);
2060 fiy3 = _mm256_add_pd(fiy3,ty);
2061 fiz3 = _mm256_add_pd(fiz3,tz);
2063 fjx3 = _mm256_add_pd(fjx3,tx);
2064 fjy3 = _mm256_add_pd(fjy3,ty);
2065 fjz3 = _mm256_add_pd(fjz3,tz);
2067 fjptrA = f+j_coord_offsetA;
2068 fjptrB = f+j_coord_offsetB;
2069 fjptrC = f+j_coord_offsetC;
2070 fjptrD = f+j_coord_offsetD;
2072 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2073 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2074 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2076 /* Inner loop uses 402 flops */
2079 if(jidx<j_index_end)
2082 /* Get j neighbor index, and coordinate index */
2083 jnrlistA = jjnr[jidx];
2084 jnrlistB = jjnr[jidx+1];
2085 jnrlistC = jjnr[jidx+2];
2086 jnrlistD = jjnr[jidx+3];
2087 /* Sign of each element will be negative for non-real atoms.
2088 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2089 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2091 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2093 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2094 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2095 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2097 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2098 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2099 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2100 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2101 j_coord_offsetA = DIM*jnrA;
2102 j_coord_offsetB = DIM*jnrB;
2103 j_coord_offsetC = DIM*jnrC;
2104 j_coord_offsetD = DIM*jnrD;
2106 /* load j atom coordinates */
2107 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2108 x+j_coord_offsetC,x+j_coord_offsetD,
2109 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2110 &jy2,&jz2,&jx3,&jy3,&jz3);
2112 /* Calculate displacement vector */
2113 dx00 = _mm256_sub_pd(ix0,jx0);
2114 dy00 = _mm256_sub_pd(iy0,jy0);
2115 dz00 = _mm256_sub_pd(iz0,jz0);
2116 dx11 = _mm256_sub_pd(ix1,jx1);
2117 dy11 = _mm256_sub_pd(iy1,jy1);
2118 dz11 = _mm256_sub_pd(iz1,jz1);
2119 dx12 = _mm256_sub_pd(ix1,jx2);
2120 dy12 = _mm256_sub_pd(iy1,jy2);
2121 dz12 = _mm256_sub_pd(iz1,jz2);
2122 dx13 = _mm256_sub_pd(ix1,jx3);
2123 dy13 = _mm256_sub_pd(iy1,jy3);
2124 dz13 = _mm256_sub_pd(iz1,jz3);
2125 dx21 = _mm256_sub_pd(ix2,jx1);
2126 dy21 = _mm256_sub_pd(iy2,jy1);
2127 dz21 = _mm256_sub_pd(iz2,jz1);
2128 dx22 = _mm256_sub_pd(ix2,jx2);
2129 dy22 = _mm256_sub_pd(iy2,jy2);
2130 dz22 = _mm256_sub_pd(iz2,jz2);
2131 dx23 = _mm256_sub_pd(ix2,jx3);
2132 dy23 = _mm256_sub_pd(iy2,jy3);
2133 dz23 = _mm256_sub_pd(iz2,jz3);
2134 dx31 = _mm256_sub_pd(ix3,jx1);
2135 dy31 = _mm256_sub_pd(iy3,jy1);
2136 dz31 = _mm256_sub_pd(iz3,jz1);
2137 dx32 = _mm256_sub_pd(ix3,jx2);
2138 dy32 = _mm256_sub_pd(iy3,jy2);
2139 dz32 = _mm256_sub_pd(iz3,jz2);
2140 dx33 = _mm256_sub_pd(ix3,jx3);
2141 dy33 = _mm256_sub_pd(iy3,jy3);
2142 dz33 = _mm256_sub_pd(iz3,jz3);
2144 /* Calculate squared distance and things based on it */
2145 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2146 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2147 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2148 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2149 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2150 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2151 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2152 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2153 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2154 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2156 rinv00 = avx256_invsqrt_d(rsq00);
2157 rinv11 = avx256_invsqrt_d(rsq11);
2158 rinv12 = avx256_invsqrt_d(rsq12);
2159 rinv13 = avx256_invsqrt_d(rsq13);
2160 rinv21 = avx256_invsqrt_d(rsq21);
2161 rinv22 = avx256_invsqrt_d(rsq22);
2162 rinv23 = avx256_invsqrt_d(rsq23);
2163 rinv31 = avx256_invsqrt_d(rsq31);
2164 rinv32 = avx256_invsqrt_d(rsq32);
2165 rinv33 = avx256_invsqrt_d(rsq33);
2167 fjx0 = _mm256_setzero_pd();
2168 fjy0 = _mm256_setzero_pd();
2169 fjz0 = _mm256_setzero_pd();
2170 fjx1 = _mm256_setzero_pd();
2171 fjy1 = _mm256_setzero_pd();
2172 fjz1 = _mm256_setzero_pd();
2173 fjx2 = _mm256_setzero_pd();
2174 fjy2 = _mm256_setzero_pd();
2175 fjz2 = _mm256_setzero_pd();
2176 fjx3 = _mm256_setzero_pd();
2177 fjy3 = _mm256_setzero_pd();
2178 fjz3 = _mm256_setzero_pd();
2180 /**************************
2181 * CALCULATE INTERACTIONS *
2182 **************************/
2184 r00 = _mm256_mul_pd(rsq00,rinv00);
2185 r00 = _mm256_andnot_pd(dummy_mask,r00);
2187 /* Calculate table index by multiplying r with table scale and truncate to integer */
2188 rt = _mm256_mul_pd(r00,vftabscale);
2189 vfitab = _mm256_cvttpd_epi32(rt);
2190 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2191 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2193 /* CUBIC SPLINE TABLE DISPERSION */
2194 vfitab = _mm_add_epi32(vfitab,ifour);
2195 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2196 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2197 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2198 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2199 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2200 Heps = _mm256_mul_pd(vfeps,H);
2201 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2202 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2203 fvdw6 = _mm256_mul_pd(c6_00,FF);
2205 /* CUBIC SPLINE TABLE REPULSION */
2206 vfitab = _mm_add_epi32(vfitab,ifour);
2207 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2208 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2209 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2210 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2211 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2212 Heps = _mm256_mul_pd(vfeps,H);
2213 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2214 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2215 fvdw12 = _mm256_mul_pd(c12_00,FF);
2216 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2220 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2222 /* Calculate temporary vectorial force */
2223 tx = _mm256_mul_pd(fscal,dx00);
2224 ty = _mm256_mul_pd(fscal,dy00);
2225 tz = _mm256_mul_pd(fscal,dz00);
2227 /* Update vectorial force */
2228 fix0 = _mm256_add_pd(fix0,tx);
2229 fiy0 = _mm256_add_pd(fiy0,ty);
2230 fiz0 = _mm256_add_pd(fiz0,tz);
2232 fjx0 = _mm256_add_pd(fjx0,tx);
2233 fjy0 = _mm256_add_pd(fjy0,ty);
2234 fjz0 = _mm256_add_pd(fjz0,tz);
2236 /**************************
2237 * CALCULATE INTERACTIONS *
2238 **************************/
2240 r11 = _mm256_mul_pd(rsq11,rinv11);
2241 r11 = _mm256_andnot_pd(dummy_mask,r11);
2243 /* Calculate table index by multiplying r with table scale and truncate to integer */
2244 rt = _mm256_mul_pd(r11,vftabscale);
2245 vfitab = _mm256_cvttpd_epi32(rt);
2246 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2247 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2249 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2250 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2251 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2252 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2253 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2254 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2255 Heps = _mm256_mul_pd(vfeps,H);
2256 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2257 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2258 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2262 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2264 /* Calculate temporary vectorial force */
2265 tx = _mm256_mul_pd(fscal,dx11);
2266 ty = _mm256_mul_pd(fscal,dy11);
2267 tz = _mm256_mul_pd(fscal,dz11);
2269 /* Update vectorial force */
2270 fix1 = _mm256_add_pd(fix1,tx);
2271 fiy1 = _mm256_add_pd(fiy1,ty);
2272 fiz1 = _mm256_add_pd(fiz1,tz);
2274 fjx1 = _mm256_add_pd(fjx1,tx);
2275 fjy1 = _mm256_add_pd(fjy1,ty);
2276 fjz1 = _mm256_add_pd(fjz1,tz);
2278 /**************************
2279 * CALCULATE INTERACTIONS *
2280 **************************/
2282 r12 = _mm256_mul_pd(rsq12,rinv12);
2283 r12 = _mm256_andnot_pd(dummy_mask,r12);
2285 /* Calculate table index by multiplying r with table scale and truncate to integer */
2286 rt = _mm256_mul_pd(r12,vftabscale);
2287 vfitab = _mm256_cvttpd_epi32(rt);
2288 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2289 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2291 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2292 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2293 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2294 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2295 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2296 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2297 Heps = _mm256_mul_pd(vfeps,H);
2298 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2299 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2300 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2304 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2306 /* Calculate temporary vectorial force */
2307 tx = _mm256_mul_pd(fscal,dx12);
2308 ty = _mm256_mul_pd(fscal,dy12);
2309 tz = _mm256_mul_pd(fscal,dz12);
2311 /* Update vectorial force */
2312 fix1 = _mm256_add_pd(fix1,tx);
2313 fiy1 = _mm256_add_pd(fiy1,ty);
2314 fiz1 = _mm256_add_pd(fiz1,tz);
2316 fjx2 = _mm256_add_pd(fjx2,tx);
2317 fjy2 = _mm256_add_pd(fjy2,ty);
2318 fjz2 = _mm256_add_pd(fjz2,tz);
2320 /**************************
2321 * CALCULATE INTERACTIONS *
2322 **************************/
2324 r13 = _mm256_mul_pd(rsq13,rinv13);
2325 r13 = _mm256_andnot_pd(dummy_mask,r13);
2327 /* Calculate table index by multiplying r with table scale and truncate to integer */
2328 rt = _mm256_mul_pd(r13,vftabscale);
2329 vfitab = _mm256_cvttpd_epi32(rt);
2330 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2331 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2333 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2334 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2335 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2336 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2337 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2338 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2339 Heps = _mm256_mul_pd(vfeps,H);
2340 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2341 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2342 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
2346 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2348 /* Calculate temporary vectorial force */
2349 tx = _mm256_mul_pd(fscal,dx13);
2350 ty = _mm256_mul_pd(fscal,dy13);
2351 tz = _mm256_mul_pd(fscal,dz13);
2353 /* Update vectorial force */
2354 fix1 = _mm256_add_pd(fix1,tx);
2355 fiy1 = _mm256_add_pd(fiy1,ty);
2356 fiz1 = _mm256_add_pd(fiz1,tz);
2358 fjx3 = _mm256_add_pd(fjx3,tx);
2359 fjy3 = _mm256_add_pd(fjy3,ty);
2360 fjz3 = _mm256_add_pd(fjz3,tz);
2362 /**************************
2363 * CALCULATE INTERACTIONS *
2364 **************************/
2366 r21 = _mm256_mul_pd(rsq21,rinv21);
2367 r21 = _mm256_andnot_pd(dummy_mask,r21);
2369 /* Calculate table index by multiplying r with table scale and truncate to integer */
2370 rt = _mm256_mul_pd(r21,vftabscale);
2371 vfitab = _mm256_cvttpd_epi32(rt);
2372 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2373 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2375 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2376 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2377 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2378 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2379 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2380 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2381 Heps = _mm256_mul_pd(vfeps,H);
2382 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2383 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2384 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2388 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2390 /* Calculate temporary vectorial force */
2391 tx = _mm256_mul_pd(fscal,dx21);
2392 ty = _mm256_mul_pd(fscal,dy21);
2393 tz = _mm256_mul_pd(fscal,dz21);
2395 /* Update vectorial force */
2396 fix2 = _mm256_add_pd(fix2,tx);
2397 fiy2 = _mm256_add_pd(fiy2,ty);
2398 fiz2 = _mm256_add_pd(fiz2,tz);
2400 fjx1 = _mm256_add_pd(fjx1,tx);
2401 fjy1 = _mm256_add_pd(fjy1,ty);
2402 fjz1 = _mm256_add_pd(fjz1,tz);
2404 /**************************
2405 * CALCULATE INTERACTIONS *
2406 **************************/
2408 r22 = _mm256_mul_pd(rsq22,rinv22);
2409 r22 = _mm256_andnot_pd(dummy_mask,r22);
2411 /* Calculate table index by multiplying r with table scale and truncate to integer */
2412 rt = _mm256_mul_pd(r22,vftabscale);
2413 vfitab = _mm256_cvttpd_epi32(rt);
2414 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2415 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2417 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2418 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2419 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2420 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2421 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2422 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2423 Heps = _mm256_mul_pd(vfeps,H);
2424 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2425 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2426 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2430 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2432 /* Calculate temporary vectorial force */
2433 tx = _mm256_mul_pd(fscal,dx22);
2434 ty = _mm256_mul_pd(fscal,dy22);
2435 tz = _mm256_mul_pd(fscal,dz22);
2437 /* Update vectorial force */
2438 fix2 = _mm256_add_pd(fix2,tx);
2439 fiy2 = _mm256_add_pd(fiy2,ty);
2440 fiz2 = _mm256_add_pd(fiz2,tz);
2442 fjx2 = _mm256_add_pd(fjx2,tx);
2443 fjy2 = _mm256_add_pd(fjy2,ty);
2444 fjz2 = _mm256_add_pd(fjz2,tz);
2446 /**************************
2447 * CALCULATE INTERACTIONS *
2448 **************************/
2450 r23 = _mm256_mul_pd(rsq23,rinv23);
2451 r23 = _mm256_andnot_pd(dummy_mask,r23);
2453 /* Calculate table index by multiplying r with table scale and truncate to integer */
2454 rt = _mm256_mul_pd(r23,vftabscale);
2455 vfitab = _mm256_cvttpd_epi32(rt);
2456 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2457 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2459 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2460 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2461 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2462 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2463 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2464 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2465 Heps = _mm256_mul_pd(vfeps,H);
2466 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2467 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2468 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
2472 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2474 /* Calculate temporary vectorial force */
2475 tx = _mm256_mul_pd(fscal,dx23);
2476 ty = _mm256_mul_pd(fscal,dy23);
2477 tz = _mm256_mul_pd(fscal,dz23);
2479 /* Update vectorial force */
2480 fix2 = _mm256_add_pd(fix2,tx);
2481 fiy2 = _mm256_add_pd(fiy2,ty);
2482 fiz2 = _mm256_add_pd(fiz2,tz);
2484 fjx3 = _mm256_add_pd(fjx3,tx);
2485 fjy3 = _mm256_add_pd(fjy3,ty);
2486 fjz3 = _mm256_add_pd(fjz3,tz);
2488 /**************************
/*
 * NOTE(review): interior fragment of the generated AVX-256 double-precision
 * kernel nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double (see file
 * header).  This file is machine-generated; only comments are added here --
 * any code change would be overwritten by the kernel generator.
 *
 * Each "CALCULATE INTERACTIONS" section below handles one i-site/j-site pair
 * (3-1, 3-2, 3-3) of the 4-site water/water geometry, computing tabulated
 * (cubic-spline) electrostatics only -- no LJ table lookup appears for these
 * pairs in this excerpt.
 */
2489 * CALCULATE INTERACTIONS *
2490 **************************/
/* r = rsq * (1/r); dummy_mask zeroes r in SIMD lanes that hold padding
 * (cf. the jnrlist>=0 scratch redirection at the bottom of the loop). */
2492 r31 = _mm256_mul_pd(rsq31,rinv31);
2493 r31 = _mm256_andnot_pd(dummy_mask,r31);
2495 /* Calculate table index by multiplying r with table scale and truncate to integer */
2496 rt = _mm256_mul_pd(r31,vftabscale);
2497 vfitab = _mm256_cvttpd_epi32(rt);
/* vfeps = fractional part of rt, used as the spline evaluation offset. */
2498 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
/* Scale index by 12: (i + (i<<1)) << 2 == 12*i, the per-point stride of the
 * combined interaction table (4 spline coefficients per tabulated function). */
2499 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2501 /* CUBIC SPLINE TABLE ELECTROSTATICS */
/* Gather 4 consecutive doubles (Y,F,G,H spline coefficients) per SIMD lane,
 * then transpose so Y/F/G/H each hold one coefficient across all 4 lanes. */
2502 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2503 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2504 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2505 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2506 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
/* Horner-style evaluation of the spline derivative:
 * FF = F + eps*(2*G + 3*H*eps) is d(table)/d(bin) at eps. */
2507 Heps = _mm256_mul_pd(vfeps,H);
2508 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2509 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
/* felec = -qq*FF*tabscale/r; the xor with signbit negates the product. */
2510 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
/* NOTE(review): the embedded numbering jumps 2510 -> 2514 here, so the
 * excerpt is missing the intervening lines (presumably "fscal = felec;") --
 * confirm against the full generated file. */
2514 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2516 /* Calculate temporary vectorial force */
2517 tx = _mm256_mul_pd(fscal,dx31);
2518 ty = _mm256_mul_pd(fscal,dy31);
2519 tz = _mm256_mul_pd(fscal,dz31);
/* Newton's third law: accumulate +t on i-site 3; the j-site accumulators are
 * subtracted from memory later by gmx_mm256_decrement_4rvec_4ptr_swizzle_pd. */
2521 /* Update vectorial force */
2522 fix3 = _mm256_add_pd(fix3,tx);
2523 fiy3 = _mm256_add_pd(fiy3,ty);
2524 fiz3 = _mm256_add_pd(fiz3,tz);
2526 fjx1 = _mm256_add_pd(fjx1,tx);
2527 fjy1 = _mm256_add_pd(fjy1,ty);
2528 fjz1 = _mm256_add_pd(fjz1,tz);
2530 /**************************
2531 * CALCULATE INTERACTIONS *
2532 **************************/
/* Pair 3-2: identical structure to pair 3-1 above, with r32/qq32 and
 * j-site-2 force accumulators. */
2534 r32 = _mm256_mul_pd(rsq32,rinv32);
2535 r32 = _mm256_andnot_pd(dummy_mask,r32);
2537 /* Calculate table index by multiplying r with table scale and truncate to integer */
2538 rt = _mm256_mul_pd(r32,vftabscale);
2539 vfitab = _mm256_cvttpd_epi32(rt);
2540 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2541 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2543 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2544 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2545 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2546 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2547 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2548 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2549 Heps = _mm256_mul_pd(vfeps,H);
2550 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2551 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2552 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
/* NOTE(review): numbering gap 2552 -> 2556 -- same missing lines as above. */
2556 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2558 /* Calculate temporary vectorial force */
2559 tx = _mm256_mul_pd(fscal,dx32);
2560 ty = _mm256_mul_pd(fscal,dy32);
2561 tz = _mm256_mul_pd(fscal,dz32);
2563 /* Update vectorial force */
2564 fix3 = _mm256_add_pd(fix3,tx);
2565 fiy3 = _mm256_add_pd(fiy3,ty);
2566 fiz3 = _mm256_add_pd(fiz3,tz);
2568 fjx2 = _mm256_add_pd(fjx2,tx);
2569 fjy2 = _mm256_add_pd(fjy2,ty);
2570 fjz2 = _mm256_add_pd(fjz2,tz);
2572 /**************************
2573 * CALCULATE INTERACTIONS *
2574 **************************/
/* Pair 3-3: identical structure, with r33/qq33 and j-site-3 accumulators. */
2576 r33 = _mm256_mul_pd(rsq33,rinv33);
2577 r33 = _mm256_andnot_pd(dummy_mask,r33);
2579 /* Calculate table index by multiplying r with table scale and truncate to integer */
2580 rt = _mm256_mul_pd(r33,vftabscale);
2581 vfitab = _mm256_cvttpd_epi32(rt);
2582 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2583 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2585 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2586 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2587 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2588 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2589 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2590 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2591 Heps = _mm256_mul_pd(vfeps,H);
2592 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2593 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2594 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
/* NOTE(review): numbering gap 2594 -> 2598 -- same missing lines as above. */
2598 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2600 /* Calculate temporary vectorial force */
2601 tx = _mm256_mul_pd(fscal,dx33);
2602 ty = _mm256_mul_pd(fscal,dy33);
2603 tz = _mm256_mul_pd(fscal,dz33);
2605 /* Update vectorial force */
2606 fix3 = _mm256_add_pd(fix3,tx);
2607 fiy3 = _mm256_add_pd(fiy3,ty);
2608 fiz3 = _mm256_add_pd(fiz3,tz);
2610 fjx3 = _mm256_add_pd(fjx3,tx);
2611 fjy3 = _mm256_add_pd(fjy3,ty);
2612 fjz3 = _mm256_add_pd(fjz3,tz);
/* Padded (inactive) lanes write their forces to a scratch buffer instead of
 * the real force array, so no conditional store is needed per lane. */
2614 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2615 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2616 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2617 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
/* Subtract the accumulated j forces (all 4 sites, all 4 lanes) from memory. */
2619 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2620 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2621 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2623 /* Inner loop uses 412 flops */
2626 /* End of innermost loop */
/* Reduce the per-lane i forces, add them to f[] and to the shift-force array. */
2628 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2629 f+i_coord_offset,fshift+i_shift_offset);
2631 /* Increment number of inner iterations */
2632 inneriter += j_index_end - j_index_start;
2634 /* Outer loop uses 24 flops */
2637 /* Increment number of outer iterations */
/* Report the flop count to the GROMACS accounting (nrnb) machinery. */
2640 /* Update outer/inner flops */
2642 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);