2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 real * vdwioffsetptr1;
86 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
107 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
111 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
113 __m128i ifour = _mm_set1_epi32(4);
114 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
116 __m256d dummy_mask,cutoff_mask;
117 __m128 tmpmask0,tmpmask1;
118 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
119 __m256d one = _mm256_set1_pd(1.0);
120 __m256d two = _mm256_set1_pd(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm256_set1_pd(fr->ic->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
136 vdwtype = mdatoms->typeA;
138 vftab = kernel_data->table_elec->data;
139 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
144 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
145 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
146 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
148 jq0 = _mm256_set1_pd(charge[inr+0]);
149 jq1 = _mm256_set1_pd(charge[inr+1]);
150 jq2 = _mm256_set1_pd(charge[inr+2]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 qq00 = _mm256_mul_pd(iq0,jq0);
153 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
154 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
155 qq01 = _mm256_mul_pd(iq0,jq1);
156 qq02 = _mm256_mul_pd(iq0,jq2);
157 qq10 = _mm256_mul_pd(iq1,jq0);
158 qq11 = _mm256_mul_pd(iq1,jq1);
159 qq12 = _mm256_mul_pd(iq1,jq2);
160 qq20 = _mm256_mul_pd(iq2,jq0);
161 qq21 = _mm256_mul_pd(iq2,jq1);
162 qq22 = _mm256_mul_pd(iq2,jq2);
164 /* Avoid stupid compiler warnings */
165 jnrA = jnrB = jnrC = jnrD = 0;
174 for(iidx=0;iidx<4*DIM;iidx++)
179 /* Start outer loop over neighborlists */
180 for(iidx=0; iidx<nri; iidx++)
182 /* Load shift vector for this list */
183 i_shift_offset = DIM*shiftidx[iidx];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
195 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
197 fix0 = _mm256_setzero_pd();
198 fiy0 = _mm256_setzero_pd();
199 fiz0 = _mm256_setzero_pd();
200 fix1 = _mm256_setzero_pd();
201 fiy1 = _mm256_setzero_pd();
202 fiz1 = _mm256_setzero_pd();
203 fix2 = _mm256_setzero_pd();
204 fiy2 = _mm256_setzero_pd();
205 fiz2 = _mm256_setzero_pd();
207 /* Reset potential sums */
208 velecsum = _mm256_setzero_pd();
209 vvdwsum = _mm256_setzero_pd();
211 /* Start inner kernel loop */
212 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215 /* Get j neighbor index, and coordinate index */
220 j_coord_offsetA = DIM*jnrA;
221 j_coord_offsetB = DIM*jnrB;
222 j_coord_offsetC = DIM*jnrC;
223 j_coord_offsetD = DIM*jnrD;
225 /* load j atom coordinates */
226 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
227 x+j_coord_offsetC,x+j_coord_offsetD,
228 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
230 /* Calculate displacement vector */
231 dx00 = _mm256_sub_pd(ix0,jx0);
232 dy00 = _mm256_sub_pd(iy0,jy0);
233 dz00 = _mm256_sub_pd(iz0,jz0);
234 dx01 = _mm256_sub_pd(ix0,jx1);
235 dy01 = _mm256_sub_pd(iy0,jy1);
236 dz01 = _mm256_sub_pd(iz0,jz1);
237 dx02 = _mm256_sub_pd(ix0,jx2);
238 dy02 = _mm256_sub_pd(iy0,jy2);
239 dz02 = _mm256_sub_pd(iz0,jz2);
240 dx10 = _mm256_sub_pd(ix1,jx0);
241 dy10 = _mm256_sub_pd(iy1,jy0);
242 dz10 = _mm256_sub_pd(iz1,jz0);
243 dx11 = _mm256_sub_pd(ix1,jx1);
244 dy11 = _mm256_sub_pd(iy1,jy1);
245 dz11 = _mm256_sub_pd(iz1,jz1);
246 dx12 = _mm256_sub_pd(ix1,jx2);
247 dy12 = _mm256_sub_pd(iy1,jy2);
248 dz12 = _mm256_sub_pd(iz1,jz2);
249 dx20 = _mm256_sub_pd(ix2,jx0);
250 dy20 = _mm256_sub_pd(iy2,jy0);
251 dz20 = _mm256_sub_pd(iz2,jz0);
252 dx21 = _mm256_sub_pd(ix2,jx1);
253 dy21 = _mm256_sub_pd(iy2,jy1);
254 dz21 = _mm256_sub_pd(iz2,jz1);
255 dx22 = _mm256_sub_pd(ix2,jx2);
256 dy22 = _mm256_sub_pd(iy2,jy2);
257 dz22 = _mm256_sub_pd(iz2,jz2);
259 /* Calculate squared distance and things based on it */
260 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
261 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
262 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
263 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
264 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
265 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
266 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
267 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
268 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
270 rinv00 = avx256_invsqrt_d(rsq00);
271 rinv01 = avx256_invsqrt_d(rsq01);
272 rinv02 = avx256_invsqrt_d(rsq02);
273 rinv10 = avx256_invsqrt_d(rsq10);
274 rinv11 = avx256_invsqrt_d(rsq11);
275 rinv12 = avx256_invsqrt_d(rsq12);
276 rinv20 = avx256_invsqrt_d(rsq20);
277 rinv21 = avx256_invsqrt_d(rsq21);
278 rinv22 = avx256_invsqrt_d(rsq22);
280 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
282 fjx0 = _mm256_setzero_pd();
283 fjy0 = _mm256_setzero_pd();
284 fjz0 = _mm256_setzero_pd();
285 fjx1 = _mm256_setzero_pd();
286 fjy1 = _mm256_setzero_pd();
287 fjz1 = _mm256_setzero_pd();
288 fjx2 = _mm256_setzero_pd();
289 fjy2 = _mm256_setzero_pd();
290 fjz2 = _mm256_setzero_pd();
292 /**************************
293 * CALCULATE INTERACTIONS *
294 **************************/
296 r00 = _mm256_mul_pd(rsq00,rinv00);
298 /* Calculate table index by multiplying r with table scale and truncate to integer */
299 rt = _mm256_mul_pd(r00,vftabscale);
300 vfitab = _mm256_cvttpd_epi32(rt);
301 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
302 vfitab = _mm_slli_epi32(vfitab,2);
304 /* CUBIC SPLINE TABLE ELECTROSTATICS */
305 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
306 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
307 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
308 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
309 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
310 Heps = _mm256_mul_pd(vfeps,H);
311 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
312 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
313 velec = _mm256_mul_pd(qq00,VV);
314 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
315 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
317 /* LENNARD-JONES DISPERSION/REPULSION */
319 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
320 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
321 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
322 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
323 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
325 /* Update potential sum for this i atom from the interaction with this j atom. */
326 velecsum = _mm256_add_pd(velecsum,velec);
327 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
329 fscal = _mm256_add_pd(felec,fvdw);
331 /* Calculate temporary vectorial force */
332 tx = _mm256_mul_pd(fscal,dx00);
333 ty = _mm256_mul_pd(fscal,dy00);
334 tz = _mm256_mul_pd(fscal,dz00);
336 /* Update vectorial force */
337 fix0 = _mm256_add_pd(fix0,tx);
338 fiy0 = _mm256_add_pd(fiy0,ty);
339 fiz0 = _mm256_add_pd(fiz0,tz);
341 fjx0 = _mm256_add_pd(fjx0,tx);
342 fjy0 = _mm256_add_pd(fjy0,ty);
343 fjz0 = _mm256_add_pd(fjz0,tz);
345 /**************************
346 * CALCULATE INTERACTIONS *
347 **************************/
349 r01 = _mm256_mul_pd(rsq01,rinv01);
351 /* Calculate table index by multiplying r with table scale and truncate to integer */
352 rt = _mm256_mul_pd(r01,vftabscale);
353 vfitab = _mm256_cvttpd_epi32(rt);
354 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
355 vfitab = _mm_slli_epi32(vfitab,2);
357 /* CUBIC SPLINE TABLE ELECTROSTATICS */
358 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
359 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
360 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
361 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
362 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
363 Heps = _mm256_mul_pd(vfeps,H);
364 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
365 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
366 velec = _mm256_mul_pd(qq01,VV);
367 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
368 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
370 /* Update potential sum for this i atom from the interaction with this j atom. */
371 velecsum = _mm256_add_pd(velecsum,velec);
375 /* Calculate temporary vectorial force */
376 tx = _mm256_mul_pd(fscal,dx01);
377 ty = _mm256_mul_pd(fscal,dy01);
378 tz = _mm256_mul_pd(fscal,dz01);
380 /* Update vectorial force */
381 fix0 = _mm256_add_pd(fix0,tx);
382 fiy0 = _mm256_add_pd(fiy0,ty);
383 fiz0 = _mm256_add_pd(fiz0,tz);
385 fjx1 = _mm256_add_pd(fjx1,tx);
386 fjy1 = _mm256_add_pd(fjy1,ty);
387 fjz1 = _mm256_add_pd(fjz1,tz);
389 /**************************
390 * CALCULATE INTERACTIONS *
391 **************************/
393 r02 = _mm256_mul_pd(rsq02,rinv02);
395 /* Calculate table index by multiplying r with table scale and truncate to integer */
396 rt = _mm256_mul_pd(r02,vftabscale);
397 vfitab = _mm256_cvttpd_epi32(rt);
398 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
399 vfitab = _mm_slli_epi32(vfitab,2);
401 /* CUBIC SPLINE TABLE ELECTROSTATICS */
402 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
403 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
404 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
405 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
406 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
407 Heps = _mm256_mul_pd(vfeps,H);
408 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
409 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
410 velec = _mm256_mul_pd(qq02,VV);
411 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
412 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
414 /* Update potential sum for this i atom from the interaction with this j atom. */
415 velecsum = _mm256_add_pd(velecsum,velec);
419 /* Calculate temporary vectorial force */
420 tx = _mm256_mul_pd(fscal,dx02);
421 ty = _mm256_mul_pd(fscal,dy02);
422 tz = _mm256_mul_pd(fscal,dz02);
424 /* Update vectorial force */
425 fix0 = _mm256_add_pd(fix0,tx);
426 fiy0 = _mm256_add_pd(fiy0,ty);
427 fiz0 = _mm256_add_pd(fiz0,tz);
429 fjx2 = _mm256_add_pd(fjx2,tx);
430 fjy2 = _mm256_add_pd(fjy2,ty);
431 fjz2 = _mm256_add_pd(fjz2,tz);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 r10 = _mm256_mul_pd(rsq10,rinv10);
439 /* Calculate table index by multiplying r with table scale and truncate to integer */
440 rt = _mm256_mul_pd(r10,vftabscale);
441 vfitab = _mm256_cvttpd_epi32(rt);
442 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
443 vfitab = _mm_slli_epi32(vfitab,2);
445 /* CUBIC SPLINE TABLE ELECTROSTATICS */
446 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
447 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
448 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
449 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
450 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
451 Heps = _mm256_mul_pd(vfeps,H);
452 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
453 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
454 velec = _mm256_mul_pd(qq10,VV);
455 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
456 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velecsum = _mm256_add_pd(velecsum,velec);
463 /* Calculate temporary vectorial force */
464 tx = _mm256_mul_pd(fscal,dx10);
465 ty = _mm256_mul_pd(fscal,dy10);
466 tz = _mm256_mul_pd(fscal,dz10);
468 /* Update vectorial force */
469 fix1 = _mm256_add_pd(fix1,tx);
470 fiy1 = _mm256_add_pd(fiy1,ty);
471 fiz1 = _mm256_add_pd(fiz1,tz);
473 fjx0 = _mm256_add_pd(fjx0,tx);
474 fjy0 = _mm256_add_pd(fjy0,ty);
475 fjz0 = _mm256_add_pd(fjz0,tz);
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 r11 = _mm256_mul_pd(rsq11,rinv11);
483 /* Calculate table index by multiplying r with table scale and truncate to integer */
484 rt = _mm256_mul_pd(r11,vftabscale);
485 vfitab = _mm256_cvttpd_epi32(rt);
486 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
487 vfitab = _mm_slli_epi32(vfitab,2);
489 /* CUBIC SPLINE TABLE ELECTROSTATICS */
490 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
491 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
492 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
493 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
494 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
495 Heps = _mm256_mul_pd(vfeps,H);
496 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
497 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
498 velec = _mm256_mul_pd(qq11,VV);
499 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
500 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
502 /* Update potential sum for this i atom from the interaction with this j atom. */
503 velecsum = _mm256_add_pd(velecsum,velec);
507 /* Calculate temporary vectorial force */
508 tx = _mm256_mul_pd(fscal,dx11);
509 ty = _mm256_mul_pd(fscal,dy11);
510 tz = _mm256_mul_pd(fscal,dz11);
512 /* Update vectorial force */
513 fix1 = _mm256_add_pd(fix1,tx);
514 fiy1 = _mm256_add_pd(fiy1,ty);
515 fiz1 = _mm256_add_pd(fiz1,tz);
517 fjx1 = _mm256_add_pd(fjx1,tx);
518 fjy1 = _mm256_add_pd(fjy1,ty);
519 fjz1 = _mm256_add_pd(fjz1,tz);
521 /**************************
522 * CALCULATE INTERACTIONS *
523 **************************/
525 r12 = _mm256_mul_pd(rsq12,rinv12);
527 /* Calculate table index by multiplying r with table scale and truncate to integer */
528 rt = _mm256_mul_pd(r12,vftabscale);
529 vfitab = _mm256_cvttpd_epi32(rt);
530 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
531 vfitab = _mm_slli_epi32(vfitab,2);
533 /* CUBIC SPLINE TABLE ELECTROSTATICS */
534 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
535 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
536 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
537 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
538 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
539 Heps = _mm256_mul_pd(vfeps,H);
540 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
541 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
542 velec = _mm256_mul_pd(qq12,VV);
543 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
544 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
546 /* Update potential sum for this i atom from the interaction with this j atom. */
547 velecsum = _mm256_add_pd(velecsum,velec);
551 /* Calculate temporary vectorial force */
552 tx = _mm256_mul_pd(fscal,dx12);
553 ty = _mm256_mul_pd(fscal,dy12);
554 tz = _mm256_mul_pd(fscal,dz12);
556 /* Update vectorial force */
557 fix1 = _mm256_add_pd(fix1,tx);
558 fiy1 = _mm256_add_pd(fiy1,ty);
559 fiz1 = _mm256_add_pd(fiz1,tz);
561 fjx2 = _mm256_add_pd(fjx2,tx);
562 fjy2 = _mm256_add_pd(fjy2,ty);
563 fjz2 = _mm256_add_pd(fjz2,tz);
565 /**************************
566 * CALCULATE INTERACTIONS *
567 **************************/
569 r20 = _mm256_mul_pd(rsq20,rinv20);
571 /* Calculate table index by multiplying r with table scale and truncate to integer */
572 rt = _mm256_mul_pd(r20,vftabscale);
573 vfitab = _mm256_cvttpd_epi32(rt);
574 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
575 vfitab = _mm_slli_epi32(vfitab,2);
577 /* CUBIC SPLINE TABLE ELECTROSTATICS */
578 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
579 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
580 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
581 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
582 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
583 Heps = _mm256_mul_pd(vfeps,H);
584 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
585 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
586 velec = _mm256_mul_pd(qq20,VV);
587 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
588 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
590 /* Update potential sum for this i atom from the interaction with this j atom. */
591 velecsum = _mm256_add_pd(velecsum,velec);
595 /* Calculate temporary vectorial force */
596 tx = _mm256_mul_pd(fscal,dx20);
597 ty = _mm256_mul_pd(fscal,dy20);
598 tz = _mm256_mul_pd(fscal,dz20);
600 /* Update vectorial force */
601 fix2 = _mm256_add_pd(fix2,tx);
602 fiy2 = _mm256_add_pd(fiy2,ty);
603 fiz2 = _mm256_add_pd(fiz2,tz);
605 fjx0 = _mm256_add_pd(fjx0,tx);
606 fjy0 = _mm256_add_pd(fjy0,ty);
607 fjz0 = _mm256_add_pd(fjz0,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 r21 = _mm256_mul_pd(rsq21,rinv21);
615 /* Calculate table index by multiplying r with table scale and truncate to integer */
616 rt = _mm256_mul_pd(r21,vftabscale);
617 vfitab = _mm256_cvttpd_epi32(rt);
618 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
619 vfitab = _mm_slli_epi32(vfitab,2);
621 /* CUBIC SPLINE TABLE ELECTROSTATICS */
622 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
623 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
624 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
625 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
626 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
627 Heps = _mm256_mul_pd(vfeps,H);
628 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
629 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
630 velec = _mm256_mul_pd(qq21,VV);
631 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
632 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
634 /* Update potential sum for this i atom from the interaction with this j atom. */
635 velecsum = _mm256_add_pd(velecsum,velec);
639 /* Calculate temporary vectorial force */
640 tx = _mm256_mul_pd(fscal,dx21);
641 ty = _mm256_mul_pd(fscal,dy21);
642 tz = _mm256_mul_pd(fscal,dz21);
644 /* Update vectorial force */
645 fix2 = _mm256_add_pd(fix2,tx);
646 fiy2 = _mm256_add_pd(fiy2,ty);
647 fiz2 = _mm256_add_pd(fiz2,tz);
649 fjx1 = _mm256_add_pd(fjx1,tx);
650 fjy1 = _mm256_add_pd(fjy1,ty);
651 fjz1 = _mm256_add_pd(fjz1,tz);
653 /**************************
654 * CALCULATE INTERACTIONS *
655 **************************/
657 r22 = _mm256_mul_pd(rsq22,rinv22);
659 /* Calculate table index by multiplying r with table scale and truncate to integer */
660 rt = _mm256_mul_pd(r22,vftabscale);
661 vfitab = _mm256_cvttpd_epi32(rt);
662 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
663 vfitab = _mm_slli_epi32(vfitab,2);
665 /* CUBIC SPLINE TABLE ELECTROSTATICS */
666 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
667 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
668 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
669 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
670 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
671 Heps = _mm256_mul_pd(vfeps,H);
672 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
673 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
674 velec = _mm256_mul_pd(qq22,VV);
675 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
676 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
678 /* Update potential sum for this i atom from the interaction with this j atom. */
679 velecsum = _mm256_add_pd(velecsum,velec);
683 /* Calculate temporary vectorial force */
684 tx = _mm256_mul_pd(fscal,dx22);
685 ty = _mm256_mul_pd(fscal,dy22);
686 tz = _mm256_mul_pd(fscal,dz22);
688 /* Update vectorial force */
689 fix2 = _mm256_add_pd(fix2,tx);
690 fiy2 = _mm256_add_pd(fiy2,ty);
691 fiz2 = _mm256_add_pd(fiz2,tz);
693 fjx2 = _mm256_add_pd(fjx2,tx);
694 fjy2 = _mm256_add_pd(fjy2,ty);
695 fjz2 = _mm256_add_pd(fjz2,tz);
697 fjptrA = f+j_coord_offsetA;
698 fjptrB = f+j_coord_offsetB;
699 fjptrC = f+j_coord_offsetC;
700 fjptrD = f+j_coord_offsetD;
702 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
703 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
705 /* Inner loop uses 400 flops */
711 /* Get j neighbor index, and coordinate index */
712 jnrlistA = jjnr[jidx];
713 jnrlistB = jjnr[jidx+1];
714 jnrlistC = jjnr[jidx+2];
715 jnrlistD = jjnr[jidx+3];
716 /* Sign of each element will be negative for non-real atoms.
717 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
718 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
720 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
722 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
723 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
724 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
726 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
727 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
728 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
729 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
730 j_coord_offsetA = DIM*jnrA;
731 j_coord_offsetB = DIM*jnrB;
732 j_coord_offsetC = DIM*jnrC;
733 j_coord_offsetD = DIM*jnrD;
735 /* load j atom coordinates */
736 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
737 x+j_coord_offsetC,x+j_coord_offsetD,
738 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
740 /* Calculate displacement vector */
741 dx00 = _mm256_sub_pd(ix0,jx0);
742 dy00 = _mm256_sub_pd(iy0,jy0);
743 dz00 = _mm256_sub_pd(iz0,jz0);
744 dx01 = _mm256_sub_pd(ix0,jx1);
745 dy01 = _mm256_sub_pd(iy0,jy1);
746 dz01 = _mm256_sub_pd(iz0,jz1);
747 dx02 = _mm256_sub_pd(ix0,jx2);
748 dy02 = _mm256_sub_pd(iy0,jy2);
749 dz02 = _mm256_sub_pd(iz0,jz2);
750 dx10 = _mm256_sub_pd(ix1,jx0);
751 dy10 = _mm256_sub_pd(iy1,jy0);
752 dz10 = _mm256_sub_pd(iz1,jz0);
753 dx11 = _mm256_sub_pd(ix1,jx1);
754 dy11 = _mm256_sub_pd(iy1,jy1);
755 dz11 = _mm256_sub_pd(iz1,jz1);
756 dx12 = _mm256_sub_pd(ix1,jx2);
757 dy12 = _mm256_sub_pd(iy1,jy2);
758 dz12 = _mm256_sub_pd(iz1,jz2);
759 dx20 = _mm256_sub_pd(ix2,jx0);
760 dy20 = _mm256_sub_pd(iy2,jy0);
761 dz20 = _mm256_sub_pd(iz2,jz0);
762 dx21 = _mm256_sub_pd(ix2,jx1);
763 dy21 = _mm256_sub_pd(iy2,jy1);
764 dz21 = _mm256_sub_pd(iz2,jz1);
765 dx22 = _mm256_sub_pd(ix2,jx2);
766 dy22 = _mm256_sub_pd(iy2,jy2);
767 dz22 = _mm256_sub_pd(iz2,jz2);
769 /* Calculate squared distance and things based on it */
770 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
771 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
772 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
773 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
774 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
775 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
776 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
777 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
778 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
780 rinv00 = avx256_invsqrt_d(rsq00);
781 rinv01 = avx256_invsqrt_d(rsq01);
782 rinv02 = avx256_invsqrt_d(rsq02);
783 rinv10 = avx256_invsqrt_d(rsq10);
784 rinv11 = avx256_invsqrt_d(rsq11);
785 rinv12 = avx256_invsqrt_d(rsq12);
786 rinv20 = avx256_invsqrt_d(rsq20);
787 rinv21 = avx256_invsqrt_d(rsq21);
788 rinv22 = avx256_invsqrt_d(rsq22);
790 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
792 fjx0 = _mm256_setzero_pd();
793 fjy0 = _mm256_setzero_pd();
794 fjz0 = _mm256_setzero_pd();
795 fjx1 = _mm256_setzero_pd();
796 fjy1 = _mm256_setzero_pd();
797 fjz1 = _mm256_setzero_pd();
798 fjx2 = _mm256_setzero_pd();
799 fjy2 = _mm256_setzero_pd();
800 fjz2 = _mm256_setzero_pd();
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 r00 = _mm256_mul_pd(rsq00,rinv00);
807 r00 = _mm256_andnot_pd(dummy_mask,r00);
809 /* Calculate table index by multiplying r with table scale and truncate to integer */
810 rt = _mm256_mul_pd(r00,vftabscale);
811 vfitab = _mm256_cvttpd_epi32(rt);
812 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
813 vfitab = _mm_slli_epi32(vfitab,2);
815 /* CUBIC SPLINE TABLE ELECTROSTATICS */
816 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
817 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
818 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
819 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
820 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
821 Heps = _mm256_mul_pd(vfeps,H);
822 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
823 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
824 velec = _mm256_mul_pd(qq00,VV);
825 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
826 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
828 /* LENNARD-JONES DISPERSION/REPULSION */
830 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
831 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
832 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
833 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
834 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
836 /* Update potential sum for this i atom from the interaction with this j atom. */
837 velec = _mm256_andnot_pd(dummy_mask,velec);
838 velecsum = _mm256_add_pd(velecsum,velec);
839 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
840 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
842 fscal = _mm256_add_pd(felec,fvdw);
844 fscal = _mm256_andnot_pd(dummy_mask,fscal);
846 /* Calculate temporary vectorial force */
847 tx = _mm256_mul_pd(fscal,dx00);
848 ty = _mm256_mul_pd(fscal,dy00);
849 tz = _mm256_mul_pd(fscal,dz00);
851 /* Update vectorial force */
852 fix0 = _mm256_add_pd(fix0,tx);
853 fiy0 = _mm256_add_pd(fiy0,ty);
854 fiz0 = _mm256_add_pd(fiz0,tz);
856 fjx0 = _mm256_add_pd(fjx0,tx);
857 fjy0 = _mm256_add_pd(fjy0,ty);
858 fjz0 = _mm256_add_pd(fjz0,tz);
860 /**************************
861 * CALCULATE INTERACTIONS *
862 **************************/
864 r01 = _mm256_mul_pd(rsq01,rinv01);
865 r01 = _mm256_andnot_pd(dummy_mask,r01);
867 /* Calculate table index by multiplying r with table scale and truncate to integer */
868 rt = _mm256_mul_pd(r01,vftabscale);
869 vfitab = _mm256_cvttpd_epi32(rt);
870 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
871 vfitab = _mm_slli_epi32(vfitab,2);
873 /* CUBIC SPLINE TABLE ELECTROSTATICS */
874 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
875 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
876 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
877 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
878 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
879 Heps = _mm256_mul_pd(vfeps,H);
880 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
881 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
882 velec = _mm256_mul_pd(qq01,VV);
883 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
884 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
886 /* Update potential sum for this i atom from the interaction with this j atom. */
887 velec = _mm256_andnot_pd(dummy_mask,velec);
888 velecsum = _mm256_add_pd(velecsum,velec);
892 fscal = _mm256_andnot_pd(dummy_mask,fscal);
894 /* Calculate temporary vectorial force */
895 tx = _mm256_mul_pd(fscal,dx01);
896 ty = _mm256_mul_pd(fscal,dy01);
897 tz = _mm256_mul_pd(fscal,dz01);
899 /* Update vectorial force */
900 fix0 = _mm256_add_pd(fix0,tx);
901 fiy0 = _mm256_add_pd(fiy0,ty);
902 fiz0 = _mm256_add_pd(fiz0,tz);
904 fjx1 = _mm256_add_pd(fjx1,tx);
905 fjy1 = _mm256_add_pd(fjy1,ty);
906 fjz1 = _mm256_add_pd(fjz1,tz);
908 /**************************
909 * CALCULATE INTERACTIONS *
910 **************************/
912 r02 = _mm256_mul_pd(rsq02,rinv02);
913 r02 = _mm256_andnot_pd(dummy_mask,r02);
915 /* Calculate table index by multiplying r with table scale and truncate to integer */
916 rt = _mm256_mul_pd(r02,vftabscale);
917 vfitab = _mm256_cvttpd_epi32(rt);
918 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
919 vfitab = _mm_slli_epi32(vfitab,2);
921 /* CUBIC SPLINE TABLE ELECTROSTATICS */
922 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
923 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
924 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
925 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
926 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
927 Heps = _mm256_mul_pd(vfeps,H);
928 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
929 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
930 velec = _mm256_mul_pd(qq02,VV);
931 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
932 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
934 /* Update potential sum for this i atom from the interaction with this j atom. */
935 velec = _mm256_andnot_pd(dummy_mask,velec);
936 velecsum = _mm256_add_pd(velecsum,velec);
940 fscal = _mm256_andnot_pd(dummy_mask,fscal);
942 /* Calculate temporary vectorial force */
943 tx = _mm256_mul_pd(fscal,dx02);
944 ty = _mm256_mul_pd(fscal,dy02);
945 tz = _mm256_mul_pd(fscal,dz02);
947 /* Update vectorial force */
948 fix0 = _mm256_add_pd(fix0,tx);
949 fiy0 = _mm256_add_pd(fiy0,ty);
950 fiz0 = _mm256_add_pd(fiz0,tz);
952 fjx2 = _mm256_add_pd(fjx2,tx);
953 fjy2 = _mm256_add_pd(fjy2,ty);
954 fjz2 = _mm256_add_pd(fjz2,tz);
956 /**************************
957 * CALCULATE INTERACTIONS *
958 **************************/
960 r10 = _mm256_mul_pd(rsq10,rinv10);
961 r10 = _mm256_andnot_pd(dummy_mask,r10);
963 /* Calculate table index by multiplying r with table scale and truncate to integer */
964 rt = _mm256_mul_pd(r10,vftabscale);
965 vfitab = _mm256_cvttpd_epi32(rt);
966 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
967 vfitab = _mm_slli_epi32(vfitab,2);
969 /* CUBIC SPLINE TABLE ELECTROSTATICS */
970 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
971 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
972 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
973 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
974 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
975 Heps = _mm256_mul_pd(vfeps,H);
976 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
977 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
978 velec = _mm256_mul_pd(qq10,VV);
979 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
980 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
982 /* Update potential sum for this i atom from the interaction with this j atom. */
983 velec = _mm256_andnot_pd(dummy_mask,velec);
984 velecsum = _mm256_add_pd(velecsum,velec);
988 fscal = _mm256_andnot_pd(dummy_mask,fscal);
990 /* Calculate temporary vectorial force */
991 tx = _mm256_mul_pd(fscal,dx10);
992 ty = _mm256_mul_pd(fscal,dy10);
993 tz = _mm256_mul_pd(fscal,dz10);
995 /* Update vectorial force */
996 fix1 = _mm256_add_pd(fix1,tx);
997 fiy1 = _mm256_add_pd(fiy1,ty);
998 fiz1 = _mm256_add_pd(fiz1,tz);
1000 fjx0 = _mm256_add_pd(fjx0,tx);
1001 fjy0 = _mm256_add_pd(fjy0,ty);
1002 fjz0 = _mm256_add_pd(fjz0,tz);
1004 /**************************
1005 * CALCULATE INTERACTIONS *
1006 **************************/
1008 r11 = _mm256_mul_pd(rsq11,rinv11);
1009 r11 = _mm256_andnot_pd(dummy_mask,r11);
1011 /* Calculate table index by multiplying r with table scale and truncate to integer */
1012 rt = _mm256_mul_pd(r11,vftabscale);
1013 vfitab = _mm256_cvttpd_epi32(rt);
1014 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1015 vfitab = _mm_slli_epi32(vfitab,2);
1017 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1018 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1019 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1020 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1021 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1022 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1023 Heps = _mm256_mul_pd(vfeps,H);
1024 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1025 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1026 velec = _mm256_mul_pd(qq11,VV);
1027 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1028 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1030 /* Update potential sum for this i atom from the interaction with this j atom. */
1031 velec = _mm256_andnot_pd(dummy_mask,velec);
1032 velecsum = _mm256_add_pd(velecsum,velec);
1036 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1038 /* Calculate temporary vectorial force */
1039 tx = _mm256_mul_pd(fscal,dx11);
1040 ty = _mm256_mul_pd(fscal,dy11);
1041 tz = _mm256_mul_pd(fscal,dz11);
1043 /* Update vectorial force */
1044 fix1 = _mm256_add_pd(fix1,tx);
1045 fiy1 = _mm256_add_pd(fiy1,ty);
1046 fiz1 = _mm256_add_pd(fiz1,tz);
1048 fjx1 = _mm256_add_pd(fjx1,tx);
1049 fjy1 = _mm256_add_pd(fjy1,ty);
1050 fjz1 = _mm256_add_pd(fjz1,tz);
1052 /**************************
1053 * CALCULATE INTERACTIONS *
1054 **************************/
1056 r12 = _mm256_mul_pd(rsq12,rinv12);
1057 r12 = _mm256_andnot_pd(dummy_mask,r12);
1059 /* Calculate table index by multiplying r with table scale and truncate to integer */
1060 rt = _mm256_mul_pd(r12,vftabscale);
1061 vfitab = _mm256_cvttpd_epi32(rt);
1062 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1063 vfitab = _mm_slli_epi32(vfitab,2);
1065 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1066 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1067 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1068 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1069 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1070 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1071 Heps = _mm256_mul_pd(vfeps,H);
1072 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1073 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1074 velec = _mm256_mul_pd(qq12,VV);
1075 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1076 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1078 /* Update potential sum for this i atom from the interaction with this j atom. */
1079 velec = _mm256_andnot_pd(dummy_mask,velec);
1080 velecsum = _mm256_add_pd(velecsum,velec);
1084 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1086 /* Calculate temporary vectorial force */
1087 tx = _mm256_mul_pd(fscal,dx12);
1088 ty = _mm256_mul_pd(fscal,dy12);
1089 tz = _mm256_mul_pd(fscal,dz12);
1091 /* Update vectorial force */
1092 fix1 = _mm256_add_pd(fix1,tx);
1093 fiy1 = _mm256_add_pd(fiy1,ty);
1094 fiz1 = _mm256_add_pd(fiz1,tz);
1096 fjx2 = _mm256_add_pd(fjx2,tx);
1097 fjy2 = _mm256_add_pd(fjy2,ty);
1098 fjz2 = _mm256_add_pd(fjz2,tz);
1100 /**************************
1101 * CALCULATE INTERACTIONS *
1102 **************************/
1104 r20 = _mm256_mul_pd(rsq20,rinv20);
1105 r20 = _mm256_andnot_pd(dummy_mask,r20);
1107 /* Calculate table index by multiplying r with table scale and truncate to integer */
1108 rt = _mm256_mul_pd(r20,vftabscale);
1109 vfitab = _mm256_cvttpd_epi32(rt);
1110 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1111 vfitab = _mm_slli_epi32(vfitab,2);
1113 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1114 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1115 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1116 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1117 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1118 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1119 Heps = _mm256_mul_pd(vfeps,H);
1120 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1121 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1122 velec = _mm256_mul_pd(qq20,VV);
1123 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1124 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1126 /* Update potential sum for this i atom from the interaction with this j atom. */
1127 velec = _mm256_andnot_pd(dummy_mask,velec);
1128 velecsum = _mm256_add_pd(velecsum,velec);
1132 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1134 /* Calculate temporary vectorial force */
1135 tx = _mm256_mul_pd(fscal,dx20);
1136 ty = _mm256_mul_pd(fscal,dy20);
1137 tz = _mm256_mul_pd(fscal,dz20);
1139 /* Update vectorial force */
1140 fix2 = _mm256_add_pd(fix2,tx);
1141 fiy2 = _mm256_add_pd(fiy2,ty);
1142 fiz2 = _mm256_add_pd(fiz2,tz);
1144 fjx0 = _mm256_add_pd(fjx0,tx);
1145 fjy0 = _mm256_add_pd(fjy0,ty);
1146 fjz0 = _mm256_add_pd(fjz0,tz);
1148 /**************************
1149 * CALCULATE INTERACTIONS *
1150 **************************/
1152 r21 = _mm256_mul_pd(rsq21,rinv21);
1153 r21 = _mm256_andnot_pd(dummy_mask,r21);
1155 /* Calculate table index by multiplying r with table scale and truncate to integer */
1156 rt = _mm256_mul_pd(r21,vftabscale);
1157 vfitab = _mm256_cvttpd_epi32(rt);
1158 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1159 vfitab = _mm_slli_epi32(vfitab,2);
1161 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1162 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1163 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1164 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1165 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1166 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1167 Heps = _mm256_mul_pd(vfeps,H);
1168 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1169 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1170 velec = _mm256_mul_pd(qq21,VV);
1171 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1172 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1174 /* Update potential sum for this i atom from the interaction with this j atom. */
1175 velec = _mm256_andnot_pd(dummy_mask,velec);
1176 velecsum = _mm256_add_pd(velecsum,velec);
1180 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1182 /* Calculate temporary vectorial force */
1183 tx = _mm256_mul_pd(fscal,dx21);
1184 ty = _mm256_mul_pd(fscal,dy21);
1185 tz = _mm256_mul_pd(fscal,dz21);
1187 /* Update vectorial force */
1188 fix2 = _mm256_add_pd(fix2,tx);
1189 fiy2 = _mm256_add_pd(fiy2,ty);
1190 fiz2 = _mm256_add_pd(fiz2,tz);
1192 fjx1 = _mm256_add_pd(fjx1,tx);
1193 fjy1 = _mm256_add_pd(fjy1,ty);
1194 fjz1 = _mm256_add_pd(fjz1,tz);
1196 /**************************
1197 * CALCULATE INTERACTIONS *
1198 **************************/
1200 r22 = _mm256_mul_pd(rsq22,rinv22);
1201 r22 = _mm256_andnot_pd(dummy_mask,r22);
1203 /* Calculate table index by multiplying r with table scale and truncate to integer */
1204 rt = _mm256_mul_pd(r22,vftabscale);
1205 vfitab = _mm256_cvttpd_epi32(rt);
1206 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1207 vfitab = _mm_slli_epi32(vfitab,2);
1209 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1210 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1211 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1212 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1213 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1214 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1215 Heps = _mm256_mul_pd(vfeps,H);
1216 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1217 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1218 velec = _mm256_mul_pd(qq22,VV);
1219 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1220 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1222 /* Update potential sum for this i atom from the interaction with this j atom. */
1223 velec = _mm256_andnot_pd(dummy_mask,velec);
1224 velecsum = _mm256_add_pd(velecsum,velec);
1228 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1230 /* Calculate temporary vectorial force */
1231 tx = _mm256_mul_pd(fscal,dx22);
1232 ty = _mm256_mul_pd(fscal,dy22);
1233 tz = _mm256_mul_pd(fscal,dz22);
1235 /* Update vectorial force */
1236 fix2 = _mm256_add_pd(fix2,tx);
1237 fiy2 = _mm256_add_pd(fiy2,ty);
1238 fiz2 = _mm256_add_pd(fiz2,tz);
1240 fjx2 = _mm256_add_pd(fjx2,tx);
1241 fjy2 = _mm256_add_pd(fjy2,ty);
1242 fjz2 = _mm256_add_pd(fjz2,tz);
1244 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1245 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1246 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1247 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1249 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1250 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1252 /* Inner loop uses 409 flops */
1255 /* End of innermost loop */
1257 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1258 f+i_coord_offset,fshift+i_shift_offset);
1261 /* Update potential energies */
1262 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1263 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1265 /* Increment number of inner iterations */
1266 inneriter += j_index_end - j_index_start;
1268 /* Outer loop uses 20 flops */
1271 /* Increment number of outer iterations */
1274 /* Update outer/inner flops */
1276 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1279 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1280 * Electrostatics interaction: CubicSplineTable
1281 * VdW interaction: LennardJones
1282 * Geometry: Water3-Water3
1283 * Calculate force/pot: Force
1286 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1287 (t_nblist * gmx_restrict nlist,
1288 rvec * gmx_restrict xx,
1289 rvec * gmx_restrict ff,
1290 struct t_forcerec * gmx_restrict fr,
1291 t_mdatoms * gmx_restrict mdatoms,
1292 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1293 t_nrnb * gmx_restrict nrnb)
1295 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1296 * just 0 for non-waters.
1297 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1298 * jnr indices corresponding to data put in the four positions in the SIMD register.
1300 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1301 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1302 int jnrA,jnrB,jnrC,jnrD;
1303 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1304 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1305 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1306 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1307 real rcutoff_scalar;
1308 real *shiftvec,*fshift,*x,*f;
1309 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1310 real scratch[4*DIM];
1311 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1312 real * vdwioffsetptr0;
1313 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1314 real * vdwioffsetptr1;
1315 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1316 real * vdwioffsetptr2;
1317 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1318 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1319 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1320 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1321 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1322 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1323 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1324 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1325 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1326 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1327 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1328 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1329 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1330 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1331 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1332 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1333 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1336 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1339 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1340 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1342 __m128i ifour = _mm_set1_epi32(4);
1343 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1345 __m256d dummy_mask,cutoff_mask;
1346 __m128 tmpmask0,tmpmask1;
1347 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1348 __m256d one = _mm256_set1_pd(1.0);
1349 __m256d two = _mm256_set1_pd(2.0);
1355 jindex = nlist->jindex;
1357 shiftidx = nlist->shift;
1359 shiftvec = fr->shift_vec[0];
1360 fshift = fr->fshift[0];
1361 facel = _mm256_set1_pd(fr->ic->epsfac);
1362 charge = mdatoms->chargeA;
1363 nvdwtype = fr->ntype;
1364 vdwparam = fr->nbfp;
1365 vdwtype = mdatoms->typeA;
1367 vftab = kernel_data->table_elec->data;
1368 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
1370 /* Setup water-specific parameters */
1371 inr = nlist->iinr[0];
1372 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1373 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1374 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1375 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1377 jq0 = _mm256_set1_pd(charge[inr+0]);
1378 jq1 = _mm256_set1_pd(charge[inr+1]);
1379 jq2 = _mm256_set1_pd(charge[inr+2]);
1380 vdwjidx0A = 2*vdwtype[inr+0];
1381 qq00 = _mm256_mul_pd(iq0,jq0);
1382 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1383 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1384 qq01 = _mm256_mul_pd(iq0,jq1);
1385 qq02 = _mm256_mul_pd(iq0,jq2);
1386 qq10 = _mm256_mul_pd(iq1,jq0);
1387 qq11 = _mm256_mul_pd(iq1,jq1);
1388 qq12 = _mm256_mul_pd(iq1,jq2);
1389 qq20 = _mm256_mul_pd(iq2,jq0);
1390 qq21 = _mm256_mul_pd(iq2,jq1);
1391 qq22 = _mm256_mul_pd(iq2,jq2);
1393 /* Avoid stupid compiler warnings */
1394 jnrA = jnrB = jnrC = jnrD = 0;
1395 j_coord_offsetA = 0;
1396 j_coord_offsetB = 0;
1397 j_coord_offsetC = 0;
1398 j_coord_offsetD = 0;
1403 for(iidx=0;iidx<4*DIM;iidx++)
1405 scratch[iidx] = 0.0;
1408 /* Start outer loop over neighborlists */
1409 for(iidx=0; iidx<nri; iidx++)
1411 /* Load shift vector for this list */
1412 i_shift_offset = DIM*shiftidx[iidx];
1414 /* Load limits for loop over neighbors */
1415 j_index_start = jindex[iidx];
1416 j_index_end = jindex[iidx+1];
1418 /* Get outer coordinate index */
1420 i_coord_offset = DIM*inr;
1422 /* Load i particle coords and add shift vector */
1423 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1424 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1426 fix0 = _mm256_setzero_pd();
1427 fiy0 = _mm256_setzero_pd();
1428 fiz0 = _mm256_setzero_pd();
1429 fix1 = _mm256_setzero_pd();
1430 fiy1 = _mm256_setzero_pd();
1431 fiz1 = _mm256_setzero_pd();
1432 fix2 = _mm256_setzero_pd();
1433 fiy2 = _mm256_setzero_pd();
1434 fiz2 = _mm256_setzero_pd();
1436 /* Start inner kernel loop */
1437 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1440 /* Get j neighbor index, and coordinate index */
1442 jnrB = jjnr[jidx+1];
1443 jnrC = jjnr[jidx+2];
1444 jnrD = jjnr[jidx+3];
1445 j_coord_offsetA = DIM*jnrA;
1446 j_coord_offsetB = DIM*jnrB;
1447 j_coord_offsetC = DIM*jnrC;
1448 j_coord_offsetD = DIM*jnrD;
1450 /* load j atom coordinates */
1451 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1452 x+j_coord_offsetC,x+j_coord_offsetD,
1453 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1455 /* Calculate displacement vector */
1456 dx00 = _mm256_sub_pd(ix0,jx0);
1457 dy00 = _mm256_sub_pd(iy0,jy0);
1458 dz00 = _mm256_sub_pd(iz0,jz0);
1459 dx01 = _mm256_sub_pd(ix0,jx1);
1460 dy01 = _mm256_sub_pd(iy0,jy1);
1461 dz01 = _mm256_sub_pd(iz0,jz1);
1462 dx02 = _mm256_sub_pd(ix0,jx2);
1463 dy02 = _mm256_sub_pd(iy0,jy2);
1464 dz02 = _mm256_sub_pd(iz0,jz2);
1465 dx10 = _mm256_sub_pd(ix1,jx0);
1466 dy10 = _mm256_sub_pd(iy1,jy0);
1467 dz10 = _mm256_sub_pd(iz1,jz0);
1468 dx11 = _mm256_sub_pd(ix1,jx1);
1469 dy11 = _mm256_sub_pd(iy1,jy1);
1470 dz11 = _mm256_sub_pd(iz1,jz1);
1471 dx12 = _mm256_sub_pd(ix1,jx2);
1472 dy12 = _mm256_sub_pd(iy1,jy2);
1473 dz12 = _mm256_sub_pd(iz1,jz2);
1474 dx20 = _mm256_sub_pd(ix2,jx0);
1475 dy20 = _mm256_sub_pd(iy2,jy0);
1476 dz20 = _mm256_sub_pd(iz2,jz0);
1477 dx21 = _mm256_sub_pd(ix2,jx1);
1478 dy21 = _mm256_sub_pd(iy2,jy1);
1479 dz21 = _mm256_sub_pd(iz2,jz1);
1480 dx22 = _mm256_sub_pd(ix2,jx2);
1481 dy22 = _mm256_sub_pd(iy2,jy2);
1482 dz22 = _mm256_sub_pd(iz2,jz2);
1484 /* Calculate squared distance and things based on it */
1485 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1486 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1487 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1488 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1489 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1490 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1491 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1492 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1493 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1495 rinv00 = avx256_invsqrt_d(rsq00);
1496 rinv01 = avx256_invsqrt_d(rsq01);
1497 rinv02 = avx256_invsqrt_d(rsq02);
1498 rinv10 = avx256_invsqrt_d(rsq10);
1499 rinv11 = avx256_invsqrt_d(rsq11);
1500 rinv12 = avx256_invsqrt_d(rsq12);
1501 rinv20 = avx256_invsqrt_d(rsq20);
1502 rinv21 = avx256_invsqrt_d(rsq21);
1503 rinv22 = avx256_invsqrt_d(rsq22);
1505 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1507 fjx0 = _mm256_setzero_pd();
1508 fjy0 = _mm256_setzero_pd();
1509 fjz0 = _mm256_setzero_pd();
1510 fjx1 = _mm256_setzero_pd();
1511 fjy1 = _mm256_setzero_pd();
1512 fjz1 = _mm256_setzero_pd();
1513 fjx2 = _mm256_setzero_pd();
1514 fjy2 = _mm256_setzero_pd();
1515 fjz2 = _mm256_setzero_pd();
1517 /**************************
1518 * CALCULATE INTERACTIONS *
1519 **************************/
1521 r00 = _mm256_mul_pd(rsq00,rinv00);
1523 /* Calculate table index by multiplying r with table scale and truncate to integer */
1524 rt = _mm256_mul_pd(r00,vftabscale);
1525 vfitab = _mm256_cvttpd_epi32(rt);
1526 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1527 vfitab = _mm_slli_epi32(vfitab,2);
1529 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1530 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1531 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1532 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1533 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1534 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1535 Heps = _mm256_mul_pd(vfeps,H);
1536 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1537 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1538 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1540 /* LENNARD-JONES DISPERSION/REPULSION */
1542 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1543 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1545 fscal = _mm256_add_pd(felec,fvdw);
1547 /* Calculate temporary vectorial force */
1548 tx = _mm256_mul_pd(fscal,dx00);
1549 ty = _mm256_mul_pd(fscal,dy00);
1550 tz = _mm256_mul_pd(fscal,dz00);
1552 /* Update vectorial force */
1553 fix0 = _mm256_add_pd(fix0,tx);
1554 fiy0 = _mm256_add_pd(fiy0,ty);
1555 fiz0 = _mm256_add_pd(fiz0,tz);
1557 fjx0 = _mm256_add_pd(fjx0,tx);
1558 fjy0 = _mm256_add_pd(fjy0,ty);
1559 fjz0 = _mm256_add_pd(fjz0,tz);
1561 /**************************
1562 * CALCULATE INTERACTIONS *
1563 **************************/
1565 r01 = _mm256_mul_pd(rsq01,rinv01);
1567 /* Calculate table index by multiplying r with table scale and truncate to integer */
1568 rt = _mm256_mul_pd(r01,vftabscale);
1569 vfitab = _mm256_cvttpd_epi32(rt);
1570 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1571 vfitab = _mm_slli_epi32(vfitab,2);
1573 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1574 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1575 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1576 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1577 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1578 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1579 Heps = _mm256_mul_pd(vfeps,H);
1580 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1581 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1582 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1586 /* Calculate temporary vectorial force */
1587 tx = _mm256_mul_pd(fscal,dx01);
1588 ty = _mm256_mul_pd(fscal,dy01);
1589 tz = _mm256_mul_pd(fscal,dz01);
1591 /* Update vectorial force */
1592 fix0 = _mm256_add_pd(fix0,tx);
1593 fiy0 = _mm256_add_pd(fiy0,ty);
1594 fiz0 = _mm256_add_pd(fiz0,tz);
1596 fjx1 = _mm256_add_pd(fjx1,tx);
1597 fjy1 = _mm256_add_pd(fjy1,ty);
1598 fjz1 = _mm256_add_pd(fjz1,tz);
1600 /**************************
1601 * CALCULATE INTERACTIONS *
1602 **************************/
1604 r02 = _mm256_mul_pd(rsq02,rinv02);
1606 /* Calculate table index by multiplying r with table scale and truncate to integer */
1607 rt = _mm256_mul_pd(r02,vftabscale);
1608 vfitab = _mm256_cvttpd_epi32(rt);
1609 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1610 vfitab = _mm_slli_epi32(vfitab,2);
1612 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1613 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1614 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1615 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1616 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1617 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1618 Heps = _mm256_mul_pd(vfeps,H);
1619 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1620 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1621 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
1625 /* Calculate temporary vectorial force */
1626 tx = _mm256_mul_pd(fscal,dx02);
1627 ty = _mm256_mul_pd(fscal,dy02);
1628 tz = _mm256_mul_pd(fscal,dz02);
1630 /* Update vectorial force */
1631 fix0 = _mm256_add_pd(fix0,tx);
1632 fiy0 = _mm256_add_pd(fiy0,ty);
1633 fiz0 = _mm256_add_pd(fiz0,tz);
1635 fjx2 = _mm256_add_pd(fjx2,tx);
1636 fjy2 = _mm256_add_pd(fjy2,ty);
1637 fjz2 = _mm256_add_pd(fjz2,tz);
1639 /**************************
1640 * CALCULATE INTERACTIONS *
1641 **************************/
1643 r10 = _mm256_mul_pd(rsq10,rinv10);
1645 /* Calculate table index by multiplying r with table scale and truncate to integer */
1646 rt = _mm256_mul_pd(r10,vftabscale);
1647 vfitab = _mm256_cvttpd_epi32(rt);
1648 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1649 vfitab = _mm_slli_epi32(vfitab,2);
1651 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1652 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1653 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1654 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1655 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1656 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1657 Heps = _mm256_mul_pd(vfeps,H);
1658 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1659 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1660 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1664 /* Calculate temporary vectorial force */
1665 tx = _mm256_mul_pd(fscal,dx10);
1666 ty = _mm256_mul_pd(fscal,dy10);
1667 tz = _mm256_mul_pd(fscal,dz10);
1669 /* Update vectorial force */
1670 fix1 = _mm256_add_pd(fix1,tx);
1671 fiy1 = _mm256_add_pd(fiy1,ty);
1672 fiz1 = _mm256_add_pd(fiz1,tz);
1674 fjx0 = _mm256_add_pd(fjx0,tx);
1675 fjy0 = _mm256_add_pd(fjy0,ty);
1676 fjz0 = _mm256_add_pd(fjz0,tz);
1678 /**************************
1679 * CALCULATE INTERACTIONS *
1680 **************************/
1682 r11 = _mm256_mul_pd(rsq11,rinv11);
1684 /* Calculate table index by multiplying r with table scale and truncate to integer */
1685 rt = _mm256_mul_pd(r11,vftabscale);
1686 vfitab = _mm256_cvttpd_epi32(rt);
1687 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1688 vfitab = _mm_slli_epi32(vfitab,2);
1690 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1691 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1692 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1693 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1694 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1695 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1696 Heps = _mm256_mul_pd(vfeps,H);
1697 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1698 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1699 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1703 /* Calculate temporary vectorial force */
1704 tx = _mm256_mul_pd(fscal,dx11);
1705 ty = _mm256_mul_pd(fscal,dy11);
1706 tz = _mm256_mul_pd(fscal,dz11);
1708 /* Update vectorial force */
1709 fix1 = _mm256_add_pd(fix1,tx);
1710 fiy1 = _mm256_add_pd(fiy1,ty);
1711 fiz1 = _mm256_add_pd(fiz1,tz);
1713 fjx1 = _mm256_add_pd(fjx1,tx);
1714 fjy1 = _mm256_add_pd(fjy1,ty);
1715 fjz1 = _mm256_add_pd(fjz1,tz);
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 r12 = _mm256_mul_pd(rsq12,rinv12);
1723 /* Calculate table index by multiplying r with table scale and truncate to integer */
1724 rt = _mm256_mul_pd(r12,vftabscale);
1725 vfitab = _mm256_cvttpd_epi32(rt);
1726 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1727 vfitab = _mm_slli_epi32(vfitab,2);
1729 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1730 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1731 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1732 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1733 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1734 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1735 Heps = _mm256_mul_pd(vfeps,H);
1736 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1737 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1738 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1742 /* Calculate temporary vectorial force */
1743 tx = _mm256_mul_pd(fscal,dx12);
1744 ty = _mm256_mul_pd(fscal,dy12);
1745 tz = _mm256_mul_pd(fscal,dz12);
1747 /* Update vectorial force */
1748 fix1 = _mm256_add_pd(fix1,tx);
1749 fiy1 = _mm256_add_pd(fiy1,ty);
1750 fiz1 = _mm256_add_pd(fiz1,tz);
1752 fjx2 = _mm256_add_pd(fjx2,tx);
1753 fjy2 = _mm256_add_pd(fjy2,ty);
1754 fjz2 = _mm256_add_pd(fjz2,tz);
1756 /**************************
1757 * CALCULATE INTERACTIONS *
1758 **************************/
1760 r20 = _mm256_mul_pd(rsq20,rinv20);
1762 /* Calculate table index by multiplying r with table scale and truncate to integer */
1763 rt = _mm256_mul_pd(r20,vftabscale);
1764 vfitab = _mm256_cvttpd_epi32(rt);
1765 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1766 vfitab = _mm_slli_epi32(vfitab,2);
1768 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1769 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1770 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1771 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1772 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1773 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1774 Heps = _mm256_mul_pd(vfeps,H);
1775 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1776 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1777 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1781 /* Calculate temporary vectorial force */
1782 tx = _mm256_mul_pd(fscal,dx20);
1783 ty = _mm256_mul_pd(fscal,dy20);
1784 tz = _mm256_mul_pd(fscal,dz20);
1786 /* Update vectorial force */
1787 fix2 = _mm256_add_pd(fix2,tx);
1788 fiy2 = _mm256_add_pd(fiy2,ty);
1789 fiz2 = _mm256_add_pd(fiz2,tz);
1791 fjx0 = _mm256_add_pd(fjx0,tx);
1792 fjy0 = _mm256_add_pd(fjy0,ty);
1793 fjz0 = _mm256_add_pd(fjz0,tz);
1795 /**************************
1796 * CALCULATE INTERACTIONS *
1797 **************************/
1799 r21 = _mm256_mul_pd(rsq21,rinv21);
1801 /* Calculate table index by multiplying r with table scale and truncate to integer */
1802 rt = _mm256_mul_pd(r21,vftabscale);
1803 vfitab = _mm256_cvttpd_epi32(rt);
1804 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1805 vfitab = _mm_slli_epi32(vfitab,2);
1807 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1808 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1809 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1810 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1811 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1812 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1813 Heps = _mm256_mul_pd(vfeps,H);
1814 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1815 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1816 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1820 /* Calculate temporary vectorial force */
1821 tx = _mm256_mul_pd(fscal,dx21);
1822 ty = _mm256_mul_pd(fscal,dy21);
1823 tz = _mm256_mul_pd(fscal,dz21);
1825 /* Update vectorial force */
1826 fix2 = _mm256_add_pd(fix2,tx);
1827 fiy2 = _mm256_add_pd(fiy2,ty);
1828 fiz2 = _mm256_add_pd(fiz2,tz);
1830 fjx1 = _mm256_add_pd(fjx1,tx);
1831 fjy1 = _mm256_add_pd(fjy1,ty);
1832 fjz1 = _mm256_add_pd(fjz1,tz);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 r22 = _mm256_mul_pd(rsq22,rinv22);
1840 /* Calculate table index by multiplying r with table scale and truncate to integer */
1841 rt = _mm256_mul_pd(r22,vftabscale);
1842 vfitab = _mm256_cvttpd_epi32(rt);
1843 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1844 vfitab = _mm_slli_epi32(vfitab,2);
1846 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1847 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1848 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1849 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1850 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1851 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1852 Heps = _mm256_mul_pd(vfeps,H);
1853 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1854 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1855 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1859 /* Calculate temporary vectorial force */
1860 tx = _mm256_mul_pd(fscal,dx22);
1861 ty = _mm256_mul_pd(fscal,dy22);
1862 tz = _mm256_mul_pd(fscal,dz22);
1864 /* Update vectorial force */
1865 fix2 = _mm256_add_pd(fix2,tx);
1866 fiy2 = _mm256_add_pd(fiy2,ty);
1867 fiz2 = _mm256_add_pd(fiz2,tz);
1869 fjx2 = _mm256_add_pd(fjx2,tx);
1870 fjy2 = _mm256_add_pd(fjy2,ty);
1871 fjz2 = _mm256_add_pd(fjz2,tz);
1873 fjptrA = f+j_coord_offsetA;
1874 fjptrB = f+j_coord_offsetB;
1875 fjptrC = f+j_coord_offsetC;
1876 fjptrD = f+j_coord_offsetD;
1878 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1879 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1881 /* Inner loop uses 359 flops */
1884 if(jidx<j_index_end)
1887 /* Get j neighbor index, and coordinate index */
1888 jnrlistA = jjnr[jidx];
1889 jnrlistB = jjnr[jidx+1];
1890 jnrlistC = jjnr[jidx+2];
1891 jnrlistD = jjnr[jidx+3];
1892 /* Sign of each element will be negative for non-real atoms.
1893 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1894 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1896 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1898 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1899 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1900 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1902 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1903 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1904 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1905 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1906 j_coord_offsetA = DIM*jnrA;
1907 j_coord_offsetB = DIM*jnrB;
1908 j_coord_offsetC = DIM*jnrC;
1909 j_coord_offsetD = DIM*jnrD;
1911 /* load j atom coordinates */
1912 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1913 x+j_coord_offsetC,x+j_coord_offsetD,
1914 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1916 /* Calculate displacement vector */
1917 dx00 = _mm256_sub_pd(ix0,jx0);
1918 dy00 = _mm256_sub_pd(iy0,jy0);
1919 dz00 = _mm256_sub_pd(iz0,jz0);
1920 dx01 = _mm256_sub_pd(ix0,jx1);
1921 dy01 = _mm256_sub_pd(iy0,jy1);
1922 dz01 = _mm256_sub_pd(iz0,jz1);
1923 dx02 = _mm256_sub_pd(ix0,jx2);
1924 dy02 = _mm256_sub_pd(iy0,jy2);
1925 dz02 = _mm256_sub_pd(iz0,jz2);
1926 dx10 = _mm256_sub_pd(ix1,jx0);
1927 dy10 = _mm256_sub_pd(iy1,jy0);
1928 dz10 = _mm256_sub_pd(iz1,jz0);
1929 dx11 = _mm256_sub_pd(ix1,jx1);
1930 dy11 = _mm256_sub_pd(iy1,jy1);
1931 dz11 = _mm256_sub_pd(iz1,jz1);
1932 dx12 = _mm256_sub_pd(ix1,jx2);
1933 dy12 = _mm256_sub_pd(iy1,jy2);
1934 dz12 = _mm256_sub_pd(iz1,jz2);
1935 dx20 = _mm256_sub_pd(ix2,jx0);
1936 dy20 = _mm256_sub_pd(iy2,jy0);
1937 dz20 = _mm256_sub_pd(iz2,jz0);
1938 dx21 = _mm256_sub_pd(ix2,jx1);
1939 dy21 = _mm256_sub_pd(iy2,jy1);
1940 dz21 = _mm256_sub_pd(iz2,jz1);
1941 dx22 = _mm256_sub_pd(ix2,jx2);
1942 dy22 = _mm256_sub_pd(iy2,jy2);
1943 dz22 = _mm256_sub_pd(iz2,jz2);
1945 /* Calculate squared distance and things based on it */
1946 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1947 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1948 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1949 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1950 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1951 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1952 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1953 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1954 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1956 rinv00 = avx256_invsqrt_d(rsq00);
1957 rinv01 = avx256_invsqrt_d(rsq01);
1958 rinv02 = avx256_invsqrt_d(rsq02);
1959 rinv10 = avx256_invsqrt_d(rsq10);
1960 rinv11 = avx256_invsqrt_d(rsq11);
1961 rinv12 = avx256_invsqrt_d(rsq12);
1962 rinv20 = avx256_invsqrt_d(rsq20);
1963 rinv21 = avx256_invsqrt_d(rsq21);
1964 rinv22 = avx256_invsqrt_d(rsq22);
1966 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1968 fjx0 = _mm256_setzero_pd();
1969 fjy0 = _mm256_setzero_pd();
1970 fjz0 = _mm256_setzero_pd();
1971 fjx1 = _mm256_setzero_pd();
1972 fjy1 = _mm256_setzero_pd();
1973 fjz1 = _mm256_setzero_pd();
1974 fjx2 = _mm256_setzero_pd();
1975 fjy2 = _mm256_setzero_pd();
1976 fjz2 = _mm256_setzero_pd();
1978 /**************************
1979 * CALCULATE INTERACTIONS *
1980 **************************/
1982 r00 = _mm256_mul_pd(rsq00,rinv00);
1983 r00 = _mm256_andnot_pd(dummy_mask,r00);
1985 /* Calculate table index by multiplying r with table scale and truncate to integer */
1986 rt = _mm256_mul_pd(r00,vftabscale);
1987 vfitab = _mm256_cvttpd_epi32(rt);
1988 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1989 vfitab = _mm_slli_epi32(vfitab,2);
1991 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1992 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1993 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1994 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1995 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1996 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1997 Heps = _mm256_mul_pd(vfeps,H);
1998 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1999 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2000 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
2002 /* LENNARD-JONES DISPERSION/REPULSION */
2004 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2005 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2007 fscal = _mm256_add_pd(felec,fvdw);
2009 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2011 /* Calculate temporary vectorial force */
2012 tx = _mm256_mul_pd(fscal,dx00);
2013 ty = _mm256_mul_pd(fscal,dy00);
2014 tz = _mm256_mul_pd(fscal,dz00);
2016 /* Update vectorial force */
2017 fix0 = _mm256_add_pd(fix0,tx);
2018 fiy0 = _mm256_add_pd(fiy0,ty);
2019 fiz0 = _mm256_add_pd(fiz0,tz);
2021 fjx0 = _mm256_add_pd(fjx0,tx);
2022 fjy0 = _mm256_add_pd(fjy0,ty);
2023 fjz0 = _mm256_add_pd(fjz0,tz);
2025 /**************************
2026 * CALCULATE INTERACTIONS *
2027 **************************/
2029 r01 = _mm256_mul_pd(rsq01,rinv01);
2030 r01 = _mm256_andnot_pd(dummy_mask,r01);
2032 /* Calculate table index by multiplying r with table scale and truncate to integer */
2033 rt = _mm256_mul_pd(r01,vftabscale);
2034 vfitab = _mm256_cvttpd_epi32(rt);
2035 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2036 vfitab = _mm_slli_epi32(vfitab,2);
2038 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2039 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2040 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2041 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2042 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2043 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2044 Heps = _mm256_mul_pd(vfeps,H);
2045 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2046 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2047 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
2051 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2053 /* Calculate temporary vectorial force */
2054 tx = _mm256_mul_pd(fscal,dx01);
2055 ty = _mm256_mul_pd(fscal,dy01);
2056 tz = _mm256_mul_pd(fscal,dz01);
2058 /* Update vectorial force */
2059 fix0 = _mm256_add_pd(fix0,tx);
2060 fiy0 = _mm256_add_pd(fiy0,ty);
2061 fiz0 = _mm256_add_pd(fiz0,tz);
2063 fjx1 = _mm256_add_pd(fjx1,tx);
2064 fjy1 = _mm256_add_pd(fjy1,ty);
2065 fjz1 = _mm256_add_pd(fjz1,tz);
2067 /**************************
2068 * CALCULATE INTERACTIONS *
2069 **************************/
2071 r02 = _mm256_mul_pd(rsq02,rinv02);
2072 r02 = _mm256_andnot_pd(dummy_mask,r02);
2074 /* Calculate table index by multiplying r with table scale and truncate to integer */
2075 rt = _mm256_mul_pd(r02,vftabscale);
2076 vfitab = _mm256_cvttpd_epi32(rt);
2077 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2078 vfitab = _mm_slli_epi32(vfitab,2);
2080 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2081 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2082 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2083 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2084 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2085 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2086 Heps = _mm256_mul_pd(vfeps,H);
2087 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2088 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2089 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
2093 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2095 /* Calculate temporary vectorial force */
2096 tx = _mm256_mul_pd(fscal,dx02);
2097 ty = _mm256_mul_pd(fscal,dy02);
2098 tz = _mm256_mul_pd(fscal,dz02);
2100 /* Update vectorial force */
2101 fix0 = _mm256_add_pd(fix0,tx);
2102 fiy0 = _mm256_add_pd(fiy0,ty);
2103 fiz0 = _mm256_add_pd(fiz0,tz);
2105 fjx2 = _mm256_add_pd(fjx2,tx);
2106 fjy2 = _mm256_add_pd(fjy2,ty);
2107 fjz2 = _mm256_add_pd(fjz2,tz);
2109 /**************************
2110 * CALCULATE INTERACTIONS *
2111 **************************/
2113 r10 = _mm256_mul_pd(rsq10,rinv10);
2114 r10 = _mm256_andnot_pd(dummy_mask,r10);
2116 /* Calculate table index by multiplying r with table scale and truncate to integer */
2117 rt = _mm256_mul_pd(r10,vftabscale);
2118 vfitab = _mm256_cvttpd_epi32(rt);
2119 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2120 vfitab = _mm_slli_epi32(vfitab,2);
2122 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2123 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2124 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2125 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2126 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2127 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2128 Heps = _mm256_mul_pd(vfeps,H);
2129 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2130 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2131 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
2135 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2137 /* Calculate temporary vectorial force */
2138 tx = _mm256_mul_pd(fscal,dx10);
2139 ty = _mm256_mul_pd(fscal,dy10);
2140 tz = _mm256_mul_pd(fscal,dz10);
2142 /* Update vectorial force */
2143 fix1 = _mm256_add_pd(fix1,tx);
2144 fiy1 = _mm256_add_pd(fiy1,ty);
2145 fiz1 = _mm256_add_pd(fiz1,tz);
2147 fjx0 = _mm256_add_pd(fjx0,tx);
2148 fjy0 = _mm256_add_pd(fjy0,ty);
2149 fjz0 = _mm256_add_pd(fjz0,tz);
2151 /**************************
2152 * CALCULATE INTERACTIONS *
2153 **************************/
2155 r11 = _mm256_mul_pd(rsq11,rinv11);
2156 r11 = _mm256_andnot_pd(dummy_mask,r11);
2158 /* Calculate table index by multiplying r with table scale and truncate to integer */
2159 rt = _mm256_mul_pd(r11,vftabscale);
2160 vfitab = _mm256_cvttpd_epi32(rt);
2161 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2162 vfitab = _mm_slli_epi32(vfitab,2);
2164 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2165 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2166 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2167 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2168 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2169 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2170 Heps = _mm256_mul_pd(vfeps,H);
2171 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2172 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2173 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2177 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2179 /* Calculate temporary vectorial force */
2180 tx = _mm256_mul_pd(fscal,dx11);
2181 ty = _mm256_mul_pd(fscal,dy11);
2182 tz = _mm256_mul_pd(fscal,dz11);
2184 /* Update vectorial force */
2185 fix1 = _mm256_add_pd(fix1,tx);
2186 fiy1 = _mm256_add_pd(fiy1,ty);
2187 fiz1 = _mm256_add_pd(fiz1,tz);
2189 fjx1 = _mm256_add_pd(fjx1,tx);
2190 fjy1 = _mm256_add_pd(fjy1,ty);
2191 fjz1 = _mm256_add_pd(fjz1,tz);
2193 /**************************
2194 * CALCULATE INTERACTIONS *
2195 **************************/
2197 r12 = _mm256_mul_pd(rsq12,rinv12);
2198 r12 = _mm256_andnot_pd(dummy_mask,r12);
2200 /* Calculate table index by multiplying r with table scale and truncate to integer */
2201 rt = _mm256_mul_pd(r12,vftabscale);
2202 vfitab = _mm256_cvttpd_epi32(rt);
2203 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2204 vfitab = _mm_slli_epi32(vfitab,2);
2206 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2207 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2208 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2209 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2210 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2211 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2212 Heps = _mm256_mul_pd(vfeps,H);
2213 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2214 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2215 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2219 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2221 /* Calculate temporary vectorial force */
2222 tx = _mm256_mul_pd(fscal,dx12);
2223 ty = _mm256_mul_pd(fscal,dy12);
2224 tz = _mm256_mul_pd(fscal,dz12);
2226 /* Update vectorial force */
2227 fix1 = _mm256_add_pd(fix1,tx);
2228 fiy1 = _mm256_add_pd(fiy1,ty);
2229 fiz1 = _mm256_add_pd(fiz1,tz);
2231 fjx2 = _mm256_add_pd(fjx2,tx);
2232 fjy2 = _mm256_add_pd(fjy2,ty);
2233 fjz2 = _mm256_add_pd(fjz2,tz);
2235 /**************************
2236 * CALCULATE INTERACTIONS *
2237 **************************/
2239 r20 = _mm256_mul_pd(rsq20,rinv20);
2240 r20 = _mm256_andnot_pd(dummy_mask,r20);
2242 /* Calculate table index by multiplying r with table scale and truncate to integer */
2243 rt = _mm256_mul_pd(r20,vftabscale);
2244 vfitab = _mm256_cvttpd_epi32(rt);
2245 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2246 vfitab = _mm_slli_epi32(vfitab,2);
2248 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2249 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2250 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2251 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2252 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2253 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2254 Heps = _mm256_mul_pd(vfeps,H);
2255 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2256 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2257 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
2261 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2263 /* Calculate temporary vectorial force */
2264 tx = _mm256_mul_pd(fscal,dx20);
2265 ty = _mm256_mul_pd(fscal,dy20);
2266 tz = _mm256_mul_pd(fscal,dz20);
2268 /* Update vectorial force */
2269 fix2 = _mm256_add_pd(fix2,tx);
2270 fiy2 = _mm256_add_pd(fiy2,ty);
2271 fiz2 = _mm256_add_pd(fiz2,tz);
2273 fjx0 = _mm256_add_pd(fjx0,tx);
2274 fjy0 = _mm256_add_pd(fjy0,ty);
2275 fjz0 = _mm256_add_pd(fjz0,tz);
2277 /**************************
2278 * CALCULATE INTERACTIONS *
2279 **************************/
2281 r21 = _mm256_mul_pd(rsq21,rinv21);
2282 r21 = _mm256_andnot_pd(dummy_mask,r21);
2284 /* Calculate table index by multiplying r with table scale and truncate to integer */
2285 rt = _mm256_mul_pd(r21,vftabscale);
2286 vfitab = _mm256_cvttpd_epi32(rt);
2287 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2288 vfitab = _mm_slli_epi32(vfitab,2);
2290 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2291 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2292 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2293 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2294 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2295 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2296 Heps = _mm256_mul_pd(vfeps,H);
2297 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2298 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2299 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2303 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2305 /* Calculate temporary vectorial force */
2306 tx = _mm256_mul_pd(fscal,dx21);
2307 ty = _mm256_mul_pd(fscal,dy21);
2308 tz = _mm256_mul_pd(fscal,dz21);
2310 /* Update vectorial force */
2311 fix2 = _mm256_add_pd(fix2,tx);
2312 fiy2 = _mm256_add_pd(fiy2,ty);
2313 fiz2 = _mm256_add_pd(fiz2,tz);
2315 fjx1 = _mm256_add_pd(fjx1,tx);
2316 fjy1 = _mm256_add_pd(fjy1,ty);
2317 fjz1 = _mm256_add_pd(fjz1,tz);
2319 /**************************
2320 * CALCULATE INTERACTIONS *
2321 **************************/
2323 r22 = _mm256_mul_pd(rsq22,rinv22);
2324 r22 = _mm256_andnot_pd(dummy_mask,r22);
2326 /* Calculate table index by multiplying r with table scale and truncate to integer */
2327 rt = _mm256_mul_pd(r22,vftabscale);
2328 vfitab = _mm256_cvttpd_epi32(rt);
2329 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2330 vfitab = _mm_slli_epi32(vfitab,2);
2332 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2333 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2334 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2335 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2336 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2337 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2338 Heps = _mm256_mul_pd(vfeps,H);
2339 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2340 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2341 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2345 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2347 /* Calculate temporary vectorial force */
2348 tx = _mm256_mul_pd(fscal,dx22);
2349 ty = _mm256_mul_pd(fscal,dy22);
2350 tz = _mm256_mul_pd(fscal,dz22);
2352 /* Update vectorial force */
2353 fix2 = _mm256_add_pd(fix2,tx);
2354 fiy2 = _mm256_add_pd(fiy2,ty);
2355 fiz2 = _mm256_add_pd(fiz2,tz);
2357 fjx2 = _mm256_add_pd(fjx2,tx);
2358 fjy2 = _mm256_add_pd(fjy2,ty);
2359 fjz2 = _mm256_add_pd(fjz2,tz);
2361 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2362 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2363 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2364 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2366 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2367 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2369 /* Inner loop uses 368 flops */
2372 /* End of innermost loop */
2374 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2375 f+i_coord_offset,fshift+i_shift_offset);
2377 /* Increment number of inner iterations */
2378 inneriter += j_index_end - j_index_start;
2380 /* Outer loop uses 18 flops */
2383 /* Increment number of outer iterations */
2386 /* Update outer/inner flops */
2388 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);