2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
112 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
114 __m128i ifour = _mm_set1_epi32(4);
115 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
117 __m256d dummy_mask,cutoff_mask;
118 __m128 tmpmask0,tmpmask1;
119 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
120 __m256d one = _mm256_set1_pd(1.0);
121 __m256d two = _mm256_set1_pd(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm256_set1_pd(fr->epsfac);
134 charge = mdatoms->chargeA;
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 vftab = kernel_data->table_elec->data;
140 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
142 /* Setup water-specific parameters */
143 inr = nlist->iinr[0];
144 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
145 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
146 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
147 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
149 jq0 = _mm256_set1_pd(charge[inr+0]);
150 jq1 = _mm256_set1_pd(charge[inr+1]);
151 jq2 = _mm256_set1_pd(charge[inr+2]);
152 vdwjidx0A = 2*vdwtype[inr+0];
153 qq00 = _mm256_mul_pd(iq0,jq0);
154 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
155 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
156 qq01 = _mm256_mul_pd(iq0,jq1);
157 qq02 = _mm256_mul_pd(iq0,jq2);
158 qq10 = _mm256_mul_pd(iq1,jq0);
159 qq11 = _mm256_mul_pd(iq1,jq1);
160 qq12 = _mm256_mul_pd(iq1,jq2);
161 qq20 = _mm256_mul_pd(iq2,jq0);
162 qq21 = _mm256_mul_pd(iq2,jq1);
163 qq22 = _mm256_mul_pd(iq2,jq2);
165 /* Avoid stupid compiler warnings */
166 jnrA = jnrB = jnrC = jnrD = 0;
175 for(iidx=0;iidx<4*DIM;iidx++)
180 /* Start outer loop over neighborlists */
181 for(iidx=0; iidx<nri; iidx++)
183 /* Load shift vector for this list */
184 i_shift_offset = DIM*shiftidx[iidx];
186 /* Load limits for loop over neighbors */
187 j_index_start = jindex[iidx];
188 j_index_end = jindex[iidx+1];
190 /* Get outer coordinate index */
192 i_coord_offset = DIM*inr;
194 /* Load i particle coords and add shift vector */
195 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
196 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
198 fix0 = _mm256_setzero_pd();
199 fiy0 = _mm256_setzero_pd();
200 fiz0 = _mm256_setzero_pd();
201 fix1 = _mm256_setzero_pd();
202 fiy1 = _mm256_setzero_pd();
203 fiz1 = _mm256_setzero_pd();
204 fix2 = _mm256_setzero_pd();
205 fiy2 = _mm256_setzero_pd();
206 fiz2 = _mm256_setzero_pd();
208 /* Reset potential sums */
209 velecsum = _mm256_setzero_pd();
210 vvdwsum = _mm256_setzero_pd();
212 /* Start inner kernel loop */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
216 /* Get j neighbor index, and coordinate index */
221 j_coord_offsetA = DIM*jnrA;
222 j_coord_offsetB = DIM*jnrB;
223 j_coord_offsetC = DIM*jnrC;
224 j_coord_offsetD = DIM*jnrD;
226 /* load j atom coordinates */
227 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
228 x+j_coord_offsetC,x+j_coord_offsetD,
229 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
231 /* Calculate displacement vector */
232 dx00 = _mm256_sub_pd(ix0,jx0);
233 dy00 = _mm256_sub_pd(iy0,jy0);
234 dz00 = _mm256_sub_pd(iz0,jz0);
235 dx01 = _mm256_sub_pd(ix0,jx1);
236 dy01 = _mm256_sub_pd(iy0,jy1);
237 dz01 = _mm256_sub_pd(iz0,jz1);
238 dx02 = _mm256_sub_pd(ix0,jx2);
239 dy02 = _mm256_sub_pd(iy0,jy2);
240 dz02 = _mm256_sub_pd(iz0,jz2);
241 dx10 = _mm256_sub_pd(ix1,jx0);
242 dy10 = _mm256_sub_pd(iy1,jy0);
243 dz10 = _mm256_sub_pd(iz1,jz0);
244 dx11 = _mm256_sub_pd(ix1,jx1);
245 dy11 = _mm256_sub_pd(iy1,jy1);
246 dz11 = _mm256_sub_pd(iz1,jz1);
247 dx12 = _mm256_sub_pd(ix1,jx2);
248 dy12 = _mm256_sub_pd(iy1,jy2);
249 dz12 = _mm256_sub_pd(iz1,jz2);
250 dx20 = _mm256_sub_pd(ix2,jx0);
251 dy20 = _mm256_sub_pd(iy2,jy0);
252 dz20 = _mm256_sub_pd(iz2,jz0);
253 dx21 = _mm256_sub_pd(ix2,jx1);
254 dy21 = _mm256_sub_pd(iy2,jy1);
255 dz21 = _mm256_sub_pd(iz2,jz1);
256 dx22 = _mm256_sub_pd(ix2,jx2);
257 dy22 = _mm256_sub_pd(iy2,jy2);
258 dz22 = _mm256_sub_pd(iz2,jz2);
260 /* Calculate squared distance and things based on it */
261 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
262 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
263 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
264 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
265 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
266 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
267 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
268 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
269 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
271 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
272 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
273 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
274 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
275 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
276 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
277 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
278 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
279 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
281 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
283 fjx0 = _mm256_setzero_pd();
284 fjy0 = _mm256_setzero_pd();
285 fjz0 = _mm256_setzero_pd();
286 fjx1 = _mm256_setzero_pd();
287 fjy1 = _mm256_setzero_pd();
288 fjz1 = _mm256_setzero_pd();
289 fjx2 = _mm256_setzero_pd();
290 fjy2 = _mm256_setzero_pd();
291 fjz2 = _mm256_setzero_pd();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 r00 = _mm256_mul_pd(rsq00,rinv00);
299 /* Calculate table index by multiplying r with table scale and truncate to integer */
300 rt = _mm256_mul_pd(r00,vftabscale);
301 vfitab = _mm256_cvttpd_epi32(rt);
302 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
303 vfitab = _mm_slli_epi32(vfitab,2);
305 /* CUBIC SPLINE TABLE ELECTROSTATICS */
306 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
307 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
308 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
309 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
310 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
311 Heps = _mm256_mul_pd(vfeps,H);
312 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
313 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
314 velec = _mm256_mul_pd(qq00,VV);
315 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
316 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
318 /* LENNARD-JONES DISPERSION/REPULSION */
320 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
321 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
322 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
323 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
324 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
326 /* Update potential sum for this i atom from the interaction with this j atom. */
327 velecsum = _mm256_add_pd(velecsum,velec);
328 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
330 fscal = _mm256_add_pd(felec,fvdw);
332 /* Calculate temporary vectorial force */
333 tx = _mm256_mul_pd(fscal,dx00);
334 ty = _mm256_mul_pd(fscal,dy00);
335 tz = _mm256_mul_pd(fscal,dz00);
337 /* Update vectorial force */
338 fix0 = _mm256_add_pd(fix0,tx);
339 fiy0 = _mm256_add_pd(fiy0,ty);
340 fiz0 = _mm256_add_pd(fiz0,tz);
342 fjx0 = _mm256_add_pd(fjx0,tx);
343 fjy0 = _mm256_add_pd(fjy0,ty);
344 fjz0 = _mm256_add_pd(fjz0,tz);
346 /**************************
347 * CALCULATE INTERACTIONS *
348 **************************/
350 r01 = _mm256_mul_pd(rsq01,rinv01);
352 /* Calculate table index by multiplying r with table scale and truncate to integer */
353 rt = _mm256_mul_pd(r01,vftabscale);
354 vfitab = _mm256_cvttpd_epi32(rt);
355 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
356 vfitab = _mm_slli_epi32(vfitab,2);
358 /* CUBIC SPLINE TABLE ELECTROSTATICS */
359 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
360 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
361 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
362 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
363 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
364 Heps = _mm256_mul_pd(vfeps,H);
365 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
366 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
367 velec = _mm256_mul_pd(qq01,VV);
368 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
369 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
371 /* Update potential sum for this i atom from the interaction with this j atom. */
372 velecsum = _mm256_add_pd(velecsum,velec);
376 /* Calculate temporary vectorial force */
377 tx = _mm256_mul_pd(fscal,dx01);
378 ty = _mm256_mul_pd(fscal,dy01);
379 tz = _mm256_mul_pd(fscal,dz01);
381 /* Update vectorial force */
382 fix0 = _mm256_add_pd(fix0,tx);
383 fiy0 = _mm256_add_pd(fiy0,ty);
384 fiz0 = _mm256_add_pd(fiz0,tz);
386 fjx1 = _mm256_add_pd(fjx1,tx);
387 fjy1 = _mm256_add_pd(fjy1,ty);
388 fjz1 = _mm256_add_pd(fjz1,tz);
390 /**************************
391 * CALCULATE INTERACTIONS *
392 **************************/
394 r02 = _mm256_mul_pd(rsq02,rinv02);
396 /* Calculate table index by multiplying r with table scale and truncate to integer */
397 rt = _mm256_mul_pd(r02,vftabscale);
398 vfitab = _mm256_cvttpd_epi32(rt);
399 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
400 vfitab = _mm_slli_epi32(vfitab,2);
402 /* CUBIC SPLINE TABLE ELECTROSTATICS */
403 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
404 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
405 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
406 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
407 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
408 Heps = _mm256_mul_pd(vfeps,H);
409 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
410 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
411 velec = _mm256_mul_pd(qq02,VV);
412 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
413 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
415 /* Update potential sum for this i atom from the interaction with this j atom. */
416 velecsum = _mm256_add_pd(velecsum,velec);
420 /* Calculate temporary vectorial force */
421 tx = _mm256_mul_pd(fscal,dx02);
422 ty = _mm256_mul_pd(fscal,dy02);
423 tz = _mm256_mul_pd(fscal,dz02);
425 /* Update vectorial force */
426 fix0 = _mm256_add_pd(fix0,tx);
427 fiy0 = _mm256_add_pd(fiy0,ty);
428 fiz0 = _mm256_add_pd(fiz0,tz);
430 fjx2 = _mm256_add_pd(fjx2,tx);
431 fjy2 = _mm256_add_pd(fjy2,ty);
432 fjz2 = _mm256_add_pd(fjz2,tz);
434 /**************************
435 * CALCULATE INTERACTIONS *
436 **************************/
438 r10 = _mm256_mul_pd(rsq10,rinv10);
440 /* Calculate table index by multiplying r with table scale and truncate to integer */
441 rt = _mm256_mul_pd(r10,vftabscale);
442 vfitab = _mm256_cvttpd_epi32(rt);
443 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
444 vfitab = _mm_slli_epi32(vfitab,2);
446 /* CUBIC SPLINE TABLE ELECTROSTATICS */
447 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
448 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
449 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
450 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
451 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
452 Heps = _mm256_mul_pd(vfeps,H);
453 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
454 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
455 velec = _mm256_mul_pd(qq10,VV);
456 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
457 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
459 /* Update potential sum for this i atom from the interaction with this j atom. */
460 velecsum = _mm256_add_pd(velecsum,velec);
464 /* Calculate temporary vectorial force */
465 tx = _mm256_mul_pd(fscal,dx10);
466 ty = _mm256_mul_pd(fscal,dy10);
467 tz = _mm256_mul_pd(fscal,dz10);
469 /* Update vectorial force */
470 fix1 = _mm256_add_pd(fix1,tx);
471 fiy1 = _mm256_add_pd(fiy1,ty);
472 fiz1 = _mm256_add_pd(fiz1,tz);
474 fjx0 = _mm256_add_pd(fjx0,tx);
475 fjy0 = _mm256_add_pd(fjy0,ty);
476 fjz0 = _mm256_add_pd(fjz0,tz);
478 /**************************
479 * CALCULATE INTERACTIONS *
480 **************************/
482 r11 = _mm256_mul_pd(rsq11,rinv11);
484 /* Calculate table index by multiplying r with table scale and truncate to integer */
485 rt = _mm256_mul_pd(r11,vftabscale);
486 vfitab = _mm256_cvttpd_epi32(rt);
487 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
488 vfitab = _mm_slli_epi32(vfitab,2);
490 /* CUBIC SPLINE TABLE ELECTROSTATICS */
491 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
492 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
493 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
494 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
495 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
496 Heps = _mm256_mul_pd(vfeps,H);
497 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
498 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
499 velec = _mm256_mul_pd(qq11,VV);
500 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
501 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
503 /* Update potential sum for this i atom from the interaction with this j atom. */
504 velecsum = _mm256_add_pd(velecsum,velec);
508 /* Calculate temporary vectorial force */
509 tx = _mm256_mul_pd(fscal,dx11);
510 ty = _mm256_mul_pd(fscal,dy11);
511 tz = _mm256_mul_pd(fscal,dz11);
513 /* Update vectorial force */
514 fix1 = _mm256_add_pd(fix1,tx);
515 fiy1 = _mm256_add_pd(fiy1,ty);
516 fiz1 = _mm256_add_pd(fiz1,tz);
518 fjx1 = _mm256_add_pd(fjx1,tx);
519 fjy1 = _mm256_add_pd(fjy1,ty);
520 fjz1 = _mm256_add_pd(fjz1,tz);
522 /**************************
523 * CALCULATE INTERACTIONS *
524 **************************/
526 r12 = _mm256_mul_pd(rsq12,rinv12);
528 /* Calculate table index by multiplying r with table scale and truncate to integer */
529 rt = _mm256_mul_pd(r12,vftabscale);
530 vfitab = _mm256_cvttpd_epi32(rt);
531 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
532 vfitab = _mm_slli_epi32(vfitab,2);
534 /* CUBIC SPLINE TABLE ELECTROSTATICS */
535 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
536 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
537 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
538 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
539 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
540 Heps = _mm256_mul_pd(vfeps,H);
541 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
542 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
543 velec = _mm256_mul_pd(qq12,VV);
544 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
545 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
547 /* Update potential sum for this i atom from the interaction with this j atom. */
548 velecsum = _mm256_add_pd(velecsum,velec);
552 /* Calculate temporary vectorial force */
553 tx = _mm256_mul_pd(fscal,dx12);
554 ty = _mm256_mul_pd(fscal,dy12);
555 tz = _mm256_mul_pd(fscal,dz12);
557 /* Update vectorial force */
558 fix1 = _mm256_add_pd(fix1,tx);
559 fiy1 = _mm256_add_pd(fiy1,ty);
560 fiz1 = _mm256_add_pd(fiz1,tz);
562 fjx2 = _mm256_add_pd(fjx2,tx);
563 fjy2 = _mm256_add_pd(fjy2,ty);
564 fjz2 = _mm256_add_pd(fjz2,tz);
566 /**************************
567 * CALCULATE INTERACTIONS *
568 **************************/
570 r20 = _mm256_mul_pd(rsq20,rinv20);
572 /* Calculate table index by multiplying r with table scale and truncate to integer */
573 rt = _mm256_mul_pd(r20,vftabscale);
574 vfitab = _mm256_cvttpd_epi32(rt);
575 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
576 vfitab = _mm_slli_epi32(vfitab,2);
578 /* CUBIC SPLINE TABLE ELECTROSTATICS */
579 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
580 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
581 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
582 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
583 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
584 Heps = _mm256_mul_pd(vfeps,H);
585 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
586 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
587 velec = _mm256_mul_pd(qq20,VV);
588 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
589 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velecsum = _mm256_add_pd(velecsum,velec);
596 /* Calculate temporary vectorial force */
597 tx = _mm256_mul_pd(fscal,dx20);
598 ty = _mm256_mul_pd(fscal,dy20);
599 tz = _mm256_mul_pd(fscal,dz20);
601 /* Update vectorial force */
602 fix2 = _mm256_add_pd(fix2,tx);
603 fiy2 = _mm256_add_pd(fiy2,ty);
604 fiz2 = _mm256_add_pd(fiz2,tz);
606 fjx0 = _mm256_add_pd(fjx0,tx);
607 fjy0 = _mm256_add_pd(fjy0,ty);
608 fjz0 = _mm256_add_pd(fjz0,tz);
610 /**************************
611 * CALCULATE INTERACTIONS *
612 **************************/
614 r21 = _mm256_mul_pd(rsq21,rinv21);
616 /* Calculate table index by multiplying r with table scale and truncate to integer */
617 rt = _mm256_mul_pd(r21,vftabscale);
618 vfitab = _mm256_cvttpd_epi32(rt);
619 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
620 vfitab = _mm_slli_epi32(vfitab,2);
622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
623 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
624 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
625 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
626 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
627 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
628 Heps = _mm256_mul_pd(vfeps,H);
629 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
630 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
631 velec = _mm256_mul_pd(qq21,VV);
632 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
633 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
635 /* Update potential sum for this i atom from the interaction with this j atom. */
636 velecsum = _mm256_add_pd(velecsum,velec);
640 /* Calculate temporary vectorial force */
641 tx = _mm256_mul_pd(fscal,dx21);
642 ty = _mm256_mul_pd(fscal,dy21);
643 tz = _mm256_mul_pd(fscal,dz21);
645 /* Update vectorial force */
646 fix2 = _mm256_add_pd(fix2,tx);
647 fiy2 = _mm256_add_pd(fiy2,ty);
648 fiz2 = _mm256_add_pd(fiz2,tz);
650 fjx1 = _mm256_add_pd(fjx1,tx);
651 fjy1 = _mm256_add_pd(fjy1,ty);
652 fjz1 = _mm256_add_pd(fjz1,tz);
654 /**************************
655 * CALCULATE INTERACTIONS *
656 **************************/
658 r22 = _mm256_mul_pd(rsq22,rinv22);
660 /* Calculate table index by multiplying r with table scale and truncate to integer */
661 rt = _mm256_mul_pd(r22,vftabscale);
662 vfitab = _mm256_cvttpd_epi32(rt);
663 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
664 vfitab = _mm_slli_epi32(vfitab,2);
666 /* CUBIC SPLINE TABLE ELECTROSTATICS */
667 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
668 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
669 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
670 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
671 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
672 Heps = _mm256_mul_pd(vfeps,H);
673 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
674 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
675 velec = _mm256_mul_pd(qq22,VV);
676 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
677 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
679 /* Update potential sum for this i atom from the interaction with this j atom. */
680 velecsum = _mm256_add_pd(velecsum,velec);
684 /* Calculate temporary vectorial force */
685 tx = _mm256_mul_pd(fscal,dx22);
686 ty = _mm256_mul_pd(fscal,dy22);
687 tz = _mm256_mul_pd(fscal,dz22);
689 /* Update vectorial force */
690 fix2 = _mm256_add_pd(fix2,tx);
691 fiy2 = _mm256_add_pd(fiy2,ty);
692 fiz2 = _mm256_add_pd(fiz2,tz);
694 fjx2 = _mm256_add_pd(fjx2,tx);
695 fjy2 = _mm256_add_pd(fjy2,ty);
696 fjz2 = _mm256_add_pd(fjz2,tz);
698 fjptrA = f+j_coord_offsetA;
699 fjptrB = f+j_coord_offsetB;
700 fjptrC = f+j_coord_offsetC;
701 fjptrD = f+j_coord_offsetD;
703 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
704 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
706 /* Inner loop uses 400 flops */
712 /* Get j neighbor index, and coordinate index */
713 jnrlistA = jjnr[jidx];
714 jnrlistB = jjnr[jidx+1];
715 jnrlistC = jjnr[jidx+2];
716 jnrlistD = jjnr[jidx+3];
717 /* Sign of each element will be negative for non-real atoms.
718 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
719 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
721 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
723 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
724 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
725 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
727 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
728 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
729 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
730 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
731 j_coord_offsetA = DIM*jnrA;
732 j_coord_offsetB = DIM*jnrB;
733 j_coord_offsetC = DIM*jnrC;
734 j_coord_offsetD = DIM*jnrD;
736 /* load j atom coordinates */
737 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
738 x+j_coord_offsetC,x+j_coord_offsetD,
739 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
741 /* Calculate displacement vector */
742 dx00 = _mm256_sub_pd(ix0,jx0);
743 dy00 = _mm256_sub_pd(iy0,jy0);
744 dz00 = _mm256_sub_pd(iz0,jz0);
745 dx01 = _mm256_sub_pd(ix0,jx1);
746 dy01 = _mm256_sub_pd(iy0,jy1);
747 dz01 = _mm256_sub_pd(iz0,jz1);
748 dx02 = _mm256_sub_pd(ix0,jx2);
749 dy02 = _mm256_sub_pd(iy0,jy2);
750 dz02 = _mm256_sub_pd(iz0,jz2);
751 dx10 = _mm256_sub_pd(ix1,jx0);
752 dy10 = _mm256_sub_pd(iy1,jy0);
753 dz10 = _mm256_sub_pd(iz1,jz0);
754 dx11 = _mm256_sub_pd(ix1,jx1);
755 dy11 = _mm256_sub_pd(iy1,jy1);
756 dz11 = _mm256_sub_pd(iz1,jz1);
757 dx12 = _mm256_sub_pd(ix1,jx2);
758 dy12 = _mm256_sub_pd(iy1,jy2);
759 dz12 = _mm256_sub_pd(iz1,jz2);
760 dx20 = _mm256_sub_pd(ix2,jx0);
761 dy20 = _mm256_sub_pd(iy2,jy0);
762 dz20 = _mm256_sub_pd(iz2,jz0);
763 dx21 = _mm256_sub_pd(ix2,jx1);
764 dy21 = _mm256_sub_pd(iy2,jy1);
765 dz21 = _mm256_sub_pd(iz2,jz1);
766 dx22 = _mm256_sub_pd(ix2,jx2);
767 dy22 = _mm256_sub_pd(iy2,jy2);
768 dz22 = _mm256_sub_pd(iz2,jz2);
770 /* Calculate squared distance and things based on it */
771 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
772 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
773 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
774 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
775 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
776 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
777 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
778 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
779 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
781 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
782 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
783 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
784 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
785 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
786 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
787 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
788 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
789 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
791 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
793 fjx0 = _mm256_setzero_pd();
794 fjy0 = _mm256_setzero_pd();
795 fjz0 = _mm256_setzero_pd();
796 fjx1 = _mm256_setzero_pd();
797 fjy1 = _mm256_setzero_pd();
798 fjz1 = _mm256_setzero_pd();
799 fjx2 = _mm256_setzero_pd();
800 fjy2 = _mm256_setzero_pd();
801 fjz2 = _mm256_setzero_pd();
803 /**************************
804 * CALCULATE INTERACTIONS *
805 **************************/
807 r00 = _mm256_mul_pd(rsq00,rinv00);
808 r00 = _mm256_andnot_pd(dummy_mask,r00);
810 /* Calculate table index by multiplying r with table scale and truncate to integer */
811 rt = _mm256_mul_pd(r00,vftabscale);
812 vfitab = _mm256_cvttpd_epi32(rt);
813 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
814 vfitab = _mm_slli_epi32(vfitab,2);
816 /* CUBIC SPLINE TABLE ELECTROSTATICS */
817 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
818 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
819 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
820 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
821 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
822 Heps = _mm256_mul_pd(vfeps,H);
823 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
824 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
825 velec = _mm256_mul_pd(qq00,VV);
826 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
827 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
829 /* LENNARD-JONES DISPERSION/REPULSION */
831 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
832 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
833 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
834 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
835 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 velec = _mm256_andnot_pd(dummy_mask,velec);
839 velecsum = _mm256_add_pd(velecsum,velec);
840 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
841 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
843 fscal = _mm256_add_pd(felec,fvdw);
845 fscal = _mm256_andnot_pd(dummy_mask,fscal);
847 /* Calculate temporary vectorial force */
848 tx = _mm256_mul_pd(fscal,dx00);
849 ty = _mm256_mul_pd(fscal,dy00);
850 tz = _mm256_mul_pd(fscal,dz00);
852 /* Update vectorial force */
853 fix0 = _mm256_add_pd(fix0,tx);
854 fiy0 = _mm256_add_pd(fiy0,ty);
855 fiz0 = _mm256_add_pd(fiz0,tz);
857 fjx0 = _mm256_add_pd(fjx0,tx);
858 fjy0 = _mm256_add_pd(fjy0,ty);
859 fjz0 = _mm256_add_pd(fjz0,tz);
861 /**************************
862 * CALCULATE INTERACTIONS *
863 **************************/
865 r01 = _mm256_mul_pd(rsq01,rinv01);
866 r01 = _mm256_andnot_pd(dummy_mask,r01);
868 /* Calculate table index by multiplying r with table scale and truncate to integer */
869 rt = _mm256_mul_pd(r01,vftabscale);
870 vfitab = _mm256_cvttpd_epi32(rt);
871 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
872 vfitab = _mm_slli_epi32(vfitab,2);
874 /* CUBIC SPLINE TABLE ELECTROSTATICS */
875 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
876 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
877 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
878 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
879 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
880 Heps = _mm256_mul_pd(vfeps,H);
881 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
882 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
883 velec = _mm256_mul_pd(qq01,VV);
884 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
885 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
887 /* Update potential sum for this i atom from the interaction with this j atom. */
888 velec = _mm256_andnot_pd(dummy_mask,velec);
889 velecsum = _mm256_add_pd(velecsum,velec);
893 fscal = _mm256_andnot_pd(dummy_mask,fscal);
895 /* Calculate temporary vectorial force */
896 tx = _mm256_mul_pd(fscal,dx01);
897 ty = _mm256_mul_pd(fscal,dy01);
898 tz = _mm256_mul_pd(fscal,dz01);
900 /* Update vectorial force */
901 fix0 = _mm256_add_pd(fix0,tx);
902 fiy0 = _mm256_add_pd(fiy0,ty);
903 fiz0 = _mm256_add_pd(fiz0,tz);
905 fjx1 = _mm256_add_pd(fjx1,tx);
906 fjy1 = _mm256_add_pd(fjy1,ty);
907 fjz1 = _mm256_add_pd(fjz1,tz);
909 /**************************
910 * CALCULATE INTERACTIONS *
911 **************************/
913 r02 = _mm256_mul_pd(rsq02,rinv02);
914 r02 = _mm256_andnot_pd(dummy_mask,r02);
916 /* Calculate table index by multiplying r with table scale and truncate to integer */
917 rt = _mm256_mul_pd(r02,vftabscale);
918 vfitab = _mm256_cvttpd_epi32(rt);
919 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
920 vfitab = _mm_slli_epi32(vfitab,2);
922 /* CUBIC SPLINE TABLE ELECTROSTATICS */
923 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
924 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
925 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
926 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
927 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
928 Heps = _mm256_mul_pd(vfeps,H);
929 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
930 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
931 velec = _mm256_mul_pd(qq02,VV);
932 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
933 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
935 /* Update potential sum for this i atom from the interaction with this j atom. */
936 velec = _mm256_andnot_pd(dummy_mask,velec);
937 velecsum = _mm256_add_pd(velecsum,velec);
941 fscal = _mm256_andnot_pd(dummy_mask,fscal);
943 /* Calculate temporary vectorial force */
944 tx = _mm256_mul_pd(fscal,dx02);
945 ty = _mm256_mul_pd(fscal,dy02);
946 tz = _mm256_mul_pd(fscal,dz02);
948 /* Update vectorial force */
949 fix0 = _mm256_add_pd(fix0,tx);
950 fiy0 = _mm256_add_pd(fiy0,ty);
951 fiz0 = _mm256_add_pd(fiz0,tz);
953 fjx2 = _mm256_add_pd(fjx2,tx);
954 fjy2 = _mm256_add_pd(fjy2,ty);
955 fjz2 = _mm256_add_pd(fjz2,tz);
957 /**************************
958 * CALCULATE INTERACTIONS *
959 **************************/
961 r10 = _mm256_mul_pd(rsq10,rinv10);
962 r10 = _mm256_andnot_pd(dummy_mask,r10);
964 /* Calculate table index by multiplying r with table scale and truncate to integer */
965 rt = _mm256_mul_pd(r10,vftabscale);
966 vfitab = _mm256_cvttpd_epi32(rt);
967 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
968 vfitab = _mm_slli_epi32(vfitab,2);
970 /* CUBIC SPLINE TABLE ELECTROSTATICS */
971 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
972 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
973 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
974 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
975 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
976 Heps = _mm256_mul_pd(vfeps,H);
977 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
978 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
979 velec = _mm256_mul_pd(qq10,VV);
980 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
981 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
983 /* Update potential sum for this i atom from the interaction with this j atom. */
984 velec = _mm256_andnot_pd(dummy_mask,velec);
985 velecsum = _mm256_add_pd(velecsum,velec);
989 fscal = _mm256_andnot_pd(dummy_mask,fscal);
991 /* Calculate temporary vectorial force */
992 tx = _mm256_mul_pd(fscal,dx10);
993 ty = _mm256_mul_pd(fscal,dy10);
994 tz = _mm256_mul_pd(fscal,dz10);
996 /* Update vectorial force */
997 fix1 = _mm256_add_pd(fix1,tx);
998 fiy1 = _mm256_add_pd(fiy1,ty);
999 fiz1 = _mm256_add_pd(fiz1,tz);
1001 fjx0 = _mm256_add_pd(fjx0,tx);
1002 fjy0 = _mm256_add_pd(fjy0,ty);
1003 fjz0 = _mm256_add_pd(fjz0,tz);
1005 /**************************
1006 * CALCULATE INTERACTIONS *
1007 **************************/
1009 r11 = _mm256_mul_pd(rsq11,rinv11);
1010 r11 = _mm256_andnot_pd(dummy_mask,r11);
1012 /* Calculate table index by multiplying r with table scale and truncate to integer */
1013 rt = _mm256_mul_pd(r11,vftabscale);
1014 vfitab = _mm256_cvttpd_epi32(rt);
1015 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1016 vfitab = _mm_slli_epi32(vfitab,2);
1018 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1019 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1020 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1021 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1022 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1023 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1024 Heps = _mm256_mul_pd(vfeps,H);
1025 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1026 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1027 velec = _mm256_mul_pd(qq11,VV);
1028 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1029 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1031 /* Update potential sum for this i atom from the interaction with this j atom. */
1032 velec = _mm256_andnot_pd(dummy_mask,velec);
1033 velecsum = _mm256_add_pd(velecsum,velec);
1037 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1039 /* Calculate temporary vectorial force */
1040 tx = _mm256_mul_pd(fscal,dx11);
1041 ty = _mm256_mul_pd(fscal,dy11);
1042 tz = _mm256_mul_pd(fscal,dz11);
1044 /* Update vectorial force */
1045 fix1 = _mm256_add_pd(fix1,tx);
1046 fiy1 = _mm256_add_pd(fiy1,ty);
1047 fiz1 = _mm256_add_pd(fiz1,tz);
1049 fjx1 = _mm256_add_pd(fjx1,tx);
1050 fjy1 = _mm256_add_pd(fjy1,ty);
1051 fjz1 = _mm256_add_pd(fjz1,tz);
1053 /**************************
1054 * CALCULATE INTERACTIONS *
1055 **************************/
1057 r12 = _mm256_mul_pd(rsq12,rinv12);
1058 r12 = _mm256_andnot_pd(dummy_mask,r12);
1060 /* Calculate table index by multiplying r with table scale and truncate to integer */
1061 rt = _mm256_mul_pd(r12,vftabscale);
1062 vfitab = _mm256_cvttpd_epi32(rt);
1063 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1064 vfitab = _mm_slli_epi32(vfitab,2);
1066 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1067 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1068 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1069 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1070 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1071 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1072 Heps = _mm256_mul_pd(vfeps,H);
1073 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1074 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1075 velec = _mm256_mul_pd(qq12,VV);
1076 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1077 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1079 /* Update potential sum for this i atom from the interaction with this j atom. */
1080 velec = _mm256_andnot_pd(dummy_mask,velec);
1081 velecsum = _mm256_add_pd(velecsum,velec);
1085 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1087 /* Calculate temporary vectorial force */
1088 tx = _mm256_mul_pd(fscal,dx12);
1089 ty = _mm256_mul_pd(fscal,dy12);
1090 tz = _mm256_mul_pd(fscal,dz12);
1092 /* Update vectorial force */
1093 fix1 = _mm256_add_pd(fix1,tx);
1094 fiy1 = _mm256_add_pd(fiy1,ty);
1095 fiz1 = _mm256_add_pd(fiz1,tz);
1097 fjx2 = _mm256_add_pd(fjx2,tx);
1098 fjy2 = _mm256_add_pd(fjy2,ty);
1099 fjz2 = _mm256_add_pd(fjz2,tz);
1101 /**************************
1102 * CALCULATE INTERACTIONS *
1103 **************************/
1105 r20 = _mm256_mul_pd(rsq20,rinv20);
1106 r20 = _mm256_andnot_pd(dummy_mask,r20);
1108 /* Calculate table index by multiplying r with table scale and truncate to integer */
1109 rt = _mm256_mul_pd(r20,vftabscale);
1110 vfitab = _mm256_cvttpd_epi32(rt);
1111 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1112 vfitab = _mm_slli_epi32(vfitab,2);
1114 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1115 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1116 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1117 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1118 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1119 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1120 Heps = _mm256_mul_pd(vfeps,H);
1121 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1122 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1123 velec = _mm256_mul_pd(qq20,VV);
1124 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1125 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1127 /* Update potential sum for this i atom from the interaction with this j atom. */
1128 velec = _mm256_andnot_pd(dummy_mask,velec);
1129 velecsum = _mm256_add_pd(velecsum,velec);
1133 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1135 /* Calculate temporary vectorial force */
1136 tx = _mm256_mul_pd(fscal,dx20);
1137 ty = _mm256_mul_pd(fscal,dy20);
1138 tz = _mm256_mul_pd(fscal,dz20);
1140 /* Update vectorial force */
1141 fix2 = _mm256_add_pd(fix2,tx);
1142 fiy2 = _mm256_add_pd(fiy2,ty);
1143 fiz2 = _mm256_add_pd(fiz2,tz);
1145 fjx0 = _mm256_add_pd(fjx0,tx);
1146 fjy0 = _mm256_add_pd(fjy0,ty);
1147 fjz0 = _mm256_add_pd(fjz0,tz);
1149 /**************************
1150 * CALCULATE INTERACTIONS *
1151 **************************/
1153 r21 = _mm256_mul_pd(rsq21,rinv21);
1154 r21 = _mm256_andnot_pd(dummy_mask,r21);
1156 /* Calculate table index by multiplying r with table scale and truncate to integer */
1157 rt = _mm256_mul_pd(r21,vftabscale);
1158 vfitab = _mm256_cvttpd_epi32(rt);
1159 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1160 vfitab = _mm_slli_epi32(vfitab,2);
1162 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1163 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1164 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1165 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1166 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1167 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1168 Heps = _mm256_mul_pd(vfeps,H);
1169 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1170 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1171 velec = _mm256_mul_pd(qq21,VV);
1172 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1173 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1175 /* Update potential sum for this i atom from the interaction with this j atom. */
1176 velec = _mm256_andnot_pd(dummy_mask,velec);
1177 velecsum = _mm256_add_pd(velecsum,velec);
1181 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1183 /* Calculate temporary vectorial force */
1184 tx = _mm256_mul_pd(fscal,dx21);
1185 ty = _mm256_mul_pd(fscal,dy21);
1186 tz = _mm256_mul_pd(fscal,dz21);
1188 /* Update vectorial force */
1189 fix2 = _mm256_add_pd(fix2,tx);
1190 fiy2 = _mm256_add_pd(fiy2,ty);
1191 fiz2 = _mm256_add_pd(fiz2,tz);
1193 fjx1 = _mm256_add_pd(fjx1,tx);
1194 fjy1 = _mm256_add_pd(fjy1,ty);
1195 fjz1 = _mm256_add_pd(fjz1,tz);
1197 /**************************
1198 * CALCULATE INTERACTIONS *
1199 **************************/
1201 r22 = _mm256_mul_pd(rsq22,rinv22);
1202 r22 = _mm256_andnot_pd(dummy_mask,r22);
1204 /* Calculate table index by multiplying r with table scale and truncate to integer */
1205 rt = _mm256_mul_pd(r22,vftabscale);
1206 vfitab = _mm256_cvttpd_epi32(rt);
1207 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1208 vfitab = _mm_slli_epi32(vfitab,2);
1210 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1211 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1212 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1213 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1214 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1215 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1216 Heps = _mm256_mul_pd(vfeps,H);
1217 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1218 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1219 velec = _mm256_mul_pd(qq22,VV);
1220 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1221 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1223 /* Update potential sum for this i atom from the interaction with this j atom. */
1224 velec = _mm256_andnot_pd(dummy_mask,velec);
1225 velecsum = _mm256_add_pd(velecsum,velec);
1229 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1231 /* Calculate temporary vectorial force */
1232 tx = _mm256_mul_pd(fscal,dx22);
1233 ty = _mm256_mul_pd(fscal,dy22);
1234 tz = _mm256_mul_pd(fscal,dz22);
1236 /* Update vectorial force */
1237 fix2 = _mm256_add_pd(fix2,tx);
1238 fiy2 = _mm256_add_pd(fiy2,ty);
1239 fiz2 = _mm256_add_pd(fiz2,tz);
1241 fjx2 = _mm256_add_pd(fjx2,tx);
1242 fjy2 = _mm256_add_pd(fjy2,ty);
1243 fjz2 = _mm256_add_pd(fjz2,tz);
1245 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1246 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1247 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1248 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1250 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1251 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1253 /* Inner loop uses 409 flops */
1256 /* End of innermost loop */
1258 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1259 f+i_coord_offset,fshift+i_shift_offset);
1262 /* Update potential energies */
1263 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1264 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1266 /* Increment number of inner iterations */
1267 inneriter += j_index_end - j_index_start;
1269 /* Outer loop uses 20 flops */
1272 /* Increment number of outer iterations */
1275 /* Update outer/inner flops */
1277 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1280 * GROMACS nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1281 * Electrostatics interaction: CubicSplineTable
1282 * VdW interaction: LennardJones
1283 * Geometry: Water3-Water3
1284 * Calculate force/pot: Force
1287 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1288 (t_nblist * gmx_restrict nlist,
1289 rvec * gmx_restrict xx,
1290 rvec * gmx_restrict ff,
1291 t_forcerec * gmx_restrict fr,
1292 t_mdatoms * gmx_restrict mdatoms,
1293 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1294 t_nrnb * gmx_restrict nrnb)
1296 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1297 * just 0 for non-waters.
1298 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1299 * jnr indices corresponding to data put in the four positions in the SIMD register.
1301 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1302 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1303 int jnrA,jnrB,jnrC,jnrD;
1304 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1305 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1306 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1307 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1308 real rcutoff_scalar;
1309 real *shiftvec,*fshift,*x,*f;
1310 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1311 real scratch[4*DIM];
1312 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1313 real * vdwioffsetptr0;
1314 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1315 real * vdwioffsetptr1;
1316 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1317 real * vdwioffsetptr2;
1318 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1319 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1320 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1321 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1322 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1323 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1324 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1325 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1326 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1327 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1328 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1329 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1330 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1331 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1332 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1333 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1334 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1337 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1340 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1341 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1343 __m128i ifour = _mm_set1_epi32(4);
1344 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1346 __m256d dummy_mask,cutoff_mask;
1347 __m128 tmpmask0,tmpmask1;
1348 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1349 __m256d one = _mm256_set1_pd(1.0);
1350 __m256d two = _mm256_set1_pd(2.0);
1356 jindex = nlist->jindex;
1358 shiftidx = nlist->shift;
1360 shiftvec = fr->shift_vec[0];
1361 fshift = fr->fshift[0];
1362 facel = _mm256_set1_pd(fr->epsfac);
1363 charge = mdatoms->chargeA;
1364 nvdwtype = fr->ntype;
1365 vdwparam = fr->nbfp;
1366 vdwtype = mdatoms->typeA;
1368 vftab = kernel_data->table_elec->data;
1369 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
1371 /* Setup water-specific parameters */
1372 inr = nlist->iinr[0];
1373 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1374 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1375 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1376 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1378 jq0 = _mm256_set1_pd(charge[inr+0]);
1379 jq1 = _mm256_set1_pd(charge[inr+1]);
1380 jq2 = _mm256_set1_pd(charge[inr+2]);
1381 vdwjidx0A = 2*vdwtype[inr+0];
1382 qq00 = _mm256_mul_pd(iq0,jq0);
1383 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1384 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1385 qq01 = _mm256_mul_pd(iq0,jq1);
1386 qq02 = _mm256_mul_pd(iq0,jq2);
1387 qq10 = _mm256_mul_pd(iq1,jq0);
1388 qq11 = _mm256_mul_pd(iq1,jq1);
1389 qq12 = _mm256_mul_pd(iq1,jq2);
1390 qq20 = _mm256_mul_pd(iq2,jq0);
1391 qq21 = _mm256_mul_pd(iq2,jq1);
1392 qq22 = _mm256_mul_pd(iq2,jq2);
1394 /* Avoid stupid compiler warnings */
1395 jnrA = jnrB = jnrC = jnrD = 0;
1396 j_coord_offsetA = 0;
1397 j_coord_offsetB = 0;
1398 j_coord_offsetC = 0;
1399 j_coord_offsetD = 0;
1404 for(iidx=0;iidx<4*DIM;iidx++)
1406 scratch[iidx] = 0.0;
1409 /* Start outer loop over neighborlists */
1410 for(iidx=0; iidx<nri; iidx++)
1412 /* Load shift vector for this list */
1413 i_shift_offset = DIM*shiftidx[iidx];
1415 /* Load limits for loop over neighbors */
1416 j_index_start = jindex[iidx];
1417 j_index_end = jindex[iidx+1];
1419 /* Get outer coordinate index */
1421 i_coord_offset = DIM*inr;
1423 /* Load i particle coords and add shift vector */
1424 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1425 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1427 fix0 = _mm256_setzero_pd();
1428 fiy0 = _mm256_setzero_pd();
1429 fiz0 = _mm256_setzero_pd();
1430 fix1 = _mm256_setzero_pd();
1431 fiy1 = _mm256_setzero_pd();
1432 fiz1 = _mm256_setzero_pd();
1433 fix2 = _mm256_setzero_pd();
1434 fiy2 = _mm256_setzero_pd();
1435 fiz2 = _mm256_setzero_pd();
1437 /* Start inner kernel loop */
1438 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1441 /* Get j neighbor index, and coordinate index */
1443 jnrB = jjnr[jidx+1];
1444 jnrC = jjnr[jidx+2];
1445 jnrD = jjnr[jidx+3];
1446 j_coord_offsetA = DIM*jnrA;
1447 j_coord_offsetB = DIM*jnrB;
1448 j_coord_offsetC = DIM*jnrC;
1449 j_coord_offsetD = DIM*jnrD;
1451 /* load j atom coordinates */
1452 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1453 x+j_coord_offsetC,x+j_coord_offsetD,
1454 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1456 /* Calculate displacement vector */
1457 dx00 = _mm256_sub_pd(ix0,jx0);
1458 dy00 = _mm256_sub_pd(iy0,jy0);
1459 dz00 = _mm256_sub_pd(iz0,jz0);
1460 dx01 = _mm256_sub_pd(ix0,jx1);
1461 dy01 = _mm256_sub_pd(iy0,jy1);
1462 dz01 = _mm256_sub_pd(iz0,jz1);
1463 dx02 = _mm256_sub_pd(ix0,jx2);
1464 dy02 = _mm256_sub_pd(iy0,jy2);
1465 dz02 = _mm256_sub_pd(iz0,jz2);
1466 dx10 = _mm256_sub_pd(ix1,jx0);
1467 dy10 = _mm256_sub_pd(iy1,jy0);
1468 dz10 = _mm256_sub_pd(iz1,jz0);
1469 dx11 = _mm256_sub_pd(ix1,jx1);
1470 dy11 = _mm256_sub_pd(iy1,jy1);
1471 dz11 = _mm256_sub_pd(iz1,jz1);
1472 dx12 = _mm256_sub_pd(ix1,jx2);
1473 dy12 = _mm256_sub_pd(iy1,jy2);
1474 dz12 = _mm256_sub_pd(iz1,jz2);
1475 dx20 = _mm256_sub_pd(ix2,jx0);
1476 dy20 = _mm256_sub_pd(iy2,jy0);
1477 dz20 = _mm256_sub_pd(iz2,jz0);
1478 dx21 = _mm256_sub_pd(ix2,jx1);
1479 dy21 = _mm256_sub_pd(iy2,jy1);
1480 dz21 = _mm256_sub_pd(iz2,jz1);
1481 dx22 = _mm256_sub_pd(ix2,jx2);
1482 dy22 = _mm256_sub_pd(iy2,jy2);
1483 dz22 = _mm256_sub_pd(iz2,jz2);
1485 /* Calculate squared distance and things based on it */
1486 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1487 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1488 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1489 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1490 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1491 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1492 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1493 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1494 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1496 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1497 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1498 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1499 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1500 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1501 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1502 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1503 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1504 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1506 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1508 fjx0 = _mm256_setzero_pd();
1509 fjy0 = _mm256_setzero_pd();
1510 fjz0 = _mm256_setzero_pd();
1511 fjx1 = _mm256_setzero_pd();
1512 fjy1 = _mm256_setzero_pd();
1513 fjz1 = _mm256_setzero_pd();
1514 fjx2 = _mm256_setzero_pd();
1515 fjy2 = _mm256_setzero_pd();
1516 fjz2 = _mm256_setzero_pd();
1518 /**************************
1519 * CALCULATE INTERACTIONS *
1520 **************************/
1522 r00 = _mm256_mul_pd(rsq00,rinv00);
1524 /* Calculate table index by multiplying r with table scale and truncate to integer */
1525 rt = _mm256_mul_pd(r00,vftabscale);
1526 vfitab = _mm256_cvttpd_epi32(rt);
1527 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1528 vfitab = _mm_slli_epi32(vfitab,2);
1530 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1531 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1532 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1533 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1534 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1535 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1536 Heps = _mm256_mul_pd(vfeps,H);
1537 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1538 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1539 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1541 /* LENNARD-JONES DISPERSION/REPULSION */
1543 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1544 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1546 fscal = _mm256_add_pd(felec,fvdw);
1548 /* Calculate temporary vectorial force */
1549 tx = _mm256_mul_pd(fscal,dx00);
1550 ty = _mm256_mul_pd(fscal,dy00);
1551 tz = _mm256_mul_pd(fscal,dz00);
1553 /* Update vectorial force */
1554 fix0 = _mm256_add_pd(fix0,tx);
1555 fiy0 = _mm256_add_pd(fiy0,ty);
1556 fiz0 = _mm256_add_pd(fiz0,tz);
1558 fjx0 = _mm256_add_pd(fjx0,tx);
1559 fjy0 = _mm256_add_pd(fjy0,ty);
1560 fjz0 = _mm256_add_pd(fjz0,tz);
1562 /**************************
1563 * CALCULATE INTERACTIONS *
1564 **************************/
1566 r01 = _mm256_mul_pd(rsq01,rinv01);
1568 /* Calculate table index by multiplying r with table scale and truncate to integer */
1569 rt = _mm256_mul_pd(r01,vftabscale);
1570 vfitab = _mm256_cvttpd_epi32(rt);
1571 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1572 vfitab = _mm_slli_epi32(vfitab,2);
1574 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1575 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1576 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1577 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1578 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1579 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1580 Heps = _mm256_mul_pd(vfeps,H);
1581 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1582 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1583 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1587 /* Calculate temporary vectorial force */
1588 tx = _mm256_mul_pd(fscal,dx01);
1589 ty = _mm256_mul_pd(fscal,dy01);
1590 tz = _mm256_mul_pd(fscal,dz01);
1592 /* Update vectorial force */
1593 fix0 = _mm256_add_pd(fix0,tx);
1594 fiy0 = _mm256_add_pd(fiy0,ty);
1595 fiz0 = _mm256_add_pd(fiz0,tz);
1597 fjx1 = _mm256_add_pd(fjx1,tx);
1598 fjy1 = _mm256_add_pd(fjy1,ty);
1599 fjz1 = _mm256_add_pd(fjz1,tz);
1601 /**************************
1602 * CALCULATE INTERACTIONS *
1603 **************************/
1605 r02 = _mm256_mul_pd(rsq02,rinv02);
1607 /* Calculate table index by multiplying r with table scale and truncate to integer */
1608 rt = _mm256_mul_pd(r02,vftabscale);
1609 vfitab = _mm256_cvttpd_epi32(rt);
1610 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1611 vfitab = _mm_slli_epi32(vfitab,2);
1613 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1614 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1615 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1616 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1617 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1618 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1619 Heps = _mm256_mul_pd(vfeps,H);
1620 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1621 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1622 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
1626 /* Calculate temporary vectorial force */
1627 tx = _mm256_mul_pd(fscal,dx02);
1628 ty = _mm256_mul_pd(fscal,dy02);
1629 tz = _mm256_mul_pd(fscal,dz02);
1631 /* Update vectorial force */
1632 fix0 = _mm256_add_pd(fix0,tx);
1633 fiy0 = _mm256_add_pd(fiy0,ty);
1634 fiz0 = _mm256_add_pd(fiz0,tz);
1636 fjx2 = _mm256_add_pd(fjx2,tx);
1637 fjy2 = _mm256_add_pd(fjy2,ty);
1638 fjz2 = _mm256_add_pd(fjz2,tz);
1640 /**************************
1641 * CALCULATE INTERACTIONS *
1642 **************************/
1644 r10 = _mm256_mul_pd(rsq10,rinv10);
1646 /* Calculate table index by multiplying r with table scale and truncate to integer */
1647 rt = _mm256_mul_pd(r10,vftabscale);
1648 vfitab = _mm256_cvttpd_epi32(rt);
1649 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1650 vfitab = _mm_slli_epi32(vfitab,2);
1652 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1653 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1654 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1655 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1656 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1657 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1658 Heps = _mm256_mul_pd(vfeps,H);
1659 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1660 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1661 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1665 /* Calculate temporary vectorial force */
1666 tx = _mm256_mul_pd(fscal,dx10);
1667 ty = _mm256_mul_pd(fscal,dy10);
1668 tz = _mm256_mul_pd(fscal,dz10);
1670 /* Update vectorial force */
1671 fix1 = _mm256_add_pd(fix1,tx);
1672 fiy1 = _mm256_add_pd(fiy1,ty);
1673 fiz1 = _mm256_add_pd(fiz1,tz);
1675 fjx0 = _mm256_add_pd(fjx0,tx);
1676 fjy0 = _mm256_add_pd(fjy0,ty);
1677 fjz0 = _mm256_add_pd(fjz0,tz);
1679 /**************************
1680 * CALCULATE INTERACTIONS *
1681 **************************/
1683 r11 = _mm256_mul_pd(rsq11,rinv11);
1685 /* Calculate table index by multiplying r with table scale and truncate to integer */
1686 rt = _mm256_mul_pd(r11,vftabscale);
1687 vfitab = _mm256_cvttpd_epi32(rt);
1688 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1689 vfitab = _mm_slli_epi32(vfitab,2);
1691 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1692 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1693 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1694 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1695 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1696 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1697 Heps = _mm256_mul_pd(vfeps,H);
1698 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1699 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1700 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1704 /* Calculate temporary vectorial force */
1705 tx = _mm256_mul_pd(fscal,dx11);
1706 ty = _mm256_mul_pd(fscal,dy11);
1707 tz = _mm256_mul_pd(fscal,dz11);
1709 /* Update vectorial force */
1710 fix1 = _mm256_add_pd(fix1,tx);
1711 fiy1 = _mm256_add_pd(fiy1,ty);
1712 fiz1 = _mm256_add_pd(fiz1,tz);
1714 fjx1 = _mm256_add_pd(fjx1,tx);
1715 fjy1 = _mm256_add_pd(fjy1,ty);
1716 fjz1 = _mm256_add_pd(fjz1,tz);
1718 /**************************
1719 * CALCULATE INTERACTIONS *
1720 **************************/
1722 r12 = _mm256_mul_pd(rsq12,rinv12);
1724 /* Calculate table index by multiplying r with table scale and truncate to integer */
1725 rt = _mm256_mul_pd(r12,vftabscale);
1726 vfitab = _mm256_cvttpd_epi32(rt);
1727 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1728 vfitab = _mm_slli_epi32(vfitab,2);
1730 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1731 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1732 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1733 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1734 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1735 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1736 Heps = _mm256_mul_pd(vfeps,H);
1737 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1738 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1739 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1743 /* Calculate temporary vectorial force */
1744 tx = _mm256_mul_pd(fscal,dx12);
1745 ty = _mm256_mul_pd(fscal,dy12);
1746 tz = _mm256_mul_pd(fscal,dz12);
1748 /* Update vectorial force */
1749 fix1 = _mm256_add_pd(fix1,tx);
1750 fiy1 = _mm256_add_pd(fiy1,ty);
1751 fiz1 = _mm256_add_pd(fiz1,tz);
1753 fjx2 = _mm256_add_pd(fjx2,tx);
1754 fjy2 = _mm256_add_pd(fjy2,ty);
1755 fjz2 = _mm256_add_pd(fjz2,tz);
1757 /**************************
1758 * CALCULATE INTERACTIONS *
1759 **************************/
1761 r20 = _mm256_mul_pd(rsq20,rinv20);
1763 /* Calculate table index by multiplying r with table scale and truncate to integer */
1764 rt = _mm256_mul_pd(r20,vftabscale);
1765 vfitab = _mm256_cvttpd_epi32(rt);
1766 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1767 vfitab = _mm_slli_epi32(vfitab,2);
1769 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1770 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1771 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1772 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1773 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1774 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1775 Heps = _mm256_mul_pd(vfeps,H);
1776 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1777 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1778 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1782 /* Calculate temporary vectorial force */
1783 tx = _mm256_mul_pd(fscal,dx20);
1784 ty = _mm256_mul_pd(fscal,dy20);
1785 tz = _mm256_mul_pd(fscal,dz20);
1787 /* Update vectorial force */
1788 fix2 = _mm256_add_pd(fix2,tx);
1789 fiy2 = _mm256_add_pd(fiy2,ty);
1790 fiz2 = _mm256_add_pd(fiz2,tz);
1792 fjx0 = _mm256_add_pd(fjx0,tx);
1793 fjy0 = _mm256_add_pd(fjy0,ty);
1794 fjz0 = _mm256_add_pd(fjz0,tz);
1796 /**************************
1797 * CALCULATE INTERACTIONS *
1798 **************************/
1800 r21 = _mm256_mul_pd(rsq21,rinv21);
1802 /* Calculate table index by multiplying r with table scale and truncate to integer */
1803 rt = _mm256_mul_pd(r21,vftabscale);
1804 vfitab = _mm256_cvttpd_epi32(rt);
1805 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1806 vfitab = _mm_slli_epi32(vfitab,2);
1808 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1809 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1810 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1811 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1812 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1813 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1814 Heps = _mm256_mul_pd(vfeps,H);
1815 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1816 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1817 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1821 /* Calculate temporary vectorial force */
1822 tx = _mm256_mul_pd(fscal,dx21);
1823 ty = _mm256_mul_pd(fscal,dy21);
1824 tz = _mm256_mul_pd(fscal,dz21);
1826 /* Update vectorial force */
1827 fix2 = _mm256_add_pd(fix2,tx);
1828 fiy2 = _mm256_add_pd(fiy2,ty);
1829 fiz2 = _mm256_add_pd(fiz2,tz);
1831 fjx1 = _mm256_add_pd(fjx1,tx);
1832 fjy1 = _mm256_add_pd(fjy1,ty);
1833 fjz1 = _mm256_add_pd(fjz1,tz);
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1839 r22 = _mm256_mul_pd(rsq22,rinv22);
1841 /* Calculate table index by multiplying r with table scale and truncate to integer */
1842 rt = _mm256_mul_pd(r22,vftabscale);
1843 vfitab = _mm256_cvttpd_epi32(rt);
1844 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1845 vfitab = _mm_slli_epi32(vfitab,2);
1847 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1848 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1849 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1850 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1851 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1852 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1853 Heps = _mm256_mul_pd(vfeps,H);
1854 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1855 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1856 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1860 /* Calculate temporary vectorial force */
1861 tx = _mm256_mul_pd(fscal,dx22);
1862 ty = _mm256_mul_pd(fscal,dy22);
1863 tz = _mm256_mul_pd(fscal,dz22);
1865 /* Update vectorial force */
1866 fix2 = _mm256_add_pd(fix2,tx);
1867 fiy2 = _mm256_add_pd(fiy2,ty);
1868 fiz2 = _mm256_add_pd(fiz2,tz);
1870 fjx2 = _mm256_add_pd(fjx2,tx);
1871 fjy2 = _mm256_add_pd(fjy2,ty);
1872 fjz2 = _mm256_add_pd(fjz2,tz);
1874 fjptrA = f+j_coord_offsetA;
1875 fjptrB = f+j_coord_offsetB;
1876 fjptrC = f+j_coord_offsetC;
1877 fjptrD = f+j_coord_offsetD;
1879 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1880 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1882 /* Inner loop uses 359 flops */
1885 if(jidx<j_index_end)
1888 /* Get j neighbor index, and coordinate index */
1889 jnrlistA = jjnr[jidx];
1890 jnrlistB = jjnr[jidx+1];
1891 jnrlistC = jjnr[jidx+2];
1892 jnrlistD = jjnr[jidx+3];
1893 /* Sign of each element will be negative for non-real atoms.
1894 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1895 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1897 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1899 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1900 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1901 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1903 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1904 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1905 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1906 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1907 j_coord_offsetA = DIM*jnrA;
1908 j_coord_offsetB = DIM*jnrB;
1909 j_coord_offsetC = DIM*jnrC;
1910 j_coord_offsetD = DIM*jnrD;
1912 /* load j atom coordinates */
1913 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1914 x+j_coord_offsetC,x+j_coord_offsetD,
1915 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1917 /* Calculate displacement vector */
1918 dx00 = _mm256_sub_pd(ix0,jx0);
1919 dy00 = _mm256_sub_pd(iy0,jy0);
1920 dz00 = _mm256_sub_pd(iz0,jz0);
1921 dx01 = _mm256_sub_pd(ix0,jx1);
1922 dy01 = _mm256_sub_pd(iy0,jy1);
1923 dz01 = _mm256_sub_pd(iz0,jz1);
1924 dx02 = _mm256_sub_pd(ix0,jx2);
1925 dy02 = _mm256_sub_pd(iy0,jy2);
1926 dz02 = _mm256_sub_pd(iz0,jz2);
1927 dx10 = _mm256_sub_pd(ix1,jx0);
1928 dy10 = _mm256_sub_pd(iy1,jy0);
1929 dz10 = _mm256_sub_pd(iz1,jz0);
1930 dx11 = _mm256_sub_pd(ix1,jx1);
1931 dy11 = _mm256_sub_pd(iy1,jy1);
1932 dz11 = _mm256_sub_pd(iz1,jz1);
1933 dx12 = _mm256_sub_pd(ix1,jx2);
1934 dy12 = _mm256_sub_pd(iy1,jy2);
1935 dz12 = _mm256_sub_pd(iz1,jz2);
1936 dx20 = _mm256_sub_pd(ix2,jx0);
1937 dy20 = _mm256_sub_pd(iy2,jy0);
1938 dz20 = _mm256_sub_pd(iz2,jz0);
1939 dx21 = _mm256_sub_pd(ix2,jx1);
1940 dy21 = _mm256_sub_pd(iy2,jy1);
1941 dz21 = _mm256_sub_pd(iz2,jz1);
1942 dx22 = _mm256_sub_pd(ix2,jx2);
1943 dy22 = _mm256_sub_pd(iy2,jy2);
1944 dz22 = _mm256_sub_pd(iz2,jz2);
1946 /* Calculate squared distance and things based on it */
1947 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1948 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1949 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1950 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1951 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1952 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1953 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1954 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1955 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1957 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1958 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1959 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1960 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1961 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1962 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1963 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1964 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1965 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1967 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1969 fjx0 = _mm256_setzero_pd();
1970 fjy0 = _mm256_setzero_pd();
1971 fjz0 = _mm256_setzero_pd();
1972 fjx1 = _mm256_setzero_pd();
1973 fjy1 = _mm256_setzero_pd();
1974 fjz1 = _mm256_setzero_pd();
1975 fjx2 = _mm256_setzero_pd();
1976 fjy2 = _mm256_setzero_pd();
1977 fjz2 = _mm256_setzero_pd();
1979 /**************************
1980 * CALCULATE INTERACTIONS *
1981 **************************/
1983 r00 = _mm256_mul_pd(rsq00,rinv00);
1984 r00 = _mm256_andnot_pd(dummy_mask,r00);
1986 /* Calculate table index by multiplying r with table scale and truncate to integer */
1987 rt = _mm256_mul_pd(r00,vftabscale);
1988 vfitab = _mm256_cvttpd_epi32(rt);
1989 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1990 vfitab = _mm_slli_epi32(vfitab,2);
1992 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1993 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1994 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1995 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1996 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1997 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1998 Heps = _mm256_mul_pd(vfeps,H);
1999 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2000 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2001 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
2003 /* LENNARD-JONES DISPERSION/REPULSION */
2005 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2006 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2008 fscal = _mm256_add_pd(felec,fvdw);
2010 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2012 /* Calculate temporary vectorial force */
2013 tx = _mm256_mul_pd(fscal,dx00);
2014 ty = _mm256_mul_pd(fscal,dy00);
2015 tz = _mm256_mul_pd(fscal,dz00);
2017 /* Update vectorial force */
2018 fix0 = _mm256_add_pd(fix0,tx);
2019 fiy0 = _mm256_add_pd(fiy0,ty);
2020 fiz0 = _mm256_add_pd(fiz0,tz);
2022 fjx0 = _mm256_add_pd(fjx0,tx);
2023 fjy0 = _mm256_add_pd(fjy0,ty);
2024 fjz0 = _mm256_add_pd(fjz0,tz);
2026 /**************************
2027 * CALCULATE INTERACTIONS *
2028 **************************/
2030 r01 = _mm256_mul_pd(rsq01,rinv01);
2031 r01 = _mm256_andnot_pd(dummy_mask,r01);
2033 /* Calculate table index by multiplying r with table scale and truncate to integer */
2034 rt = _mm256_mul_pd(r01,vftabscale);
2035 vfitab = _mm256_cvttpd_epi32(rt);
2036 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2037 vfitab = _mm_slli_epi32(vfitab,2);
2039 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2040 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2041 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2042 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2043 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2044 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2045 Heps = _mm256_mul_pd(vfeps,H);
2046 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2047 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2048 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
2052 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2054 /* Calculate temporary vectorial force */
2055 tx = _mm256_mul_pd(fscal,dx01);
2056 ty = _mm256_mul_pd(fscal,dy01);
2057 tz = _mm256_mul_pd(fscal,dz01);
2059 /* Update vectorial force */
2060 fix0 = _mm256_add_pd(fix0,tx);
2061 fiy0 = _mm256_add_pd(fiy0,ty);
2062 fiz0 = _mm256_add_pd(fiz0,tz);
2064 fjx1 = _mm256_add_pd(fjx1,tx);
2065 fjy1 = _mm256_add_pd(fjy1,ty);
2066 fjz1 = _mm256_add_pd(fjz1,tz);
2068 /**************************
2069 * CALCULATE INTERACTIONS *
2070 **************************/
2072 r02 = _mm256_mul_pd(rsq02,rinv02);
2073 r02 = _mm256_andnot_pd(dummy_mask,r02);
2075 /* Calculate table index by multiplying r with table scale and truncate to integer */
2076 rt = _mm256_mul_pd(r02,vftabscale);
2077 vfitab = _mm256_cvttpd_epi32(rt);
2078 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2079 vfitab = _mm_slli_epi32(vfitab,2);
2081 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2082 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2083 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2084 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2085 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2086 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2087 Heps = _mm256_mul_pd(vfeps,H);
2088 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2089 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2090 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
2094 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2096 /* Calculate temporary vectorial force */
2097 tx = _mm256_mul_pd(fscal,dx02);
2098 ty = _mm256_mul_pd(fscal,dy02);
2099 tz = _mm256_mul_pd(fscal,dz02);
2101 /* Update vectorial force */
2102 fix0 = _mm256_add_pd(fix0,tx);
2103 fiy0 = _mm256_add_pd(fiy0,ty);
2104 fiz0 = _mm256_add_pd(fiz0,tz);
2106 fjx2 = _mm256_add_pd(fjx2,tx);
2107 fjy2 = _mm256_add_pd(fjy2,ty);
2108 fjz2 = _mm256_add_pd(fjz2,tz);
2110 /**************************
2111 * CALCULATE INTERACTIONS *
2112 **************************/
2114 r10 = _mm256_mul_pd(rsq10,rinv10);
2115 r10 = _mm256_andnot_pd(dummy_mask,r10);
2117 /* Calculate table index by multiplying r with table scale and truncate to integer */
2118 rt = _mm256_mul_pd(r10,vftabscale);
2119 vfitab = _mm256_cvttpd_epi32(rt);
2120 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2121 vfitab = _mm_slli_epi32(vfitab,2);
2123 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2124 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2125 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2126 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2127 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2128 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2129 Heps = _mm256_mul_pd(vfeps,H);
2130 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2131 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2132 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
2136 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2138 /* Calculate temporary vectorial force */
2139 tx = _mm256_mul_pd(fscal,dx10);
2140 ty = _mm256_mul_pd(fscal,dy10);
2141 tz = _mm256_mul_pd(fscal,dz10);
2143 /* Update vectorial force */
2144 fix1 = _mm256_add_pd(fix1,tx);
2145 fiy1 = _mm256_add_pd(fiy1,ty);
2146 fiz1 = _mm256_add_pd(fiz1,tz);
2148 fjx0 = _mm256_add_pd(fjx0,tx);
2149 fjy0 = _mm256_add_pd(fjy0,ty);
2150 fjz0 = _mm256_add_pd(fjz0,tz);
2152 /**************************
2153 * CALCULATE INTERACTIONS *
2154 **************************/
2156 r11 = _mm256_mul_pd(rsq11,rinv11);
2157 r11 = _mm256_andnot_pd(dummy_mask,r11);
2159 /* Calculate table index by multiplying r with table scale and truncate to integer */
2160 rt = _mm256_mul_pd(r11,vftabscale);
2161 vfitab = _mm256_cvttpd_epi32(rt);
2162 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2163 vfitab = _mm_slli_epi32(vfitab,2);
2165 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2166 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2167 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2168 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2169 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2170 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2171 Heps = _mm256_mul_pd(vfeps,H);
2172 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2173 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2174 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2178 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2180 /* Calculate temporary vectorial force */
2181 tx = _mm256_mul_pd(fscal,dx11);
2182 ty = _mm256_mul_pd(fscal,dy11);
2183 tz = _mm256_mul_pd(fscal,dz11);
2185 /* Update vectorial force */
2186 fix1 = _mm256_add_pd(fix1,tx);
2187 fiy1 = _mm256_add_pd(fiy1,ty);
2188 fiz1 = _mm256_add_pd(fiz1,tz);
2190 fjx1 = _mm256_add_pd(fjx1,tx);
2191 fjy1 = _mm256_add_pd(fjy1,ty);
2192 fjz1 = _mm256_add_pd(fjz1,tz);
2194 /**************************
2195 * CALCULATE INTERACTIONS *
2196 **************************/
2198 r12 = _mm256_mul_pd(rsq12,rinv12);
2199 r12 = _mm256_andnot_pd(dummy_mask,r12);
2201 /* Calculate table index by multiplying r with table scale and truncate to integer */
2202 rt = _mm256_mul_pd(r12,vftabscale);
2203 vfitab = _mm256_cvttpd_epi32(rt);
2204 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2205 vfitab = _mm_slli_epi32(vfitab,2);
2207 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2208 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2209 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2210 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2211 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2212 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2213 Heps = _mm256_mul_pd(vfeps,H);
2214 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2215 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2216 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2220 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2222 /* Calculate temporary vectorial force */
2223 tx = _mm256_mul_pd(fscal,dx12);
2224 ty = _mm256_mul_pd(fscal,dy12);
2225 tz = _mm256_mul_pd(fscal,dz12);
2227 /* Update vectorial force */
2228 fix1 = _mm256_add_pd(fix1,tx);
2229 fiy1 = _mm256_add_pd(fiy1,ty);
2230 fiz1 = _mm256_add_pd(fiz1,tz);
2232 fjx2 = _mm256_add_pd(fjx2,tx);
2233 fjy2 = _mm256_add_pd(fjy2,ty);
2234 fjz2 = _mm256_add_pd(fjz2,tz);
2236 /**************************
2237 * CALCULATE INTERACTIONS *
2238 **************************/
2240 r20 = _mm256_mul_pd(rsq20,rinv20);
2241 r20 = _mm256_andnot_pd(dummy_mask,r20);
2243 /* Calculate table index by multiplying r with table scale and truncate to integer */
2244 rt = _mm256_mul_pd(r20,vftabscale);
2245 vfitab = _mm256_cvttpd_epi32(rt);
2246 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2247 vfitab = _mm_slli_epi32(vfitab,2);
2249 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2250 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2251 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2252 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2253 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2254 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2255 Heps = _mm256_mul_pd(vfeps,H);
2256 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2257 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2258 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
2262 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2264 /* Calculate temporary vectorial force */
2265 tx = _mm256_mul_pd(fscal,dx20);
2266 ty = _mm256_mul_pd(fscal,dy20);
2267 tz = _mm256_mul_pd(fscal,dz20);
2269 /* Update vectorial force */
2270 fix2 = _mm256_add_pd(fix2,tx);
2271 fiy2 = _mm256_add_pd(fiy2,ty);
2272 fiz2 = _mm256_add_pd(fiz2,tz);
2274 fjx0 = _mm256_add_pd(fjx0,tx);
2275 fjy0 = _mm256_add_pd(fjy0,ty);
2276 fjz0 = _mm256_add_pd(fjz0,tz);
2278 /**************************
2279 * CALCULATE INTERACTIONS *
2280 **************************/
2282 r21 = _mm256_mul_pd(rsq21,rinv21);
2283 r21 = _mm256_andnot_pd(dummy_mask,r21);
2285 /* Calculate table index by multiplying r with table scale and truncate to integer */
2286 rt = _mm256_mul_pd(r21,vftabscale);
2287 vfitab = _mm256_cvttpd_epi32(rt);
2288 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2289 vfitab = _mm_slli_epi32(vfitab,2);
2291 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2292 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2293 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2294 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2295 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2296 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2297 Heps = _mm256_mul_pd(vfeps,H);
2298 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2299 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2300 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2304 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2306 /* Calculate temporary vectorial force */
2307 tx = _mm256_mul_pd(fscal,dx21);
2308 ty = _mm256_mul_pd(fscal,dy21);
2309 tz = _mm256_mul_pd(fscal,dz21);
2311 /* Update vectorial force */
2312 fix2 = _mm256_add_pd(fix2,tx);
2313 fiy2 = _mm256_add_pd(fiy2,ty);
2314 fiz2 = _mm256_add_pd(fiz2,tz);
2316 fjx1 = _mm256_add_pd(fjx1,tx);
2317 fjy1 = _mm256_add_pd(fjy1,ty);
2318 fjz1 = _mm256_add_pd(fjz1,tz);
2320 /**************************
2321 * CALCULATE INTERACTIONS *
2322 **************************/
2324 r22 = _mm256_mul_pd(rsq22,rinv22);
2325 r22 = _mm256_andnot_pd(dummy_mask,r22);
2327 /* Calculate table index by multiplying r with table scale and truncate to integer */
2328 rt = _mm256_mul_pd(r22,vftabscale);
2329 vfitab = _mm256_cvttpd_epi32(rt);
2330 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2331 vfitab = _mm_slli_epi32(vfitab,2);
2333 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2334 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2335 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2336 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2337 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2338 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2339 Heps = _mm256_mul_pd(vfeps,H);
2340 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2341 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2342 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2346 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2348 /* Calculate temporary vectorial force */
2349 tx = _mm256_mul_pd(fscal,dx22);
2350 ty = _mm256_mul_pd(fscal,dy22);
2351 tz = _mm256_mul_pd(fscal,dz22);
2353 /* Update vectorial force */
2354 fix2 = _mm256_add_pd(fix2,tx);
2355 fiy2 = _mm256_add_pd(fiy2,ty);
2356 fiz2 = _mm256_add_pd(fiz2,tz);
2358 fjx2 = _mm256_add_pd(fjx2,tx);
2359 fjy2 = _mm256_add_pd(fjy2,ty);
2360 fjz2 = _mm256_add_pd(fjz2,tz);
2362 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2363 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2364 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2365 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2367 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2368 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2370 /* Inner loop uses 368 flops */
2373 /* End of innermost loop */
2375 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2376 f+i_coord_offset,fshift+i_shift_offset);
2378 /* Increment number of inner iterations */
2379 inneriter += j_index_end - j_index_start;
2381 /* Outer loop uses 18 flops */
2384 /* Increment number of outer iterations */
2387 /* Update outer/inner flops */
2389 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);