2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
105 __m128i ifour = _mm_set1_epi32(4);
106 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
108 __m256d dummy_mask,cutoff_mask;
109 __m128 tmpmask0,tmpmask1;
110 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
111 __m256d one = _mm256_set1_pd(1.0);
112 __m256d two = _mm256_set1_pd(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm256_set1_pd(fr->epsfac);
125 charge = mdatoms->chargeA;
126 nvdwtype = fr->ntype;
128 vdwtype = mdatoms->typeA;
130 vftab = kernel_data->table_elec->data;
131 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
133 /* Setup water-specific parameters */
134 inr = nlist->iinr[0];
135 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
136 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
137 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
138 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
140 jq1 = _mm256_set1_pd(charge[inr+1]);
141 jq2 = _mm256_set1_pd(charge[inr+2]);
142 jq3 = _mm256_set1_pd(charge[inr+3]);
143 vdwjidx0A = 2*vdwtype[inr+0];
144 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
145 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
146 qq11 = _mm256_mul_pd(iq1,jq1);
147 qq12 = _mm256_mul_pd(iq1,jq2);
148 qq13 = _mm256_mul_pd(iq1,jq3);
149 qq21 = _mm256_mul_pd(iq2,jq1);
150 qq22 = _mm256_mul_pd(iq2,jq2);
151 qq23 = _mm256_mul_pd(iq2,jq3);
152 qq31 = _mm256_mul_pd(iq3,jq1);
153 qq32 = _mm256_mul_pd(iq3,jq2);
154 qq33 = _mm256_mul_pd(iq3,jq3);
156 /* Avoid stupid compiler warnings */
157 jnrA = jnrB = jnrC = jnrD = 0;
166 for(iidx=0;iidx<4*DIM;iidx++)
171 /* Start outer loop over neighborlists */
172 for(iidx=0; iidx<nri; iidx++)
174 /* Load shift vector for this list */
175 i_shift_offset = DIM*shiftidx[iidx];
177 /* Load limits for loop over neighbors */
178 j_index_start = jindex[iidx];
179 j_index_end = jindex[iidx+1];
181 /* Get outer coordinate index */
183 i_coord_offset = DIM*inr;
185 /* Load i particle coords and add shift vector */
186 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
187 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
189 fix0 = _mm256_setzero_pd();
190 fiy0 = _mm256_setzero_pd();
191 fiz0 = _mm256_setzero_pd();
192 fix1 = _mm256_setzero_pd();
193 fiy1 = _mm256_setzero_pd();
194 fiz1 = _mm256_setzero_pd();
195 fix2 = _mm256_setzero_pd();
196 fiy2 = _mm256_setzero_pd();
197 fiz2 = _mm256_setzero_pd();
198 fix3 = _mm256_setzero_pd();
199 fiy3 = _mm256_setzero_pd();
200 fiz3 = _mm256_setzero_pd();
202 /* Reset potential sums */
203 velecsum = _mm256_setzero_pd();
204 vvdwsum = _mm256_setzero_pd();
206 /* Start inner kernel loop */
207 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
210 /* Get j neighbor index, and coordinate index */
215 j_coord_offsetA = DIM*jnrA;
216 j_coord_offsetB = DIM*jnrB;
217 j_coord_offsetC = DIM*jnrC;
218 j_coord_offsetD = DIM*jnrD;
220 /* load j atom coordinates */
221 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
222 x+j_coord_offsetC,x+j_coord_offsetD,
223 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
224 &jy2,&jz2,&jx3,&jy3,&jz3);
226 /* Calculate displacement vector */
227 dx00 = _mm256_sub_pd(ix0,jx0);
228 dy00 = _mm256_sub_pd(iy0,jy0);
229 dz00 = _mm256_sub_pd(iz0,jz0);
230 dx11 = _mm256_sub_pd(ix1,jx1);
231 dy11 = _mm256_sub_pd(iy1,jy1);
232 dz11 = _mm256_sub_pd(iz1,jz1);
233 dx12 = _mm256_sub_pd(ix1,jx2);
234 dy12 = _mm256_sub_pd(iy1,jy2);
235 dz12 = _mm256_sub_pd(iz1,jz2);
236 dx13 = _mm256_sub_pd(ix1,jx3);
237 dy13 = _mm256_sub_pd(iy1,jy3);
238 dz13 = _mm256_sub_pd(iz1,jz3);
239 dx21 = _mm256_sub_pd(ix2,jx1);
240 dy21 = _mm256_sub_pd(iy2,jy1);
241 dz21 = _mm256_sub_pd(iz2,jz1);
242 dx22 = _mm256_sub_pd(ix2,jx2);
243 dy22 = _mm256_sub_pd(iy2,jy2);
244 dz22 = _mm256_sub_pd(iz2,jz2);
245 dx23 = _mm256_sub_pd(ix2,jx3);
246 dy23 = _mm256_sub_pd(iy2,jy3);
247 dz23 = _mm256_sub_pd(iz2,jz3);
248 dx31 = _mm256_sub_pd(ix3,jx1);
249 dy31 = _mm256_sub_pd(iy3,jy1);
250 dz31 = _mm256_sub_pd(iz3,jz1);
251 dx32 = _mm256_sub_pd(ix3,jx2);
252 dy32 = _mm256_sub_pd(iy3,jy2);
253 dz32 = _mm256_sub_pd(iz3,jz2);
254 dx33 = _mm256_sub_pd(ix3,jx3);
255 dy33 = _mm256_sub_pd(iy3,jy3);
256 dz33 = _mm256_sub_pd(iz3,jz3);
258 /* Calculate squared distance and things based on it */
259 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
260 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
261 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
262 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
263 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
264 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
265 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
266 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
267 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
268 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
270 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
271 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
272 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
273 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
274 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
275 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
276 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
277 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
278 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
280 rinvsq00 = gmx_mm256_inv_pd(rsq00);
282 fjx0 = _mm256_setzero_pd();
283 fjy0 = _mm256_setzero_pd();
284 fjz0 = _mm256_setzero_pd();
285 fjx1 = _mm256_setzero_pd();
286 fjy1 = _mm256_setzero_pd();
287 fjz1 = _mm256_setzero_pd();
288 fjx2 = _mm256_setzero_pd();
289 fjy2 = _mm256_setzero_pd();
290 fjz2 = _mm256_setzero_pd();
291 fjx3 = _mm256_setzero_pd();
292 fjy3 = _mm256_setzero_pd();
293 fjz3 = _mm256_setzero_pd();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 /* LENNARD-JONES DISPERSION/REPULSION */
301 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
302 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
303 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
304 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
305 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
307 /* Update potential sum for this i atom from the interaction with this j atom. */
308 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
312 /* Calculate temporary vectorial force */
313 tx = _mm256_mul_pd(fscal,dx00);
314 ty = _mm256_mul_pd(fscal,dy00);
315 tz = _mm256_mul_pd(fscal,dz00);
317 /* Update vectorial force */
318 fix0 = _mm256_add_pd(fix0,tx);
319 fiy0 = _mm256_add_pd(fiy0,ty);
320 fiz0 = _mm256_add_pd(fiz0,tz);
322 fjx0 = _mm256_add_pd(fjx0,tx);
323 fjy0 = _mm256_add_pd(fjy0,ty);
324 fjz0 = _mm256_add_pd(fjz0,tz);
326 /**************************
327 * CALCULATE INTERACTIONS *
328 **************************/
330 r11 = _mm256_mul_pd(rsq11,rinv11);
332 /* Calculate table index by multiplying r with table scale and truncate to integer */
333 rt = _mm256_mul_pd(r11,vftabscale);
334 vfitab = _mm256_cvttpd_epi32(rt);
335 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
336 vfitab = _mm_slli_epi32(vfitab,2);
338 /* CUBIC SPLINE TABLE ELECTROSTATICS */
339 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
340 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
341 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
342 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
343 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
344 Heps = _mm256_mul_pd(vfeps,H);
345 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
346 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
347 velec = _mm256_mul_pd(qq11,VV);
348 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
349 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velecsum = _mm256_add_pd(velecsum,velec);
356 /* Calculate temporary vectorial force */
357 tx = _mm256_mul_pd(fscal,dx11);
358 ty = _mm256_mul_pd(fscal,dy11);
359 tz = _mm256_mul_pd(fscal,dz11);
361 /* Update vectorial force */
362 fix1 = _mm256_add_pd(fix1,tx);
363 fiy1 = _mm256_add_pd(fiy1,ty);
364 fiz1 = _mm256_add_pd(fiz1,tz);
366 fjx1 = _mm256_add_pd(fjx1,tx);
367 fjy1 = _mm256_add_pd(fjy1,ty);
368 fjz1 = _mm256_add_pd(fjz1,tz);
370 /**************************
371 * CALCULATE INTERACTIONS *
372 **************************/
374 r12 = _mm256_mul_pd(rsq12,rinv12);
376 /* Calculate table index by multiplying r with table scale and truncate to integer */
377 rt = _mm256_mul_pd(r12,vftabscale);
378 vfitab = _mm256_cvttpd_epi32(rt);
379 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
380 vfitab = _mm_slli_epi32(vfitab,2);
382 /* CUBIC SPLINE TABLE ELECTROSTATICS */
383 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
384 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
385 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
386 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
387 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
388 Heps = _mm256_mul_pd(vfeps,H);
389 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
390 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
391 velec = _mm256_mul_pd(qq12,VV);
392 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
393 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
395 /* Update potential sum for this i atom from the interaction with this j atom. */
396 velecsum = _mm256_add_pd(velecsum,velec);
400 /* Calculate temporary vectorial force */
401 tx = _mm256_mul_pd(fscal,dx12);
402 ty = _mm256_mul_pd(fscal,dy12);
403 tz = _mm256_mul_pd(fscal,dz12);
405 /* Update vectorial force */
406 fix1 = _mm256_add_pd(fix1,tx);
407 fiy1 = _mm256_add_pd(fiy1,ty);
408 fiz1 = _mm256_add_pd(fiz1,tz);
410 fjx2 = _mm256_add_pd(fjx2,tx);
411 fjy2 = _mm256_add_pd(fjy2,ty);
412 fjz2 = _mm256_add_pd(fjz2,tz);
414 /**************************
415 * CALCULATE INTERACTIONS *
416 **************************/
418 r13 = _mm256_mul_pd(rsq13,rinv13);
420 /* Calculate table index by multiplying r with table scale and truncate to integer */
421 rt = _mm256_mul_pd(r13,vftabscale);
422 vfitab = _mm256_cvttpd_epi32(rt);
423 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
424 vfitab = _mm_slli_epi32(vfitab,2);
426 /* CUBIC SPLINE TABLE ELECTROSTATICS */
427 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
428 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
429 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
430 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
431 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
432 Heps = _mm256_mul_pd(vfeps,H);
433 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
434 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
435 velec = _mm256_mul_pd(qq13,VV);
436 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
437 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
439 /* Update potential sum for this i atom from the interaction with this j atom. */
440 velecsum = _mm256_add_pd(velecsum,velec);
444 /* Calculate temporary vectorial force */
445 tx = _mm256_mul_pd(fscal,dx13);
446 ty = _mm256_mul_pd(fscal,dy13);
447 tz = _mm256_mul_pd(fscal,dz13);
449 /* Update vectorial force */
450 fix1 = _mm256_add_pd(fix1,tx);
451 fiy1 = _mm256_add_pd(fiy1,ty);
452 fiz1 = _mm256_add_pd(fiz1,tz);
454 fjx3 = _mm256_add_pd(fjx3,tx);
455 fjy3 = _mm256_add_pd(fjy3,ty);
456 fjz3 = _mm256_add_pd(fjz3,tz);
458 /**************************
459 * CALCULATE INTERACTIONS *
460 **************************/
462 r21 = _mm256_mul_pd(rsq21,rinv21);
464 /* Calculate table index by multiplying r with table scale and truncate to integer */
465 rt = _mm256_mul_pd(r21,vftabscale);
466 vfitab = _mm256_cvttpd_epi32(rt);
467 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
468 vfitab = _mm_slli_epi32(vfitab,2);
470 /* CUBIC SPLINE TABLE ELECTROSTATICS */
471 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
472 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
473 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
474 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
475 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
476 Heps = _mm256_mul_pd(vfeps,H);
477 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
478 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
479 velec = _mm256_mul_pd(qq21,VV);
480 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
481 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm256_add_pd(velecsum,velec);
488 /* Calculate temporary vectorial force */
489 tx = _mm256_mul_pd(fscal,dx21);
490 ty = _mm256_mul_pd(fscal,dy21);
491 tz = _mm256_mul_pd(fscal,dz21);
493 /* Update vectorial force */
494 fix2 = _mm256_add_pd(fix2,tx);
495 fiy2 = _mm256_add_pd(fiy2,ty);
496 fiz2 = _mm256_add_pd(fiz2,tz);
498 fjx1 = _mm256_add_pd(fjx1,tx);
499 fjy1 = _mm256_add_pd(fjy1,ty);
500 fjz1 = _mm256_add_pd(fjz1,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 r22 = _mm256_mul_pd(rsq22,rinv22);
508 /* Calculate table index by multiplying r with table scale and truncate to integer */
509 rt = _mm256_mul_pd(r22,vftabscale);
510 vfitab = _mm256_cvttpd_epi32(rt);
511 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
512 vfitab = _mm_slli_epi32(vfitab,2);
514 /* CUBIC SPLINE TABLE ELECTROSTATICS */
515 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
516 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
517 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
518 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
519 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
520 Heps = _mm256_mul_pd(vfeps,H);
521 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
522 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
523 velec = _mm256_mul_pd(qq22,VV);
524 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
525 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velecsum = _mm256_add_pd(velecsum,velec);
532 /* Calculate temporary vectorial force */
533 tx = _mm256_mul_pd(fscal,dx22);
534 ty = _mm256_mul_pd(fscal,dy22);
535 tz = _mm256_mul_pd(fscal,dz22);
537 /* Update vectorial force */
538 fix2 = _mm256_add_pd(fix2,tx);
539 fiy2 = _mm256_add_pd(fiy2,ty);
540 fiz2 = _mm256_add_pd(fiz2,tz);
542 fjx2 = _mm256_add_pd(fjx2,tx);
543 fjy2 = _mm256_add_pd(fjy2,ty);
544 fjz2 = _mm256_add_pd(fjz2,tz);
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
550 r23 = _mm256_mul_pd(rsq23,rinv23);
552 /* Calculate table index by multiplying r with table scale and truncate to integer */
553 rt = _mm256_mul_pd(r23,vftabscale);
554 vfitab = _mm256_cvttpd_epi32(rt);
555 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
556 vfitab = _mm_slli_epi32(vfitab,2);
558 /* CUBIC SPLINE TABLE ELECTROSTATICS */
559 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
560 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
561 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
562 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
563 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
564 Heps = _mm256_mul_pd(vfeps,H);
565 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
566 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
567 velec = _mm256_mul_pd(qq23,VV);
568 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
569 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
571 /* Update potential sum for this i atom from the interaction with this j atom. */
572 velecsum = _mm256_add_pd(velecsum,velec);
576 /* Calculate temporary vectorial force */
577 tx = _mm256_mul_pd(fscal,dx23);
578 ty = _mm256_mul_pd(fscal,dy23);
579 tz = _mm256_mul_pd(fscal,dz23);
581 /* Update vectorial force */
582 fix2 = _mm256_add_pd(fix2,tx);
583 fiy2 = _mm256_add_pd(fiy2,ty);
584 fiz2 = _mm256_add_pd(fiz2,tz);
586 fjx3 = _mm256_add_pd(fjx3,tx);
587 fjy3 = _mm256_add_pd(fjy3,ty);
588 fjz3 = _mm256_add_pd(fjz3,tz);
590 /**************************
591 * CALCULATE INTERACTIONS *
592 **************************/
594 r31 = _mm256_mul_pd(rsq31,rinv31);
596 /* Calculate table index by multiplying r with table scale and truncate to integer */
597 rt = _mm256_mul_pd(r31,vftabscale);
598 vfitab = _mm256_cvttpd_epi32(rt);
599 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
600 vfitab = _mm_slli_epi32(vfitab,2);
602 /* CUBIC SPLINE TABLE ELECTROSTATICS */
603 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
604 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
605 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
606 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
607 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
608 Heps = _mm256_mul_pd(vfeps,H);
609 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
610 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
611 velec = _mm256_mul_pd(qq31,VV);
612 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
613 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
615 /* Update potential sum for this i atom from the interaction with this j atom. */
616 velecsum = _mm256_add_pd(velecsum,velec);
620 /* Calculate temporary vectorial force */
621 tx = _mm256_mul_pd(fscal,dx31);
622 ty = _mm256_mul_pd(fscal,dy31);
623 tz = _mm256_mul_pd(fscal,dz31);
625 /* Update vectorial force */
626 fix3 = _mm256_add_pd(fix3,tx);
627 fiy3 = _mm256_add_pd(fiy3,ty);
628 fiz3 = _mm256_add_pd(fiz3,tz);
630 fjx1 = _mm256_add_pd(fjx1,tx);
631 fjy1 = _mm256_add_pd(fjy1,ty);
632 fjz1 = _mm256_add_pd(fjz1,tz);
634 /**************************
635 * CALCULATE INTERACTIONS *
636 **************************/
638 r32 = _mm256_mul_pd(rsq32,rinv32);
640 /* Calculate table index by multiplying r with table scale and truncate to integer */
641 rt = _mm256_mul_pd(r32,vftabscale);
642 vfitab = _mm256_cvttpd_epi32(rt);
643 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
644 vfitab = _mm_slli_epi32(vfitab,2);
646 /* CUBIC SPLINE TABLE ELECTROSTATICS */
647 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
648 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
649 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
650 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
651 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
652 Heps = _mm256_mul_pd(vfeps,H);
653 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
654 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
655 velec = _mm256_mul_pd(qq32,VV);
656 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
657 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
659 /* Update potential sum for this i atom from the interaction with this j atom. */
660 velecsum = _mm256_add_pd(velecsum,velec);
664 /* Calculate temporary vectorial force */
665 tx = _mm256_mul_pd(fscal,dx32);
666 ty = _mm256_mul_pd(fscal,dy32);
667 tz = _mm256_mul_pd(fscal,dz32);
669 /* Update vectorial force */
670 fix3 = _mm256_add_pd(fix3,tx);
671 fiy3 = _mm256_add_pd(fiy3,ty);
672 fiz3 = _mm256_add_pd(fiz3,tz);
674 fjx2 = _mm256_add_pd(fjx2,tx);
675 fjy2 = _mm256_add_pd(fjy2,ty);
676 fjz2 = _mm256_add_pd(fjz2,tz);
678 /**************************
679 * CALCULATE INTERACTIONS *
680 **************************/
682 r33 = _mm256_mul_pd(rsq33,rinv33);
684 /* Calculate table index by multiplying r with table scale and truncate to integer */
685 rt = _mm256_mul_pd(r33,vftabscale);
686 vfitab = _mm256_cvttpd_epi32(rt);
687 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
688 vfitab = _mm_slli_epi32(vfitab,2);
690 /* CUBIC SPLINE TABLE ELECTROSTATICS */
691 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
692 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
693 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
694 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
695 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
696 Heps = _mm256_mul_pd(vfeps,H);
697 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
698 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
699 velec = _mm256_mul_pd(qq33,VV);
700 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
701 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
703 /* Update potential sum for this i atom from the interaction with this j atom. */
704 velecsum = _mm256_add_pd(velecsum,velec);
708 /* Calculate temporary vectorial force */
709 tx = _mm256_mul_pd(fscal,dx33);
710 ty = _mm256_mul_pd(fscal,dy33);
711 tz = _mm256_mul_pd(fscal,dz33);
713 /* Update vectorial force */
714 fix3 = _mm256_add_pd(fix3,tx);
715 fiy3 = _mm256_add_pd(fiy3,ty);
716 fiz3 = _mm256_add_pd(fiz3,tz);
718 fjx3 = _mm256_add_pd(fjx3,tx);
719 fjy3 = _mm256_add_pd(fjy3,ty);
720 fjz3 = _mm256_add_pd(fjz3,tz);
722 fjptrA = f+j_coord_offsetA;
723 fjptrB = f+j_coord_offsetB;
724 fjptrC = f+j_coord_offsetC;
725 fjptrD = f+j_coord_offsetD;
727 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
728 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
729 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
731 /* Inner loop uses 422 flops */
737 /* Get j neighbor index, and coordinate index */
738 jnrlistA = jjnr[jidx];
739 jnrlistB = jjnr[jidx+1];
740 jnrlistC = jjnr[jidx+2];
741 jnrlistD = jjnr[jidx+3];
742 /* Sign of each element will be negative for non-real atoms.
743 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
744 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
746 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
748 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
749 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
750 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
752 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
753 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
754 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
755 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
756 j_coord_offsetA = DIM*jnrA;
757 j_coord_offsetB = DIM*jnrB;
758 j_coord_offsetC = DIM*jnrC;
759 j_coord_offsetD = DIM*jnrD;
761 /* load j atom coordinates */
762 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
763 x+j_coord_offsetC,x+j_coord_offsetD,
764 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
765 &jy2,&jz2,&jx3,&jy3,&jz3);
767 /* Calculate displacement vector */
768 dx00 = _mm256_sub_pd(ix0,jx0);
769 dy00 = _mm256_sub_pd(iy0,jy0);
770 dz00 = _mm256_sub_pd(iz0,jz0);
771 dx11 = _mm256_sub_pd(ix1,jx1);
772 dy11 = _mm256_sub_pd(iy1,jy1);
773 dz11 = _mm256_sub_pd(iz1,jz1);
774 dx12 = _mm256_sub_pd(ix1,jx2);
775 dy12 = _mm256_sub_pd(iy1,jy2);
776 dz12 = _mm256_sub_pd(iz1,jz2);
777 dx13 = _mm256_sub_pd(ix1,jx3);
778 dy13 = _mm256_sub_pd(iy1,jy3);
779 dz13 = _mm256_sub_pd(iz1,jz3);
780 dx21 = _mm256_sub_pd(ix2,jx1);
781 dy21 = _mm256_sub_pd(iy2,jy1);
782 dz21 = _mm256_sub_pd(iz2,jz1);
783 dx22 = _mm256_sub_pd(ix2,jx2);
784 dy22 = _mm256_sub_pd(iy2,jy2);
785 dz22 = _mm256_sub_pd(iz2,jz2);
786 dx23 = _mm256_sub_pd(ix2,jx3);
787 dy23 = _mm256_sub_pd(iy2,jy3);
788 dz23 = _mm256_sub_pd(iz2,jz3);
789 dx31 = _mm256_sub_pd(ix3,jx1);
790 dy31 = _mm256_sub_pd(iy3,jy1);
791 dz31 = _mm256_sub_pd(iz3,jz1);
792 dx32 = _mm256_sub_pd(ix3,jx2);
793 dy32 = _mm256_sub_pd(iy3,jy2);
794 dz32 = _mm256_sub_pd(iz3,jz2);
795 dx33 = _mm256_sub_pd(ix3,jx3);
796 dy33 = _mm256_sub_pd(iy3,jy3);
797 dz33 = _mm256_sub_pd(iz3,jz3);
799 /* Calculate squared distance and things based on it */
800 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
801 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
802 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
803 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
804 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
805 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
806 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
807 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
808 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
809 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
811 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
812 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
813 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
814 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
815 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
816 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
817 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
818 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
819 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
821 rinvsq00 = gmx_mm256_inv_pd(rsq00);
823 fjx0 = _mm256_setzero_pd();
824 fjy0 = _mm256_setzero_pd();
825 fjz0 = _mm256_setzero_pd();
826 fjx1 = _mm256_setzero_pd();
827 fjy1 = _mm256_setzero_pd();
828 fjz1 = _mm256_setzero_pd();
829 fjx2 = _mm256_setzero_pd();
830 fjy2 = _mm256_setzero_pd();
831 fjz2 = _mm256_setzero_pd();
832 fjx3 = _mm256_setzero_pd();
833 fjy3 = _mm256_setzero_pd();
834 fjz3 = _mm256_setzero_pd();
836 /**************************
837 * CALCULATE INTERACTIONS *
838 **************************/
840 /* LENNARD-JONES DISPERSION/REPULSION */
842 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
843 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
844 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
845 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
846 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
850 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
854 fscal = _mm256_andnot_pd(dummy_mask,fscal);
856 /* Calculate temporary vectorial force */
857 tx = _mm256_mul_pd(fscal,dx00);
858 ty = _mm256_mul_pd(fscal,dy00);
859 tz = _mm256_mul_pd(fscal,dz00);
861 /* Update vectorial force */
862 fix0 = _mm256_add_pd(fix0,tx);
863 fiy0 = _mm256_add_pd(fiy0,ty);
864 fiz0 = _mm256_add_pd(fiz0,tz);
866 fjx0 = _mm256_add_pd(fjx0,tx);
867 fjy0 = _mm256_add_pd(fjy0,ty);
868 fjz0 = _mm256_add_pd(fjz0,tz);
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 r11 = _mm256_mul_pd(rsq11,rinv11);
875 r11 = _mm256_andnot_pd(dummy_mask,r11);
877 /* Calculate table index by multiplying r with table scale and truncate to integer */
878 rt = _mm256_mul_pd(r11,vftabscale);
879 vfitab = _mm256_cvttpd_epi32(rt);
880 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
881 vfitab = _mm_slli_epi32(vfitab,2);
883 /* CUBIC SPLINE TABLE ELECTROSTATICS */
884 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
885 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
886 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
887 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
888 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
889 Heps = _mm256_mul_pd(vfeps,H);
890 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
891 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
892 velec = _mm256_mul_pd(qq11,VV);
893 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
894 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 velec = _mm256_andnot_pd(dummy_mask,velec);
898 velecsum = _mm256_add_pd(velecsum,velec);
902 fscal = _mm256_andnot_pd(dummy_mask,fscal);
904 /* Calculate temporary vectorial force */
905 tx = _mm256_mul_pd(fscal,dx11);
906 ty = _mm256_mul_pd(fscal,dy11);
907 tz = _mm256_mul_pd(fscal,dz11);
909 /* Update vectorial force */
910 fix1 = _mm256_add_pd(fix1,tx);
911 fiy1 = _mm256_add_pd(fiy1,ty);
912 fiz1 = _mm256_add_pd(fiz1,tz);
914 fjx1 = _mm256_add_pd(fjx1,tx);
915 fjy1 = _mm256_add_pd(fjy1,ty);
916 fjz1 = _mm256_add_pd(fjz1,tz);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 r12 = _mm256_mul_pd(rsq12,rinv12);
923 r12 = _mm256_andnot_pd(dummy_mask,r12);
925 /* Calculate table index by multiplying r with table scale and truncate to integer */
926 rt = _mm256_mul_pd(r12,vftabscale);
927 vfitab = _mm256_cvttpd_epi32(rt);
928 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
929 vfitab = _mm_slli_epi32(vfitab,2);
931 /* CUBIC SPLINE TABLE ELECTROSTATICS */
932 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
933 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
934 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
935 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
936 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
937 Heps = _mm256_mul_pd(vfeps,H);
938 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
939 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
940 velec = _mm256_mul_pd(qq12,VV);
941 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
942 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
944 /* Update potential sum for this i atom from the interaction with this j atom. */
945 velec = _mm256_andnot_pd(dummy_mask,velec);
946 velecsum = _mm256_add_pd(velecsum,velec);
950 fscal = _mm256_andnot_pd(dummy_mask,fscal);
952 /* Calculate temporary vectorial force */
953 tx = _mm256_mul_pd(fscal,dx12);
954 ty = _mm256_mul_pd(fscal,dy12);
955 tz = _mm256_mul_pd(fscal,dz12);
957 /* Update vectorial force */
958 fix1 = _mm256_add_pd(fix1,tx);
959 fiy1 = _mm256_add_pd(fiy1,ty);
960 fiz1 = _mm256_add_pd(fiz1,tz);
962 fjx2 = _mm256_add_pd(fjx2,tx);
963 fjy2 = _mm256_add_pd(fjy2,ty);
964 fjz2 = _mm256_add_pd(fjz2,tz);
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 r13 = _mm256_mul_pd(rsq13,rinv13);
971 r13 = _mm256_andnot_pd(dummy_mask,r13);
973 /* Calculate table index by multiplying r with table scale and truncate to integer */
974 rt = _mm256_mul_pd(r13,vftabscale);
975 vfitab = _mm256_cvttpd_epi32(rt);
976 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
977 vfitab = _mm_slli_epi32(vfitab,2);
979 /* CUBIC SPLINE TABLE ELECTROSTATICS */
980 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
981 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
982 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
983 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
984 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
985 Heps = _mm256_mul_pd(vfeps,H);
986 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
987 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
988 velec = _mm256_mul_pd(qq13,VV);
989 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
990 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec = _mm256_andnot_pd(dummy_mask,velec);
994 velecsum = _mm256_add_pd(velecsum,velec);
998 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1000 /* Calculate temporary vectorial force */
1001 tx = _mm256_mul_pd(fscal,dx13);
1002 ty = _mm256_mul_pd(fscal,dy13);
1003 tz = _mm256_mul_pd(fscal,dz13);
1005 /* Update vectorial force */
1006 fix1 = _mm256_add_pd(fix1,tx);
1007 fiy1 = _mm256_add_pd(fiy1,ty);
1008 fiz1 = _mm256_add_pd(fiz1,tz);
1010 fjx3 = _mm256_add_pd(fjx3,tx);
1011 fjy3 = _mm256_add_pd(fjy3,ty);
1012 fjz3 = _mm256_add_pd(fjz3,tz);
1014 /**************************
1015 * CALCULATE INTERACTIONS *
1016 **************************/
1018 r21 = _mm256_mul_pd(rsq21,rinv21);
1019 r21 = _mm256_andnot_pd(dummy_mask,r21);
1021 /* Calculate table index by multiplying r with table scale and truncate to integer */
1022 rt = _mm256_mul_pd(r21,vftabscale);
1023 vfitab = _mm256_cvttpd_epi32(rt);
1024 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1025 vfitab = _mm_slli_epi32(vfitab,2);
1027 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1028 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1029 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1030 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1031 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1032 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1033 Heps = _mm256_mul_pd(vfeps,H);
1034 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1035 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1036 velec = _mm256_mul_pd(qq21,VV);
1037 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1038 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1040 /* Update potential sum for this i atom from the interaction with this j atom. */
1041 velec = _mm256_andnot_pd(dummy_mask,velec);
1042 velecsum = _mm256_add_pd(velecsum,velec);
1046 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1048 /* Calculate temporary vectorial force */
1049 tx = _mm256_mul_pd(fscal,dx21);
1050 ty = _mm256_mul_pd(fscal,dy21);
1051 tz = _mm256_mul_pd(fscal,dz21);
1053 /* Update vectorial force */
1054 fix2 = _mm256_add_pd(fix2,tx);
1055 fiy2 = _mm256_add_pd(fiy2,ty);
1056 fiz2 = _mm256_add_pd(fiz2,tz);
1058 fjx1 = _mm256_add_pd(fjx1,tx);
1059 fjy1 = _mm256_add_pd(fjy1,ty);
1060 fjz1 = _mm256_add_pd(fjz1,tz);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 r22 = _mm256_mul_pd(rsq22,rinv22);
1067 r22 = _mm256_andnot_pd(dummy_mask,r22);
1069 /* Calculate table index by multiplying r with table scale and truncate to integer */
1070 rt = _mm256_mul_pd(r22,vftabscale);
1071 vfitab = _mm256_cvttpd_epi32(rt);
1072 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1073 vfitab = _mm_slli_epi32(vfitab,2);
1075 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1076 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1077 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1078 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1079 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1080 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1081 Heps = _mm256_mul_pd(vfeps,H);
1082 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1083 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1084 velec = _mm256_mul_pd(qq22,VV);
1085 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1086 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1088 /* Update potential sum for this i atom from the interaction with this j atom. */
1089 velec = _mm256_andnot_pd(dummy_mask,velec);
1090 velecsum = _mm256_add_pd(velecsum,velec);
1094 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1096 /* Calculate temporary vectorial force */
1097 tx = _mm256_mul_pd(fscal,dx22);
1098 ty = _mm256_mul_pd(fscal,dy22);
1099 tz = _mm256_mul_pd(fscal,dz22);
1101 /* Update vectorial force */
1102 fix2 = _mm256_add_pd(fix2,tx);
1103 fiy2 = _mm256_add_pd(fiy2,ty);
1104 fiz2 = _mm256_add_pd(fiz2,tz);
1106 fjx2 = _mm256_add_pd(fjx2,tx);
1107 fjy2 = _mm256_add_pd(fjy2,ty);
1108 fjz2 = _mm256_add_pd(fjz2,tz);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 r23 = _mm256_mul_pd(rsq23,rinv23);
1115 r23 = _mm256_andnot_pd(dummy_mask,r23);
1117 /* Calculate table index by multiplying r with table scale and truncate to integer */
1118 rt = _mm256_mul_pd(r23,vftabscale);
1119 vfitab = _mm256_cvttpd_epi32(rt);
1120 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1121 vfitab = _mm_slli_epi32(vfitab,2);
1123 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1124 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1125 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1126 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1127 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1128 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1129 Heps = _mm256_mul_pd(vfeps,H);
1130 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1131 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1132 velec = _mm256_mul_pd(qq23,VV);
1133 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1134 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1136 /* Update potential sum for this i atom from the interaction with this j atom. */
1137 velec = _mm256_andnot_pd(dummy_mask,velec);
1138 velecsum = _mm256_add_pd(velecsum,velec);
1142 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1144 /* Calculate temporary vectorial force */
1145 tx = _mm256_mul_pd(fscal,dx23);
1146 ty = _mm256_mul_pd(fscal,dy23);
1147 tz = _mm256_mul_pd(fscal,dz23);
1149 /* Update vectorial force */
1150 fix2 = _mm256_add_pd(fix2,tx);
1151 fiy2 = _mm256_add_pd(fiy2,ty);
1152 fiz2 = _mm256_add_pd(fiz2,tz);
1154 fjx3 = _mm256_add_pd(fjx3,tx);
1155 fjy3 = _mm256_add_pd(fjy3,ty);
1156 fjz3 = _mm256_add_pd(fjz3,tz);
1158 /**************************
1159 * CALCULATE INTERACTIONS *
1160 **************************/
1162 r31 = _mm256_mul_pd(rsq31,rinv31);
1163 r31 = _mm256_andnot_pd(dummy_mask,r31);
1165 /* Calculate table index by multiplying r with table scale and truncate to integer */
1166 rt = _mm256_mul_pd(r31,vftabscale);
1167 vfitab = _mm256_cvttpd_epi32(rt);
1168 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1169 vfitab = _mm_slli_epi32(vfitab,2);
1171 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1172 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1173 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1174 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1175 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1176 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1177 Heps = _mm256_mul_pd(vfeps,H);
1178 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1179 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1180 velec = _mm256_mul_pd(qq31,VV);
1181 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1182 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1184 /* Update potential sum for this i atom from the interaction with this j atom. */
1185 velec = _mm256_andnot_pd(dummy_mask,velec);
1186 velecsum = _mm256_add_pd(velecsum,velec);
1190 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1192 /* Calculate temporary vectorial force */
1193 tx = _mm256_mul_pd(fscal,dx31);
1194 ty = _mm256_mul_pd(fscal,dy31);
1195 tz = _mm256_mul_pd(fscal,dz31);
1197 /* Update vectorial force */
1198 fix3 = _mm256_add_pd(fix3,tx);
1199 fiy3 = _mm256_add_pd(fiy3,ty);
1200 fiz3 = _mm256_add_pd(fiz3,tz);
1202 fjx1 = _mm256_add_pd(fjx1,tx);
1203 fjy1 = _mm256_add_pd(fjy1,ty);
1204 fjz1 = _mm256_add_pd(fjz1,tz);
1206 /**************************
1207 * CALCULATE INTERACTIONS *
1208 **************************/
1210 r32 = _mm256_mul_pd(rsq32,rinv32);
1211 r32 = _mm256_andnot_pd(dummy_mask,r32);
1213 /* Calculate table index by multiplying r with table scale and truncate to integer */
1214 rt = _mm256_mul_pd(r32,vftabscale);
1215 vfitab = _mm256_cvttpd_epi32(rt);
1216 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1217 vfitab = _mm_slli_epi32(vfitab,2);
1219 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1220 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1221 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1222 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1223 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1224 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1225 Heps = _mm256_mul_pd(vfeps,H);
1226 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1227 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1228 velec = _mm256_mul_pd(qq32,VV);
1229 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1230 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
1232 /* Update potential sum for this i atom from the interaction with this j atom. */
1233 velec = _mm256_andnot_pd(dummy_mask,velec);
1234 velecsum = _mm256_add_pd(velecsum,velec);
1238 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1240 /* Calculate temporary vectorial force */
1241 tx = _mm256_mul_pd(fscal,dx32);
1242 ty = _mm256_mul_pd(fscal,dy32);
1243 tz = _mm256_mul_pd(fscal,dz32);
1245 /* Update vectorial force */
1246 fix3 = _mm256_add_pd(fix3,tx);
1247 fiy3 = _mm256_add_pd(fiy3,ty);
1248 fiz3 = _mm256_add_pd(fiz3,tz);
1250 fjx2 = _mm256_add_pd(fjx2,tx);
1251 fjy2 = _mm256_add_pd(fjy2,ty);
1252 fjz2 = _mm256_add_pd(fjz2,tz);
1254 /**************************
1255 * CALCULATE INTERACTIONS *
1256 **************************/
1258 r33 = _mm256_mul_pd(rsq33,rinv33);
1259 r33 = _mm256_andnot_pd(dummy_mask,r33);
1261 /* Calculate table index by multiplying r with table scale and truncate to integer */
1262 rt = _mm256_mul_pd(r33,vftabscale);
1263 vfitab = _mm256_cvttpd_epi32(rt);
1264 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1265 vfitab = _mm_slli_epi32(vfitab,2);
1267 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1268 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1269 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1270 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1271 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1272 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1273 Heps = _mm256_mul_pd(vfeps,H);
1274 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1275 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1276 velec = _mm256_mul_pd(qq33,VV);
1277 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1278 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
1280 /* Update potential sum for this i atom from the interaction with this j atom. */
1281 velec = _mm256_andnot_pd(dummy_mask,velec);
1282 velecsum = _mm256_add_pd(velecsum,velec);
1286 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1288 /* Calculate temporary vectorial force */
1289 tx = _mm256_mul_pd(fscal,dx33);
1290 ty = _mm256_mul_pd(fscal,dy33);
1291 tz = _mm256_mul_pd(fscal,dz33);
1293 /* Update vectorial force */
1294 fix3 = _mm256_add_pd(fix3,tx);
1295 fiy3 = _mm256_add_pd(fiy3,ty);
1296 fiz3 = _mm256_add_pd(fiz3,tz);
1298 fjx3 = _mm256_add_pd(fjx3,tx);
1299 fjy3 = _mm256_add_pd(fjy3,ty);
1300 fjz3 = _mm256_add_pd(fjz3,tz);
1302 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1303 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1304 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1305 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1307 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1308 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1309 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1311 /* Inner loop uses 431 flops */
1314 /* End of innermost loop */
1316 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1317 f+i_coord_offset,fshift+i_shift_offset);
1320 /* Update potential energies */
1321 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1322 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1324 /* Increment number of inner iterations */
1325 inneriter += j_index_end - j_index_start;
1327 /* Outer loop uses 26 flops */
1330 /* Increment number of outer iterations */
1333 /* Update outer/inner flops */
1335 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*431);
1338 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_double
1339 * Electrostatics interaction: CubicSplineTable
1340 * VdW interaction: LennardJones
1341 * Geometry: Water4-Water4
1342 * Calculate force/pot: Force
1345 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_double
1346 (t_nblist * gmx_restrict nlist,
1347 rvec * gmx_restrict xx,
1348 rvec * gmx_restrict ff,
1349 t_forcerec * gmx_restrict fr,
1350 t_mdatoms * gmx_restrict mdatoms,
1351 nb_kernel_data_t * gmx_restrict kernel_data,
1352 t_nrnb * gmx_restrict nrnb)
1354 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1355 * just 0 for non-waters.
1356 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1357 * jnr indices corresponding to data put in the four positions in the SIMD register.
1359 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1360 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1361 int jnrA,jnrB,jnrC,jnrD;
1362 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1363 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1364 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1365 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1366 real rcutoff_scalar;
1367 real *shiftvec,*fshift,*x,*f;
1368 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1369 real scratch[4*DIM];
1370 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1371 real * vdwioffsetptr0;
1372 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1373 real * vdwioffsetptr1;
1374 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1375 real * vdwioffsetptr2;
1376 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1377 real * vdwioffsetptr3;
1378 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1379 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1380 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1381 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1382 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1383 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1384 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1385 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1386 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1387 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1388 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1389 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1390 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1391 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1392 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1393 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1394 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1395 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1396 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1397 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1400 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1403 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1404 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1406 __m128i ifour = _mm_set1_epi32(4);
1407 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1409 __m256d dummy_mask,cutoff_mask;
1410 __m128 tmpmask0,tmpmask1;
1411 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1412 __m256d one = _mm256_set1_pd(1.0);
1413 __m256d two = _mm256_set1_pd(2.0);
1419 jindex = nlist->jindex;
1421 shiftidx = nlist->shift;
1423 shiftvec = fr->shift_vec[0];
1424 fshift = fr->fshift[0];
1425 facel = _mm256_set1_pd(fr->epsfac);
1426 charge = mdatoms->chargeA;
1427 nvdwtype = fr->ntype;
1428 vdwparam = fr->nbfp;
1429 vdwtype = mdatoms->typeA;
1431 vftab = kernel_data->table_elec->data;
1432 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
1434 /* Setup water-specific parameters */
1435 inr = nlist->iinr[0];
1436 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1437 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1438 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1439 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1441 jq1 = _mm256_set1_pd(charge[inr+1]);
1442 jq2 = _mm256_set1_pd(charge[inr+2]);
1443 jq3 = _mm256_set1_pd(charge[inr+3]);
1444 vdwjidx0A = 2*vdwtype[inr+0];
1445 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1446 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1447 qq11 = _mm256_mul_pd(iq1,jq1);
1448 qq12 = _mm256_mul_pd(iq1,jq2);
1449 qq13 = _mm256_mul_pd(iq1,jq3);
1450 qq21 = _mm256_mul_pd(iq2,jq1);
1451 qq22 = _mm256_mul_pd(iq2,jq2);
1452 qq23 = _mm256_mul_pd(iq2,jq3);
1453 qq31 = _mm256_mul_pd(iq3,jq1);
1454 qq32 = _mm256_mul_pd(iq3,jq2);
1455 qq33 = _mm256_mul_pd(iq3,jq3);
1457 /* Avoid stupid compiler warnings */
1458 jnrA = jnrB = jnrC = jnrD = 0;
1459 j_coord_offsetA = 0;
1460 j_coord_offsetB = 0;
1461 j_coord_offsetC = 0;
1462 j_coord_offsetD = 0;
1467 for(iidx=0;iidx<4*DIM;iidx++)
1469 scratch[iidx] = 0.0;
1472 /* Start outer loop over neighborlists */
1473 for(iidx=0; iidx<nri; iidx++)
1475 /* Load shift vector for this list */
1476 i_shift_offset = DIM*shiftidx[iidx];
1478 /* Load limits for loop over neighbors */
1479 j_index_start = jindex[iidx];
1480 j_index_end = jindex[iidx+1];
1482 /* Get outer coordinate index */
1484 i_coord_offset = DIM*inr;
1486 /* Load i particle coords and add shift vector */
1487 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1488 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1490 fix0 = _mm256_setzero_pd();
1491 fiy0 = _mm256_setzero_pd();
1492 fiz0 = _mm256_setzero_pd();
1493 fix1 = _mm256_setzero_pd();
1494 fiy1 = _mm256_setzero_pd();
1495 fiz1 = _mm256_setzero_pd();
1496 fix2 = _mm256_setzero_pd();
1497 fiy2 = _mm256_setzero_pd();
1498 fiz2 = _mm256_setzero_pd();
1499 fix3 = _mm256_setzero_pd();
1500 fiy3 = _mm256_setzero_pd();
1501 fiz3 = _mm256_setzero_pd();
1503 /* Start inner kernel loop */
1504 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1507 /* Get j neighbor index, and coordinate index */
1509 jnrB = jjnr[jidx+1];
1510 jnrC = jjnr[jidx+2];
1511 jnrD = jjnr[jidx+3];
1512 j_coord_offsetA = DIM*jnrA;
1513 j_coord_offsetB = DIM*jnrB;
1514 j_coord_offsetC = DIM*jnrC;
1515 j_coord_offsetD = DIM*jnrD;
1517 /* load j atom coordinates */
1518 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1519 x+j_coord_offsetC,x+j_coord_offsetD,
1520 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1521 &jy2,&jz2,&jx3,&jy3,&jz3);
1523 /* Calculate displacement vector */
1524 dx00 = _mm256_sub_pd(ix0,jx0);
1525 dy00 = _mm256_sub_pd(iy0,jy0);
1526 dz00 = _mm256_sub_pd(iz0,jz0);
1527 dx11 = _mm256_sub_pd(ix1,jx1);
1528 dy11 = _mm256_sub_pd(iy1,jy1);
1529 dz11 = _mm256_sub_pd(iz1,jz1);
1530 dx12 = _mm256_sub_pd(ix1,jx2);
1531 dy12 = _mm256_sub_pd(iy1,jy2);
1532 dz12 = _mm256_sub_pd(iz1,jz2);
1533 dx13 = _mm256_sub_pd(ix1,jx3);
1534 dy13 = _mm256_sub_pd(iy1,jy3);
1535 dz13 = _mm256_sub_pd(iz1,jz3);
1536 dx21 = _mm256_sub_pd(ix2,jx1);
1537 dy21 = _mm256_sub_pd(iy2,jy1);
1538 dz21 = _mm256_sub_pd(iz2,jz1);
1539 dx22 = _mm256_sub_pd(ix2,jx2);
1540 dy22 = _mm256_sub_pd(iy2,jy2);
1541 dz22 = _mm256_sub_pd(iz2,jz2);
1542 dx23 = _mm256_sub_pd(ix2,jx3);
1543 dy23 = _mm256_sub_pd(iy2,jy3);
1544 dz23 = _mm256_sub_pd(iz2,jz3);
1545 dx31 = _mm256_sub_pd(ix3,jx1);
1546 dy31 = _mm256_sub_pd(iy3,jy1);
1547 dz31 = _mm256_sub_pd(iz3,jz1);
1548 dx32 = _mm256_sub_pd(ix3,jx2);
1549 dy32 = _mm256_sub_pd(iy3,jy2);
1550 dz32 = _mm256_sub_pd(iz3,jz2);
1551 dx33 = _mm256_sub_pd(ix3,jx3);
1552 dy33 = _mm256_sub_pd(iy3,jy3);
1553 dz33 = _mm256_sub_pd(iz3,jz3);
1555 /* Calculate squared distance and things based on it */
1556 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1557 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1558 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1559 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1560 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1561 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1562 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1563 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1564 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1565 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1567 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1568 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1569 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1570 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1571 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1572 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1573 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1574 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1575 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1577 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1579 fjx0 = _mm256_setzero_pd();
1580 fjy0 = _mm256_setzero_pd();
1581 fjz0 = _mm256_setzero_pd();
1582 fjx1 = _mm256_setzero_pd();
1583 fjy1 = _mm256_setzero_pd();
1584 fjz1 = _mm256_setzero_pd();
1585 fjx2 = _mm256_setzero_pd();
1586 fjy2 = _mm256_setzero_pd();
1587 fjz2 = _mm256_setzero_pd();
1588 fjx3 = _mm256_setzero_pd();
1589 fjy3 = _mm256_setzero_pd();
1590 fjz3 = _mm256_setzero_pd();
1592 /**************************
1593 * CALCULATE INTERACTIONS *
1594 **************************/
1596 /* LENNARD-JONES DISPERSION/REPULSION */
1598 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1599 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1603 /* Calculate temporary vectorial force */
1604 tx = _mm256_mul_pd(fscal,dx00);
1605 ty = _mm256_mul_pd(fscal,dy00);
1606 tz = _mm256_mul_pd(fscal,dz00);
1608 /* Update vectorial force */
1609 fix0 = _mm256_add_pd(fix0,tx);
1610 fiy0 = _mm256_add_pd(fiy0,ty);
1611 fiz0 = _mm256_add_pd(fiz0,tz);
1613 fjx0 = _mm256_add_pd(fjx0,tx);
1614 fjy0 = _mm256_add_pd(fjy0,ty);
1615 fjz0 = _mm256_add_pd(fjz0,tz);
1617 /**************************
1618 * CALCULATE INTERACTIONS *
1619 **************************/
1621 r11 = _mm256_mul_pd(rsq11,rinv11);
1623 /* Calculate table index by multiplying r with table scale and truncate to integer */
1624 rt = _mm256_mul_pd(r11,vftabscale);
1625 vfitab = _mm256_cvttpd_epi32(rt);
1626 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1627 vfitab = _mm_slli_epi32(vfitab,2);
1629 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1630 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1631 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1632 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1633 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1634 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1635 Heps = _mm256_mul_pd(vfeps,H);
1636 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1637 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1638 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1642 /* Calculate temporary vectorial force */
1643 tx = _mm256_mul_pd(fscal,dx11);
1644 ty = _mm256_mul_pd(fscal,dy11);
1645 tz = _mm256_mul_pd(fscal,dz11);
1647 /* Update vectorial force */
1648 fix1 = _mm256_add_pd(fix1,tx);
1649 fiy1 = _mm256_add_pd(fiy1,ty);
1650 fiz1 = _mm256_add_pd(fiz1,tz);
1652 fjx1 = _mm256_add_pd(fjx1,tx);
1653 fjy1 = _mm256_add_pd(fjy1,ty);
1654 fjz1 = _mm256_add_pd(fjz1,tz);
1656 /**************************
1657 * CALCULATE INTERACTIONS *
1658 **************************/
1660 r12 = _mm256_mul_pd(rsq12,rinv12);
1662 /* Calculate table index by multiplying r with table scale and truncate to integer */
1663 rt = _mm256_mul_pd(r12,vftabscale);
1664 vfitab = _mm256_cvttpd_epi32(rt);
1665 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1666 vfitab = _mm_slli_epi32(vfitab,2);
1668 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1669 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1670 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1671 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1672 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1673 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1674 Heps = _mm256_mul_pd(vfeps,H);
1675 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1676 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1677 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1681 /* Calculate temporary vectorial force */
1682 tx = _mm256_mul_pd(fscal,dx12);
1683 ty = _mm256_mul_pd(fscal,dy12);
1684 tz = _mm256_mul_pd(fscal,dz12);
1686 /* Update vectorial force */
1687 fix1 = _mm256_add_pd(fix1,tx);
1688 fiy1 = _mm256_add_pd(fiy1,ty);
1689 fiz1 = _mm256_add_pd(fiz1,tz);
1691 fjx2 = _mm256_add_pd(fjx2,tx);
1692 fjy2 = _mm256_add_pd(fjy2,ty);
1693 fjz2 = _mm256_add_pd(fjz2,tz);
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 r13 = _mm256_mul_pd(rsq13,rinv13);
1701 /* Calculate table index by multiplying r with table scale and truncate to integer */
1702 rt = _mm256_mul_pd(r13,vftabscale);
1703 vfitab = _mm256_cvttpd_epi32(rt);
1704 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1705 vfitab = _mm_slli_epi32(vfitab,2);
1707 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1708 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1709 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1710 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1711 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1712 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1713 Heps = _mm256_mul_pd(vfeps,H);
1714 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1715 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1716 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
1720 /* Calculate temporary vectorial force */
1721 tx = _mm256_mul_pd(fscal,dx13);
1722 ty = _mm256_mul_pd(fscal,dy13);
1723 tz = _mm256_mul_pd(fscal,dz13);
1725 /* Update vectorial force */
1726 fix1 = _mm256_add_pd(fix1,tx);
1727 fiy1 = _mm256_add_pd(fiy1,ty);
1728 fiz1 = _mm256_add_pd(fiz1,tz);
1730 fjx3 = _mm256_add_pd(fjx3,tx);
1731 fjy3 = _mm256_add_pd(fjy3,ty);
1732 fjz3 = _mm256_add_pd(fjz3,tz);
1734 /**************************
1735 * CALCULATE INTERACTIONS *
1736 **************************/
1738 r21 = _mm256_mul_pd(rsq21,rinv21);
1740 /* Calculate table index by multiplying r with table scale and truncate to integer */
1741 rt = _mm256_mul_pd(r21,vftabscale);
1742 vfitab = _mm256_cvttpd_epi32(rt);
1743 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1744 vfitab = _mm_slli_epi32(vfitab,2);
1746 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1747 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1748 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1749 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1750 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1751 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1752 Heps = _mm256_mul_pd(vfeps,H);
1753 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1754 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1755 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1759 /* Calculate temporary vectorial force */
1760 tx = _mm256_mul_pd(fscal,dx21);
1761 ty = _mm256_mul_pd(fscal,dy21);
1762 tz = _mm256_mul_pd(fscal,dz21);
1764 /* Update vectorial force */
1765 fix2 = _mm256_add_pd(fix2,tx);
1766 fiy2 = _mm256_add_pd(fiy2,ty);
1767 fiz2 = _mm256_add_pd(fiz2,tz);
1769 fjx1 = _mm256_add_pd(fjx1,tx);
1770 fjy1 = _mm256_add_pd(fjy1,ty);
1771 fjz1 = _mm256_add_pd(fjz1,tz);
1773 /**************************
1774 * CALCULATE INTERACTIONS *
1775 **************************/
1777 r22 = _mm256_mul_pd(rsq22,rinv22);
1779 /* Calculate table index by multiplying r with table scale and truncate to integer */
1780 rt = _mm256_mul_pd(r22,vftabscale);
1781 vfitab = _mm256_cvttpd_epi32(rt);
1782 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1783 vfitab = _mm_slli_epi32(vfitab,2);
1785 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1786 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1787 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1788 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1789 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1790 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1791 Heps = _mm256_mul_pd(vfeps,H);
1792 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1793 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1794 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1798 /* Calculate temporary vectorial force */
1799 tx = _mm256_mul_pd(fscal,dx22);
1800 ty = _mm256_mul_pd(fscal,dy22);
1801 tz = _mm256_mul_pd(fscal,dz22);
1803 /* Update vectorial force */
1804 fix2 = _mm256_add_pd(fix2,tx);
1805 fiy2 = _mm256_add_pd(fiy2,ty);
1806 fiz2 = _mm256_add_pd(fiz2,tz);
1808 fjx2 = _mm256_add_pd(fjx2,tx);
1809 fjy2 = _mm256_add_pd(fjy2,ty);
1810 fjz2 = _mm256_add_pd(fjz2,tz);
1812 /**************************
1813 * CALCULATE INTERACTIONS *
1814 **************************/
1816 r23 = _mm256_mul_pd(rsq23,rinv23);
1818 /* Calculate table index by multiplying r with table scale and truncate to integer */
1819 rt = _mm256_mul_pd(r23,vftabscale);
1820 vfitab = _mm256_cvttpd_epi32(rt);
1821 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1822 vfitab = _mm_slli_epi32(vfitab,2);
1824 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1825 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1826 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1827 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1828 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1829 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1830 Heps = _mm256_mul_pd(vfeps,H);
1831 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1832 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1833 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1837 /* Calculate temporary vectorial force */
1838 tx = _mm256_mul_pd(fscal,dx23);
1839 ty = _mm256_mul_pd(fscal,dy23);
1840 tz = _mm256_mul_pd(fscal,dz23);
1842 /* Update vectorial force */
1843 fix2 = _mm256_add_pd(fix2,tx);
1844 fiy2 = _mm256_add_pd(fiy2,ty);
1845 fiz2 = _mm256_add_pd(fiz2,tz);
1847 fjx3 = _mm256_add_pd(fjx3,tx);
1848 fjy3 = _mm256_add_pd(fjy3,ty);
1849 fjz3 = _mm256_add_pd(fjz3,tz);
1851 /**************************
1852 * CALCULATE INTERACTIONS *
1853 **************************/
1855 r31 = _mm256_mul_pd(rsq31,rinv31);
1857 /* Calculate table index by multiplying r with table scale and truncate to integer */
1858 rt = _mm256_mul_pd(r31,vftabscale);
1859 vfitab = _mm256_cvttpd_epi32(rt);
1860 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1861 vfitab = _mm_slli_epi32(vfitab,2);
1863 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1864 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1865 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1866 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1867 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1868 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1869 Heps = _mm256_mul_pd(vfeps,H);
1870 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1871 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1872 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1876 /* Calculate temporary vectorial force */
1877 tx = _mm256_mul_pd(fscal,dx31);
1878 ty = _mm256_mul_pd(fscal,dy31);
1879 tz = _mm256_mul_pd(fscal,dz31);
1881 /* Update vectorial force */
1882 fix3 = _mm256_add_pd(fix3,tx);
1883 fiy3 = _mm256_add_pd(fiy3,ty);
1884 fiz3 = _mm256_add_pd(fiz3,tz);
1886 fjx1 = _mm256_add_pd(fjx1,tx);
1887 fjy1 = _mm256_add_pd(fjy1,ty);
1888 fjz1 = _mm256_add_pd(fjz1,tz);
1890 /**************************
1891 * CALCULATE INTERACTIONS *
1892 **************************/
1894 r32 = _mm256_mul_pd(rsq32,rinv32);
1896 /* Calculate table index by multiplying r with table scale and truncate to integer */
1897 rt = _mm256_mul_pd(r32,vftabscale);
1898 vfitab = _mm256_cvttpd_epi32(rt);
1899 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1900 vfitab = _mm_slli_epi32(vfitab,2);
1902 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1903 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1904 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1905 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1906 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1907 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1908 Heps = _mm256_mul_pd(vfeps,H);
1909 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1910 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1911 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
1915 /* Calculate temporary vectorial force */
1916 tx = _mm256_mul_pd(fscal,dx32);
1917 ty = _mm256_mul_pd(fscal,dy32);
1918 tz = _mm256_mul_pd(fscal,dz32);
1920 /* Update vectorial force */
1921 fix3 = _mm256_add_pd(fix3,tx);
1922 fiy3 = _mm256_add_pd(fiy3,ty);
1923 fiz3 = _mm256_add_pd(fiz3,tz);
1925 fjx2 = _mm256_add_pd(fjx2,tx);
1926 fjy2 = _mm256_add_pd(fjy2,ty);
1927 fjz2 = _mm256_add_pd(fjz2,tz);
1929 /**************************
1930 * CALCULATE INTERACTIONS *
1931 **************************/
1933 r33 = _mm256_mul_pd(rsq33,rinv33);
1935 /* Calculate table index by multiplying r with table scale and truncate to integer */
1936 rt = _mm256_mul_pd(r33,vftabscale);
1937 vfitab = _mm256_cvttpd_epi32(rt);
1938 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1939 vfitab = _mm_slli_epi32(vfitab,2);
1941 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1942 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1943 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1944 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1945 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1946 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1947 Heps = _mm256_mul_pd(vfeps,H);
1948 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1949 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1950 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
1954 /* Calculate temporary vectorial force */
1955 tx = _mm256_mul_pd(fscal,dx33);
1956 ty = _mm256_mul_pd(fscal,dy33);
1957 tz = _mm256_mul_pd(fscal,dz33);
1959 /* Update vectorial force */
1960 fix3 = _mm256_add_pd(fix3,tx);
1961 fiy3 = _mm256_add_pd(fiy3,ty);
1962 fiz3 = _mm256_add_pd(fiz3,tz);
1964 fjx3 = _mm256_add_pd(fjx3,tx);
1965 fjy3 = _mm256_add_pd(fjy3,ty);
1966 fjz3 = _mm256_add_pd(fjz3,tz);
1968 fjptrA = f+j_coord_offsetA;
1969 fjptrB = f+j_coord_offsetB;
1970 fjptrC = f+j_coord_offsetC;
1971 fjptrD = f+j_coord_offsetD;
1973 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1974 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1975 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1977 /* Inner loop uses 381 flops */
1980 if(jidx<j_index_end)
1983 /* Get j neighbor index, and coordinate index */
1984 jnrlistA = jjnr[jidx];
1985 jnrlistB = jjnr[jidx+1];
1986 jnrlistC = jjnr[jidx+2];
1987 jnrlistD = jjnr[jidx+3];
1988 /* Sign of each element will be negative for non-real atoms.
1989 * This mask will have all bits set in each double for dummy entries and be 0x0 for real ones,
1990 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1992 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1994 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1995 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1996 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1998 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1999 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2000 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2001 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2002 j_coord_offsetA = DIM*jnrA;
2003 j_coord_offsetB = DIM*jnrB;
2004 j_coord_offsetC = DIM*jnrC;
2005 j_coord_offsetD = DIM*jnrD;
2007 /* load j atom coordinates */
2008 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2009 x+j_coord_offsetC,x+j_coord_offsetD,
2010 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2011 &jy2,&jz2,&jx3,&jy3,&jz3);
2013 /* Calculate displacement vector */
2014 dx00 = _mm256_sub_pd(ix0,jx0);
2015 dy00 = _mm256_sub_pd(iy0,jy0);
2016 dz00 = _mm256_sub_pd(iz0,jz0);
2017 dx11 = _mm256_sub_pd(ix1,jx1);
2018 dy11 = _mm256_sub_pd(iy1,jy1);
2019 dz11 = _mm256_sub_pd(iz1,jz1);
2020 dx12 = _mm256_sub_pd(ix1,jx2);
2021 dy12 = _mm256_sub_pd(iy1,jy2);
2022 dz12 = _mm256_sub_pd(iz1,jz2);
2023 dx13 = _mm256_sub_pd(ix1,jx3);
2024 dy13 = _mm256_sub_pd(iy1,jy3);
2025 dz13 = _mm256_sub_pd(iz1,jz3);
2026 dx21 = _mm256_sub_pd(ix2,jx1);
2027 dy21 = _mm256_sub_pd(iy2,jy1);
2028 dz21 = _mm256_sub_pd(iz2,jz1);
2029 dx22 = _mm256_sub_pd(ix2,jx2);
2030 dy22 = _mm256_sub_pd(iy2,jy2);
2031 dz22 = _mm256_sub_pd(iz2,jz2);
2032 dx23 = _mm256_sub_pd(ix2,jx3);
2033 dy23 = _mm256_sub_pd(iy2,jy3);
2034 dz23 = _mm256_sub_pd(iz2,jz3);
2035 dx31 = _mm256_sub_pd(ix3,jx1);
2036 dy31 = _mm256_sub_pd(iy3,jy1);
2037 dz31 = _mm256_sub_pd(iz3,jz1);
2038 dx32 = _mm256_sub_pd(ix3,jx2);
2039 dy32 = _mm256_sub_pd(iy3,jy2);
2040 dz32 = _mm256_sub_pd(iz3,jz2);
2041 dx33 = _mm256_sub_pd(ix3,jx3);
2042 dy33 = _mm256_sub_pd(iy3,jy3);
2043 dz33 = _mm256_sub_pd(iz3,jz3);
2045 /* Calculate squared distance and things based on it */
2046 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2047 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2048 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2049 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2050 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2051 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2052 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2053 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2054 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2055 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2057 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2058 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2059 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2060 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2061 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2062 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2063 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2064 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2065 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2067 rinvsq00 = gmx_mm256_inv_pd(rsq00);
2069 fjx0 = _mm256_setzero_pd();
2070 fjy0 = _mm256_setzero_pd();
2071 fjz0 = _mm256_setzero_pd();
2072 fjx1 = _mm256_setzero_pd();
2073 fjy1 = _mm256_setzero_pd();
2074 fjz1 = _mm256_setzero_pd();
2075 fjx2 = _mm256_setzero_pd();
2076 fjy2 = _mm256_setzero_pd();
2077 fjz2 = _mm256_setzero_pd();
2078 fjx3 = _mm256_setzero_pd();
2079 fjy3 = _mm256_setzero_pd();
2080 fjz3 = _mm256_setzero_pd();
2082 /**************************
2083 * CALCULATE INTERACTIONS *
2084 **************************/
2086 /* LENNARD-JONES DISPERSION/REPULSION */
2088 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2089 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2093 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2095 /* Calculate temporary vectorial force */
2096 tx = _mm256_mul_pd(fscal,dx00);
2097 ty = _mm256_mul_pd(fscal,dy00);
2098 tz = _mm256_mul_pd(fscal,dz00);
2100 /* Update vectorial force */
2101 fix0 = _mm256_add_pd(fix0,tx);
2102 fiy0 = _mm256_add_pd(fiy0,ty);
2103 fiz0 = _mm256_add_pd(fiz0,tz);
2105 fjx0 = _mm256_add_pd(fjx0,tx);
2106 fjy0 = _mm256_add_pd(fjy0,ty);
2107 fjz0 = _mm256_add_pd(fjz0,tz);
2109 /**************************
2110 * CALCULATE INTERACTIONS *
2111 **************************/
2113 r11 = _mm256_mul_pd(rsq11,rinv11);
2114 r11 = _mm256_andnot_pd(dummy_mask,r11);
2116 /* Calculate table index by multiplying r with table scale and truncate to integer */
2117 rt = _mm256_mul_pd(r11,vftabscale);
2118 vfitab = _mm256_cvttpd_epi32(rt);
2119 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2120 vfitab = _mm_slli_epi32(vfitab,2);
2122 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2123 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2124 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2125 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2126 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2127 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2128 Heps = _mm256_mul_pd(vfeps,H);
2129 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2130 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2131 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2135 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2137 /* Calculate temporary vectorial force */
2138 tx = _mm256_mul_pd(fscal,dx11);
2139 ty = _mm256_mul_pd(fscal,dy11);
2140 tz = _mm256_mul_pd(fscal,dz11);
2142 /* Update vectorial force */
2143 fix1 = _mm256_add_pd(fix1,tx);
2144 fiy1 = _mm256_add_pd(fiy1,ty);
2145 fiz1 = _mm256_add_pd(fiz1,tz);
2147 fjx1 = _mm256_add_pd(fjx1,tx);
2148 fjy1 = _mm256_add_pd(fjy1,ty);
2149 fjz1 = _mm256_add_pd(fjz1,tz);
2151 /**************************
2152 * CALCULATE INTERACTIONS *
2153 **************************/
2155 r12 = _mm256_mul_pd(rsq12,rinv12);
2156 r12 = _mm256_andnot_pd(dummy_mask,r12);
2158 /* Calculate table index by multiplying r with table scale and truncate to integer */
2159 rt = _mm256_mul_pd(r12,vftabscale);
2160 vfitab = _mm256_cvttpd_epi32(rt);
2161 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2162 vfitab = _mm_slli_epi32(vfitab,2);
2164 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2165 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2166 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2167 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2168 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2169 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2170 Heps = _mm256_mul_pd(vfeps,H);
2171 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2172 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2173 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2177 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2179 /* Calculate temporary vectorial force */
2180 tx = _mm256_mul_pd(fscal,dx12);
2181 ty = _mm256_mul_pd(fscal,dy12);
2182 tz = _mm256_mul_pd(fscal,dz12);
2184 /* Update vectorial force */
2185 fix1 = _mm256_add_pd(fix1,tx);
2186 fiy1 = _mm256_add_pd(fiy1,ty);
2187 fiz1 = _mm256_add_pd(fiz1,tz);
2189 fjx2 = _mm256_add_pd(fjx2,tx);
2190 fjy2 = _mm256_add_pd(fjy2,ty);
2191 fjz2 = _mm256_add_pd(fjz2,tz);
2193 /**************************
2194 * CALCULATE INTERACTIONS *
2195 **************************/
2197 r13 = _mm256_mul_pd(rsq13,rinv13);
2198 r13 = _mm256_andnot_pd(dummy_mask,r13);
2200 /* Calculate table index by multiplying r with table scale and truncate to integer */
2201 rt = _mm256_mul_pd(r13,vftabscale);
2202 vfitab = _mm256_cvttpd_epi32(rt);
2203 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2204 vfitab = _mm_slli_epi32(vfitab,2);
2206 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2207 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2208 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2209 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2210 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2211 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2212 Heps = _mm256_mul_pd(vfeps,H);
2213 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2214 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2215 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
2219 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2221 /* Calculate temporary vectorial force */
2222 tx = _mm256_mul_pd(fscal,dx13);
2223 ty = _mm256_mul_pd(fscal,dy13);
2224 tz = _mm256_mul_pd(fscal,dz13);
2226 /* Update vectorial force */
2227 fix1 = _mm256_add_pd(fix1,tx);
2228 fiy1 = _mm256_add_pd(fiy1,ty);
2229 fiz1 = _mm256_add_pd(fiz1,tz);
2231 fjx3 = _mm256_add_pd(fjx3,tx);
2232 fjy3 = _mm256_add_pd(fjy3,ty);
2233 fjz3 = _mm256_add_pd(fjz3,tz);
2235 /**************************
2236 * CALCULATE INTERACTIONS *
2237 **************************/
2239 r21 = _mm256_mul_pd(rsq21,rinv21);
2240 r21 = _mm256_andnot_pd(dummy_mask,r21);
2242 /* Calculate table index by multiplying r with table scale and truncate to integer */
2243 rt = _mm256_mul_pd(r21,vftabscale);
2244 vfitab = _mm256_cvttpd_epi32(rt);
2245 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2246 vfitab = _mm_slli_epi32(vfitab,2);
2248 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2249 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2250 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2251 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2252 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2253 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2254 Heps = _mm256_mul_pd(vfeps,H);
2255 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2256 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2257 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2261 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2263 /* Calculate temporary vectorial force */
2264 tx = _mm256_mul_pd(fscal,dx21);
2265 ty = _mm256_mul_pd(fscal,dy21);
2266 tz = _mm256_mul_pd(fscal,dz21);
2268 /* Update vectorial force */
2269 fix2 = _mm256_add_pd(fix2,tx);
2270 fiy2 = _mm256_add_pd(fiy2,ty);
2271 fiz2 = _mm256_add_pd(fiz2,tz);
2273 fjx1 = _mm256_add_pd(fjx1,tx);
2274 fjy1 = _mm256_add_pd(fjy1,ty);
2275 fjz1 = _mm256_add_pd(fjz1,tz);
2277 /**************************
2278 * CALCULATE INTERACTIONS *
2279 **************************/
2281 r22 = _mm256_mul_pd(rsq22,rinv22);
2282 r22 = _mm256_andnot_pd(dummy_mask,r22);
2284 /* Calculate table index by multiplying r with table scale and truncate to integer */
2285 rt = _mm256_mul_pd(r22,vftabscale);
2286 vfitab = _mm256_cvttpd_epi32(rt);
2287 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2288 vfitab = _mm_slli_epi32(vfitab,2);
2290 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2291 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2292 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2293 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2294 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2295 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2296 Heps = _mm256_mul_pd(vfeps,H);
2297 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2298 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2299 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2303 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2305 /* Calculate temporary vectorial force */
2306 tx = _mm256_mul_pd(fscal,dx22);
2307 ty = _mm256_mul_pd(fscal,dy22);
2308 tz = _mm256_mul_pd(fscal,dz22);
2310 /* Update vectorial force */
2311 fix2 = _mm256_add_pd(fix2,tx);
2312 fiy2 = _mm256_add_pd(fiy2,ty);
2313 fiz2 = _mm256_add_pd(fiz2,tz);
2315 fjx2 = _mm256_add_pd(fjx2,tx);
2316 fjy2 = _mm256_add_pd(fjy2,ty);
2317 fjz2 = _mm256_add_pd(fjz2,tz);
2319 /**************************
2320 * CALCULATE INTERACTIONS *
2321 **************************/
2323 r23 = _mm256_mul_pd(rsq23,rinv23);
2324 r23 = _mm256_andnot_pd(dummy_mask,r23);
2326 /* Calculate table index by multiplying r with table scale and truncate to integer */
2327 rt = _mm256_mul_pd(r23,vftabscale);
2328 vfitab = _mm256_cvttpd_epi32(rt);
2329 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2330 vfitab = _mm_slli_epi32(vfitab,2);
2332 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2333 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2334 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2335 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2336 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2337 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2338 Heps = _mm256_mul_pd(vfeps,H);
2339 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2340 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2341 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
2345 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2347 /* Calculate temporary vectorial force */
2348 tx = _mm256_mul_pd(fscal,dx23);
2349 ty = _mm256_mul_pd(fscal,dy23);
2350 tz = _mm256_mul_pd(fscal,dz23);
2352 /* Update vectorial force */
2353 fix2 = _mm256_add_pd(fix2,tx);
2354 fiy2 = _mm256_add_pd(fiy2,ty);
2355 fiz2 = _mm256_add_pd(fiz2,tz);
2357 fjx3 = _mm256_add_pd(fjx3,tx);
2358 fjy3 = _mm256_add_pd(fjy3,ty);
2359 fjz3 = _mm256_add_pd(fjz3,tz);
2361 /**************************
2362 * CALCULATE INTERACTIONS *
2363 **************************/
2365 r31 = _mm256_mul_pd(rsq31,rinv31);
2366 r31 = _mm256_andnot_pd(dummy_mask,r31);
2368 /* Calculate table index by multiplying r with table scale and truncate to integer */
2369 rt = _mm256_mul_pd(r31,vftabscale);
2370 vfitab = _mm256_cvttpd_epi32(rt);
2371 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2372 vfitab = _mm_slli_epi32(vfitab,2);
2374 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2375 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2376 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2377 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2378 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2379 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2380 Heps = _mm256_mul_pd(vfeps,H);
2381 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2382 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2383 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
2387 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2389 /* Calculate temporary vectorial force */
2390 tx = _mm256_mul_pd(fscal,dx31);
2391 ty = _mm256_mul_pd(fscal,dy31);
2392 tz = _mm256_mul_pd(fscal,dz31);
2394 /* Update vectorial force */
2395 fix3 = _mm256_add_pd(fix3,tx);
2396 fiy3 = _mm256_add_pd(fiy3,ty);
2397 fiz3 = _mm256_add_pd(fiz3,tz);
2399 fjx1 = _mm256_add_pd(fjx1,tx);
2400 fjy1 = _mm256_add_pd(fjy1,ty);
2401 fjz1 = _mm256_add_pd(fjz1,tz);
2403 /**************************
2404 * CALCULATE INTERACTIONS *
2405 **************************/
2407 r32 = _mm256_mul_pd(rsq32,rinv32);
2408 r32 = _mm256_andnot_pd(dummy_mask,r32);
2410 /* Calculate table index by multiplying r with table scale and truncate to integer */
2411 rt = _mm256_mul_pd(r32,vftabscale);
2412 vfitab = _mm256_cvttpd_epi32(rt);
2413 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2414 vfitab = _mm_slli_epi32(vfitab,2);
2416 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2417 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2418 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2419 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2420 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2421 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2422 Heps = _mm256_mul_pd(vfeps,H);
2423 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2424 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2425 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
2429 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2431 /* Calculate temporary vectorial force */
2432 tx = _mm256_mul_pd(fscal,dx32);
2433 ty = _mm256_mul_pd(fscal,dy32);
2434 tz = _mm256_mul_pd(fscal,dz32);
2436 /* Update vectorial force */
2437 fix3 = _mm256_add_pd(fix3,tx);
2438 fiy3 = _mm256_add_pd(fiy3,ty);
2439 fiz3 = _mm256_add_pd(fiz3,tz);
2441 fjx2 = _mm256_add_pd(fjx2,tx);
2442 fjy2 = _mm256_add_pd(fjy2,ty);
2443 fjz2 = _mm256_add_pd(fjz2,tz);
2445 /**************************
2446 * CALCULATE INTERACTIONS *
2447 **************************/
2449 r33 = _mm256_mul_pd(rsq33,rinv33);
2450 r33 = _mm256_andnot_pd(dummy_mask,r33);
2452 /* Calculate table index by multiplying r with table scale and truncate to integer */
2453 rt = _mm256_mul_pd(r33,vftabscale);
2454 vfitab = _mm256_cvttpd_epi32(rt);
2455 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2456 vfitab = _mm_slli_epi32(vfitab,2);
2458 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2459 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2460 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2461 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2462 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2463 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2464 Heps = _mm256_mul_pd(vfeps,H);
2465 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2466 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2467 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
2471 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2473 /* Calculate temporary vectorial force */
2474 tx = _mm256_mul_pd(fscal,dx33);
2475 ty = _mm256_mul_pd(fscal,dy33);
2476 tz = _mm256_mul_pd(fscal,dz33);
2478 /* Update vectorial force */
2479 fix3 = _mm256_add_pd(fix3,tx);
2480 fiy3 = _mm256_add_pd(fiy3,ty);
2481 fiz3 = _mm256_add_pd(fiz3,tz);
2483 fjx3 = _mm256_add_pd(fjx3,tx);
2484 fjy3 = _mm256_add_pd(fjy3,ty);
2485 fjz3 = _mm256_add_pd(fjz3,tz);
2487 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2488 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2489 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2490 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2492 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2493 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2494 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2496 /* Inner loop uses 390 flops */
2499 /* End of innermost loop */
2501 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2502 f+i_coord_offset,fshift+i_shift_offset);
2504 /* Increment number of inner iterations */
2505 inneriter += j_index_end - j_index_start;
2507 /* Outer loop uses 24 flops */
2510 /* Increment number of outer iterations */
2513 /* Update outer/inner flops */
2515 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*390);