2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
98 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
100 __m128i ifour = _mm_set1_epi32(4);
101 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
103 __m256d dummy_mask,cutoff_mask;
104 __m128 tmpmask0,tmpmask1;
105 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
106 __m256d one = _mm256_set1_pd(1.0);
107 __m256d two = _mm256_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm256_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 vftab = kernel_data->table_elec->data;
126 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
128 /* Setup water-specific parameters */
129 inr = nlist->iinr[0];
130 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
131 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
132 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
133 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
135 jq0 = _mm256_set1_pd(charge[inr+0]);
136 jq1 = _mm256_set1_pd(charge[inr+1]);
137 jq2 = _mm256_set1_pd(charge[inr+2]);
138 vdwjidx0A = 2*vdwtype[inr+0];
139 qq00 = _mm256_mul_pd(iq0,jq0);
140 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
141 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
142 qq01 = _mm256_mul_pd(iq0,jq1);
143 qq02 = _mm256_mul_pd(iq0,jq2);
144 qq10 = _mm256_mul_pd(iq1,jq0);
145 qq11 = _mm256_mul_pd(iq1,jq1);
146 qq12 = _mm256_mul_pd(iq1,jq2);
147 qq20 = _mm256_mul_pd(iq2,jq0);
148 qq21 = _mm256_mul_pd(iq2,jq1);
149 qq22 = _mm256_mul_pd(iq2,jq2);
151 /* Avoid stupid compiler warnings */
152 jnrA = jnrB = jnrC = jnrD = 0;
161 for(iidx=0;iidx<4*DIM;iidx++)
166 /* Start outer loop over neighborlists */
167 for(iidx=0; iidx<nri; iidx++)
169 /* Load shift vector for this list */
170 i_shift_offset = DIM*shiftidx[iidx];
172 /* Load limits for loop over neighbors */
173 j_index_start = jindex[iidx];
174 j_index_end = jindex[iidx+1];
176 /* Get outer coordinate index */
178 i_coord_offset = DIM*inr;
180 /* Load i particle coords and add shift vector */
181 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
182 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
184 fix0 = _mm256_setzero_pd();
185 fiy0 = _mm256_setzero_pd();
186 fiz0 = _mm256_setzero_pd();
187 fix1 = _mm256_setzero_pd();
188 fiy1 = _mm256_setzero_pd();
189 fiz1 = _mm256_setzero_pd();
190 fix2 = _mm256_setzero_pd();
191 fiy2 = _mm256_setzero_pd();
192 fiz2 = _mm256_setzero_pd();
194 /* Reset potential sums */
195 velecsum = _mm256_setzero_pd();
196 vvdwsum = _mm256_setzero_pd();
198 /* Start inner kernel loop */
199 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
202 /* Get j neighbor index, and coordinate index */
207 j_coord_offsetA = DIM*jnrA;
208 j_coord_offsetB = DIM*jnrB;
209 j_coord_offsetC = DIM*jnrC;
210 j_coord_offsetD = DIM*jnrD;
212 /* load j atom coordinates */
213 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214 x+j_coord_offsetC,x+j_coord_offsetD,
215 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
217 /* Calculate displacement vector */
218 dx00 = _mm256_sub_pd(ix0,jx0);
219 dy00 = _mm256_sub_pd(iy0,jy0);
220 dz00 = _mm256_sub_pd(iz0,jz0);
221 dx01 = _mm256_sub_pd(ix0,jx1);
222 dy01 = _mm256_sub_pd(iy0,jy1);
223 dz01 = _mm256_sub_pd(iz0,jz1);
224 dx02 = _mm256_sub_pd(ix0,jx2);
225 dy02 = _mm256_sub_pd(iy0,jy2);
226 dz02 = _mm256_sub_pd(iz0,jz2);
227 dx10 = _mm256_sub_pd(ix1,jx0);
228 dy10 = _mm256_sub_pd(iy1,jy0);
229 dz10 = _mm256_sub_pd(iz1,jz0);
230 dx11 = _mm256_sub_pd(ix1,jx1);
231 dy11 = _mm256_sub_pd(iy1,jy1);
232 dz11 = _mm256_sub_pd(iz1,jz1);
233 dx12 = _mm256_sub_pd(ix1,jx2);
234 dy12 = _mm256_sub_pd(iy1,jy2);
235 dz12 = _mm256_sub_pd(iz1,jz2);
236 dx20 = _mm256_sub_pd(ix2,jx0);
237 dy20 = _mm256_sub_pd(iy2,jy0);
238 dz20 = _mm256_sub_pd(iz2,jz0);
239 dx21 = _mm256_sub_pd(ix2,jx1);
240 dy21 = _mm256_sub_pd(iy2,jy1);
241 dz21 = _mm256_sub_pd(iz2,jz1);
242 dx22 = _mm256_sub_pd(ix2,jx2);
243 dy22 = _mm256_sub_pd(iy2,jy2);
244 dz22 = _mm256_sub_pd(iz2,jz2);
246 /* Calculate squared distance and things based on it */
247 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
248 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
249 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
250 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
251 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
252 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
253 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
254 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
255 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
257 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
258 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
259 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
260 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
261 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
262 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
263 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
264 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
265 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
267 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
269 fjx0 = _mm256_setzero_pd();
270 fjy0 = _mm256_setzero_pd();
271 fjz0 = _mm256_setzero_pd();
272 fjx1 = _mm256_setzero_pd();
273 fjy1 = _mm256_setzero_pd();
274 fjz1 = _mm256_setzero_pd();
275 fjx2 = _mm256_setzero_pd();
276 fjy2 = _mm256_setzero_pd();
277 fjz2 = _mm256_setzero_pd();
279 /**************************
280 * CALCULATE INTERACTIONS *
281 **************************/
283 r00 = _mm256_mul_pd(rsq00,rinv00);
285 /* Calculate table index by multiplying r with table scale and truncate to integer */
286 rt = _mm256_mul_pd(r00,vftabscale);
287 vfitab = _mm256_cvttpd_epi32(rt);
288 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
289 vfitab = _mm_slli_epi32(vfitab,2);
291 /* CUBIC SPLINE TABLE ELECTROSTATICS */
292 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
293 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
294 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
295 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
296 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
297 Heps = _mm256_mul_pd(vfeps,H);
298 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
299 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
300 velec = _mm256_mul_pd(qq00,VV);
301 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
302 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
304 /* LENNARD-JONES DISPERSION/REPULSION */
306 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
307 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
308 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
309 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
310 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
312 /* Update potential sum for this i atom from the interaction with this j atom. */
313 velecsum = _mm256_add_pd(velecsum,velec);
314 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
316 fscal = _mm256_add_pd(felec,fvdw);
318 /* Calculate temporary vectorial force */
319 tx = _mm256_mul_pd(fscal,dx00);
320 ty = _mm256_mul_pd(fscal,dy00);
321 tz = _mm256_mul_pd(fscal,dz00);
323 /* Update vectorial force */
324 fix0 = _mm256_add_pd(fix0,tx);
325 fiy0 = _mm256_add_pd(fiy0,ty);
326 fiz0 = _mm256_add_pd(fiz0,tz);
328 fjx0 = _mm256_add_pd(fjx0,tx);
329 fjy0 = _mm256_add_pd(fjy0,ty);
330 fjz0 = _mm256_add_pd(fjz0,tz);
332 /**************************
333 * CALCULATE INTERACTIONS *
334 **************************/
336 r01 = _mm256_mul_pd(rsq01,rinv01);
338 /* Calculate table index by multiplying r with table scale and truncate to integer */
339 rt = _mm256_mul_pd(r01,vftabscale);
340 vfitab = _mm256_cvttpd_epi32(rt);
341 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
342 vfitab = _mm_slli_epi32(vfitab,2);
344 /* CUBIC SPLINE TABLE ELECTROSTATICS */
345 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
346 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
347 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
348 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
349 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
350 Heps = _mm256_mul_pd(vfeps,H);
351 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
352 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
353 velec = _mm256_mul_pd(qq01,VV);
354 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
355 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velecsum = _mm256_add_pd(velecsum,velec);
362 /* Calculate temporary vectorial force */
363 tx = _mm256_mul_pd(fscal,dx01);
364 ty = _mm256_mul_pd(fscal,dy01);
365 tz = _mm256_mul_pd(fscal,dz01);
367 /* Update vectorial force */
368 fix0 = _mm256_add_pd(fix0,tx);
369 fiy0 = _mm256_add_pd(fiy0,ty);
370 fiz0 = _mm256_add_pd(fiz0,tz);
372 fjx1 = _mm256_add_pd(fjx1,tx);
373 fjy1 = _mm256_add_pd(fjy1,ty);
374 fjz1 = _mm256_add_pd(fjz1,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r02 = _mm256_mul_pd(rsq02,rinv02);
382 /* Calculate table index by multiplying r with table scale and truncate to integer */
383 rt = _mm256_mul_pd(r02,vftabscale);
384 vfitab = _mm256_cvttpd_epi32(rt);
385 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
386 vfitab = _mm_slli_epi32(vfitab,2);
388 /* CUBIC SPLINE TABLE ELECTROSTATICS */
389 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
390 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
391 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
392 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
393 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
394 Heps = _mm256_mul_pd(vfeps,H);
395 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
396 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
397 velec = _mm256_mul_pd(qq02,VV);
398 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
399 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velecsum = _mm256_add_pd(velecsum,velec);
406 /* Calculate temporary vectorial force */
407 tx = _mm256_mul_pd(fscal,dx02);
408 ty = _mm256_mul_pd(fscal,dy02);
409 tz = _mm256_mul_pd(fscal,dz02);
411 /* Update vectorial force */
412 fix0 = _mm256_add_pd(fix0,tx);
413 fiy0 = _mm256_add_pd(fiy0,ty);
414 fiz0 = _mm256_add_pd(fiz0,tz);
416 fjx2 = _mm256_add_pd(fjx2,tx);
417 fjy2 = _mm256_add_pd(fjy2,ty);
418 fjz2 = _mm256_add_pd(fjz2,tz);
420 /**************************
421 * CALCULATE INTERACTIONS *
422 **************************/
424 r10 = _mm256_mul_pd(rsq10,rinv10);
426 /* Calculate table index by multiplying r with table scale and truncate to integer */
427 rt = _mm256_mul_pd(r10,vftabscale);
428 vfitab = _mm256_cvttpd_epi32(rt);
429 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
430 vfitab = _mm_slli_epi32(vfitab,2);
432 /* CUBIC SPLINE TABLE ELECTROSTATICS */
433 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
434 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
435 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
436 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
437 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
438 Heps = _mm256_mul_pd(vfeps,H);
439 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
440 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
441 velec = _mm256_mul_pd(qq10,VV);
442 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
443 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
445 /* Update potential sum for this i atom from the interaction with this j atom. */
446 velecsum = _mm256_add_pd(velecsum,velec);
450 /* Calculate temporary vectorial force */
451 tx = _mm256_mul_pd(fscal,dx10);
452 ty = _mm256_mul_pd(fscal,dy10);
453 tz = _mm256_mul_pd(fscal,dz10);
455 /* Update vectorial force */
456 fix1 = _mm256_add_pd(fix1,tx);
457 fiy1 = _mm256_add_pd(fiy1,ty);
458 fiz1 = _mm256_add_pd(fiz1,tz);
460 fjx0 = _mm256_add_pd(fjx0,tx);
461 fjy0 = _mm256_add_pd(fjy0,ty);
462 fjz0 = _mm256_add_pd(fjz0,tz);
464 /**************************
465 * CALCULATE INTERACTIONS *
466 **************************/
468 r11 = _mm256_mul_pd(rsq11,rinv11);
470 /* Calculate table index by multiplying r with table scale and truncate to integer */
471 rt = _mm256_mul_pd(r11,vftabscale);
472 vfitab = _mm256_cvttpd_epi32(rt);
473 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
474 vfitab = _mm_slli_epi32(vfitab,2);
476 /* CUBIC SPLINE TABLE ELECTROSTATICS */
477 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
478 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
479 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
480 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
481 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
482 Heps = _mm256_mul_pd(vfeps,H);
483 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
484 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
485 velec = _mm256_mul_pd(qq11,VV);
486 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
487 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
489 /* Update potential sum for this i atom from the interaction with this j atom. */
490 velecsum = _mm256_add_pd(velecsum,velec);
494 /* Calculate temporary vectorial force */
495 tx = _mm256_mul_pd(fscal,dx11);
496 ty = _mm256_mul_pd(fscal,dy11);
497 tz = _mm256_mul_pd(fscal,dz11);
499 /* Update vectorial force */
500 fix1 = _mm256_add_pd(fix1,tx);
501 fiy1 = _mm256_add_pd(fiy1,ty);
502 fiz1 = _mm256_add_pd(fiz1,tz);
504 fjx1 = _mm256_add_pd(fjx1,tx);
505 fjy1 = _mm256_add_pd(fjy1,ty);
506 fjz1 = _mm256_add_pd(fjz1,tz);
508 /**************************
509 * CALCULATE INTERACTIONS *
510 **************************/
512 r12 = _mm256_mul_pd(rsq12,rinv12);
514 /* Calculate table index by multiplying r with table scale and truncate to integer */
515 rt = _mm256_mul_pd(r12,vftabscale);
516 vfitab = _mm256_cvttpd_epi32(rt);
517 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
518 vfitab = _mm_slli_epi32(vfitab,2);
520 /* CUBIC SPLINE TABLE ELECTROSTATICS */
521 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
522 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
523 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
524 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
525 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
526 Heps = _mm256_mul_pd(vfeps,H);
527 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
528 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
529 velec = _mm256_mul_pd(qq12,VV);
530 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
531 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
533 /* Update potential sum for this i atom from the interaction with this j atom. */
534 velecsum = _mm256_add_pd(velecsum,velec);
538 /* Calculate temporary vectorial force */
539 tx = _mm256_mul_pd(fscal,dx12);
540 ty = _mm256_mul_pd(fscal,dy12);
541 tz = _mm256_mul_pd(fscal,dz12);
543 /* Update vectorial force */
544 fix1 = _mm256_add_pd(fix1,tx);
545 fiy1 = _mm256_add_pd(fiy1,ty);
546 fiz1 = _mm256_add_pd(fiz1,tz);
548 fjx2 = _mm256_add_pd(fjx2,tx);
549 fjy2 = _mm256_add_pd(fjy2,ty);
550 fjz2 = _mm256_add_pd(fjz2,tz);
552 /**************************
553 * CALCULATE INTERACTIONS *
554 **************************/
556 r20 = _mm256_mul_pd(rsq20,rinv20);
558 /* Calculate table index by multiplying r with table scale and truncate to integer */
559 rt = _mm256_mul_pd(r20,vftabscale);
560 vfitab = _mm256_cvttpd_epi32(rt);
561 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
562 vfitab = _mm_slli_epi32(vfitab,2);
564 /* CUBIC SPLINE TABLE ELECTROSTATICS */
565 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
566 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
567 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
568 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
569 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
570 Heps = _mm256_mul_pd(vfeps,H);
571 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
572 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
573 velec = _mm256_mul_pd(qq20,VV);
574 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
575 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
577 /* Update potential sum for this i atom from the interaction with this j atom. */
578 velecsum = _mm256_add_pd(velecsum,velec);
582 /* Calculate temporary vectorial force */
583 tx = _mm256_mul_pd(fscal,dx20);
584 ty = _mm256_mul_pd(fscal,dy20);
585 tz = _mm256_mul_pd(fscal,dz20);
587 /* Update vectorial force */
588 fix2 = _mm256_add_pd(fix2,tx);
589 fiy2 = _mm256_add_pd(fiy2,ty);
590 fiz2 = _mm256_add_pd(fiz2,tz);
592 fjx0 = _mm256_add_pd(fjx0,tx);
593 fjy0 = _mm256_add_pd(fjy0,ty);
594 fjz0 = _mm256_add_pd(fjz0,tz);
596 /**************************
597 * CALCULATE INTERACTIONS *
598 **************************/
600 r21 = _mm256_mul_pd(rsq21,rinv21);
602 /* Calculate table index by multiplying r with table scale and truncate to integer */
603 rt = _mm256_mul_pd(r21,vftabscale);
604 vfitab = _mm256_cvttpd_epi32(rt);
605 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
606 vfitab = _mm_slli_epi32(vfitab,2);
608 /* CUBIC SPLINE TABLE ELECTROSTATICS */
609 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
610 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
611 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
612 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
613 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
614 Heps = _mm256_mul_pd(vfeps,H);
615 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
616 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
617 velec = _mm256_mul_pd(qq21,VV);
618 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
619 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
621 /* Update potential sum for this i atom from the interaction with this j atom. */
622 velecsum = _mm256_add_pd(velecsum,velec);
626 /* Calculate temporary vectorial force */
627 tx = _mm256_mul_pd(fscal,dx21);
628 ty = _mm256_mul_pd(fscal,dy21);
629 tz = _mm256_mul_pd(fscal,dz21);
631 /* Update vectorial force */
632 fix2 = _mm256_add_pd(fix2,tx);
633 fiy2 = _mm256_add_pd(fiy2,ty);
634 fiz2 = _mm256_add_pd(fiz2,tz);
636 fjx1 = _mm256_add_pd(fjx1,tx);
637 fjy1 = _mm256_add_pd(fjy1,ty);
638 fjz1 = _mm256_add_pd(fjz1,tz);
640 /**************************
641 * CALCULATE INTERACTIONS *
642 **************************/
644 r22 = _mm256_mul_pd(rsq22,rinv22);
646 /* Calculate table index by multiplying r with table scale and truncate to integer */
647 rt = _mm256_mul_pd(r22,vftabscale);
648 vfitab = _mm256_cvttpd_epi32(rt);
649 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
650 vfitab = _mm_slli_epi32(vfitab,2);
652 /* CUBIC SPLINE TABLE ELECTROSTATICS */
653 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
654 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
655 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
656 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
657 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
658 Heps = _mm256_mul_pd(vfeps,H);
659 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
660 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
661 velec = _mm256_mul_pd(qq22,VV);
662 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
663 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
665 /* Update potential sum for this i atom from the interaction with this j atom. */
666 velecsum = _mm256_add_pd(velecsum,velec);
670 /* Calculate temporary vectorial force */
671 tx = _mm256_mul_pd(fscal,dx22);
672 ty = _mm256_mul_pd(fscal,dy22);
673 tz = _mm256_mul_pd(fscal,dz22);
675 /* Update vectorial force */
676 fix2 = _mm256_add_pd(fix2,tx);
677 fiy2 = _mm256_add_pd(fiy2,ty);
678 fiz2 = _mm256_add_pd(fiz2,tz);
680 fjx2 = _mm256_add_pd(fjx2,tx);
681 fjy2 = _mm256_add_pd(fjy2,ty);
682 fjz2 = _mm256_add_pd(fjz2,tz);
684 fjptrA = f+j_coord_offsetA;
685 fjptrB = f+j_coord_offsetB;
686 fjptrC = f+j_coord_offsetC;
687 fjptrD = f+j_coord_offsetD;
689 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
690 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
692 /* Inner loop uses 400 flops */
698 /* Get j neighbor index, and coordinate index */
699 jnrlistA = jjnr[jidx];
700 jnrlistB = jjnr[jidx+1];
701 jnrlistC = jjnr[jidx+2];
702 jnrlistD = jjnr[jidx+3];
703 /* Sign of each element will be negative for non-real atoms.
704 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
705 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
707 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
709 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
710 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
711 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
713 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
714 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
715 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
716 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
717 j_coord_offsetA = DIM*jnrA;
718 j_coord_offsetB = DIM*jnrB;
719 j_coord_offsetC = DIM*jnrC;
720 j_coord_offsetD = DIM*jnrD;
722 /* load j atom coordinates */
723 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
724 x+j_coord_offsetC,x+j_coord_offsetD,
725 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
727 /* Calculate displacement vector */
728 dx00 = _mm256_sub_pd(ix0,jx0);
729 dy00 = _mm256_sub_pd(iy0,jy0);
730 dz00 = _mm256_sub_pd(iz0,jz0);
731 dx01 = _mm256_sub_pd(ix0,jx1);
732 dy01 = _mm256_sub_pd(iy0,jy1);
733 dz01 = _mm256_sub_pd(iz0,jz1);
734 dx02 = _mm256_sub_pd(ix0,jx2);
735 dy02 = _mm256_sub_pd(iy0,jy2);
736 dz02 = _mm256_sub_pd(iz0,jz2);
737 dx10 = _mm256_sub_pd(ix1,jx0);
738 dy10 = _mm256_sub_pd(iy1,jy0);
739 dz10 = _mm256_sub_pd(iz1,jz0);
740 dx11 = _mm256_sub_pd(ix1,jx1);
741 dy11 = _mm256_sub_pd(iy1,jy1);
742 dz11 = _mm256_sub_pd(iz1,jz1);
743 dx12 = _mm256_sub_pd(ix1,jx2);
744 dy12 = _mm256_sub_pd(iy1,jy2);
745 dz12 = _mm256_sub_pd(iz1,jz2);
746 dx20 = _mm256_sub_pd(ix2,jx0);
747 dy20 = _mm256_sub_pd(iy2,jy0);
748 dz20 = _mm256_sub_pd(iz2,jz0);
749 dx21 = _mm256_sub_pd(ix2,jx1);
750 dy21 = _mm256_sub_pd(iy2,jy1);
751 dz21 = _mm256_sub_pd(iz2,jz1);
752 dx22 = _mm256_sub_pd(ix2,jx2);
753 dy22 = _mm256_sub_pd(iy2,jy2);
754 dz22 = _mm256_sub_pd(iz2,jz2);
756 /* Calculate squared distance and things based on it */
757 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
758 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
759 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
760 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
761 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
762 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
763 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
764 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
765 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
767 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
768 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
769 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
770 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
771 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
772 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
773 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
774 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
775 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
777 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
779 fjx0 = _mm256_setzero_pd();
780 fjy0 = _mm256_setzero_pd();
781 fjz0 = _mm256_setzero_pd();
782 fjx1 = _mm256_setzero_pd();
783 fjy1 = _mm256_setzero_pd();
784 fjz1 = _mm256_setzero_pd();
785 fjx2 = _mm256_setzero_pd();
786 fjy2 = _mm256_setzero_pd();
787 fjz2 = _mm256_setzero_pd();
789 /**************************
790 * CALCULATE INTERACTIONS *
791 **************************/
793 r00 = _mm256_mul_pd(rsq00,rinv00);
794 r00 = _mm256_andnot_pd(dummy_mask,r00);
796 /* Calculate table index by multiplying r with table scale and truncate to integer */
797 rt = _mm256_mul_pd(r00,vftabscale);
798 vfitab = _mm256_cvttpd_epi32(rt);
799 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
800 vfitab = _mm_slli_epi32(vfitab,2);
802 /* CUBIC SPLINE TABLE ELECTROSTATICS */
803 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
804 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
805 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
806 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
807 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
808 Heps = _mm256_mul_pd(vfeps,H);
809 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
810 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
811 velec = _mm256_mul_pd(qq00,VV);
812 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
813 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
815 /* LENNARD-JONES DISPERSION/REPULSION */
817 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
818 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
819 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
820 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
821 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
823 /* Update potential sum for this i atom from the interaction with this j atom. */
824 velec = _mm256_andnot_pd(dummy_mask,velec);
825 velecsum = _mm256_add_pd(velecsum,velec);
826 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
827 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
829 fscal = _mm256_add_pd(felec,fvdw);
831 fscal = _mm256_andnot_pd(dummy_mask,fscal);
833 /* Calculate temporary vectorial force */
834 tx = _mm256_mul_pd(fscal,dx00);
835 ty = _mm256_mul_pd(fscal,dy00);
836 tz = _mm256_mul_pd(fscal,dz00);
838 /* Update vectorial force */
839 fix0 = _mm256_add_pd(fix0,tx);
840 fiy0 = _mm256_add_pd(fiy0,ty);
841 fiz0 = _mm256_add_pd(fiz0,tz);
843 fjx0 = _mm256_add_pd(fjx0,tx);
844 fjy0 = _mm256_add_pd(fjy0,ty);
845 fjz0 = _mm256_add_pd(fjz0,tz);
847 /**************************
848 * CALCULATE INTERACTIONS *
849 **************************/
851 r01 = _mm256_mul_pd(rsq01,rinv01);
852 r01 = _mm256_andnot_pd(dummy_mask,r01);
854 /* Calculate table index by multiplying r with table scale and truncate to integer */
855 rt = _mm256_mul_pd(r01,vftabscale);
856 vfitab = _mm256_cvttpd_epi32(rt);
857 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
858 vfitab = _mm_slli_epi32(vfitab,2);
860 /* CUBIC SPLINE TABLE ELECTROSTATICS */
861 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
862 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
863 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
864 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
865 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
866 Heps = _mm256_mul_pd(vfeps,H);
867 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
868 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
869 velec = _mm256_mul_pd(qq01,VV);
870 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
871 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm256_andnot_pd(dummy_mask,velec);
875 velecsum = _mm256_add_pd(velecsum,velec);
879 fscal = _mm256_andnot_pd(dummy_mask,fscal);
881 /* Calculate temporary vectorial force */
882 tx = _mm256_mul_pd(fscal,dx01);
883 ty = _mm256_mul_pd(fscal,dy01);
884 tz = _mm256_mul_pd(fscal,dz01);
886 /* Update vectorial force */
887 fix0 = _mm256_add_pd(fix0,tx);
888 fiy0 = _mm256_add_pd(fiy0,ty);
889 fiz0 = _mm256_add_pd(fiz0,tz);
891 fjx1 = _mm256_add_pd(fjx1,tx);
892 fjy1 = _mm256_add_pd(fjy1,ty);
893 fjz1 = _mm256_add_pd(fjz1,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 r02 = _mm256_mul_pd(rsq02,rinv02);
900 r02 = _mm256_andnot_pd(dummy_mask,r02);
902 /* Calculate table index by multiplying r with table scale and truncate to integer */
903 rt = _mm256_mul_pd(r02,vftabscale);
904 vfitab = _mm256_cvttpd_epi32(rt);
905 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
906 vfitab = _mm_slli_epi32(vfitab,2);
908 /* CUBIC SPLINE TABLE ELECTROSTATICS */
909 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
910 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
911 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
912 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
913 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
914 Heps = _mm256_mul_pd(vfeps,H);
915 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
916 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
917 velec = _mm256_mul_pd(qq02,VV);
918 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
919 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
921 /* Update potential sum for this i atom from the interaction with this j atom. */
922 velec = _mm256_andnot_pd(dummy_mask,velec);
923 velecsum = _mm256_add_pd(velecsum,velec);
927 fscal = _mm256_andnot_pd(dummy_mask,fscal);
929 /* Calculate temporary vectorial force */
930 tx = _mm256_mul_pd(fscal,dx02);
931 ty = _mm256_mul_pd(fscal,dy02);
932 tz = _mm256_mul_pd(fscal,dz02);
934 /* Update vectorial force */
935 fix0 = _mm256_add_pd(fix0,tx);
936 fiy0 = _mm256_add_pd(fiy0,ty);
937 fiz0 = _mm256_add_pd(fiz0,tz);
939 fjx2 = _mm256_add_pd(fjx2,tx);
940 fjy2 = _mm256_add_pd(fjy2,ty);
941 fjz2 = _mm256_add_pd(fjz2,tz);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 r10 = _mm256_mul_pd(rsq10,rinv10);
948 r10 = _mm256_andnot_pd(dummy_mask,r10);
950 /* Calculate table index by multiplying r with table scale and truncate to integer */
951 rt = _mm256_mul_pd(r10,vftabscale);
952 vfitab = _mm256_cvttpd_epi32(rt);
953 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
954 vfitab = _mm_slli_epi32(vfitab,2);
956 /* CUBIC SPLINE TABLE ELECTROSTATICS */
957 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
958 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
959 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
960 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
961 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
962 Heps = _mm256_mul_pd(vfeps,H);
963 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
964 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
965 velec = _mm256_mul_pd(qq10,VV);
966 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
967 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 velec = _mm256_andnot_pd(dummy_mask,velec);
971 velecsum = _mm256_add_pd(velecsum,velec);
975 fscal = _mm256_andnot_pd(dummy_mask,fscal);
977 /* Calculate temporary vectorial force */
978 tx = _mm256_mul_pd(fscal,dx10);
979 ty = _mm256_mul_pd(fscal,dy10);
980 tz = _mm256_mul_pd(fscal,dz10);
982 /* Update vectorial force */
983 fix1 = _mm256_add_pd(fix1,tx);
984 fiy1 = _mm256_add_pd(fiy1,ty);
985 fiz1 = _mm256_add_pd(fiz1,tz);
987 fjx0 = _mm256_add_pd(fjx0,tx);
988 fjy0 = _mm256_add_pd(fjy0,ty);
989 fjz0 = _mm256_add_pd(fjz0,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 r11 = _mm256_mul_pd(rsq11,rinv11);
996 r11 = _mm256_andnot_pd(dummy_mask,r11);
998 /* Calculate table index by multiplying r with table scale and truncate to integer */
999 rt = _mm256_mul_pd(r11,vftabscale);
1000 vfitab = _mm256_cvttpd_epi32(rt);
1001 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1002 vfitab = _mm_slli_epi32(vfitab,2);
1004 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1005 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1006 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1007 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1008 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1009 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1010 Heps = _mm256_mul_pd(vfeps,H);
1011 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1012 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1013 velec = _mm256_mul_pd(qq11,VV);
1014 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1015 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1017 /* Update potential sum for this i atom from the interaction with this j atom. */
1018 velec = _mm256_andnot_pd(dummy_mask,velec);
1019 velecsum = _mm256_add_pd(velecsum,velec);
1023 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1025 /* Calculate temporary vectorial force */
1026 tx = _mm256_mul_pd(fscal,dx11);
1027 ty = _mm256_mul_pd(fscal,dy11);
1028 tz = _mm256_mul_pd(fscal,dz11);
1030 /* Update vectorial force */
1031 fix1 = _mm256_add_pd(fix1,tx);
1032 fiy1 = _mm256_add_pd(fiy1,ty);
1033 fiz1 = _mm256_add_pd(fiz1,tz);
1035 fjx1 = _mm256_add_pd(fjx1,tx);
1036 fjy1 = _mm256_add_pd(fjy1,ty);
1037 fjz1 = _mm256_add_pd(fjz1,tz);
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 r12 = _mm256_mul_pd(rsq12,rinv12);
1044 r12 = _mm256_andnot_pd(dummy_mask,r12);
1046 /* Calculate table index by multiplying r with table scale and truncate to integer */
1047 rt = _mm256_mul_pd(r12,vftabscale);
1048 vfitab = _mm256_cvttpd_epi32(rt);
1049 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1050 vfitab = _mm_slli_epi32(vfitab,2);
1052 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1053 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1054 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1055 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1056 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1057 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1058 Heps = _mm256_mul_pd(vfeps,H);
1059 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1060 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1061 velec = _mm256_mul_pd(qq12,VV);
1062 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1063 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1065 /* Update potential sum for this i atom from the interaction with this j atom. */
1066 velec = _mm256_andnot_pd(dummy_mask,velec);
1067 velecsum = _mm256_add_pd(velecsum,velec);
1071 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1073 /* Calculate temporary vectorial force */
1074 tx = _mm256_mul_pd(fscal,dx12);
1075 ty = _mm256_mul_pd(fscal,dy12);
1076 tz = _mm256_mul_pd(fscal,dz12);
1078 /* Update vectorial force */
1079 fix1 = _mm256_add_pd(fix1,tx);
1080 fiy1 = _mm256_add_pd(fiy1,ty);
1081 fiz1 = _mm256_add_pd(fiz1,tz);
1083 fjx2 = _mm256_add_pd(fjx2,tx);
1084 fjy2 = _mm256_add_pd(fjy2,ty);
1085 fjz2 = _mm256_add_pd(fjz2,tz);
1087 /**************************
1088 * CALCULATE INTERACTIONS *
1089 **************************/
1091 r20 = _mm256_mul_pd(rsq20,rinv20);
1092 r20 = _mm256_andnot_pd(dummy_mask,r20);
1094 /* Calculate table index by multiplying r with table scale and truncate to integer */
1095 rt = _mm256_mul_pd(r20,vftabscale);
1096 vfitab = _mm256_cvttpd_epi32(rt);
1097 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1098 vfitab = _mm_slli_epi32(vfitab,2);
1100 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1101 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1102 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1103 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1104 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1105 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1106 Heps = _mm256_mul_pd(vfeps,H);
1107 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1108 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1109 velec = _mm256_mul_pd(qq20,VV);
1110 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1111 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1113 /* Update potential sum for this i atom from the interaction with this j atom. */
1114 velec = _mm256_andnot_pd(dummy_mask,velec);
1115 velecsum = _mm256_add_pd(velecsum,velec);
1119 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1121 /* Calculate temporary vectorial force */
1122 tx = _mm256_mul_pd(fscal,dx20);
1123 ty = _mm256_mul_pd(fscal,dy20);
1124 tz = _mm256_mul_pd(fscal,dz20);
1126 /* Update vectorial force */
1127 fix2 = _mm256_add_pd(fix2,tx);
1128 fiy2 = _mm256_add_pd(fiy2,ty);
1129 fiz2 = _mm256_add_pd(fiz2,tz);
1131 fjx0 = _mm256_add_pd(fjx0,tx);
1132 fjy0 = _mm256_add_pd(fjy0,ty);
1133 fjz0 = _mm256_add_pd(fjz0,tz);
1135 /**************************
1136 * CALCULATE INTERACTIONS *
1137 **************************/
1139 r21 = _mm256_mul_pd(rsq21,rinv21);
1140 r21 = _mm256_andnot_pd(dummy_mask,r21);
1142 /* Calculate table index by multiplying r with table scale and truncate to integer */
1143 rt = _mm256_mul_pd(r21,vftabscale);
1144 vfitab = _mm256_cvttpd_epi32(rt);
1145 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1146 vfitab = _mm_slli_epi32(vfitab,2);
1148 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1149 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1150 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1151 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1152 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1153 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1154 Heps = _mm256_mul_pd(vfeps,H);
1155 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1156 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1157 velec = _mm256_mul_pd(qq21,VV);
1158 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1159 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1161 /* Update potential sum for this i atom from the interaction with this j atom. */
1162 velec = _mm256_andnot_pd(dummy_mask,velec);
1163 velecsum = _mm256_add_pd(velecsum,velec);
1167 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1169 /* Calculate temporary vectorial force */
1170 tx = _mm256_mul_pd(fscal,dx21);
1171 ty = _mm256_mul_pd(fscal,dy21);
1172 tz = _mm256_mul_pd(fscal,dz21);
1174 /* Update vectorial force */
1175 fix2 = _mm256_add_pd(fix2,tx);
1176 fiy2 = _mm256_add_pd(fiy2,ty);
1177 fiz2 = _mm256_add_pd(fiz2,tz);
1179 fjx1 = _mm256_add_pd(fjx1,tx);
1180 fjy1 = _mm256_add_pd(fjy1,ty);
1181 fjz1 = _mm256_add_pd(fjz1,tz);
1183 /**************************
1184 * CALCULATE INTERACTIONS *
1185 **************************/
1187 r22 = _mm256_mul_pd(rsq22,rinv22);
1188 r22 = _mm256_andnot_pd(dummy_mask,r22);
1190 /* Calculate table index by multiplying r with table scale and truncate to integer */
1191 rt = _mm256_mul_pd(r22,vftabscale);
1192 vfitab = _mm256_cvttpd_epi32(rt);
1193 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1194 vfitab = _mm_slli_epi32(vfitab,2);
1196 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1197 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1198 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1199 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1200 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1201 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1202 Heps = _mm256_mul_pd(vfeps,H);
1203 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1204 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1205 velec = _mm256_mul_pd(qq22,VV);
1206 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1207 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1209 /* Update potential sum for this i atom from the interaction with this j atom. */
1210 velec = _mm256_andnot_pd(dummy_mask,velec);
1211 velecsum = _mm256_add_pd(velecsum,velec);
1215 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1217 /* Calculate temporary vectorial force */
1218 tx = _mm256_mul_pd(fscal,dx22);
1219 ty = _mm256_mul_pd(fscal,dy22);
1220 tz = _mm256_mul_pd(fscal,dz22);
1222 /* Update vectorial force */
1223 fix2 = _mm256_add_pd(fix2,tx);
1224 fiy2 = _mm256_add_pd(fiy2,ty);
1225 fiz2 = _mm256_add_pd(fiz2,tz);
1227 fjx2 = _mm256_add_pd(fjx2,tx);
1228 fjy2 = _mm256_add_pd(fjy2,ty);
1229 fjz2 = _mm256_add_pd(fjz2,tz);
1231 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1232 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1233 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1234 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1236 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1237 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1239 /* Inner loop uses 409 flops */
1242 /* End of innermost loop */
1244 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1245 f+i_coord_offset,fshift+i_shift_offset);
1248 /* Update potential energies */
1249 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1250 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1252 /* Increment number of inner iterations */
1253 inneriter += j_index_end - j_index_start;
1255 /* Outer loop uses 20 flops */
1258 /* Increment number of outer iterations */
1261 /* Update outer/inner flops */
1263 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1266 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1267 * Electrostatics interaction: CubicSplineTable
1268 * VdW interaction: LennardJones
1269 * Geometry: Water3-Water3
1270 * Calculate force/pot: Force
1273 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1274 (t_nblist * gmx_restrict nlist,
1275 rvec * gmx_restrict xx,
1276 rvec * gmx_restrict ff,
1277 t_forcerec * gmx_restrict fr,
1278 t_mdatoms * gmx_restrict mdatoms,
1279 nb_kernel_data_t * gmx_restrict kernel_data,
1280 t_nrnb * gmx_restrict nrnb)
1282 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1283 * just 0 for non-waters.
1284 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1285 * jnr indices corresponding to data put in the four positions in the SIMD register.
1287 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1288 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1289 int jnrA,jnrB,jnrC,jnrD;
1290 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1291 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1292 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1293 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1294 real rcutoff_scalar;
1295 real *shiftvec,*fshift,*x,*f;
1296 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1297 real scratch[4*DIM];
1298 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1299 real * vdwioffsetptr0;
1300 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1301 real * vdwioffsetptr1;
1302 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1303 real * vdwioffsetptr2;
1304 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1305 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1306 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1307 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1308 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1309 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1310 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1311 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1312 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1313 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1314 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1315 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1316 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1317 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1318 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1319 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1320 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1323 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1326 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1327 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1329 __m128i ifour = _mm_set1_epi32(4);
1330 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1332 __m256d dummy_mask,cutoff_mask;
1333 __m128 tmpmask0,tmpmask1;
1334 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1335 __m256d one = _mm256_set1_pd(1.0);
1336 __m256d two = _mm256_set1_pd(2.0);
1342 jindex = nlist->jindex;
1344 shiftidx = nlist->shift;
1346 shiftvec = fr->shift_vec[0];
1347 fshift = fr->fshift[0];
1348 facel = _mm256_set1_pd(fr->epsfac);
1349 charge = mdatoms->chargeA;
1350 nvdwtype = fr->ntype;
1351 vdwparam = fr->nbfp;
1352 vdwtype = mdatoms->typeA;
1354 vftab = kernel_data->table_elec->data;
1355 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
1357 /* Setup water-specific parameters */
1358 inr = nlist->iinr[0];
1359 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1360 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1361 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1362 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1364 jq0 = _mm256_set1_pd(charge[inr+0]);
1365 jq1 = _mm256_set1_pd(charge[inr+1]);
1366 jq2 = _mm256_set1_pd(charge[inr+2]);
1367 vdwjidx0A = 2*vdwtype[inr+0];
1368 qq00 = _mm256_mul_pd(iq0,jq0);
1369 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1370 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1371 qq01 = _mm256_mul_pd(iq0,jq1);
1372 qq02 = _mm256_mul_pd(iq0,jq2);
1373 qq10 = _mm256_mul_pd(iq1,jq0);
1374 qq11 = _mm256_mul_pd(iq1,jq1);
1375 qq12 = _mm256_mul_pd(iq1,jq2);
1376 qq20 = _mm256_mul_pd(iq2,jq0);
1377 qq21 = _mm256_mul_pd(iq2,jq1);
1378 qq22 = _mm256_mul_pd(iq2,jq2);
1380 /* Avoid stupid compiler warnings */
1381 jnrA = jnrB = jnrC = jnrD = 0;
1382 j_coord_offsetA = 0;
1383 j_coord_offsetB = 0;
1384 j_coord_offsetC = 0;
1385 j_coord_offsetD = 0;
1390 for(iidx=0;iidx<4*DIM;iidx++)
1392 scratch[iidx] = 0.0;
1395 /* Start outer loop over neighborlists */
1396 for(iidx=0; iidx<nri; iidx++)
1398 /* Load shift vector for this list */
1399 i_shift_offset = DIM*shiftidx[iidx];
1401 /* Load limits for loop over neighbors */
1402 j_index_start = jindex[iidx];
1403 j_index_end = jindex[iidx+1];
1405 /* Get outer coordinate index */
1407 i_coord_offset = DIM*inr;
1409 /* Load i particle coords and add shift vector */
1410 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1411 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1413 fix0 = _mm256_setzero_pd();
1414 fiy0 = _mm256_setzero_pd();
1415 fiz0 = _mm256_setzero_pd();
1416 fix1 = _mm256_setzero_pd();
1417 fiy1 = _mm256_setzero_pd();
1418 fiz1 = _mm256_setzero_pd();
1419 fix2 = _mm256_setzero_pd();
1420 fiy2 = _mm256_setzero_pd();
1421 fiz2 = _mm256_setzero_pd();
1423 /* Start inner kernel loop */
1424 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1427 /* Get j neighbor index, and coordinate index */
1429 jnrB = jjnr[jidx+1];
1430 jnrC = jjnr[jidx+2];
1431 jnrD = jjnr[jidx+3];
1432 j_coord_offsetA = DIM*jnrA;
1433 j_coord_offsetB = DIM*jnrB;
1434 j_coord_offsetC = DIM*jnrC;
1435 j_coord_offsetD = DIM*jnrD;
1437 /* load j atom coordinates */
1438 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1439 x+j_coord_offsetC,x+j_coord_offsetD,
1440 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1442 /* Calculate displacement vector */
1443 dx00 = _mm256_sub_pd(ix0,jx0);
1444 dy00 = _mm256_sub_pd(iy0,jy0);
1445 dz00 = _mm256_sub_pd(iz0,jz0);
1446 dx01 = _mm256_sub_pd(ix0,jx1);
1447 dy01 = _mm256_sub_pd(iy0,jy1);
1448 dz01 = _mm256_sub_pd(iz0,jz1);
1449 dx02 = _mm256_sub_pd(ix0,jx2);
1450 dy02 = _mm256_sub_pd(iy0,jy2);
1451 dz02 = _mm256_sub_pd(iz0,jz2);
1452 dx10 = _mm256_sub_pd(ix1,jx0);
1453 dy10 = _mm256_sub_pd(iy1,jy0);
1454 dz10 = _mm256_sub_pd(iz1,jz0);
1455 dx11 = _mm256_sub_pd(ix1,jx1);
1456 dy11 = _mm256_sub_pd(iy1,jy1);
1457 dz11 = _mm256_sub_pd(iz1,jz1);
1458 dx12 = _mm256_sub_pd(ix1,jx2);
1459 dy12 = _mm256_sub_pd(iy1,jy2);
1460 dz12 = _mm256_sub_pd(iz1,jz2);
1461 dx20 = _mm256_sub_pd(ix2,jx0);
1462 dy20 = _mm256_sub_pd(iy2,jy0);
1463 dz20 = _mm256_sub_pd(iz2,jz0);
1464 dx21 = _mm256_sub_pd(ix2,jx1);
1465 dy21 = _mm256_sub_pd(iy2,jy1);
1466 dz21 = _mm256_sub_pd(iz2,jz1);
1467 dx22 = _mm256_sub_pd(ix2,jx2);
1468 dy22 = _mm256_sub_pd(iy2,jy2);
1469 dz22 = _mm256_sub_pd(iz2,jz2);
1471 /* Calculate squared distance and things based on it */
1472 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1473 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1474 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1475 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1476 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1477 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1478 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1479 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1480 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1482 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1483 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1484 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1485 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1486 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1487 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1488 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1489 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1490 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1492 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1494 fjx0 = _mm256_setzero_pd();
1495 fjy0 = _mm256_setzero_pd();
1496 fjz0 = _mm256_setzero_pd();
1497 fjx1 = _mm256_setzero_pd();
1498 fjy1 = _mm256_setzero_pd();
1499 fjz1 = _mm256_setzero_pd();
1500 fjx2 = _mm256_setzero_pd();
1501 fjy2 = _mm256_setzero_pd();
1502 fjz2 = _mm256_setzero_pd();
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 r00 = _mm256_mul_pd(rsq00,rinv00);
1510 /* Calculate table index by multiplying r with table scale and truncate to integer */
1511 rt = _mm256_mul_pd(r00,vftabscale);
1512 vfitab = _mm256_cvttpd_epi32(rt);
1513 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1514 vfitab = _mm_slli_epi32(vfitab,2);
1516 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1517 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1518 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1519 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1520 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1521 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1522 Heps = _mm256_mul_pd(vfeps,H);
1523 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1524 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1525 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1527 /* LENNARD-JONES DISPERSION/REPULSION */
1529 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1530 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1532 fscal = _mm256_add_pd(felec,fvdw);
1534 /* Calculate temporary vectorial force */
1535 tx = _mm256_mul_pd(fscal,dx00);
1536 ty = _mm256_mul_pd(fscal,dy00);
1537 tz = _mm256_mul_pd(fscal,dz00);
1539 /* Update vectorial force */
1540 fix0 = _mm256_add_pd(fix0,tx);
1541 fiy0 = _mm256_add_pd(fiy0,ty);
1542 fiz0 = _mm256_add_pd(fiz0,tz);
1544 fjx0 = _mm256_add_pd(fjx0,tx);
1545 fjy0 = _mm256_add_pd(fjy0,ty);
1546 fjz0 = _mm256_add_pd(fjz0,tz);
1548 /**************************
1549 * CALCULATE INTERACTIONS *
1550 **************************/
1552 r01 = _mm256_mul_pd(rsq01,rinv01);
1554 /* Calculate table index by multiplying r with table scale and truncate to integer */
1555 rt = _mm256_mul_pd(r01,vftabscale);
1556 vfitab = _mm256_cvttpd_epi32(rt);
1557 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1558 vfitab = _mm_slli_epi32(vfitab,2);
1560 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1561 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1562 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1563 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1564 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1565 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1566 Heps = _mm256_mul_pd(vfeps,H);
1567 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1568 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1569 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1573 /* Calculate temporary vectorial force */
1574 tx = _mm256_mul_pd(fscal,dx01);
1575 ty = _mm256_mul_pd(fscal,dy01);
1576 tz = _mm256_mul_pd(fscal,dz01);
1578 /* Update vectorial force */
1579 fix0 = _mm256_add_pd(fix0,tx);
1580 fiy0 = _mm256_add_pd(fiy0,ty);
1581 fiz0 = _mm256_add_pd(fiz0,tz);
1583 fjx1 = _mm256_add_pd(fjx1,tx);
1584 fjy1 = _mm256_add_pd(fjy1,ty);
1585 fjz1 = _mm256_add_pd(fjz1,tz);
1587 /**************************
1588 * CALCULATE INTERACTIONS *
1589 **************************/
1591 r02 = _mm256_mul_pd(rsq02,rinv02);
1593 /* Calculate table index by multiplying r with table scale and truncate to integer */
1594 rt = _mm256_mul_pd(r02,vftabscale);
1595 vfitab = _mm256_cvttpd_epi32(rt);
1596 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1597 vfitab = _mm_slli_epi32(vfitab,2);
1599 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1600 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1601 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1602 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1603 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1604 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1605 Heps = _mm256_mul_pd(vfeps,H);
1606 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1607 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1608 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
1612 /* Calculate temporary vectorial force */
1613 tx = _mm256_mul_pd(fscal,dx02);
1614 ty = _mm256_mul_pd(fscal,dy02);
1615 tz = _mm256_mul_pd(fscal,dz02);
1617 /* Update vectorial force */
1618 fix0 = _mm256_add_pd(fix0,tx);
1619 fiy0 = _mm256_add_pd(fiy0,ty);
1620 fiz0 = _mm256_add_pd(fiz0,tz);
1622 fjx2 = _mm256_add_pd(fjx2,tx);
1623 fjy2 = _mm256_add_pd(fjy2,ty);
1624 fjz2 = _mm256_add_pd(fjz2,tz);
1626 /**************************
1627 * CALCULATE INTERACTIONS *
1628 **************************/
1630 r10 = _mm256_mul_pd(rsq10,rinv10);
1632 /* Calculate table index by multiplying r with table scale and truncate to integer */
1633 rt = _mm256_mul_pd(r10,vftabscale);
1634 vfitab = _mm256_cvttpd_epi32(rt);
1635 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1636 vfitab = _mm_slli_epi32(vfitab,2);
1638 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1639 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1640 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1641 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1642 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1643 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1644 Heps = _mm256_mul_pd(vfeps,H);
1645 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1646 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1647 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1651 /* Calculate temporary vectorial force */
1652 tx = _mm256_mul_pd(fscal,dx10);
1653 ty = _mm256_mul_pd(fscal,dy10);
1654 tz = _mm256_mul_pd(fscal,dz10);
1656 /* Update vectorial force */
1657 fix1 = _mm256_add_pd(fix1,tx);
1658 fiy1 = _mm256_add_pd(fiy1,ty);
1659 fiz1 = _mm256_add_pd(fiz1,tz);
1661 fjx0 = _mm256_add_pd(fjx0,tx);
1662 fjy0 = _mm256_add_pd(fjy0,ty);
1663 fjz0 = _mm256_add_pd(fjz0,tz);
1665 /**************************
1666 * CALCULATE INTERACTIONS *
1667 **************************/
1669 r11 = _mm256_mul_pd(rsq11,rinv11);
1671 /* Calculate table index by multiplying r with table scale and truncate to integer */
1672 rt = _mm256_mul_pd(r11,vftabscale);
1673 vfitab = _mm256_cvttpd_epi32(rt);
1674 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1675 vfitab = _mm_slli_epi32(vfitab,2);
1677 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1678 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1679 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1680 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1681 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1682 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1683 Heps = _mm256_mul_pd(vfeps,H);
1684 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1685 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1686 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1690 /* Calculate temporary vectorial force */
1691 tx = _mm256_mul_pd(fscal,dx11);
1692 ty = _mm256_mul_pd(fscal,dy11);
1693 tz = _mm256_mul_pd(fscal,dz11);
1695 /* Update vectorial force */
1696 fix1 = _mm256_add_pd(fix1,tx);
1697 fiy1 = _mm256_add_pd(fiy1,ty);
1698 fiz1 = _mm256_add_pd(fiz1,tz);
1700 fjx1 = _mm256_add_pd(fjx1,tx);
1701 fjy1 = _mm256_add_pd(fjy1,ty);
1702 fjz1 = _mm256_add_pd(fjz1,tz);
1704 /**************************
1705 * CALCULATE INTERACTIONS *
1706 **************************/
1708 r12 = _mm256_mul_pd(rsq12,rinv12);
1710 /* Calculate table index by multiplying r with table scale and truncate to integer */
1711 rt = _mm256_mul_pd(r12,vftabscale);
1712 vfitab = _mm256_cvttpd_epi32(rt);
1713 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1714 vfitab = _mm_slli_epi32(vfitab,2);
1716 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1717 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1718 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1719 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1720 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1721 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1722 Heps = _mm256_mul_pd(vfeps,H);
1723 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1724 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1725 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1729 /* Calculate temporary vectorial force */
1730 tx = _mm256_mul_pd(fscal,dx12);
1731 ty = _mm256_mul_pd(fscal,dy12);
1732 tz = _mm256_mul_pd(fscal,dz12);
1734 /* Update vectorial force */
1735 fix1 = _mm256_add_pd(fix1,tx);
1736 fiy1 = _mm256_add_pd(fiy1,ty);
1737 fiz1 = _mm256_add_pd(fiz1,tz);
1739 fjx2 = _mm256_add_pd(fjx2,tx);
1740 fjy2 = _mm256_add_pd(fjy2,ty);
1741 fjz2 = _mm256_add_pd(fjz2,tz);
1743 /**************************
1744 * CALCULATE INTERACTIONS *
1745 **************************/
1747 r20 = _mm256_mul_pd(rsq20,rinv20);
1749 /* Calculate table index by multiplying r with table scale and truncate to integer */
1750 rt = _mm256_mul_pd(r20,vftabscale);
1751 vfitab = _mm256_cvttpd_epi32(rt);
1752 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1753 vfitab = _mm_slli_epi32(vfitab,2);
1755 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1756 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1757 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1758 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1759 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1760 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1761 Heps = _mm256_mul_pd(vfeps,H);
1762 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1763 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1764 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1768 /* Calculate temporary vectorial force */
1769 tx = _mm256_mul_pd(fscal,dx20);
1770 ty = _mm256_mul_pd(fscal,dy20);
1771 tz = _mm256_mul_pd(fscal,dz20);
1773 /* Update vectorial force */
1774 fix2 = _mm256_add_pd(fix2,tx);
1775 fiy2 = _mm256_add_pd(fiy2,ty);
1776 fiz2 = _mm256_add_pd(fiz2,tz);
1778 fjx0 = _mm256_add_pd(fjx0,tx);
1779 fjy0 = _mm256_add_pd(fjy0,ty);
1780 fjz0 = _mm256_add_pd(fjz0,tz);
1782 /**************************
1783 * CALCULATE INTERACTIONS *
1784 **************************/
1786 r21 = _mm256_mul_pd(rsq21,rinv21);
1788 /* Calculate table index by multiplying r with table scale and truncate to integer */
1789 rt = _mm256_mul_pd(r21,vftabscale);
1790 vfitab = _mm256_cvttpd_epi32(rt);
1791 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1792 vfitab = _mm_slli_epi32(vfitab,2);
1794 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1795 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1796 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1797 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1798 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1799 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1800 Heps = _mm256_mul_pd(vfeps,H);
1801 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1802 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1803 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1807 /* Calculate temporary vectorial force */
1808 tx = _mm256_mul_pd(fscal,dx21);
1809 ty = _mm256_mul_pd(fscal,dy21);
1810 tz = _mm256_mul_pd(fscal,dz21);
1812 /* Update vectorial force */
1813 fix2 = _mm256_add_pd(fix2,tx);
1814 fiy2 = _mm256_add_pd(fiy2,ty);
1815 fiz2 = _mm256_add_pd(fiz2,tz);
1817 fjx1 = _mm256_add_pd(fjx1,tx);
1818 fjy1 = _mm256_add_pd(fjy1,ty);
1819 fjz1 = _mm256_add_pd(fjz1,tz);
1821 /**************************
1822 * CALCULATE INTERACTIONS *
1823 **************************/
1825 r22 = _mm256_mul_pd(rsq22,rinv22);
1827 /* Calculate table index by multiplying r with table scale and truncate to integer */
1828 rt = _mm256_mul_pd(r22,vftabscale);
1829 vfitab = _mm256_cvttpd_epi32(rt);
1830 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1831 vfitab = _mm_slli_epi32(vfitab,2);
1833 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1834 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1835 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1836 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1837 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1838 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1839 Heps = _mm256_mul_pd(vfeps,H);
1840 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1841 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1842 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1846 /* Calculate temporary vectorial force */
1847 tx = _mm256_mul_pd(fscal,dx22);
1848 ty = _mm256_mul_pd(fscal,dy22);
1849 tz = _mm256_mul_pd(fscal,dz22);
1851 /* Update vectorial force */
1852 fix2 = _mm256_add_pd(fix2,tx);
1853 fiy2 = _mm256_add_pd(fiy2,ty);
1854 fiz2 = _mm256_add_pd(fiz2,tz);
1856 fjx2 = _mm256_add_pd(fjx2,tx);
1857 fjy2 = _mm256_add_pd(fjy2,ty);
1858 fjz2 = _mm256_add_pd(fjz2,tz);
1860 fjptrA = f+j_coord_offsetA;
1861 fjptrB = f+j_coord_offsetB;
1862 fjptrC = f+j_coord_offsetC;
1863 fjptrD = f+j_coord_offsetD;
1865 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1866 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1868 /* Inner loop uses 359 flops */
1871 if(jidx<j_index_end)
1874 /* Get j neighbor index, and coordinate index */
1875 jnrlistA = jjnr[jidx];
1876 jnrlistB = jjnr[jidx+1];
1877 jnrlistC = jjnr[jidx+2];
1878 jnrlistD = jjnr[jidx+3];
1879 /* Sign of each element will be negative for non-real atoms.
1880 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1881 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1883 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1885 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1886 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1887 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1889 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1890 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1891 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1892 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1893 j_coord_offsetA = DIM*jnrA;
1894 j_coord_offsetB = DIM*jnrB;
1895 j_coord_offsetC = DIM*jnrC;
1896 j_coord_offsetD = DIM*jnrD;
1898 /* load j atom coordinates */
1899 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1900 x+j_coord_offsetC,x+j_coord_offsetD,
1901 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1903 /* Calculate displacement vector */
1904 dx00 = _mm256_sub_pd(ix0,jx0);
1905 dy00 = _mm256_sub_pd(iy0,jy0);
1906 dz00 = _mm256_sub_pd(iz0,jz0);
1907 dx01 = _mm256_sub_pd(ix0,jx1);
1908 dy01 = _mm256_sub_pd(iy0,jy1);
1909 dz01 = _mm256_sub_pd(iz0,jz1);
1910 dx02 = _mm256_sub_pd(ix0,jx2);
1911 dy02 = _mm256_sub_pd(iy0,jy2);
1912 dz02 = _mm256_sub_pd(iz0,jz2);
1913 dx10 = _mm256_sub_pd(ix1,jx0);
1914 dy10 = _mm256_sub_pd(iy1,jy0);
1915 dz10 = _mm256_sub_pd(iz1,jz0);
1916 dx11 = _mm256_sub_pd(ix1,jx1);
1917 dy11 = _mm256_sub_pd(iy1,jy1);
1918 dz11 = _mm256_sub_pd(iz1,jz1);
1919 dx12 = _mm256_sub_pd(ix1,jx2);
1920 dy12 = _mm256_sub_pd(iy1,jy2);
1921 dz12 = _mm256_sub_pd(iz1,jz2);
1922 dx20 = _mm256_sub_pd(ix2,jx0);
1923 dy20 = _mm256_sub_pd(iy2,jy0);
1924 dz20 = _mm256_sub_pd(iz2,jz0);
1925 dx21 = _mm256_sub_pd(ix2,jx1);
1926 dy21 = _mm256_sub_pd(iy2,jy1);
1927 dz21 = _mm256_sub_pd(iz2,jz1);
1928 dx22 = _mm256_sub_pd(ix2,jx2);
1929 dy22 = _mm256_sub_pd(iy2,jy2);
1930 dz22 = _mm256_sub_pd(iz2,jz2);
1932 /* Calculate squared distance and things based on it */
1933 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1934 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1935 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1936 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1937 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1938 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1939 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1940 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1941 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1943 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1944 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1945 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1946 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1947 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1948 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1949 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1950 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1951 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1953 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1955 fjx0 = _mm256_setzero_pd();
1956 fjy0 = _mm256_setzero_pd();
1957 fjz0 = _mm256_setzero_pd();
1958 fjx1 = _mm256_setzero_pd();
1959 fjy1 = _mm256_setzero_pd();
1960 fjz1 = _mm256_setzero_pd();
1961 fjx2 = _mm256_setzero_pd();
1962 fjy2 = _mm256_setzero_pd();
1963 fjz2 = _mm256_setzero_pd();
1965 /**************************
1966 * CALCULATE INTERACTIONS *
1967 **************************/
1969 r00 = _mm256_mul_pd(rsq00,rinv00);
1970 r00 = _mm256_andnot_pd(dummy_mask,r00);
1972 /* Calculate table index by multiplying r with table scale and truncate to integer */
1973 rt = _mm256_mul_pd(r00,vftabscale);
1974 vfitab = _mm256_cvttpd_epi32(rt);
1975 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1976 vfitab = _mm_slli_epi32(vfitab,2);
1978 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1979 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1980 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1981 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1982 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1983 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1984 Heps = _mm256_mul_pd(vfeps,H);
1985 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1986 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1987 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1989 /* LENNARD-JONES DISPERSION/REPULSION */
1991 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1992 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1994 fscal = _mm256_add_pd(felec,fvdw);
1996 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1998 /* Calculate temporary vectorial force */
1999 tx = _mm256_mul_pd(fscal,dx00);
2000 ty = _mm256_mul_pd(fscal,dy00);
2001 tz = _mm256_mul_pd(fscal,dz00);
2003 /* Update vectorial force */
2004 fix0 = _mm256_add_pd(fix0,tx);
2005 fiy0 = _mm256_add_pd(fiy0,ty);
2006 fiz0 = _mm256_add_pd(fiz0,tz);
2008 fjx0 = _mm256_add_pd(fjx0,tx);
2009 fjy0 = _mm256_add_pd(fjy0,ty);
2010 fjz0 = _mm256_add_pd(fjz0,tz);
2012 /**************************
2013 * CALCULATE INTERACTIONS *
2014 **************************/
2016 r01 = _mm256_mul_pd(rsq01,rinv01);
2017 r01 = _mm256_andnot_pd(dummy_mask,r01);
2019 /* Calculate table index by multiplying r with table scale and truncate to integer */
2020 rt = _mm256_mul_pd(r01,vftabscale);
2021 vfitab = _mm256_cvttpd_epi32(rt);
2022 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2023 vfitab = _mm_slli_epi32(vfitab,2);
2025 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2026 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2027 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2028 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2029 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2030 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2031 Heps = _mm256_mul_pd(vfeps,H);
2032 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2033 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2034 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
2038 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2040 /* Calculate temporary vectorial force */
2041 tx = _mm256_mul_pd(fscal,dx01);
2042 ty = _mm256_mul_pd(fscal,dy01);
2043 tz = _mm256_mul_pd(fscal,dz01);
2045 /* Update vectorial force */
2046 fix0 = _mm256_add_pd(fix0,tx);
2047 fiy0 = _mm256_add_pd(fiy0,ty);
2048 fiz0 = _mm256_add_pd(fiz0,tz);
2050 fjx1 = _mm256_add_pd(fjx1,tx);
2051 fjy1 = _mm256_add_pd(fjy1,ty);
2052 fjz1 = _mm256_add_pd(fjz1,tz);
2054 /**************************
2055 * CALCULATE INTERACTIONS *
2056 **************************/
2058 r02 = _mm256_mul_pd(rsq02,rinv02);
2059 r02 = _mm256_andnot_pd(dummy_mask,r02);
2061 /* Calculate table index by multiplying r with table scale and truncate to integer */
2062 rt = _mm256_mul_pd(r02,vftabscale);
2063 vfitab = _mm256_cvttpd_epi32(rt);
2064 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2065 vfitab = _mm_slli_epi32(vfitab,2);
2067 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2068 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2069 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2070 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2071 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2072 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2073 Heps = _mm256_mul_pd(vfeps,H);
2074 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2075 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2076 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
2080 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2082 /* Calculate temporary vectorial force */
2083 tx = _mm256_mul_pd(fscal,dx02);
2084 ty = _mm256_mul_pd(fscal,dy02);
2085 tz = _mm256_mul_pd(fscal,dz02);
2087 /* Update vectorial force */
2088 fix0 = _mm256_add_pd(fix0,tx);
2089 fiy0 = _mm256_add_pd(fiy0,ty);
2090 fiz0 = _mm256_add_pd(fiz0,tz);
2092 fjx2 = _mm256_add_pd(fjx2,tx);
2093 fjy2 = _mm256_add_pd(fjy2,ty);
2094 fjz2 = _mm256_add_pd(fjz2,tz);
2096 /**************************
2097 * CALCULATE INTERACTIONS *
2098 **************************/
2100 r10 = _mm256_mul_pd(rsq10,rinv10);
2101 r10 = _mm256_andnot_pd(dummy_mask,r10);
2103 /* Calculate table index by multiplying r with table scale and truncate to integer */
2104 rt = _mm256_mul_pd(r10,vftabscale);
2105 vfitab = _mm256_cvttpd_epi32(rt);
2106 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2107 vfitab = _mm_slli_epi32(vfitab,2);
2109 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2110 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2111 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2112 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2113 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2114 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2115 Heps = _mm256_mul_pd(vfeps,H);
2116 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2117 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2118 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
2122 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2124 /* Calculate temporary vectorial force */
2125 tx = _mm256_mul_pd(fscal,dx10);
2126 ty = _mm256_mul_pd(fscal,dy10);
2127 tz = _mm256_mul_pd(fscal,dz10);
2129 /* Update vectorial force */
2130 fix1 = _mm256_add_pd(fix1,tx);
2131 fiy1 = _mm256_add_pd(fiy1,ty);
2132 fiz1 = _mm256_add_pd(fiz1,tz);
2134 fjx0 = _mm256_add_pd(fjx0,tx);
2135 fjy0 = _mm256_add_pd(fjy0,ty);
2136 fjz0 = _mm256_add_pd(fjz0,tz);
2138 /**************************
2139 * CALCULATE INTERACTIONS *
2140 **************************/
2142 r11 = _mm256_mul_pd(rsq11,rinv11);
2143 r11 = _mm256_andnot_pd(dummy_mask,r11);
2145 /* Calculate table index by multiplying r with table scale and truncate to integer */
2146 rt = _mm256_mul_pd(r11,vftabscale);
2147 vfitab = _mm256_cvttpd_epi32(rt);
2148 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2149 vfitab = _mm_slli_epi32(vfitab,2);
2151 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2152 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2153 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2154 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2155 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2156 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2157 Heps = _mm256_mul_pd(vfeps,H);
2158 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2159 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2160 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2164 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2166 /* Calculate temporary vectorial force */
2167 tx = _mm256_mul_pd(fscal,dx11);
2168 ty = _mm256_mul_pd(fscal,dy11);
2169 tz = _mm256_mul_pd(fscal,dz11);
2171 /* Update vectorial force */
2172 fix1 = _mm256_add_pd(fix1,tx);
2173 fiy1 = _mm256_add_pd(fiy1,ty);
2174 fiz1 = _mm256_add_pd(fiz1,tz);
2176 fjx1 = _mm256_add_pd(fjx1,tx);
2177 fjy1 = _mm256_add_pd(fjy1,ty);
2178 fjz1 = _mm256_add_pd(fjz1,tz);
2180 /**************************
2181 * CALCULATE INTERACTIONS *
2182 **************************/
2184 r12 = _mm256_mul_pd(rsq12,rinv12);
2185 r12 = _mm256_andnot_pd(dummy_mask,r12);
2187 /* Calculate table index by multiplying r with table scale and truncate to integer */
2188 rt = _mm256_mul_pd(r12,vftabscale);
2189 vfitab = _mm256_cvttpd_epi32(rt);
2190 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2191 vfitab = _mm_slli_epi32(vfitab,2);
2193 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2194 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2195 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2196 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2197 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2198 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2199 Heps = _mm256_mul_pd(vfeps,H);
2200 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2201 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2202 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2206 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2208 /* Calculate temporary vectorial force */
2209 tx = _mm256_mul_pd(fscal,dx12);
2210 ty = _mm256_mul_pd(fscal,dy12);
2211 tz = _mm256_mul_pd(fscal,dz12);
2213 /* Update vectorial force */
2214 fix1 = _mm256_add_pd(fix1,tx);
2215 fiy1 = _mm256_add_pd(fiy1,ty);
2216 fiz1 = _mm256_add_pd(fiz1,tz);
2218 fjx2 = _mm256_add_pd(fjx2,tx);
2219 fjy2 = _mm256_add_pd(fjy2,ty);
2220 fjz2 = _mm256_add_pd(fjz2,tz);
2222 /**************************
2223 * CALCULATE INTERACTIONS *
2224 **************************/
2226 r20 = _mm256_mul_pd(rsq20,rinv20);
2227 r20 = _mm256_andnot_pd(dummy_mask,r20);
2229 /* Calculate table index by multiplying r with table scale and truncate to integer */
2230 rt = _mm256_mul_pd(r20,vftabscale);
2231 vfitab = _mm256_cvttpd_epi32(rt);
2232 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2233 vfitab = _mm_slli_epi32(vfitab,2);
2235 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2236 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2237 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2238 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2239 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2240 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2241 Heps = _mm256_mul_pd(vfeps,H);
2242 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2243 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2244 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
2248 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2250 /* Calculate temporary vectorial force */
2251 tx = _mm256_mul_pd(fscal,dx20);
2252 ty = _mm256_mul_pd(fscal,dy20);
2253 tz = _mm256_mul_pd(fscal,dz20);
2255 /* Update vectorial force */
2256 fix2 = _mm256_add_pd(fix2,tx);
2257 fiy2 = _mm256_add_pd(fiy2,ty);
2258 fiz2 = _mm256_add_pd(fiz2,tz);
2260 fjx0 = _mm256_add_pd(fjx0,tx);
2261 fjy0 = _mm256_add_pd(fjy0,ty);
2262 fjz0 = _mm256_add_pd(fjz0,tz);
2264 /**************************
2265 * CALCULATE INTERACTIONS *
2266 **************************/
2268 r21 = _mm256_mul_pd(rsq21,rinv21);
2269 r21 = _mm256_andnot_pd(dummy_mask,r21);
2271 /* Calculate table index by multiplying r with table scale and truncate to integer */
2272 rt = _mm256_mul_pd(r21,vftabscale);
2273 vfitab = _mm256_cvttpd_epi32(rt);
2274 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2275 vfitab = _mm_slli_epi32(vfitab,2);
2277 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2278 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2279 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2280 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2281 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2282 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2283 Heps = _mm256_mul_pd(vfeps,H);
2284 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2285 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2286 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2290 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2292 /* Calculate temporary vectorial force */
2293 tx = _mm256_mul_pd(fscal,dx21);
2294 ty = _mm256_mul_pd(fscal,dy21);
2295 tz = _mm256_mul_pd(fscal,dz21);
2297 /* Update vectorial force */
2298 fix2 = _mm256_add_pd(fix2,tx);
2299 fiy2 = _mm256_add_pd(fiy2,ty);
2300 fiz2 = _mm256_add_pd(fiz2,tz);
2302 fjx1 = _mm256_add_pd(fjx1,tx);
2303 fjy1 = _mm256_add_pd(fjy1,ty);
2304 fjz1 = _mm256_add_pd(fjz1,tz);
2306 /**************************
2307 * CALCULATE INTERACTIONS *
2308 **************************/
2310 r22 = _mm256_mul_pd(rsq22,rinv22);
2311 r22 = _mm256_andnot_pd(dummy_mask,r22);
2313 /* Calculate table index by multiplying r with table scale and truncate to integer */
2314 rt = _mm256_mul_pd(r22,vftabscale);
2315 vfitab = _mm256_cvttpd_epi32(rt);
2316 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2317 vfitab = _mm_slli_epi32(vfitab,2);
2319 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2320 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2321 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2322 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2323 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2324 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2325 Heps = _mm256_mul_pd(vfeps,H);
2326 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2327 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2328 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2332 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2334 /* Calculate temporary vectorial force */
2335 tx = _mm256_mul_pd(fscal,dx22);
2336 ty = _mm256_mul_pd(fscal,dy22);
2337 tz = _mm256_mul_pd(fscal,dz22);
2339 /* Update vectorial force */
2340 fix2 = _mm256_add_pd(fix2,tx);
2341 fiy2 = _mm256_add_pd(fiy2,ty);
2342 fiz2 = _mm256_add_pd(fiz2,tz);
2344 fjx2 = _mm256_add_pd(fjx2,tx);
2345 fjy2 = _mm256_add_pd(fjy2,ty);
2346 fjz2 = _mm256_add_pd(fjz2,tz);
2348 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2349 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2350 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2351 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2353 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2354 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2356 /* Inner loop uses 368 flops */
2359 /* End of innermost loop */
2361 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2362 f+i_coord_offset,fshift+i_shift_offset);
2364 /* Increment number of inner iterations */
2365 inneriter += j_index_end - j_index_start;
2367 /* Outer loop uses 18 flops */
2370 /* Increment number of outer iterations */
2373 /* Update outer/inner flops */
2375 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);