2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
114 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
116 __m128i ifour = _mm_set1_epi32(4);
117 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
119 __m256d dummy_mask,cutoff_mask;
120 __m128 tmpmask0,tmpmask1;
121 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
122 __m256d one = _mm256_set1_pd(1.0);
123 __m256d two = _mm256_set1_pd(2.0);
129 jindex = nlist->jindex;
131 shiftidx = nlist->shift;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm256_set1_pd(fr->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
139 vdwtype = mdatoms->typeA;
141 vftab = kernel_data->table_elec->data;
142 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
144 /* Setup water-specific parameters */
145 inr = nlist->iinr[0];
146 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
147 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
148 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
149 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
151 jq0 = _mm256_set1_pd(charge[inr+0]);
152 jq1 = _mm256_set1_pd(charge[inr+1]);
153 jq2 = _mm256_set1_pd(charge[inr+2]);
154 vdwjidx0A = 2*vdwtype[inr+0];
155 qq00 = _mm256_mul_pd(iq0,jq0);
156 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
157 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
158 qq01 = _mm256_mul_pd(iq0,jq1);
159 qq02 = _mm256_mul_pd(iq0,jq2);
160 qq10 = _mm256_mul_pd(iq1,jq0);
161 qq11 = _mm256_mul_pd(iq1,jq1);
162 qq12 = _mm256_mul_pd(iq1,jq2);
163 qq20 = _mm256_mul_pd(iq2,jq0);
164 qq21 = _mm256_mul_pd(iq2,jq1);
165 qq22 = _mm256_mul_pd(iq2,jq2);
167 /* Avoid stupid compiler warnings */
168 jnrA = jnrB = jnrC = jnrD = 0;
177 for(iidx=0;iidx<4*DIM;iidx++)
182 /* Start outer loop over neighborlists */
183 for(iidx=0; iidx<nri; iidx++)
185 /* Load shift vector for this list */
186 i_shift_offset = DIM*shiftidx[iidx];
188 /* Load limits for loop over neighbors */
189 j_index_start = jindex[iidx];
190 j_index_end = jindex[iidx+1];
192 /* Get outer coordinate index */
194 i_coord_offset = DIM*inr;
196 /* Load i particle coords and add shift vector */
197 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
198 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
200 fix0 = _mm256_setzero_pd();
201 fiy0 = _mm256_setzero_pd();
202 fiz0 = _mm256_setzero_pd();
203 fix1 = _mm256_setzero_pd();
204 fiy1 = _mm256_setzero_pd();
205 fiz1 = _mm256_setzero_pd();
206 fix2 = _mm256_setzero_pd();
207 fiy2 = _mm256_setzero_pd();
208 fiz2 = _mm256_setzero_pd();
210 /* Reset potential sums */
211 velecsum = _mm256_setzero_pd();
212 vvdwsum = _mm256_setzero_pd();
214 /* Start inner kernel loop */
215 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
218 /* Get j neighbor index, and coordinate index */
223 j_coord_offsetA = DIM*jnrA;
224 j_coord_offsetB = DIM*jnrB;
225 j_coord_offsetC = DIM*jnrC;
226 j_coord_offsetD = DIM*jnrD;
228 /* load j atom coordinates */
229 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
230 x+j_coord_offsetC,x+j_coord_offsetD,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
233 /* Calculate displacement vector */
234 dx00 = _mm256_sub_pd(ix0,jx0);
235 dy00 = _mm256_sub_pd(iy0,jy0);
236 dz00 = _mm256_sub_pd(iz0,jz0);
237 dx01 = _mm256_sub_pd(ix0,jx1);
238 dy01 = _mm256_sub_pd(iy0,jy1);
239 dz01 = _mm256_sub_pd(iz0,jz1);
240 dx02 = _mm256_sub_pd(ix0,jx2);
241 dy02 = _mm256_sub_pd(iy0,jy2);
242 dz02 = _mm256_sub_pd(iz0,jz2);
243 dx10 = _mm256_sub_pd(ix1,jx0);
244 dy10 = _mm256_sub_pd(iy1,jy0);
245 dz10 = _mm256_sub_pd(iz1,jz0);
246 dx11 = _mm256_sub_pd(ix1,jx1);
247 dy11 = _mm256_sub_pd(iy1,jy1);
248 dz11 = _mm256_sub_pd(iz1,jz1);
249 dx12 = _mm256_sub_pd(ix1,jx2);
250 dy12 = _mm256_sub_pd(iy1,jy2);
251 dz12 = _mm256_sub_pd(iz1,jz2);
252 dx20 = _mm256_sub_pd(ix2,jx0);
253 dy20 = _mm256_sub_pd(iy2,jy0);
254 dz20 = _mm256_sub_pd(iz2,jz0);
255 dx21 = _mm256_sub_pd(ix2,jx1);
256 dy21 = _mm256_sub_pd(iy2,jy1);
257 dz21 = _mm256_sub_pd(iz2,jz1);
258 dx22 = _mm256_sub_pd(ix2,jx2);
259 dy22 = _mm256_sub_pd(iy2,jy2);
260 dz22 = _mm256_sub_pd(iz2,jz2);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
264 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
265 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
266 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
267 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
268 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
269 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
270 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
271 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
273 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
274 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
275 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
276 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
277 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
278 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
279 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
280 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
281 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
283 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
285 fjx0 = _mm256_setzero_pd();
286 fjy0 = _mm256_setzero_pd();
287 fjz0 = _mm256_setzero_pd();
288 fjx1 = _mm256_setzero_pd();
289 fjy1 = _mm256_setzero_pd();
290 fjz1 = _mm256_setzero_pd();
291 fjx2 = _mm256_setzero_pd();
292 fjy2 = _mm256_setzero_pd();
293 fjz2 = _mm256_setzero_pd();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 r00 = _mm256_mul_pd(rsq00,rinv00);
301 /* Calculate table index by multiplying r with table scale and truncate to integer */
302 rt = _mm256_mul_pd(r00,vftabscale);
303 vfitab = _mm256_cvttpd_epi32(rt);
304 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
305 vfitab = _mm_slli_epi32(vfitab,2);
307 /* CUBIC SPLINE TABLE ELECTROSTATICS */
308 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
309 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
310 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
311 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
312 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
313 Heps = _mm256_mul_pd(vfeps,H);
314 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
315 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
316 velec = _mm256_mul_pd(qq00,VV);
317 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
318 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
320 /* LENNARD-JONES DISPERSION/REPULSION */
322 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
323 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
324 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
325 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
326 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
328 /* Update potential sum for this i atom from the interaction with this j atom. */
329 velecsum = _mm256_add_pd(velecsum,velec);
330 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
332 fscal = _mm256_add_pd(felec,fvdw);
334 /* Calculate temporary vectorial force */
335 tx = _mm256_mul_pd(fscal,dx00);
336 ty = _mm256_mul_pd(fscal,dy00);
337 tz = _mm256_mul_pd(fscal,dz00);
339 /* Update vectorial force */
340 fix0 = _mm256_add_pd(fix0,tx);
341 fiy0 = _mm256_add_pd(fiy0,ty);
342 fiz0 = _mm256_add_pd(fiz0,tz);
344 fjx0 = _mm256_add_pd(fjx0,tx);
345 fjy0 = _mm256_add_pd(fjy0,ty);
346 fjz0 = _mm256_add_pd(fjz0,tz);
348 /**************************
349 * CALCULATE INTERACTIONS *
350 **************************/
352 r01 = _mm256_mul_pd(rsq01,rinv01);
354 /* Calculate table index by multiplying r with table scale and truncate to integer */
355 rt = _mm256_mul_pd(r01,vftabscale);
356 vfitab = _mm256_cvttpd_epi32(rt);
357 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
358 vfitab = _mm_slli_epi32(vfitab,2);
360 /* CUBIC SPLINE TABLE ELECTROSTATICS */
361 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
362 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
363 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
364 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
365 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
366 Heps = _mm256_mul_pd(vfeps,H);
367 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
368 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
369 velec = _mm256_mul_pd(qq01,VV);
370 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
371 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
373 /* Update potential sum for this i atom from the interaction with this j atom. */
374 velecsum = _mm256_add_pd(velecsum,velec);
378 /* Calculate temporary vectorial force */
379 tx = _mm256_mul_pd(fscal,dx01);
380 ty = _mm256_mul_pd(fscal,dy01);
381 tz = _mm256_mul_pd(fscal,dz01);
383 /* Update vectorial force */
384 fix0 = _mm256_add_pd(fix0,tx);
385 fiy0 = _mm256_add_pd(fiy0,ty);
386 fiz0 = _mm256_add_pd(fiz0,tz);
388 fjx1 = _mm256_add_pd(fjx1,tx);
389 fjy1 = _mm256_add_pd(fjy1,ty);
390 fjz1 = _mm256_add_pd(fjz1,tz);
392 /**************************
393 * CALCULATE INTERACTIONS *
394 **************************/
396 r02 = _mm256_mul_pd(rsq02,rinv02);
398 /* Calculate table index by multiplying r with table scale and truncate to integer */
399 rt = _mm256_mul_pd(r02,vftabscale);
400 vfitab = _mm256_cvttpd_epi32(rt);
401 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
402 vfitab = _mm_slli_epi32(vfitab,2);
404 /* CUBIC SPLINE TABLE ELECTROSTATICS */
405 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
406 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
407 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
408 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
409 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
410 Heps = _mm256_mul_pd(vfeps,H);
411 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
412 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
413 velec = _mm256_mul_pd(qq02,VV);
414 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
415 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velecsum = _mm256_add_pd(velecsum,velec);
422 /* Calculate temporary vectorial force */
423 tx = _mm256_mul_pd(fscal,dx02);
424 ty = _mm256_mul_pd(fscal,dy02);
425 tz = _mm256_mul_pd(fscal,dz02);
427 /* Update vectorial force */
428 fix0 = _mm256_add_pd(fix0,tx);
429 fiy0 = _mm256_add_pd(fiy0,ty);
430 fiz0 = _mm256_add_pd(fiz0,tz);
432 fjx2 = _mm256_add_pd(fjx2,tx);
433 fjy2 = _mm256_add_pd(fjy2,ty);
434 fjz2 = _mm256_add_pd(fjz2,tz);
436 /**************************
437 * CALCULATE INTERACTIONS *
438 **************************/
440 r10 = _mm256_mul_pd(rsq10,rinv10);
442 /* Calculate table index by multiplying r with table scale and truncate to integer */
443 rt = _mm256_mul_pd(r10,vftabscale);
444 vfitab = _mm256_cvttpd_epi32(rt);
445 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
446 vfitab = _mm_slli_epi32(vfitab,2);
448 /* CUBIC SPLINE TABLE ELECTROSTATICS */
449 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
450 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
451 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
452 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
453 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
454 Heps = _mm256_mul_pd(vfeps,H);
455 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
456 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
457 velec = _mm256_mul_pd(qq10,VV);
458 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
459 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velecsum = _mm256_add_pd(velecsum,velec);
466 /* Calculate temporary vectorial force */
467 tx = _mm256_mul_pd(fscal,dx10);
468 ty = _mm256_mul_pd(fscal,dy10);
469 tz = _mm256_mul_pd(fscal,dz10);
471 /* Update vectorial force */
472 fix1 = _mm256_add_pd(fix1,tx);
473 fiy1 = _mm256_add_pd(fiy1,ty);
474 fiz1 = _mm256_add_pd(fiz1,tz);
476 fjx0 = _mm256_add_pd(fjx0,tx);
477 fjy0 = _mm256_add_pd(fjy0,ty);
478 fjz0 = _mm256_add_pd(fjz0,tz);
480 /**************************
481 * CALCULATE INTERACTIONS *
482 **************************/
484 r11 = _mm256_mul_pd(rsq11,rinv11);
486 /* Calculate table index by multiplying r with table scale and truncate to integer */
487 rt = _mm256_mul_pd(r11,vftabscale);
488 vfitab = _mm256_cvttpd_epi32(rt);
489 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
490 vfitab = _mm_slli_epi32(vfitab,2);
492 /* CUBIC SPLINE TABLE ELECTROSTATICS */
493 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
494 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
495 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
496 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
497 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
498 Heps = _mm256_mul_pd(vfeps,H);
499 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
500 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
501 velec = _mm256_mul_pd(qq11,VV);
502 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
503 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velecsum = _mm256_add_pd(velecsum,velec);
510 /* Calculate temporary vectorial force */
511 tx = _mm256_mul_pd(fscal,dx11);
512 ty = _mm256_mul_pd(fscal,dy11);
513 tz = _mm256_mul_pd(fscal,dz11);
515 /* Update vectorial force */
516 fix1 = _mm256_add_pd(fix1,tx);
517 fiy1 = _mm256_add_pd(fiy1,ty);
518 fiz1 = _mm256_add_pd(fiz1,tz);
520 fjx1 = _mm256_add_pd(fjx1,tx);
521 fjy1 = _mm256_add_pd(fjy1,ty);
522 fjz1 = _mm256_add_pd(fjz1,tz);
524 /**************************
525 * CALCULATE INTERACTIONS *
526 **************************/
528 r12 = _mm256_mul_pd(rsq12,rinv12);
530 /* Calculate table index by multiplying r with table scale and truncate to integer */
531 rt = _mm256_mul_pd(r12,vftabscale);
532 vfitab = _mm256_cvttpd_epi32(rt);
533 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
534 vfitab = _mm_slli_epi32(vfitab,2);
536 /* CUBIC SPLINE TABLE ELECTROSTATICS */
537 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
538 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
539 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
540 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
541 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
542 Heps = _mm256_mul_pd(vfeps,H);
543 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
544 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
545 velec = _mm256_mul_pd(qq12,VV);
546 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
547 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
549 /* Update potential sum for this i atom from the interaction with this j atom. */
550 velecsum = _mm256_add_pd(velecsum,velec);
554 /* Calculate temporary vectorial force */
555 tx = _mm256_mul_pd(fscal,dx12);
556 ty = _mm256_mul_pd(fscal,dy12);
557 tz = _mm256_mul_pd(fscal,dz12);
559 /* Update vectorial force */
560 fix1 = _mm256_add_pd(fix1,tx);
561 fiy1 = _mm256_add_pd(fiy1,ty);
562 fiz1 = _mm256_add_pd(fiz1,tz);
564 fjx2 = _mm256_add_pd(fjx2,tx);
565 fjy2 = _mm256_add_pd(fjy2,ty);
566 fjz2 = _mm256_add_pd(fjz2,tz);
568 /**************************
569 * CALCULATE INTERACTIONS *
570 **************************/
572 r20 = _mm256_mul_pd(rsq20,rinv20);
574 /* Calculate table index by multiplying r with table scale and truncate to integer */
575 rt = _mm256_mul_pd(r20,vftabscale);
576 vfitab = _mm256_cvttpd_epi32(rt);
577 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
578 vfitab = _mm_slli_epi32(vfitab,2);
580 /* CUBIC SPLINE TABLE ELECTROSTATICS */
581 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
582 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
583 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
584 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
585 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
586 Heps = _mm256_mul_pd(vfeps,H);
587 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
588 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
589 velec = _mm256_mul_pd(qq20,VV);
590 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
591 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
593 /* Update potential sum for this i atom from the interaction with this j atom. */
594 velecsum = _mm256_add_pd(velecsum,velec);
598 /* Calculate temporary vectorial force */
599 tx = _mm256_mul_pd(fscal,dx20);
600 ty = _mm256_mul_pd(fscal,dy20);
601 tz = _mm256_mul_pd(fscal,dz20);
603 /* Update vectorial force */
604 fix2 = _mm256_add_pd(fix2,tx);
605 fiy2 = _mm256_add_pd(fiy2,ty);
606 fiz2 = _mm256_add_pd(fiz2,tz);
608 fjx0 = _mm256_add_pd(fjx0,tx);
609 fjy0 = _mm256_add_pd(fjy0,ty);
610 fjz0 = _mm256_add_pd(fjz0,tz);
612 /**************************
613 * CALCULATE INTERACTIONS *
614 **************************/
616 r21 = _mm256_mul_pd(rsq21,rinv21);
618 /* Calculate table index by multiplying r with table scale and truncate to integer */
619 rt = _mm256_mul_pd(r21,vftabscale);
620 vfitab = _mm256_cvttpd_epi32(rt);
621 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
622 vfitab = _mm_slli_epi32(vfitab,2);
624 /* CUBIC SPLINE TABLE ELECTROSTATICS */
625 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
626 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
627 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
628 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
629 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
630 Heps = _mm256_mul_pd(vfeps,H);
631 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
632 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
633 velec = _mm256_mul_pd(qq21,VV);
634 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
635 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
637 /* Update potential sum for this i atom from the interaction with this j atom. */
638 velecsum = _mm256_add_pd(velecsum,velec);
642 /* Calculate temporary vectorial force */
643 tx = _mm256_mul_pd(fscal,dx21);
644 ty = _mm256_mul_pd(fscal,dy21);
645 tz = _mm256_mul_pd(fscal,dz21);
647 /* Update vectorial force */
648 fix2 = _mm256_add_pd(fix2,tx);
649 fiy2 = _mm256_add_pd(fiy2,ty);
650 fiz2 = _mm256_add_pd(fiz2,tz);
652 fjx1 = _mm256_add_pd(fjx1,tx);
653 fjy1 = _mm256_add_pd(fjy1,ty);
654 fjz1 = _mm256_add_pd(fjz1,tz);
656 /**************************
657 * CALCULATE INTERACTIONS *
658 **************************/
660 r22 = _mm256_mul_pd(rsq22,rinv22);
662 /* Calculate table index by multiplying r with table scale and truncate to integer */
663 rt = _mm256_mul_pd(r22,vftabscale);
664 vfitab = _mm256_cvttpd_epi32(rt);
665 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
666 vfitab = _mm_slli_epi32(vfitab,2);
668 /* CUBIC SPLINE TABLE ELECTROSTATICS */
669 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
670 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
671 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
672 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
673 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
674 Heps = _mm256_mul_pd(vfeps,H);
675 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
676 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
677 velec = _mm256_mul_pd(qq22,VV);
678 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
679 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
681 /* Update potential sum for this i atom from the interaction with this j atom. */
682 velecsum = _mm256_add_pd(velecsum,velec);
686 /* Calculate temporary vectorial force */
687 tx = _mm256_mul_pd(fscal,dx22);
688 ty = _mm256_mul_pd(fscal,dy22);
689 tz = _mm256_mul_pd(fscal,dz22);
691 /* Update vectorial force */
692 fix2 = _mm256_add_pd(fix2,tx);
693 fiy2 = _mm256_add_pd(fiy2,ty);
694 fiz2 = _mm256_add_pd(fiz2,tz);
696 fjx2 = _mm256_add_pd(fjx2,tx);
697 fjy2 = _mm256_add_pd(fjy2,ty);
698 fjz2 = _mm256_add_pd(fjz2,tz);
700 fjptrA = f+j_coord_offsetA;
701 fjptrB = f+j_coord_offsetB;
702 fjptrC = f+j_coord_offsetC;
703 fjptrD = f+j_coord_offsetD;
705 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
706 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
708 /* Inner loop uses 400 flops */
714 /* Get j neighbor index, and coordinate index */
715 jnrlistA = jjnr[jidx];
716 jnrlistB = jjnr[jidx+1];
717 jnrlistC = jjnr[jidx+2];
718 jnrlistD = jjnr[jidx+3];
719 /* Sign of each element will be negative for non-real atoms.
720 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
721 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
723 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
725 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
726 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
727 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
729 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
730 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
731 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
732 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
733 j_coord_offsetA = DIM*jnrA;
734 j_coord_offsetB = DIM*jnrB;
735 j_coord_offsetC = DIM*jnrC;
736 j_coord_offsetD = DIM*jnrD;
738 /* load j atom coordinates */
739 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
740 x+j_coord_offsetC,x+j_coord_offsetD,
741 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
743 /* Calculate displacement vector */
744 dx00 = _mm256_sub_pd(ix0,jx0);
745 dy00 = _mm256_sub_pd(iy0,jy0);
746 dz00 = _mm256_sub_pd(iz0,jz0);
747 dx01 = _mm256_sub_pd(ix0,jx1);
748 dy01 = _mm256_sub_pd(iy0,jy1);
749 dz01 = _mm256_sub_pd(iz0,jz1);
750 dx02 = _mm256_sub_pd(ix0,jx2);
751 dy02 = _mm256_sub_pd(iy0,jy2);
752 dz02 = _mm256_sub_pd(iz0,jz2);
753 dx10 = _mm256_sub_pd(ix1,jx0);
754 dy10 = _mm256_sub_pd(iy1,jy0);
755 dz10 = _mm256_sub_pd(iz1,jz0);
756 dx11 = _mm256_sub_pd(ix1,jx1);
757 dy11 = _mm256_sub_pd(iy1,jy1);
758 dz11 = _mm256_sub_pd(iz1,jz1);
759 dx12 = _mm256_sub_pd(ix1,jx2);
760 dy12 = _mm256_sub_pd(iy1,jy2);
761 dz12 = _mm256_sub_pd(iz1,jz2);
762 dx20 = _mm256_sub_pd(ix2,jx0);
763 dy20 = _mm256_sub_pd(iy2,jy0);
764 dz20 = _mm256_sub_pd(iz2,jz0);
765 dx21 = _mm256_sub_pd(ix2,jx1);
766 dy21 = _mm256_sub_pd(iy2,jy1);
767 dz21 = _mm256_sub_pd(iz2,jz1);
768 dx22 = _mm256_sub_pd(ix2,jx2);
769 dy22 = _mm256_sub_pd(iy2,jy2);
770 dz22 = _mm256_sub_pd(iz2,jz2);
772 /* Calculate squared distance and things based on it */
773 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
774 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
775 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
776 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
777 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
778 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
779 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
780 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
781 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
783 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
784 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
785 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
786 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
787 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
788 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
789 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
790 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
791 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
793 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
795 fjx0 = _mm256_setzero_pd();
796 fjy0 = _mm256_setzero_pd();
797 fjz0 = _mm256_setzero_pd();
798 fjx1 = _mm256_setzero_pd();
799 fjy1 = _mm256_setzero_pd();
800 fjz1 = _mm256_setzero_pd();
801 fjx2 = _mm256_setzero_pd();
802 fjy2 = _mm256_setzero_pd();
803 fjz2 = _mm256_setzero_pd();
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 r00 = _mm256_mul_pd(rsq00,rinv00);
810 r00 = _mm256_andnot_pd(dummy_mask,r00);
812 /* Calculate table index by multiplying r with table scale and truncate to integer */
813 rt = _mm256_mul_pd(r00,vftabscale);
814 vfitab = _mm256_cvttpd_epi32(rt);
815 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
816 vfitab = _mm_slli_epi32(vfitab,2);
818 /* CUBIC SPLINE TABLE ELECTROSTATICS */
819 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
820 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
821 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
822 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
823 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
824 Heps = _mm256_mul_pd(vfeps,H);
825 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
826 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
827 velec = _mm256_mul_pd(qq00,VV);
828 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
829 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
831 /* LENNARD-JONES DISPERSION/REPULSION */
833 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
834 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
835 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
836 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
837 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
839 /* Update potential sum for this i atom from the interaction with this j atom. */
840 velec = _mm256_andnot_pd(dummy_mask,velec);
841 velecsum = _mm256_add_pd(velecsum,velec);
842 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
843 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
845 fscal = _mm256_add_pd(felec,fvdw);
847 fscal = _mm256_andnot_pd(dummy_mask,fscal);
849 /* Calculate temporary vectorial force */
850 tx = _mm256_mul_pd(fscal,dx00);
851 ty = _mm256_mul_pd(fscal,dy00);
852 tz = _mm256_mul_pd(fscal,dz00);
854 /* Update vectorial force */
855 fix0 = _mm256_add_pd(fix0,tx);
856 fiy0 = _mm256_add_pd(fiy0,ty);
857 fiz0 = _mm256_add_pd(fiz0,tz);
859 fjx0 = _mm256_add_pd(fjx0,tx);
860 fjy0 = _mm256_add_pd(fjy0,ty);
861 fjz0 = _mm256_add_pd(fjz0,tz);
863 /**************************
864 * CALCULATE INTERACTIONS *
865 **************************/
867 r01 = _mm256_mul_pd(rsq01,rinv01);
868 r01 = _mm256_andnot_pd(dummy_mask,r01);
870 /* Calculate table index by multiplying r with table scale and truncate to integer */
871 rt = _mm256_mul_pd(r01,vftabscale);
872 vfitab = _mm256_cvttpd_epi32(rt);
873 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
874 vfitab = _mm_slli_epi32(vfitab,2);
876 /* CUBIC SPLINE TABLE ELECTROSTATICS */
877 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
878 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
879 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
880 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
881 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
882 Heps = _mm256_mul_pd(vfeps,H);
883 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
884 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
885 velec = _mm256_mul_pd(qq01,VV);
886 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
887 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
889 /* Update potential sum for this i atom from the interaction with this j atom. */
890 velec = _mm256_andnot_pd(dummy_mask,velec);
891 velecsum = _mm256_add_pd(velecsum,velec);
895 fscal = _mm256_andnot_pd(dummy_mask,fscal);
897 /* Calculate temporary vectorial force */
898 tx = _mm256_mul_pd(fscal,dx01);
899 ty = _mm256_mul_pd(fscal,dy01);
900 tz = _mm256_mul_pd(fscal,dz01);
902 /* Update vectorial force */
903 fix0 = _mm256_add_pd(fix0,tx);
904 fiy0 = _mm256_add_pd(fiy0,ty);
905 fiz0 = _mm256_add_pd(fiz0,tz);
907 fjx1 = _mm256_add_pd(fjx1,tx);
908 fjy1 = _mm256_add_pd(fjy1,ty);
909 fjz1 = _mm256_add_pd(fjz1,tz);
911 /**************************
912 * CALCULATE INTERACTIONS *
913 **************************/
915 r02 = _mm256_mul_pd(rsq02,rinv02);
916 r02 = _mm256_andnot_pd(dummy_mask,r02);
918 /* Calculate table index by multiplying r with table scale and truncate to integer */
919 rt = _mm256_mul_pd(r02,vftabscale);
920 vfitab = _mm256_cvttpd_epi32(rt);
921 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
922 vfitab = _mm_slli_epi32(vfitab,2);
924 /* CUBIC SPLINE TABLE ELECTROSTATICS */
925 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
926 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
927 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
928 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
929 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
930 Heps = _mm256_mul_pd(vfeps,H);
931 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
932 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
933 velec = _mm256_mul_pd(qq02,VV);
934 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
935 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
937 /* Update potential sum for this i atom from the interaction with this j atom. */
938 velec = _mm256_andnot_pd(dummy_mask,velec);
939 velecsum = _mm256_add_pd(velecsum,velec);
943 fscal = _mm256_andnot_pd(dummy_mask,fscal);
945 /* Calculate temporary vectorial force */
946 tx = _mm256_mul_pd(fscal,dx02);
947 ty = _mm256_mul_pd(fscal,dy02);
948 tz = _mm256_mul_pd(fscal,dz02);
950 /* Update vectorial force */
951 fix0 = _mm256_add_pd(fix0,tx);
952 fiy0 = _mm256_add_pd(fiy0,ty);
953 fiz0 = _mm256_add_pd(fiz0,tz);
955 fjx2 = _mm256_add_pd(fjx2,tx);
956 fjy2 = _mm256_add_pd(fjy2,ty);
957 fjz2 = _mm256_add_pd(fjz2,tz);
959 /**************************
960 * CALCULATE INTERACTIONS *
961 **************************/
963 r10 = _mm256_mul_pd(rsq10,rinv10);
964 r10 = _mm256_andnot_pd(dummy_mask,r10);
966 /* Calculate table index by multiplying r with table scale and truncate to integer */
967 rt = _mm256_mul_pd(r10,vftabscale);
968 vfitab = _mm256_cvttpd_epi32(rt);
969 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
970 vfitab = _mm_slli_epi32(vfitab,2);
972 /* CUBIC SPLINE TABLE ELECTROSTATICS */
973 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
974 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
975 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
976 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
977 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
978 Heps = _mm256_mul_pd(vfeps,H);
979 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
980 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
981 velec = _mm256_mul_pd(qq10,VV);
982 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
983 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
985 /* Update potential sum for this i atom from the interaction with this j atom. */
986 velec = _mm256_andnot_pd(dummy_mask,velec);
987 velecsum = _mm256_add_pd(velecsum,velec);
991 fscal = _mm256_andnot_pd(dummy_mask,fscal);
993 /* Calculate temporary vectorial force */
994 tx = _mm256_mul_pd(fscal,dx10);
995 ty = _mm256_mul_pd(fscal,dy10);
996 tz = _mm256_mul_pd(fscal,dz10);
998 /* Update vectorial force */
999 fix1 = _mm256_add_pd(fix1,tx);
1000 fiy1 = _mm256_add_pd(fiy1,ty);
1001 fiz1 = _mm256_add_pd(fiz1,tz);
1003 fjx0 = _mm256_add_pd(fjx0,tx);
1004 fjy0 = _mm256_add_pd(fjy0,ty);
1005 fjz0 = _mm256_add_pd(fjz0,tz);
1007 /**************************
1008 * CALCULATE INTERACTIONS *
1009 **************************/
1011 r11 = _mm256_mul_pd(rsq11,rinv11);
1012 r11 = _mm256_andnot_pd(dummy_mask,r11);
1014 /* Calculate table index by multiplying r with table scale and truncate to integer */
1015 rt = _mm256_mul_pd(r11,vftabscale);
1016 vfitab = _mm256_cvttpd_epi32(rt);
1017 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1018 vfitab = _mm_slli_epi32(vfitab,2);
1020 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1021 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1022 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1023 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1024 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1025 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1026 Heps = _mm256_mul_pd(vfeps,H);
1027 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1028 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1029 velec = _mm256_mul_pd(qq11,VV);
1030 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1031 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1033 /* Update potential sum for this i atom from the interaction with this j atom. */
1034 velec = _mm256_andnot_pd(dummy_mask,velec);
1035 velecsum = _mm256_add_pd(velecsum,velec);
1039 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1041 /* Calculate temporary vectorial force */
1042 tx = _mm256_mul_pd(fscal,dx11);
1043 ty = _mm256_mul_pd(fscal,dy11);
1044 tz = _mm256_mul_pd(fscal,dz11);
1046 /* Update vectorial force */
1047 fix1 = _mm256_add_pd(fix1,tx);
1048 fiy1 = _mm256_add_pd(fiy1,ty);
1049 fiz1 = _mm256_add_pd(fiz1,tz);
1051 fjx1 = _mm256_add_pd(fjx1,tx);
1052 fjy1 = _mm256_add_pd(fjy1,ty);
1053 fjz1 = _mm256_add_pd(fjz1,tz);
1055 /**************************
1056 * CALCULATE INTERACTIONS *
1057 **************************/
1059 r12 = _mm256_mul_pd(rsq12,rinv12);
1060 r12 = _mm256_andnot_pd(dummy_mask,r12);
1062 /* Calculate table index by multiplying r with table scale and truncate to integer */
1063 rt = _mm256_mul_pd(r12,vftabscale);
1064 vfitab = _mm256_cvttpd_epi32(rt);
1065 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1066 vfitab = _mm_slli_epi32(vfitab,2);
1068 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1069 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1070 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1071 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1072 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1073 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1074 Heps = _mm256_mul_pd(vfeps,H);
1075 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1076 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1077 velec = _mm256_mul_pd(qq12,VV);
1078 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1079 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1081 /* Update potential sum for this i atom from the interaction with this j atom. */
1082 velec = _mm256_andnot_pd(dummy_mask,velec);
1083 velecsum = _mm256_add_pd(velecsum,velec);
1087 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1089 /* Calculate temporary vectorial force */
1090 tx = _mm256_mul_pd(fscal,dx12);
1091 ty = _mm256_mul_pd(fscal,dy12);
1092 tz = _mm256_mul_pd(fscal,dz12);
1094 /* Update vectorial force */
1095 fix1 = _mm256_add_pd(fix1,tx);
1096 fiy1 = _mm256_add_pd(fiy1,ty);
1097 fiz1 = _mm256_add_pd(fiz1,tz);
1099 fjx2 = _mm256_add_pd(fjx2,tx);
1100 fjy2 = _mm256_add_pd(fjy2,ty);
1101 fjz2 = _mm256_add_pd(fjz2,tz);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 r20 = _mm256_mul_pd(rsq20,rinv20);
1108 r20 = _mm256_andnot_pd(dummy_mask,r20);
1110 /* Calculate table index by multiplying r with table scale and truncate to integer */
1111 rt = _mm256_mul_pd(r20,vftabscale);
1112 vfitab = _mm256_cvttpd_epi32(rt);
1113 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1114 vfitab = _mm_slli_epi32(vfitab,2);
1116 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1117 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1118 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1119 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1120 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1121 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1122 Heps = _mm256_mul_pd(vfeps,H);
1123 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1124 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1125 velec = _mm256_mul_pd(qq20,VV);
1126 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1127 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1129 /* Update potential sum for this i atom from the interaction with this j atom. */
1130 velec = _mm256_andnot_pd(dummy_mask,velec);
1131 velecsum = _mm256_add_pd(velecsum,velec);
1135 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1137 /* Calculate temporary vectorial force */
1138 tx = _mm256_mul_pd(fscal,dx20);
1139 ty = _mm256_mul_pd(fscal,dy20);
1140 tz = _mm256_mul_pd(fscal,dz20);
1142 /* Update vectorial force */
1143 fix2 = _mm256_add_pd(fix2,tx);
1144 fiy2 = _mm256_add_pd(fiy2,ty);
1145 fiz2 = _mm256_add_pd(fiz2,tz);
1147 fjx0 = _mm256_add_pd(fjx0,tx);
1148 fjy0 = _mm256_add_pd(fjy0,ty);
1149 fjz0 = _mm256_add_pd(fjz0,tz);
1151 /**************************
1152 * CALCULATE INTERACTIONS *
1153 **************************/
1155 r21 = _mm256_mul_pd(rsq21,rinv21);
1156 r21 = _mm256_andnot_pd(dummy_mask,r21);
1158 /* Calculate table index by multiplying r with table scale and truncate to integer */
1159 rt = _mm256_mul_pd(r21,vftabscale);
1160 vfitab = _mm256_cvttpd_epi32(rt);
1161 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1162 vfitab = _mm_slli_epi32(vfitab,2);
1164 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1165 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1166 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1167 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1168 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1169 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1170 Heps = _mm256_mul_pd(vfeps,H);
1171 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1172 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1173 velec = _mm256_mul_pd(qq21,VV);
1174 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1175 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1177 /* Update potential sum for this i atom from the interaction with this j atom. */
1178 velec = _mm256_andnot_pd(dummy_mask,velec);
1179 velecsum = _mm256_add_pd(velecsum,velec);
1183 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1185 /* Calculate temporary vectorial force */
1186 tx = _mm256_mul_pd(fscal,dx21);
1187 ty = _mm256_mul_pd(fscal,dy21);
1188 tz = _mm256_mul_pd(fscal,dz21);
1190 /* Update vectorial force */
1191 fix2 = _mm256_add_pd(fix2,tx);
1192 fiy2 = _mm256_add_pd(fiy2,ty);
1193 fiz2 = _mm256_add_pd(fiz2,tz);
1195 fjx1 = _mm256_add_pd(fjx1,tx);
1196 fjy1 = _mm256_add_pd(fjy1,ty);
1197 fjz1 = _mm256_add_pd(fjz1,tz);
1199 /**************************
1200 * CALCULATE INTERACTIONS *
1201 **************************/
1203 r22 = _mm256_mul_pd(rsq22,rinv22);
1204 r22 = _mm256_andnot_pd(dummy_mask,r22);
1206 /* Calculate table index by multiplying r with table scale and truncate to integer */
1207 rt = _mm256_mul_pd(r22,vftabscale);
1208 vfitab = _mm256_cvttpd_epi32(rt);
1209 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1210 vfitab = _mm_slli_epi32(vfitab,2);
1212 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1213 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1214 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1215 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1216 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1217 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1218 Heps = _mm256_mul_pd(vfeps,H);
1219 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1220 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1221 velec = _mm256_mul_pd(qq22,VV);
1222 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1223 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1225 /* Update potential sum for this i atom from the interaction with this j atom. */
1226 velec = _mm256_andnot_pd(dummy_mask,velec);
1227 velecsum = _mm256_add_pd(velecsum,velec);
1231 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1233 /* Calculate temporary vectorial force */
1234 tx = _mm256_mul_pd(fscal,dx22);
1235 ty = _mm256_mul_pd(fscal,dy22);
1236 tz = _mm256_mul_pd(fscal,dz22);
1238 /* Update vectorial force */
1239 fix2 = _mm256_add_pd(fix2,tx);
1240 fiy2 = _mm256_add_pd(fiy2,ty);
1241 fiz2 = _mm256_add_pd(fiz2,tz);
1243 fjx2 = _mm256_add_pd(fjx2,tx);
1244 fjy2 = _mm256_add_pd(fjy2,ty);
1245 fjz2 = _mm256_add_pd(fjz2,tz);
1247 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1248 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1249 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1250 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1252 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1253 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1255 /* Inner loop uses 409 flops */
1258 /* End of innermost loop */
1260 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1261 f+i_coord_offset,fshift+i_shift_offset);
1264 /* Update potential energies */
1265 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1266 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1268 /* Increment number of inner iterations */
1269 inneriter += j_index_end - j_index_start;
1271 /* Outer loop uses 20 flops */
1274 /* Increment number of outer iterations */
1277 /* Update outer/inner flops */
1279 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1282 * GROMACS nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1283 * Electrostatics interaction: CubicSplineTable
1284 * VdW interaction: LennardJones
1285 * Geometry: Water3-Water3
1286 * Calculate force/pot: Force
1289 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_double
1290 (t_nblist * gmx_restrict nlist,
1291 rvec * gmx_restrict xx,
1292 rvec * gmx_restrict ff,
1293 t_forcerec * gmx_restrict fr,
1294 t_mdatoms * gmx_restrict mdatoms,
1295 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1296 t_nrnb * gmx_restrict nrnb)
1298 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1299 * just 0 for non-waters.
1300 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1301 * jnr indices corresponding to data put in the four positions in the SIMD register.
1303 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1304 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1305 int jnrA,jnrB,jnrC,jnrD;
1306 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1307 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1308 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1309 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1310 real rcutoff_scalar;
1311 real *shiftvec,*fshift,*x,*f;
1312 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1313 real scratch[4*DIM];
1314 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1315 real * vdwioffsetptr0;
1316 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1317 real * vdwioffsetptr1;
1318 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1319 real * vdwioffsetptr2;
1320 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1321 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1322 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1323 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1324 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1325 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1326 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1327 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1328 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1329 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1330 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1331 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1332 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1333 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1334 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1335 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1336 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1339 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1342 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1343 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1345 __m128i ifour = _mm_set1_epi32(4);
1346 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1348 __m256d dummy_mask,cutoff_mask;
1349 __m128 tmpmask0,tmpmask1;
1350 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1351 __m256d one = _mm256_set1_pd(1.0);
1352 __m256d two = _mm256_set1_pd(2.0);
1358 jindex = nlist->jindex;
1360 shiftidx = nlist->shift;
1362 shiftvec = fr->shift_vec[0];
1363 fshift = fr->fshift[0];
1364 facel = _mm256_set1_pd(fr->epsfac);
1365 charge = mdatoms->chargeA;
1366 nvdwtype = fr->ntype;
1367 vdwparam = fr->nbfp;
1368 vdwtype = mdatoms->typeA;
1370 vftab = kernel_data->table_elec->data;
1371 vftabscale = _mm256_set1_pd(kernel_data->table_elec->scale);
1373 /* Setup water-specific parameters */
1374 inr = nlist->iinr[0];
1375 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1376 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1377 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1378 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1380 jq0 = _mm256_set1_pd(charge[inr+0]);
1381 jq1 = _mm256_set1_pd(charge[inr+1]);
1382 jq2 = _mm256_set1_pd(charge[inr+2]);
1383 vdwjidx0A = 2*vdwtype[inr+0];
1384 qq00 = _mm256_mul_pd(iq0,jq0);
1385 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1386 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1387 qq01 = _mm256_mul_pd(iq0,jq1);
1388 qq02 = _mm256_mul_pd(iq0,jq2);
1389 qq10 = _mm256_mul_pd(iq1,jq0);
1390 qq11 = _mm256_mul_pd(iq1,jq1);
1391 qq12 = _mm256_mul_pd(iq1,jq2);
1392 qq20 = _mm256_mul_pd(iq2,jq0);
1393 qq21 = _mm256_mul_pd(iq2,jq1);
1394 qq22 = _mm256_mul_pd(iq2,jq2);
1396 /* Avoid stupid compiler warnings */
1397 jnrA = jnrB = jnrC = jnrD = 0;
1398 j_coord_offsetA = 0;
1399 j_coord_offsetB = 0;
1400 j_coord_offsetC = 0;
1401 j_coord_offsetD = 0;
1406 for(iidx=0;iidx<4*DIM;iidx++)
1408 scratch[iidx] = 0.0;
1411 /* Start outer loop over neighborlists */
1412 for(iidx=0; iidx<nri; iidx++)
1414 /* Load shift vector for this list */
1415 i_shift_offset = DIM*shiftidx[iidx];
1417 /* Load limits for loop over neighbors */
1418 j_index_start = jindex[iidx];
1419 j_index_end = jindex[iidx+1];
1421 /* Get outer coordinate index */
1423 i_coord_offset = DIM*inr;
1425 /* Load i particle coords and add shift vector */
1426 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1427 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1429 fix0 = _mm256_setzero_pd();
1430 fiy0 = _mm256_setzero_pd();
1431 fiz0 = _mm256_setzero_pd();
1432 fix1 = _mm256_setzero_pd();
1433 fiy1 = _mm256_setzero_pd();
1434 fiz1 = _mm256_setzero_pd();
1435 fix2 = _mm256_setzero_pd();
1436 fiy2 = _mm256_setzero_pd();
1437 fiz2 = _mm256_setzero_pd();
1439 /* Start inner kernel loop */
1440 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1443 /* Get j neighbor index, and coordinate index */
1445 jnrB = jjnr[jidx+1];
1446 jnrC = jjnr[jidx+2];
1447 jnrD = jjnr[jidx+3];
1448 j_coord_offsetA = DIM*jnrA;
1449 j_coord_offsetB = DIM*jnrB;
1450 j_coord_offsetC = DIM*jnrC;
1451 j_coord_offsetD = DIM*jnrD;
1453 /* load j atom coordinates */
1454 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1455 x+j_coord_offsetC,x+j_coord_offsetD,
1456 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1458 /* Calculate displacement vector */
1459 dx00 = _mm256_sub_pd(ix0,jx0);
1460 dy00 = _mm256_sub_pd(iy0,jy0);
1461 dz00 = _mm256_sub_pd(iz0,jz0);
1462 dx01 = _mm256_sub_pd(ix0,jx1);
1463 dy01 = _mm256_sub_pd(iy0,jy1);
1464 dz01 = _mm256_sub_pd(iz0,jz1);
1465 dx02 = _mm256_sub_pd(ix0,jx2);
1466 dy02 = _mm256_sub_pd(iy0,jy2);
1467 dz02 = _mm256_sub_pd(iz0,jz2);
1468 dx10 = _mm256_sub_pd(ix1,jx0);
1469 dy10 = _mm256_sub_pd(iy1,jy0);
1470 dz10 = _mm256_sub_pd(iz1,jz0);
1471 dx11 = _mm256_sub_pd(ix1,jx1);
1472 dy11 = _mm256_sub_pd(iy1,jy1);
1473 dz11 = _mm256_sub_pd(iz1,jz1);
1474 dx12 = _mm256_sub_pd(ix1,jx2);
1475 dy12 = _mm256_sub_pd(iy1,jy2);
1476 dz12 = _mm256_sub_pd(iz1,jz2);
1477 dx20 = _mm256_sub_pd(ix2,jx0);
1478 dy20 = _mm256_sub_pd(iy2,jy0);
1479 dz20 = _mm256_sub_pd(iz2,jz0);
1480 dx21 = _mm256_sub_pd(ix2,jx1);
1481 dy21 = _mm256_sub_pd(iy2,jy1);
1482 dz21 = _mm256_sub_pd(iz2,jz1);
1483 dx22 = _mm256_sub_pd(ix2,jx2);
1484 dy22 = _mm256_sub_pd(iy2,jy2);
1485 dz22 = _mm256_sub_pd(iz2,jz2);
1487 /* Calculate squared distance and things based on it */
1488 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1489 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1490 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1491 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1492 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1493 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1494 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1495 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1496 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1498 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1499 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1500 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1501 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1502 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1503 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1504 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1505 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1506 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1508 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1510 fjx0 = _mm256_setzero_pd();
1511 fjy0 = _mm256_setzero_pd();
1512 fjz0 = _mm256_setzero_pd();
1513 fjx1 = _mm256_setzero_pd();
1514 fjy1 = _mm256_setzero_pd();
1515 fjz1 = _mm256_setzero_pd();
1516 fjx2 = _mm256_setzero_pd();
1517 fjy2 = _mm256_setzero_pd();
1518 fjz2 = _mm256_setzero_pd();
1520 /**************************
1521 * CALCULATE INTERACTIONS *
1522 **************************/
1524 r00 = _mm256_mul_pd(rsq00,rinv00);
1526 /* Calculate table index by multiplying r with table scale and truncate to integer */
1527 rt = _mm256_mul_pd(r00,vftabscale);
1528 vfitab = _mm256_cvttpd_epi32(rt);
1529 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1530 vfitab = _mm_slli_epi32(vfitab,2);
1532 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1533 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1534 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1535 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1536 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1537 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1538 Heps = _mm256_mul_pd(vfeps,H);
1539 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1540 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1541 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1543 /* LENNARD-JONES DISPERSION/REPULSION */
1545 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1546 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1548 fscal = _mm256_add_pd(felec,fvdw);
1550 /* Calculate temporary vectorial force */
1551 tx = _mm256_mul_pd(fscal,dx00);
1552 ty = _mm256_mul_pd(fscal,dy00);
1553 tz = _mm256_mul_pd(fscal,dz00);
1555 /* Update vectorial force */
1556 fix0 = _mm256_add_pd(fix0,tx);
1557 fiy0 = _mm256_add_pd(fiy0,ty);
1558 fiz0 = _mm256_add_pd(fiz0,tz);
1560 fjx0 = _mm256_add_pd(fjx0,tx);
1561 fjy0 = _mm256_add_pd(fjy0,ty);
1562 fjz0 = _mm256_add_pd(fjz0,tz);
1564 /**************************
1565 * CALCULATE INTERACTIONS *
1566 **************************/
1568 r01 = _mm256_mul_pd(rsq01,rinv01);
1570 /* Calculate table index by multiplying r with table scale and truncate to integer */
1571 rt = _mm256_mul_pd(r01,vftabscale);
1572 vfitab = _mm256_cvttpd_epi32(rt);
1573 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1574 vfitab = _mm_slli_epi32(vfitab,2);
1576 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1577 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1578 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1579 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1580 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1581 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1582 Heps = _mm256_mul_pd(vfeps,H);
1583 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1584 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1585 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1589 /* Calculate temporary vectorial force */
1590 tx = _mm256_mul_pd(fscal,dx01);
1591 ty = _mm256_mul_pd(fscal,dy01);
1592 tz = _mm256_mul_pd(fscal,dz01);
1594 /* Update vectorial force */
1595 fix0 = _mm256_add_pd(fix0,tx);
1596 fiy0 = _mm256_add_pd(fiy0,ty);
1597 fiz0 = _mm256_add_pd(fiz0,tz);
1599 fjx1 = _mm256_add_pd(fjx1,tx);
1600 fjy1 = _mm256_add_pd(fjy1,ty);
1601 fjz1 = _mm256_add_pd(fjz1,tz);
1603 /**************************
1604 * CALCULATE INTERACTIONS *
1605 **************************/
1607 r02 = _mm256_mul_pd(rsq02,rinv02);
1609 /* Calculate table index by multiplying r with table scale and truncate to integer */
1610 rt = _mm256_mul_pd(r02,vftabscale);
1611 vfitab = _mm256_cvttpd_epi32(rt);
1612 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1613 vfitab = _mm_slli_epi32(vfitab,2);
1615 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1616 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1617 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1618 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1619 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1620 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1621 Heps = _mm256_mul_pd(vfeps,H);
1622 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1623 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1624 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
1628 /* Calculate temporary vectorial force */
1629 tx = _mm256_mul_pd(fscal,dx02);
1630 ty = _mm256_mul_pd(fscal,dy02);
1631 tz = _mm256_mul_pd(fscal,dz02);
1633 /* Update vectorial force */
1634 fix0 = _mm256_add_pd(fix0,tx);
1635 fiy0 = _mm256_add_pd(fiy0,ty);
1636 fiz0 = _mm256_add_pd(fiz0,tz);
1638 fjx2 = _mm256_add_pd(fjx2,tx);
1639 fjy2 = _mm256_add_pd(fjy2,ty);
1640 fjz2 = _mm256_add_pd(fjz2,tz);
1642 /**************************
1643 * CALCULATE INTERACTIONS *
1644 **************************/
1646 r10 = _mm256_mul_pd(rsq10,rinv10);
1648 /* Calculate table index by multiplying r with table scale and truncate to integer */
1649 rt = _mm256_mul_pd(r10,vftabscale);
1650 vfitab = _mm256_cvttpd_epi32(rt);
1651 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1652 vfitab = _mm_slli_epi32(vfitab,2);
1654 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1655 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1656 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1657 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1658 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1659 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1660 Heps = _mm256_mul_pd(vfeps,H);
1661 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1662 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1663 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1667 /* Calculate temporary vectorial force */
1668 tx = _mm256_mul_pd(fscal,dx10);
1669 ty = _mm256_mul_pd(fscal,dy10);
1670 tz = _mm256_mul_pd(fscal,dz10);
1672 /* Update vectorial force */
1673 fix1 = _mm256_add_pd(fix1,tx);
1674 fiy1 = _mm256_add_pd(fiy1,ty);
1675 fiz1 = _mm256_add_pd(fiz1,tz);
1677 fjx0 = _mm256_add_pd(fjx0,tx);
1678 fjy0 = _mm256_add_pd(fjy0,ty);
1679 fjz0 = _mm256_add_pd(fjz0,tz);
1681 /**************************
1682 * CALCULATE INTERACTIONS *
1683 **************************/
1685 r11 = _mm256_mul_pd(rsq11,rinv11);
1687 /* Calculate table index by multiplying r with table scale and truncate to integer */
1688 rt = _mm256_mul_pd(r11,vftabscale);
1689 vfitab = _mm256_cvttpd_epi32(rt);
1690 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1691 vfitab = _mm_slli_epi32(vfitab,2);
1693 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1694 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1695 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1696 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1697 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1698 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1699 Heps = _mm256_mul_pd(vfeps,H);
1700 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1701 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1702 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1706 /* Calculate temporary vectorial force */
1707 tx = _mm256_mul_pd(fscal,dx11);
1708 ty = _mm256_mul_pd(fscal,dy11);
1709 tz = _mm256_mul_pd(fscal,dz11);
1711 /* Update vectorial force */
1712 fix1 = _mm256_add_pd(fix1,tx);
1713 fiy1 = _mm256_add_pd(fiy1,ty);
1714 fiz1 = _mm256_add_pd(fiz1,tz);
1716 fjx1 = _mm256_add_pd(fjx1,tx);
1717 fjy1 = _mm256_add_pd(fjy1,ty);
1718 fjz1 = _mm256_add_pd(fjz1,tz);
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 r12 = _mm256_mul_pd(rsq12,rinv12);
1726 /* Calculate table index by multiplying r with table scale and truncate to integer */
1727 rt = _mm256_mul_pd(r12,vftabscale);
1728 vfitab = _mm256_cvttpd_epi32(rt);
1729 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1730 vfitab = _mm_slli_epi32(vfitab,2);
1732 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1733 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1734 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1735 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1736 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1737 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1738 Heps = _mm256_mul_pd(vfeps,H);
1739 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1740 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1741 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1745 /* Calculate temporary vectorial force */
1746 tx = _mm256_mul_pd(fscal,dx12);
1747 ty = _mm256_mul_pd(fscal,dy12);
1748 tz = _mm256_mul_pd(fscal,dz12);
1750 /* Update vectorial force */
1751 fix1 = _mm256_add_pd(fix1,tx);
1752 fiy1 = _mm256_add_pd(fiy1,ty);
1753 fiz1 = _mm256_add_pd(fiz1,tz);
1755 fjx2 = _mm256_add_pd(fjx2,tx);
1756 fjy2 = _mm256_add_pd(fjy2,ty);
1757 fjz2 = _mm256_add_pd(fjz2,tz);
1759 /**************************
1760 * CALCULATE INTERACTIONS *
1761 **************************/
1763 r20 = _mm256_mul_pd(rsq20,rinv20);
1765 /* Calculate table index by multiplying r with table scale and truncate to integer */
1766 rt = _mm256_mul_pd(r20,vftabscale);
1767 vfitab = _mm256_cvttpd_epi32(rt);
1768 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1769 vfitab = _mm_slli_epi32(vfitab,2);
1771 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1772 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1773 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1774 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1775 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1776 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1777 Heps = _mm256_mul_pd(vfeps,H);
1778 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1779 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1780 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1784 /* Calculate temporary vectorial force */
1785 tx = _mm256_mul_pd(fscal,dx20);
1786 ty = _mm256_mul_pd(fscal,dy20);
1787 tz = _mm256_mul_pd(fscal,dz20);
1789 /* Update vectorial force */
1790 fix2 = _mm256_add_pd(fix2,tx);
1791 fiy2 = _mm256_add_pd(fiy2,ty);
1792 fiz2 = _mm256_add_pd(fiz2,tz);
1794 fjx0 = _mm256_add_pd(fjx0,tx);
1795 fjy0 = _mm256_add_pd(fjy0,ty);
1796 fjz0 = _mm256_add_pd(fjz0,tz);
1798 /**************************
1799 * CALCULATE INTERACTIONS *
1800 **************************/
1802 r21 = _mm256_mul_pd(rsq21,rinv21);
1804 /* Calculate table index by multiplying r with table scale and truncate to integer */
1805 rt = _mm256_mul_pd(r21,vftabscale);
1806 vfitab = _mm256_cvttpd_epi32(rt);
1807 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1808 vfitab = _mm_slli_epi32(vfitab,2);
1810 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1811 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1812 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1813 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1814 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1815 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1816 Heps = _mm256_mul_pd(vfeps,H);
1817 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1818 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1819 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1823 /* Calculate temporary vectorial force */
1824 tx = _mm256_mul_pd(fscal,dx21);
1825 ty = _mm256_mul_pd(fscal,dy21);
1826 tz = _mm256_mul_pd(fscal,dz21);
1828 /* Update vectorial force */
1829 fix2 = _mm256_add_pd(fix2,tx);
1830 fiy2 = _mm256_add_pd(fiy2,ty);
1831 fiz2 = _mm256_add_pd(fiz2,tz);
1833 fjx1 = _mm256_add_pd(fjx1,tx);
1834 fjy1 = _mm256_add_pd(fjy1,ty);
1835 fjz1 = _mm256_add_pd(fjz1,tz);
1837 /**************************
1838 * CALCULATE INTERACTIONS *
1839 **************************/
1841 r22 = _mm256_mul_pd(rsq22,rinv22);
1843 /* Calculate table index by multiplying r with table scale and truncate to integer */
1844 rt = _mm256_mul_pd(r22,vftabscale);
1845 vfitab = _mm256_cvttpd_epi32(rt);
1846 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1847 vfitab = _mm_slli_epi32(vfitab,2);
1849 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1850 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1851 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1852 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1853 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1854 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1855 Heps = _mm256_mul_pd(vfeps,H);
1856 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1857 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1858 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1862 /* Calculate temporary vectorial force */
1863 tx = _mm256_mul_pd(fscal,dx22);
1864 ty = _mm256_mul_pd(fscal,dy22);
1865 tz = _mm256_mul_pd(fscal,dz22);
1867 /* Update vectorial force */
1868 fix2 = _mm256_add_pd(fix2,tx);
1869 fiy2 = _mm256_add_pd(fiy2,ty);
1870 fiz2 = _mm256_add_pd(fiz2,tz);
1872 fjx2 = _mm256_add_pd(fjx2,tx);
1873 fjy2 = _mm256_add_pd(fjy2,ty);
1874 fjz2 = _mm256_add_pd(fjz2,tz);
1876 fjptrA = f+j_coord_offsetA;
1877 fjptrB = f+j_coord_offsetB;
1878 fjptrC = f+j_coord_offsetC;
1879 fjptrD = f+j_coord_offsetD;
1881 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1882 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1884 /* Inner loop uses 359 flops */
1887 if(jidx<j_index_end)
1890 /* Get j neighbor index, and coordinate index */
1891 jnrlistA = jjnr[jidx];
1892 jnrlistB = jjnr[jidx+1];
1893 jnrlistC = jjnr[jidx+2];
1894 jnrlistD = jjnr[jidx+3];
1895 /* Sign of each element will be negative for non-real atoms.
1896 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1897 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1899 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1901 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1902 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1903 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1905 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1906 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1907 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1908 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1909 j_coord_offsetA = DIM*jnrA;
1910 j_coord_offsetB = DIM*jnrB;
1911 j_coord_offsetC = DIM*jnrC;
1912 j_coord_offsetD = DIM*jnrD;
1914 /* load j atom coordinates */
1915 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1916 x+j_coord_offsetC,x+j_coord_offsetD,
1917 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1919 /* Calculate displacement vector */
1920 dx00 = _mm256_sub_pd(ix0,jx0);
1921 dy00 = _mm256_sub_pd(iy0,jy0);
1922 dz00 = _mm256_sub_pd(iz0,jz0);
1923 dx01 = _mm256_sub_pd(ix0,jx1);
1924 dy01 = _mm256_sub_pd(iy0,jy1);
1925 dz01 = _mm256_sub_pd(iz0,jz1);
1926 dx02 = _mm256_sub_pd(ix0,jx2);
1927 dy02 = _mm256_sub_pd(iy0,jy2);
1928 dz02 = _mm256_sub_pd(iz0,jz2);
1929 dx10 = _mm256_sub_pd(ix1,jx0);
1930 dy10 = _mm256_sub_pd(iy1,jy0);
1931 dz10 = _mm256_sub_pd(iz1,jz0);
1932 dx11 = _mm256_sub_pd(ix1,jx1);
1933 dy11 = _mm256_sub_pd(iy1,jy1);
1934 dz11 = _mm256_sub_pd(iz1,jz1);
1935 dx12 = _mm256_sub_pd(ix1,jx2);
1936 dy12 = _mm256_sub_pd(iy1,jy2);
1937 dz12 = _mm256_sub_pd(iz1,jz2);
1938 dx20 = _mm256_sub_pd(ix2,jx0);
1939 dy20 = _mm256_sub_pd(iy2,jy0);
1940 dz20 = _mm256_sub_pd(iz2,jz0);
1941 dx21 = _mm256_sub_pd(ix2,jx1);
1942 dy21 = _mm256_sub_pd(iy2,jy1);
1943 dz21 = _mm256_sub_pd(iz2,jz1);
1944 dx22 = _mm256_sub_pd(ix2,jx2);
1945 dy22 = _mm256_sub_pd(iy2,jy2);
1946 dz22 = _mm256_sub_pd(iz2,jz2);
1948 /* Calculate squared distance and things based on it */
1949 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1950 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1951 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1952 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1953 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1954 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1955 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1956 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1957 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1959 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1960 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1961 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1962 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1963 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1964 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1965 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1966 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1967 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1969 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1971 fjx0 = _mm256_setzero_pd();
1972 fjy0 = _mm256_setzero_pd();
1973 fjz0 = _mm256_setzero_pd();
1974 fjx1 = _mm256_setzero_pd();
1975 fjy1 = _mm256_setzero_pd();
1976 fjz1 = _mm256_setzero_pd();
1977 fjx2 = _mm256_setzero_pd();
1978 fjy2 = _mm256_setzero_pd();
1979 fjz2 = _mm256_setzero_pd();
1981 /**************************
1982 * CALCULATE INTERACTIONS *
1983 **************************/
1985 r00 = _mm256_mul_pd(rsq00,rinv00);
1986 r00 = _mm256_andnot_pd(dummy_mask,r00);
1988 /* Calculate table index by multiplying r with table scale and truncate to integer */
1989 rt = _mm256_mul_pd(r00,vftabscale);
1990 vfitab = _mm256_cvttpd_epi32(rt);
1991 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1992 vfitab = _mm_slli_epi32(vfitab,2);
1994 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1995 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1996 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1997 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1998 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1999 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2000 Heps = _mm256_mul_pd(vfeps,H);
2001 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2002 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2003 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
2005 /* LENNARD-JONES DISPERSION/REPULSION */
2007 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2008 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2010 fscal = _mm256_add_pd(felec,fvdw);
2012 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2014 /* Calculate temporary vectorial force */
2015 tx = _mm256_mul_pd(fscal,dx00);
2016 ty = _mm256_mul_pd(fscal,dy00);
2017 tz = _mm256_mul_pd(fscal,dz00);
2019 /* Update vectorial force */
2020 fix0 = _mm256_add_pd(fix0,tx);
2021 fiy0 = _mm256_add_pd(fiy0,ty);
2022 fiz0 = _mm256_add_pd(fiz0,tz);
2024 fjx0 = _mm256_add_pd(fjx0,tx);
2025 fjy0 = _mm256_add_pd(fjy0,ty);
2026 fjz0 = _mm256_add_pd(fjz0,tz);
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2032 r01 = _mm256_mul_pd(rsq01,rinv01);
2033 r01 = _mm256_andnot_pd(dummy_mask,r01);
2035 /* Calculate table index by multiplying r with table scale and truncate to integer */
2036 rt = _mm256_mul_pd(r01,vftabscale);
2037 vfitab = _mm256_cvttpd_epi32(rt);
2038 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2039 vfitab = _mm_slli_epi32(vfitab,2);
2041 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2042 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2043 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2044 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2045 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2046 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2047 Heps = _mm256_mul_pd(vfeps,H);
2048 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2049 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2050 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
2054 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2056 /* Calculate temporary vectorial force */
2057 tx = _mm256_mul_pd(fscal,dx01);
2058 ty = _mm256_mul_pd(fscal,dy01);
2059 tz = _mm256_mul_pd(fscal,dz01);
2061 /* Update vectorial force */
2062 fix0 = _mm256_add_pd(fix0,tx);
2063 fiy0 = _mm256_add_pd(fiy0,ty);
2064 fiz0 = _mm256_add_pd(fiz0,tz);
2066 fjx1 = _mm256_add_pd(fjx1,tx);
2067 fjy1 = _mm256_add_pd(fjy1,ty);
2068 fjz1 = _mm256_add_pd(fjz1,tz);
2070 /**************************
2071 * CALCULATE INTERACTIONS *
2072 **************************/
2074 r02 = _mm256_mul_pd(rsq02,rinv02);
2075 r02 = _mm256_andnot_pd(dummy_mask,r02);
2077 /* Calculate table index by multiplying r with table scale and truncate to integer */
2078 rt = _mm256_mul_pd(r02,vftabscale);
2079 vfitab = _mm256_cvttpd_epi32(rt);
2080 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2081 vfitab = _mm_slli_epi32(vfitab,2);
2083 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2084 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2085 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2086 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2087 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2088 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2089 Heps = _mm256_mul_pd(vfeps,H);
2090 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2091 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2092 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
2096 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2098 /* Calculate temporary vectorial force */
2099 tx = _mm256_mul_pd(fscal,dx02);
2100 ty = _mm256_mul_pd(fscal,dy02);
2101 tz = _mm256_mul_pd(fscal,dz02);
2103 /* Update vectorial force */
2104 fix0 = _mm256_add_pd(fix0,tx);
2105 fiy0 = _mm256_add_pd(fiy0,ty);
2106 fiz0 = _mm256_add_pd(fiz0,tz);
2108 fjx2 = _mm256_add_pd(fjx2,tx);
2109 fjy2 = _mm256_add_pd(fjy2,ty);
2110 fjz2 = _mm256_add_pd(fjz2,tz);
2112 /**************************
2113 * CALCULATE INTERACTIONS *
2114 **************************/
2116 r10 = _mm256_mul_pd(rsq10,rinv10);
2117 r10 = _mm256_andnot_pd(dummy_mask,r10);
2119 /* Calculate table index by multiplying r with table scale and truncate to integer */
2120 rt = _mm256_mul_pd(r10,vftabscale);
2121 vfitab = _mm256_cvttpd_epi32(rt);
2122 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2123 vfitab = _mm_slli_epi32(vfitab,2);
2125 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2126 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2127 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2128 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2129 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2130 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2131 Heps = _mm256_mul_pd(vfeps,H);
2132 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2133 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2134 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
2138 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2140 /* Calculate temporary vectorial force */
2141 tx = _mm256_mul_pd(fscal,dx10);
2142 ty = _mm256_mul_pd(fscal,dy10);
2143 tz = _mm256_mul_pd(fscal,dz10);
2145 /* Update vectorial force */
2146 fix1 = _mm256_add_pd(fix1,tx);
2147 fiy1 = _mm256_add_pd(fiy1,ty);
2148 fiz1 = _mm256_add_pd(fiz1,tz);
2150 fjx0 = _mm256_add_pd(fjx0,tx);
2151 fjy0 = _mm256_add_pd(fjy0,ty);
2152 fjz0 = _mm256_add_pd(fjz0,tz);
2154 /**************************
2155 * CALCULATE INTERACTIONS *
2156 **************************/
2158 r11 = _mm256_mul_pd(rsq11,rinv11);
2159 r11 = _mm256_andnot_pd(dummy_mask,r11);
2161 /* Calculate table index by multiplying r with table scale and truncate to integer */
2162 rt = _mm256_mul_pd(r11,vftabscale);
2163 vfitab = _mm256_cvttpd_epi32(rt);
2164 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2165 vfitab = _mm_slli_epi32(vfitab,2);
2167 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2168 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2169 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2170 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2171 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2172 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2173 Heps = _mm256_mul_pd(vfeps,H);
2174 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2175 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2176 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2180 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2182 /* Calculate temporary vectorial force */
2183 tx = _mm256_mul_pd(fscal,dx11);
2184 ty = _mm256_mul_pd(fscal,dy11);
2185 tz = _mm256_mul_pd(fscal,dz11);
2187 /* Update vectorial force */
2188 fix1 = _mm256_add_pd(fix1,tx);
2189 fiy1 = _mm256_add_pd(fiy1,ty);
2190 fiz1 = _mm256_add_pd(fiz1,tz);
2192 fjx1 = _mm256_add_pd(fjx1,tx);
2193 fjy1 = _mm256_add_pd(fjy1,ty);
2194 fjz1 = _mm256_add_pd(fjz1,tz);
2196 /**************************
2197 * CALCULATE INTERACTIONS *
2198 **************************/
2200 r12 = _mm256_mul_pd(rsq12,rinv12);
2201 r12 = _mm256_andnot_pd(dummy_mask,r12);
2203 /* Calculate table index by multiplying r with table scale and truncate to integer */
2204 rt = _mm256_mul_pd(r12,vftabscale);
2205 vfitab = _mm256_cvttpd_epi32(rt);
2206 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2207 vfitab = _mm_slli_epi32(vfitab,2);
2209 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2210 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2211 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2212 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2213 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2214 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2215 Heps = _mm256_mul_pd(vfeps,H);
2216 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2217 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2218 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2222 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2224 /* Calculate temporary vectorial force */
2225 tx = _mm256_mul_pd(fscal,dx12);
2226 ty = _mm256_mul_pd(fscal,dy12);
2227 tz = _mm256_mul_pd(fscal,dz12);
2229 /* Update vectorial force */
2230 fix1 = _mm256_add_pd(fix1,tx);
2231 fiy1 = _mm256_add_pd(fiy1,ty);
2232 fiz1 = _mm256_add_pd(fiz1,tz);
2234 fjx2 = _mm256_add_pd(fjx2,tx);
2235 fjy2 = _mm256_add_pd(fjy2,ty);
2236 fjz2 = _mm256_add_pd(fjz2,tz);
2238 /**************************
2239 * CALCULATE INTERACTIONS *
2240 **************************/
2242 r20 = _mm256_mul_pd(rsq20,rinv20);
2243 r20 = _mm256_andnot_pd(dummy_mask,r20);
2245 /* Calculate table index by multiplying r with table scale and truncate to integer */
2246 rt = _mm256_mul_pd(r20,vftabscale);
2247 vfitab = _mm256_cvttpd_epi32(rt);
2248 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2249 vfitab = _mm_slli_epi32(vfitab,2);
2251 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2252 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2253 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2254 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2255 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2256 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2257 Heps = _mm256_mul_pd(vfeps,H);
2258 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2259 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2260 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
2264 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2266 /* Calculate temporary vectorial force */
2267 tx = _mm256_mul_pd(fscal,dx20);
2268 ty = _mm256_mul_pd(fscal,dy20);
2269 tz = _mm256_mul_pd(fscal,dz20);
2271 /* Update vectorial force */
2272 fix2 = _mm256_add_pd(fix2,tx);
2273 fiy2 = _mm256_add_pd(fiy2,ty);
2274 fiz2 = _mm256_add_pd(fiz2,tz);
2276 fjx0 = _mm256_add_pd(fjx0,tx);
2277 fjy0 = _mm256_add_pd(fjy0,ty);
2278 fjz0 = _mm256_add_pd(fjz0,tz);
2280 /**************************
2281 * CALCULATE INTERACTIONS *
2282 **************************/
2284 r21 = _mm256_mul_pd(rsq21,rinv21);
2285 r21 = _mm256_andnot_pd(dummy_mask,r21);
2287 /* Calculate table index by multiplying r with table scale and truncate to integer */
2288 rt = _mm256_mul_pd(r21,vftabscale);
2289 vfitab = _mm256_cvttpd_epi32(rt);
2290 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2291 vfitab = _mm_slli_epi32(vfitab,2);
2293 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2294 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2295 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2296 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2297 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2298 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2299 Heps = _mm256_mul_pd(vfeps,H);
2300 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2301 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2302 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2306 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2308 /* Calculate temporary vectorial force */
2309 tx = _mm256_mul_pd(fscal,dx21);
2310 ty = _mm256_mul_pd(fscal,dy21);
2311 tz = _mm256_mul_pd(fscal,dz21);
2313 /* Update vectorial force */
2314 fix2 = _mm256_add_pd(fix2,tx);
2315 fiy2 = _mm256_add_pd(fiy2,ty);
2316 fiz2 = _mm256_add_pd(fiz2,tz);
2318 fjx1 = _mm256_add_pd(fjx1,tx);
2319 fjy1 = _mm256_add_pd(fjy1,ty);
2320 fjz1 = _mm256_add_pd(fjz1,tz);
2322 /**************************
2323 * CALCULATE INTERACTIONS *
2324 **************************/
2326 r22 = _mm256_mul_pd(rsq22,rinv22);
2327 r22 = _mm256_andnot_pd(dummy_mask,r22);
2329 /* Calculate table index by multiplying r with table scale and truncate to integer */
2330 rt = _mm256_mul_pd(r22,vftabscale);
2331 vfitab = _mm256_cvttpd_epi32(rt);
2332 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2333 vfitab = _mm_slli_epi32(vfitab,2);
2335 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2336 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2337 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2338 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2339 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2340 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2341 Heps = _mm256_mul_pd(vfeps,H);
2342 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2343 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2344 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2348 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2350 /* Calculate temporary vectorial force */
2351 tx = _mm256_mul_pd(fscal,dx22);
2352 ty = _mm256_mul_pd(fscal,dy22);
2353 tz = _mm256_mul_pd(fscal,dz22);
2355 /* Update vectorial force */
2356 fix2 = _mm256_add_pd(fix2,tx);
2357 fiy2 = _mm256_add_pd(fiy2,ty);
2358 fiz2 = _mm256_add_pd(fiz2,tz);
2360 fjx2 = _mm256_add_pd(fjx2,tx);
2361 fjy2 = _mm256_add_pd(fjy2,ty);
2362 fjz2 = _mm256_add_pd(fjz2,tz);
2364 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2365 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2366 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2367 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2369 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2370 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2372 /* Inner loop uses 368 flops */
2375 /* End of innermost loop */
2377 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2378 f+i_coord_offset,fshift+i_shift_offset);
2380 /* Increment number of inner iterations */
2381 inneriter += j_index_end - j_index_start;
2383 /* Outer loop uses 18 flops */
2386 /* Increment number of outer iterations */
2389 /* Update outer/inner flops */
2391 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);