2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gmx_math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_double
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
114 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
116 __m128i ifour = _mm_set1_epi32(4);
117 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
119 __m256d dummy_mask,cutoff_mask;
120 __m128 tmpmask0,tmpmask1;
121 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
122 __m256d one = _mm256_set1_pd(1.0);
123 __m256d two = _mm256_set1_pd(2.0);
129 jindex = nlist->jindex;
131 shiftidx = nlist->shift;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm256_set1_pd(fr->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
139 vdwtype = mdatoms->typeA;
141 vftab = kernel_data->table_elec_vdw->data;
142 vftabscale = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
144 /* Setup water-specific parameters */
145 inr = nlist->iinr[0];
146 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
147 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
148 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
149 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
151 jq0 = _mm256_set1_pd(charge[inr+0]);
152 jq1 = _mm256_set1_pd(charge[inr+1]);
153 jq2 = _mm256_set1_pd(charge[inr+2]);
154 vdwjidx0A = 2*vdwtype[inr+0];
155 qq00 = _mm256_mul_pd(iq0,jq0);
156 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
157 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
158 qq01 = _mm256_mul_pd(iq0,jq1);
159 qq02 = _mm256_mul_pd(iq0,jq2);
160 qq10 = _mm256_mul_pd(iq1,jq0);
161 qq11 = _mm256_mul_pd(iq1,jq1);
162 qq12 = _mm256_mul_pd(iq1,jq2);
163 qq20 = _mm256_mul_pd(iq2,jq0);
164 qq21 = _mm256_mul_pd(iq2,jq1);
165 qq22 = _mm256_mul_pd(iq2,jq2);
167 /* Avoid stupid compiler warnings */
168 jnrA = jnrB = jnrC = jnrD = 0;
177 for(iidx=0;iidx<4*DIM;iidx++)
182 /* Start outer loop over neighborlists */
183 for(iidx=0; iidx<nri; iidx++)
185 /* Load shift vector for this list */
186 i_shift_offset = DIM*shiftidx[iidx];
188 /* Load limits for loop over neighbors */
189 j_index_start = jindex[iidx];
190 j_index_end = jindex[iidx+1];
192 /* Get outer coordinate index */
194 i_coord_offset = DIM*inr;
196 /* Load i particle coords and add shift vector */
197 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
198 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
200 fix0 = _mm256_setzero_pd();
201 fiy0 = _mm256_setzero_pd();
202 fiz0 = _mm256_setzero_pd();
203 fix1 = _mm256_setzero_pd();
204 fiy1 = _mm256_setzero_pd();
205 fiz1 = _mm256_setzero_pd();
206 fix2 = _mm256_setzero_pd();
207 fiy2 = _mm256_setzero_pd();
208 fiz2 = _mm256_setzero_pd();
210 /* Reset potential sums */
211 velecsum = _mm256_setzero_pd();
212 vvdwsum = _mm256_setzero_pd();
214 /* Start inner kernel loop */
215 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
218 /* Get j neighbor index, and coordinate index */
223 j_coord_offsetA = DIM*jnrA;
224 j_coord_offsetB = DIM*jnrB;
225 j_coord_offsetC = DIM*jnrC;
226 j_coord_offsetD = DIM*jnrD;
228 /* load j atom coordinates */
229 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
230 x+j_coord_offsetC,x+j_coord_offsetD,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
233 /* Calculate displacement vector */
234 dx00 = _mm256_sub_pd(ix0,jx0);
235 dy00 = _mm256_sub_pd(iy0,jy0);
236 dz00 = _mm256_sub_pd(iz0,jz0);
237 dx01 = _mm256_sub_pd(ix0,jx1);
238 dy01 = _mm256_sub_pd(iy0,jy1);
239 dz01 = _mm256_sub_pd(iz0,jz1);
240 dx02 = _mm256_sub_pd(ix0,jx2);
241 dy02 = _mm256_sub_pd(iy0,jy2);
242 dz02 = _mm256_sub_pd(iz0,jz2);
243 dx10 = _mm256_sub_pd(ix1,jx0);
244 dy10 = _mm256_sub_pd(iy1,jy0);
245 dz10 = _mm256_sub_pd(iz1,jz0);
246 dx11 = _mm256_sub_pd(ix1,jx1);
247 dy11 = _mm256_sub_pd(iy1,jy1);
248 dz11 = _mm256_sub_pd(iz1,jz1);
249 dx12 = _mm256_sub_pd(ix1,jx2);
250 dy12 = _mm256_sub_pd(iy1,jy2);
251 dz12 = _mm256_sub_pd(iz1,jz2);
252 dx20 = _mm256_sub_pd(ix2,jx0);
253 dy20 = _mm256_sub_pd(iy2,jy0);
254 dz20 = _mm256_sub_pd(iz2,jz0);
255 dx21 = _mm256_sub_pd(ix2,jx1);
256 dy21 = _mm256_sub_pd(iy2,jy1);
257 dz21 = _mm256_sub_pd(iz2,jz1);
258 dx22 = _mm256_sub_pd(ix2,jx2);
259 dy22 = _mm256_sub_pd(iy2,jy2);
260 dz22 = _mm256_sub_pd(iz2,jz2);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
264 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
265 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
266 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
267 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
268 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
269 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
270 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
271 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
273 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
274 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
275 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
276 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
277 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
278 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
279 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
280 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
281 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
283 fjx0 = _mm256_setzero_pd();
284 fjy0 = _mm256_setzero_pd();
285 fjz0 = _mm256_setzero_pd();
286 fjx1 = _mm256_setzero_pd();
287 fjy1 = _mm256_setzero_pd();
288 fjz1 = _mm256_setzero_pd();
289 fjx2 = _mm256_setzero_pd();
290 fjy2 = _mm256_setzero_pd();
291 fjz2 = _mm256_setzero_pd();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 r00 = _mm256_mul_pd(rsq00,rinv00);
299 /* Calculate table index by multiplying r with table scale and truncate to integer */
300 rt = _mm256_mul_pd(r00,vftabscale);
301 vfitab = _mm256_cvttpd_epi32(rt);
302 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
303 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
305 /* CUBIC SPLINE TABLE ELECTROSTATICS */
306 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
307 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
308 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
309 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
310 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
311 Heps = _mm256_mul_pd(vfeps,H);
312 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
313 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
314 velec = _mm256_mul_pd(qq00,VV);
315 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
316 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
318 /* CUBIC SPLINE TABLE DISPERSION */
319 vfitab = _mm_add_epi32(vfitab,ifour);
320 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
321 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
322 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
323 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
324 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
325 Heps = _mm256_mul_pd(vfeps,H);
326 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
327 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
328 vvdw6 = _mm256_mul_pd(c6_00,VV);
329 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
330 fvdw6 = _mm256_mul_pd(c6_00,FF);
332 /* CUBIC SPLINE TABLE REPULSION */
333 vfitab = _mm_add_epi32(vfitab,ifour);
334 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
335 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
336 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
337 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
338 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
339 Heps = _mm256_mul_pd(vfeps,H);
340 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
341 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
342 vvdw12 = _mm256_mul_pd(c12_00,VV);
343 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
344 fvdw12 = _mm256_mul_pd(c12_00,FF);
345 vvdw = _mm256_add_pd(vvdw12,vvdw6);
346 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
348 /* Update potential sum for this i atom from the interaction with this j atom. */
349 velecsum = _mm256_add_pd(velecsum,velec);
350 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
352 fscal = _mm256_add_pd(felec,fvdw);
354 /* Calculate temporary vectorial force */
355 tx = _mm256_mul_pd(fscal,dx00);
356 ty = _mm256_mul_pd(fscal,dy00);
357 tz = _mm256_mul_pd(fscal,dz00);
359 /* Update vectorial force */
360 fix0 = _mm256_add_pd(fix0,tx);
361 fiy0 = _mm256_add_pd(fiy0,ty);
362 fiz0 = _mm256_add_pd(fiz0,tz);
364 fjx0 = _mm256_add_pd(fjx0,tx);
365 fjy0 = _mm256_add_pd(fjy0,ty);
366 fjz0 = _mm256_add_pd(fjz0,tz);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 r01 = _mm256_mul_pd(rsq01,rinv01);
374 /* Calculate table index by multiplying r with table scale and truncate to integer */
375 rt = _mm256_mul_pd(r01,vftabscale);
376 vfitab = _mm256_cvttpd_epi32(rt);
377 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
378 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
380 /* CUBIC SPLINE TABLE ELECTROSTATICS */
381 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
382 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
383 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
384 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
385 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
386 Heps = _mm256_mul_pd(vfeps,H);
387 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
388 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
389 velec = _mm256_mul_pd(qq01,VV);
390 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
391 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velecsum = _mm256_add_pd(velecsum,velec);
398 /* Calculate temporary vectorial force */
399 tx = _mm256_mul_pd(fscal,dx01);
400 ty = _mm256_mul_pd(fscal,dy01);
401 tz = _mm256_mul_pd(fscal,dz01);
403 /* Update vectorial force */
404 fix0 = _mm256_add_pd(fix0,tx);
405 fiy0 = _mm256_add_pd(fiy0,ty);
406 fiz0 = _mm256_add_pd(fiz0,tz);
408 fjx1 = _mm256_add_pd(fjx1,tx);
409 fjy1 = _mm256_add_pd(fjy1,ty);
410 fjz1 = _mm256_add_pd(fjz1,tz);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 r02 = _mm256_mul_pd(rsq02,rinv02);
418 /* Calculate table index by multiplying r with table scale and truncate to integer */
419 rt = _mm256_mul_pd(r02,vftabscale);
420 vfitab = _mm256_cvttpd_epi32(rt);
421 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
422 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
424 /* CUBIC SPLINE TABLE ELECTROSTATICS */
425 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
426 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
427 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
428 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
429 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
430 Heps = _mm256_mul_pd(vfeps,H);
431 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
432 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
433 velec = _mm256_mul_pd(qq02,VV);
434 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
435 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
437 /* Update potential sum for this i atom from the interaction with this j atom. */
438 velecsum = _mm256_add_pd(velecsum,velec);
442 /* Calculate temporary vectorial force */
443 tx = _mm256_mul_pd(fscal,dx02);
444 ty = _mm256_mul_pd(fscal,dy02);
445 tz = _mm256_mul_pd(fscal,dz02);
447 /* Update vectorial force */
448 fix0 = _mm256_add_pd(fix0,tx);
449 fiy0 = _mm256_add_pd(fiy0,ty);
450 fiz0 = _mm256_add_pd(fiz0,tz);
452 fjx2 = _mm256_add_pd(fjx2,tx);
453 fjy2 = _mm256_add_pd(fjy2,ty);
454 fjz2 = _mm256_add_pd(fjz2,tz);
456 /**************************
457 * CALCULATE INTERACTIONS *
458 **************************/
460 r10 = _mm256_mul_pd(rsq10,rinv10);
462 /* Calculate table index by multiplying r with table scale and truncate to integer */
463 rt = _mm256_mul_pd(r10,vftabscale);
464 vfitab = _mm256_cvttpd_epi32(rt);
465 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
466 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
468 /* CUBIC SPLINE TABLE ELECTROSTATICS */
469 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
470 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
471 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
472 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
473 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
474 Heps = _mm256_mul_pd(vfeps,H);
475 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
476 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
477 velec = _mm256_mul_pd(qq10,VV);
478 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
479 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velecsum = _mm256_add_pd(velecsum,velec);
486 /* Calculate temporary vectorial force */
487 tx = _mm256_mul_pd(fscal,dx10);
488 ty = _mm256_mul_pd(fscal,dy10);
489 tz = _mm256_mul_pd(fscal,dz10);
491 /* Update vectorial force */
492 fix1 = _mm256_add_pd(fix1,tx);
493 fiy1 = _mm256_add_pd(fiy1,ty);
494 fiz1 = _mm256_add_pd(fiz1,tz);
496 fjx0 = _mm256_add_pd(fjx0,tx);
497 fjy0 = _mm256_add_pd(fjy0,ty);
498 fjz0 = _mm256_add_pd(fjz0,tz);
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 r11 = _mm256_mul_pd(rsq11,rinv11);
506 /* Calculate table index by multiplying r with table scale and truncate to integer */
507 rt = _mm256_mul_pd(r11,vftabscale);
508 vfitab = _mm256_cvttpd_epi32(rt);
509 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
510 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
512 /* CUBIC SPLINE TABLE ELECTROSTATICS */
513 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
514 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
515 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
516 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
517 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
518 Heps = _mm256_mul_pd(vfeps,H);
519 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
520 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
521 velec = _mm256_mul_pd(qq11,VV);
522 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
523 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm256_add_pd(velecsum,velec);
530 /* Calculate temporary vectorial force */
531 tx = _mm256_mul_pd(fscal,dx11);
532 ty = _mm256_mul_pd(fscal,dy11);
533 tz = _mm256_mul_pd(fscal,dz11);
535 /* Update vectorial force */
536 fix1 = _mm256_add_pd(fix1,tx);
537 fiy1 = _mm256_add_pd(fiy1,ty);
538 fiz1 = _mm256_add_pd(fiz1,tz);
540 fjx1 = _mm256_add_pd(fjx1,tx);
541 fjy1 = _mm256_add_pd(fjy1,ty);
542 fjz1 = _mm256_add_pd(fjz1,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r12 = _mm256_mul_pd(rsq12,rinv12);
550 /* Calculate table index by multiplying r with table scale and truncate to integer */
551 rt = _mm256_mul_pd(r12,vftabscale);
552 vfitab = _mm256_cvttpd_epi32(rt);
553 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
554 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
556 /* CUBIC SPLINE TABLE ELECTROSTATICS */
557 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
558 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
559 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
560 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
561 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
562 Heps = _mm256_mul_pd(vfeps,H);
563 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
564 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
565 velec = _mm256_mul_pd(qq12,VV);
566 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
567 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velecsum = _mm256_add_pd(velecsum,velec);
574 /* Calculate temporary vectorial force */
575 tx = _mm256_mul_pd(fscal,dx12);
576 ty = _mm256_mul_pd(fscal,dy12);
577 tz = _mm256_mul_pd(fscal,dz12);
579 /* Update vectorial force */
580 fix1 = _mm256_add_pd(fix1,tx);
581 fiy1 = _mm256_add_pd(fiy1,ty);
582 fiz1 = _mm256_add_pd(fiz1,tz);
584 fjx2 = _mm256_add_pd(fjx2,tx);
585 fjy2 = _mm256_add_pd(fjy2,ty);
586 fjz2 = _mm256_add_pd(fjz2,tz);
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
592 r20 = _mm256_mul_pd(rsq20,rinv20);
594 /* Calculate table index by multiplying r with table scale and truncate to integer */
595 rt = _mm256_mul_pd(r20,vftabscale);
596 vfitab = _mm256_cvttpd_epi32(rt);
597 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
598 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
600 /* CUBIC SPLINE TABLE ELECTROSTATICS */
601 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
602 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
603 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
604 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
605 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
606 Heps = _mm256_mul_pd(vfeps,H);
607 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
608 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
609 velec = _mm256_mul_pd(qq20,VV);
610 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
611 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
613 /* Update potential sum for this i atom from the interaction with this j atom. */
614 velecsum = _mm256_add_pd(velecsum,velec);
618 /* Calculate temporary vectorial force */
619 tx = _mm256_mul_pd(fscal,dx20);
620 ty = _mm256_mul_pd(fscal,dy20);
621 tz = _mm256_mul_pd(fscal,dz20);
623 /* Update vectorial force */
624 fix2 = _mm256_add_pd(fix2,tx);
625 fiy2 = _mm256_add_pd(fiy2,ty);
626 fiz2 = _mm256_add_pd(fiz2,tz);
628 fjx0 = _mm256_add_pd(fjx0,tx);
629 fjy0 = _mm256_add_pd(fjy0,ty);
630 fjz0 = _mm256_add_pd(fjz0,tz);
632 /**************************
633 * CALCULATE INTERACTIONS *
634 **************************/
636 r21 = _mm256_mul_pd(rsq21,rinv21);
638 /* Calculate table index by multiplying r with table scale and truncate to integer */
639 rt = _mm256_mul_pd(r21,vftabscale);
640 vfitab = _mm256_cvttpd_epi32(rt);
641 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
642 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
644 /* CUBIC SPLINE TABLE ELECTROSTATICS */
645 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
646 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
647 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
648 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
649 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
650 Heps = _mm256_mul_pd(vfeps,H);
651 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
652 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
653 velec = _mm256_mul_pd(qq21,VV);
654 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
655 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velecsum = _mm256_add_pd(velecsum,velec);
662 /* Calculate temporary vectorial force */
663 tx = _mm256_mul_pd(fscal,dx21);
664 ty = _mm256_mul_pd(fscal,dy21);
665 tz = _mm256_mul_pd(fscal,dz21);
667 /* Update vectorial force */
668 fix2 = _mm256_add_pd(fix2,tx);
669 fiy2 = _mm256_add_pd(fiy2,ty);
670 fiz2 = _mm256_add_pd(fiz2,tz);
672 fjx1 = _mm256_add_pd(fjx1,tx);
673 fjy1 = _mm256_add_pd(fjy1,ty);
674 fjz1 = _mm256_add_pd(fjz1,tz);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 r22 = _mm256_mul_pd(rsq22,rinv22);
682 /* Calculate table index by multiplying r with table scale and truncate to integer */
683 rt = _mm256_mul_pd(r22,vftabscale);
684 vfitab = _mm256_cvttpd_epi32(rt);
685 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
686 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
688 /* CUBIC SPLINE TABLE ELECTROSTATICS */
689 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
690 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
691 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
692 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
693 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
694 Heps = _mm256_mul_pd(vfeps,H);
695 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
696 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
697 velec = _mm256_mul_pd(qq22,VV);
698 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
699 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
701 /* Update potential sum for this i atom from the interaction with this j atom. */
702 velecsum = _mm256_add_pd(velecsum,velec);
706 /* Calculate temporary vectorial force */
707 tx = _mm256_mul_pd(fscal,dx22);
708 ty = _mm256_mul_pd(fscal,dy22);
709 tz = _mm256_mul_pd(fscal,dz22);
711 /* Update vectorial force */
712 fix2 = _mm256_add_pd(fix2,tx);
713 fiy2 = _mm256_add_pd(fiy2,ty);
714 fiz2 = _mm256_add_pd(fiz2,tz);
716 fjx2 = _mm256_add_pd(fjx2,tx);
717 fjy2 = _mm256_add_pd(fjy2,ty);
718 fjz2 = _mm256_add_pd(fjz2,tz);
720 fjptrA = f+j_coord_offsetA;
721 fjptrB = f+j_coord_offsetB;
722 fjptrC = f+j_coord_offsetC;
723 fjptrD = f+j_coord_offsetD;
725 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
726 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
728 /* Inner loop uses 417 flops */
734 /* Get j neighbor index, and coordinate index */
735 jnrlistA = jjnr[jidx];
736 jnrlistB = jjnr[jidx+1];
737 jnrlistC = jjnr[jidx+2];
738 jnrlistD = jjnr[jidx+3];
739 /* Sign of each element will be negative for non-real atoms.
740 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
741 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
743 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
745 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
746 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
747 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
749 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
750 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
751 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
752 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
753 j_coord_offsetA = DIM*jnrA;
754 j_coord_offsetB = DIM*jnrB;
755 j_coord_offsetC = DIM*jnrC;
756 j_coord_offsetD = DIM*jnrD;
758 /* load j atom coordinates */
759 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
760 x+j_coord_offsetC,x+j_coord_offsetD,
761 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
763 /* Calculate displacement vector */
764 dx00 = _mm256_sub_pd(ix0,jx0);
765 dy00 = _mm256_sub_pd(iy0,jy0);
766 dz00 = _mm256_sub_pd(iz0,jz0);
767 dx01 = _mm256_sub_pd(ix0,jx1);
768 dy01 = _mm256_sub_pd(iy0,jy1);
769 dz01 = _mm256_sub_pd(iz0,jz1);
770 dx02 = _mm256_sub_pd(ix0,jx2);
771 dy02 = _mm256_sub_pd(iy0,jy2);
772 dz02 = _mm256_sub_pd(iz0,jz2);
773 dx10 = _mm256_sub_pd(ix1,jx0);
774 dy10 = _mm256_sub_pd(iy1,jy0);
775 dz10 = _mm256_sub_pd(iz1,jz0);
776 dx11 = _mm256_sub_pd(ix1,jx1);
777 dy11 = _mm256_sub_pd(iy1,jy1);
778 dz11 = _mm256_sub_pd(iz1,jz1);
779 dx12 = _mm256_sub_pd(ix1,jx2);
780 dy12 = _mm256_sub_pd(iy1,jy2);
781 dz12 = _mm256_sub_pd(iz1,jz2);
782 dx20 = _mm256_sub_pd(ix2,jx0);
783 dy20 = _mm256_sub_pd(iy2,jy0);
784 dz20 = _mm256_sub_pd(iz2,jz0);
785 dx21 = _mm256_sub_pd(ix2,jx1);
786 dy21 = _mm256_sub_pd(iy2,jy1);
787 dz21 = _mm256_sub_pd(iz2,jz1);
788 dx22 = _mm256_sub_pd(ix2,jx2);
789 dy22 = _mm256_sub_pd(iy2,jy2);
790 dz22 = _mm256_sub_pd(iz2,jz2);
792 /* Calculate squared distance and things based on it */
793 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
794 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
795 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
796 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
797 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
798 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
799 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
800 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
801 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
803 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
804 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
805 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
806 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
807 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
808 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
809 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
810 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
811 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
813 fjx0 = _mm256_setzero_pd();
814 fjy0 = _mm256_setzero_pd();
815 fjz0 = _mm256_setzero_pd();
816 fjx1 = _mm256_setzero_pd();
817 fjy1 = _mm256_setzero_pd();
818 fjz1 = _mm256_setzero_pd();
819 fjx2 = _mm256_setzero_pd();
820 fjy2 = _mm256_setzero_pd();
821 fjz2 = _mm256_setzero_pd();
823 /**************************
824 * CALCULATE INTERACTIONS *
825 **************************/
827 r00 = _mm256_mul_pd(rsq00,rinv00);
828 r00 = _mm256_andnot_pd(dummy_mask,r00);
830 /* Calculate table index by multiplying r with table scale and truncate to integer */
831 rt = _mm256_mul_pd(r00,vftabscale);
832 vfitab = _mm256_cvttpd_epi32(rt);
833 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
834 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
836 /* CUBIC SPLINE TABLE ELECTROSTATICS */
837 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
838 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
839 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
840 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
841 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
842 Heps = _mm256_mul_pd(vfeps,H);
843 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
844 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
845 velec = _mm256_mul_pd(qq00,VV);
846 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
847 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
849 /* CUBIC SPLINE TABLE DISPERSION */
850 vfitab = _mm_add_epi32(vfitab,ifour);
851 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
852 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
853 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
854 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
855 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
856 Heps = _mm256_mul_pd(vfeps,H);
857 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
858 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
859 vvdw6 = _mm256_mul_pd(c6_00,VV);
860 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
861 fvdw6 = _mm256_mul_pd(c6_00,FF);
863 /* CUBIC SPLINE TABLE REPULSION */
864 vfitab = _mm_add_epi32(vfitab,ifour);
865 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
866 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
867 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
868 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
869 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
870 Heps = _mm256_mul_pd(vfeps,H);
871 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
872 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
873 vvdw12 = _mm256_mul_pd(c12_00,VV);
874 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
875 fvdw12 = _mm256_mul_pd(c12_00,FF);
876 vvdw = _mm256_add_pd(vvdw12,vvdw6);
877 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
879 /* Update potential sum for this i atom from the interaction with this j atom. */
880 velec = _mm256_andnot_pd(dummy_mask,velec);
881 velecsum = _mm256_add_pd(velecsum,velec);
882 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
883 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
885 fscal = _mm256_add_pd(felec,fvdw);
887 fscal = _mm256_andnot_pd(dummy_mask,fscal);
889 /* Calculate temporary vectorial force */
890 tx = _mm256_mul_pd(fscal,dx00);
891 ty = _mm256_mul_pd(fscal,dy00);
892 tz = _mm256_mul_pd(fscal,dz00);
894 /* Update vectorial force */
895 fix0 = _mm256_add_pd(fix0,tx);
896 fiy0 = _mm256_add_pd(fiy0,ty);
897 fiz0 = _mm256_add_pd(fiz0,tz);
899 fjx0 = _mm256_add_pd(fjx0,tx);
900 fjy0 = _mm256_add_pd(fjy0,ty);
901 fjz0 = _mm256_add_pd(fjz0,tz);
903 /**************************
904 * CALCULATE INTERACTIONS *
905 **************************/
907 r01 = _mm256_mul_pd(rsq01,rinv01);
908 r01 = _mm256_andnot_pd(dummy_mask,r01);
910 /* Calculate table index by multiplying r with table scale and truncate to integer */
911 rt = _mm256_mul_pd(r01,vftabscale);
912 vfitab = _mm256_cvttpd_epi32(rt);
913 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
914 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
916 /* CUBIC SPLINE TABLE ELECTROSTATICS */
917 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
918 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
919 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
920 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
921 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
922 Heps = _mm256_mul_pd(vfeps,H);
923 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
924 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
925 velec = _mm256_mul_pd(qq01,VV);
926 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
927 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
929 /* Update potential sum for this i atom from the interaction with this j atom. */
930 velec = _mm256_andnot_pd(dummy_mask,velec);
931 velecsum = _mm256_add_pd(velecsum,velec);
935 fscal = _mm256_andnot_pd(dummy_mask,fscal);
937 /* Calculate temporary vectorial force */
938 tx = _mm256_mul_pd(fscal,dx01);
939 ty = _mm256_mul_pd(fscal,dy01);
940 tz = _mm256_mul_pd(fscal,dz01);
942 /* Update vectorial force */
943 fix0 = _mm256_add_pd(fix0,tx);
944 fiy0 = _mm256_add_pd(fiy0,ty);
945 fiz0 = _mm256_add_pd(fiz0,tz);
947 fjx1 = _mm256_add_pd(fjx1,tx);
948 fjy1 = _mm256_add_pd(fjy1,ty);
949 fjz1 = _mm256_add_pd(fjz1,tz);
951 /**************************
952 * CALCULATE INTERACTIONS *
953 **************************/
955 r02 = _mm256_mul_pd(rsq02,rinv02);
956 r02 = _mm256_andnot_pd(dummy_mask,r02);
958 /* Calculate table index by multiplying r with table scale and truncate to integer */
959 rt = _mm256_mul_pd(r02,vftabscale);
960 vfitab = _mm256_cvttpd_epi32(rt);
961 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
962 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
964 /* CUBIC SPLINE TABLE ELECTROSTATICS */
965 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
966 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
967 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
968 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
969 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
970 Heps = _mm256_mul_pd(vfeps,H);
971 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
972 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
973 velec = _mm256_mul_pd(qq02,VV);
974 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
975 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
977 /* Update potential sum for this i atom from the interaction with this j atom. */
978 velec = _mm256_andnot_pd(dummy_mask,velec);
979 velecsum = _mm256_add_pd(velecsum,velec);
983 fscal = _mm256_andnot_pd(dummy_mask,fscal);
985 /* Calculate temporary vectorial force */
986 tx = _mm256_mul_pd(fscal,dx02);
987 ty = _mm256_mul_pd(fscal,dy02);
988 tz = _mm256_mul_pd(fscal,dz02);
990 /* Update vectorial force */
991 fix0 = _mm256_add_pd(fix0,tx);
992 fiy0 = _mm256_add_pd(fiy0,ty);
993 fiz0 = _mm256_add_pd(fiz0,tz);
995 fjx2 = _mm256_add_pd(fjx2,tx);
996 fjy2 = _mm256_add_pd(fjy2,ty);
997 fjz2 = _mm256_add_pd(fjz2,tz);
999 /**************************
1000 * CALCULATE INTERACTIONS *
1001 **************************/
1003 r10 = _mm256_mul_pd(rsq10,rinv10);
1004 r10 = _mm256_andnot_pd(dummy_mask,r10);
1006 /* Calculate table index by multiplying r with table scale and truncate to integer */
1007 rt = _mm256_mul_pd(r10,vftabscale);
1008 vfitab = _mm256_cvttpd_epi32(rt);
1009 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1010 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1012 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1013 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1014 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1015 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1016 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1017 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1018 Heps = _mm256_mul_pd(vfeps,H);
1019 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1020 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1021 velec = _mm256_mul_pd(qq10,VV);
1022 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1023 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec = _mm256_andnot_pd(dummy_mask,velec);
1027 velecsum = _mm256_add_pd(velecsum,velec);
1031 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1033 /* Calculate temporary vectorial force */
1034 tx = _mm256_mul_pd(fscal,dx10);
1035 ty = _mm256_mul_pd(fscal,dy10);
1036 tz = _mm256_mul_pd(fscal,dz10);
1038 /* Update vectorial force */
1039 fix1 = _mm256_add_pd(fix1,tx);
1040 fiy1 = _mm256_add_pd(fiy1,ty);
1041 fiz1 = _mm256_add_pd(fiz1,tz);
1043 fjx0 = _mm256_add_pd(fjx0,tx);
1044 fjy0 = _mm256_add_pd(fjy0,ty);
1045 fjz0 = _mm256_add_pd(fjz0,tz);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 r11 = _mm256_mul_pd(rsq11,rinv11);
1052 r11 = _mm256_andnot_pd(dummy_mask,r11);
1054 /* Calculate table index by multiplying r with table scale and truncate to integer */
1055 rt = _mm256_mul_pd(r11,vftabscale);
1056 vfitab = _mm256_cvttpd_epi32(rt);
1057 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1058 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1060 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1061 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1062 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1063 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1064 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1065 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1066 Heps = _mm256_mul_pd(vfeps,H);
1067 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1068 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1069 velec = _mm256_mul_pd(qq11,VV);
1070 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1071 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1073 /* Update potential sum for this i atom from the interaction with this j atom. */
1074 velec = _mm256_andnot_pd(dummy_mask,velec);
1075 velecsum = _mm256_add_pd(velecsum,velec);
1079 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1081 /* Calculate temporary vectorial force */
1082 tx = _mm256_mul_pd(fscal,dx11);
1083 ty = _mm256_mul_pd(fscal,dy11);
1084 tz = _mm256_mul_pd(fscal,dz11);
1086 /* Update vectorial force */
1087 fix1 = _mm256_add_pd(fix1,tx);
1088 fiy1 = _mm256_add_pd(fiy1,ty);
1089 fiz1 = _mm256_add_pd(fiz1,tz);
1091 fjx1 = _mm256_add_pd(fjx1,tx);
1092 fjy1 = _mm256_add_pd(fjy1,ty);
1093 fjz1 = _mm256_add_pd(fjz1,tz);
1095 /**************************
1096 * CALCULATE INTERACTIONS *
1097 **************************/
1099 r12 = _mm256_mul_pd(rsq12,rinv12);
1100 r12 = _mm256_andnot_pd(dummy_mask,r12);
1102 /* Calculate table index by multiplying r with table scale and truncate to integer */
1103 rt = _mm256_mul_pd(r12,vftabscale);
1104 vfitab = _mm256_cvttpd_epi32(rt);
1105 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1106 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1108 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1109 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1110 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1111 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1112 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1113 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1114 Heps = _mm256_mul_pd(vfeps,H);
1115 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1116 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1117 velec = _mm256_mul_pd(qq12,VV);
1118 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1119 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1121 /* Update potential sum for this i atom from the interaction with this j atom. */
1122 velec = _mm256_andnot_pd(dummy_mask,velec);
1123 velecsum = _mm256_add_pd(velecsum,velec);
1127 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1129 /* Calculate temporary vectorial force */
1130 tx = _mm256_mul_pd(fscal,dx12);
1131 ty = _mm256_mul_pd(fscal,dy12);
1132 tz = _mm256_mul_pd(fscal,dz12);
1134 /* Update vectorial force */
1135 fix1 = _mm256_add_pd(fix1,tx);
1136 fiy1 = _mm256_add_pd(fiy1,ty);
1137 fiz1 = _mm256_add_pd(fiz1,tz);
1139 fjx2 = _mm256_add_pd(fjx2,tx);
1140 fjy2 = _mm256_add_pd(fjy2,ty);
1141 fjz2 = _mm256_add_pd(fjz2,tz);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 r20 = _mm256_mul_pd(rsq20,rinv20);
1148 r20 = _mm256_andnot_pd(dummy_mask,r20);
1150 /* Calculate table index by multiplying r with table scale and truncate to integer */
1151 rt = _mm256_mul_pd(r20,vftabscale);
1152 vfitab = _mm256_cvttpd_epi32(rt);
1153 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1154 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1156 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1157 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1158 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1159 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1160 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1161 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1162 Heps = _mm256_mul_pd(vfeps,H);
1163 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1164 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1165 velec = _mm256_mul_pd(qq20,VV);
1166 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1167 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1169 /* Update potential sum for this i atom from the interaction with this j atom. */
1170 velec = _mm256_andnot_pd(dummy_mask,velec);
1171 velecsum = _mm256_add_pd(velecsum,velec);
1175 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1177 /* Calculate temporary vectorial force */
1178 tx = _mm256_mul_pd(fscal,dx20);
1179 ty = _mm256_mul_pd(fscal,dy20);
1180 tz = _mm256_mul_pd(fscal,dz20);
1182 /* Update vectorial force */
1183 fix2 = _mm256_add_pd(fix2,tx);
1184 fiy2 = _mm256_add_pd(fiy2,ty);
1185 fiz2 = _mm256_add_pd(fiz2,tz);
1187 fjx0 = _mm256_add_pd(fjx0,tx);
1188 fjy0 = _mm256_add_pd(fjy0,ty);
1189 fjz0 = _mm256_add_pd(fjz0,tz);
1191 /**************************
1192 * CALCULATE INTERACTIONS *
1193 **************************/
1195 r21 = _mm256_mul_pd(rsq21,rinv21);
1196 r21 = _mm256_andnot_pd(dummy_mask,r21);
1198 /* Calculate table index by multiplying r with table scale and truncate to integer */
1199 rt = _mm256_mul_pd(r21,vftabscale);
1200 vfitab = _mm256_cvttpd_epi32(rt);
1201 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1202 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1204 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1205 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1206 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1207 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1208 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1209 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1210 Heps = _mm256_mul_pd(vfeps,H);
1211 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1212 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1213 velec = _mm256_mul_pd(qq21,VV);
1214 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1215 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1217 /* Update potential sum for this i atom from the interaction with this j atom. */
1218 velec = _mm256_andnot_pd(dummy_mask,velec);
1219 velecsum = _mm256_add_pd(velecsum,velec);
1223 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1225 /* Calculate temporary vectorial force */
1226 tx = _mm256_mul_pd(fscal,dx21);
1227 ty = _mm256_mul_pd(fscal,dy21);
1228 tz = _mm256_mul_pd(fscal,dz21);
1230 /* Update vectorial force */
1231 fix2 = _mm256_add_pd(fix2,tx);
1232 fiy2 = _mm256_add_pd(fiy2,ty);
1233 fiz2 = _mm256_add_pd(fiz2,tz);
1235 fjx1 = _mm256_add_pd(fjx1,tx);
1236 fjy1 = _mm256_add_pd(fjy1,ty);
1237 fjz1 = _mm256_add_pd(fjz1,tz);
1239 /**************************
1240 * CALCULATE INTERACTIONS *
1241 **************************/
1243 r22 = _mm256_mul_pd(rsq22,rinv22);
1244 r22 = _mm256_andnot_pd(dummy_mask,r22);
1246 /* Calculate table index by multiplying r with table scale and truncate to integer */
1247 rt = _mm256_mul_pd(r22,vftabscale);
1248 vfitab = _mm256_cvttpd_epi32(rt);
1249 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1250 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1252 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1253 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1254 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1255 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1256 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1257 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1258 Heps = _mm256_mul_pd(vfeps,H);
1259 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1260 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1261 velec = _mm256_mul_pd(qq22,VV);
1262 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1263 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1265 /* Update potential sum for this i atom from the interaction with this j atom. */
1266 velec = _mm256_andnot_pd(dummy_mask,velec);
1267 velecsum = _mm256_add_pd(velecsum,velec);
1271 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1273 /* Calculate temporary vectorial force */
1274 tx = _mm256_mul_pd(fscal,dx22);
1275 ty = _mm256_mul_pd(fscal,dy22);
1276 tz = _mm256_mul_pd(fscal,dz22);
1278 /* Update vectorial force */
1279 fix2 = _mm256_add_pd(fix2,tx);
1280 fiy2 = _mm256_add_pd(fiy2,ty);
1281 fiz2 = _mm256_add_pd(fiz2,tz);
1283 fjx2 = _mm256_add_pd(fjx2,tx);
1284 fjy2 = _mm256_add_pd(fjy2,ty);
1285 fjz2 = _mm256_add_pd(fjz2,tz);
1287 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1288 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1289 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1290 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1292 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1293 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1295 /* Inner loop uses 426 flops */
1298 /* End of innermost loop */
1300 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1301 f+i_coord_offset,fshift+i_shift_offset);
1304 /* Update potential energies */
1305 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1306 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1308 /* Increment number of inner iterations */
1309 inneriter += j_index_end - j_index_start;
1311 /* Outer loop uses 20 flops */
1314 /* Increment number of outer iterations */
1317 /* Update outer/inner flops */
1319 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1322 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_double
1323 * Electrostatics interaction: CubicSplineTable
1324 * VdW interaction: CubicSplineTable
1325 * Geometry: Water3-Water3
1326 * Calculate force/pot: Force
1329 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_double
1330 (t_nblist * gmx_restrict nlist,
1331 rvec * gmx_restrict xx,
1332 rvec * gmx_restrict ff,
1333 t_forcerec * gmx_restrict fr,
1334 t_mdatoms * gmx_restrict mdatoms,
1335 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1336 t_nrnb * gmx_restrict nrnb)
1338 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1339 * just 0 for non-waters.
1340 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1341 * jnr indices corresponding to data put in the four positions in the SIMD register.
1343 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1344 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1345 int jnrA,jnrB,jnrC,jnrD;
1346 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1347 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1348 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1349 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1350 real rcutoff_scalar;
1351 real *shiftvec,*fshift,*x,*f;
1352 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1353 real scratch[4*DIM];
1354 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1355 real * vdwioffsetptr0;
1356 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1357 real * vdwioffsetptr1;
1358 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1359 real * vdwioffsetptr2;
1360 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1361 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1362 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1363 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1364 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1365 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1366 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1367 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1368 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1369 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1370 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1371 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1372 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1373 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1374 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1375 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1376 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1379 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1382 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1383 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1385 __m128i ifour = _mm_set1_epi32(4);
1386 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1388 __m256d dummy_mask,cutoff_mask;
1389 __m128 tmpmask0,tmpmask1;
1390 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1391 __m256d one = _mm256_set1_pd(1.0);
1392 __m256d two = _mm256_set1_pd(2.0);
1398 jindex = nlist->jindex;
1400 shiftidx = nlist->shift;
1402 shiftvec = fr->shift_vec[0];
1403 fshift = fr->fshift[0];
1404 facel = _mm256_set1_pd(fr->epsfac);
1405 charge = mdatoms->chargeA;
1406 nvdwtype = fr->ntype;
1407 vdwparam = fr->nbfp;
1408 vdwtype = mdatoms->typeA;
1410 vftab = kernel_data->table_elec_vdw->data;
1411 vftabscale = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
1413 /* Setup water-specific parameters */
1414 inr = nlist->iinr[0];
1415 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1416 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1417 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1418 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1420 jq0 = _mm256_set1_pd(charge[inr+0]);
1421 jq1 = _mm256_set1_pd(charge[inr+1]);
1422 jq2 = _mm256_set1_pd(charge[inr+2]);
1423 vdwjidx0A = 2*vdwtype[inr+0];
1424 qq00 = _mm256_mul_pd(iq0,jq0);
1425 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1426 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1427 qq01 = _mm256_mul_pd(iq0,jq1);
1428 qq02 = _mm256_mul_pd(iq0,jq2);
1429 qq10 = _mm256_mul_pd(iq1,jq0);
1430 qq11 = _mm256_mul_pd(iq1,jq1);
1431 qq12 = _mm256_mul_pd(iq1,jq2);
1432 qq20 = _mm256_mul_pd(iq2,jq0);
1433 qq21 = _mm256_mul_pd(iq2,jq1);
1434 qq22 = _mm256_mul_pd(iq2,jq2);
1436 /* Avoid stupid compiler warnings */
1437 jnrA = jnrB = jnrC = jnrD = 0;
1438 j_coord_offsetA = 0;
1439 j_coord_offsetB = 0;
1440 j_coord_offsetC = 0;
1441 j_coord_offsetD = 0;
1446 for(iidx=0;iidx<4*DIM;iidx++)
1448 scratch[iidx] = 0.0;
1451 /* Start outer loop over neighborlists */
1452 for(iidx=0; iidx<nri; iidx++)
1454 /* Load shift vector for this list */
1455 i_shift_offset = DIM*shiftidx[iidx];
1457 /* Load limits for loop over neighbors */
1458 j_index_start = jindex[iidx];
1459 j_index_end = jindex[iidx+1];
1461 /* Get outer coordinate index */
1463 i_coord_offset = DIM*inr;
1465 /* Load i particle coords and add shift vector */
1466 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1467 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1469 fix0 = _mm256_setzero_pd();
1470 fiy0 = _mm256_setzero_pd();
1471 fiz0 = _mm256_setzero_pd();
1472 fix1 = _mm256_setzero_pd();
1473 fiy1 = _mm256_setzero_pd();
1474 fiz1 = _mm256_setzero_pd();
1475 fix2 = _mm256_setzero_pd();
1476 fiy2 = _mm256_setzero_pd();
1477 fiz2 = _mm256_setzero_pd();
1479 /* Start inner kernel loop */
1480 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1483 /* Get j neighbor index, and coordinate index */
1485 jnrB = jjnr[jidx+1];
1486 jnrC = jjnr[jidx+2];
1487 jnrD = jjnr[jidx+3];
1488 j_coord_offsetA = DIM*jnrA;
1489 j_coord_offsetB = DIM*jnrB;
1490 j_coord_offsetC = DIM*jnrC;
1491 j_coord_offsetD = DIM*jnrD;
1493 /* load j atom coordinates */
1494 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1495 x+j_coord_offsetC,x+j_coord_offsetD,
1496 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1498 /* Calculate displacement vector */
1499 dx00 = _mm256_sub_pd(ix0,jx0);
1500 dy00 = _mm256_sub_pd(iy0,jy0);
1501 dz00 = _mm256_sub_pd(iz0,jz0);
1502 dx01 = _mm256_sub_pd(ix0,jx1);
1503 dy01 = _mm256_sub_pd(iy0,jy1);
1504 dz01 = _mm256_sub_pd(iz0,jz1);
1505 dx02 = _mm256_sub_pd(ix0,jx2);
1506 dy02 = _mm256_sub_pd(iy0,jy2);
1507 dz02 = _mm256_sub_pd(iz0,jz2);
1508 dx10 = _mm256_sub_pd(ix1,jx0);
1509 dy10 = _mm256_sub_pd(iy1,jy0);
1510 dz10 = _mm256_sub_pd(iz1,jz0);
1511 dx11 = _mm256_sub_pd(ix1,jx1);
1512 dy11 = _mm256_sub_pd(iy1,jy1);
1513 dz11 = _mm256_sub_pd(iz1,jz1);
1514 dx12 = _mm256_sub_pd(ix1,jx2);
1515 dy12 = _mm256_sub_pd(iy1,jy2);
1516 dz12 = _mm256_sub_pd(iz1,jz2);
1517 dx20 = _mm256_sub_pd(ix2,jx0);
1518 dy20 = _mm256_sub_pd(iy2,jy0);
1519 dz20 = _mm256_sub_pd(iz2,jz0);
1520 dx21 = _mm256_sub_pd(ix2,jx1);
1521 dy21 = _mm256_sub_pd(iy2,jy1);
1522 dz21 = _mm256_sub_pd(iz2,jz1);
1523 dx22 = _mm256_sub_pd(ix2,jx2);
1524 dy22 = _mm256_sub_pd(iy2,jy2);
1525 dz22 = _mm256_sub_pd(iz2,jz2);
1527 /* Calculate squared distance and things based on it */
1528 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1529 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1530 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1531 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1532 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1533 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1534 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1535 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1536 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1538 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1539 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1540 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1541 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1542 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1543 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1544 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1545 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1546 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1548 fjx0 = _mm256_setzero_pd();
1549 fjy0 = _mm256_setzero_pd();
1550 fjz0 = _mm256_setzero_pd();
1551 fjx1 = _mm256_setzero_pd();
1552 fjy1 = _mm256_setzero_pd();
1553 fjz1 = _mm256_setzero_pd();
1554 fjx2 = _mm256_setzero_pd();
1555 fjy2 = _mm256_setzero_pd();
1556 fjz2 = _mm256_setzero_pd();
1558 /**************************
1559 * CALCULATE INTERACTIONS *
1560 **************************/
1562 r00 = _mm256_mul_pd(rsq00,rinv00);
1564 /* Calculate table index by multiplying r with table scale and truncate to integer */
1565 rt = _mm256_mul_pd(r00,vftabscale);
1566 vfitab = _mm256_cvttpd_epi32(rt);
1567 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1568 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1570 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1571 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1572 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1573 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1574 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1575 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1576 Heps = _mm256_mul_pd(vfeps,H);
1577 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1578 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1579 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1581 /* CUBIC SPLINE TABLE DISPERSION */
1582 vfitab = _mm_add_epi32(vfitab,ifour);
1583 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1584 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1585 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1586 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1587 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1588 Heps = _mm256_mul_pd(vfeps,H);
1589 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1590 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1591 fvdw6 = _mm256_mul_pd(c6_00,FF);
1593 /* CUBIC SPLINE TABLE REPULSION */
1594 vfitab = _mm_add_epi32(vfitab,ifour);
1595 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1596 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1597 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1598 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1599 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1600 Heps = _mm256_mul_pd(vfeps,H);
1601 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1602 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1603 fvdw12 = _mm256_mul_pd(c12_00,FF);
1604 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1606 fscal = _mm256_add_pd(felec,fvdw);
1608 /* Calculate temporary vectorial force */
1609 tx = _mm256_mul_pd(fscal,dx00);
1610 ty = _mm256_mul_pd(fscal,dy00);
1611 tz = _mm256_mul_pd(fscal,dz00);
1613 /* Update vectorial force */
1614 fix0 = _mm256_add_pd(fix0,tx);
1615 fiy0 = _mm256_add_pd(fiy0,ty);
1616 fiz0 = _mm256_add_pd(fiz0,tz);
1618 fjx0 = _mm256_add_pd(fjx0,tx);
1619 fjy0 = _mm256_add_pd(fjy0,ty);
1620 fjz0 = _mm256_add_pd(fjz0,tz);
1622 /**************************
1623 * CALCULATE INTERACTIONS *
1624 **************************/
1626 r01 = _mm256_mul_pd(rsq01,rinv01);
1628 /* Calculate table index by multiplying r with table scale and truncate to integer */
1629 rt = _mm256_mul_pd(r01,vftabscale);
1630 vfitab = _mm256_cvttpd_epi32(rt);
1631 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1632 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1634 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1635 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1636 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1637 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1638 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1639 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1640 Heps = _mm256_mul_pd(vfeps,H);
1641 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1642 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1643 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1647 /* Calculate temporary vectorial force */
1648 tx = _mm256_mul_pd(fscal,dx01);
1649 ty = _mm256_mul_pd(fscal,dy01);
1650 tz = _mm256_mul_pd(fscal,dz01);
1652 /* Update vectorial force */
1653 fix0 = _mm256_add_pd(fix0,tx);
1654 fiy0 = _mm256_add_pd(fiy0,ty);
1655 fiz0 = _mm256_add_pd(fiz0,tz);
1657 fjx1 = _mm256_add_pd(fjx1,tx);
1658 fjy1 = _mm256_add_pd(fjy1,ty);
1659 fjz1 = _mm256_add_pd(fjz1,tz);
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 r02 = _mm256_mul_pd(rsq02,rinv02);
1667 /* Calculate table index by multiplying r with table scale and truncate to integer */
1668 rt = _mm256_mul_pd(r02,vftabscale);
1669 vfitab = _mm256_cvttpd_epi32(rt);
1670 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1671 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1673 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1674 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1675 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1676 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1677 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1678 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1679 Heps = _mm256_mul_pd(vfeps,H);
1680 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1681 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1682 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
1686 /* Calculate temporary vectorial force */
1687 tx = _mm256_mul_pd(fscal,dx02);
1688 ty = _mm256_mul_pd(fscal,dy02);
1689 tz = _mm256_mul_pd(fscal,dz02);
1691 /* Update vectorial force */
1692 fix0 = _mm256_add_pd(fix0,tx);
1693 fiy0 = _mm256_add_pd(fiy0,ty);
1694 fiz0 = _mm256_add_pd(fiz0,tz);
1696 fjx2 = _mm256_add_pd(fjx2,tx);
1697 fjy2 = _mm256_add_pd(fjy2,ty);
1698 fjz2 = _mm256_add_pd(fjz2,tz);
1700 /**************************
1701 * CALCULATE INTERACTIONS *
1702 **************************/
1704 r10 = _mm256_mul_pd(rsq10,rinv10);
1706 /* Calculate table index by multiplying r with table scale and truncate to integer */
1707 rt = _mm256_mul_pd(r10,vftabscale);
1708 vfitab = _mm256_cvttpd_epi32(rt);
1709 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1710 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1712 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1713 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1714 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1715 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1716 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1717 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1718 Heps = _mm256_mul_pd(vfeps,H);
1719 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1720 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1721 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1725 /* Calculate temporary vectorial force */
1726 tx = _mm256_mul_pd(fscal,dx10);
1727 ty = _mm256_mul_pd(fscal,dy10);
1728 tz = _mm256_mul_pd(fscal,dz10);
1730 /* Update vectorial force */
1731 fix1 = _mm256_add_pd(fix1,tx);
1732 fiy1 = _mm256_add_pd(fiy1,ty);
1733 fiz1 = _mm256_add_pd(fiz1,tz);
1735 fjx0 = _mm256_add_pd(fjx0,tx);
1736 fjy0 = _mm256_add_pd(fjy0,ty);
1737 fjz0 = _mm256_add_pd(fjz0,tz);
1739 /**************************
1740 * CALCULATE INTERACTIONS *
1741 **************************/
1743 r11 = _mm256_mul_pd(rsq11,rinv11);
1745 /* Calculate table index by multiplying r with table scale and truncate to integer */
1746 rt = _mm256_mul_pd(r11,vftabscale);
1747 vfitab = _mm256_cvttpd_epi32(rt);
1748 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1749 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1751 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1752 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1753 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1754 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1755 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1756 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1757 Heps = _mm256_mul_pd(vfeps,H);
1758 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1759 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1760 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1764 /* Calculate temporary vectorial force */
1765 tx = _mm256_mul_pd(fscal,dx11);
1766 ty = _mm256_mul_pd(fscal,dy11);
1767 tz = _mm256_mul_pd(fscal,dz11);
1769 /* Update vectorial force */
1770 fix1 = _mm256_add_pd(fix1,tx);
1771 fiy1 = _mm256_add_pd(fiy1,ty);
1772 fiz1 = _mm256_add_pd(fiz1,tz);
1774 fjx1 = _mm256_add_pd(fjx1,tx);
1775 fjy1 = _mm256_add_pd(fjy1,ty);
1776 fjz1 = _mm256_add_pd(fjz1,tz);
1778 /**************************
1779 * CALCULATE INTERACTIONS *
1780 **************************/
1782 r12 = _mm256_mul_pd(rsq12,rinv12);
1784 /* Calculate table index by multiplying r with table scale and truncate to integer */
1785 rt = _mm256_mul_pd(r12,vftabscale);
1786 vfitab = _mm256_cvttpd_epi32(rt);
1787 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1788 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1790 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1791 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1792 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1793 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1794 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1795 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1796 Heps = _mm256_mul_pd(vfeps,H);
1797 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1798 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1799 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1803 /* Calculate temporary vectorial force */
1804 tx = _mm256_mul_pd(fscal,dx12);
1805 ty = _mm256_mul_pd(fscal,dy12);
1806 tz = _mm256_mul_pd(fscal,dz12);
1808 /* Update vectorial force */
1809 fix1 = _mm256_add_pd(fix1,tx);
1810 fiy1 = _mm256_add_pd(fiy1,ty);
1811 fiz1 = _mm256_add_pd(fiz1,tz);
1813 fjx2 = _mm256_add_pd(fjx2,tx);
1814 fjy2 = _mm256_add_pd(fjy2,ty);
1815 fjz2 = _mm256_add_pd(fjz2,tz);
1817 /**************************
1818 * CALCULATE INTERACTIONS *
1819 **************************/
1821 r20 = _mm256_mul_pd(rsq20,rinv20);
1823 /* Calculate table index by multiplying r with table scale and truncate to integer */
1824 rt = _mm256_mul_pd(r20,vftabscale);
1825 vfitab = _mm256_cvttpd_epi32(rt);
1826 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1827 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1829 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1830 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1831 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1832 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1833 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1834 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1835 Heps = _mm256_mul_pd(vfeps,H);
1836 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1837 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1838 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1842 /* Calculate temporary vectorial force */
1843 tx = _mm256_mul_pd(fscal,dx20);
1844 ty = _mm256_mul_pd(fscal,dy20);
1845 tz = _mm256_mul_pd(fscal,dz20);
1847 /* Update vectorial force */
1848 fix2 = _mm256_add_pd(fix2,tx);
1849 fiy2 = _mm256_add_pd(fiy2,ty);
1850 fiz2 = _mm256_add_pd(fiz2,tz);
1852 fjx0 = _mm256_add_pd(fjx0,tx);
1853 fjy0 = _mm256_add_pd(fjy0,ty);
1854 fjz0 = _mm256_add_pd(fjz0,tz);
1856 /**************************
1857 * CALCULATE INTERACTIONS *
1858 **************************/
1860 r21 = _mm256_mul_pd(rsq21,rinv21);
1862 /* Calculate table index by multiplying r with table scale and truncate to integer */
1863 rt = _mm256_mul_pd(r21,vftabscale);
1864 vfitab = _mm256_cvttpd_epi32(rt);
1865 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1866 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1868 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1869 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1870 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1871 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1872 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1873 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1874 Heps = _mm256_mul_pd(vfeps,H);
1875 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1876 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1877 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1881 /* Calculate temporary vectorial force */
1882 tx = _mm256_mul_pd(fscal,dx21);
1883 ty = _mm256_mul_pd(fscal,dy21);
1884 tz = _mm256_mul_pd(fscal,dz21);
1886 /* Update vectorial force */
1887 fix2 = _mm256_add_pd(fix2,tx);
1888 fiy2 = _mm256_add_pd(fiy2,ty);
1889 fiz2 = _mm256_add_pd(fiz2,tz);
1891 fjx1 = _mm256_add_pd(fjx1,tx);
1892 fjy1 = _mm256_add_pd(fjy1,ty);
1893 fjz1 = _mm256_add_pd(fjz1,tz);
1895 /**************************
1896 * CALCULATE INTERACTIONS *
1897 **************************/
1899 r22 = _mm256_mul_pd(rsq22,rinv22);
1901 /* Calculate table index by multiplying r with table scale and truncate to integer */
1902 rt = _mm256_mul_pd(r22,vftabscale);
1903 vfitab = _mm256_cvttpd_epi32(rt);
1904 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1905 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1907 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1908 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1909 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1910 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1911 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1912 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1913 Heps = _mm256_mul_pd(vfeps,H);
1914 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1915 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1916 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1920 /* Calculate temporary vectorial force */
1921 tx = _mm256_mul_pd(fscal,dx22);
1922 ty = _mm256_mul_pd(fscal,dy22);
1923 tz = _mm256_mul_pd(fscal,dz22);
1925 /* Update vectorial force */
1926 fix2 = _mm256_add_pd(fix2,tx);
1927 fiy2 = _mm256_add_pd(fiy2,ty);
1928 fiz2 = _mm256_add_pd(fiz2,tz);
1930 fjx2 = _mm256_add_pd(fjx2,tx);
1931 fjy2 = _mm256_add_pd(fjy2,ty);
1932 fjz2 = _mm256_add_pd(fjz2,tz);
1934 fjptrA = f+j_coord_offsetA;
1935 fjptrB = f+j_coord_offsetB;
1936 fjptrC = f+j_coord_offsetC;
1937 fjptrD = f+j_coord_offsetD;
1939 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1940 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1942 /* Inner loop uses 373 flops */
1945 if(jidx<j_index_end)
1948 /* Get j neighbor index, and coordinate index */
1949 jnrlistA = jjnr[jidx];
1950 jnrlistB = jjnr[jidx+1];
1951 jnrlistC = jjnr[jidx+2];
1952 jnrlistD = jjnr[jidx+3];
1953 /* Sign of each element will be negative for non-real atoms.
1954 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1955 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1956 */
1957 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1959 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1960 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1961 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1963 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1964 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1965 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1966 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1967 j_coord_offsetA = DIM*jnrA;
1968 j_coord_offsetB = DIM*jnrB;
1969 j_coord_offsetC = DIM*jnrC;
1970 j_coord_offsetD = DIM*jnrD;
1972 /* load j atom coordinates */
1973 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1974 x+j_coord_offsetC,x+j_coord_offsetD,
1975 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1977 /* Calculate displacement vector */
1978 dx00 = _mm256_sub_pd(ix0,jx0);
1979 dy00 = _mm256_sub_pd(iy0,jy0);
1980 dz00 = _mm256_sub_pd(iz0,jz0);
1981 dx01 = _mm256_sub_pd(ix0,jx1);
1982 dy01 = _mm256_sub_pd(iy0,jy1);
1983 dz01 = _mm256_sub_pd(iz0,jz1);
1984 dx02 = _mm256_sub_pd(ix0,jx2);
1985 dy02 = _mm256_sub_pd(iy0,jy2);
1986 dz02 = _mm256_sub_pd(iz0,jz2);
1987 dx10 = _mm256_sub_pd(ix1,jx0);
1988 dy10 = _mm256_sub_pd(iy1,jy0);
1989 dz10 = _mm256_sub_pd(iz1,jz0);
1990 dx11 = _mm256_sub_pd(ix1,jx1);
1991 dy11 = _mm256_sub_pd(iy1,jy1);
1992 dz11 = _mm256_sub_pd(iz1,jz1);
1993 dx12 = _mm256_sub_pd(ix1,jx2);
1994 dy12 = _mm256_sub_pd(iy1,jy2);
1995 dz12 = _mm256_sub_pd(iz1,jz2);
1996 dx20 = _mm256_sub_pd(ix2,jx0);
1997 dy20 = _mm256_sub_pd(iy2,jy0);
1998 dz20 = _mm256_sub_pd(iz2,jz0);
1999 dx21 = _mm256_sub_pd(ix2,jx1);
2000 dy21 = _mm256_sub_pd(iy2,jy1);
2001 dz21 = _mm256_sub_pd(iz2,jz1);
2002 dx22 = _mm256_sub_pd(ix2,jx2);
2003 dy22 = _mm256_sub_pd(iy2,jy2);
2004 dz22 = _mm256_sub_pd(iz2,jz2);
2006 /* Calculate squared distance and things based on it */
2007 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2008 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2009 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2010 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2011 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2012 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2013 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2014 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2015 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2017 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2018 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
2019 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
2020 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
2021 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2022 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2023 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
2024 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2025 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2027 fjx0 = _mm256_setzero_pd();
2028 fjy0 = _mm256_setzero_pd();
2029 fjz0 = _mm256_setzero_pd();
2030 fjx1 = _mm256_setzero_pd();
2031 fjy1 = _mm256_setzero_pd();
2032 fjz1 = _mm256_setzero_pd();
2033 fjx2 = _mm256_setzero_pd();
2034 fjy2 = _mm256_setzero_pd();
2035 fjz2 = _mm256_setzero_pd();
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 r00 = _mm256_mul_pd(rsq00,rinv00);
2042 r00 = _mm256_andnot_pd(dummy_mask,r00);
2044 /* Calculate table index by multiplying r with table scale and truncate to integer */
2045 rt = _mm256_mul_pd(r00,vftabscale);
2046 vfitab = _mm256_cvttpd_epi32(rt);
2047 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2048 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2050 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2051 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2052 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2053 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2054 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2055 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2056 Heps = _mm256_mul_pd(vfeps,H);
2057 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2058 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2059 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
2061 /* CUBIC SPLINE TABLE DISPERSION */
2062 vfitab = _mm_add_epi32(vfitab,ifour);
2063 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2064 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2065 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2066 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2067 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2068 Heps = _mm256_mul_pd(vfeps,H);
2069 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2070 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2071 fvdw6 = _mm256_mul_pd(c6_00,FF);
2073 /* CUBIC SPLINE TABLE REPULSION */
2074 vfitab = _mm_add_epi32(vfitab,ifour);
2075 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2076 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2077 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2078 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2079 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2080 Heps = _mm256_mul_pd(vfeps,H);
2081 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2082 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2083 fvdw12 = _mm256_mul_pd(c12_00,FF);
2084 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2086 fscal = _mm256_add_pd(felec,fvdw);
2088 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2090 /* Calculate temporary vectorial force */
2091 tx = _mm256_mul_pd(fscal,dx00);
2092 ty = _mm256_mul_pd(fscal,dy00);
2093 tz = _mm256_mul_pd(fscal,dz00);
2095 /* Update vectorial force */
2096 fix0 = _mm256_add_pd(fix0,tx);
2097 fiy0 = _mm256_add_pd(fiy0,ty);
2098 fiz0 = _mm256_add_pd(fiz0,tz);
2100 fjx0 = _mm256_add_pd(fjx0,tx);
2101 fjy0 = _mm256_add_pd(fjy0,ty);
2102 fjz0 = _mm256_add_pd(fjz0,tz);
2104 /**************************
2105 * CALCULATE INTERACTIONS *
2106 **************************/
2108 r01 = _mm256_mul_pd(rsq01,rinv01);
2109 r01 = _mm256_andnot_pd(dummy_mask,r01);
2111 /* Calculate table index by multiplying r with table scale and truncate to integer */
2112 rt = _mm256_mul_pd(r01,vftabscale);
2113 vfitab = _mm256_cvttpd_epi32(rt);
2114 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2115 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2117 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2118 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2119 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2120 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2121 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2122 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2123 Heps = _mm256_mul_pd(vfeps,H);
2124 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2125 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2126 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
2130 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2132 /* Calculate temporary vectorial force */
2133 tx = _mm256_mul_pd(fscal,dx01);
2134 ty = _mm256_mul_pd(fscal,dy01);
2135 tz = _mm256_mul_pd(fscal,dz01);
2137 /* Update vectorial force */
2138 fix0 = _mm256_add_pd(fix0,tx);
2139 fiy0 = _mm256_add_pd(fiy0,ty);
2140 fiz0 = _mm256_add_pd(fiz0,tz);
2142 fjx1 = _mm256_add_pd(fjx1,tx);
2143 fjy1 = _mm256_add_pd(fjy1,ty);
2144 fjz1 = _mm256_add_pd(fjz1,tz);
2146 /**************************
2147 * CALCULATE INTERACTIONS *
2148 **************************/
2150 r02 = _mm256_mul_pd(rsq02,rinv02);
2151 r02 = _mm256_andnot_pd(dummy_mask,r02);
2153 /* Calculate table index by multiplying r with table scale and truncate to integer */
2154 rt = _mm256_mul_pd(r02,vftabscale);
2155 vfitab = _mm256_cvttpd_epi32(rt);
2156 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2157 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2159 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2160 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2161 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2162 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2163 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2164 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2165 Heps = _mm256_mul_pd(vfeps,H);
2166 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2167 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2168 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
2172 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2174 /* Calculate temporary vectorial force */
2175 tx = _mm256_mul_pd(fscal,dx02);
2176 ty = _mm256_mul_pd(fscal,dy02);
2177 tz = _mm256_mul_pd(fscal,dz02);
2179 /* Update vectorial force */
2180 fix0 = _mm256_add_pd(fix0,tx);
2181 fiy0 = _mm256_add_pd(fiy0,ty);
2182 fiz0 = _mm256_add_pd(fiz0,tz);
2184 fjx2 = _mm256_add_pd(fjx2,tx);
2185 fjy2 = _mm256_add_pd(fjy2,ty);
2186 fjz2 = _mm256_add_pd(fjz2,tz);
2188 /**************************
2189 * CALCULATE INTERACTIONS *
2190 **************************/
2192 r10 = _mm256_mul_pd(rsq10,rinv10);
2193 r10 = _mm256_andnot_pd(dummy_mask,r10);
2195 /* Calculate table index by multiplying r with table scale and truncate to integer */
2196 rt = _mm256_mul_pd(r10,vftabscale);
2197 vfitab = _mm256_cvttpd_epi32(rt);
2198 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2199 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2201 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2202 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2203 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2204 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2205 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2206 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2207 Heps = _mm256_mul_pd(vfeps,H);
2208 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2209 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2210 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
2214 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2216 /* Calculate temporary vectorial force */
2217 tx = _mm256_mul_pd(fscal,dx10);
2218 ty = _mm256_mul_pd(fscal,dy10);
2219 tz = _mm256_mul_pd(fscal,dz10);
2221 /* Update vectorial force */
2222 fix1 = _mm256_add_pd(fix1,tx);
2223 fiy1 = _mm256_add_pd(fiy1,ty);
2224 fiz1 = _mm256_add_pd(fiz1,tz);
2226 fjx0 = _mm256_add_pd(fjx0,tx);
2227 fjy0 = _mm256_add_pd(fjy0,ty);
2228 fjz0 = _mm256_add_pd(fjz0,tz);
2230 /**************************
2231 * CALCULATE INTERACTIONS *
2232 **************************/
2234 r11 = _mm256_mul_pd(rsq11,rinv11);
2235 r11 = _mm256_andnot_pd(dummy_mask,r11);
2237 /* Calculate table index by multiplying r with table scale and truncate to integer */
2238 rt = _mm256_mul_pd(r11,vftabscale);
2239 vfitab = _mm256_cvttpd_epi32(rt);
2240 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2241 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2243 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2244 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2245 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2246 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2247 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2248 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2249 Heps = _mm256_mul_pd(vfeps,H);
2250 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2251 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2252 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2256 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2258 /* Calculate temporary vectorial force */
2259 tx = _mm256_mul_pd(fscal,dx11);
2260 ty = _mm256_mul_pd(fscal,dy11);
2261 tz = _mm256_mul_pd(fscal,dz11);
2263 /* Update vectorial force */
2264 fix1 = _mm256_add_pd(fix1,tx);
2265 fiy1 = _mm256_add_pd(fiy1,ty);
2266 fiz1 = _mm256_add_pd(fiz1,tz);
2268 fjx1 = _mm256_add_pd(fjx1,tx);
2269 fjy1 = _mm256_add_pd(fjy1,ty);
2270 fjz1 = _mm256_add_pd(fjz1,tz);
2272 /**************************
2273 * CALCULATE INTERACTIONS *
2274 **************************/
2276 r12 = _mm256_mul_pd(rsq12,rinv12);
2277 r12 = _mm256_andnot_pd(dummy_mask,r12);
2279 /* Calculate table index by multiplying r with table scale and truncate to integer */
2280 rt = _mm256_mul_pd(r12,vftabscale);
2281 vfitab = _mm256_cvttpd_epi32(rt);
2282 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2283 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2285 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2286 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2287 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2288 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2289 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2290 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2291 Heps = _mm256_mul_pd(vfeps,H);
2292 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2293 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2294 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2298 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2300 /* Calculate temporary vectorial force */
2301 tx = _mm256_mul_pd(fscal,dx12);
2302 ty = _mm256_mul_pd(fscal,dy12);
2303 tz = _mm256_mul_pd(fscal,dz12);
2305 /* Update vectorial force */
2306 fix1 = _mm256_add_pd(fix1,tx);
2307 fiy1 = _mm256_add_pd(fiy1,ty);
2308 fiz1 = _mm256_add_pd(fiz1,tz);
2310 fjx2 = _mm256_add_pd(fjx2,tx);
2311 fjy2 = _mm256_add_pd(fjy2,ty);
2312 fjz2 = _mm256_add_pd(fjz2,tz);
2314 /**************************
2315 * CALCULATE INTERACTIONS *
2316 **************************/
2318 r20 = _mm256_mul_pd(rsq20,rinv20);
2319 r20 = _mm256_andnot_pd(dummy_mask,r20);
2321 /* Calculate table index by multiplying r with table scale and truncate to integer */
2322 rt = _mm256_mul_pd(r20,vftabscale);
2323 vfitab = _mm256_cvttpd_epi32(rt);
2324 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2325 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2327 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2328 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2329 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2330 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2331 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2332 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2333 Heps = _mm256_mul_pd(vfeps,H);
2334 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2335 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2336 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
2340 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2342 /* Calculate temporary vectorial force */
2343 tx = _mm256_mul_pd(fscal,dx20);
2344 ty = _mm256_mul_pd(fscal,dy20);
2345 tz = _mm256_mul_pd(fscal,dz20);
2347 /* Update vectorial force */
2348 fix2 = _mm256_add_pd(fix2,tx);
2349 fiy2 = _mm256_add_pd(fiy2,ty);
2350 fiz2 = _mm256_add_pd(fiz2,tz);
2352 fjx0 = _mm256_add_pd(fjx0,tx);
2353 fjy0 = _mm256_add_pd(fjy0,ty);
2354 fjz0 = _mm256_add_pd(fjz0,tz);
2356 /**************************
2357 * CALCULATE INTERACTIONS *
2358 **************************/
2360 r21 = _mm256_mul_pd(rsq21,rinv21);
2361 r21 = _mm256_andnot_pd(dummy_mask,r21);
2363 /* Calculate table index by multiplying r with table scale and truncate to integer */
2364 rt = _mm256_mul_pd(r21,vftabscale);
2365 vfitab = _mm256_cvttpd_epi32(rt);
2366 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2367 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2369 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2370 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2371 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2372 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2373 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2374 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2375 Heps = _mm256_mul_pd(vfeps,H);
2376 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2377 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2378 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2382 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2384 /* Calculate temporary vectorial force */
2385 tx = _mm256_mul_pd(fscal,dx21);
2386 ty = _mm256_mul_pd(fscal,dy21);
2387 tz = _mm256_mul_pd(fscal,dz21);
2389 /* Update vectorial force */
2390 fix2 = _mm256_add_pd(fix2,tx);
2391 fiy2 = _mm256_add_pd(fiy2,ty);
2392 fiz2 = _mm256_add_pd(fiz2,tz);
2394 fjx1 = _mm256_add_pd(fjx1,tx);
2395 fjy1 = _mm256_add_pd(fjy1,ty);
2396 fjz1 = _mm256_add_pd(fjz1,tz);
2398 /**************************
2399 * CALCULATE INTERACTIONS *
2400 **************************/
2402 r22 = _mm256_mul_pd(rsq22,rinv22);
2403 r22 = _mm256_andnot_pd(dummy_mask,r22);
2405 /* Calculate table index by multiplying r with table scale and truncate to integer */
2406 rt = _mm256_mul_pd(r22,vftabscale);
2407 vfitab = _mm256_cvttpd_epi32(rt);
2408 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2409 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2411 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2412 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2413 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2414 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2415 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2416 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2417 Heps = _mm256_mul_pd(vfeps,H);
2418 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2419 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2420 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2424 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2426 /* Calculate temporary vectorial force */
2427 tx = _mm256_mul_pd(fscal,dx22);
2428 ty = _mm256_mul_pd(fscal,dy22);
2429 tz = _mm256_mul_pd(fscal,dz22);
2431 /* Update vectorial force */
2432 fix2 = _mm256_add_pd(fix2,tx);
2433 fiy2 = _mm256_add_pd(fiy2,ty);
2434 fiz2 = _mm256_add_pd(fiz2,tz);
2436 fjx2 = _mm256_add_pd(fjx2,tx);
2437 fjy2 = _mm256_add_pd(fjy2,ty);
2438 fjz2 = _mm256_add_pd(fjz2,tz);
2440 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2441 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2442 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2443 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2445 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2446 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2448 /* Inner loop uses 382 flops */
2451 /* End of innermost loop */
2453 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2454 f+i_coord_offset,fshift+i_shift_offset);
2456 /* Increment number of inner iterations */
2457 inneriter += j_index_end - j_index_start;
2459 /* Outer loop uses 18 flops */
2462 /* Increment number of outer iterations */
2465 /* Update outer/inner flops */
2467 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);