2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 real * vdwioffsetptr3;
91 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
99 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
100 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
104 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
107 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
108 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
109 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
110 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
113 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
117 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
119 __m128i ifour = _mm_set1_epi32(4);
120 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
122 __m256d dummy_mask,cutoff_mask;
123 __m128 tmpmask0,tmpmask1;
124 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
125 __m256d one = _mm256_set1_pd(1.0);
126 __m256d two = _mm256_set1_pd(2.0);
132 jindex = nlist->jindex;
134 shiftidx = nlist->shift;
136 shiftvec = fr->shift_vec[0];
137 fshift = fr->fshift[0];
138 facel = _mm256_set1_pd(fr->epsfac);
139 charge = mdatoms->chargeA;
140 nvdwtype = fr->ntype;
142 vdwtype = mdatoms->typeA;
144 vftab = kernel_data->table_elec_vdw->data;
145 vftabscale = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
147 /* Setup water-specific parameters */
148 inr = nlist->iinr[0];
149 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
150 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
151 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
152 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
154 jq1 = _mm256_set1_pd(charge[inr+1]);
155 jq2 = _mm256_set1_pd(charge[inr+2]);
156 jq3 = _mm256_set1_pd(charge[inr+3]);
157 vdwjidx0A = 2*vdwtype[inr+0];
158 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
159 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
160 qq11 = _mm256_mul_pd(iq1,jq1);
161 qq12 = _mm256_mul_pd(iq1,jq2);
162 qq13 = _mm256_mul_pd(iq1,jq3);
163 qq21 = _mm256_mul_pd(iq2,jq1);
164 qq22 = _mm256_mul_pd(iq2,jq2);
165 qq23 = _mm256_mul_pd(iq2,jq3);
166 qq31 = _mm256_mul_pd(iq3,jq1);
167 qq32 = _mm256_mul_pd(iq3,jq2);
168 qq33 = _mm256_mul_pd(iq3,jq3);
170 /* Avoid stupid compiler warnings */
171 jnrA = jnrB = jnrC = jnrD = 0;
180 for(iidx=0;iidx<4*DIM;iidx++)
185 /* Start outer loop over neighborlists */
186 for(iidx=0; iidx<nri; iidx++)
188 /* Load shift vector for this list */
189 i_shift_offset = DIM*shiftidx[iidx];
191 /* Load limits for loop over neighbors */
192 j_index_start = jindex[iidx];
193 j_index_end = jindex[iidx+1];
195 /* Get outer coordinate index */
197 i_coord_offset = DIM*inr;
199 /* Load i particle coords and add shift vector */
200 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
201 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
203 fix0 = _mm256_setzero_pd();
204 fiy0 = _mm256_setzero_pd();
205 fiz0 = _mm256_setzero_pd();
206 fix1 = _mm256_setzero_pd();
207 fiy1 = _mm256_setzero_pd();
208 fiz1 = _mm256_setzero_pd();
209 fix2 = _mm256_setzero_pd();
210 fiy2 = _mm256_setzero_pd();
211 fiz2 = _mm256_setzero_pd();
212 fix3 = _mm256_setzero_pd();
213 fiy3 = _mm256_setzero_pd();
214 fiz3 = _mm256_setzero_pd();
216 /* Reset potential sums */
217 velecsum = _mm256_setzero_pd();
218 vvdwsum = _mm256_setzero_pd();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
224 /* Get j neighbor index, and coordinate index */
229 j_coord_offsetA = DIM*jnrA;
230 j_coord_offsetB = DIM*jnrB;
231 j_coord_offsetC = DIM*jnrC;
232 j_coord_offsetD = DIM*jnrD;
234 /* load j atom coordinates */
235 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
236 x+j_coord_offsetC,x+j_coord_offsetD,
237 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
238 &jy2,&jz2,&jx3,&jy3,&jz3);
240 /* Calculate displacement vector */
241 dx00 = _mm256_sub_pd(ix0,jx0);
242 dy00 = _mm256_sub_pd(iy0,jy0);
243 dz00 = _mm256_sub_pd(iz0,jz0);
244 dx11 = _mm256_sub_pd(ix1,jx1);
245 dy11 = _mm256_sub_pd(iy1,jy1);
246 dz11 = _mm256_sub_pd(iz1,jz1);
247 dx12 = _mm256_sub_pd(ix1,jx2);
248 dy12 = _mm256_sub_pd(iy1,jy2);
249 dz12 = _mm256_sub_pd(iz1,jz2);
250 dx13 = _mm256_sub_pd(ix1,jx3);
251 dy13 = _mm256_sub_pd(iy1,jy3);
252 dz13 = _mm256_sub_pd(iz1,jz3);
253 dx21 = _mm256_sub_pd(ix2,jx1);
254 dy21 = _mm256_sub_pd(iy2,jy1);
255 dz21 = _mm256_sub_pd(iz2,jz1);
256 dx22 = _mm256_sub_pd(ix2,jx2);
257 dy22 = _mm256_sub_pd(iy2,jy2);
258 dz22 = _mm256_sub_pd(iz2,jz2);
259 dx23 = _mm256_sub_pd(ix2,jx3);
260 dy23 = _mm256_sub_pd(iy2,jy3);
261 dz23 = _mm256_sub_pd(iz2,jz3);
262 dx31 = _mm256_sub_pd(ix3,jx1);
263 dy31 = _mm256_sub_pd(iy3,jy1);
264 dz31 = _mm256_sub_pd(iz3,jz1);
265 dx32 = _mm256_sub_pd(ix3,jx2);
266 dy32 = _mm256_sub_pd(iy3,jy2);
267 dz32 = _mm256_sub_pd(iz3,jz2);
268 dx33 = _mm256_sub_pd(ix3,jx3);
269 dy33 = _mm256_sub_pd(iy3,jy3);
270 dz33 = _mm256_sub_pd(iz3,jz3);
272 /* Calculate squared distance and things based on it */
273 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
274 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
275 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
276 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
277 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
278 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
279 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
280 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
281 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
282 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
284 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
285 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
286 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
287 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
288 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
289 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
290 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
291 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
292 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
293 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
295 fjx0 = _mm256_setzero_pd();
296 fjy0 = _mm256_setzero_pd();
297 fjz0 = _mm256_setzero_pd();
298 fjx1 = _mm256_setzero_pd();
299 fjy1 = _mm256_setzero_pd();
300 fjz1 = _mm256_setzero_pd();
301 fjx2 = _mm256_setzero_pd();
302 fjy2 = _mm256_setzero_pd();
303 fjz2 = _mm256_setzero_pd();
304 fjx3 = _mm256_setzero_pd();
305 fjy3 = _mm256_setzero_pd();
306 fjz3 = _mm256_setzero_pd();
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 r00 = _mm256_mul_pd(rsq00,rinv00);
314 /* Calculate table index by multiplying r with table scale and truncate to integer */
315 rt = _mm256_mul_pd(r00,vftabscale);
316 vfitab = _mm256_cvttpd_epi32(rt);
317 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
318 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
320 /* CUBIC SPLINE TABLE DISPERSION */
321 vfitab = _mm_add_epi32(vfitab,ifour);
322 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
323 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
324 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
325 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
326 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
327 Heps = _mm256_mul_pd(vfeps,H);
328 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
329 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
330 vvdw6 = _mm256_mul_pd(c6_00,VV);
331 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
332 fvdw6 = _mm256_mul_pd(c6_00,FF);
334 /* CUBIC SPLINE TABLE REPULSION */
335 vfitab = _mm_add_epi32(vfitab,ifour);
336 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
337 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
338 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
339 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
340 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
341 Heps = _mm256_mul_pd(vfeps,H);
342 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
343 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
344 vvdw12 = _mm256_mul_pd(c12_00,VV);
345 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
346 fvdw12 = _mm256_mul_pd(c12_00,FF);
347 vvdw = _mm256_add_pd(vvdw12,vvdw6);
348 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
355 /* Calculate temporary vectorial force */
356 tx = _mm256_mul_pd(fscal,dx00);
357 ty = _mm256_mul_pd(fscal,dy00);
358 tz = _mm256_mul_pd(fscal,dz00);
360 /* Update vectorial force */
361 fix0 = _mm256_add_pd(fix0,tx);
362 fiy0 = _mm256_add_pd(fiy0,ty);
363 fiz0 = _mm256_add_pd(fiz0,tz);
365 fjx0 = _mm256_add_pd(fjx0,tx);
366 fjy0 = _mm256_add_pd(fjy0,ty);
367 fjz0 = _mm256_add_pd(fjz0,tz);
369 /**************************
370 * CALCULATE INTERACTIONS *
371 **************************/
373 r11 = _mm256_mul_pd(rsq11,rinv11);
375 /* Calculate table index by multiplying r with table scale and truncate to integer */
376 rt = _mm256_mul_pd(r11,vftabscale);
377 vfitab = _mm256_cvttpd_epi32(rt);
378 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
379 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
381 /* CUBIC SPLINE TABLE ELECTROSTATICS */
382 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
383 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
384 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
385 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
386 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
387 Heps = _mm256_mul_pd(vfeps,H);
388 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
389 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
390 velec = _mm256_mul_pd(qq11,VV);
391 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
392 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
394 /* Update potential sum for this i atom from the interaction with this j atom. */
395 velecsum = _mm256_add_pd(velecsum,velec);
399 /* Calculate temporary vectorial force */
400 tx = _mm256_mul_pd(fscal,dx11);
401 ty = _mm256_mul_pd(fscal,dy11);
402 tz = _mm256_mul_pd(fscal,dz11);
404 /* Update vectorial force */
405 fix1 = _mm256_add_pd(fix1,tx);
406 fiy1 = _mm256_add_pd(fiy1,ty);
407 fiz1 = _mm256_add_pd(fiz1,tz);
409 fjx1 = _mm256_add_pd(fjx1,tx);
410 fjy1 = _mm256_add_pd(fjy1,ty);
411 fjz1 = _mm256_add_pd(fjz1,tz);
413 /**************************
414 * CALCULATE INTERACTIONS *
415 **************************/
417 r12 = _mm256_mul_pd(rsq12,rinv12);
419 /* Calculate table index by multiplying r with table scale and truncate to integer */
420 rt = _mm256_mul_pd(r12,vftabscale);
421 vfitab = _mm256_cvttpd_epi32(rt);
422 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
423 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
425 /* CUBIC SPLINE TABLE ELECTROSTATICS */
426 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
427 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
428 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
429 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
430 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
431 Heps = _mm256_mul_pd(vfeps,H);
432 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
433 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
434 velec = _mm256_mul_pd(qq12,VV);
435 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
436 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
438 /* Update potential sum for this i atom from the interaction with this j atom. */
439 velecsum = _mm256_add_pd(velecsum,velec);
443 /* Calculate temporary vectorial force */
444 tx = _mm256_mul_pd(fscal,dx12);
445 ty = _mm256_mul_pd(fscal,dy12);
446 tz = _mm256_mul_pd(fscal,dz12);
448 /* Update vectorial force */
449 fix1 = _mm256_add_pd(fix1,tx);
450 fiy1 = _mm256_add_pd(fiy1,ty);
451 fiz1 = _mm256_add_pd(fiz1,tz);
453 fjx2 = _mm256_add_pd(fjx2,tx);
454 fjy2 = _mm256_add_pd(fjy2,ty);
455 fjz2 = _mm256_add_pd(fjz2,tz);
457 /**************************
458 * CALCULATE INTERACTIONS *
459 **************************/
461 r13 = _mm256_mul_pd(rsq13,rinv13);
463 /* Calculate table index by multiplying r with table scale and truncate to integer */
464 rt = _mm256_mul_pd(r13,vftabscale);
465 vfitab = _mm256_cvttpd_epi32(rt);
466 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
467 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
469 /* CUBIC SPLINE TABLE ELECTROSTATICS */
470 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
471 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
472 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
473 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
474 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
475 Heps = _mm256_mul_pd(vfeps,H);
476 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
477 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
478 velec = _mm256_mul_pd(qq13,VV);
479 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
480 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velecsum = _mm256_add_pd(velecsum,velec);
487 /* Calculate temporary vectorial force */
488 tx = _mm256_mul_pd(fscal,dx13);
489 ty = _mm256_mul_pd(fscal,dy13);
490 tz = _mm256_mul_pd(fscal,dz13);
492 /* Update vectorial force */
493 fix1 = _mm256_add_pd(fix1,tx);
494 fiy1 = _mm256_add_pd(fiy1,ty);
495 fiz1 = _mm256_add_pd(fiz1,tz);
497 fjx3 = _mm256_add_pd(fjx3,tx);
498 fjy3 = _mm256_add_pd(fjy3,ty);
499 fjz3 = _mm256_add_pd(fjz3,tz);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 r21 = _mm256_mul_pd(rsq21,rinv21);
507 /* Calculate table index by multiplying r with table scale and truncate to integer */
508 rt = _mm256_mul_pd(r21,vftabscale);
509 vfitab = _mm256_cvttpd_epi32(rt);
510 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
511 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
513 /* CUBIC SPLINE TABLE ELECTROSTATICS */
514 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
515 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
516 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
517 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
518 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
519 Heps = _mm256_mul_pd(vfeps,H);
520 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
521 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
522 velec = _mm256_mul_pd(qq21,VV);
523 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
524 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
526 /* Update potential sum for this i atom from the interaction with this j atom. */
527 velecsum = _mm256_add_pd(velecsum,velec);
531 /* Calculate temporary vectorial force */
532 tx = _mm256_mul_pd(fscal,dx21);
533 ty = _mm256_mul_pd(fscal,dy21);
534 tz = _mm256_mul_pd(fscal,dz21);
536 /* Update vectorial force */
537 fix2 = _mm256_add_pd(fix2,tx);
538 fiy2 = _mm256_add_pd(fiy2,ty);
539 fiz2 = _mm256_add_pd(fiz2,tz);
541 fjx1 = _mm256_add_pd(fjx1,tx);
542 fjy1 = _mm256_add_pd(fjy1,ty);
543 fjz1 = _mm256_add_pd(fjz1,tz);
545 /**************************
546 * CALCULATE INTERACTIONS *
547 **************************/
549 r22 = _mm256_mul_pd(rsq22,rinv22);
551 /* Calculate table index by multiplying r with table scale and truncate to integer */
552 rt = _mm256_mul_pd(r22,vftabscale);
553 vfitab = _mm256_cvttpd_epi32(rt);
554 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
555 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
557 /* CUBIC SPLINE TABLE ELECTROSTATICS */
558 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
559 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
560 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
561 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
562 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
563 Heps = _mm256_mul_pd(vfeps,H);
564 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
565 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
566 velec = _mm256_mul_pd(qq22,VV);
567 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
568 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
570 /* Update potential sum for this i atom from the interaction with this j atom. */
571 velecsum = _mm256_add_pd(velecsum,velec);
575 /* Calculate temporary vectorial force */
576 tx = _mm256_mul_pd(fscal,dx22);
577 ty = _mm256_mul_pd(fscal,dy22);
578 tz = _mm256_mul_pd(fscal,dz22);
580 /* Update vectorial force */
581 fix2 = _mm256_add_pd(fix2,tx);
582 fiy2 = _mm256_add_pd(fiy2,ty);
583 fiz2 = _mm256_add_pd(fiz2,tz);
585 fjx2 = _mm256_add_pd(fjx2,tx);
586 fjy2 = _mm256_add_pd(fjy2,ty);
587 fjz2 = _mm256_add_pd(fjz2,tz);
589 /**************************
590 * CALCULATE INTERACTIONS *
591 **************************/
593 r23 = _mm256_mul_pd(rsq23,rinv23);
595 /* Calculate table index by multiplying r with table scale and truncate to integer */
596 rt = _mm256_mul_pd(r23,vftabscale);
597 vfitab = _mm256_cvttpd_epi32(rt);
598 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
599 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
601 /* CUBIC SPLINE TABLE ELECTROSTATICS */
602 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
603 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
604 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
605 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
606 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
607 Heps = _mm256_mul_pd(vfeps,H);
608 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
609 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
610 velec = _mm256_mul_pd(qq23,VV);
611 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
612 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
614 /* Update potential sum for this i atom from the interaction with this j atom. */
615 velecsum = _mm256_add_pd(velecsum,velec);
619 /* Calculate temporary vectorial force */
620 tx = _mm256_mul_pd(fscal,dx23);
621 ty = _mm256_mul_pd(fscal,dy23);
622 tz = _mm256_mul_pd(fscal,dz23);
624 /* Update vectorial force */
625 fix2 = _mm256_add_pd(fix2,tx);
626 fiy2 = _mm256_add_pd(fiy2,ty);
627 fiz2 = _mm256_add_pd(fiz2,tz);
629 fjx3 = _mm256_add_pd(fjx3,tx);
630 fjy3 = _mm256_add_pd(fjy3,ty);
631 fjz3 = _mm256_add_pd(fjz3,tz);
633 /**************************
634 * CALCULATE INTERACTIONS *
635 **************************/
637 r31 = _mm256_mul_pd(rsq31,rinv31);
639 /* Calculate table index by multiplying r with table scale and truncate to integer */
640 rt = _mm256_mul_pd(r31,vftabscale);
641 vfitab = _mm256_cvttpd_epi32(rt);
642 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
643 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
645 /* CUBIC SPLINE TABLE ELECTROSTATICS */
646 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
647 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
648 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
649 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
650 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
651 Heps = _mm256_mul_pd(vfeps,H);
652 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
653 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
654 velec = _mm256_mul_pd(qq31,VV);
655 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
656 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velecsum = _mm256_add_pd(velecsum,velec);
663 /* Calculate temporary vectorial force */
664 tx = _mm256_mul_pd(fscal,dx31);
665 ty = _mm256_mul_pd(fscal,dy31);
666 tz = _mm256_mul_pd(fscal,dz31);
668 /* Update vectorial force */
669 fix3 = _mm256_add_pd(fix3,tx);
670 fiy3 = _mm256_add_pd(fiy3,ty);
671 fiz3 = _mm256_add_pd(fiz3,tz);
673 fjx1 = _mm256_add_pd(fjx1,tx);
674 fjy1 = _mm256_add_pd(fjy1,ty);
675 fjz1 = _mm256_add_pd(fjz1,tz);
677 /**************************
678 * CALCULATE INTERACTIONS *
679 **************************/
681 r32 = _mm256_mul_pd(rsq32,rinv32);
683 /* Calculate table index by multiplying r with table scale and truncate to integer */
684 rt = _mm256_mul_pd(r32,vftabscale);
685 vfitab = _mm256_cvttpd_epi32(rt);
686 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
687 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
689 /* CUBIC SPLINE TABLE ELECTROSTATICS */
690 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
691 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
692 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
693 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
694 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
695 Heps = _mm256_mul_pd(vfeps,H);
696 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
697 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
698 velec = _mm256_mul_pd(qq32,VV);
699 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
700 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
702 /* Update potential sum for this i atom from the interaction with this j atom. */
703 velecsum = _mm256_add_pd(velecsum,velec);
707 /* Calculate temporary vectorial force */
708 tx = _mm256_mul_pd(fscal,dx32);
709 ty = _mm256_mul_pd(fscal,dy32);
710 tz = _mm256_mul_pd(fscal,dz32);
712 /* Update vectorial force */
713 fix3 = _mm256_add_pd(fix3,tx);
714 fiy3 = _mm256_add_pd(fiy3,ty);
715 fiz3 = _mm256_add_pd(fiz3,tz);
717 fjx2 = _mm256_add_pd(fjx2,tx);
718 fjy2 = _mm256_add_pd(fjy2,ty);
719 fjz2 = _mm256_add_pd(fjz2,tz);
721 /**************************
722 * CALCULATE INTERACTIONS *
723 **************************/
725 r33 = _mm256_mul_pd(rsq33,rinv33);
727 /* Calculate table index by multiplying r with table scale and truncate to integer */
728 rt = _mm256_mul_pd(r33,vftabscale);
729 vfitab = _mm256_cvttpd_epi32(rt);
730 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
731 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
733 /* CUBIC SPLINE TABLE ELECTROSTATICS */
734 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
735 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
736 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
737 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
738 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
739 Heps = _mm256_mul_pd(vfeps,H);
740 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
741 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
742 velec = _mm256_mul_pd(qq33,VV);
743 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
744 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
746 /* Update potential sum for this i atom from the interaction with this j atom. */
747 velecsum = _mm256_add_pd(velecsum,velec);
751 /* Calculate temporary vectorial force */
752 tx = _mm256_mul_pd(fscal,dx33);
753 ty = _mm256_mul_pd(fscal,dy33);
754 tz = _mm256_mul_pd(fscal,dz33);
756 /* Update vectorial force */
757 fix3 = _mm256_add_pd(fix3,tx);
758 fiy3 = _mm256_add_pd(fiy3,ty);
759 fiz3 = _mm256_add_pd(fiz3,tz);
761 fjx3 = _mm256_add_pd(fjx3,tx);
762 fjy3 = _mm256_add_pd(fjy3,ty);
763 fjz3 = _mm256_add_pd(fjz3,tz);
765 fjptrA = f+j_coord_offsetA;
766 fjptrB = f+j_coord_offsetB;
767 fjptrC = f+j_coord_offsetC;
768 fjptrD = f+j_coord_offsetD;
770 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
771 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
772 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
774 /* Inner loop uses 446 flops */
780 /* Get j neighbor index, and coordinate index */
781 jnrlistA = jjnr[jidx];
782 jnrlistB = jjnr[jidx+1];
783 jnrlistC = jjnr[jidx+2];
784 jnrlistD = jjnr[jidx+3];
785 /* Sign of each element will be negative for non-real atoms.
786 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
787 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
789 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
791 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
792 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
793 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
795 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
796 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
797 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
798 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
799 j_coord_offsetA = DIM*jnrA;
800 j_coord_offsetB = DIM*jnrB;
801 j_coord_offsetC = DIM*jnrC;
802 j_coord_offsetD = DIM*jnrD;
804 /* load j atom coordinates */
805 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
806 x+j_coord_offsetC,x+j_coord_offsetD,
807 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
808 &jy2,&jz2,&jx3,&jy3,&jz3);
810 /* Calculate displacement vector */
811 dx00 = _mm256_sub_pd(ix0,jx0);
812 dy00 = _mm256_sub_pd(iy0,jy0);
813 dz00 = _mm256_sub_pd(iz0,jz0);
814 dx11 = _mm256_sub_pd(ix1,jx1);
815 dy11 = _mm256_sub_pd(iy1,jy1);
816 dz11 = _mm256_sub_pd(iz1,jz1);
817 dx12 = _mm256_sub_pd(ix1,jx2);
818 dy12 = _mm256_sub_pd(iy1,jy2);
819 dz12 = _mm256_sub_pd(iz1,jz2);
820 dx13 = _mm256_sub_pd(ix1,jx3);
821 dy13 = _mm256_sub_pd(iy1,jy3);
822 dz13 = _mm256_sub_pd(iz1,jz3);
823 dx21 = _mm256_sub_pd(ix2,jx1);
824 dy21 = _mm256_sub_pd(iy2,jy1);
825 dz21 = _mm256_sub_pd(iz2,jz1);
826 dx22 = _mm256_sub_pd(ix2,jx2);
827 dy22 = _mm256_sub_pd(iy2,jy2);
828 dz22 = _mm256_sub_pd(iz2,jz2);
829 dx23 = _mm256_sub_pd(ix2,jx3);
830 dy23 = _mm256_sub_pd(iy2,jy3);
831 dz23 = _mm256_sub_pd(iz2,jz3);
832 dx31 = _mm256_sub_pd(ix3,jx1);
833 dy31 = _mm256_sub_pd(iy3,jy1);
834 dz31 = _mm256_sub_pd(iz3,jz1);
835 dx32 = _mm256_sub_pd(ix3,jx2);
836 dy32 = _mm256_sub_pd(iy3,jy2);
837 dz32 = _mm256_sub_pd(iz3,jz2);
838 dx33 = _mm256_sub_pd(ix3,jx3);
839 dy33 = _mm256_sub_pd(iy3,jy3);
840 dz33 = _mm256_sub_pd(iz3,jz3);
842 /* Calculate squared distance and things based on it */
843 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
844 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
845 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
846 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
847 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
848 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
849 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
850 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
851 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
852 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
854 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
855 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
856 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
857 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
858 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
859 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
860 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
861 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
862 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
863 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
865 fjx0 = _mm256_setzero_pd();
866 fjy0 = _mm256_setzero_pd();
867 fjz0 = _mm256_setzero_pd();
868 fjx1 = _mm256_setzero_pd();
869 fjy1 = _mm256_setzero_pd();
870 fjz1 = _mm256_setzero_pd();
871 fjx2 = _mm256_setzero_pd();
872 fjy2 = _mm256_setzero_pd();
873 fjz2 = _mm256_setzero_pd();
874 fjx3 = _mm256_setzero_pd();
875 fjy3 = _mm256_setzero_pd();
876 fjz3 = _mm256_setzero_pd();
878 /**************************
879 * CALCULATE INTERACTIONS *
880 **************************/
882 r00 = _mm256_mul_pd(rsq00,rinv00);
883 r00 = _mm256_andnot_pd(dummy_mask,r00);
885 /* Calculate table index by multiplying r with table scale and truncate to integer */
886 rt = _mm256_mul_pd(r00,vftabscale);
887 vfitab = _mm256_cvttpd_epi32(rt);
888 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
889 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
891 /* CUBIC SPLINE TABLE DISPERSION */
892 vfitab = _mm_add_epi32(vfitab,ifour);
893 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
894 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
895 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
896 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
897 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
898 Heps = _mm256_mul_pd(vfeps,H);
899 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
900 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
901 vvdw6 = _mm256_mul_pd(c6_00,VV);
902 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
903 fvdw6 = _mm256_mul_pd(c6_00,FF);
905 /* CUBIC SPLINE TABLE REPULSION */
906 vfitab = _mm_add_epi32(vfitab,ifour);
907 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
908 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
909 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
910 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
911 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
912 Heps = _mm256_mul_pd(vfeps,H);
913 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
914 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
915 vvdw12 = _mm256_mul_pd(c12_00,VV);
916 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
917 fvdw12 = _mm256_mul_pd(c12_00,FF);
918 vvdw = _mm256_add_pd(vvdw12,vvdw6);
919 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
921 /* Update potential sum for this i atom from the interaction with this j atom. */
922 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
923 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
927 fscal = _mm256_andnot_pd(dummy_mask,fscal);
929 /* Calculate temporary vectorial force */
930 tx = _mm256_mul_pd(fscal,dx00);
931 ty = _mm256_mul_pd(fscal,dy00);
932 tz = _mm256_mul_pd(fscal,dz00);
934 /* Update vectorial force */
935 fix0 = _mm256_add_pd(fix0,tx);
936 fiy0 = _mm256_add_pd(fiy0,ty);
937 fiz0 = _mm256_add_pd(fiz0,tz);
939 fjx0 = _mm256_add_pd(fjx0,tx);
940 fjy0 = _mm256_add_pd(fjy0,ty);
941 fjz0 = _mm256_add_pd(fjz0,tz);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 r11 = _mm256_mul_pd(rsq11,rinv11);
948 r11 = _mm256_andnot_pd(dummy_mask,r11);
950 /* Calculate table index by multiplying r with table scale and truncate to integer */
951 rt = _mm256_mul_pd(r11,vftabscale);
952 vfitab = _mm256_cvttpd_epi32(rt);
953 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
954 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
956 /* CUBIC SPLINE TABLE ELECTROSTATICS */
957 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
958 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
959 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
960 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
961 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
962 Heps = _mm256_mul_pd(vfeps,H);
963 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
964 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
965 velec = _mm256_mul_pd(qq11,VV);
966 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
967 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 velec = _mm256_andnot_pd(dummy_mask,velec);
971 velecsum = _mm256_add_pd(velecsum,velec);
975 fscal = _mm256_andnot_pd(dummy_mask,fscal);
977 /* Calculate temporary vectorial force */
978 tx = _mm256_mul_pd(fscal,dx11);
979 ty = _mm256_mul_pd(fscal,dy11);
980 tz = _mm256_mul_pd(fscal,dz11);
982 /* Update vectorial force */
983 fix1 = _mm256_add_pd(fix1,tx);
984 fiy1 = _mm256_add_pd(fiy1,ty);
985 fiz1 = _mm256_add_pd(fiz1,tz);
987 fjx1 = _mm256_add_pd(fjx1,tx);
988 fjy1 = _mm256_add_pd(fjy1,ty);
989 fjz1 = _mm256_add_pd(fjz1,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 r12 = _mm256_mul_pd(rsq12,rinv12);
996 r12 = _mm256_andnot_pd(dummy_mask,r12);
998 /* Calculate table index by multiplying r with table scale and truncate to integer */
999 rt = _mm256_mul_pd(r12,vftabscale);
1000 vfitab = _mm256_cvttpd_epi32(rt);
1001 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1002 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1004 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1005 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1006 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1007 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1008 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1009 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1010 Heps = _mm256_mul_pd(vfeps,H);
1011 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1012 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1013 velec = _mm256_mul_pd(qq12,VV);
1014 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1015 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1017 /* Update potential sum for this i atom from the interaction with this j atom. */
1018 velec = _mm256_andnot_pd(dummy_mask,velec);
1019 velecsum = _mm256_add_pd(velecsum,velec);
1023 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1025 /* Calculate temporary vectorial force */
1026 tx = _mm256_mul_pd(fscal,dx12);
1027 ty = _mm256_mul_pd(fscal,dy12);
1028 tz = _mm256_mul_pd(fscal,dz12);
1030 /* Update vectorial force */
1031 fix1 = _mm256_add_pd(fix1,tx);
1032 fiy1 = _mm256_add_pd(fiy1,ty);
1033 fiz1 = _mm256_add_pd(fiz1,tz);
1035 fjx2 = _mm256_add_pd(fjx2,tx);
1036 fjy2 = _mm256_add_pd(fjy2,ty);
1037 fjz2 = _mm256_add_pd(fjz2,tz);
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 r13 = _mm256_mul_pd(rsq13,rinv13);
1044 r13 = _mm256_andnot_pd(dummy_mask,r13);
1046 /* Calculate table index by multiplying r with table scale and truncate to integer */
1047 rt = _mm256_mul_pd(r13,vftabscale);
1048 vfitab = _mm256_cvttpd_epi32(rt);
1049 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1050 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1052 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1053 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1054 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1055 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1056 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1057 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1058 Heps = _mm256_mul_pd(vfeps,H);
1059 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1060 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1061 velec = _mm256_mul_pd(qq13,VV);
1062 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1063 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
1065 /* Update potential sum for this i atom from the interaction with this j atom. */
1066 velec = _mm256_andnot_pd(dummy_mask,velec);
1067 velecsum = _mm256_add_pd(velecsum,velec);
1071 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1073 /* Calculate temporary vectorial force */
1074 tx = _mm256_mul_pd(fscal,dx13);
1075 ty = _mm256_mul_pd(fscal,dy13);
1076 tz = _mm256_mul_pd(fscal,dz13);
1078 /* Update vectorial force */
1079 fix1 = _mm256_add_pd(fix1,tx);
1080 fiy1 = _mm256_add_pd(fiy1,ty);
1081 fiz1 = _mm256_add_pd(fiz1,tz);
1083 fjx3 = _mm256_add_pd(fjx3,tx);
1084 fjy3 = _mm256_add_pd(fjy3,ty);
1085 fjz3 = _mm256_add_pd(fjz3,tz);
1087 /**************************
1088 * CALCULATE INTERACTIONS *
1089 **************************/
1091 r21 = _mm256_mul_pd(rsq21,rinv21);
1092 r21 = _mm256_andnot_pd(dummy_mask,r21);
1094 /* Calculate table index by multiplying r with table scale and truncate to integer */
1095 rt = _mm256_mul_pd(r21,vftabscale);
1096 vfitab = _mm256_cvttpd_epi32(rt);
1097 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1098 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1100 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1101 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1102 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1103 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1104 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1105 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1106 Heps = _mm256_mul_pd(vfeps,H);
1107 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1108 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1109 velec = _mm256_mul_pd(qq21,VV);
1110 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1111 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1113 /* Update potential sum for this i atom from the interaction with this j atom. */
1114 velec = _mm256_andnot_pd(dummy_mask,velec);
1115 velecsum = _mm256_add_pd(velecsum,velec);
1119 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1121 /* Calculate temporary vectorial force */
1122 tx = _mm256_mul_pd(fscal,dx21);
1123 ty = _mm256_mul_pd(fscal,dy21);
1124 tz = _mm256_mul_pd(fscal,dz21);
1126 /* Update vectorial force */
1127 fix2 = _mm256_add_pd(fix2,tx);
1128 fiy2 = _mm256_add_pd(fiy2,ty);
1129 fiz2 = _mm256_add_pd(fiz2,tz);
1131 fjx1 = _mm256_add_pd(fjx1,tx);
1132 fjy1 = _mm256_add_pd(fjy1,ty);
1133 fjz1 = _mm256_add_pd(fjz1,tz);
1135 /**************************
1136 * CALCULATE INTERACTIONS *
1137 **************************/
1139 r22 = _mm256_mul_pd(rsq22,rinv22);
1140 r22 = _mm256_andnot_pd(dummy_mask,r22);
1142 /* Calculate table index by multiplying r with table scale and truncate to integer */
1143 rt = _mm256_mul_pd(r22,vftabscale);
1144 vfitab = _mm256_cvttpd_epi32(rt);
1145 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1146 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1148 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1149 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1150 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1151 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1152 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1153 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1154 Heps = _mm256_mul_pd(vfeps,H);
1155 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1156 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1157 velec = _mm256_mul_pd(qq22,VV);
1158 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1159 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1161 /* Update potential sum for this i atom from the interaction with this j atom. */
1162 velec = _mm256_andnot_pd(dummy_mask,velec);
1163 velecsum = _mm256_add_pd(velecsum,velec);
1167 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1169 /* Calculate temporary vectorial force */
1170 tx = _mm256_mul_pd(fscal,dx22);
1171 ty = _mm256_mul_pd(fscal,dy22);
1172 tz = _mm256_mul_pd(fscal,dz22);
1174 /* Update vectorial force */
1175 fix2 = _mm256_add_pd(fix2,tx);
1176 fiy2 = _mm256_add_pd(fiy2,ty);
1177 fiz2 = _mm256_add_pd(fiz2,tz);
1179 fjx2 = _mm256_add_pd(fjx2,tx);
1180 fjy2 = _mm256_add_pd(fjy2,ty);
1181 fjz2 = _mm256_add_pd(fjz2,tz);
1183 /**************************
1184 * CALCULATE INTERACTIONS *
1185 **************************/
1187 r23 = _mm256_mul_pd(rsq23,rinv23);
1188 r23 = _mm256_andnot_pd(dummy_mask,r23);
1190 /* Calculate table index by multiplying r with table scale and truncate to integer */
1191 rt = _mm256_mul_pd(r23,vftabscale);
1192 vfitab = _mm256_cvttpd_epi32(rt);
1193 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1194 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1196 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1197 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1198 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1199 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1200 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1201 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1202 Heps = _mm256_mul_pd(vfeps,H);
1203 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1204 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1205 velec = _mm256_mul_pd(qq23,VV);
1206 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1207 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1209 /* Update potential sum for this i atom from the interaction with this j atom. */
1210 velec = _mm256_andnot_pd(dummy_mask,velec);
1211 velecsum = _mm256_add_pd(velecsum,velec);
1215 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1217 /* Calculate temporary vectorial force */
1218 tx = _mm256_mul_pd(fscal,dx23);
1219 ty = _mm256_mul_pd(fscal,dy23);
1220 tz = _mm256_mul_pd(fscal,dz23);
1222 /* Update vectorial force */
1223 fix2 = _mm256_add_pd(fix2,tx);
1224 fiy2 = _mm256_add_pd(fiy2,ty);
1225 fiz2 = _mm256_add_pd(fiz2,tz);
1227 fjx3 = _mm256_add_pd(fjx3,tx);
1228 fjy3 = _mm256_add_pd(fjy3,ty);
1229 fjz3 = _mm256_add_pd(fjz3,tz);
1231 /**************************
1232 * CALCULATE INTERACTIONS *
1233 **************************/
1235 r31 = _mm256_mul_pd(rsq31,rinv31);
1236 r31 = _mm256_andnot_pd(dummy_mask,r31);
1238 /* Calculate table index by multiplying r with table scale and truncate to integer */
1239 rt = _mm256_mul_pd(r31,vftabscale);
1240 vfitab = _mm256_cvttpd_epi32(rt);
1241 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1242 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1244 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1245 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1246 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1247 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1248 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1249 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1250 Heps = _mm256_mul_pd(vfeps,H);
1251 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1252 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1253 velec = _mm256_mul_pd(qq31,VV);
1254 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1255 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1257 /* Update potential sum for this i atom from the interaction with this j atom. */
1258 velec = _mm256_andnot_pd(dummy_mask,velec);
1259 velecsum = _mm256_add_pd(velecsum,velec);
1263 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1265 /* Calculate temporary vectorial force */
1266 tx = _mm256_mul_pd(fscal,dx31);
1267 ty = _mm256_mul_pd(fscal,dy31);
1268 tz = _mm256_mul_pd(fscal,dz31);
1270 /* Update vectorial force */
1271 fix3 = _mm256_add_pd(fix3,tx);
1272 fiy3 = _mm256_add_pd(fiy3,ty);
1273 fiz3 = _mm256_add_pd(fiz3,tz);
1275 fjx1 = _mm256_add_pd(fjx1,tx);
1276 fjy1 = _mm256_add_pd(fjy1,ty);
1277 fjz1 = _mm256_add_pd(fjz1,tz);
1279 /**************************
1280 * CALCULATE INTERACTIONS *
1281 **************************/
1283 r32 = _mm256_mul_pd(rsq32,rinv32);
1284 r32 = _mm256_andnot_pd(dummy_mask,r32);
1286 /* Calculate table index by multiplying r with table scale and truncate to integer */
1287 rt = _mm256_mul_pd(r32,vftabscale);
1288 vfitab = _mm256_cvttpd_epi32(rt);
1289 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1290 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1292 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1293 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1294 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1295 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1296 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1297 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1298 Heps = _mm256_mul_pd(vfeps,H);
1299 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1300 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1301 velec = _mm256_mul_pd(qq32,VV);
1302 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1303 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
1305 /* Update potential sum for this i atom from the interaction with this j atom. */
1306 velec = _mm256_andnot_pd(dummy_mask,velec);
1307 velecsum = _mm256_add_pd(velecsum,velec);
1311 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1313 /* Calculate temporary vectorial force */
1314 tx = _mm256_mul_pd(fscal,dx32);
1315 ty = _mm256_mul_pd(fscal,dy32);
1316 tz = _mm256_mul_pd(fscal,dz32);
1318 /* Update vectorial force */
1319 fix3 = _mm256_add_pd(fix3,tx);
1320 fiy3 = _mm256_add_pd(fiy3,ty);
1321 fiz3 = _mm256_add_pd(fiz3,tz);
1323 fjx2 = _mm256_add_pd(fjx2,tx);
1324 fjy2 = _mm256_add_pd(fjy2,ty);
1325 fjz2 = _mm256_add_pd(fjz2,tz);
1327 /**************************
1328 * CALCULATE INTERACTIONS *
1329 **************************/
1331 r33 = _mm256_mul_pd(rsq33,rinv33);
1332 r33 = _mm256_andnot_pd(dummy_mask,r33);
1334 /* Calculate table index by multiplying r with table scale and truncate to integer */
1335 rt = _mm256_mul_pd(r33,vftabscale);
1336 vfitab = _mm256_cvttpd_epi32(rt);
1337 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1338 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1340 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1341 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1342 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1343 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1344 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1345 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1346 Heps = _mm256_mul_pd(vfeps,H);
1347 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1348 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1349 velec = _mm256_mul_pd(qq33,VV);
1350 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1351 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
1353 /* Update potential sum for this i atom from the interaction with this j atom. */
1354 velec = _mm256_andnot_pd(dummy_mask,velec);
1355 velecsum = _mm256_add_pd(velecsum,velec);
1359 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1361 /* Calculate temporary vectorial force */
1362 tx = _mm256_mul_pd(fscal,dx33);
1363 ty = _mm256_mul_pd(fscal,dy33);
1364 tz = _mm256_mul_pd(fscal,dz33);
1366 /* Update vectorial force */
1367 fix3 = _mm256_add_pd(fix3,tx);
1368 fiy3 = _mm256_add_pd(fiy3,ty);
1369 fiz3 = _mm256_add_pd(fiz3,tz);
1371 fjx3 = _mm256_add_pd(fjx3,tx);
1372 fjy3 = _mm256_add_pd(fjy3,ty);
1373 fjz3 = _mm256_add_pd(fjz3,tz);
1375 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1376 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1377 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1378 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1380 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1381 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1382 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1384 /* Inner loop uses 456 flops */
1387 /* End of innermost loop */
1389 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1390 f+i_coord_offset,fshift+i_shift_offset);
1393 /* Update potential energies */
1394 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1395 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1397 /* Increment number of inner iterations */
1398 inneriter += j_index_end - j_index_start;
1400 /* Outer loop uses 26 flops */
1403 /* Increment number of outer iterations */
1406 /* Update outer/inner flops */
1408 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
1411 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
1412 * Electrostatics interaction: CubicSplineTable
1413 * VdW interaction: CubicSplineTable
1414 * Geometry: Water4-Water4
1415 * Calculate force/pot: Force
1418 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_double
1419 (t_nblist * gmx_restrict nlist,
1420 rvec * gmx_restrict xx,
1421 rvec * gmx_restrict ff,
1422 t_forcerec * gmx_restrict fr,
1423 t_mdatoms * gmx_restrict mdatoms,
1424 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1425 t_nrnb * gmx_restrict nrnb)
1427 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1428 * just 0 for non-waters.
1429 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1430 * jnr indices corresponding to data put in the four positions in the SIMD register.
1432 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1433 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1434 int jnrA,jnrB,jnrC,jnrD;
1435 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1436 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1437 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1438 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1439 real rcutoff_scalar;
1440 real *shiftvec,*fshift,*x,*f;
1441 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1442 real scratch[4*DIM];
1443 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1444 real * vdwioffsetptr0;
1445 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1446 real * vdwioffsetptr1;
1447 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1448 real * vdwioffsetptr2;
1449 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1450 real * vdwioffsetptr3;
1451 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1452 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1453 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1454 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1455 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1456 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1457 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1458 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1459 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1460 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1461 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1462 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1463 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1464 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1465 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1466 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1467 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1468 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1469 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1470 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1473 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1476 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1477 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1479 __m128i ifour = _mm_set1_epi32(4);
1480 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1482 __m256d dummy_mask,cutoff_mask;
1483 __m128 tmpmask0,tmpmask1;
1484 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1485 __m256d one = _mm256_set1_pd(1.0);
1486 __m256d two = _mm256_set1_pd(2.0);
1492 jindex = nlist->jindex;
1494 shiftidx = nlist->shift;
1496 shiftvec = fr->shift_vec[0];
1497 fshift = fr->fshift[0];
1498 facel = _mm256_set1_pd(fr->epsfac);
1499 charge = mdatoms->chargeA;
1500 nvdwtype = fr->ntype;
1501 vdwparam = fr->nbfp;
1502 vdwtype = mdatoms->typeA;
1504 vftab = kernel_data->table_elec_vdw->data;
1505 vftabscale = _mm256_set1_pd(kernel_data->table_elec_vdw->scale);
1507 /* Setup water-specific parameters */
1508 inr = nlist->iinr[0];
1509 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1510 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1511 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1512 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1514 jq1 = _mm256_set1_pd(charge[inr+1]);
1515 jq2 = _mm256_set1_pd(charge[inr+2]);
1516 jq3 = _mm256_set1_pd(charge[inr+3]);
1517 vdwjidx0A = 2*vdwtype[inr+0];
1518 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1519 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1520 qq11 = _mm256_mul_pd(iq1,jq1);
1521 qq12 = _mm256_mul_pd(iq1,jq2);
1522 qq13 = _mm256_mul_pd(iq1,jq3);
1523 qq21 = _mm256_mul_pd(iq2,jq1);
1524 qq22 = _mm256_mul_pd(iq2,jq2);
1525 qq23 = _mm256_mul_pd(iq2,jq3);
1526 qq31 = _mm256_mul_pd(iq3,jq1);
1527 qq32 = _mm256_mul_pd(iq3,jq2);
1528 qq33 = _mm256_mul_pd(iq3,jq3);
1530 /* Avoid stupid compiler warnings */
1531 jnrA = jnrB = jnrC = jnrD = 0;
1532 j_coord_offsetA = 0;
1533 j_coord_offsetB = 0;
1534 j_coord_offsetC = 0;
1535 j_coord_offsetD = 0;
1540 for(iidx=0;iidx<4*DIM;iidx++)
1542 scratch[iidx] = 0.0;
1545 /* Start outer loop over neighborlists */
1546 for(iidx=0; iidx<nri; iidx++)
1548 /* Load shift vector for this list */
1549 i_shift_offset = DIM*shiftidx[iidx];
1551 /* Load limits for loop over neighbors */
1552 j_index_start = jindex[iidx];
1553 j_index_end = jindex[iidx+1];
1555 /* Get outer coordinate index */
1557 i_coord_offset = DIM*inr;
1559 /* Load i particle coords and add shift vector */
1560 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1561 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1563 fix0 = _mm256_setzero_pd();
1564 fiy0 = _mm256_setzero_pd();
1565 fiz0 = _mm256_setzero_pd();
1566 fix1 = _mm256_setzero_pd();
1567 fiy1 = _mm256_setzero_pd();
1568 fiz1 = _mm256_setzero_pd();
1569 fix2 = _mm256_setzero_pd();
1570 fiy2 = _mm256_setzero_pd();
1571 fiz2 = _mm256_setzero_pd();
1572 fix3 = _mm256_setzero_pd();
1573 fiy3 = _mm256_setzero_pd();
1574 fiz3 = _mm256_setzero_pd();
1576 /* Start inner kernel loop */
1577 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1580 /* Get j neighbor index, and coordinate index */
1582 jnrB = jjnr[jidx+1];
1583 jnrC = jjnr[jidx+2];
1584 jnrD = jjnr[jidx+3];
1585 j_coord_offsetA = DIM*jnrA;
1586 j_coord_offsetB = DIM*jnrB;
1587 j_coord_offsetC = DIM*jnrC;
1588 j_coord_offsetD = DIM*jnrD;
1590 /* load j atom coordinates */
1591 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1592 x+j_coord_offsetC,x+j_coord_offsetD,
1593 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1594 &jy2,&jz2,&jx3,&jy3,&jz3);
1596 /* Calculate displacement vector */
1597 dx00 = _mm256_sub_pd(ix0,jx0);
1598 dy00 = _mm256_sub_pd(iy0,jy0);
1599 dz00 = _mm256_sub_pd(iz0,jz0);
1600 dx11 = _mm256_sub_pd(ix1,jx1);
1601 dy11 = _mm256_sub_pd(iy1,jy1);
1602 dz11 = _mm256_sub_pd(iz1,jz1);
1603 dx12 = _mm256_sub_pd(ix1,jx2);
1604 dy12 = _mm256_sub_pd(iy1,jy2);
1605 dz12 = _mm256_sub_pd(iz1,jz2);
1606 dx13 = _mm256_sub_pd(ix1,jx3);
1607 dy13 = _mm256_sub_pd(iy1,jy3);
1608 dz13 = _mm256_sub_pd(iz1,jz3);
1609 dx21 = _mm256_sub_pd(ix2,jx1);
1610 dy21 = _mm256_sub_pd(iy2,jy1);
1611 dz21 = _mm256_sub_pd(iz2,jz1);
1612 dx22 = _mm256_sub_pd(ix2,jx2);
1613 dy22 = _mm256_sub_pd(iy2,jy2);
1614 dz22 = _mm256_sub_pd(iz2,jz2);
1615 dx23 = _mm256_sub_pd(ix2,jx3);
1616 dy23 = _mm256_sub_pd(iy2,jy3);
1617 dz23 = _mm256_sub_pd(iz2,jz3);
1618 dx31 = _mm256_sub_pd(ix3,jx1);
1619 dy31 = _mm256_sub_pd(iy3,jy1);
1620 dz31 = _mm256_sub_pd(iz3,jz1);
1621 dx32 = _mm256_sub_pd(ix3,jx2);
1622 dy32 = _mm256_sub_pd(iy3,jy2);
1623 dz32 = _mm256_sub_pd(iz3,jz2);
1624 dx33 = _mm256_sub_pd(ix3,jx3);
1625 dy33 = _mm256_sub_pd(iy3,jy3);
1626 dz33 = _mm256_sub_pd(iz3,jz3);
1628 /* Calculate squared distance and things based on it */
1629 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1630 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1631 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1632 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1633 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1634 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1635 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1636 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1637 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1638 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1640 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1641 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1642 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1643 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1644 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1645 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1646 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1647 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1648 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1649 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1651 fjx0 = _mm256_setzero_pd();
1652 fjy0 = _mm256_setzero_pd();
1653 fjz0 = _mm256_setzero_pd();
1654 fjx1 = _mm256_setzero_pd();
1655 fjy1 = _mm256_setzero_pd();
1656 fjz1 = _mm256_setzero_pd();
1657 fjx2 = _mm256_setzero_pd();
1658 fjy2 = _mm256_setzero_pd();
1659 fjz2 = _mm256_setzero_pd();
1660 fjx3 = _mm256_setzero_pd();
1661 fjy3 = _mm256_setzero_pd();
1662 fjz3 = _mm256_setzero_pd();
1664 /**************************
1665 * CALCULATE INTERACTIONS *
1666 **************************/
1668 r00 = _mm256_mul_pd(rsq00,rinv00);
1670 /* Calculate table index by multiplying r with table scale and truncate to integer */
1671 rt = _mm256_mul_pd(r00,vftabscale);
1672 vfitab = _mm256_cvttpd_epi32(rt);
1673 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1674 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1676 /* CUBIC SPLINE TABLE DISPERSION */
1677 vfitab = _mm_add_epi32(vfitab,ifour);
1678 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1679 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1680 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1681 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1682 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1683 Heps = _mm256_mul_pd(vfeps,H);
1684 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1685 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1686 fvdw6 = _mm256_mul_pd(c6_00,FF);
1688 /* CUBIC SPLINE TABLE REPULSION */
1689 vfitab = _mm_add_epi32(vfitab,ifour);
1690 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1691 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1692 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1693 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1694 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1695 Heps = _mm256_mul_pd(vfeps,H);
1696 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1697 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1698 fvdw12 = _mm256_mul_pd(c12_00,FF);
1699 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1703 /* Calculate temporary vectorial force */
1704 tx = _mm256_mul_pd(fscal,dx00);
1705 ty = _mm256_mul_pd(fscal,dy00);
1706 tz = _mm256_mul_pd(fscal,dz00);
1708 /* Update vectorial force */
1709 fix0 = _mm256_add_pd(fix0,tx);
1710 fiy0 = _mm256_add_pd(fiy0,ty);
1711 fiz0 = _mm256_add_pd(fiz0,tz);
1713 fjx0 = _mm256_add_pd(fjx0,tx);
1714 fjy0 = _mm256_add_pd(fjy0,ty);
1715 fjz0 = _mm256_add_pd(fjz0,tz);
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 r11 = _mm256_mul_pd(rsq11,rinv11);
1723 /* Calculate table index by multiplying r with table scale and truncate to integer */
1724 rt = _mm256_mul_pd(r11,vftabscale);
1725 vfitab = _mm256_cvttpd_epi32(rt);
1726 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1727 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1729 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1730 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1731 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1732 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1733 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1734 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1735 Heps = _mm256_mul_pd(vfeps,H);
1736 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1737 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1738 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1742 /* Calculate temporary vectorial force */
1743 tx = _mm256_mul_pd(fscal,dx11);
1744 ty = _mm256_mul_pd(fscal,dy11);
1745 tz = _mm256_mul_pd(fscal,dz11);
1747 /* Update vectorial force */
1748 fix1 = _mm256_add_pd(fix1,tx);
1749 fiy1 = _mm256_add_pd(fiy1,ty);
1750 fiz1 = _mm256_add_pd(fiz1,tz);
1752 fjx1 = _mm256_add_pd(fjx1,tx);
1753 fjy1 = _mm256_add_pd(fjy1,ty);
1754 fjz1 = _mm256_add_pd(fjz1,tz);
1756 /**************************
1757 * CALCULATE INTERACTIONS *
1758 **************************/
1760 r12 = _mm256_mul_pd(rsq12,rinv12);
1762 /* Calculate table index by multiplying r with table scale and truncate to integer */
1763 rt = _mm256_mul_pd(r12,vftabscale);
1764 vfitab = _mm256_cvttpd_epi32(rt);
1765 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1766 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1768 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1769 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1770 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1771 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1772 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1773 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1774 Heps = _mm256_mul_pd(vfeps,H);
1775 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1776 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1777 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1781 /* Calculate temporary vectorial force */
1782 tx = _mm256_mul_pd(fscal,dx12);
1783 ty = _mm256_mul_pd(fscal,dy12);
1784 tz = _mm256_mul_pd(fscal,dz12);
1786 /* Update vectorial force */
1787 fix1 = _mm256_add_pd(fix1,tx);
1788 fiy1 = _mm256_add_pd(fiy1,ty);
1789 fiz1 = _mm256_add_pd(fiz1,tz);
1791 fjx2 = _mm256_add_pd(fjx2,tx);
1792 fjy2 = _mm256_add_pd(fjy2,ty);
1793 fjz2 = _mm256_add_pd(fjz2,tz);
1795 /**************************
1796 * CALCULATE INTERACTIONS *
1797 **************************/
1799 r13 = _mm256_mul_pd(rsq13,rinv13);
1801 /* Calculate table index by multiplying r with table scale and truncate to integer */
1802 rt = _mm256_mul_pd(r13,vftabscale);
1803 vfitab = _mm256_cvttpd_epi32(rt);
1804 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1805 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1807 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1808 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1809 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1810 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1811 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1812 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1813 Heps = _mm256_mul_pd(vfeps,H);
1814 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1815 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1816 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
1820 /* Calculate temporary vectorial force */
1821 tx = _mm256_mul_pd(fscal,dx13);
1822 ty = _mm256_mul_pd(fscal,dy13);
1823 tz = _mm256_mul_pd(fscal,dz13);
1825 /* Update vectorial force */
1826 fix1 = _mm256_add_pd(fix1,tx);
1827 fiy1 = _mm256_add_pd(fiy1,ty);
1828 fiz1 = _mm256_add_pd(fiz1,tz);
1830 fjx3 = _mm256_add_pd(fjx3,tx);
1831 fjy3 = _mm256_add_pd(fjy3,ty);
1832 fjz3 = _mm256_add_pd(fjz3,tz);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 r21 = _mm256_mul_pd(rsq21,rinv21);
1840 /* Calculate table index by multiplying r with table scale and truncate to integer */
1841 rt = _mm256_mul_pd(r21,vftabscale);
1842 vfitab = _mm256_cvttpd_epi32(rt);
1843 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1844 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1846 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1847 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1848 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1849 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1850 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1851 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1852 Heps = _mm256_mul_pd(vfeps,H);
1853 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1854 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1855 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1859 /* Calculate temporary vectorial force */
1860 tx = _mm256_mul_pd(fscal,dx21);
1861 ty = _mm256_mul_pd(fscal,dy21);
1862 tz = _mm256_mul_pd(fscal,dz21);
1864 /* Update vectorial force */
1865 fix2 = _mm256_add_pd(fix2,tx);
1866 fiy2 = _mm256_add_pd(fiy2,ty);
1867 fiz2 = _mm256_add_pd(fiz2,tz);
1869 fjx1 = _mm256_add_pd(fjx1,tx);
1870 fjy1 = _mm256_add_pd(fjy1,ty);
1871 fjz1 = _mm256_add_pd(fjz1,tz);
1873 /**************************
1874 * CALCULATE INTERACTIONS *
1875 **************************/
1877 r22 = _mm256_mul_pd(rsq22,rinv22);
1879 /* Calculate table index by multiplying r with table scale and truncate to integer */
1880 rt = _mm256_mul_pd(r22,vftabscale);
1881 vfitab = _mm256_cvttpd_epi32(rt);
1882 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1883 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1885 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1886 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1887 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1888 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1889 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1890 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1891 Heps = _mm256_mul_pd(vfeps,H);
1892 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1893 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1894 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1898 /* Calculate temporary vectorial force */
1899 tx = _mm256_mul_pd(fscal,dx22);
1900 ty = _mm256_mul_pd(fscal,dy22);
1901 tz = _mm256_mul_pd(fscal,dz22);
1903 /* Update vectorial force */
1904 fix2 = _mm256_add_pd(fix2,tx);
1905 fiy2 = _mm256_add_pd(fiy2,ty);
1906 fiz2 = _mm256_add_pd(fiz2,tz);
1908 fjx2 = _mm256_add_pd(fjx2,tx);
1909 fjy2 = _mm256_add_pd(fjy2,ty);
1910 fjz2 = _mm256_add_pd(fjz2,tz);
1912 /**************************
1913 * CALCULATE INTERACTIONS *
1914 **************************/
1916 r23 = _mm256_mul_pd(rsq23,rinv23);
1918 /* Calculate table index by multiplying r with table scale and truncate to integer */
1919 rt = _mm256_mul_pd(r23,vftabscale);
1920 vfitab = _mm256_cvttpd_epi32(rt);
1921 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1922 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1924 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1925 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1926 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1927 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1928 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1929 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1930 Heps = _mm256_mul_pd(vfeps,H);
1931 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1932 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1933 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
1937 /* Calculate temporary vectorial force */
1938 tx = _mm256_mul_pd(fscal,dx23);
1939 ty = _mm256_mul_pd(fscal,dy23);
1940 tz = _mm256_mul_pd(fscal,dz23);
1942 /* Update vectorial force */
1943 fix2 = _mm256_add_pd(fix2,tx);
1944 fiy2 = _mm256_add_pd(fiy2,ty);
1945 fiz2 = _mm256_add_pd(fiz2,tz);
1947 fjx3 = _mm256_add_pd(fjx3,tx);
1948 fjy3 = _mm256_add_pd(fjy3,ty);
1949 fjz3 = _mm256_add_pd(fjz3,tz);
1951 /**************************
1952 * CALCULATE INTERACTIONS *
1953 **************************/
1955 r31 = _mm256_mul_pd(rsq31,rinv31);
1957 /* Calculate table index by multiplying r with table scale and truncate to integer */
1958 rt = _mm256_mul_pd(r31,vftabscale);
1959 vfitab = _mm256_cvttpd_epi32(rt);
1960 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1961 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1963 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1964 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1965 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1966 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1967 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1968 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1969 Heps = _mm256_mul_pd(vfeps,H);
1970 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1971 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1972 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
1976 /* Calculate temporary vectorial force */
1977 tx = _mm256_mul_pd(fscal,dx31);
1978 ty = _mm256_mul_pd(fscal,dy31);
1979 tz = _mm256_mul_pd(fscal,dz31);
1981 /* Update vectorial force */
1982 fix3 = _mm256_add_pd(fix3,tx);
1983 fiy3 = _mm256_add_pd(fiy3,ty);
1984 fiz3 = _mm256_add_pd(fiz3,tz);
1986 fjx1 = _mm256_add_pd(fjx1,tx);
1987 fjy1 = _mm256_add_pd(fjy1,ty);
1988 fjz1 = _mm256_add_pd(fjz1,tz);
1990 /**************************
1991 * CALCULATE INTERACTIONS *
1992 **************************/
1994 r32 = _mm256_mul_pd(rsq32,rinv32);
1996 /* Calculate table index by multiplying r with table scale and truncate to integer */
1997 rt = _mm256_mul_pd(r32,vftabscale);
1998 vfitab = _mm256_cvttpd_epi32(rt);
1999 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2000 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2002 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2003 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2004 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2005 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2006 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2007 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2008 Heps = _mm256_mul_pd(vfeps,H);
2009 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2010 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2011 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
2015 /* Calculate temporary vectorial force */
2016 tx = _mm256_mul_pd(fscal,dx32);
2017 ty = _mm256_mul_pd(fscal,dy32);
2018 tz = _mm256_mul_pd(fscal,dz32);
2020 /* Update vectorial force */
2021 fix3 = _mm256_add_pd(fix3,tx);
2022 fiy3 = _mm256_add_pd(fiy3,ty);
2023 fiz3 = _mm256_add_pd(fiz3,tz);
2025 fjx2 = _mm256_add_pd(fjx2,tx);
2026 fjy2 = _mm256_add_pd(fjy2,ty);
2027 fjz2 = _mm256_add_pd(fjz2,tz);
2029 /**************************
2030 * CALCULATE INTERACTIONS *
2031 **************************/
2033 r33 = _mm256_mul_pd(rsq33,rinv33);
2035 /* Calculate table index by multiplying r with table scale and truncate to integer */
2036 rt = _mm256_mul_pd(r33,vftabscale);
2037 vfitab = _mm256_cvttpd_epi32(rt);
2038 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2039 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2041 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2042 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2043 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2044 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2045 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2046 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2047 Heps = _mm256_mul_pd(vfeps,H);
2048 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2049 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2050 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
2054 /* Calculate temporary vectorial force */
2055 tx = _mm256_mul_pd(fscal,dx33);
2056 ty = _mm256_mul_pd(fscal,dy33);
2057 tz = _mm256_mul_pd(fscal,dz33);
2059 /* Update vectorial force */
2060 fix3 = _mm256_add_pd(fix3,tx);
2061 fiy3 = _mm256_add_pd(fiy3,ty);
2062 fiz3 = _mm256_add_pd(fiz3,tz);
2064 fjx3 = _mm256_add_pd(fjx3,tx);
2065 fjy3 = _mm256_add_pd(fjy3,ty);
2066 fjz3 = _mm256_add_pd(fjz3,tz);
2068 fjptrA = f+j_coord_offsetA;
2069 fjptrB = f+j_coord_offsetB;
2070 fjptrC = f+j_coord_offsetC;
2071 fjptrD = f+j_coord_offsetD;
2073 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2074 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2075 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2077 /* Inner loop uses 402 flops */
2080 if(jidx<j_index_end)
2083 /* Get j neighbor index, and coordinate index */
2084 jnrlistA = jjnr[jidx];
2085 jnrlistB = jjnr[jidx+1];
2086 jnrlistC = jjnr[jidx+2];
2087 jnrlistD = jjnr[jidx+3];
2088 /* Sign of each element will be negative for non-real atoms.
2089 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2090 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
2092 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2094 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2095 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2096 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2098 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2099 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2100 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2101 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2102 j_coord_offsetA = DIM*jnrA;
2103 j_coord_offsetB = DIM*jnrB;
2104 j_coord_offsetC = DIM*jnrC;
2105 j_coord_offsetD = DIM*jnrD;
2107 /* load j atom coordinates */
2108 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2109 x+j_coord_offsetC,x+j_coord_offsetD,
2110 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2111 &jy2,&jz2,&jx3,&jy3,&jz3);
2113 /* Calculate displacement vector */
2114 dx00 = _mm256_sub_pd(ix0,jx0);
2115 dy00 = _mm256_sub_pd(iy0,jy0);
2116 dz00 = _mm256_sub_pd(iz0,jz0);
2117 dx11 = _mm256_sub_pd(ix1,jx1);
2118 dy11 = _mm256_sub_pd(iy1,jy1);
2119 dz11 = _mm256_sub_pd(iz1,jz1);
2120 dx12 = _mm256_sub_pd(ix1,jx2);
2121 dy12 = _mm256_sub_pd(iy1,jy2);
2122 dz12 = _mm256_sub_pd(iz1,jz2);
2123 dx13 = _mm256_sub_pd(ix1,jx3);
2124 dy13 = _mm256_sub_pd(iy1,jy3);
2125 dz13 = _mm256_sub_pd(iz1,jz3);
2126 dx21 = _mm256_sub_pd(ix2,jx1);
2127 dy21 = _mm256_sub_pd(iy2,jy1);
2128 dz21 = _mm256_sub_pd(iz2,jz1);
2129 dx22 = _mm256_sub_pd(ix2,jx2);
2130 dy22 = _mm256_sub_pd(iy2,jy2);
2131 dz22 = _mm256_sub_pd(iz2,jz2);
2132 dx23 = _mm256_sub_pd(ix2,jx3);
2133 dy23 = _mm256_sub_pd(iy2,jy3);
2134 dz23 = _mm256_sub_pd(iz2,jz3);
2135 dx31 = _mm256_sub_pd(ix3,jx1);
2136 dy31 = _mm256_sub_pd(iy3,jy1);
2137 dz31 = _mm256_sub_pd(iz3,jz1);
2138 dx32 = _mm256_sub_pd(ix3,jx2);
2139 dy32 = _mm256_sub_pd(iy3,jy2);
2140 dz32 = _mm256_sub_pd(iz3,jz2);
2141 dx33 = _mm256_sub_pd(ix3,jx3);
2142 dy33 = _mm256_sub_pd(iy3,jy3);
2143 dz33 = _mm256_sub_pd(iz3,jz3);
2145 /* Calculate squared distance and things based on it */
2146 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2147 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2148 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2149 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2150 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2151 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2152 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2153 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2154 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2155 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2157 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2158 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2159 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2160 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2161 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2162 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2163 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2164 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2165 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2166 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2168 fjx0 = _mm256_setzero_pd();
2169 fjy0 = _mm256_setzero_pd();
2170 fjz0 = _mm256_setzero_pd();
2171 fjx1 = _mm256_setzero_pd();
2172 fjy1 = _mm256_setzero_pd();
2173 fjz1 = _mm256_setzero_pd();
2174 fjx2 = _mm256_setzero_pd();
2175 fjy2 = _mm256_setzero_pd();
2176 fjz2 = _mm256_setzero_pd();
2177 fjx3 = _mm256_setzero_pd();
2178 fjy3 = _mm256_setzero_pd();
2179 fjz3 = _mm256_setzero_pd();
2181 /**************************
2182 * CALCULATE INTERACTIONS *
2183 **************************/
2185 r00 = _mm256_mul_pd(rsq00,rinv00);
2186 r00 = _mm256_andnot_pd(dummy_mask,r00);
2188 /* Calculate table index by multiplying r with table scale and truncate to integer */
2189 rt = _mm256_mul_pd(r00,vftabscale);
2190 vfitab = _mm256_cvttpd_epi32(rt);
2191 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2192 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2194 /* CUBIC SPLINE TABLE DISPERSION */
2195 vfitab = _mm_add_epi32(vfitab,ifour);
2196 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2197 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2198 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2199 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2200 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2201 Heps = _mm256_mul_pd(vfeps,H);
2202 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2203 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2204 fvdw6 = _mm256_mul_pd(c6_00,FF);
2206 /* CUBIC SPLINE TABLE REPULSION */
2207 vfitab = _mm_add_epi32(vfitab,ifour);
2208 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2209 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2210 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2211 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2212 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2213 Heps = _mm256_mul_pd(vfeps,H);
2214 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2215 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2216 fvdw12 = _mm256_mul_pd(c12_00,FF);
2217 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2221 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2223 /* Calculate temporary vectorial force */
2224 tx = _mm256_mul_pd(fscal,dx00);
2225 ty = _mm256_mul_pd(fscal,dy00);
2226 tz = _mm256_mul_pd(fscal,dz00);
2228 /* Update vectorial force */
2229 fix0 = _mm256_add_pd(fix0,tx);
2230 fiy0 = _mm256_add_pd(fiy0,ty);
2231 fiz0 = _mm256_add_pd(fiz0,tz);
2233 fjx0 = _mm256_add_pd(fjx0,tx);
2234 fjy0 = _mm256_add_pd(fjy0,ty);
2235 fjz0 = _mm256_add_pd(fjz0,tz);
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2241 r11 = _mm256_mul_pd(rsq11,rinv11);
2242 r11 = _mm256_andnot_pd(dummy_mask,r11);
2244 /* Calculate table index by multiplying r with table scale and truncate to integer */
2245 rt = _mm256_mul_pd(r11,vftabscale);
2246 vfitab = _mm256_cvttpd_epi32(rt);
2247 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2248 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2250 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2251 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2252 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2253 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2254 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2255 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2256 Heps = _mm256_mul_pd(vfeps,H);
2257 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2258 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2259 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2263 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2265 /* Calculate temporary vectorial force */
2266 tx = _mm256_mul_pd(fscal,dx11);
2267 ty = _mm256_mul_pd(fscal,dy11);
2268 tz = _mm256_mul_pd(fscal,dz11);
2270 /* Update vectorial force */
2271 fix1 = _mm256_add_pd(fix1,tx);
2272 fiy1 = _mm256_add_pd(fiy1,ty);
2273 fiz1 = _mm256_add_pd(fiz1,tz);
2275 fjx1 = _mm256_add_pd(fjx1,tx);
2276 fjy1 = _mm256_add_pd(fjy1,ty);
2277 fjz1 = _mm256_add_pd(fjz1,tz);
2279 /**************************
2280 * CALCULATE INTERACTIONS *
2281 **************************/
2283 r12 = _mm256_mul_pd(rsq12,rinv12);
2284 r12 = _mm256_andnot_pd(dummy_mask,r12);
2286 /* Calculate table index by multiplying r with table scale and truncate to integer */
2287 rt = _mm256_mul_pd(r12,vftabscale);
2288 vfitab = _mm256_cvttpd_epi32(rt);
2289 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2290 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2292 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2293 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2294 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2295 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2296 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2297 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2298 Heps = _mm256_mul_pd(vfeps,H);
2299 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2300 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2301 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2305 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2307 /* Calculate temporary vectorial force */
2308 tx = _mm256_mul_pd(fscal,dx12);
2309 ty = _mm256_mul_pd(fscal,dy12);
2310 tz = _mm256_mul_pd(fscal,dz12);
2312 /* Update vectorial force */
2313 fix1 = _mm256_add_pd(fix1,tx);
2314 fiy1 = _mm256_add_pd(fiy1,ty);
2315 fiz1 = _mm256_add_pd(fiz1,tz);
2317 fjx2 = _mm256_add_pd(fjx2,tx);
2318 fjy2 = _mm256_add_pd(fjy2,ty);
2319 fjz2 = _mm256_add_pd(fjz2,tz);
2321 /**************************
2322 * CALCULATE INTERACTIONS *
2323 **************************/
2325 r13 = _mm256_mul_pd(rsq13,rinv13);
2326 r13 = _mm256_andnot_pd(dummy_mask,r13);
2328 /* Calculate table index by multiplying r with table scale and truncate to integer */
2329 rt = _mm256_mul_pd(r13,vftabscale);
2330 vfitab = _mm256_cvttpd_epi32(rt);
2331 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2332 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2334 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2335 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2336 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2337 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2338 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2339 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2340 Heps = _mm256_mul_pd(vfeps,H);
2341 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2342 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2343 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq13,FF),_mm256_mul_pd(vftabscale,rinv13)));
2347 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2349 /* Calculate temporary vectorial force */
2350 tx = _mm256_mul_pd(fscal,dx13);
2351 ty = _mm256_mul_pd(fscal,dy13);
2352 tz = _mm256_mul_pd(fscal,dz13);
2354 /* Update vectorial force */
2355 fix1 = _mm256_add_pd(fix1,tx);
2356 fiy1 = _mm256_add_pd(fiy1,ty);
2357 fiz1 = _mm256_add_pd(fiz1,tz);
2359 fjx3 = _mm256_add_pd(fjx3,tx);
2360 fjy3 = _mm256_add_pd(fjy3,ty);
2361 fjz3 = _mm256_add_pd(fjz3,tz);
2363 /**************************
2364 * CALCULATE INTERACTIONS *
2365 **************************/
2367 r21 = _mm256_mul_pd(rsq21,rinv21);
2368 r21 = _mm256_andnot_pd(dummy_mask,r21);
2370 /* Calculate table index by multiplying r with table scale and truncate to integer */
2371 rt = _mm256_mul_pd(r21,vftabscale);
2372 vfitab = _mm256_cvttpd_epi32(rt);
2373 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2374 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2376 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2377 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2378 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2379 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2380 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2381 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2382 Heps = _mm256_mul_pd(vfeps,H);
2383 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2384 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2385 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2389 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2391 /* Calculate temporary vectorial force */
2392 tx = _mm256_mul_pd(fscal,dx21);
2393 ty = _mm256_mul_pd(fscal,dy21);
2394 tz = _mm256_mul_pd(fscal,dz21);
2396 /* Update vectorial force */
2397 fix2 = _mm256_add_pd(fix2,tx);
2398 fiy2 = _mm256_add_pd(fiy2,ty);
2399 fiz2 = _mm256_add_pd(fiz2,tz);
2401 fjx1 = _mm256_add_pd(fjx1,tx);
2402 fjy1 = _mm256_add_pd(fjy1,ty);
2403 fjz1 = _mm256_add_pd(fjz1,tz);
2405 /**************************
2406 * CALCULATE INTERACTIONS *
2407 **************************/
2409 r22 = _mm256_mul_pd(rsq22,rinv22);
2410 r22 = _mm256_andnot_pd(dummy_mask,r22);
2412 /* Calculate table index by multiplying r with table scale and truncate to integer */
2413 rt = _mm256_mul_pd(r22,vftabscale);
2414 vfitab = _mm256_cvttpd_epi32(rt);
2415 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2416 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2418 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2419 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2420 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2421 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2422 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2423 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2424 Heps = _mm256_mul_pd(vfeps,H);
2425 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2426 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2427 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2431 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2433 /* Calculate temporary vectorial force */
2434 tx = _mm256_mul_pd(fscal,dx22);
2435 ty = _mm256_mul_pd(fscal,dy22);
2436 tz = _mm256_mul_pd(fscal,dz22);
2438 /* Update vectorial force */
2439 fix2 = _mm256_add_pd(fix2,tx);
2440 fiy2 = _mm256_add_pd(fiy2,ty);
2441 fiz2 = _mm256_add_pd(fiz2,tz);
2443 fjx2 = _mm256_add_pd(fjx2,tx);
2444 fjy2 = _mm256_add_pd(fjy2,ty);
2445 fjz2 = _mm256_add_pd(fjz2,tz);
2447 /**************************
2448 * CALCULATE INTERACTIONS *
2449 **************************/
2451 r23 = _mm256_mul_pd(rsq23,rinv23);
2452 r23 = _mm256_andnot_pd(dummy_mask,r23);
2454 /* Calculate table index by multiplying r with table scale and truncate to integer */
2455 rt = _mm256_mul_pd(r23,vftabscale);
2456 vfitab = _mm256_cvttpd_epi32(rt);
2457 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2458 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2460 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2461 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2462 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2463 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2464 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2465 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2466 Heps = _mm256_mul_pd(vfeps,H);
2467 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2468 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2469 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq23,FF),_mm256_mul_pd(vftabscale,rinv23)));
2473 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2475 /* Calculate temporary vectorial force */
2476 tx = _mm256_mul_pd(fscal,dx23);
2477 ty = _mm256_mul_pd(fscal,dy23);
2478 tz = _mm256_mul_pd(fscal,dz23);
2480 /* Update vectorial force */
2481 fix2 = _mm256_add_pd(fix2,tx);
2482 fiy2 = _mm256_add_pd(fiy2,ty);
2483 fiz2 = _mm256_add_pd(fiz2,tz);
2485 fjx3 = _mm256_add_pd(fjx3,tx);
2486 fjy3 = _mm256_add_pd(fjy3,ty);
2487 fjz3 = _mm256_add_pd(fjz3,tz);
2489 /**************************
2490 * CALCULATE INTERACTIONS *
2491 **************************/
2493 r31 = _mm256_mul_pd(rsq31,rinv31);
2494 r31 = _mm256_andnot_pd(dummy_mask,r31);
2496 /* Calculate table index by multiplying r with table scale and truncate to integer */
2497 rt = _mm256_mul_pd(r31,vftabscale);
2498 vfitab = _mm256_cvttpd_epi32(rt);
2499 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2500 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2502 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2503 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2504 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2505 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2506 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2507 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2508 Heps = _mm256_mul_pd(vfeps,H);
2509 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2510 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2511 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq31,FF),_mm256_mul_pd(vftabscale,rinv31)));
2515 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2517 /* Calculate temporary vectorial force */
2518 tx = _mm256_mul_pd(fscal,dx31);
2519 ty = _mm256_mul_pd(fscal,dy31);
2520 tz = _mm256_mul_pd(fscal,dz31);
2522 /* Update vectorial force */
2523 fix3 = _mm256_add_pd(fix3,tx);
2524 fiy3 = _mm256_add_pd(fiy3,ty);
2525 fiz3 = _mm256_add_pd(fiz3,tz);
2527 fjx1 = _mm256_add_pd(fjx1,tx);
2528 fjy1 = _mm256_add_pd(fjy1,ty);
2529 fjz1 = _mm256_add_pd(fjz1,tz);
2531 /**************************
2532 * CALCULATE INTERACTIONS *
2533 **************************/
2535 r32 = _mm256_mul_pd(rsq32,rinv32);
2536 r32 = _mm256_andnot_pd(dummy_mask,r32);
2538 /* Calculate table index by multiplying r with table scale and truncate to integer */
2539 rt = _mm256_mul_pd(r32,vftabscale);
2540 vfitab = _mm256_cvttpd_epi32(rt);
2541 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2542 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2544 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2545 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2546 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2547 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2548 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2549 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2550 Heps = _mm256_mul_pd(vfeps,H);
2551 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2552 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2553 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq32,FF),_mm256_mul_pd(vftabscale,rinv32)));
2557 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2559 /* Calculate temporary vectorial force */
2560 tx = _mm256_mul_pd(fscal,dx32);
2561 ty = _mm256_mul_pd(fscal,dy32);
2562 tz = _mm256_mul_pd(fscal,dz32);
2564 /* Update vectorial force */
2565 fix3 = _mm256_add_pd(fix3,tx);
2566 fiy3 = _mm256_add_pd(fiy3,ty);
2567 fiz3 = _mm256_add_pd(fiz3,tz);
2569 fjx2 = _mm256_add_pd(fjx2,tx);
2570 fjy2 = _mm256_add_pd(fjy2,ty);
2571 fjz2 = _mm256_add_pd(fjz2,tz);
2573 /**************************
2574 * CALCULATE INTERACTIONS *
2575 **************************/
2577 r33 = _mm256_mul_pd(rsq33,rinv33);
2578 r33 = _mm256_andnot_pd(dummy_mask,r33);
2580 /* Calculate table index by multiplying r with table scale and truncate to integer */
2581 rt = _mm256_mul_pd(r33,vftabscale);
2582 vfitab = _mm256_cvttpd_epi32(rt);
2583 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2584 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2586 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2587 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2588 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2589 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2590 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2591 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2592 Heps = _mm256_mul_pd(vfeps,H);
2593 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2594 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2595 felec = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq33,FF),_mm256_mul_pd(vftabscale,rinv33)));
2599 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2601 /* Calculate temporary vectorial force */
2602 tx = _mm256_mul_pd(fscal,dx33);
2603 ty = _mm256_mul_pd(fscal,dy33);
2604 tz = _mm256_mul_pd(fscal,dz33);
2606 /* Update vectorial force */
2607 fix3 = _mm256_add_pd(fix3,tx);
2608 fiy3 = _mm256_add_pd(fiy3,ty);
2609 fiz3 = _mm256_add_pd(fiz3,tz);
2611 fjx3 = _mm256_add_pd(fjx3,tx);
2612 fjy3 = _mm256_add_pd(fjy3,ty);
2613 fjz3 = _mm256_add_pd(fjz3,tz);
2615 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2616 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2617 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2618 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2620 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2621 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2622 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2624 /* Inner loop uses 412 flops */
2627 /* End of innermost loop */
2629 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2630 f+i_coord_offset,fshift+i_shift_offset);
2632 /* Increment number of inner iterations */
2633 inneriter += j_index_end - j_index_start;
2635 /* Outer loop uses 24 flops */
2638 /* Increment number of outer iterations */
2641 /* Update outer/inner flops */
2643 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);