2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the eight positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr0;
89 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90 real * vdwioffsetptr1;
91 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92 real * vdwioffsetptr2;
93 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
102 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
103 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
104 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
107 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
116 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
118 __m128i vfitab_lo,vfitab_hi;
119 __m128i ifour = _mm_set1_epi32(4);
120 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
122 __m256 dummy_mask,cutoff_mask;
123 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
124 __m256 one = _mm256_set1_ps(1.0);
125 __m256 two = _mm256_set1_ps(2.0);
131 jindex = nlist->jindex;
133 shiftidx = nlist->shift;
135 shiftvec = fr->shift_vec[0];
136 fshift = fr->fshift[0];
137 facel = _mm256_set1_ps(fr->epsfac);
138 charge = mdatoms->chargeA;
139 nvdwtype = fr->ntype;
141 vdwtype = mdatoms->typeA;
143 vftab = kernel_data->table_elec->data;
144 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
146 /* Setup water-specific parameters */
147 inr = nlist->iinr[0];
148 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
149 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
150 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
151 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
153 jq0 = _mm256_set1_ps(charge[inr+0]);
154 jq1 = _mm256_set1_ps(charge[inr+1]);
155 jq2 = _mm256_set1_ps(charge[inr+2]);
156 vdwjidx0A = 2*vdwtype[inr+0];
157 qq00 = _mm256_mul_ps(iq0,jq0);
158 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
159 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
160 qq01 = _mm256_mul_ps(iq0,jq1);
161 qq02 = _mm256_mul_ps(iq0,jq2);
162 qq10 = _mm256_mul_ps(iq1,jq0);
163 qq11 = _mm256_mul_ps(iq1,jq1);
164 qq12 = _mm256_mul_ps(iq1,jq2);
165 qq20 = _mm256_mul_ps(iq2,jq0);
166 qq21 = _mm256_mul_ps(iq2,jq1);
167 qq22 = _mm256_mul_ps(iq2,jq2);
169 /* Avoid stupid compiler warnings */
170 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
183 for(iidx=0;iidx<4*DIM;iidx++)
188 /* Start outer loop over neighborlists */
189 for(iidx=0; iidx<nri; iidx++)
191 /* Load shift vector for this list */
192 i_shift_offset = DIM*shiftidx[iidx];
194 /* Load limits for loop over neighbors */
195 j_index_start = jindex[iidx];
196 j_index_end = jindex[iidx+1];
198 /* Get outer coordinate index */
200 i_coord_offset = DIM*inr;
202 /* Load i particle coords and add shift vector */
203 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
204 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
206 fix0 = _mm256_setzero_ps();
207 fiy0 = _mm256_setzero_ps();
208 fiz0 = _mm256_setzero_ps();
209 fix1 = _mm256_setzero_ps();
210 fiy1 = _mm256_setzero_ps();
211 fiz1 = _mm256_setzero_ps();
212 fix2 = _mm256_setzero_ps();
213 fiy2 = _mm256_setzero_ps();
214 fiz2 = _mm256_setzero_ps();
216 /* Reset potential sums */
217 velecsum = _mm256_setzero_ps();
218 vvdwsum = _mm256_setzero_ps();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
224 /* Get j neighbor index, and coordinate index */
233 j_coord_offsetA = DIM*jnrA;
234 j_coord_offsetB = DIM*jnrB;
235 j_coord_offsetC = DIM*jnrC;
236 j_coord_offsetD = DIM*jnrD;
237 j_coord_offsetE = DIM*jnrE;
238 j_coord_offsetF = DIM*jnrF;
239 j_coord_offsetG = DIM*jnrG;
240 j_coord_offsetH = DIM*jnrH;
242 /* load j atom coordinates */
243 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
244 x+j_coord_offsetC,x+j_coord_offsetD,
245 x+j_coord_offsetE,x+j_coord_offsetF,
246 x+j_coord_offsetG,x+j_coord_offsetH,
247 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
249 /* Calculate displacement vector */
250 dx00 = _mm256_sub_ps(ix0,jx0);
251 dy00 = _mm256_sub_ps(iy0,jy0);
252 dz00 = _mm256_sub_ps(iz0,jz0);
253 dx01 = _mm256_sub_ps(ix0,jx1);
254 dy01 = _mm256_sub_ps(iy0,jy1);
255 dz01 = _mm256_sub_ps(iz0,jz1);
256 dx02 = _mm256_sub_ps(ix0,jx2);
257 dy02 = _mm256_sub_ps(iy0,jy2);
258 dz02 = _mm256_sub_ps(iz0,jz2);
259 dx10 = _mm256_sub_ps(ix1,jx0);
260 dy10 = _mm256_sub_ps(iy1,jy0);
261 dz10 = _mm256_sub_ps(iz1,jz0);
262 dx11 = _mm256_sub_ps(ix1,jx1);
263 dy11 = _mm256_sub_ps(iy1,jy1);
264 dz11 = _mm256_sub_ps(iz1,jz1);
265 dx12 = _mm256_sub_ps(ix1,jx2);
266 dy12 = _mm256_sub_ps(iy1,jy2);
267 dz12 = _mm256_sub_ps(iz1,jz2);
268 dx20 = _mm256_sub_ps(ix2,jx0);
269 dy20 = _mm256_sub_ps(iy2,jy0);
270 dz20 = _mm256_sub_ps(iz2,jz0);
271 dx21 = _mm256_sub_ps(ix2,jx1);
272 dy21 = _mm256_sub_ps(iy2,jy1);
273 dz21 = _mm256_sub_ps(iz2,jz1);
274 dx22 = _mm256_sub_ps(ix2,jx2);
275 dy22 = _mm256_sub_ps(iy2,jy2);
276 dz22 = _mm256_sub_ps(iz2,jz2);
278 /* Calculate squared distance and things based on it */
279 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
280 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
281 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
282 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
283 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
284 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
285 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
286 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
287 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
289 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
290 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
291 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
292 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
293 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
294 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
295 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
296 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
297 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
299 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
301 fjx0 = _mm256_setzero_ps();
302 fjy0 = _mm256_setzero_ps();
303 fjz0 = _mm256_setzero_ps();
304 fjx1 = _mm256_setzero_ps();
305 fjy1 = _mm256_setzero_ps();
306 fjz1 = _mm256_setzero_ps();
307 fjx2 = _mm256_setzero_ps();
308 fjy2 = _mm256_setzero_ps();
309 fjz2 = _mm256_setzero_ps();
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 r00 = _mm256_mul_ps(rsq00,rinv00);
317 /* Calculate table index by multiplying r with table scale and truncate to integer */
318 rt = _mm256_mul_ps(r00,vftabscale);
319 vfitab = _mm256_cvttps_epi32(rt);
320 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
321 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
322 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
323 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
324 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
325 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
327 /* CUBIC SPLINE TABLE ELECTROSTATICS */
328 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
329 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
330 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
331 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
332 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
333 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
334 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
335 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
336 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
337 Heps = _mm256_mul_ps(vfeps,H);
338 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
339 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
340 velec = _mm256_mul_ps(qq00,VV);
341 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
342 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
344 /* LENNARD-JONES DISPERSION/REPULSION */
346 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
347 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
348 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
349 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
350 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 velecsum = _mm256_add_ps(velecsum,velec);
354 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
356 fscal = _mm256_add_ps(felec,fvdw);
358 /* Calculate temporary vectorial force */
359 tx = _mm256_mul_ps(fscal,dx00);
360 ty = _mm256_mul_ps(fscal,dy00);
361 tz = _mm256_mul_ps(fscal,dz00);
363 /* Update vectorial force */
364 fix0 = _mm256_add_ps(fix0,tx);
365 fiy0 = _mm256_add_ps(fiy0,ty);
366 fiz0 = _mm256_add_ps(fiz0,tz);
368 fjx0 = _mm256_add_ps(fjx0,tx);
369 fjy0 = _mm256_add_ps(fjy0,ty);
370 fjz0 = _mm256_add_ps(fjz0,tz);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 r01 = _mm256_mul_ps(rsq01,rinv01);
378 /* Calculate table index by multiplying r with table scale and truncate to integer */
379 rt = _mm256_mul_ps(r01,vftabscale);
380 vfitab = _mm256_cvttps_epi32(rt);
381 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
382 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
383 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
384 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
385 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
386 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
388 /* CUBIC SPLINE TABLE ELECTROSTATICS */
389 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
390 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
391 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
392 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
393 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
394 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
395 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
396 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
397 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
398 Heps = _mm256_mul_ps(vfeps,H);
399 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
400 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
401 velec = _mm256_mul_ps(qq01,VV);
402 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
403 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velecsum = _mm256_add_ps(velecsum,velec);
410 /* Calculate temporary vectorial force */
411 tx = _mm256_mul_ps(fscal,dx01);
412 ty = _mm256_mul_ps(fscal,dy01);
413 tz = _mm256_mul_ps(fscal,dz01);
415 /* Update vectorial force */
416 fix0 = _mm256_add_ps(fix0,tx);
417 fiy0 = _mm256_add_ps(fiy0,ty);
418 fiz0 = _mm256_add_ps(fiz0,tz);
420 fjx1 = _mm256_add_ps(fjx1,tx);
421 fjy1 = _mm256_add_ps(fjy1,ty);
422 fjz1 = _mm256_add_ps(fjz1,tz);
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 r02 = _mm256_mul_ps(rsq02,rinv02);
430 /* Calculate table index by multiplying r with table scale and truncate to integer */
431 rt = _mm256_mul_ps(r02,vftabscale);
432 vfitab = _mm256_cvttps_epi32(rt);
433 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
434 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
435 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
436 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
437 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
438 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
440 /* CUBIC SPLINE TABLE ELECTROSTATICS */
441 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
442 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
443 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
444 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
445 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
446 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
447 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
448 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
449 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
450 Heps = _mm256_mul_ps(vfeps,H);
451 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
452 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
453 velec = _mm256_mul_ps(qq02,VV);
454 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
455 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velecsum = _mm256_add_ps(velecsum,velec);
462 /* Calculate temporary vectorial force */
463 tx = _mm256_mul_ps(fscal,dx02);
464 ty = _mm256_mul_ps(fscal,dy02);
465 tz = _mm256_mul_ps(fscal,dz02);
467 /* Update vectorial force */
468 fix0 = _mm256_add_ps(fix0,tx);
469 fiy0 = _mm256_add_ps(fiy0,ty);
470 fiz0 = _mm256_add_ps(fiz0,tz);
472 fjx2 = _mm256_add_ps(fjx2,tx);
473 fjy2 = _mm256_add_ps(fjy2,ty);
474 fjz2 = _mm256_add_ps(fjz2,tz);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 r10 = _mm256_mul_ps(rsq10,rinv10);
482 /* Calculate table index by multiplying r with table scale and truncate to integer */
483 rt = _mm256_mul_ps(r10,vftabscale);
484 vfitab = _mm256_cvttps_epi32(rt);
485 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
486 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
487 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
488 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
489 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
490 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
492 /* CUBIC SPLINE TABLE ELECTROSTATICS */
493 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
494 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
495 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
496 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
497 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
498 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
499 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
500 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
501 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
502 Heps = _mm256_mul_ps(vfeps,H);
503 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
504 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
505 velec = _mm256_mul_ps(qq10,VV);
506 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
507 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velecsum = _mm256_add_ps(velecsum,velec);
514 /* Calculate temporary vectorial force */
515 tx = _mm256_mul_ps(fscal,dx10);
516 ty = _mm256_mul_ps(fscal,dy10);
517 tz = _mm256_mul_ps(fscal,dz10);
519 /* Update vectorial force */
520 fix1 = _mm256_add_ps(fix1,tx);
521 fiy1 = _mm256_add_ps(fiy1,ty);
522 fiz1 = _mm256_add_ps(fiz1,tz);
524 fjx0 = _mm256_add_ps(fjx0,tx);
525 fjy0 = _mm256_add_ps(fjy0,ty);
526 fjz0 = _mm256_add_ps(fjz0,tz);
528 /**************************
529 * CALCULATE INTERACTIONS *
530 **************************/
532 r11 = _mm256_mul_ps(rsq11,rinv11);
534 /* Calculate table index by multiplying r with table scale and truncate to integer */
535 rt = _mm256_mul_ps(r11,vftabscale);
536 vfitab = _mm256_cvttps_epi32(rt);
537 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
538 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
539 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
540 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
541 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
542 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
544 /* CUBIC SPLINE TABLE ELECTROSTATICS */
545 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
546 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
547 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
548 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
549 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
550 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
551 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
552 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
553 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
554 Heps = _mm256_mul_ps(vfeps,H);
555 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
556 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
557 velec = _mm256_mul_ps(qq11,VV);
558 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
559 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
561 /* Update potential sum for this i atom from the interaction with this j atom. */
562 velecsum = _mm256_add_ps(velecsum,velec);
566 /* Calculate temporary vectorial force */
567 tx = _mm256_mul_ps(fscal,dx11);
568 ty = _mm256_mul_ps(fscal,dy11);
569 tz = _mm256_mul_ps(fscal,dz11);
571 /* Update vectorial force */
572 fix1 = _mm256_add_ps(fix1,tx);
573 fiy1 = _mm256_add_ps(fiy1,ty);
574 fiz1 = _mm256_add_ps(fiz1,tz);
576 fjx1 = _mm256_add_ps(fjx1,tx);
577 fjy1 = _mm256_add_ps(fjy1,ty);
578 fjz1 = _mm256_add_ps(fjz1,tz);
580 /**************************
581 * CALCULATE INTERACTIONS *
582 **************************/
584 r12 = _mm256_mul_ps(rsq12,rinv12);
586 /* Calculate table index by multiplying r with table scale and truncate to integer */
587 rt = _mm256_mul_ps(r12,vftabscale);
588 vfitab = _mm256_cvttps_epi32(rt);
589 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
590 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
591 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
592 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
593 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
594 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
596 /* CUBIC SPLINE TABLE ELECTROSTATICS */
597 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
598 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
599 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
600 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
601 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
602 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
603 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
604 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
605 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
606 Heps = _mm256_mul_ps(vfeps,H);
607 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
608 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
609 velec = _mm256_mul_ps(qq12,VV);
610 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
611 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
613 /* Update potential sum for this i atom from the interaction with this j atom. */
614 velecsum = _mm256_add_ps(velecsum,velec);
618 /* Calculate temporary vectorial force */
619 tx = _mm256_mul_ps(fscal,dx12);
620 ty = _mm256_mul_ps(fscal,dy12);
621 tz = _mm256_mul_ps(fscal,dz12);
623 /* Update vectorial force */
624 fix1 = _mm256_add_ps(fix1,tx);
625 fiy1 = _mm256_add_ps(fiy1,ty);
626 fiz1 = _mm256_add_ps(fiz1,tz);
628 fjx2 = _mm256_add_ps(fjx2,tx);
629 fjy2 = _mm256_add_ps(fjy2,ty);
630 fjz2 = _mm256_add_ps(fjz2,tz);
632 /**************************
633 * CALCULATE INTERACTIONS *
634 **************************/
636 r20 = _mm256_mul_ps(rsq20,rinv20);
638 /* Calculate table index by multiplying r with table scale and truncate to integer */
639 rt = _mm256_mul_ps(r20,vftabscale);
640 vfitab = _mm256_cvttps_epi32(rt);
641 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
642 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
643 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
644 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
645 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
646 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
648 /* CUBIC SPLINE TABLE ELECTROSTATICS */
649 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
650 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
651 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
652 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
653 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
654 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
655 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
656 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
657 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
658 Heps = _mm256_mul_ps(vfeps,H);
659 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
660 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
661 velec = _mm256_mul_ps(qq20,VV);
662 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
663 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
665 /* Update potential sum for this i atom from the interaction with this j atom. */
666 velecsum = _mm256_add_ps(velecsum,velec);
670 /* Calculate temporary vectorial force */
671 tx = _mm256_mul_ps(fscal,dx20);
672 ty = _mm256_mul_ps(fscal,dy20);
673 tz = _mm256_mul_ps(fscal,dz20);
675 /* Update vectorial force */
676 fix2 = _mm256_add_ps(fix2,tx);
677 fiy2 = _mm256_add_ps(fiy2,ty);
678 fiz2 = _mm256_add_ps(fiz2,tz);
680 fjx0 = _mm256_add_ps(fjx0,tx);
681 fjy0 = _mm256_add_ps(fjy0,ty);
682 fjz0 = _mm256_add_ps(fjz0,tz);
684 /**************************
685 * CALCULATE INTERACTIONS *
686 **************************/
688 r21 = _mm256_mul_ps(rsq21,rinv21);
690 /* Calculate table index by multiplying r with table scale and truncate to integer */
691 rt = _mm256_mul_ps(r21,vftabscale);
692 vfitab = _mm256_cvttps_epi32(rt);
693 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
694 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
695 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
696 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
697 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
698 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
700 /* CUBIC SPLINE TABLE ELECTROSTATICS */
701 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
702 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
703 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
704 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
705 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
706 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
707 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
708 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
709 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
710 Heps = _mm256_mul_ps(vfeps,H);
711 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
712 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
713 velec = _mm256_mul_ps(qq21,VV);
714 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
715 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
717 /* Update potential sum for this i atom from the interaction with this j atom. */
718 velecsum = _mm256_add_ps(velecsum,velec);
722 /* Calculate temporary vectorial force */
723 tx = _mm256_mul_ps(fscal,dx21);
724 ty = _mm256_mul_ps(fscal,dy21);
725 tz = _mm256_mul_ps(fscal,dz21);
727 /* Update vectorial force */
728 fix2 = _mm256_add_ps(fix2,tx);
729 fiy2 = _mm256_add_ps(fiy2,ty);
730 fiz2 = _mm256_add_ps(fiz2,tz);
732 fjx1 = _mm256_add_ps(fjx1,tx);
733 fjy1 = _mm256_add_ps(fjy1,ty);
734 fjz1 = _mm256_add_ps(fjz1,tz);
736 /**************************
737 * CALCULATE INTERACTIONS *
738 **************************/
740 r22 = _mm256_mul_ps(rsq22,rinv22);
742 /* Calculate table index by multiplying r with table scale and truncate to integer */
743 rt = _mm256_mul_ps(r22,vftabscale);
744 vfitab = _mm256_cvttps_epi32(rt);
745 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
746 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
747 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
748 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
749 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
750 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
752 /* CUBIC SPLINE TABLE ELECTROSTATICS */
753 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
754 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
755 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
756 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
757 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
758 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
759 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
760 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
761 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
762 Heps = _mm256_mul_ps(vfeps,H);
763 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
764 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
765 velec = _mm256_mul_ps(qq22,VV);
766 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
767 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
769 /* Update potential sum for this i atom from the interaction with this j atom. */
770 velecsum = _mm256_add_ps(velecsum,velec);
774 /* Calculate temporary vectorial force */
775 tx = _mm256_mul_ps(fscal,dx22);
776 ty = _mm256_mul_ps(fscal,dy22);
777 tz = _mm256_mul_ps(fscal,dz22);
779 /* Update vectorial force */
780 fix2 = _mm256_add_ps(fix2,tx);
781 fiy2 = _mm256_add_ps(fiy2,ty);
782 fiz2 = _mm256_add_ps(fiz2,tz);
784 fjx2 = _mm256_add_ps(fjx2,tx);
785 fjy2 = _mm256_add_ps(fjy2,ty);
786 fjz2 = _mm256_add_ps(fjz2,tz);
788 fjptrA = f+j_coord_offsetA;
789 fjptrB = f+j_coord_offsetB;
790 fjptrC = f+j_coord_offsetC;
791 fjptrD = f+j_coord_offsetD;
792 fjptrE = f+j_coord_offsetE;
793 fjptrF = f+j_coord_offsetF;
794 fjptrG = f+j_coord_offsetG;
795 fjptrH = f+j_coord_offsetH;
797 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
798 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
800 /* Inner loop uses 400 flops */
806 /* Get j neighbor index, and coordinate index */
807 jnrlistA = jjnr[jidx];
808 jnrlistB = jjnr[jidx+1];
809 jnrlistC = jjnr[jidx+2];
810 jnrlistD = jjnr[jidx+3];
811 jnrlistE = jjnr[jidx+4];
812 jnrlistF = jjnr[jidx+5];
813 jnrlistG = jjnr[jidx+6];
814 jnrlistH = jjnr[jidx+7];
815 /* Sign of each element will be negative for non-real atoms.
816 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
817 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
819 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
820 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
822 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
823 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
824 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
825 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
826 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
827 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
828 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
829 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
830 j_coord_offsetA = DIM*jnrA;
831 j_coord_offsetB = DIM*jnrB;
832 j_coord_offsetC = DIM*jnrC;
833 j_coord_offsetD = DIM*jnrD;
834 j_coord_offsetE = DIM*jnrE;
835 j_coord_offsetF = DIM*jnrF;
836 j_coord_offsetG = DIM*jnrG;
837 j_coord_offsetH = DIM*jnrH;
839 /* load j atom coordinates */
840 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
841 x+j_coord_offsetC,x+j_coord_offsetD,
842 x+j_coord_offsetE,x+j_coord_offsetF,
843 x+j_coord_offsetG,x+j_coord_offsetH,
844 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
846 /* Calculate displacement vector */
847 dx00 = _mm256_sub_ps(ix0,jx0);
848 dy00 = _mm256_sub_ps(iy0,jy0);
849 dz00 = _mm256_sub_ps(iz0,jz0);
850 dx01 = _mm256_sub_ps(ix0,jx1);
851 dy01 = _mm256_sub_ps(iy0,jy1);
852 dz01 = _mm256_sub_ps(iz0,jz1);
853 dx02 = _mm256_sub_ps(ix0,jx2);
854 dy02 = _mm256_sub_ps(iy0,jy2);
855 dz02 = _mm256_sub_ps(iz0,jz2);
856 dx10 = _mm256_sub_ps(ix1,jx0);
857 dy10 = _mm256_sub_ps(iy1,jy0);
858 dz10 = _mm256_sub_ps(iz1,jz0);
859 dx11 = _mm256_sub_ps(ix1,jx1);
860 dy11 = _mm256_sub_ps(iy1,jy1);
861 dz11 = _mm256_sub_ps(iz1,jz1);
862 dx12 = _mm256_sub_ps(ix1,jx2);
863 dy12 = _mm256_sub_ps(iy1,jy2);
864 dz12 = _mm256_sub_ps(iz1,jz2);
865 dx20 = _mm256_sub_ps(ix2,jx0);
866 dy20 = _mm256_sub_ps(iy2,jy0);
867 dz20 = _mm256_sub_ps(iz2,jz0);
868 dx21 = _mm256_sub_ps(ix2,jx1);
869 dy21 = _mm256_sub_ps(iy2,jy1);
870 dz21 = _mm256_sub_ps(iz2,jz1);
871 dx22 = _mm256_sub_ps(ix2,jx2);
872 dy22 = _mm256_sub_ps(iy2,jy2);
873 dz22 = _mm256_sub_ps(iz2,jz2);
875 /* Calculate squared distance and things based on it */
876 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
877 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
878 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
879 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
880 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
881 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
882 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
883 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
884 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
886 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
887 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
888 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
889 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
890 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
891 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
892 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
893 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
894 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
896 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
898 fjx0 = _mm256_setzero_ps();
899 fjy0 = _mm256_setzero_ps();
900 fjz0 = _mm256_setzero_ps();
901 fjx1 = _mm256_setzero_ps();
902 fjy1 = _mm256_setzero_ps();
903 fjz1 = _mm256_setzero_ps();
904 fjx2 = _mm256_setzero_ps();
905 fjy2 = _mm256_setzero_ps();
906 fjz2 = _mm256_setzero_ps();
908 /**************************
909 * CALCULATE INTERACTIONS *
910 **************************/
912 r00 = _mm256_mul_ps(rsq00,rinv00);
913 r00 = _mm256_andnot_ps(dummy_mask,r00);
915 /* Calculate table index by multiplying r with table scale and truncate to integer */
916 rt = _mm256_mul_ps(r00,vftabscale);
917 vfitab = _mm256_cvttps_epi32(rt);
918 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
919 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
920 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
921 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
922 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
923 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
925 /* CUBIC SPLINE TABLE ELECTROSTATICS */
926 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
927 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
928 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
929 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
930 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
931 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
932 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
933 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
934 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
935 Heps = _mm256_mul_ps(vfeps,H);
936 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
937 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
938 velec = _mm256_mul_ps(qq00,VV);
939 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
940 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
942 /* LENNARD-JONES DISPERSION/REPULSION */
944 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
945 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
946 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
947 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
948 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
950 /* Update potential sum for this i atom from the interaction with this j atom. */
951 velec = _mm256_andnot_ps(dummy_mask,velec);
952 velecsum = _mm256_add_ps(velecsum,velec);
953 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
954 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
956 fscal = _mm256_add_ps(felec,fvdw);
958 fscal = _mm256_andnot_ps(dummy_mask,fscal);
960 /* Calculate temporary vectorial force */
961 tx = _mm256_mul_ps(fscal,dx00);
962 ty = _mm256_mul_ps(fscal,dy00);
963 tz = _mm256_mul_ps(fscal,dz00);
965 /* Update vectorial force */
966 fix0 = _mm256_add_ps(fix0,tx);
967 fiy0 = _mm256_add_ps(fiy0,ty);
968 fiz0 = _mm256_add_ps(fiz0,tz);
970 fjx0 = _mm256_add_ps(fjx0,tx);
971 fjy0 = _mm256_add_ps(fjy0,ty);
972 fjz0 = _mm256_add_ps(fjz0,tz);
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 r01 = _mm256_mul_ps(rsq01,rinv01);
979 r01 = _mm256_andnot_ps(dummy_mask,r01);
981 /* Calculate table index by multiplying r with table scale and truncate to integer */
982 rt = _mm256_mul_ps(r01,vftabscale);
983 vfitab = _mm256_cvttps_epi32(rt);
984 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
985 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
986 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
987 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
988 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
989 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
991 /* CUBIC SPLINE TABLE ELECTROSTATICS */
992 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
993 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
994 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
995 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
996 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
997 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
998 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
999 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1000 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1001 Heps = _mm256_mul_ps(vfeps,H);
1002 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1003 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1004 velec = _mm256_mul_ps(qq01,VV);
1005 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1006 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1008 /* Update potential sum for this i atom from the interaction with this j atom. */
1009 velec = _mm256_andnot_ps(dummy_mask,velec);
1010 velecsum = _mm256_add_ps(velecsum,velec);
1014 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1016 /* Calculate temporary vectorial force */
1017 tx = _mm256_mul_ps(fscal,dx01);
1018 ty = _mm256_mul_ps(fscal,dy01);
1019 tz = _mm256_mul_ps(fscal,dz01);
1021 /* Update vectorial force */
1022 fix0 = _mm256_add_ps(fix0,tx);
1023 fiy0 = _mm256_add_ps(fiy0,ty);
1024 fiz0 = _mm256_add_ps(fiz0,tz);
1026 fjx1 = _mm256_add_ps(fjx1,tx);
1027 fjy1 = _mm256_add_ps(fjy1,ty);
1028 fjz1 = _mm256_add_ps(fjz1,tz);
1030 /**************************
1031 * CALCULATE INTERACTIONS *
1032 **************************/
1034 r02 = _mm256_mul_ps(rsq02,rinv02);
1035 r02 = _mm256_andnot_ps(dummy_mask,r02);
1037 /* Calculate table index by multiplying r with table scale and truncate to integer */
1038 rt = _mm256_mul_ps(r02,vftabscale);
1039 vfitab = _mm256_cvttps_epi32(rt);
1040 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1041 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1042 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1043 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1044 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1045 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1047 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1048 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1049 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1050 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1051 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1052 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1053 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1054 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1055 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1056 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1057 Heps = _mm256_mul_ps(vfeps,H);
1058 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1059 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1060 velec = _mm256_mul_ps(qq02,VV);
1061 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1062 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1064 /* Update potential sum for this i atom from the interaction with this j atom. */
1065 velec = _mm256_andnot_ps(dummy_mask,velec);
1066 velecsum = _mm256_add_ps(velecsum,velec);
1070 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1072 /* Calculate temporary vectorial force */
1073 tx = _mm256_mul_ps(fscal,dx02);
1074 ty = _mm256_mul_ps(fscal,dy02);
1075 tz = _mm256_mul_ps(fscal,dz02);
1077 /* Update vectorial force */
1078 fix0 = _mm256_add_ps(fix0,tx);
1079 fiy0 = _mm256_add_ps(fiy0,ty);
1080 fiz0 = _mm256_add_ps(fiz0,tz);
1082 fjx2 = _mm256_add_ps(fjx2,tx);
1083 fjy2 = _mm256_add_ps(fjy2,ty);
1084 fjz2 = _mm256_add_ps(fjz2,tz);
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 r10 = _mm256_mul_ps(rsq10,rinv10);
1091 r10 = _mm256_andnot_ps(dummy_mask,r10);
1093 /* Calculate table index by multiplying r with table scale and truncate to integer */
1094 rt = _mm256_mul_ps(r10,vftabscale);
1095 vfitab = _mm256_cvttps_epi32(rt);
1096 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1097 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1098 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1099 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1100 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1101 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1103 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1104 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1105 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1106 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1107 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1108 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1109 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1110 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1111 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1112 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1113 Heps = _mm256_mul_ps(vfeps,H);
1114 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1115 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1116 velec = _mm256_mul_ps(qq10,VV);
1117 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1118 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1120 /* Update potential sum for this i atom from the interaction with this j atom. */
1121 velec = _mm256_andnot_ps(dummy_mask,velec);
1122 velecsum = _mm256_add_ps(velecsum,velec);
1126 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1128 /* Calculate temporary vectorial force */
1129 tx = _mm256_mul_ps(fscal,dx10);
1130 ty = _mm256_mul_ps(fscal,dy10);
1131 tz = _mm256_mul_ps(fscal,dz10);
1133 /* Update vectorial force */
1134 fix1 = _mm256_add_ps(fix1,tx);
1135 fiy1 = _mm256_add_ps(fiy1,ty);
1136 fiz1 = _mm256_add_ps(fiz1,tz);
1138 fjx0 = _mm256_add_ps(fjx0,tx);
1139 fjy0 = _mm256_add_ps(fjy0,ty);
1140 fjz0 = _mm256_add_ps(fjz0,tz);
1142 /**************************
1143 * CALCULATE INTERACTIONS *
1144 **************************/
1146 r11 = _mm256_mul_ps(rsq11,rinv11);
1147 r11 = _mm256_andnot_ps(dummy_mask,r11);
1149 /* Calculate table index by multiplying r with table scale and truncate to integer */
1150 rt = _mm256_mul_ps(r11,vftabscale);
1151 vfitab = _mm256_cvttps_epi32(rt);
1152 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1153 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1154 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1155 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1156 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1157 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1159 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1160 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1161 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1162 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1163 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1164 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1165 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1166 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1167 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1168 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1169 Heps = _mm256_mul_ps(vfeps,H);
1170 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1171 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1172 velec = _mm256_mul_ps(qq11,VV);
1173 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1174 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1176 /* Update potential sum for this i atom from the interaction with this j atom. */
1177 velec = _mm256_andnot_ps(dummy_mask,velec);
1178 velecsum = _mm256_add_ps(velecsum,velec);
1182 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1184 /* Calculate temporary vectorial force */
1185 tx = _mm256_mul_ps(fscal,dx11);
1186 ty = _mm256_mul_ps(fscal,dy11);
1187 tz = _mm256_mul_ps(fscal,dz11);
1189 /* Update vectorial force */
1190 fix1 = _mm256_add_ps(fix1,tx);
1191 fiy1 = _mm256_add_ps(fiy1,ty);
1192 fiz1 = _mm256_add_ps(fiz1,tz);
1194 fjx1 = _mm256_add_ps(fjx1,tx);
1195 fjy1 = _mm256_add_ps(fjy1,ty);
1196 fjz1 = _mm256_add_ps(fjz1,tz);
1198 /**************************
1199 * CALCULATE INTERACTIONS *
1200 **************************/
1202 r12 = _mm256_mul_ps(rsq12,rinv12);
1203 r12 = _mm256_andnot_ps(dummy_mask,r12);
1205 /* Calculate table index by multiplying r with table scale and truncate to integer */
1206 rt = _mm256_mul_ps(r12,vftabscale);
1207 vfitab = _mm256_cvttps_epi32(rt);
1208 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1209 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1210 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1211 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1212 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1213 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1215 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1216 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1217 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1218 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1219 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1220 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1221 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1222 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1223 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1224 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1225 Heps = _mm256_mul_ps(vfeps,H);
1226 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1227 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1228 velec = _mm256_mul_ps(qq12,VV);
1229 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1230 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1232 /* Update potential sum for this i atom from the interaction with this j atom. */
1233 velec = _mm256_andnot_ps(dummy_mask,velec);
1234 velecsum = _mm256_add_ps(velecsum,velec);
1238 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1240 /* Calculate temporary vectorial force */
1241 tx = _mm256_mul_ps(fscal,dx12);
1242 ty = _mm256_mul_ps(fscal,dy12);
1243 tz = _mm256_mul_ps(fscal,dz12);
1245 /* Update vectorial force */
1246 fix1 = _mm256_add_ps(fix1,tx);
1247 fiy1 = _mm256_add_ps(fiy1,ty);
1248 fiz1 = _mm256_add_ps(fiz1,tz);
1250 fjx2 = _mm256_add_ps(fjx2,tx);
1251 fjy2 = _mm256_add_ps(fjy2,ty);
1252 fjz2 = _mm256_add_ps(fjz2,tz);
1254 /**************************
1255 * CALCULATE INTERACTIONS *
1256 **************************/
1258 r20 = _mm256_mul_ps(rsq20,rinv20);
1259 r20 = _mm256_andnot_ps(dummy_mask,r20);
1261 /* Calculate table index by multiplying r with table scale and truncate to integer */
1262 rt = _mm256_mul_ps(r20,vftabscale);
1263 vfitab = _mm256_cvttps_epi32(rt);
1264 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1265 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1266 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1267 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1268 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1269 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1271 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1272 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1273 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1274 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1275 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1276 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1277 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1278 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1279 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1280 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1281 Heps = _mm256_mul_ps(vfeps,H);
1282 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1283 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1284 velec = _mm256_mul_ps(qq20,VV);
1285 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1286 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1288 /* Update potential sum for this i atom from the interaction with this j atom. */
1289 velec = _mm256_andnot_ps(dummy_mask,velec);
1290 velecsum = _mm256_add_ps(velecsum,velec);
1294 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1296 /* Calculate temporary vectorial force */
1297 tx = _mm256_mul_ps(fscal,dx20);
1298 ty = _mm256_mul_ps(fscal,dy20);
1299 tz = _mm256_mul_ps(fscal,dz20);
1301 /* Update vectorial force */
1302 fix2 = _mm256_add_ps(fix2,tx);
1303 fiy2 = _mm256_add_ps(fiy2,ty);
1304 fiz2 = _mm256_add_ps(fiz2,tz);
1306 fjx0 = _mm256_add_ps(fjx0,tx);
1307 fjy0 = _mm256_add_ps(fjy0,ty);
1308 fjz0 = _mm256_add_ps(fjz0,tz);
1310 /**************************
1311 * CALCULATE INTERACTIONS *
1312 **************************/
1314 r21 = _mm256_mul_ps(rsq21,rinv21);
1315 r21 = _mm256_andnot_ps(dummy_mask,r21);
1317 /* Calculate table index by multiplying r with table scale and truncate to integer */
1318 rt = _mm256_mul_ps(r21,vftabscale);
1319 vfitab = _mm256_cvttps_epi32(rt);
1320 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1321 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1322 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1323 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1324 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1325 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1327 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1328 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1329 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1330 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1331 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1332 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1333 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1334 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1335 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1336 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1337 Heps = _mm256_mul_ps(vfeps,H);
1338 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1339 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1340 velec = _mm256_mul_ps(qq21,VV);
1341 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1342 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1344 /* Update potential sum for this i atom from the interaction with this j atom. */
1345 velec = _mm256_andnot_ps(dummy_mask,velec);
1346 velecsum = _mm256_add_ps(velecsum,velec);
1350 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1352 /* Calculate temporary vectorial force */
1353 tx = _mm256_mul_ps(fscal,dx21);
1354 ty = _mm256_mul_ps(fscal,dy21);
1355 tz = _mm256_mul_ps(fscal,dz21);
1357 /* Update vectorial force */
1358 fix2 = _mm256_add_ps(fix2,tx);
1359 fiy2 = _mm256_add_ps(fiy2,ty);
1360 fiz2 = _mm256_add_ps(fiz2,tz);
1362 fjx1 = _mm256_add_ps(fjx1,tx);
1363 fjy1 = _mm256_add_ps(fjy1,ty);
1364 fjz1 = _mm256_add_ps(fjz1,tz);
1366 /**************************
1367 * CALCULATE INTERACTIONS *
1368 **************************/
1370 r22 = _mm256_mul_ps(rsq22,rinv22);
1371 r22 = _mm256_andnot_ps(dummy_mask,r22);
1373 /* Calculate table index by multiplying r with table scale and truncate to integer */
1374 rt = _mm256_mul_ps(r22,vftabscale);
1375 vfitab = _mm256_cvttps_epi32(rt);
1376 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1377 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1378 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1379 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1380 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1381 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1383 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1384 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1385 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1386 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1387 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1388 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1389 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1390 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1391 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1392 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1393 Heps = _mm256_mul_ps(vfeps,H);
1394 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1395 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1396 velec = _mm256_mul_ps(qq22,VV);
1397 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1398 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1400 /* Update potential sum for this i atom from the interaction with this j atom. */
1401 velec = _mm256_andnot_ps(dummy_mask,velec);
1402 velecsum = _mm256_add_ps(velecsum,velec);
1406 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1408 /* Calculate temporary vectorial force */
1409 tx = _mm256_mul_ps(fscal,dx22);
1410 ty = _mm256_mul_ps(fscal,dy22);
1411 tz = _mm256_mul_ps(fscal,dz22);
1413 /* Update vectorial force */
1414 fix2 = _mm256_add_ps(fix2,tx);
1415 fiy2 = _mm256_add_ps(fiy2,ty);
1416 fiz2 = _mm256_add_ps(fiz2,tz);
1418 fjx2 = _mm256_add_ps(fjx2,tx);
1419 fjy2 = _mm256_add_ps(fjy2,ty);
1420 fjz2 = _mm256_add_ps(fjz2,tz);
1422 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1423 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1424 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1425 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1426 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1427 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1428 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1429 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1431 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1432 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1434 /* Inner loop uses 409 flops */
1437 /* End of innermost loop */
1439 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1440 f+i_coord_offset,fshift+i_shift_offset);
1443 /* Update potential energies */
1444 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1445 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1447 /* Increment number of inner iterations */
1448 inneriter += j_index_end - j_index_start;
1450 /* Outer loop uses 20 flops */
1453 /* Increment number of outer iterations */
1456 /* Update outer/inner flops */
1458 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1461 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1462 * Electrostatics interaction: CubicSplineTable
1463 * VdW interaction: LennardJones
1464 * Geometry: Water3-Water3
1465 * Calculate force/pot: Force
1468 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1469 (t_nblist * gmx_restrict nlist,
1470 rvec * gmx_restrict xx,
1471 rvec * gmx_restrict ff,
1472 t_forcerec * gmx_restrict fr,
1473 t_mdatoms * gmx_restrict mdatoms,
1474 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1475 t_nrnb * gmx_restrict nrnb)
1477 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1478 * just 0 for non-waters.
1479 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1480 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1482 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1483 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1484 int jnrA,jnrB,jnrC,jnrD;
1485 int jnrE,jnrF,jnrG,jnrH;
1486 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1487 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1488 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1489 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1490 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1491 real rcutoff_scalar;
1492 real *shiftvec,*fshift,*x,*f;
1493 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1494 real scratch[4*DIM];
1495 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1496 real * vdwioffsetptr0;
1497 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1498 real * vdwioffsetptr1;
1499 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1500 real * vdwioffsetptr2;
1501 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1502 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1503 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1504 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1505 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1506 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1507 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1508 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1509 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1510 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1511 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1512 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1513 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1514 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1515 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1516 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1517 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1520 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1523 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1524 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1526 __m128i vfitab_lo,vfitab_hi;
1527 __m128i ifour = _mm_set1_epi32(4);
1528 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1530 __m256 dummy_mask,cutoff_mask;
1531 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1532 __m256 one = _mm256_set1_ps(1.0);
1533 __m256 two = _mm256_set1_ps(2.0);
1539 jindex = nlist->jindex;
1541 shiftidx = nlist->shift;
1543 shiftvec = fr->shift_vec[0];
1544 fshift = fr->fshift[0];
1545 facel = _mm256_set1_ps(fr->epsfac);
1546 charge = mdatoms->chargeA;
1547 nvdwtype = fr->ntype;
1548 vdwparam = fr->nbfp;
1549 vdwtype = mdatoms->typeA;
1551 vftab = kernel_data->table_elec->data;
1552 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
1554 /* Setup water-specific parameters */
1555 inr = nlist->iinr[0];
1556 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1557 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1558 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1559 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1561 jq0 = _mm256_set1_ps(charge[inr+0]);
1562 jq1 = _mm256_set1_ps(charge[inr+1]);
1563 jq2 = _mm256_set1_ps(charge[inr+2]);
1564 vdwjidx0A = 2*vdwtype[inr+0];
1565 qq00 = _mm256_mul_ps(iq0,jq0);
1566 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1567 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1568 qq01 = _mm256_mul_ps(iq0,jq1);
1569 qq02 = _mm256_mul_ps(iq0,jq2);
1570 qq10 = _mm256_mul_ps(iq1,jq0);
1571 qq11 = _mm256_mul_ps(iq1,jq1);
1572 qq12 = _mm256_mul_ps(iq1,jq2);
1573 qq20 = _mm256_mul_ps(iq2,jq0);
1574 qq21 = _mm256_mul_ps(iq2,jq1);
1575 qq22 = _mm256_mul_ps(iq2,jq2);
1577 /* Avoid stupid compiler warnings */
1578 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1579 j_coord_offsetA = 0;
1580 j_coord_offsetB = 0;
1581 j_coord_offsetC = 0;
1582 j_coord_offsetD = 0;
1583 j_coord_offsetE = 0;
1584 j_coord_offsetF = 0;
1585 j_coord_offsetG = 0;
1586 j_coord_offsetH = 0;
1591 for(iidx=0;iidx<4*DIM;iidx++)
1593 scratch[iidx] = 0.0;
1596 /* Start outer loop over neighborlists */
1597 for(iidx=0; iidx<nri; iidx++)
1599 /* Load shift vector for this list */
1600 i_shift_offset = DIM*shiftidx[iidx];
1602 /* Load limits for loop over neighbors */
1603 j_index_start = jindex[iidx];
1604 j_index_end = jindex[iidx+1];
1606 /* Get outer coordinate index */
1608 i_coord_offset = DIM*inr;
1610 /* Load i particle coords and add shift vector */
1611 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1612 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1614 fix0 = _mm256_setzero_ps();
1615 fiy0 = _mm256_setzero_ps();
1616 fiz0 = _mm256_setzero_ps();
1617 fix1 = _mm256_setzero_ps();
1618 fiy1 = _mm256_setzero_ps();
1619 fiz1 = _mm256_setzero_ps();
1620 fix2 = _mm256_setzero_ps();
1621 fiy2 = _mm256_setzero_ps();
1622 fiz2 = _mm256_setzero_ps();
1624 /* Start inner kernel loop */
1625 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1628 /* Get j neighbor index, and coordinate index */
1630 jnrB = jjnr[jidx+1];
1631 jnrC = jjnr[jidx+2];
1632 jnrD = jjnr[jidx+3];
1633 jnrE = jjnr[jidx+4];
1634 jnrF = jjnr[jidx+5];
1635 jnrG = jjnr[jidx+6];
1636 jnrH = jjnr[jidx+7];
1637 j_coord_offsetA = DIM*jnrA;
1638 j_coord_offsetB = DIM*jnrB;
1639 j_coord_offsetC = DIM*jnrC;
1640 j_coord_offsetD = DIM*jnrD;
1641 j_coord_offsetE = DIM*jnrE;
1642 j_coord_offsetF = DIM*jnrF;
1643 j_coord_offsetG = DIM*jnrG;
1644 j_coord_offsetH = DIM*jnrH;
1646 /* load j atom coordinates */
1647 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1648 x+j_coord_offsetC,x+j_coord_offsetD,
1649 x+j_coord_offsetE,x+j_coord_offsetF,
1650 x+j_coord_offsetG,x+j_coord_offsetH,
1651 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1653 /* Calculate displacement vector */
1654 dx00 = _mm256_sub_ps(ix0,jx0);
1655 dy00 = _mm256_sub_ps(iy0,jy0);
1656 dz00 = _mm256_sub_ps(iz0,jz0);
1657 dx01 = _mm256_sub_ps(ix0,jx1);
1658 dy01 = _mm256_sub_ps(iy0,jy1);
1659 dz01 = _mm256_sub_ps(iz0,jz1);
1660 dx02 = _mm256_sub_ps(ix0,jx2);
1661 dy02 = _mm256_sub_ps(iy0,jy2);
1662 dz02 = _mm256_sub_ps(iz0,jz2);
1663 dx10 = _mm256_sub_ps(ix1,jx0);
1664 dy10 = _mm256_sub_ps(iy1,jy0);
1665 dz10 = _mm256_sub_ps(iz1,jz0);
1666 dx11 = _mm256_sub_ps(ix1,jx1);
1667 dy11 = _mm256_sub_ps(iy1,jy1);
1668 dz11 = _mm256_sub_ps(iz1,jz1);
1669 dx12 = _mm256_sub_ps(ix1,jx2);
1670 dy12 = _mm256_sub_ps(iy1,jy2);
1671 dz12 = _mm256_sub_ps(iz1,jz2);
1672 dx20 = _mm256_sub_ps(ix2,jx0);
1673 dy20 = _mm256_sub_ps(iy2,jy0);
1674 dz20 = _mm256_sub_ps(iz2,jz0);
1675 dx21 = _mm256_sub_ps(ix2,jx1);
1676 dy21 = _mm256_sub_ps(iy2,jy1);
1677 dz21 = _mm256_sub_ps(iz2,jz1);
1678 dx22 = _mm256_sub_ps(ix2,jx2);
1679 dy22 = _mm256_sub_ps(iy2,jy2);
1680 dz22 = _mm256_sub_ps(iz2,jz2);
1682 /* Calculate squared distance and things based on it */
1683 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1684 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1685 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1686 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1687 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1688 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1689 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1690 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1691 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1693 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1694 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1695 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1696 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1697 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1698 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1699 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1700 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1701 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1703 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1705 fjx0 = _mm256_setzero_ps();
1706 fjy0 = _mm256_setzero_ps();
1707 fjz0 = _mm256_setzero_ps();
1708 fjx1 = _mm256_setzero_ps();
1709 fjy1 = _mm256_setzero_ps();
1710 fjz1 = _mm256_setzero_ps();
1711 fjx2 = _mm256_setzero_ps();
1712 fjy2 = _mm256_setzero_ps();
1713 fjz2 = _mm256_setzero_ps();
1715 /**************************
1716 * CALCULATE INTERACTIONS *
1717 **************************/
1719 r00 = _mm256_mul_ps(rsq00,rinv00);
1721 /* Calculate table index by multiplying r with table scale and truncate to integer */
1722 rt = _mm256_mul_ps(r00,vftabscale);
1723 vfitab = _mm256_cvttps_epi32(rt);
1724 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1725 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1726 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1727 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1728 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1729 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1731 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1732 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1733 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1734 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1735 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1736 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1737 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1738 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1739 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1740 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1741 Heps = _mm256_mul_ps(vfeps,H);
1742 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1743 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1744 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1746 /* LENNARD-JONES DISPERSION/REPULSION */
1748 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1749 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1751 fscal = _mm256_add_ps(felec,fvdw);
1753 /* Calculate temporary vectorial force */
1754 tx = _mm256_mul_ps(fscal,dx00);
1755 ty = _mm256_mul_ps(fscal,dy00);
1756 tz = _mm256_mul_ps(fscal,dz00);
1758 /* Update vectorial force */
1759 fix0 = _mm256_add_ps(fix0,tx);
1760 fiy0 = _mm256_add_ps(fiy0,ty);
1761 fiz0 = _mm256_add_ps(fiz0,tz);
1763 fjx0 = _mm256_add_ps(fjx0,tx);
1764 fjy0 = _mm256_add_ps(fjy0,ty);
1765 fjz0 = _mm256_add_ps(fjz0,tz);
1767 /**************************
1768 * CALCULATE INTERACTIONS *
1769 **************************/
1771 r01 = _mm256_mul_ps(rsq01,rinv01);
1773 /* Calculate table index by multiplying r with table scale and truncate to integer */
1774 rt = _mm256_mul_ps(r01,vftabscale);
1775 vfitab = _mm256_cvttps_epi32(rt);
1776 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1777 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1778 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1779 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1780 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1781 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1783 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1784 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1785 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1786 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1787 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1788 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1789 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1790 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1791 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1792 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1793 Heps = _mm256_mul_ps(vfeps,H);
1794 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1795 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1796 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1800 /* Calculate temporary vectorial force */
1801 tx = _mm256_mul_ps(fscal,dx01);
1802 ty = _mm256_mul_ps(fscal,dy01);
1803 tz = _mm256_mul_ps(fscal,dz01);
1805 /* Update vectorial force */
1806 fix0 = _mm256_add_ps(fix0,tx);
1807 fiy0 = _mm256_add_ps(fiy0,ty);
1808 fiz0 = _mm256_add_ps(fiz0,tz);
1810 fjx1 = _mm256_add_ps(fjx1,tx);
1811 fjy1 = _mm256_add_ps(fjy1,ty);
1812 fjz1 = _mm256_add_ps(fjz1,tz);
1814 /**************************
1815 * CALCULATE INTERACTIONS *
1816 **************************/
1818 r02 = _mm256_mul_ps(rsq02,rinv02);
1820 /* Calculate table index by multiplying r with table scale and truncate to integer */
1821 rt = _mm256_mul_ps(r02,vftabscale);
1822 vfitab = _mm256_cvttps_epi32(rt);
1823 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1824 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1825 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1826 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1827 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1828 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1830 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1831 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1832 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1833 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1834 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1835 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1836 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1837 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1838 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1839 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1840 Heps = _mm256_mul_ps(vfeps,H);
1841 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1842 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1843 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1847 /* Calculate temporary vectorial force */
1848 tx = _mm256_mul_ps(fscal,dx02);
1849 ty = _mm256_mul_ps(fscal,dy02);
1850 tz = _mm256_mul_ps(fscal,dz02);
1852 /* Update vectorial force */
1853 fix0 = _mm256_add_ps(fix0,tx);
1854 fiy0 = _mm256_add_ps(fiy0,ty);
1855 fiz0 = _mm256_add_ps(fiz0,tz);
1857 fjx2 = _mm256_add_ps(fjx2,tx);
1858 fjy2 = _mm256_add_ps(fjy2,ty);
1859 fjz2 = _mm256_add_ps(fjz2,tz);
1861 /**************************
1862 * CALCULATE INTERACTIONS *
1863 **************************/
1865 r10 = _mm256_mul_ps(rsq10,rinv10);
1867 /* Calculate table index by multiplying r with table scale and truncate to integer */
1868 rt = _mm256_mul_ps(r10,vftabscale);
1869 vfitab = _mm256_cvttps_epi32(rt);
1870 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1871 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1872 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1873 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1874 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1875 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1877 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1878 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1879 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1880 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1881 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1882 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1883 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1884 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1885 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1886 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1887 Heps = _mm256_mul_ps(vfeps,H);
1888 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1889 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1890 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1894 /* Calculate temporary vectorial force */
1895 tx = _mm256_mul_ps(fscal,dx10);
1896 ty = _mm256_mul_ps(fscal,dy10);
1897 tz = _mm256_mul_ps(fscal,dz10);
1899 /* Update vectorial force */
1900 fix1 = _mm256_add_ps(fix1,tx);
1901 fiy1 = _mm256_add_ps(fiy1,ty);
1902 fiz1 = _mm256_add_ps(fiz1,tz);
1904 fjx0 = _mm256_add_ps(fjx0,tx);
1905 fjy0 = _mm256_add_ps(fjy0,ty);
1906 fjz0 = _mm256_add_ps(fjz0,tz);
1908 /**************************
1909 * CALCULATE INTERACTIONS *
1910 **************************/
1912 r11 = _mm256_mul_ps(rsq11,rinv11);
1914 /* Calculate table index by multiplying r with table scale and truncate to integer */
1915 rt = _mm256_mul_ps(r11,vftabscale);
1916 vfitab = _mm256_cvttps_epi32(rt);
1917 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1918 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1919 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1920 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1921 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1922 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1924 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1925 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1926 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1927 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1928 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1929 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1930 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1931 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1932 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1933 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1934 Heps = _mm256_mul_ps(vfeps,H);
1935 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1936 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1937 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1941 /* Calculate temporary vectorial force */
1942 tx = _mm256_mul_ps(fscal,dx11);
1943 ty = _mm256_mul_ps(fscal,dy11);
1944 tz = _mm256_mul_ps(fscal,dz11);
1946 /* Update vectorial force */
1947 fix1 = _mm256_add_ps(fix1,tx);
1948 fiy1 = _mm256_add_ps(fiy1,ty);
1949 fiz1 = _mm256_add_ps(fiz1,tz);
1951 fjx1 = _mm256_add_ps(fjx1,tx);
1952 fjy1 = _mm256_add_ps(fjy1,ty);
1953 fjz1 = _mm256_add_ps(fjz1,tz);
1955 /**************************
1956 * CALCULATE INTERACTIONS *
1957 **************************/
1959 r12 = _mm256_mul_ps(rsq12,rinv12);
1961 /* Calculate table index by multiplying r with table scale and truncate to integer */
1962 rt = _mm256_mul_ps(r12,vftabscale);
1963 vfitab = _mm256_cvttps_epi32(rt);
1964 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1965 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1966 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1967 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1968 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1969 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1971 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1972 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1973 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1974 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1975 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1976 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1977 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1978 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1979 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1980 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1981 Heps = _mm256_mul_ps(vfeps,H);
1982 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1983 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1984 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1988 /* Calculate temporary vectorial force */
1989 tx = _mm256_mul_ps(fscal,dx12);
1990 ty = _mm256_mul_ps(fscal,dy12);
1991 tz = _mm256_mul_ps(fscal,dz12);
1993 /* Update vectorial force */
1994 fix1 = _mm256_add_ps(fix1,tx);
1995 fiy1 = _mm256_add_ps(fiy1,ty);
1996 fiz1 = _mm256_add_ps(fiz1,tz);
1998 fjx2 = _mm256_add_ps(fjx2,tx);
1999 fjy2 = _mm256_add_ps(fjy2,ty);
2000 fjz2 = _mm256_add_ps(fjz2,tz);
2002 /**************************
2003 * CALCULATE INTERACTIONS *
2004 **************************/
2006 r20 = _mm256_mul_ps(rsq20,rinv20);
2008 /* Calculate table index by multiplying r with table scale and truncate to integer */
2009 rt = _mm256_mul_ps(r20,vftabscale);
2010 vfitab = _mm256_cvttps_epi32(rt);
2011 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2012 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2013 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2014 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2015 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2016 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2018 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2019 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2020 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2021 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2022 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2023 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2024 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2025 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2026 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2027 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2028 Heps = _mm256_mul_ps(vfeps,H);
2029 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2030 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2031 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2035 /* Calculate temporary vectorial force */
2036 tx = _mm256_mul_ps(fscal,dx20);
2037 ty = _mm256_mul_ps(fscal,dy20);
2038 tz = _mm256_mul_ps(fscal,dz20);
2040 /* Update vectorial force */
2041 fix2 = _mm256_add_ps(fix2,tx);
2042 fiy2 = _mm256_add_ps(fiy2,ty);
2043 fiz2 = _mm256_add_ps(fiz2,tz);
2045 fjx0 = _mm256_add_ps(fjx0,tx);
2046 fjy0 = _mm256_add_ps(fjy0,ty);
2047 fjz0 = _mm256_add_ps(fjz0,tz);
2049 /**************************
2050 * CALCULATE INTERACTIONS *
2051 **************************/
2053 r21 = _mm256_mul_ps(rsq21,rinv21);
2055 /* Calculate table index by multiplying r with table scale and truncate to integer */
2056 rt = _mm256_mul_ps(r21,vftabscale);
2057 vfitab = _mm256_cvttps_epi32(rt);
2058 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2059 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2060 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2061 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2062 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2063 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2065 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2066 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2067 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2068 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2069 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2070 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2071 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2072 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2073 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2074 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2075 Heps = _mm256_mul_ps(vfeps,H);
2076 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2077 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2078 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2082 /* Calculate temporary vectorial force */
2083 tx = _mm256_mul_ps(fscal,dx21);
2084 ty = _mm256_mul_ps(fscal,dy21);
2085 tz = _mm256_mul_ps(fscal,dz21);
2087 /* Update vectorial force */
2088 fix2 = _mm256_add_ps(fix2,tx);
2089 fiy2 = _mm256_add_ps(fiy2,ty);
2090 fiz2 = _mm256_add_ps(fiz2,tz);
2092 fjx1 = _mm256_add_ps(fjx1,tx);
2093 fjy1 = _mm256_add_ps(fjy1,ty);
2094 fjz1 = _mm256_add_ps(fjz1,tz);
2096 /**************************
2097 * CALCULATE INTERACTIONS *
2098 **************************/
2100 r22 = _mm256_mul_ps(rsq22,rinv22);
2102 /* Calculate table index by multiplying r with table scale and truncate to integer */
2103 rt = _mm256_mul_ps(r22,vftabscale);
2104 vfitab = _mm256_cvttps_epi32(rt);
2105 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2106 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2107 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2108 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2109 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2110 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2112 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2113 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2114 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2115 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2116 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2117 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2118 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2119 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2120 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2121 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2122 Heps = _mm256_mul_ps(vfeps,H);
2123 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2124 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2125 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2129 /* Calculate temporary vectorial force */
2130 tx = _mm256_mul_ps(fscal,dx22);
2131 ty = _mm256_mul_ps(fscal,dy22);
2132 tz = _mm256_mul_ps(fscal,dz22);
2134 /* Update vectorial force */
2135 fix2 = _mm256_add_ps(fix2,tx);
2136 fiy2 = _mm256_add_ps(fiy2,ty);
2137 fiz2 = _mm256_add_ps(fiz2,tz);
2139 fjx2 = _mm256_add_ps(fjx2,tx);
2140 fjy2 = _mm256_add_ps(fjy2,ty);
2141 fjz2 = _mm256_add_ps(fjz2,tz);
2143 fjptrA = f+j_coord_offsetA;
2144 fjptrB = f+j_coord_offsetB;
2145 fjptrC = f+j_coord_offsetC;
2146 fjptrD = f+j_coord_offsetD;
2147 fjptrE = f+j_coord_offsetE;
2148 fjptrF = f+j_coord_offsetF;
2149 fjptrG = f+j_coord_offsetG;
2150 fjptrH = f+j_coord_offsetH;
2152 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2153 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2155 /* Inner loop uses 359 flops */
2158 if(jidx<j_index_end)
2161 /* Get j neighbor index, and coordinate index */
2162 jnrlistA = jjnr[jidx];
2163 jnrlistB = jjnr[jidx+1];
2164 jnrlistC = jjnr[jidx+2];
2165 jnrlistD = jjnr[jidx+3];
2166 jnrlistE = jjnr[jidx+4];
2167 jnrlistF = jjnr[jidx+5];
2168 jnrlistG = jjnr[jidx+6];
2169 jnrlistH = jjnr[jidx+7];
2170 /* Sign of each element will be negative for non-real atoms.
2171 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2172 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2174 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2175 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2177 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2178 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2179 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2180 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2181 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2182 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2183 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2184 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2185 j_coord_offsetA = DIM*jnrA;
2186 j_coord_offsetB = DIM*jnrB;
2187 j_coord_offsetC = DIM*jnrC;
2188 j_coord_offsetD = DIM*jnrD;
2189 j_coord_offsetE = DIM*jnrE;
2190 j_coord_offsetF = DIM*jnrF;
2191 j_coord_offsetG = DIM*jnrG;
2192 j_coord_offsetH = DIM*jnrH;
2194 /* load j atom coordinates */
2195 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2196 x+j_coord_offsetC,x+j_coord_offsetD,
2197 x+j_coord_offsetE,x+j_coord_offsetF,
2198 x+j_coord_offsetG,x+j_coord_offsetH,
2199 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2201 /* Calculate displacement vector */
2202 dx00 = _mm256_sub_ps(ix0,jx0);
2203 dy00 = _mm256_sub_ps(iy0,jy0);
2204 dz00 = _mm256_sub_ps(iz0,jz0);
2205 dx01 = _mm256_sub_ps(ix0,jx1);
2206 dy01 = _mm256_sub_ps(iy0,jy1);
2207 dz01 = _mm256_sub_ps(iz0,jz1);
2208 dx02 = _mm256_sub_ps(ix0,jx2);
2209 dy02 = _mm256_sub_ps(iy0,jy2);
2210 dz02 = _mm256_sub_ps(iz0,jz2);
2211 dx10 = _mm256_sub_ps(ix1,jx0);
2212 dy10 = _mm256_sub_ps(iy1,jy0);
2213 dz10 = _mm256_sub_ps(iz1,jz0);
2214 dx11 = _mm256_sub_ps(ix1,jx1);
2215 dy11 = _mm256_sub_ps(iy1,jy1);
2216 dz11 = _mm256_sub_ps(iz1,jz1);
2217 dx12 = _mm256_sub_ps(ix1,jx2);
2218 dy12 = _mm256_sub_ps(iy1,jy2);
2219 dz12 = _mm256_sub_ps(iz1,jz2);
2220 dx20 = _mm256_sub_ps(ix2,jx0);
2221 dy20 = _mm256_sub_ps(iy2,jy0);
2222 dz20 = _mm256_sub_ps(iz2,jz0);
2223 dx21 = _mm256_sub_ps(ix2,jx1);
2224 dy21 = _mm256_sub_ps(iy2,jy1);
2225 dz21 = _mm256_sub_ps(iz2,jz1);
2226 dx22 = _mm256_sub_ps(ix2,jx2);
2227 dy22 = _mm256_sub_ps(iy2,jy2);
2228 dz22 = _mm256_sub_ps(iz2,jz2);
2230 /* Calculate squared distance and things based on it */
2231 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2232 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2233 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2234 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2235 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2236 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2237 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2238 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2239 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2241 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2242 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2243 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2244 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2245 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2246 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2247 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2248 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2249 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2251 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2253 fjx0 = _mm256_setzero_ps();
2254 fjy0 = _mm256_setzero_ps();
2255 fjz0 = _mm256_setzero_ps();
2256 fjx1 = _mm256_setzero_ps();
2257 fjy1 = _mm256_setzero_ps();
2258 fjz1 = _mm256_setzero_ps();
2259 fjx2 = _mm256_setzero_ps();
2260 fjy2 = _mm256_setzero_ps();
2261 fjz2 = _mm256_setzero_ps();
2263 /**************************
2264 * CALCULATE INTERACTIONS *
2265 **************************/
2267 r00 = _mm256_mul_ps(rsq00,rinv00);
2268 r00 = _mm256_andnot_ps(dummy_mask,r00);
2270 /* Calculate table index by multiplying r with table scale and truncate to integer */
2271 rt = _mm256_mul_ps(r00,vftabscale);
2272 vfitab = _mm256_cvttps_epi32(rt);
2273 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2274 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2275 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2276 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2277 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2278 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2280 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2281 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2282 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2283 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2284 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2285 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2286 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2287 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2288 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2289 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2290 Heps = _mm256_mul_ps(vfeps,H);
2291 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2292 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2293 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2295 /* LENNARD-JONES DISPERSION/REPULSION */
2297 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2298 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2300 fscal = _mm256_add_ps(felec,fvdw);
2302 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2304 /* Calculate temporary vectorial force */
2305 tx = _mm256_mul_ps(fscal,dx00);
2306 ty = _mm256_mul_ps(fscal,dy00);
2307 tz = _mm256_mul_ps(fscal,dz00);
2309 /* Update vectorial force */
2310 fix0 = _mm256_add_ps(fix0,tx);
2311 fiy0 = _mm256_add_ps(fiy0,ty);
2312 fiz0 = _mm256_add_ps(fiz0,tz);
2314 fjx0 = _mm256_add_ps(fjx0,tx);
2315 fjy0 = _mm256_add_ps(fjy0,ty);
2316 fjz0 = _mm256_add_ps(fjz0,tz);
2318 /**************************
2319 * CALCULATE INTERACTIONS *
2320 **************************/
2322 r01 = _mm256_mul_ps(rsq01,rinv01);
2323 r01 = _mm256_andnot_ps(dummy_mask,r01);
2325 /* Calculate table index by multiplying r with table scale and truncate to integer */
2326 rt = _mm256_mul_ps(r01,vftabscale);
2327 vfitab = _mm256_cvttps_epi32(rt);
2328 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2329 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2330 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2331 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2332 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2333 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2335 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2336 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2337 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2338 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2339 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2340 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2341 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2342 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2343 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2344 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2345 Heps = _mm256_mul_ps(vfeps,H);
2346 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2347 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2348 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2352 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2354 /* Calculate temporary vectorial force */
2355 tx = _mm256_mul_ps(fscal,dx01);
2356 ty = _mm256_mul_ps(fscal,dy01);
2357 tz = _mm256_mul_ps(fscal,dz01);
2359 /* Update vectorial force */
2360 fix0 = _mm256_add_ps(fix0,tx);
2361 fiy0 = _mm256_add_ps(fiy0,ty);
2362 fiz0 = _mm256_add_ps(fiz0,tz);
2364 fjx1 = _mm256_add_ps(fjx1,tx);
2365 fjy1 = _mm256_add_ps(fjy1,ty);
2366 fjz1 = _mm256_add_ps(fjz1,tz);
2368 /**************************
2369 * CALCULATE INTERACTIONS *
2370 **************************/
2372 r02 = _mm256_mul_ps(rsq02,rinv02);
2373 r02 = _mm256_andnot_ps(dummy_mask,r02);
2375 /* Calculate table index by multiplying r with table scale and truncate to integer */
2376 rt = _mm256_mul_ps(r02,vftabscale);
2377 vfitab = _mm256_cvttps_epi32(rt);
2378 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2379 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2380 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2381 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2382 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2383 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2385 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2386 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2387 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2388 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2389 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2390 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2391 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2392 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2393 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2394 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2395 Heps = _mm256_mul_ps(vfeps,H);
2396 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2397 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2398 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2402 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2404 /* Calculate temporary vectorial force */
2405 tx = _mm256_mul_ps(fscal,dx02);
2406 ty = _mm256_mul_ps(fscal,dy02);
2407 tz = _mm256_mul_ps(fscal,dz02);
2409 /* Update vectorial force */
2410 fix0 = _mm256_add_ps(fix0,tx);
2411 fiy0 = _mm256_add_ps(fiy0,ty);
2412 fiz0 = _mm256_add_ps(fiz0,tz);
2414 fjx2 = _mm256_add_ps(fjx2,tx);
2415 fjy2 = _mm256_add_ps(fjy2,ty);
2416 fjz2 = _mm256_add_ps(fjz2,tz);
2418 /**************************
2419 * CALCULATE INTERACTIONS *
2420 **************************/
2422 r10 = _mm256_mul_ps(rsq10,rinv10);
2423 r10 = _mm256_andnot_ps(dummy_mask,r10);
2425 /* Calculate table index by multiplying r with table scale and truncate to integer */
2426 rt = _mm256_mul_ps(r10,vftabscale);
2427 vfitab = _mm256_cvttps_epi32(rt);
2428 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2429 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2430 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2431 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2432 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2433 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2435 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2436 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2437 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2438 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2439 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2440 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2441 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2442 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2443 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2444 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2445 Heps = _mm256_mul_ps(vfeps,H);
2446 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2447 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2448 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2452 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2454 /* Calculate temporary vectorial force */
2455 tx = _mm256_mul_ps(fscal,dx10);
2456 ty = _mm256_mul_ps(fscal,dy10);
2457 tz = _mm256_mul_ps(fscal,dz10);
2459 /* Update vectorial force */
2460 fix1 = _mm256_add_ps(fix1,tx);
2461 fiy1 = _mm256_add_ps(fiy1,ty);
2462 fiz1 = _mm256_add_ps(fiz1,tz);
2464 fjx0 = _mm256_add_ps(fjx0,tx);
2465 fjy0 = _mm256_add_ps(fjy0,ty);
2466 fjz0 = _mm256_add_ps(fjz0,tz);
2468 /**************************
2469 * CALCULATE INTERACTIONS *
2470 **************************/
2472 r11 = _mm256_mul_ps(rsq11,rinv11);
2473 r11 = _mm256_andnot_ps(dummy_mask,r11);
2475 /* Calculate table index by multiplying r with table scale and truncate to integer */
2476 rt = _mm256_mul_ps(r11,vftabscale);
2477 vfitab = _mm256_cvttps_epi32(rt);
2478 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2479 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2480 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2481 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2482 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2483 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2485 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2486 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2487 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2488 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2489 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2490 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2491 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2492 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2493 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2494 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2495 Heps = _mm256_mul_ps(vfeps,H);
2496 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2497 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2498 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2502 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2504 /* Calculate temporary vectorial force */
2505 tx = _mm256_mul_ps(fscal,dx11);
2506 ty = _mm256_mul_ps(fscal,dy11);
2507 tz = _mm256_mul_ps(fscal,dz11);
2509 /* Update vectorial force */
2510 fix1 = _mm256_add_ps(fix1,tx);
2511 fiy1 = _mm256_add_ps(fiy1,ty);
2512 fiz1 = _mm256_add_ps(fiz1,tz);
2514 fjx1 = _mm256_add_ps(fjx1,tx);
2515 fjy1 = _mm256_add_ps(fjy1,ty);
2516 fjz1 = _mm256_add_ps(fjz1,tz);
2518 /**************************
2519 * CALCULATE INTERACTIONS *
2520 **************************/
2522 r12 = _mm256_mul_ps(rsq12,rinv12);
2523 r12 = _mm256_andnot_ps(dummy_mask,r12);
2525 /* Calculate table index by multiplying r with table scale and truncate to integer */
2526 rt = _mm256_mul_ps(r12,vftabscale);
2527 vfitab = _mm256_cvttps_epi32(rt);
2528 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2529 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2530 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2531 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2532 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2533 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2535 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2536 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2537 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2538 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2539 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2540 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2541 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2542 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2543 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2544 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2545 Heps = _mm256_mul_ps(vfeps,H);
2546 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2547 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2548 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2552 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2554 /* Calculate temporary vectorial force */
2555 tx = _mm256_mul_ps(fscal,dx12);
2556 ty = _mm256_mul_ps(fscal,dy12);
2557 tz = _mm256_mul_ps(fscal,dz12);
2559 /* Update vectorial force */
2560 fix1 = _mm256_add_ps(fix1,tx);
2561 fiy1 = _mm256_add_ps(fiy1,ty);
2562 fiz1 = _mm256_add_ps(fiz1,tz);
2564 fjx2 = _mm256_add_ps(fjx2,tx);
2565 fjy2 = _mm256_add_ps(fjy2,ty);
2566 fjz2 = _mm256_add_ps(fjz2,tz);
2568 /**************************
2569 * CALCULATE INTERACTIONS *
2570 **************************/
2572 r20 = _mm256_mul_ps(rsq20,rinv20);
2573 r20 = _mm256_andnot_ps(dummy_mask,r20);
2575 /* Calculate table index by multiplying r with table scale and truncate to integer */
2576 rt = _mm256_mul_ps(r20,vftabscale);
2577 vfitab = _mm256_cvttps_epi32(rt);
2578 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2579 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2580 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2581 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2582 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2583 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2585 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2586 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2587 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2588 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2589 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2590 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2591 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2592 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2593 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2594 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2595 Heps = _mm256_mul_ps(vfeps,H);
2596 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2597 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2598 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2602 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2604 /* Calculate temporary vectorial force */
2605 tx = _mm256_mul_ps(fscal,dx20);
2606 ty = _mm256_mul_ps(fscal,dy20);
2607 tz = _mm256_mul_ps(fscal,dz20);
2609 /* Update vectorial force */
2610 fix2 = _mm256_add_ps(fix2,tx);
2611 fiy2 = _mm256_add_ps(fiy2,ty);
2612 fiz2 = _mm256_add_ps(fiz2,tz);
2614 fjx0 = _mm256_add_ps(fjx0,tx);
2615 fjy0 = _mm256_add_ps(fjy0,ty);
2616 fjz0 = _mm256_add_ps(fjz0,tz);
2618 /**************************
2619 * CALCULATE INTERACTIONS *
2620 **************************/
2622 r21 = _mm256_mul_ps(rsq21,rinv21);
2623 r21 = _mm256_andnot_ps(dummy_mask,r21);
2625 /* Calculate table index by multiplying r with table scale and truncate to integer */
2626 rt = _mm256_mul_ps(r21,vftabscale);
2627 vfitab = _mm256_cvttps_epi32(rt);
2628 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2629 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2630 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2631 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2632 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2633 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2635 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2636 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2637 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2638 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2639 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2640 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2641 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2642 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2643 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2644 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2645 Heps = _mm256_mul_ps(vfeps,H);
2646 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2647 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2648 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2652 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2654 /* Calculate temporary vectorial force */
2655 tx = _mm256_mul_ps(fscal,dx21);
2656 ty = _mm256_mul_ps(fscal,dy21);
2657 tz = _mm256_mul_ps(fscal,dz21);
2659 /* Update vectorial force */
2660 fix2 = _mm256_add_ps(fix2,tx);
2661 fiy2 = _mm256_add_ps(fiy2,ty);
2662 fiz2 = _mm256_add_ps(fiz2,tz);
2664 fjx1 = _mm256_add_ps(fjx1,tx);
2665 fjy1 = _mm256_add_ps(fjy1,ty);
2666 fjz1 = _mm256_add_ps(fjz1,tz);
2668 /**************************
2669 * CALCULATE INTERACTIONS *
2670 **************************/
2672 r22 = _mm256_mul_ps(rsq22,rinv22);
2673 r22 = _mm256_andnot_ps(dummy_mask,r22);
2675 /* Calculate table index by multiplying r with table scale and truncate to integer */
2676 rt = _mm256_mul_ps(r22,vftabscale);
2677 vfitab = _mm256_cvttps_epi32(rt);
2678 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2679 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2680 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2681 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2682 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2683 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2685 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2686 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2687 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2688 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2689 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2690 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2691 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2692 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2693 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2694 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2695 Heps = _mm256_mul_ps(vfeps,H);
2696 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2697 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2698 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2702 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2704 /* Calculate temporary vectorial force */
2705 tx = _mm256_mul_ps(fscal,dx22);
2706 ty = _mm256_mul_ps(fscal,dy22);
2707 tz = _mm256_mul_ps(fscal,dz22);
2709 /* Update vectorial force */
2710 fix2 = _mm256_add_ps(fix2,tx);
2711 fiy2 = _mm256_add_ps(fiy2,ty);
2712 fiz2 = _mm256_add_ps(fiz2,tz);
2714 fjx2 = _mm256_add_ps(fjx2,tx);
2715 fjy2 = _mm256_add_ps(fjy2,ty);
2716 fjz2 = _mm256_add_ps(fjz2,tz);
2718 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2719 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2720 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2721 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2722 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2723 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2724 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2725 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2727 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2728 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2730 /* Inner loop uses 368 flops */
2733 /* End of innermost loop */
2735 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2736 f+i_coord_offset,fshift+i_shift_offset);
2738 /* Increment number of inner iterations */
2739 inneriter += j_index_end - j_index_start;
2741 /* Outer loop uses 18 flops */
2744 /* Increment number of outer iterations */
2747 /* Update outer/inner flops */
2749 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);