2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_single
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69 * jnr indices corresponding to data put in the eight positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrE,jnrF,jnrG,jnrH;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 real * vdwioffsetptr0;
86 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 real * vdwioffsetptr1;
88 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 real * vdwioffsetptr2;
90 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 real * vdwioffsetptr3;
92 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
94 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
96 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
98 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
100 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
114 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
117 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
118 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
120 __m128i vfitab_lo,vfitab_hi;
121 __m128i ifour = _mm_set1_epi32(4);
122 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
124 __m256 dummy_mask,cutoff_mask;
125 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
126 __m256 one = _mm256_set1_ps(1.0);
127 __m256 two = _mm256_set1_ps(2.0);
133 jindex = nlist->jindex;
135 shiftidx = nlist->shift;
137 shiftvec = fr->shift_vec[0];
138 fshift = fr->fshift[0];
139 facel = _mm256_set1_ps(fr->ic->epsfac);
140 charge = mdatoms->chargeA;
141 krf = _mm256_set1_ps(fr->ic->k_rf);
142 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
143 crf = _mm256_set1_ps(fr->ic->c_rf);
144 nvdwtype = fr->ntype;
146 vdwtype = mdatoms->typeA;
148 vftab = kernel_data->table_vdw->data;
149 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
151 /* Setup water-specific parameters */
152 inr = nlist->iinr[0];
153 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
154 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
155 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
156 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
158 jq1 = _mm256_set1_ps(charge[inr+1]);
159 jq2 = _mm256_set1_ps(charge[inr+2]);
160 jq3 = _mm256_set1_ps(charge[inr+3]);
161 vdwjidx0A = 2*vdwtype[inr+0];
162 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
163 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
164 qq11 = _mm256_mul_ps(iq1,jq1);
165 qq12 = _mm256_mul_ps(iq1,jq2);
166 qq13 = _mm256_mul_ps(iq1,jq3);
167 qq21 = _mm256_mul_ps(iq2,jq1);
168 qq22 = _mm256_mul_ps(iq2,jq2);
169 qq23 = _mm256_mul_ps(iq2,jq3);
170 qq31 = _mm256_mul_ps(iq3,jq1);
171 qq32 = _mm256_mul_ps(iq3,jq2);
172 qq33 = _mm256_mul_ps(iq3,jq3);
174 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
175 rcutoff_scalar = fr->ic->rcoulomb;
176 rcutoff = _mm256_set1_ps(rcutoff_scalar);
177 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
179 /* Avoid stupid compiler warnings */
180 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
193 for(iidx=0;iidx<4*DIM;iidx++)
198 /* Start outer loop over neighborlists */
199 for(iidx=0; iidx<nri; iidx++)
201 /* Load shift vector for this list */
202 i_shift_offset = DIM*shiftidx[iidx];
204 /* Load limits for loop over neighbors */
205 j_index_start = jindex[iidx];
206 j_index_end = jindex[iidx+1];
208 /* Get outer coordinate index */
210 i_coord_offset = DIM*inr;
212 /* Load i particle coords and add shift vector */
213 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
214 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
216 fix0 = _mm256_setzero_ps();
217 fiy0 = _mm256_setzero_ps();
218 fiz0 = _mm256_setzero_ps();
219 fix1 = _mm256_setzero_ps();
220 fiy1 = _mm256_setzero_ps();
221 fiz1 = _mm256_setzero_ps();
222 fix2 = _mm256_setzero_ps();
223 fiy2 = _mm256_setzero_ps();
224 fiz2 = _mm256_setzero_ps();
225 fix3 = _mm256_setzero_ps();
226 fiy3 = _mm256_setzero_ps();
227 fiz3 = _mm256_setzero_ps();
229 /* Reset potential sums */
230 velecsum = _mm256_setzero_ps();
231 vvdwsum = _mm256_setzero_ps();
233 /* Start inner kernel loop */
234 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
237 /* Get j neighbor index, and coordinate index */
246 j_coord_offsetA = DIM*jnrA;
247 j_coord_offsetB = DIM*jnrB;
248 j_coord_offsetC = DIM*jnrC;
249 j_coord_offsetD = DIM*jnrD;
250 j_coord_offsetE = DIM*jnrE;
251 j_coord_offsetF = DIM*jnrF;
252 j_coord_offsetG = DIM*jnrG;
253 j_coord_offsetH = DIM*jnrH;
255 /* load j atom coordinates */
256 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
257 x+j_coord_offsetC,x+j_coord_offsetD,
258 x+j_coord_offsetE,x+j_coord_offsetF,
259 x+j_coord_offsetG,x+j_coord_offsetH,
260 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
261 &jy2,&jz2,&jx3,&jy3,&jz3);
263 /* Calculate displacement vector */
264 dx00 = _mm256_sub_ps(ix0,jx0);
265 dy00 = _mm256_sub_ps(iy0,jy0);
266 dz00 = _mm256_sub_ps(iz0,jz0);
267 dx11 = _mm256_sub_ps(ix1,jx1);
268 dy11 = _mm256_sub_ps(iy1,jy1);
269 dz11 = _mm256_sub_ps(iz1,jz1);
270 dx12 = _mm256_sub_ps(ix1,jx2);
271 dy12 = _mm256_sub_ps(iy1,jy2);
272 dz12 = _mm256_sub_ps(iz1,jz2);
273 dx13 = _mm256_sub_ps(ix1,jx3);
274 dy13 = _mm256_sub_ps(iy1,jy3);
275 dz13 = _mm256_sub_ps(iz1,jz3);
276 dx21 = _mm256_sub_ps(ix2,jx1);
277 dy21 = _mm256_sub_ps(iy2,jy1);
278 dz21 = _mm256_sub_ps(iz2,jz1);
279 dx22 = _mm256_sub_ps(ix2,jx2);
280 dy22 = _mm256_sub_ps(iy2,jy2);
281 dz22 = _mm256_sub_ps(iz2,jz2);
282 dx23 = _mm256_sub_ps(ix2,jx3);
283 dy23 = _mm256_sub_ps(iy2,jy3);
284 dz23 = _mm256_sub_ps(iz2,jz3);
285 dx31 = _mm256_sub_ps(ix3,jx1);
286 dy31 = _mm256_sub_ps(iy3,jy1);
287 dz31 = _mm256_sub_ps(iz3,jz1);
288 dx32 = _mm256_sub_ps(ix3,jx2);
289 dy32 = _mm256_sub_ps(iy3,jy2);
290 dz32 = _mm256_sub_ps(iz3,jz2);
291 dx33 = _mm256_sub_ps(ix3,jx3);
292 dy33 = _mm256_sub_ps(iy3,jy3);
293 dz33 = _mm256_sub_ps(iz3,jz3);
295 /* Calculate squared distance and things based on it */
296 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
297 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
298 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
299 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
300 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
301 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
302 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
303 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
304 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
305 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
307 rinv00 = avx256_invsqrt_f(rsq00);
308 rinv11 = avx256_invsqrt_f(rsq11);
309 rinv12 = avx256_invsqrt_f(rsq12);
310 rinv13 = avx256_invsqrt_f(rsq13);
311 rinv21 = avx256_invsqrt_f(rsq21);
312 rinv22 = avx256_invsqrt_f(rsq22);
313 rinv23 = avx256_invsqrt_f(rsq23);
314 rinv31 = avx256_invsqrt_f(rsq31);
315 rinv32 = avx256_invsqrt_f(rsq32);
316 rinv33 = avx256_invsqrt_f(rsq33);
318 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
319 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
320 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
321 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
322 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
323 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
324 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
325 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
326 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
328 fjx0 = _mm256_setzero_ps();
329 fjy0 = _mm256_setzero_ps();
330 fjz0 = _mm256_setzero_ps();
331 fjx1 = _mm256_setzero_ps();
332 fjy1 = _mm256_setzero_ps();
333 fjz1 = _mm256_setzero_ps();
334 fjx2 = _mm256_setzero_ps();
335 fjy2 = _mm256_setzero_ps();
336 fjz2 = _mm256_setzero_ps();
337 fjx3 = _mm256_setzero_ps();
338 fjy3 = _mm256_setzero_ps();
339 fjz3 = _mm256_setzero_ps();
341 /**************************
342 * CALCULATE INTERACTIONS *
343 **************************/
345 if (gmx_mm256_any_lt(rsq00,rcutoff2))
348 r00 = _mm256_mul_ps(rsq00,rinv00);
350 /* Calculate table index by multiplying r with table scale and truncate to integer */
351 rt = _mm256_mul_ps(r00,vftabscale);
352 vfitab = _mm256_cvttps_epi32(rt);
353 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
354 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
355 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
356 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
357 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
358 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
360 /* CUBIC SPLINE TABLE DISPERSION */
361 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
362 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
363 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
364 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
365 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
366 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
367 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
368 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
369 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
370 Heps = _mm256_mul_ps(vfeps,H);
371 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
372 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
373 vvdw6 = _mm256_mul_ps(c6_00,VV);
374 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
375 fvdw6 = _mm256_mul_ps(c6_00,FF);
377 /* CUBIC SPLINE TABLE REPULSION */
378 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
379 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
380 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
381 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
382 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
383 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
384 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
385 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
386 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
387 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
388 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
389 Heps = _mm256_mul_ps(vfeps,H);
390 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
391 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
392 vvdw12 = _mm256_mul_ps(c12_00,VV);
393 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
394 fvdw12 = _mm256_mul_ps(c12_00,FF);
395 vvdw = _mm256_add_ps(vvdw12,vvdw6);
396 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
398 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
402 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
406 fscal = _mm256_and_ps(fscal,cutoff_mask);
408 /* Calculate temporary vectorial force */
409 tx = _mm256_mul_ps(fscal,dx00);
410 ty = _mm256_mul_ps(fscal,dy00);
411 tz = _mm256_mul_ps(fscal,dz00);
413 /* Update vectorial force */
414 fix0 = _mm256_add_ps(fix0,tx);
415 fiy0 = _mm256_add_ps(fiy0,ty);
416 fiz0 = _mm256_add_ps(fiz0,tz);
418 fjx0 = _mm256_add_ps(fjx0,tx);
419 fjy0 = _mm256_add_ps(fjy0,ty);
420 fjz0 = _mm256_add_ps(fjz0,tz);
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 if (gmx_mm256_any_lt(rsq11,rcutoff2))
431 /* REACTION-FIELD ELECTROSTATICS */
432 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
433 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
435 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
437 /* Update potential sum for this i atom from the interaction with this j atom. */
438 velec = _mm256_and_ps(velec,cutoff_mask);
439 velecsum = _mm256_add_ps(velecsum,velec);
443 fscal = _mm256_and_ps(fscal,cutoff_mask);
445 /* Calculate temporary vectorial force */
446 tx = _mm256_mul_ps(fscal,dx11);
447 ty = _mm256_mul_ps(fscal,dy11);
448 tz = _mm256_mul_ps(fscal,dz11);
450 /* Update vectorial force */
451 fix1 = _mm256_add_ps(fix1,tx);
452 fiy1 = _mm256_add_ps(fiy1,ty);
453 fiz1 = _mm256_add_ps(fiz1,tz);
455 fjx1 = _mm256_add_ps(fjx1,tx);
456 fjy1 = _mm256_add_ps(fjy1,ty);
457 fjz1 = _mm256_add_ps(fjz1,tz);
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 if (gmx_mm256_any_lt(rsq12,rcutoff2))
468 /* REACTION-FIELD ELECTROSTATICS */
469 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
470 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
472 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
474 /* Update potential sum for this i atom from the interaction with this j atom. */
475 velec = _mm256_and_ps(velec,cutoff_mask);
476 velecsum = _mm256_add_ps(velecsum,velec);
480 fscal = _mm256_and_ps(fscal,cutoff_mask);
482 /* Calculate temporary vectorial force */
483 tx = _mm256_mul_ps(fscal,dx12);
484 ty = _mm256_mul_ps(fscal,dy12);
485 tz = _mm256_mul_ps(fscal,dz12);
487 /* Update vectorial force */
488 fix1 = _mm256_add_ps(fix1,tx);
489 fiy1 = _mm256_add_ps(fiy1,ty);
490 fiz1 = _mm256_add_ps(fiz1,tz);
492 fjx2 = _mm256_add_ps(fjx2,tx);
493 fjy2 = _mm256_add_ps(fjy2,ty);
494 fjz2 = _mm256_add_ps(fjz2,tz);
498 /**************************
499 * CALCULATE INTERACTIONS *
500 **************************/
502 if (gmx_mm256_any_lt(rsq13,rcutoff2))
505 /* REACTION-FIELD ELECTROSTATICS */
506 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
507 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
509 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
511 /* Update potential sum for this i atom from the interaction with this j atom. */
512 velec = _mm256_and_ps(velec,cutoff_mask);
513 velecsum = _mm256_add_ps(velecsum,velec);
517 fscal = _mm256_and_ps(fscal,cutoff_mask);
519 /* Calculate temporary vectorial force */
520 tx = _mm256_mul_ps(fscal,dx13);
521 ty = _mm256_mul_ps(fscal,dy13);
522 tz = _mm256_mul_ps(fscal,dz13);
524 /* Update vectorial force */
525 fix1 = _mm256_add_ps(fix1,tx);
526 fiy1 = _mm256_add_ps(fiy1,ty);
527 fiz1 = _mm256_add_ps(fiz1,tz);
529 fjx3 = _mm256_add_ps(fjx3,tx);
530 fjy3 = _mm256_add_ps(fjy3,ty);
531 fjz3 = _mm256_add_ps(fjz3,tz);
535 /**************************
536 * CALCULATE INTERACTIONS *
537 **************************/
539 if (gmx_mm256_any_lt(rsq21,rcutoff2))
542 /* REACTION-FIELD ELECTROSTATICS */
543 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
544 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
546 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velec = _mm256_and_ps(velec,cutoff_mask);
550 velecsum = _mm256_add_ps(velecsum,velec);
554 fscal = _mm256_and_ps(fscal,cutoff_mask);
556 /* Calculate temporary vectorial force */
557 tx = _mm256_mul_ps(fscal,dx21);
558 ty = _mm256_mul_ps(fscal,dy21);
559 tz = _mm256_mul_ps(fscal,dz21);
561 /* Update vectorial force */
562 fix2 = _mm256_add_ps(fix2,tx);
563 fiy2 = _mm256_add_ps(fiy2,ty);
564 fiz2 = _mm256_add_ps(fiz2,tz);
566 fjx1 = _mm256_add_ps(fjx1,tx);
567 fjy1 = _mm256_add_ps(fjy1,ty);
568 fjz1 = _mm256_add_ps(fjz1,tz);
572 /**************************
573 * CALCULATE INTERACTIONS *
574 **************************/
576 if (gmx_mm256_any_lt(rsq22,rcutoff2))
579 /* REACTION-FIELD ELECTROSTATICS */
580 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
581 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
583 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velec = _mm256_and_ps(velec,cutoff_mask);
587 velecsum = _mm256_add_ps(velecsum,velec);
591 fscal = _mm256_and_ps(fscal,cutoff_mask);
593 /* Calculate temporary vectorial force */
594 tx = _mm256_mul_ps(fscal,dx22);
595 ty = _mm256_mul_ps(fscal,dy22);
596 tz = _mm256_mul_ps(fscal,dz22);
598 /* Update vectorial force */
599 fix2 = _mm256_add_ps(fix2,tx);
600 fiy2 = _mm256_add_ps(fiy2,ty);
601 fiz2 = _mm256_add_ps(fiz2,tz);
603 fjx2 = _mm256_add_ps(fjx2,tx);
604 fjy2 = _mm256_add_ps(fjy2,ty);
605 fjz2 = _mm256_add_ps(fjz2,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 if (gmx_mm256_any_lt(rsq23,rcutoff2))
616 /* REACTION-FIELD ELECTROSTATICS */
617 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
618 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
620 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
622 /* Update potential sum for this i atom from the interaction with this j atom. */
623 velec = _mm256_and_ps(velec,cutoff_mask);
624 velecsum = _mm256_add_ps(velecsum,velec);
628 fscal = _mm256_and_ps(fscal,cutoff_mask);
630 /* Calculate temporary vectorial force */
631 tx = _mm256_mul_ps(fscal,dx23);
632 ty = _mm256_mul_ps(fscal,dy23);
633 tz = _mm256_mul_ps(fscal,dz23);
635 /* Update vectorial force */
636 fix2 = _mm256_add_ps(fix2,tx);
637 fiy2 = _mm256_add_ps(fiy2,ty);
638 fiz2 = _mm256_add_ps(fiz2,tz);
640 fjx3 = _mm256_add_ps(fjx3,tx);
641 fjy3 = _mm256_add_ps(fjy3,ty);
642 fjz3 = _mm256_add_ps(fjz3,tz);
646 /**************************
647 * CALCULATE INTERACTIONS *
648 **************************/
650 if (gmx_mm256_any_lt(rsq31,rcutoff2))
653 /* REACTION-FIELD ELECTROSTATICS */
654 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
655 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
657 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
659 /* Update potential sum for this i atom from the interaction with this j atom. */
660 velec = _mm256_and_ps(velec,cutoff_mask);
661 velecsum = _mm256_add_ps(velecsum,velec);
665 fscal = _mm256_and_ps(fscal,cutoff_mask);
667 /* Calculate temporary vectorial force */
668 tx = _mm256_mul_ps(fscal,dx31);
669 ty = _mm256_mul_ps(fscal,dy31);
670 tz = _mm256_mul_ps(fscal,dz31);
672 /* Update vectorial force */
673 fix3 = _mm256_add_ps(fix3,tx);
674 fiy3 = _mm256_add_ps(fiy3,ty);
675 fiz3 = _mm256_add_ps(fiz3,tz);
677 fjx1 = _mm256_add_ps(fjx1,tx);
678 fjy1 = _mm256_add_ps(fjy1,ty);
679 fjz1 = _mm256_add_ps(fjz1,tz);
683 /**************************
684 * CALCULATE INTERACTIONS *
685 **************************/
687 if (gmx_mm256_any_lt(rsq32,rcutoff2))
690 /* REACTION-FIELD ELECTROSTATICS */
691 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
692 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
694 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
696 /* Update potential sum for this i atom from the interaction with this j atom. */
697 velec = _mm256_and_ps(velec,cutoff_mask);
698 velecsum = _mm256_add_ps(velecsum,velec);
702 fscal = _mm256_and_ps(fscal,cutoff_mask);
704 /* Calculate temporary vectorial force */
705 tx = _mm256_mul_ps(fscal,dx32);
706 ty = _mm256_mul_ps(fscal,dy32);
707 tz = _mm256_mul_ps(fscal,dz32);
709 /* Update vectorial force */
710 fix3 = _mm256_add_ps(fix3,tx);
711 fiy3 = _mm256_add_ps(fiy3,ty);
712 fiz3 = _mm256_add_ps(fiz3,tz);
714 fjx2 = _mm256_add_ps(fjx2,tx);
715 fjy2 = _mm256_add_ps(fjy2,ty);
716 fjz2 = _mm256_add_ps(fjz2,tz);
720 /**************************
721 * CALCULATE INTERACTIONS *
722 **************************/
724 if (gmx_mm256_any_lt(rsq33,rcutoff2))
727 /* REACTION-FIELD ELECTROSTATICS */
728 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
729 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
731 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
733 /* Update potential sum for this i atom from the interaction with this j atom. */
734 velec = _mm256_and_ps(velec,cutoff_mask);
735 velecsum = _mm256_add_ps(velecsum,velec);
739 fscal = _mm256_and_ps(fscal,cutoff_mask);
741 /* Calculate temporary vectorial force */
742 tx = _mm256_mul_ps(fscal,dx33);
743 ty = _mm256_mul_ps(fscal,dy33);
744 tz = _mm256_mul_ps(fscal,dz33);
746 /* Update vectorial force */
747 fix3 = _mm256_add_ps(fix3,tx);
748 fiy3 = _mm256_add_ps(fiy3,ty);
749 fiz3 = _mm256_add_ps(fiz3,tz);
751 fjx3 = _mm256_add_ps(fjx3,tx);
752 fjy3 = _mm256_add_ps(fjy3,ty);
753 fjz3 = _mm256_add_ps(fjz3,tz);
757 fjptrA = f+j_coord_offsetA;
758 fjptrB = f+j_coord_offsetB;
759 fjptrC = f+j_coord_offsetC;
760 fjptrD = f+j_coord_offsetD;
761 fjptrE = f+j_coord_offsetE;
762 fjptrF = f+j_coord_offsetF;
763 fjptrG = f+j_coord_offsetG;
764 fjptrH = f+j_coord_offsetH;
766 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
767 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
768 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
770 /* Inner loop uses 387 flops */
776 /* Get j neighbor index, and coordinate index */
777 jnrlistA = jjnr[jidx];
778 jnrlistB = jjnr[jidx+1];
779 jnrlistC = jjnr[jidx+2];
780 jnrlistD = jjnr[jidx+3];
781 jnrlistE = jjnr[jidx+4];
782 jnrlistF = jjnr[jidx+5];
783 jnrlistG = jjnr[jidx+6];
784 jnrlistH = jjnr[jidx+7];
785 /* Sign of each element will be negative for non-real atoms.
786 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
787 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
789 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
790 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
792 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
793 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
794 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
795 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
796 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
797 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
798 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
799 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
800 j_coord_offsetA = DIM*jnrA;
801 j_coord_offsetB = DIM*jnrB;
802 j_coord_offsetC = DIM*jnrC;
803 j_coord_offsetD = DIM*jnrD;
804 j_coord_offsetE = DIM*jnrE;
805 j_coord_offsetF = DIM*jnrF;
806 j_coord_offsetG = DIM*jnrG;
807 j_coord_offsetH = DIM*jnrH;
809 /* load j atom coordinates */
810 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
811 x+j_coord_offsetC,x+j_coord_offsetD,
812 x+j_coord_offsetE,x+j_coord_offsetF,
813 x+j_coord_offsetG,x+j_coord_offsetH,
814 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
815 &jy2,&jz2,&jx3,&jy3,&jz3);
817 /* Calculate displacement vector */
818 dx00 = _mm256_sub_ps(ix0,jx0);
819 dy00 = _mm256_sub_ps(iy0,jy0);
820 dz00 = _mm256_sub_ps(iz0,jz0);
821 dx11 = _mm256_sub_ps(ix1,jx1);
822 dy11 = _mm256_sub_ps(iy1,jy1);
823 dz11 = _mm256_sub_ps(iz1,jz1);
824 dx12 = _mm256_sub_ps(ix1,jx2);
825 dy12 = _mm256_sub_ps(iy1,jy2);
826 dz12 = _mm256_sub_ps(iz1,jz2);
827 dx13 = _mm256_sub_ps(ix1,jx3);
828 dy13 = _mm256_sub_ps(iy1,jy3);
829 dz13 = _mm256_sub_ps(iz1,jz3);
830 dx21 = _mm256_sub_ps(ix2,jx1);
831 dy21 = _mm256_sub_ps(iy2,jy1);
832 dz21 = _mm256_sub_ps(iz2,jz1);
833 dx22 = _mm256_sub_ps(ix2,jx2);
834 dy22 = _mm256_sub_ps(iy2,jy2);
835 dz22 = _mm256_sub_ps(iz2,jz2);
836 dx23 = _mm256_sub_ps(ix2,jx3);
837 dy23 = _mm256_sub_ps(iy2,jy3);
838 dz23 = _mm256_sub_ps(iz2,jz3);
839 dx31 = _mm256_sub_ps(ix3,jx1);
840 dy31 = _mm256_sub_ps(iy3,jy1);
841 dz31 = _mm256_sub_ps(iz3,jz1);
842 dx32 = _mm256_sub_ps(ix3,jx2);
843 dy32 = _mm256_sub_ps(iy3,jy2);
844 dz32 = _mm256_sub_ps(iz3,jz2);
845 dx33 = _mm256_sub_ps(ix3,jx3);
846 dy33 = _mm256_sub_ps(iy3,jy3);
847 dz33 = _mm256_sub_ps(iz3,jz3);
849 /* Calculate squared distance and things based on it */
850 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
851 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
852 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
853 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
854 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
855 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
856 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
857 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
858 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
859 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
861 rinv00 = avx256_invsqrt_f(rsq00);
862 rinv11 = avx256_invsqrt_f(rsq11);
863 rinv12 = avx256_invsqrt_f(rsq12);
864 rinv13 = avx256_invsqrt_f(rsq13);
865 rinv21 = avx256_invsqrt_f(rsq21);
866 rinv22 = avx256_invsqrt_f(rsq22);
867 rinv23 = avx256_invsqrt_f(rsq23);
868 rinv31 = avx256_invsqrt_f(rsq31);
869 rinv32 = avx256_invsqrt_f(rsq32);
870 rinv33 = avx256_invsqrt_f(rsq33);
872 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
873 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
874 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
875 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
876 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
877 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
878 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
879 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
880 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
882 fjx0 = _mm256_setzero_ps();
883 fjy0 = _mm256_setzero_ps();
884 fjz0 = _mm256_setzero_ps();
885 fjx1 = _mm256_setzero_ps();
886 fjy1 = _mm256_setzero_ps();
887 fjz1 = _mm256_setzero_ps();
888 fjx2 = _mm256_setzero_ps();
889 fjy2 = _mm256_setzero_ps();
890 fjz2 = _mm256_setzero_ps();
891 fjx3 = _mm256_setzero_ps();
892 fjy3 = _mm256_setzero_ps();
893 fjz3 = _mm256_setzero_ps();
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 if (gmx_mm256_any_lt(rsq00,rcutoff2))
902 r00 = _mm256_mul_ps(rsq00,rinv00);
903 r00 = _mm256_andnot_ps(dummy_mask,r00);
905 /* Calculate table index by multiplying r with table scale and truncate to integer */
906 rt = _mm256_mul_ps(r00,vftabscale);
907 vfitab = _mm256_cvttps_epi32(rt);
908 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
909 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
910 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
911 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
912 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
913 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
915 /* CUBIC SPLINE TABLE DISPERSION */
916 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
917 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
918 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
919 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
920 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
921 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
922 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
923 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
924 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
925 Heps = _mm256_mul_ps(vfeps,H);
926 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
927 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
928 vvdw6 = _mm256_mul_ps(c6_00,VV);
929 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
930 fvdw6 = _mm256_mul_ps(c6_00,FF);
932 /* CUBIC SPLINE TABLE REPULSION */
933 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
934 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
935 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
936 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
937 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
938 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
939 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
940 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
941 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
942 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
943 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
944 Heps = _mm256_mul_ps(vfeps,H);
945 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
946 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
947 vvdw12 = _mm256_mul_ps(c12_00,VV);
948 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
949 fvdw12 = _mm256_mul_ps(c12_00,FF);
950 vvdw = _mm256_add_ps(vvdw12,vvdw6);
951 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
953 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
955 /* Update potential sum for this i atom from the interaction with this j atom. */
956 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
957 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
958 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
962 fscal = _mm256_and_ps(fscal,cutoff_mask);
964 fscal = _mm256_andnot_ps(dummy_mask,fscal);
966 /* Calculate temporary vectorial force */
967 tx = _mm256_mul_ps(fscal,dx00);
968 ty = _mm256_mul_ps(fscal,dy00);
969 tz = _mm256_mul_ps(fscal,dz00);
971 /* Update vectorial force */
972 fix0 = _mm256_add_ps(fix0,tx);
973 fiy0 = _mm256_add_ps(fiy0,ty);
974 fiz0 = _mm256_add_ps(fiz0,tz);
976 fjx0 = _mm256_add_ps(fjx0,tx);
977 fjy0 = _mm256_add_ps(fjy0,ty);
978 fjz0 = _mm256_add_ps(fjz0,tz);
982 /**************************
983 * CALCULATE INTERACTIONS *
984 **************************/
986 if (gmx_mm256_any_lt(rsq11,rcutoff2))
989 /* REACTION-FIELD ELECTROSTATICS */
990 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
991 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
993 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
995 /* Update potential sum for this i atom from the interaction with this j atom. */
996 velec = _mm256_and_ps(velec,cutoff_mask);
997 velec = _mm256_andnot_ps(dummy_mask,velec);
998 velecsum = _mm256_add_ps(velecsum,velec);
1002 fscal = _mm256_and_ps(fscal,cutoff_mask);
1004 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1006 /* Calculate temporary vectorial force */
1007 tx = _mm256_mul_ps(fscal,dx11);
1008 ty = _mm256_mul_ps(fscal,dy11);
1009 tz = _mm256_mul_ps(fscal,dz11);
1011 /* Update vectorial force */
1012 fix1 = _mm256_add_ps(fix1,tx);
1013 fiy1 = _mm256_add_ps(fiy1,ty);
1014 fiz1 = _mm256_add_ps(fiz1,tz);
1016 fjx1 = _mm256_add_ps(fjx1,tx);
1017 fjy1 = _mm256_add_ps(fjy1,ty);
1018 fjz1 = _mm256_add_ps(fjz1,tz);
1022 /**************************
1023 * CALCULATE INTERACTIONS *
1024 **************************/
1026 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1029 /* REACTION-FIELD ELECTROSTATICS */
1030 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
1031 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1033 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1035 /* Update potential sum for this i atom from the interaction with this j atom. */
1036 velec = _mm256_and_ps(velec,cutoff_mask);
1037 velec = _mm256_andnot_ps(dummy_mask,velec);
1038 velecsum = _mm256_add_ps(velecsum,velec);
1042 fscal = _mm256_and_ps(fscal,cutoff_mask);
1044 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1046 /* Calculate temporary vectorial force */
1047 tx = _mm256_mul_ps(fscal,dx12);
1048 ty = _mm256_mul_ps(fscal,dy12);
1049 tz = _mm256_mul_ps(fscal,dz12);
1051 /* Update vectorial force */
1052 fix1 = _mm256_add_ps(fix1,tx);
1053 fiy1 = _mm256_add_ps(fiy1,ty);
1054 fiz1 = _mm256_add_ps(fiz1,tz);
1056 fjx2 = _mm256_add_ps(fjx2,tx);
1057 fjy2 = _mm256_add_ps(fjy2,ty);
1058 fjz2 = _mm256_add_ps(fjz2,tz);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1069 /* REACTION-FIELD ELECTROSTATICS */
1070 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
1071 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1073 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1075 /* Update potential sum for this i atom from the interaction with this j atom. */
1076 velec = _mm256_and_ps(velec,cutoff_mask);
1077 velec = _mm256_andnot_ps(dummy_mask,velec);
1078 velecsum = _mm256_add_ps(velecsum,velec);
1082 fscal = _mm256_and_ps(fscal,cutoff_mask);
1084 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1086 /* Calculate temporary vectorial force */
1087 tx = _mm256_mul_ps(fscal,dx13);
1088 ty = _mm256_mul_ps(fscal,dy13);
1089 tz = _mm256_mul_ps(fscal,dz13);
1091 /* Update vectorial force */
1092 fix1 = _mm256_add_ps(fix1,tx);
1093 fiy1 = _mm256_add_ps(fiy1,ty);
1094 fiz1 = _mm256_add_ps(fiz1,tz);
1096 fjx3 = _mm256_add_ps(fjx3,tx);
1097 fjy3 = _mm256_add_ps(fjy3,ty);
1098 fjz3 = _mm256_add_ps(fjz3,tz);
1102 /**************************
1103 * CALCULATE INTERACTIONS *
1104 **************************/
1106 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1109 /* REACTION-FIELD ELECTROSTATICS */
1110 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1111 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1113 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1115 /* Update potential sum for this i atom from the interaction with this j atom. */
1116 velec = _mm256_and_ps(velec,cutoff_mask);
1117 velec = _mm256_andnot_ps(dummy_mask,velec);
1118 velecsum = _mm256_add_ps(velecsum,velec);
1122 fscal = _mm256_and_ps(fscal,cutoff_mask);
1124 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1126 /* Calculate temporary vectorial force */
1127 tx = _mm256_mul_ps(fscal,dx21);
1128 ty = _mm256_mul_ps(fscal,dy21);
1129 tz = _mm256_mul_ps(fscal,dz21);
1131 /* Update vectorial force */
1132 fix2 = _mm256_add_ps(fix2,tx);
1133 fiy2 = _mm256_add_ps(fiy2,ty);
1134 fiz2 = _mm256_add_ps(fiz2,tz);
1136 fjx1 = _mm256_add_ps(fjx1,tx);
1137 fjy1 = _mm256_add_ps(fjy1,ty);
1138 fjz1 = _mm256_add_ps(fjz1,tz);
1142 /**************************
1143 * CALCULATE INTERACTIONS *
1144 **************************/
1146 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1149 /* REACTION-FIELD ELECTROSTATICS */
1150 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1151 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1153 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1155 /* Update potential sum for this i atom from the interaction with this j atom. */
1156 velec = _mm256_and_ps(velec,cutoff_mask);
1157 velec = _mm256_andnot_ps(dummy_mask,velec);
1158 velecsum = _mm256_add_ps(velecsum,velec);
1162 fscal = _mm256_and_ps(fscal,cutoff_mask);
1164 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1166 /* Calculate temporary vectorial force */
1167 tx = _mm256_mul_ps(fscal,dx22);
1168 ty = _mm256_mul_ps(fscal,dy22);
1169 tz = _mm256_mul_ps(fscal,dz22);
1171 /* Update vectorial force */
1172 fix2 = _mm256_add_ps(fix2,tx);
1173 fiy2 = _mm256_add_ps(fiy2,ty);
1174 fiz2 = _mm256_add_ps(fiz2,tz);
1176 fjx2 = _mm256_add_ps(fjx2,tx);
1177 fjy2 = _mm256_add_ps(fjy2,ty);
1178 fjz2 = _mm256_add_ps(fjz2,tz);
1182 /**************************
1183 * CALCULATE INTERACTIONS *
1184 **************************/
1186 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1189 /* REACTION-FIELD ELECTROSTATICS */
1190 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
1191 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1193 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1195 /* Update potential sum for this i atom from the interaction with this j atom. */
1196 velec = _mm256_and_ps(velec,cutoff_mask);
1197 velec = _mm256_andnot_ps(dummy_mask,velec);
1198 velecsum = _mm256_add_ps(velecsum,velec);
1202 fscal = _mm256_and_ps(fscal,cutoff_mask);
1204 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1206 /* Calculate temporary vectorial force */
1207 tx = _mm256_mul_ps(fscal,dx23);
1208 ty = _mm256_mul_ps(fscal,dy23);
1209 tz = _mm256_mul_ps(fscal,dz23);
1211 /* Update vectorial force */
1212 fix2 = _mm256_add_ps(fix2,tx);
1213 fiy2 = _mm256_add_ps(fiy2,ty);
1214 fiz2 = _mm256_add_ps(fiz2,tz);
1216 fjx3 = _mm256_add_ps(fjx3,tx);
1217 fjy3 = _mm256_add_ps(fjy3,ty);
1218 fjz3 = _mm256_add_ps(fjz3,tz);
1222 /**************************
1223 * CALCULATE INTERACTIONS *
1224 **************************/
1226 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1229 /* REACTION-FIELD ELECTROSTATICS */
1230 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
1231 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1233 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1235 /* Update potential sum for this i atom from the interaction with this j atom. */
1236 velec = _mm256_and_ps(velec,cutoff_mask);
1237 velec = _mm256_andnot_ps(dummy_mask,velec);
1238 velecsum = _mm256_add_ps(velecsum,velec);
1242 fscal = _mm256_and_ps(fscal,cutoff_mask);
1244 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1246 /* Calculate temporary vectorial force */
1247 tx = _mm256_mul_ps(fscal,dx31);
1248 ty = _mm256_mul_ps(fscal,dy31);
1249 tz = _mm256_mul_ps(fscal,dz31);
1251 /* Update vectorial force */
1252 fix3 = _mm256_add_ps(fix3,tx);
1253 fiy3 = _mm256_add_ps(fiy3,ty);
1254 fiz3 = _mm256_add_ps(fiz3,tz);
1256 fjx1 = _mm256_add_ps(fjx1,tx);
1257 fjy1 = _mm256_add_ps(fjy1,ty);
1258 fjz1 = _mm256_add_ps(fjz1,tz);
1262 /**************************
1263 * CALCULATE INTERACTIONS *
1264 **************************/
1266 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1269 /* REACTION-FIELD ELECTROSTATICS */
1270 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
1271 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1273 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1275 /* Update potential sum for this i atom from the interaction with this j atom. */
1276 velec = _mm256_and_ps(velec,cutoff_mask);
1277 velec = _mm256_andnot_ps(dummy_mask,velec);
1278 velecsum = _mm256_add_ps(velecsum,velec);
1282 fscal = _mm256_and_ps(fscal,cutoff_mask);
1284 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1286 /* Calculate temporary vectorial force */
1287 tx = _mm256_mul_ps(fscal,dx32);
1288 ty = _mm256_mul_ps(fscal,dy32);
1289 tz = _mm256_mul_ps(fscal,dz32);
1291 /* Update vectorial force */
1292 fix3 = _mm256_add_ps(fix3,tx);
1293 fiy3 = _mm256_add_ps(fiy3,ty);
1294 fiz3 = _mm256_add_ps(fiz3,tz);
1296 fjx2 = _mm256_add_ps(fjx2,tx);
1297 fjy2 = _mm256_add_ps(fjy2,ty);
1298 fjz2 = _mm256_add_ps(fjz2,tz);
1302 /**************************
1303 * CALCULATE INTERACTIONS *
1304 **************************/
1306 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1309 /* REACTION-FIELD ELECTROSTATICS */
1310 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
1311 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1313 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1315 /* Update potential sum for this i atom from the interaction with this j atom. */
1316 velec = _mm256_and_ps(velec,cutoff_mask);
1317 velec = _mm256_andnot_ps(dummy_mask,velec);
1318 velecsum = _mm256_add_ps(velecsum,velec);
1322 fscal = _mm256_and_ps(fscal,cutoff_mask);
1324 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1326 /* Calculate temporary vectorial force */
1327 tx = _mm256_mul_ps(fscal,dx33);
1328 ty = _mm256_mul_ps(fscal,dy33);
1329 tz = _mm256_mul_ps(fscal,dz33);
1331 /* Update vectorial force */
1332 fix3 = _mm256_add_ps(fix3,tx);
1333 fiy3 = _mm256_add_ps(fiy3,ty);
1334 fiz3 = _mm256_add_ps(fiz3,tz);
1336 fjx3 = _mm256_add_ps(fjx3,tx);
1337 fjy3 = _mm256_add_ps(fjy3,ty);
1338 fjz3 = _mm256_add_ps(fjz3,tz);
1342 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1343 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1344 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1345 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1346 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1347 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1348 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1349 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1351 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1352 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1353 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1355 /* Inner loop uses 388 flops */
1358 /* End of innermost loop */
1360 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1361 f+i_coord_offset,fshift+i_shift_offset);
1364 /* Update potential energies */
1365 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1366 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1368 /* Increment number of inner iterations */
1369 inneriter += j_index_end - j_index_start;
1371 /* Outer loop uses 26 flops */
1374 /* Increment number of outer iterations */
1377 /* Update outer/inner flops */
1379 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*388);
1382 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_single
1383 * Electrostatics interaction: ReactionField
1384 * VdW interaction: CubicSplineTable
1385 * Geometry: Water4-Water4
1386 * Calculate force/pot: Force
1389 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_single
1390 (t_nblist * gmx_restrict nlist,
1391 rvec * gmx_restrict xx,
1392 rvec * gmx_restrict ff,
1393 struct t_forcerec * gmx_restrict fr,
1394 t_mdatoms * gmx_restrict mdatoms,
1395 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1396 t_nrnb * gmx_restrict nrnb)
1398 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1399 * just 0 for non-waters.
1400 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1401 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1403 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1404 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1405 int jnrA,jnrB,jnrC,jnrD;
1406 int jnrE,jnrF,jnrG,jnrH;
1407 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1408 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1409 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1410 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1411 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1412 real rcutoff_scalar;
1413 real *shiftvec,*fshift,*x,*f;
1414 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1415 real scratch[4*DIM];
1416 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1417 real * vdwioffsetptr0;
1418 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1419 real * vdwioffsetptr1;
1420 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1421 real * vdwioffsetptr2;
1422 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1423 real * vdwioffsetptr3;
1424 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1425 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1426 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1427 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1428 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1429 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1430 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1431 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1432 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1433 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1434 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1435 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1436 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1437 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1438 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1439 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1440 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1441 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1442 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1443 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1446 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1449 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1450 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1452 __m128i vfitab_lo,vfitab_hi;
1453 __m128i ifour = _mm_set1_epi32(4);
1454 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1456 __m256 dummy_mask,cutoff_mask;
1457 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1458 __m256 one = _mm256_set1_ps(1.0);
1459 __m256 two = _mm256_set1_ps(2.0);
1465 jindex = nlist->jindex;
1467 shiftidx = nlist->shift;
1469 shiftvec = fr->shift_vec[0];
1470 fshift = fr->fshift[0];
1471 facel = _mm256_set1_ps(fr->ic->epsfac);
1472 charge = mdatoms->chargeA;
1473 krf = _mm256_set1_ps(fr->ic->k_rf);
1474 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
1475 crf = _mm256_set1_ps(fr->ic->c_rf);
1476 nvdwtype = fr->ntype;
1477 vdwparam = fr->nbfp;
1478 vdwtype = mdatoms->typeA;
1480 vftab = kernel_data->table_vdw->data;
1481 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
1483 /* Setup water-specific parameters */
1484 inr = nlist->iinr[0];
1485 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1486 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1487 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1488 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1490 jq1 = _mm256_set1_ps(charge[inr+1]);
1491 jq2 = _mm256_set1_ps(charge[inr+2]);
1492 jq3 = _mm256_set1_ps(charge[inr+3]);
1493 vdwjidx0A = 2*vdwtype[inr+0];
1494 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1495 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1496 qq11 = _mm256_mul_ps(iq1,jq1);
1497 qq12 = _mm256_mul_ps(iq1,jq2);
1498 qq13 = _mm256_mul_ps(iq1,jq3);
1499 qq21 = _mm256_mul_ps(iq2,jq1);
1500 qq22 = _mm256_mul_ps(iq2,jq2);
1501 qq23 = _mm256_mul_ps(iq2,jq3);
1502 qq31 = _mm256_mul_ps(iq3,jq1);
1503 qq32 = _mm256_mul_ps(iq3,jq2);
1504 qq33 = _mm256_mul_ps(iq3,jq3);
1506 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1507 rcutoff_scalar = fr->ic->rcoulomb;
1508 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1509 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1511 /* Avoid stupid compiler warnings */
1512 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1513 j_coord_offsetA = 0;
1514 j_coord_offsetB = 0;
1515 j_coord_offsetC = 0;
1516 j_coord_offsetD = 0;
1517 j_coord_offsetE = 0;
1518 j_coord_offsetF = 0;
1519 j_coord_offsetG = 0;
1520 j_coord_offsetH = 0;
1525 for(iidx=0;iidx<4*DIM;iidx++)
1527 scratch[iidx] = 0.0;
1530 /* Start outer loop over neighborlists */
1531 for(iidx=0; iidx<nri; iidx++)
1533 /* Load shift vector for this list */
1534 i_shift_offset = DIM*shiftidx[iidx];
1536 /* Load limits for loop over neighbors */
1537 j_index_start = jindex[iidx];
1538 j_index_end = jindex[iidx+1];
1540 /* Get outer coordinate index */
1542 i_coord_offset = DIM*inr;
1544 /* Load i particle coords and add shift vector */
1545 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1546 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1548 fix0 = _mm256_setzero_ps();
1549 fiy0 = _mm256_setzero_ps();
1550 fiz0 = _mm256_setzero_ps();
1551 fix1 = _mm256_setzero_ps();
1552 fiy1 = _mm256_setzero_ps();
1553 fiz1 = _mm256_setzero_ps();
1554 fix2 = _mm256_setzero_ps();
1555 fiy2 = _mm256_setzero_ps();
1556 fiz2 = _mm256_setzero_ps();
1557 fix3 = _mm256_setzero_ps();
1558 fiy3 = _mm256_setzero_ps();
1559 fiz3 = _mm256_setzero_ps();
1561 /* Start inner kernel loop */
1562 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1565 /* Get j neighbor index, and coordinate index */
1567 jnrB = jjnr[jidx+1];
1568 jnrC = jjnr[jidx+2];
1569 jnrD = jjnr[jidx+3];
1570 jnrE = jjnr[jidx+4];
1571 jnrF = jjnr[jidx+5];
1572 jnrG = jjnr[jidx+6];
1573 jnrH = jjnr[jidx+7];
1574 j_coord_offsetA = DIM*jnrA;
1575 j_coord_offsetB = DIM*jnrB;
1576 j_coord_offsetC = DIM*jnrC;
1577 j_coord_offsetD = DIM*jnrD;
1578 j_coord_offsetE = DIM*jnrE;
1579 j_coord_offsetF = DIM*jnrF;
1580 j_coord_offsetG = DIM*jnrG;
1581 j_coord_offsetH = DIM*jnrH;
1583 /* load j atom coordinates */
1584 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1585 x+j_coord_offsetC,x+j_coord_offsetD,
1586 x+j_coord_offsetE,x+j_coord_offsetF,
1587 x+j_coord_offsetG,x+j_coord_offsetH,
1588 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1589 &jy2,&jz2,&jx3,&jy3,&jz3);
1591 /* Calculate displacement vector */
1592 dx00 = _mm256_sub_ps(ix0,jx0);
1593 dy00 = _mm256_sub_ps(iy0,jy0);
1594 dz00 = _mm256_sub_ps(iz0,jz0);
1595 dx11 = _mm256_sub_ps(ix1,jx1);
1596 dy11 = _mm256_sub_ps(iy1,jy1);
1597 dz11 = _mm256_sub_ps(iz1,jz1);
1598 dx12 = _mm256_sub_ps(ix1,jx2);
1599 dy12 = _mm256_sub_ps(iy1,jy2);
1600 dz12 = _mm256_sub_ps(iz1,jz2);
1601 dx13 = _mm256_sub_ps(ix1,jx3);
1602 dy13 = _mm256_sub_ps(iy1,jy3);
1603 dz13 = _mm256_sub_ps(iz1,jz3);
1604 dx21 = _mm256_sub_ps(ix2,jx1);
1605 dy21 = _mm256_sub_ps(iy2,jy1);
1606 dz21 = _mm256_sub_ps(iz2,jz1);
1607 dx22 = _mm256_sub_ps(ix2,jx2);
1608 dy22 = _mm256_sub_ps(iy2,jy2);
1609 dz22 = _mm256_sub_ps(iz2,jz2);
1610 dx23 = _mm256_sub_ps(ix2,jx3);
1611 dy23 = _mm256_sub_ps(iy2,jy3);
1612 dz23 = _mm256_sub_ps(iz2,jz3);
1613 dx31 = _mm256_sub_ps(ix3,jx1);
1614 dy31 = _mm256_sub_ps(iy3,jy1);
1615 dz31 = _mm256_sub_ps(iz3,jz1);
1616 dx32 = _mm256_sub_ps(ix3,jx2);
1617 dy32 = _mm256_sub_ps(iy3,jy2);
1618 dz32 = _mm256_sub_ps(iz3,jz2);
1619 dx33 = _mm256_sub_ps(ix3,jx3);
1620 dy33 = _mm256_sub_ps(iy3,jy3);
1621 dz33 = _mm256_sub_ps(iz3,jz3);
1623 /* Calculate squared distance and things based on it */
1624 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1625 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1626 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1627 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1628 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1629 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1630 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1631 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1632 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1633 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1635 rinv00 = avx256_invsqrt_f(rsq00);
1636 rinv11 = avx256_invsqrt_f(rsq11);
1637 rinv12 = avx256_invsqrt_f(rsq12);
1638 rinv13 = avx256_invsqrt_f(rsq13);
1639 rinv21 = avx256_invsqrt_f(rsq21);
1640 rinv22 = avx256_invsqrt_f(rsq22);
1641 rinv23 = avx256_invsqrt_f(rsq23);
1642 rinv31 = avx256_invsqrt_f(rsq31);
1643 rinv32 = avx256_invsqrt_f(rsq32);
1644 rinv33 = avx256_invsqrt_f(rsq33);
1646 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1647 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1648 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1649 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1650 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1651 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1652 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1653 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1654 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1656 fjx0 = _mm256_setzero_ps();
1657 fjy0 = _mm256_setzero_ps();
1658 fjz0 = _mm256_setzero_ps();
1659 fjx1 = _mm256_setzero_ps();
1660 fjy1 = _mm256_setzero_ps();
1661 fjz1 = _mm256_setzero_ps();
1662 fjx2 = _mm256_setzero_ps();
1663 fjy2 = _mm256_setzero_ps();
1664 fjz2 = _mm256_setzero_ps();
1665 fjx3 = _mm256_setzero_ps();
1666 fjy3 = _mm256_setzero_ps();
1667 fjz3 = _mm256_setzero_ps();
1669 /**************************
1670 * CALCULATE INTERACTIONS *
1671 **************************/
1673 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1676 r00 = _mm256_mul_ps(rsq00,rinv00);
1678 /* Calculate table index by multiplying r with table scale and truncate to integer */
1679 rt = _mm256_mul_ps(r00,vftabscale);
1680 vfitab = _mm256_cvttps_epi32(rt);
1681 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1682 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1683 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1684 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1685 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
1686 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
1688 /* CUBIC SPLINE TABLE DISPERSION */
1689 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1690 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1691 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1692 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1693 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1694 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1695 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1696 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1697 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1698 Heps = _mm256_mul_ps(vfeps,H);
1699 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1700 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1701 fvdw6 = _mm256_mul_ps(c6_00,FF);
1703 /* CUBIC SPLINE TABLE REPULSION */
1704 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1705 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1706 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1707 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1708 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1709 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1710 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1711 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1712 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1713 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1714 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1715 Heps = _mm256_mul_ps(vfeps,H);
1716 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1717 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1718 fvdw12 = _mm256_mul_ps(c12_00,FF);
1719 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1721 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1725 fscal = _mm256_and_ps(fscal,cutoff_mask);
1727 /* Calculate temporary vectorial force */
1728 tx = _mm256_mul_ps(fscal,dx00);
1729 ty = _mm256_mul_ps(fscal,dy00);
1730 tz = _mm256_mul_ps(fscal,dz00);
1732 /* Update vectorial force */
1733 fix0 = _mm256_add_ps(fix0,tx);
1734 fiy0 = _mm256_add_ps(fiy0,ty);
1735 fiz0 = _mm256_add_ps(fiz0,tz);
1737 fjx0 = _mm256_add_ps(fjx0,tx);
1738 fjy0 = _mm256_add_ps(fjy0,ty);
1739 fjz0 = _mm256_add_ps(fjz0,tz);
1743 /**************************
1744 * CALCULATE INTERACTIONS *
1745 **************************/
1747 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1750 /* REACTION-FIELD ELECTROSTATICS */
1751 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1753 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1757 fscal = _mm256_and_ps(fscal,cutoff_mask);
1759 /* Calculate temporary vectorial force */
1760 tx = _mm256_mul_ps(fscal,dx11);
1761 ty = _mm256_mul_ps(fscal,dy11);
1762 tz = _mm256_mul_ps(fscal,dz11);
1764 /* Update vectorial force */
1765 fix1 = _mm256_add_ps(fix1,tx);
1766 fiy1 = _mm256_add_ps(fiy1,ty);
1767 fiz1 = _mm256_add_ps(fiz1,tz);
1769 fjx1 = _mm256_add_ps(fjx1,tx);
1770 fjy1 = _mm256_add_ps(fjy1,ty);
1771 fjz1 = _mm256_add_ps(fjz1,tz);
1775 /**************************
1776 * CALCULATE INTERACTIONS *
1777 **************************/
1779 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1782 /* REACTION-FIELD ELECTROSTATICS */
1783 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1785 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1789 fscal = _mm256_and_ps(fscal,cutoff_mask);
1791 /* Calculate temporary vectorial force */
1792 tx = _mm256_mul_ps(fscal,dx12);
1793 ty = _mm256_mul_ps(fscal,dy12);
1794 tz = _mm256_mul_ps(fscal,dz12);
1796 /* Update vectorial force */
1797 fix1 = _mm256_add_ps(fix1,tx);
1798 fiy1 = _mm256_add_ps(fiy1,ty);
1799 fiz1 = _mm256_add_ps(fiz1,tz);
1801 fjx2 = _mm256_add_ps(fjx2,tx);
1802 fjy2 = _mm256_add_ps(fjy2,ty);
1803 fjz2 = _mm256_add_ps(fjz2,tz);
1807 /**************************
1808 * CALCULATE INTERACTIONS *
1809 **************************/
1811 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1814 /* REACTION-FIELD ELECTROSTATICS */
1815 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1817 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1821 fscal = _mm256_and_ps(fscal,cutoff_mask);
1823 /* Calculate temporary vectorial force */
1824 tx = _mm256_mul_ps(fscal,dx13);
1825 ty = _mm256_mul_ps(fscal,dy13);
1826 tz = _mm256_mul_ps(fscal,dz13);
1828 /* Update vectorial force */
1829 fix1 = _mm256_add_ps(fix1,tx);
1830 fiy1 = _mm256_add_ps(fiy1,ty);
1831 fiz1 = _mm256_add_ps(fiz1,tz);
1833 fjx3 = _mm256_add_ps(fjx3,tx);
1834 fjy3 = _mm256_add_ps(fjy3,ty);
1835 fjz3 = _mm256_add_ps(fjz3,tz);
1839 /**************************
1840 * CALCULATE INTERACTIONS *
1841 **************************/
1843 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1846 /* REACTION-FIELD ELECTROSTATICS */
1847 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1849 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1853 fscal = _mm256_and_ps(fscal,cutoff_mask);
1855 /* Calculate temporary vectorial force */
1856 tx = _mm256_mul_ps(fscal,dx21);
1857 ty = _mm256_mul_ps(fscal,dy21);
1858 tz = _mm256_mul_ps(fscal,dz21);
1860 /* Update vectorial force */
1861 fix2 = _mm256_add_ps(fix2,tx);
1862 fiy2 = _mm256_add_ps(fiy2,ty);
1863 fiz2 = _mm256_add_ps(fiz2,tz);
1865 fjx1 = _mm256_add_ps(fjx1,tx);
1866 fjy1 = _mm256_add_ps(fjy1,ty);
1867 fjz1 = _mm256_add_ps(fjz1,tz);
1871 /**************************
1872 * CALCULATE INTERACTIONS *
1873 **************************/
1875 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1878 /* REACTION-FIELD ELECTROSTATICS */
1879 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1881 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1885 fscal = _mm256_and_ps(fscal,cutoff_mask);
1887 /* Calculate temporary vectorial force */
1888 tx = _mm256_mul_ps(fscal,dx22);
1889 ty = _mm256_mul_ps(fscal,dy22);
1890 tz = _mm256_mul_ps(fscal,dz22);
1892 /* Update vectorial force */
1893 fix2 = _mm256_add_ps(fix2,tx);
1894 fiy2 = _mm256_add_ps(fiy2,ty);
1895 fiz2 = _mm256_add_ps(fiz2,tz);
1897 fjx2 = _mm256_add_ps(fjx2,tx);
1898 fjy2 = _mm256_add_ps(fjy2,ty);
1899 fjz2 = _mm256_add_ps(fjz2,tz);
1903 /**************************
1904 * CALCULATE INTERACTIONS *
1905 **************************/
1907 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1910 /* REACTION-FIELD ELECTROSTATICS */
1911 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1913 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1917 fscal = _mm256_and_ps(fscal,cutoff_mask);
1919 /* Calculate temporary vectorial force */
1920 tx = _mm256_mul_ps(fscal,dx23);
1921 ty = _mm256_mul_ps(fscal,dy23);
1922 tz = _mm256_mul_ps(fscal,dz23);
1924 /* Update vectorial force */
1925 fix2 = _mm256_add_ps(fix2,tx);
1926 fiy2 = _mm256_add_ps(fiy2,ty);
1927 fiz2 = _mm256_add_ps(fiz2,tz);
1929 fjx3 = _mm256_add_ps(fjx3,tx);
1930 fjy3 = _mm256_add_ps(fjy3,ty);
1931 fjz3 = _mm256_add_ps(fjz3,tz);
1935 /**************************
1936 * CALCULATE INTERACTIONS *
1937 **************************/
1939 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1942 /* REACTION-FIELD ELECTROSTATICS */
1943 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1945 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1949 fscal = _mm256_and_ps(fscal,cutoff_mask);
1951 /* Calculate temporary vectorial force */
1952 tx = _mm256_mul_ps(fscal,dx31);
1953 ty = _mm256_mul_ps(fscal,dy31);
1954 tz = _mm256_mul_ps(fscal,dz31);
1956 /* Update vectorial force */
1957 fix3 = _mm256_add_ps(fix3,tx);
1958 fiy3 = _mm256_add_ps(fiy3,ty);
1959 fiz3 = _mm256_add_ps(fiz3,tz);
1961 fjx1 = _mm256_add_ps(fjx1,tx);
1962 fjy1 = _mm256_add_ps(fjy1,ty);
1963 fjz1 = _mm256_add_ps(fjz1,tz);
1967 /**************************
1968 * CALCULATE INTERACTIONS *
1969 **************************/
1971 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1974 /* REACTION-FIELD ELECTROSTATICS */
1975 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1977 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1981 fscal = _mm256_and_ps(fscal,cutoff_mask);
1983 /* Calculate temporary vectorial force */
1984 tx = _mm256_mul_ps(fscal,dx32);
1985 ty = _mm256_mul_ps(fscal,dy32);
1986 tz = _mm256_mul_ps(fscal,dz32);
1988 /* Update vectorial force */
1989 fix3 = _mm256_add_ps(fix3,tx);
1990 fiy3 = _mm256_add_ps(fiy3,ty);
1991 fiz3 = _mm256_add_ps(fiz3,tz);
1993 fjx2 = _mm256_add_ps(fjx2,tx);
1994 fjy2 = _mm256_add_ps(fjy2,ty);
1995 fjz2 = _mm256_add_ps(fjz2,tz);
1999 /**************************
2000 * CALCULATE INTERACTIONS *
2001 **************************/
2003 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2006 /* REACTION-FIELD ELECTROSTATICS */
2007 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
2009 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2013 fscal = _mm256_and_ps(fscal,cutoff_mask);
2015 /* Calculate temporary vectorial force */
2016 tx = _mm256_mul_ps(fscal,dx33);
2017 ty = _mm256_mul_ps(fscal,dy33);
2018 tz = _mm256_mul_ps(fscal,dz33);
2020 /* Update vectorial force */
2021 fix3 = _mm256_add_ps(fix3,tx);
2022 fiy3 = _mm256_add_ps(fiy3,ty);
2023 fiz3 = _mm256_add_ps(fiz3,tz);
2025 fjx3 = _mm256_add_ps(fjx3,tx);
2026 fjy3 = _mm256_add_ps(fjy3,ty);
2027 fjz3 = _mm256_add_ps(fjz3,tz);
2031 fjptrA = f+j_coord_offsetA;
2032 fjptrB = f+j_coord_offsetB;
2033 fjptrC = f+j_coord_offsetC;
2034 fjptrD = f+j_coord_offsetD;
2035 fjptrE = f+j_coord_offsetE;
2036 fjptrF = f+j_coord_offsetF;
2037 fjptrG = f+j_coord_offsetG;
2038 fjptrH = f+j_coord_offsetH;
2040 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2041 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2042 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2044 /* Inner loop uses 324 flops */
2047 if(jidx<j_index_end)
2050 /* Get j neighbor index, and coordinate index */
2051 jnrlistA = jjnr[jidx];
2052 jnrlistB = jjnr[jidx+1];
2053 jnrlistC = jjnr[jidx+2];
2054 jnrlistD = jjnr[jidx+3];
2055 jnrlistE = jjnr[jidx+4];
2056 jnrlistF = jjnr[jidx+5];
2057 jnrlistG = jjnr[jidx+6];
2058 jnrlistH = jjnr[jidx+7];
2059 /* Sign of each element will be negative for non-real atoms.
2060 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2061 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2063 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2064 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2066 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2067 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2068 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2069 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2070 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2071 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2072 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2073 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2074 j_coord_offsetA = DIM*jnrA;
2075 j_coord_offsetB = DIM*jnrB;
2076 j_coord_offsetC = DIM*jnrC;
2077 j_coord_offsetD = DIM*jnrD;
2078 j_coord_offsetE = DIM*jnrE;
2079 j_coord_offsetF = DIM*jnrF;
2080 j_coord_offsetG = DIM*jnrG;
2081 j_coord_offsetH = DIM*jnrH;
2083 /* load j atom coordinates */
2084 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2085 x+j_coord_offsetC,x+j_coord_offsetD,
2086 x+j_coord_offsetE,x+j_coord_offsetF,
2087 x+j_coord_offsetG,x+j_coord_offsetH,
2088 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2089 &jy2,&jz2,&jx3,&jy3,&jz3);
2091 /* Calculate displacement vector */
2092 dx00 = _mm256_sub_ps(ix0,jx0);
2093 dy00 = _mm256_sub_ps(iy0,jy0);
2094 dz00 = _mm256_sub_ps(iz0,jz0);
2095 dx11 = _mm256_sub_ps(ix1,jx1);
2096 dy11 = _mm256_sub_ps(iy1,jy1);
2097 dz11 = _mm256_sub_ps(iz1,jz1);
2098 dx12 = _mm256_sub_ps(ix1,jx2);
2099 dy12 = _mm256_sub_ps(iy1,jy2);
2100 dz12 = _mm256_sub_ps(iz1,jz2);
2101 dx13 = _mm256_sub_ps(ix1,jx3);
2102 dy13 = _mm256_sub_ps(iy1,jy3);
2103 dz13 = _mm256_sub_ps(iz1,jz3);
2104 dx21 = _mm256_sub_ps(ix2,jx1);
2105 dy21 = _mm256_sub_ps(iy2,jy1);
2106 dz21 = _mm256_sub_ps(iz2,jz1);
2107 dx22 = _mm256_sub_ps(ix2,jx2);
2108 dy22 = _mm256_sub_ps(iy2,jy2);
2109 dz22 = _mm256_sub_ps(iz2,jz2);
2110 dx23 = _mm256_sub_ps(ix2,jx3);
2111 dy23 = _mm256_sub_ps(iy2,jy3);
2112 dz23 = _mm256_sub_ps(iz2,jz3);
2113 dx31 = _mm256_sub_ps(ix3,jx1);
2114 dy31 = _mm256_sub_ps(iy3,jy1);
2115 dz31 = _mm256_sub_ps(iz3,jz1);
2116 dx32 = _mm256_sub_ps(ix3,jx2);
2117 dy32 = _mm256_sub_ps(iy3,jy2);
2118 dz32 = _mm256_sub_ps(iz3,jz2);
2119 dx33 = _mm256_sub_ps(ix3,jx3);
2120 dy33 = _mm256_sub_ps(iy3,jy3);
2121 dz33 = _mm256_sub_ps(iz3,jz3);
2123 /* Calculate squared distance and things based on it */
2124 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2125 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2126 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2127 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2128 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2129 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2130 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2131 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2132 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2133 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2135 rinv00 = avx256_invsqrt_f(rsq00);
2136 rinv11 = avx256_invsqrt_f(rsq11);
2137 rinv12 = avx256_invsqrt_f(rsq12);
2138 rinv13 = avx256_invsqrt_f(rsq13);
2139 rinv21 = avx256_invsqrt_f(rsq21);
2140 rinv22 = avx256_invsqrt_f(rsq22);
2141 rinv23 = avx256_invsqrt_f(rsq23);
2142 rinv31 = avx256_invsqrt_f(rsq31);
2143 rinv32 = avx256_invsqrt_f(rsq32);
2144 rinv33 = avx256_invsqrt_f(rsq33);
2146 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2147 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2148 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2149 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2150 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2151 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2152 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2153 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2154 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2156 fjx0 = _mm256_setzero_ps();
2157 fjy0 = _mm256_setzero_ps();
2158 fjz0 = _mm256_setzero_ps();
2159 fjx1 = _mm256_setzero_ps();
2160 fjy1 = _mm256_setzero_ps();
2161 fjz1 = _mm256_setzero_ps();
2162 fjx2 = _mm256_setzero_ps();
2163 fjy2 = _mm256_setzero_ps();
2164 fjz2 = _mm256_setzero_ps();
2165 fjx3 = _mm256_setzero_ps();
2166 fjy3 = _mm256_setzero_ps();
2167 fjz3 = _mm256_setzero_ps();
2169 /**************************
2170 * CALCULATE INTERACTIONS *
2171 **************************/
2173 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2176 r00 = _mm256_mul_ps(rsq00,rinv00);
2177 r00 = _mm256_andnot_ps(dummy_mask,r00);
2179 /* Calculate table index by multiplying r with table scale and truncate to integer */
2180 rt = _mm256_mul_ps(r00,vftabscale);
2181 vfitab = _mm256_cvttps_epi32(rt);
2182 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2183 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2184 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2185 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2186 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
2187 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
2189 /* CUBIC SPLINE TABLE DISPERSION */
2190 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2191 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2192 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2193 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2194 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2195 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2196 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2197 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2198 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2199 Heps = _mm256_mul_ps(vfeps,H);
2200 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2201 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2202 fvdw6 = _mm256_mul_ps(c6_00,FF);
2204 /* CUBIC SPLINE TABLE REPULSION */
2205 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2206 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2207 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2208 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2209 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2210 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2211 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2212 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2213 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2214 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2215 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2216 Heps = _mm256_mul_ps(vfeps,H);
2217 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2218 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2219 fvdw12 = _mm256_mul_ps(c12_00,FF);
2220 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2222 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2226 fscal = _mm256_and_ps(fscal,cutoff_mask);
2228 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2230 /* Calculate temporary vectorial force */
2231 tx = _mm256_mul_ps(fscal,dx00);
2232 ty = _mm256_mul_ps(fscal,dy00);
2233 tz = _mm256_mul_ps(fscal,dz00);
2235 /* Update vectorial force */
2236 fix0 = _mm256_add_ps(fix0,tx);
2237 fiy0 = _mm256_add_ps(fiy0,ty);
2238 fiz0 = _mm256_add_ps(fiz0,tz);
2240 fjx0 = _mm256_add_ps(fjx0,tx);
2241 fjy0 = _mm256_add_ps(fjy0,ty);
2242 fjz0 = _mm256_add_ps(fjz0,tz);
2246 /**************************
2247 * CALCULATE INTERACTIONS *
2248 **************************/
2250 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2253 /* REACTION-FIELD ELECTROSTATICS */
2254 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2256 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2260 fscal = _mm256_and_ps(fscal,cutoff_mask);
2262 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2264 /* Calculate temporary vectorial force */
2265 tx = _mm256_mul_ps(fscal,dx11);
2266 ty = _mm256_mul_ps(fscal,dy11);
2267 tz = _mm256_mul_ps(fscal,dz11);
2269 /* Update vectorial force */
2270 fix1 = _mm256_add_ps(fix1,tx);
2271 fiy1 = _mm256_add_ps(fiy1,ty);
2272 fiz1 = _mm256_add_ps(fiz1,tz);
2274 fjx1 = _mm256_add_ps(fjx1,tx);
2275 fjy1 = _mm256_add_ps(fjy1,ty);
2276 fjz1 = _mm256_add_ps(fjz1,tz);
2280 /**************************
2281 * CALCULATE INTERACTIONS *
2282 **************************/
2284 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2287 /* REACTION-FIELD ELECTROSTATICS */
2288 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2290 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2294 fscal = _mm256_and_ps(fscal,cutoff_mask);
2296 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2298 /* Calculate temporary vectorial force */
2299 tx = _mm256_mul_ps(fscal,dx12);
2300 ty = _mm256_mul_ps(fscal,dy12);
2301 tz = _mm256_mul_ps(fscal,dz12);
2303 /* Update vectorial force */
2304 fix1 = _mm256_add_ps(fix1,tx);
2305 fiy1 = _mm256_add_ps(fiy1,ty);
2306 fiz1 = _mm256_add_ps(fiz1,tz);
2308 fjx2 = _mm256_add_ps(fjx2,tx);
2309 fjy2 = _mm256_add_ps(fjy2,ty);
2310 fjz2 = _mm256_add_ps(fjz2,tz);
2314 /**************************
2315 * CALCULATE INTERACTIONS *
2316 **************************/
2318 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2321 /* REACTION-FIELD ELECTROSTATICS */
2322 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
2324 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2328 fscal = _mm256_and_ps(fscal,cutoff_mask);
2330 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2332 /* Calculate temporary vectorial force */
2333 tx = _mm256_mul_ps(fscal,dx13);
2334 ty = _mm256_mul_ps(fscal,dy13);
2335 tz = _mm256_mul_ps(fscal,dz13);
2337 /* Update vectorial force */
2338 fix1 = _mm256_add_ps(fix1,tx);
2339 fiy1 = _mm256_add_ps(fiy1,ty);
2340 fiz1 = _mm256_add_ps(fiz1,tz);
2342 fjx3 = _mm256_add_ps(fjx3,tx);
2343 fjy3 = _mm256_add_ps(fjy3,ty);
2344 fjz3 = _mm256_add_ps(fjz3,tz);
2348 /**************************
2349 * CALCULATE INTERACTIONS *
2350 **************************/
2352 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2355 /* REACTION-FIELD ELECTROSTATICS */
2356 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2358 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2362 fscal = _mm256_and_ps(fscal,cutoff_mask);
2364 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2366 /* Calculate temporary vectorial force */
2367 tx = _mm256_mul_ps(fscal,dx21);
2368 ty = _mm256_mul_ps(fscal,dy21);
2369 tz = _mm256_mul_ps(fscal,dz21);
2371 /* Update vectorial force */
2372 fix2 = _mm256_add_ps(fix2,tx);
2373 fiy2 = _mm256_add_ps(fiy2,ty);
2374 fiz2 = _mm256_add_ps(fiz2,tz);
2376 fjx1 = _mm256_add_ps(fjx1,tx);
2377 fjy1 = _mm256_add_ps(fjy1,ty);
2378 fjz1 = _mm256_add_ps(fjz1,tz);
2382 /**************************
2383 * CALCULATE INTERACTIONS *
2384 **************************/
2386 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2389 /* REACTION-FIELD ELECTROSTATICS */
2390 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2392 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2396 fscal = _mm256_and_ps(fscal,cutoff_mask);
2398 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2400 /* Calculate temporary vectorial force */
2401 tx = _mm256_mul_ps(fscal,dx22);
2402 ty = _mm256_mul_ps(fscal,dy22);
2403 tz = _mm256_mul_ps(fscal,dz22);
2405 /* Update vectorial force */
2406 fix2 = _mm256_add_ps(fix2,tx);
2407 fiy2 = _mm256_add_ps(fiy2,ty);
2408 fiz2 = _mm256_add_ps(fiz2,tz);
2410 fjx2 = _mm256_add_ps(fjx2,tx);
2411 fjy2 = _mm256_add_ps(fjy2,ty);
2412 fjz2 = _mm256_add_ps(fjz2,tz);
2416 /**************************
2417 * CALCULATE INTERACTIONS *
2418 **************************/
2420 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2423 /* REACTION-FIELD ELECTROSTATICS */
2424 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
2426 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2430 fscal = _mm256_and_ps(fscal,cutoff_mask);
2432 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2434 /* Calculate temporary vectorial force */
2435 tx = _mm256_mul_ps(fscal,dx23);
2436 ty = _mm256_mul_ps(fscal,dy23);
2437 tz = _mm256_mul_ps(fscal,dz23);
2439 /* Update vectorial force */
2440 fix2 = _mm256_add_ps(fix2,tx);
2441 fiy2 = _mm256_add_ps(fiy2,ty);
2442 fiz2 = _mm256_add_ps(fiz2,tz);
2444 fjx3 = _mm256_add_ps(fjx3,tx);
2445 fjy3 = _mm256_add_ps(fjy3,ty);
2446 fjz3 = _mm256_add_ps(fjz3,tz);
2450 /**************************
2451 * CALCULATE INTERACTIONS *
2452 **************************/
2454 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2457 /* REACTION-FIELD ELECTROSTATICS */
2458 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
2460 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2464 fscal = _mm256_and_ps(fscal,cutoff_mask);
2466 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2468 /* Calculate temporary vectorial force */
2469 tx = _mm256_mul_ps(fscal,dx31);
2470 ty = _mm256_mul_ps(fscal,dy31);
2471 tz = _mm256_mul_ps(fscal,dz31);
2473 /* Update vectorial force */
2474 fix3 = _mm256_add_ps(fix3,tx);
2475 fiy3 = _mm256_add_ps(fiy3,ty);
2476 fiz3 = _mm256_add_ps(fiz3,tz);
2478 fjx1 = _mm256_add_ps(fjx1,tx);
2479 fjy1 = _mm256_add_ps(fjy1,ty);
2480 fjz1 = _mm256_add_ps(fjz1,tz);
2484 /**************************
2485 * CALCULATE INTERACTIONS *
2486 **************************/
2488 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2491 /* REACTION-FIELD ELECTROSTATICS */
2492 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
2494 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2498 fscal = _mm256_and_ps(fscal,cutoff_mask);
2500 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2502 /* Calculate temporary vectorial force */
2503 tx = _mm256_mul_ps(fscal,dx32);
2504 ty = _mm256_mul_ps(fscal,dy32);
2505 tz = _mm256_mul_ps(fscal,dz32);
2507 /* Update vectorial force */
2508 fix3 = _mm256_add_ps(fix3,tx);
2509 fiy3 = _mm256_add_ps(fiy3,ty);
2510 fiz3 = _mm256_add_ps(fiz3,tz);
2512 fjx2 = _mm256_add_ps(fjx2,tx);
2513 fjy2 = _mm256_add_ps(fjy2,ty);
2514 fjz2 = _mm256_add_ps(fjz2,tz);
2518 /**************************
2519 * CALCULATE INTERACTIONS *
2520 **************************/
2522 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2525 /* REACTION-FIELD ELECTROSTATICS */
2526 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
2528 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2532 fscal = _mm256_and_ps(fscal,cutoff_mask);
2534 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2536 /* Calculate temporary vectorial force */
2537 tx = _mm256_mul_ps(fscal,dx33);
2538 ty = _mm256_mul_ps(fscal,dy33);
2539 tz = _mm256_mul_ps(fscal,dz33);
2541 /* Update vectorial force */
2542 fix3 = _mm256_add_ps(fix3,tx);
2543 fiy3 = _mm256_add_ps(fiy3,ty);
2544 fiz3 = _mm256_add_ps(fiz3,tz);
2546 fjx3 = _mm256_add_ps(fjx3,tx);
2547 fjy3 = _mm256_add_ps(fjy3,ty);
2548 fjz3 = _mm256_add_ps(fjz3,tz);
2552 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2553 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2554 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2555 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2556 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2557 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2558 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2559 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2561 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2562 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2563 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2565 /* Inner loop uses 325 flops */
2568 /* End of innermost loop */
2570 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2571 f+i_coord_offset,fshift+i_shift_offset);
2573 /* Increment number of inner iterations */
2574 inneriter += j_index_end - j_index_start;
2576 /* Outer loop uses 24 flops */
2579 /* Increment number of outer iterations */
2582 /* Update outer/inner flops */
2584 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*325);