2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrE,jnrF,jnrG,jnrH;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
85 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 real * vdwioffsetptr3;
93 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
101 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
115 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
118 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
119 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
121 __m128i ewitab_lo,ewitab_hi;
122 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
123 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
125 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
126 real rswitch_scalar,d_scalar;
127 __m256 dummy_mask,cutoff_mask;
128 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
129 __m256 one = _mm256_set1_ps(1.0);
130 __m256 two = _mm256_set1_ps(2.0);
136 jindex = nlist->jindex;
138 shiftidx = nlist->shift;
140 shiftvec = fr->shift_vec[0];
141 fshift = fr->fshift[0];
142 facel = _mm256_set1_ps(fr->epsfac);
143 charge = mdatoms->chargeA;
144 nvdwtype = fr->ntype;
146 vdwtype = mdatoms->typeA;
148 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
149 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
150 beta2 = _mm256_mul_ps(beta,beta);
151 beta3 = _mm256_mul_ps(beta,beta2);
153 ewtab = fr->ic->tabq_coul_FDV0;
154 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
155 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
157 /* Setup water-specific parameters */
158 inr = nlist->iinr[0];
159 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
160 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
161 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
162 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
164 jq1 = _mm256_set1_ps(charge[inr+1]);
165 jq2 = _mm256_set1_ps(charge[inr+2]);
166 jq3 = _mm256_set1_ps(charge[inr+3]);
167 vdwjidx0A = 2*vdwtype[inr+0];
168 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
169 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
170 qq11 = _mm256_mul_ps(iq1,jq1);
171 qq12 = _mm256_mul_ps(iq1,jq2);
172 qq13 = _mm256_mul_ps(iq1,jq3);
173 qq21 = _mm256_mul_ps(iq2,jq1);
174 qq22 = _mm256_mul_ps(iq2,jq2);
175 qq23 = _mm256_mul_ps(iq2,jq3);
176 qq31 = _mm256_mul_ps(iq3,jq1);
177 qq32 = _mm256_mul_ps(iq3,jq2);
178 qq33 = _mm256_mul_ps(iq3,jq3);
180 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
181 rcutoff_scalar = fr->rcoulomb;
182 rcutoff = _mm256_set1_ps(rcutoff_scalar);
183 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
185 rswitch_scalar = fr->rcoulomb_switch;
186 rswitch = _mm256_set1_ps(rswitch_scalar);
187 /* Setup switch parameters */
188 d_scalar = rcutoff_scalar-rswitch_scalar;
189 d = _mm256_set1_ps(d_scalar);
190 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
191 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
192 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
193 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
194 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
195 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
197 /* Avoid stupid compiler warnings */
198 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
211 for(iidx=0;iidx<4*DIM;iidx++)
216 /* Start outer loop over neighborlists */
217 for(iidx=0; iidx<nri; iidx++)
219 /* Load shift vector for this list */
220 i_shift_offset = DIM*shiftidx[iidx];
222 /* Load limits for loop over neighbors */
223 j_index_start = jindex[iidx];
224 j_index_end = jindex[iidx+1];
226 /* Get outer coordinate index */
228 i_coord_offset = DIM*inr;
230 /* Load i particle coords and add shift vector */
231 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
232 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
234 fix0 = _mm256_setzero_ps();
235 fiy0 = _mm256_setzero_ps();
236 fiz0 = _mm256_setzero_ps();
237 fix1 = _mm256_setzero_ps();
238 fiy1 = _mm256_setzero_ps();
239 fiz1 = _mm256_setzero_ps();
240 fix2 = _mm256_setzero_ps();
241 fiy2 = _mm256_setzero_ps();
242 fiz2 = _mm256_setzero_ps();
243 fix3 = _mm256_setzero_ps();
244 fiy3 = _mm256_setzero_ps();
245 fiz3 = _mm256_setzero_ps();
247 /* Reset potential sums */
248 velecsum = _mm256_setzero_ps();
249 vvdwsum = _mm256_setzero_ps();
251 /* Start inner kernel loop */
252 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
255 /* Get j neighbor index, and coordinate index */
264 j_coord_offsetA = DIM*jnrA;
265 j_coord_offsetB = DIM*jnrB;
266 j_coord_offsetC = DIM*jnrC;
267 j_coord_offsetD = DIM*jnrD;
268 j_coord_offsetE = DIM*jnrE;
269 j_coord_offsetF = DIM*jnrF;
270 j_coord_offsetG = DIM*jnrG;
271 j_coord_offsetH = DIM*jnrH;
273 /* load j atom coordinates */
274 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
275 x+j_coord_offsetC,x+j_coord_offsetD,
276 x+j_coord_offsetE,x+j_coord_offsetF,
277 x+j_coord_offsetG,x+j_coord_offsetH,
278 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
279 &jy2,&jz2,&jx3,&jy3,&jz3);
281 /* Calculate displacement vector */
282 dx00 = _mm256_sub_ps(ix0,jx0);
283 dy00 = _mm256_sub_ps(iy0,jy0);
284 dz00 = _mm256_sub_ps(iz0,jz0);
285 dx11 = _mm256_sub_ps(ix1,jx1);
286 dy11 = _mm256_sub_ps(iy1,jy1);
287 dz11 = _mm256_sub_ps(iz1,jz1);
288 dx12 = _mm256_sub_ps(ix1,jx2);
289 dy12 = _mm256_sub_ps(iy1,jy2);
290 dz12 = _mm256_sub_ps(iz1,jz2);
291 dx13 = _mm256_sub_ps(ix1,jx3);
292 dy13 = _mm256_sub_ps(iy1,jy3);
293 dz13 = _mm256_sub_ps(iz1,jz3);
294 dx21 = _mm256_sub_ps(ix2,jx1);
295 dy21 = _mm256_sub_ps(iy2,jy1);
296 dz21 = _mm256_sub_ps(iz2,jz1);
297 dx22 = _mm256_sub_ps(ix2,jx2);
298 dy22 = _mm256_sub_ps(iy2,jy2);
299 dz22 = _mm256_sub_ps(iz2,jz2);
300 dx23 = _mm256_sub_ps(ix2,jx3);
301 dy23 = _mm256_sub_ps(iy2,jy3);
302 dz23 = _mm256_sub_ps(iz2,jz3);
303 dx31 = _mm256_sub_ps(ix3,jx1);
304 dy31 = _mm256_sub_ps(iy3,jy1);
305 dz31 = _mm256_sub_ps(iz3,jz1);
306 dx32 = _mm256_sub_ps(ix3,jx2);
307 dy32 = _mm256_sub_ps(iy3,jy2);
308 dz32 = _mm256_sub_ps(iz3,jz2);
309 dx33 = _mm256_sub_ps(ix3,jx3);
310 dy33 = _mm256_sub_ps(iy3,jy3);
311 dz33 = _mm256_sub_ps(iz3,jz3);
313 /* Calculate squared distance and things based on it */
314 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
315 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
316 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
317 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
318 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
319 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
320 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
321 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
322 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
323 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
325 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
326 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
327 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
328 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
329 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
330 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
331 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
332 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
333 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
334 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
336 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
337 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
338 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
339 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
340 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
341 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
342 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
343 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
344 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
345 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
347 fjx0 = _mm256_setzero_ps();
348 fjy0 = _mm256_setzero_ps();
349 fjz0 = _mm256_setzero_ps();
350 fjx1 = _mm256_setzero_ps();
351 fjy1 = _mm256_setzero_ps();
352 fjz1 = _mm256_setzero_ps();
353 fjx2 = _mm256_setzero_ps();
354 fjy2 = _mm256_setzero_ps();
355 fjz2 = _mm256_setzero_ps();
356 fjx3 = _mm256_setzero_ps();
357 fjy3 = _mm256_setzero_ps();
358 fjz3 = _mm256_setzero_ps();
360 /**************************
361 * CALCULATE INTERACTIONS *
362 **************************/
364 if (gmx_mm256_any_lt(rsq00,rcutoff2))
367 r00 = _mm256_mul_ps(rsq00,rinv00);
369 /* LENNARD-JONES DISPERSION/REPULSION */
371 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
372 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
373 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
374 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
375 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
377 d = _mm256_sub_ps(r00,rswitch);
378 d = _mm256_max_ps(d,_mm256_setzero_ps());
379 d2 = _mm256_mul_ps(d,d);
380 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
382 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
384 /* Evaluate switch function */
385 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
386 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
387 vvdw = _mm256_mul_ps(vvdw,sw);
388 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
392 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
396 fscal = _mm256_and_ps(fscal,cutoff_mask);
398 /* Calculate temporary vectorial force */
399 tx = _mm256_mul_ps(fscal,dx00);
400 ty = _mm256_mul_ps(fscal,dy00);
401 tz = _mm256_mul_ps(fscal,dz00);
403 /* Update vectorial force */
404 fix0 = _mm256_add_ps(fix0,tx);
405 fiy0 = _mm256_add_ps(fiy0,ty);
406 fiz0 = _mm256_add_ps(fiz0,tz);
408 fjx0 = _mm256_add_ps(fjx0,tx);
409 fjy0 = _mm256_add_ps(fjy0,ty);
410 fjz0 = _mm256_add_ps(fjz0,tz);
414 /**************************
415 * CALCULATE INTERACTIONS *
416 **************************/
418 if (gmx_mm256_any_lt(rsq11,rcutoff2))
421 r11 = _mm256_mul_ps(rsq11,rinv11);
423 /* EWALD ELECTROSTATICS */
425 /* Analytical PME correction */
426 zeta2 = _mm256_mul_ps(beta2,rsq11);
427 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
428 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
429 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
430 felec = _mm256_mul_ps(qq11,felec);
431 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
432 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
433 velec = _mm256_sub_ps(rinv11,pmecorrV);
434 velec = _mm256_mul_ps(qq11,velec);
436 d = _mm256_sub_ps(r11,rswitch);
437 d = _mm256_max_ps(d,_mm256_setzero_ps());
438 d2 = _mm256_mul_ps(d,d);
439 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
441 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
443 /* Evaluate switch function */
444 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
445 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
446 velec = _mm256_mul_ps(velec,sw);
447 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
449 /* Update potential sum for this i atom from the interaction with this j atom. */
450 velec = _mm256_and_ps(velec,cutoff_mask);
451 velecsum = _mm256_add_ps(velecsum,velec);
455 fscal = _mm256_and_ps(fscal,cutoff_mask);
457 /* Calculate temporary vectorial force */
458 tx = _mm256_mul_ps(fscal,dx11);
459 ty = _mm256_mul_ps(fscal,dy11);
460 tz = _mm256_mul_ps(fscal,dz11);
462 /* Update vectorial force */
463 fix1 = _mm256_add_ps(fix1,tx);
464 fiy1 = _mm256_add_ps(fiy1,ty);
465 fiz1 = _mm256_add_ps(fiz1,tz);
467 fjx1 = _mm256_add_ps(fjx1,tx);
468 fjy1 = _mm256_add_ps(fjy1,ty);
469 fjz1 = _mm256_add_ps(fjz1,tz);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 if (gmx_mm256_any_lt(rsq12,rcutoff2))
480 r12 = _mm256_mul_ps(rsq12,rinv12);
482 /* EWALD ELECTROSTATICS */
484 /* Analytical PME correction */
485 zeta2 = _mm256_mul_ps(beta2,rsq12);
486 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
487 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
488 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
489 felec = _mm256_mul_ps(qq12,felec);
490 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
491 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
492 velec = _mm256_sub_ps(rinv12,pmecorrV);
493 velec = _mm256_mul_ps(qq12,velec);
495 d = _mm256_sub_ps(r12,rswitch);
496 d = _mm256_max_ps(d,_mm256_setzero_ps());
497 d2 = _mm256_mul_ps(d,d);
498 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
500 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
502 /* Evaluate switch function */
503 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
504 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
505 velec = _mm256_mul_ps(velec,sw);
506 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
508 /* Update potential sum for this i atom from the interaction with this j atom. */
509 velec = _mm256_and_ps(velec,cutoff_mask);
510 velecsum = _mm256_add_ps(velecsum,velec);
514 fscal = _mm256_and_ps(fscal,cutoff_mask);
516 /* Calculate temporary vectorial force */
517 tx = _mm256_mul_ps(fscal,dx12);
518 ty = _mm256_mul_ps(fscal,dy12);
519 tz = _mm256_mul_ps(fscal,dz12);
521 /* Update vectorial force */
522 fix1 = _mm256_add_ps(fix1,tx);
523 fiy1 = _mm256_add_ps(fiy1,ty);
524 fiz1 = _mm256_add_ps(fiz1,tz);
526 fjx2 = _mm256_add_ps(fjx2,tx);
527 fjy2 = _mm256_add_ps(fjy2,ty);
528 fjz2 = _mm256_add_ps(fjz2,tz);
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 if (gmx_mm256_any_lt(rsq13,rcutoff2))
539 r13 = _mm256_mul_ps(rsq13,rinv13);
541 /* EWALD ELECTROSTATICS */
543 /* Analytical PME correction */
544 zeta2 = _mm256_mul_ps(beta2,rsq13);
545 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
546 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
547 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
548 felec = _mm256_mul_ps(qq13,felec);
549 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
550 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
551 velec = _mm256_sub_ps(rinv13,pmecorrV);
552 velec = _mm256_mul_ps(qq13,velec);
554 d = _mm256_sub_ps(r13,rswitch);
555 d = _mm256_max_ps(d,_mm256_setzero_ps());
556 d2 = _mm256_mul_ps(d,d);
557 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
559 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
561 /* Evaluate switch function */
562 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
563 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
564 velec = _mm256_mul_ps(velec,sw);
565 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velec = _mm256_and_ps(velec,cutoff_mask);
569 velecsum = _mm256_add_ps(velecsum,velec);
573 fscal = _mm256_and_ps(fscal,cutoff_mask);
575 /* Calculate temporary vectorial force */
576 tx = _mm256_mul_ps(fscal,dx13);
577 ty = _mm256_mul_ps(fscal,dy13);
578 tz = _mm256_mul_ps(fscal,dz13);
580 /* Update vectorial force */
581 fix1 = _mm256_add_ps(fix1,tx);
582 fiy1 = _mm256_add_ps(fiy1,ty);
583 fiz1 = _mm256_add_ps(fiz1,tz);
585 fjx3 = _mm256_add_ps(fjx3,tx);
586 fjy3 = _mm256_add_ps(fjy3,ty);
587 fjz3 = _mm256_add_ps(fjz3,tz);
591 /**************************
592 * CALCULATE INTERACTIONS *
593 **************************/
595 if (gmx_mm256_any_lt(rsq21,rcutoff2))
598 r21 = _mm256_mul_ps(rsq21,rinv21);
600 /* EWALD ELECTROSTATICS */
602 /* Analytical PME correction */
603 zeta2 = _mm256_mul_ps(beta2,rsq21);
604 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
605 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
606 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
607 felec = _mm256_mul_ps(qq21,felec);
608 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
609 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
610 velec = _mm256_sub_ps(rinv21,pmecorrV);
611 velec = _mm256_mul_ps(qq21,velec);
613 d = _mm256_sub_ps(r21,rswitch);
614 d = _mm256_max_ps(d,_mm256_setzero_ps());
615 d2 = _mm256_mul_ps(d,d);
616 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
618 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
620 /* Evaluate switch function */
621 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
622 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
623 velec = _mm256_mul_ps(velec,sw);
624 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velec = _mm256_and_ps(velec,cutoff_mask);
628 velecsum = _mm256_add_ps(velecsum,velec);
632 fscal = _mm256_and_ps(fscal,cutoff_mask);
634 /* Calculate temporary vectorial force */
635 tx = _mm256_mul_ps(fscal,dx21);
636 ty = _mm256_mul_ps(fscal,dy21);
637 tz = _mm256_mul_ps(fscal,dz21);
639 /* Update vectorial force */
640 fix2 = _mm256_add_ps(fix2,tx);
641 fiy2 = _mm256_add_ps(fiy2,ty);
642 fiz2 = _mm256_add_ps(fiz2,tz);
644 fjx1 = _mm256_add_ps(fjx1,tx);
645 fjy1 = _mm256_add_ps(fjy1,ty);
646 fjz1 = _mm256_add_ps(fjz1,tz);
650 /**************************
651 * CALCULATE INTERACTIONS *
652 **************************/
654 if (gmx_mm256_any_lt(rsq22,rcutoff2))
657 r22 = _mm256_mul_ps(rsq22,rinv22);
659 /* EWALD ELECTROSTATICS */
661 /* Analytical PME correction */
662 zeta2 = _mm256_mul_ps(beta2,rsq22);
663 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
664 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
665 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
666 felec = _mm256_mul_ps(qq22,felec);
667 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
668 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
669 velec = _mm256_sub_ps(rinv22,pmecorrV);
670 velec = _mm256_mul_ps(qq22,velec);
672 d = _mm256_sub_ps(r22,rswitch);
673 d = _mm256_max_ps(d,_mm256_setzero_ps());
674 d2 = _mm256_mul_ps(d,d);
675 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
677 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
679 /* Evaluate switch function */
680 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
681 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
682 velec = _mm256_mul_ps(velec,sw);
683 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
685 /* Update potential sum for this i atom from the interaction with this j atom. */
686 velec = _mm256_and_ps(velec,cutoff_mask);
687 velecsum = _mm256_add_ps(velecsum,velec);
691 fscal = _mm256_and_ps(fscal,cutoff_mask);
693 /* Calculate temporary vectorial force */
694 tx = _mm256_mul_ps(fscal,dx22);
695 ty = _mm256_mul_ps(fscal,dy22);
696 tz = _mm256_mul_ps(fscal,dz22);
698 /* Update vectorial force */
699 fix2 = _mm256_add_ps(fix2,tx);
700 fiy2 = _mm256_add_ps(fiy2,ty);
701 fiz2 = _mm256_add_ps(fiz2,tz);
703 fjx2 = _mm256_add_ps(fjx2,tx);
704 fjy2 = _mm256_add_ps(fjy2,ty);
705 fjz2 = _mm256_add_ps(fjz2,tz);
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
713 if (gmx_mm256_any_lt(rsq23,rcutoff2))
716 r23 = _mm256_mul_ps(rsq23,rinv23);
718 /* EWALD ELECTROSTATICS */
720 /* Analytical PME correction */
721 zeta2 = _mm256_mul_ps(beta2,rsq23);
722 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
723 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
724 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
725 felec = _mm256_mul_ps(qq23,felec);
726 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
727 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
728 velec = _mm256_sub_ps(rinv23,pmecorrV);
729 velec = _mm256_mul_ps(qq23,velec);
731 d = _mm256_sub_ps(r23,rswitch);
732 d = _mm256_max_ps(d,_mm256_setzero_ps());
733 d2 = _mm256_mul_ps(d,d);
734 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
736 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
738 /* Evaluate switch function */
739 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
740 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
741 velec = _mm256_mul_ps(velec,sw);
742 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
744 /* Update potential sum for this i atom from the interaction with this j atom. */
745 velec = _mm256_and_ps(velec,cutoff_mask);
746 velecsum = _mm256_add_ps(velecsum,velec);
750 fscal = _mm256_and_ps(fscal,cutoff_mask);
752 /* Calculate temporary vectorial force */
753 tx = _mm256_mul_ps(fscal,dx23);
754 ty = _mm256_mul_ps(fscal,dy23);
755 tz = _mm256_mul_ps(fscal,dz23);
757 /* Update vectorial force */
758 fix2 = _mm256_add_ps(fix2,tx);
759 fiy2 = _mm256_add_ps(fiy2,ty);
760 fiz2 = _mm256_add_ps(fiz2,tz);
762 fjx3 = _mm256_add_ps(fjx3,tx);
763 fjy3 = _mm256_add_ps(fjy3,ty);
764 fjz3 = _mm256_add_ps(fjz3,tz);
768 /**************************
769 * CALCULATE INTERACTIONS *
770 **************************/
772 if (gmx_mm256_any_lt(rsq31,rcutoff2))
775 r31 = _mm256_mul_ps(rsq31,rinv31);
777 /* EWALD ELECTROSTATICS */
779 /* Analytical PME correction */
780 zeta2 = _mm256_mul_ps(beta2,rsq31);
781 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
782 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
783 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
784 felec = _mm256_mul_ps(qq31,felec);
785 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
786 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
787 velec = _mm256_sub_ps(rinv31,pmecorrV);
788 velec = _mm256_mul_ps(qq31,velec);
790 d = _mm256_sub_ps(r31,rswitch);
791 d = _mm256_max_ps(d,_mm256_setzero_ps());
792 d2 = _mm256_mul_ps(d,d);
793 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
795 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
797 /* Evaluate switch function */
798 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
799 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
800 velec = _mm256_mul_ps(velec,sw);
801 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
803 /* Update potential sum for this i atom from the interaction with this j atom. */
804 velec = _mm256_and_ps(velec,cutoff_mask);
805 velecsum = _mm256_add_ps(velecsum,velec);
809 fscal = _mm256_and_ps(fscal,cutoff_mask);
811 /* Calculate temporary vectorial force */
812 tx = _mm256_mul_ps(fscal,dx31);
813 ty = _mm256_mul_ps(fscal,dy31);
814 tz = _mm256_mul_ps(fscal,dz31);
816 /* Update vectorial force */
817 fix3 = _mm256_add_ps(fix3,tx);
818 fiy3 = _mm256_add_ps(fiy3,ty);
819 fiz3 = _mm256_add_ps(fiz3,tz);
821 fjx1 = _mm256_add_ps(fjx1,tx);
822 fjy1 = _mm256_add_ps(fjy1,ty);
823 fjz1 = _mm256_add_ps(fjz1,tz);
827 /**************************
828 * CALCULATE INTERACTIONS *
829 **************************/
831 if (gmx_mm256_any_lt(rsq32,rcutoff2))
834 r32 = _mm256_mul_ps(rsq32,rinv32);
836 /* EWALD ELECTROSTATICS */
838 /* Analytical PME correction */
839 zeta2 = _mm256_mul_ps(beta2,rsq32);
840 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
841 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
842 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
843 felec = _mm256_mul_ps(qq32,felec);
844 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
845 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
846 velec = _mm256_sub_ps(rinv32,pmecorrV);
847 velec = _mm256_mul_ps(qq32,velec);
849 d = _mm256_sub_ps(r32,rswitch);
850 d = _mm256_max_ps(d,_mm256_setzero_ps());
851 d2 = _mm256_mul_ps(d,d);
852 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
854 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
856 /* Evaluate switch function */
857 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
858 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
859 velec = _mm256_mul_ps(velec,sw);
860 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
862 /* Update potential sum for this i atom from the interaction with this j atom. */
863 velec = _mm256_and_ps(velec,cutoff_mask);
864 velecsum = _mm256_add_ps(velecsum,velec);
868 fscal = _mm256_and_ps(fscal,cutoff_mask);
870 /* Calculate temporary vectorial force */
871 tx = _mm256_mul_ps(fscal,dx32);
872 ty = _mm256_mul_ps(fscal,dy32);
873 tz = _mm256_mul_ps(fscal,dz32);
875 /* Update vectorial force */
876 fix3 = _mm256_add_ps(fix3,tx);
877 fiy3 = _mm256_add_ps(fiy3,ty);
878 fiz3 = _mm256_add_ps(fiz3,tz);
880 fjx2 = _mm256_add_ps(fjx2,tx);
881 fjy2 = _mm256_add_ps(fjy2,ty);
882 fjz2 = _mm256_add_ps(fjz2,tz);
886 /**************************
887 * CALCULATE INTERACTIONS *
888 **************************/
890 if (gmx_mm256_any_lt(rsq33,rcutoff2))
893 r33 = _mm256_mul_ps(rsq33,rinv33);
895 /* EWALD ELECTROSTATICS */
897 /* Analytical PME correction */
898 zeta2 = _mm256_mul_ps(beta2,rsq33);
899 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
900 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
901 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
902 felec = _mm256_mul_ps(qq33,felec);
903 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
904 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
905 velec = _mm256_sub_ps(rinv33,pmecorrV);
906 velec = _mm256_mul_ps(qq33,velec);
908 d = _mm256_sub_ps(r33,rswitch);
909 d = _mm256_max_ps(d,_mm256_setzero_ps());
910 d2 = _mm256_mul_ps(d,d);
911 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
913 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
915 /* Evaluate switch function */
916 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
917 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
918 velec = _mm256_mul_ps(velec,sw);
919 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
921 /* Update potential sum for this i atom from the interaction with this j atom. */
922 velec = _mm256_and_ps(velec,cutoff_mask);
923 velecsum = _mm256_add_ps(velecsum,velec);
927 fscal = _mm256_and_ps(fscal,cutoff_mask);
929 /* Calculate temporary vectorial force */
930 tx = _mm256_mul_ps(fscal,dx33);
931 ty = _mm256_mul_ps(fscal,dy33);
932 tz = _mm256_mul_ps(fscal,dz33);
934 /* Update vectorial force */
935 fix3 = _mm256_add_ps(fix3,tx);
936 fiy3 = _mm256_add_ps(fiy3,ty);
937 fiz3 = _mm256_add_ps(fiz3,tz);
939 fjx3 = _mm256_add_ps(fjx3,tx);
940 fjy3 = _mm256_add_ps(fjy3,ty);
941 fjz3 = _mm256_add_ps(fjz3,tz);
945 fjptrA = f+j_coord_offsetA;
946 fjptrB = f+j_coord_offsetB;
947 fjptrC = f+j_coord_offsetC;
948 fjptrD = f+j_coord_offsetD;
949 fjptrE = f+j_coord_offsetE;
950 fjptrF = f+j_coord_offsetF;
951 fjptrG = f+j_coord_offsetG;
952 fjptrH = f+j_coord_offsetH;
954 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
955 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
956 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
958 /* Inner loop uses 1034 flops */
964 /* Get j neighbor index, and coordinate index */
965 jnrlistA = jjnr[jidx];
966 jnrlistB = jjnr[jidx+1];
967 jnrlistC = jjnr[jidx+2];
968 jnrlistD = jjnr[jidx+3];
969 jnrlistE = jjnr[jidx+4];
970 jnrlistF = jjnr[jidx+5];
971 jnrlistG = jjnr[jidx+6];
972 jnrlistH = jjnr[jidx+7];
973 /* Sign of each element will be negative for non-real atoms.
974 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
975 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
977 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
978 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
980 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
981 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
982 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
983 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
984 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
985 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
986 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
987 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
988 j_coord_offsetA = DIM*jnrA;
989 j_coord_offsetB = DIM*jnrB;
990 j_coord_offsetC = DIM*jnrC;
991 j_coord_offsetD = DIM*jnrD;
992 j_coord_offsetE = DIM*jnrE;
993 j_coord_offsetF = DIM*jnrF;
994 j_coord_offsetG = DIM*jnrG;
995 j_coord_offsetH = DIM*jnrH;
997 /* load j atom coordinates */
998 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
999 x+j_coord_offsetC,x+j_coord_offsetD,
1000 x+j_coord_offsetE,x+j_coord_offsetF,
1001 x+j_coord_offsetG,x+j_coord_offsetH,
1002 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1003 &jy2,&jz2,&jx3,&jy3,&jz3);
1005 /* Calculate displacement vector */
1006 dx00 = _mm256_sub_ps(ix0,jx0);
1007 dy00 = _mm256_sub_ps(iy0,jy0);
1008 dz00 = _mm256_sub_ps(iz0,jz0);
1009 dx11 = _mm256_sub_ps(ix1,jx1);
1010 dy11 = _mm256_sub_ps(iy1,jy1);
1011 dz11 = _mm256_sub_ps(iz1,jz1);
1012 dx12 = _mm256_sub_ps(ix1,jx2);
1013 dy12 = _mm256_sub_ps(iy1,jy2);
1014 dz12 = _mm256_sub_ps(iz1,jz2);
1015 dx13 = _mm256_sub_ps(ix1,jx3);
1016 dy13 = _mm256_sub_ps(iy1,jy3);
1017 dz13 = _mm256_sub_ps(iz1,jz3);
1018 dx21 = _mm256_sub_ps(ix2,jx1);
1019 dy21 = _mm256_sub_ps(iy2,jy1);
1020 dz21 = _mm256_sub_ps(iz2,jz1);
1021 dx22 = _mm256_sub_ps(ix2,jx2);
1022 dy22 = _mm256_sub_ps(iy2,jy2);
1023 dz22 = _mm256_sub_ps(iz2,jz2);
1024 dx23 = _mm256_sub_ps(ix2,jx3);
1025 dy23 = _mm256_sub_ps(iy2,jy3);
1026 dz23 = _mm256_sub_ps(iz2,jz3);
1027 dx31 = _mm256_sub_ps(ix3,jx1);
1028 dy31 = _mm256_sub_ps(iy3,jy1);
1029 dz31 = _mm256_sub_ps(iz3,jz1);
1030 dx32 = _mm256_sub_ps(ix3,jx2);
1031 dy32 = _mm256_sub_ps(iy3,jy2);
1032 dz32 = _mm256_sub_ps(iz3,jz2);
1033 dx33 = _mm256_sub_ps(ix3,jx3);
1034 dy33 = _mm256_sub_ps(iy3,jy3);
1035 dz33 = _mm256_sub_ps(iz3,jz3);
1037 /* Calculate squared distance and things based on it */
1038 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1039 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1040 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1041 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1042 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1043 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1044 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1045 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1046 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1047 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1049 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1050 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1051 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1052 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1053 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1054 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1055 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1056 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1057 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1058 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1060 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1061 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1062 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1063 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1064 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1065 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1066 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1067 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1068 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1069 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1071 fjx0 = _mm256_setzero_ps();
1072 fjy0 = _mm256_setzero_ps();
1073 fjz0 = _mm256_setzero_ps();
1074 fjx1 = _mm256_setzero_ps();
1075 fjy1 = _mm256_setzero_ps();
1076 fjz1 = _mm256_setzero_ps();
1077 fjx2 = _mm256_setzero_ps();
1078 fjy2 = _mm256_setzero_ps();
1079 fjz2 = _mm256_setzero_ps();
1080 fjx3 = _mm256_setzero_ps();
1081 fjy3 = _mm256_setzero_ps();
1082 fjz3 = _mm256_setzero_ps();
1084 /**************************
1085 * CALCULATE INTERACTIONS *
1086 **************************/
1088 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1091 r00 = _mm256_mul_ps(rsq00,rinv00);
1092 r00 = _mm256_andnot_ps(dummy_mask,r00);
1094 /* LENNARD-JONES DISPERSION/REPULSION */
1096 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1097 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
1098 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1099 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1100 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1102 d = _mm256_sub_ps(r00,rswitch);
1103 d = _mm256_max_ps(d,_mm256_setzero_ps());
1104 d2 = _mm256_mul_ps(d,d);
1105 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1107 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1109 /* Evaluate switch function */
1110 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1111 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1112 vvdw = _mm256_mul_ps(vvdw,sw);
1113 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1115 /* Update potential sum for this i atom from the interaction with this j atom. */
1116 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
1117 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
1118 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1122 fscal = _mm256_and_ps(fscal,cutoff_mask);
1124 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1126 /* Calculate temporary vectorial force */
1127 tx = _mm256_mul_ps(fscal,dx00);
1128 ty = _mm256_mul_ps(fscal,dy00);
1129 tz = _mm256_mul_ps(fscal,dz00);
1131 /* Update vectorial force */
1132 fix0 = _mm256_add_ps(fix0,tx);
1133 fiy0 = _mm256_add_ps(fiy0,ty);
1134 fiz0 = _mm256_add_ps(fiz0,tz);
1136 fjx0 = _mm256_add_ps(fjx0,tx);
1137 fjy0 = _mm256_add_ps(fjy0,ty);
1138 fjz0 = _mm256_add_ps(fjz0,tz);
1142 /**************************
1143 * CALCULATE INTERACTIONS *
1144 **************************/
1146 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1149 r11 = _mm256_mul_ps(rsq11,rinv11);
1150 r11 = _mm256_andnot_ps(dummy_mask,r11);
1152 /* EWALD ELECTROSTATICS */
1154 /* Analytical PME correction */
1155 zeta2 = _mm256_mul_ps(beta2,rsq11);
1156 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1157 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1158 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1159 felec = _mm256_mul_ps(qq11,felec);
1160 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1161 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1162 velec = _mm256_sub_ps(rinv11,pmecorrV);
1163 velec = _mm256_mul_ps(qq11,velec);
1165 d = _mm256_sub_ps(r11,rswitch);
1166 d = _mm256_max_ps(d,_mm256_setzero_ps());
1167 d2 = _mm256_mul_ps(d,d);
1168 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1170 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1172 /* Evaluate switch function */
1173 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1174 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1175 velec = _mm256_mul_ps(velec,sw);
1176 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1178 /* Update potential sum for this i atom from the interaction with this j atom. */
1179 velec = _mm256_and_ps(velec,cutoff_mask);
1180 velec = _mm256_andnot_ps(dummy_mask,velec);
1181 velecsum = _mm256_add_ps(velecsum,velec);
1185 fscal = _mm256_and_ps(fscal,cutoff_mask);
1187 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1189 /* Calculate temporary vectorial force */
1190 tx = _mm256_mul_ps(fscal,dx11);
1191 ty = _mm256_mul_ps(fscal,dy11);
1192 tz = _mm256_mul_ps(fscal,dz11);
1194 /* Update vectorial force */
1195 fix1 = _mm256_add_ps(fix1,tx);
1196 fiy1 = _mm256_add_ps(fiy1,ty);
1197 fiz1 = _mm256_add_ps(fiz1,tz);
1199 fjx1 = _mm256_add_ps(fjx1,tx);
1200 fjy1 = _mm256_add_ps(fjy1,ty);
1201 fjz1 = _mm256_add_ps(fjz1,tz);
1205 /**************************
1206 * CALCULATE INTERACTIONS *
1207 **************************/
1209 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1212 r12 = _mm256_mul_ps(rsq12,rinv12);
1213 r12 = _mm256_andnot_ps(dummy_mask,r12);
1215 /* EWALD ELECTROSTATICS */
1217 /* Analytical PME correction */
1218 zeta2 = _mm256_mul_ps(beta2,rsq12);
1219 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1220 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1221 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1222 felec = _mm256_mul_ps(qq12,felec);
1223 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1224 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1225 velec = _mm256_sub_ps(rinv12,pmecorrV);
1226 velec = _mm256_mul_ps(qq12,velec);
1228 d = _mm256_sub_ps(r12,rswitch);
1229 d = _mm256_max_ps(d,_mm256_setzero_ps());
1230 d2 = _mm256_mul_ps(d,d);
1231 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1233 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1235 /* Evaluate switch function */
1236 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1237 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1238 velec = _mm256_mul_ps(velec,sw);
1239 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1241 /* Update potential sum for this i atom from the interaction with this j atom. */
1242 velec = _mm256_and_ps(velec,cutoff_mask);
1243 velec = _mm256_andnot_ps(dummy_mask,velec);
1244 velecsum = _mm256_add_ps(velecsum,velec);
1248 fscal = _mm256_and_ps(fscal,cutoff_mask);
1250 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1252 /* Calculate temporary vectorial force */
1253 tx = _mm256_mul_ps(fscal,dx12);
1254 ty = _mm256_mul_ps(fscal,dy12);
1255 tz = _mm256_mul_ps(fscal,dz12);
1257 /* Update vectorial force */
1258 fix1 = _mm256_add_ps(fix1,tx);
1259 fiy1 = _mm256_add_ps(fiy1,ty);
1260 fiz1 = _mm256_add_ps(fiz1,tz);
1262 fjx2 = _mm256_add_ps(fjx2,tx);
1263 fjy2 = _mm256_add_ps(fjy2,ty);
1264 fjz2 = _mm256_add_ps(fjz2,tz);
1268 /**************************
1269 * CALCULATE INTERACTIONS *
1270 **************************/
1272 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1275 r13 = _mm256_mul_ps(rsq13,rinv13);
1276 r13 = _mm256_andnot_ps(dummy_mask,r13);
1278 /* EWALD ELECTROSTATICS */
1280 /* Analytical PME correction */
1281 zeta2 = _mm256_mul_ps(beta2,rsq13);
1282 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1283 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1284 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1285 felec = _mm256_mul_ps(qq13,felec);
1286 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1287 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1288 velec = _mm256_sub_ps(rinv13,pmecorrV);
1289 velec = _mm256_mul_ps(qq13,velec);
1291 d = _mm256_sub_ps(r13,rswitch);
1292 d = _mm256_max_ps(d,_mm256_setzero_ps());
1293 d2 = _mm256_mul_ps(d,d);
1294 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1296 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1298 /* Evaluate switch function */
1299 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1300 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
1301 velec = _mm256_mul_ps(velec,sw);
1302 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1304 /* Update potential sum for this i atom from the interaction with this j atom. */
1305 velec = _mm256_and_ps(velec,cutoff_mask);
1306 velec = _mm256_andnot_ps(dummy_mask,velec);
1307 velecsum = _mm256_add_ps(velecsum,velec);
1311 fscal = _mm256_and_ps(fscal,cutoff_mask);
1313 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1315 /* Calculate temporary vectorial force */
1316 tx = _mm256_mul_ps(fscal,dx13);
1317 ty = _mm256_mul_ps(fscal,dy13);
1318 tz = _mm256_mul_ps(fscal,dz13);
1320 /* Update vectorial force */
1321 fix1 = _mm256_add_ps(fix1,tx);
1322 fiy1 = _mm256_add_ps(fiy1,ty);
1323 fiz1 = _mm256_add_ps(fiz1,tz);
1325 fjx3 = _mm256_add_ps(fjx3,tx);
1326 fjy3 = _mm256_add_ps(fjy3,ty);
1327 fjz3 = _mm256_add_ps(fjz3,tz);
1331 /**************************
1332 * CALCULATE INTERACTIONS *
1333 **************************/
1335 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1338 r21 = _mm256_mul_ps(rsq21,rinv21);
1339 r21 = _mm256_andnot_ps(dummy_mask,r21);
1341 /* EWALD ELECTROSTATICS */
1343 /* Analytical PME correction */
1344 zeta2 = _mm256_mul_ps(beta2,rsq21);
1345 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1346 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1347 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1348 felec = _mm256_mul_ps(qq21,felec);
1349 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1350 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1351 velec = _mm256_sub_ps(rinv21,pmecorrV);
1352 velec = _mm256_mul_ps(qq21,velec);
1354 d = _mm256_sub_ps(r21,rswitch);
1355 d = _mm256_max_ps(d,_mm256_setzero_ps());
1356 d2 = _mm256_mul_ps(d,d);
1357 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1359 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1361 /* Evaluate switch function */
1362 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1363 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1364 velec = _mm256_mul_ps(velec,sw);
1365 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1367 /* Update potential sum for this i atom from the interaction with this j atom. */
1368 velec = _mm256_and_ps(velec,cutoff_mask);
1369 velec = _mm256_andnot_ps(dummy_mask,velec);
1370 velecsum = _mm256_add_ps(velecsum,velec);
1374 fscal = _mm256_and_ps(fscal,cutoff_mask);
1376 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1378 /* Calculate temporary vectorial force */
1379 tx = _mm256_mul_ps(fscal,dx21);
1380 ty = _mm256_mul_ps(fscal,dy21);
1381 tz = _mm256_mul_ps(fscal,dz21);
1383 /* Update vectorial force */
1384 fix2 = _mm256_add_ps(fix2,tx);
1385 fiy2 = _mm256_add_ps(fiy2,ty);
1386 fiz2 = _mm256_add_ps(fiz2,tz);
1388 fjx1 = _mm256_add_ps(fjx1,tx);
1389 fjy1 = _mm256_add_ps(fjy1,ty);
1390 fjz1 = _mm256_add_ps(fjz1,tz);
1394 /**************************
1395 * CALCULATE INTERACTIONS *
1396 **************************/
1398 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1401 r22 = _mm256_mul_ps(rsq22,rinv22);
1402 r22 = _mm256_andnot_ps(dummy_mask,r22);
1404 /* EWALD ELECTROSTATICS */
1406 /* Analytical PME correction */
1407 zeta2 = _mm256_mul_ps(beta2,rsq22);
1408 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1409 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1410 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1411 felec = _mm256_mul_ps(qq22,felec);
1412 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1413 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1414 velec = _mm256_sub_ps(rinv22,pmecorrV);
1415 velec = _mm256_mul_ps(qq22,velec);
1417 d = _mm256_sub_ps(r22,rswitch);
1418 d = _mm256_max_ps(d,_mm256_setzero_ps());
1419 d2 = _mm256_mul_ps(d,d);
1420 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1422 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1424 /* Evaluate switch function */
1425 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1426 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1427 velec = _mm256_mul_ps(velec,sw);
1428 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1430 /* Update potential sum for this i atom from the interaction with this j atom. */
1431 velec = _mm256_and_ps(velec,cutoff_mask);
1432 velec = _mm256_andnot_ps(dummy_mask,velec);
1433 velecsum = _mm256_add_ps(velecsum,velec);
1437 fscal = _mm256_and_ps(fscal,cutoff_mask);
1439 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1441 /* Calculate temporary vectorial force */
1442 tx = _mm256_mul_ps(fscal,dx22);
1443 ty = _mm256_mul_ps(fscal,dy22);
1444 tz = _mm256_mul_ps(fscal,dz22);
1446 /* Update vectorial force */
1447 fix2 = _mm256_add_ps(fix2,tx);
1448 fiy2 = _mm256_add_ps(fiy2,ty);
1449 fiz2 = _mm256_add_ps(fiz2,tz);
1451 fjx2 = _mm256_add_ps(fjx2,tx);
1452 fjy2 = _mm256_add_ps(fjy2,ty);
1453 fjz2 = _mm256_add_ps(fjz2,tz);
1457 /**************************
1458 * CALCULATE INTERACTIONS *
1459 **************************/
1461 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1464 r23 = _mm256_mul_ps(rsq23,rinv23);
1465 r23 = _mm256_andnot_ps(dummy_mask,r23);
1467 /* EWALD ELECTROSTATICS */
1469 /* Analytical PME correction */
1470 zeta2 = _mm256_mul_ps(beta2,rsq23);
1471 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1472 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1473 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1474 felec = _mm256_mul_ps(qq23,felec);
1475 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1476 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1477 velec = _mm256_sub_ps(rinv23,pmecorrV);
1478 velec = _mm256_mul_ps(qq23,velec);
1480 d = _mm256_sub_ps(r23,rswitch);
1481 d = _mm256_max_ps(d,_mm256_setzero_ps());
1482 d2 = _mm256_mul_ps(d,d);
1483 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1485 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1487 /* Evaluate switch function */
1488 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1489 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
1490 velec = _mm256_mul_ps(velec,sw);
1491 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1493 /* Update potential sum for this i atom from the interaction with this j atom. */
1494 velec = _mm256_and_ps(velec,cutoff_mask);
1495 velec = _mm256_andnot_ps(dummy_mask,velec);
1496 velecsum = _mm256_add_ps(velecsum,velec);
1500 fscal = _mm256_and_ps(fscal,cutoff_mask);
1502 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1504 /* Calculate temporary vectorial force */
1505 tx = _mm256_mul_ps(fscal,dx23);
1506 ty = _mm256_mul_ps(fscal,dy23);
1507 tz = _mm256_mul_ps(fscal,dz23);
1509 /* Update vectorial force */
1510 fix2 = _mm256_add_ps(fix2,tx);
1511 fiy2 = _mm256_add_ps(fiy2,ty);
1512 fiz2 = _mm256_add_ps(fiz2,tz);
1514 fjx3 = _mm256_add_ps(fjx3,tx);
1515 fjy3 = _mm256_add_ps(fjy3,ty);
1516 fjz3 = _mm256_add_ps(fjz3,tz);
1520 /**************************
1521 * CALCULATE INTERACTIONS *
1522 **************************/
1524 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1527 r31 = _mm256_mul_ps(rsq31,rinv31);
1528 r31 = _mm256_andnot_ps(dummy_mask,r31);
1530 /* EWALD ELECTROSTATICS */
1532 /* Analytical PME correction */
1533 zeta2 = _mm256_mul_ps(beta2,rsq31);
1534 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1535 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1536 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1537 felec = _mm256_mul_ps(qq31,felec);
1538 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1539 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1540 velec = _mm256_sub_ps(rinv31,pmecorrV);
1541 velec = _mm256_mul_ps(qq31,velec);
1543 d = _mm256_sub_ps(r31,rswitch);
1544 d = _mm256_max_ps(d,_mm256_setzero_ps());
1545 d2 = _mm256_mul_ps(d,d);
1546 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1548 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1550 /* Evaluate switch function */
1551 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1552 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
1553 velec = _mm256_mul_ps(velec,sw);
1554 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1556 /* Update potential sum for this i atom from the interaction with this j atom. */
1557 velec = _mm256_and_ps(velec,cutoff_mask);
1558 velec = _mm256_andnot_ps(dummy_mask,velec);
1559 velecsum = _mm256_add_ps(velecsum,velec);
1563 fscal = _mm256_and_ps(fscal,cutoff_mask);
1565 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1567 /* Calculate temporary vectorial force */
1568 tx = _mm256_mul_ps(fscal,dx31);
1569 ty = _mm256_mul_ps(fscal,dy31);
1570 tz = _mm256_mul_ps(fscal,dz31);
1572 /* Update vectorial force */
1573 fix3 = _mm256_add_ps(fix3,tx);
1574 fiy3 = _mm256_add_ps(fiy3,ty);
1575 fiz3 = _mm256_add_ps(fiz3,tz);
1577 fjx1 = _mm256_add_ps(fjx1,tx);
1578 fjy1 = _mm256_add_ps(fjy1,ty);
1579 fjz1 = _mm256_add_ps(fjz1,tz);
1583 /**************************
1584 * CALCULATE INTERACTIONS *
1585 **************************/
1587 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1590 r32 = _mm256_mul_ps(rsq32,rinv32);
1591 r32 = _mm256_andnot_ps(dummy_mask,r32);
1593 /* EWALD ELECTROSTATICS */
1595 /* Analytical PME correction */
1596 zeta2 = _mm256_mul_ps(beta2,rsq32);
1597 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1598 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1599 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1600 felec = _mm256_mul_ps(qq32,felec);
1601 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1602 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1603 velec = _mm256_sub_ps(rinv32,pmecorrV);
1604 velec = _mm256_mul_ps(qq32,velec);
1606 d = _mm256_sub_ps(r32,rswitch);
1607 d = _mm256_max_ps(d,_mm256_setzero_ps());
1608 d2 = _mm256_mul_ps(d,d);
1609 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1611 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1613 /* Evaluate switch function */
1614 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1615 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
1616 velec = _mm256_mul_ps(velec,sw);
1617 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1619 /* Update potential sum for this i atom from the interaction with this j atom. */
1620 velec = _mm256_and_ps(velec,cutoff_mask);
1621 velec = _mm256_andnot_ps(dummy_mask,velec);
1622 velecsum = _mm256_add_ps(velecsum,velec);
1626 fscal = _mm256_and_ps(fscal,cutoff_mask);
1628 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1630 /* Calculate temporary vectorial force */
1631 tx = _mm256_mul_ps(fscal,dx32);
1632 ty = _mm256_mul_ps(fscal,dy32);
1633 tz = _mm256_mul_ps(fscal,dz32);
1635 /* Update vectorial force */
1636 fix3 = _mm256_add_ps(fix3,tx);
1637 fiy3 = _mm256_add_ps(fiy3,ty);
1638 fiz3 = _mm256_add_ps(fiz3,tz);
1640 fjx2 = _mm256_add_ps(fjx2,tx);
1641 fjy2 = _mm256_add_ps(fjy2,ty);
1642 fjz2 = _mm256_add_ps(fjz2,tz);
1646 /**************************
1647 * CALCULATE INTERACTIONS *
1648 **************************/
1650 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1653 r33 = _mm256_mul_ps(rsq33,rinv33);
1654 r33 = _mm256_andnot_ps(dummy_mask,r33);
1656 /* EWALD ELECTROSTATICS */
1658 /* Analytical PME correction */
1659 zeta2 = _mm256_mul_ps(beta2,rsq33);
1660 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1661 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1662 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1663 felec = _mm256_mul_ps(qq33,felec);
1664 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1665 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1666 velec = _mm256_sub_ps(rinv33,pmecorrV);
1667 velec = _mm256_mul_ps(qq33,velec);
1669 d = _mm256_sub_ps(r33,rswitch);
1670 d = _mm256_max_ps(d,_mm256_setzero_ps());
1671 d2 = _mm256_mul_ps(d,d);
1672 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1674 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1676 /* Evaluate switch function */
1677 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1678 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
1679 velec = _mm256_mul_ps(velec,sw);
1680 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1682 /* Update potential sum for this i atom from the interaction with this j atom. */
1683 velec = _mm256_and_ps(velec,cutoff_mask);
1684 velec = _mm256_andnot_ps(dummy_mask,velec);
1685 velecsum = _mm256_add_ps(velecsum,velec);
1689 fscal = _mm256_and_ps(fscal,cutoff_mask);
1691 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1693 /* Calculate temporary vectorial force */
1694 tx = _mm256_mul_ps(fscal,dx33);
1695 ty = _mm256_mul_ps(fscal,dy33);
1696 tz = _mm256_mul_ps(fscal,dz33);
1698 /* Update vectorial force */
1699 fix3 = _mm256_add_ps(fix3,tx);
1700 fiy3 = _mm256_add_ps(fiy3,ty);
1701 fiz3 = _mm256_add_ps(fiz3,tz);
1703 fjx3 = _mm256_add_ps(fjx3,tx);
1704 fjy3 = _mm256_add_ps(fjy3,ty);
1705 fjz3 = _mm256_add_ps(fjz3,tz);
1709 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1710 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1711 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1712 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1713 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1714 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1715 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1716 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1718 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1719 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1720 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1722 /* Inner loop uses 1044 flops */
1725 /* End of innermost loop */
1727 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1728 f+i_coord_offset,fshift+i_shift_offset);
1731 /* Update potential energies */
1732 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1733 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1735 /* Increment number of inner iterations */
1736 inneriter += j_index_end - j_index_start;
1738 /* Outer loop uses 26 flops */
1741 /* Increment number of outer iterations */
1744 /* Update outer/inner flops */
1746 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*1044);
1749 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_single
1750 * Electrostatics interaction: Ewald
1751 * VdW interaction: LennardJones
1752 * Geometry: Water4-Water4
1753 * Calculate force/pot: Force
1756 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_single
1757 (t_nblist * gmx_restrict nlist,
1758 rvec * gmx_restrict xx,
1759 rvec * gmx_restrict ff,
1760 t_forcerec * gmx_restrict fr,
1761 t_mdatoms * gmx_restrict mdatoms,
1762 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1763 t_nrnb * gmx_restrict nrnb)
1765 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1766 * just 0 for non-waters.
1767 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1768 * jnr indices corresponding to data put in the four positions in the SIMD register.
1770 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1771 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1772 int jnrA,jnrB,jnrC,jnrD;
1773 int jnrE,jnrF,jnrG,jnrH;
1774 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1775 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1776 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1777 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1778 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1779 real rcutoff_scalar;
1780 real *shiftvec,*fshift,*x,*f;
1781 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1782 real scratch[4*DIM];
1783 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1784 real * vdwioffsetptr0;
1785 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1786 real * vdwioffsetptr1;
1787 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1788 real * vdwioffsetptr2;
1789 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1790 real * vdwioffsetptr3;
1791 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1792 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1793 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1794 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1795 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1796 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1797 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1798 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1799 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1800 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1801 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1802 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1803 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1804 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1805 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1806 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1807 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1808 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1809 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1810 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1813 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1816 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1817 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1819 __m128i ewitab_lo,ewitab_hi;
1820 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1821 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1823 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1824 real rswitch_scalar,d_scalar;
1825 __m256 dummy_mask,cutoff_mask;
1826 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1827 __m256 one = _mm256_set1_ps(1.0);
1828 __m256 two = _mm256_set1_ps(2.0);
1834 jindex = nlist->jindex;
1836 shiftidx = nlist->shift;
1838 shiftvec = fr->shift_vec[0];
1839 fshift = fr->fshift[0];
1840 facel = _mm256_set1_ps(fr->epsfac);
1841 charge = mdatoms->chargeA;
1842 nvdwtype = fr->ntype;
1843 vdwparam = fr->nbfp;
1844 vdwtype = mdatoms->typeA;
1846 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1847 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1848 beta2 = _mm256_mul_ps(beta,beta);
1849 beta3 = _mm256_mul_ps(beta,beta2);
1851 ewtab = fr->ic->tabq_coul_FDV0;
1852 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1853 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1855 /* Setup water-specific parameters */
1856 inr = nlist->iinr[0];
1857 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1858 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1859 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1860 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1862 jq1 = _mm256_set1_ps(charge[inr+1]);
1863 jq2 = _mm256_set1_ps(charge[inr+2]);
1864 jq3 = _mm256_set1_ps(charge[inr+3]);
1865 vdwjidx0A = 2*vdwtype[inr+0];
1866 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1867 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1868 qq11 = _mm256_mul_ps(iq1,jq1);
1869 qq12 = _mm256_mul_ps(iq1,jq2);
1870 qq13 = _mm256_mul_ps(iq1,jq3);
1871 qq21 = _mm256_mul_ps(iq2,jq1);
1872 qq22 = _mm256_mul_ps(iq2,jq2);
1873 qq23 = _mm256_mul_ps(iq2,jq3);
1874 qq31 = _mm256_mul_ps(iq3,jq1);
1875 qq32 = _mm256_mul_ps(iq3,jq2);
1876 qq33 = _mm256_mul_ps(iq3,jq3);
1878 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1879 rcutoff_scalar = fr->rcoulomb;
1880 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1881 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1883 rswitch_scalar = fr->rcoulomb_switch;
1884 rswitch = _mm256_set1_ps(rswitch_scalar);
1885 /* Setup switch parameters */
1886 d_scalar = rcutoff_scalar-rswitch_scalar;
1887 d = _mm256_set1_ps(d_scalar);
1888 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1889 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1890 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1891 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1892 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1893 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1895 /* Avoid stupid compiler warnings */
1896 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1897 j_coord_offsetA = 0;
1898 j_coord_offsetB = 0;
1899 j_coord_offsetC = 0;
1900 j_coord_offsetD = 0;
1901 j_coord_offsetE = 0;
1902 j_coord_offsetF = 0;
1903 j_coord_offsetG = 0;
1904 j_coord_offsetH = 0;
1909 for(iidx=0;iidx<4*DIM;iidx++)
1911 scratch[iidx] = 0.0;
1914 /* Start outer loop over neighborlists */
1915 for(iidx=0; iidx<nri; iidx++)
1917 /* Load shift vector for this list */
1918 i_shift_offset = DIM*shiftidx[iidx];
1920 /* Load limits for loop over neighbors */
1921 j_index_start = jindex[iidx];
1922 j_index_end = jindex[iidx+1];
1924 /* Get outer coordinate index */
1926 i_coord_offset = DIM*inr;
1928 /* Load i particle coords and add shift vector */
1929 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1930 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1932 fix0 = _mm256_setzero_ps();
1933 fiy0 = _mm256_setzero_ps();
1934 fiz0 = _mm256_setzero_ps();
1935 fix1 = _mm256_setzero_ps();
1936 fiy1 = _mm256_setzero_ps();
1937 fiz1 = _mm256_setzero_ps();
1938 fix2 = _mm256_setzero_ps();
1939 fiy2 = _mm256_setzero_ps();
1940 fiz2 = _mm256_setzero_ps();
1941 fix3 = _mm256_setzero_ps();
1942 fiy3 = _mm256_setzero_ps();
1943 fiz3 = _mm256_setzero_ps();
1945 /* Start inner kernel loop */
1946 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1949 /* Get j neighbor index, and coordinate index */
1951 jnrB = jjnr[jidx+1];
1952 jnrC = jjnr[jidx+2];
1953 jnrD = jjnr[jidx+3];
1954 jnrE = jjnr[jidx+4];
1955 jnrF = jjnr[jidx+5];
1956 jnrG = jjnr[jidx+6];
1957 jnrH = jjnr[jidx+7];
1958 j_coord_offsetA = DIM*jnrA;
1959 j_coord_offsetB = DIM*jnrB;
1960 j_coord_offsetC = DIM*jnrC;
1961 j_coord_offsetD = DIM*jnrD;
1962 j_coord_offsetE = DIM*jnrE;
1963 j_coord_offsetF = DIM*jnrF;
1964 j_coord_offsetG = DIM*jnrG;
1965 j_coord_offsetH = DIM*jnrH;
1967 /* load j atom coordinates */
1968 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1969 x+j_coord_offsetC,x+j_coord_offsetD,
1970 x+j_coord_offsetE,x+j_coord_offsetF,
1971 x+j_coord_offsetG,x+j_coord_offsetH,
1972 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1973 &jy2,&jz2,&jx3,&jy3,&jz3);
1975 /* Calculate displacement vector */
1976 dx00 = _mm256_sub_ps(ix0,jx0);
1977 dy00 = _mm256_sub_ps(iy0,jy0);
1978 dz00 = _mm256_sub_ps(iz0,jz0);
1979 dx11 = _mm256_sub_ps(ix1,jx1);
1980 dy11 = _mm256_sub_ps(iy1,jy1);
1981 dz11 = _mm256_sub_ps(iz1,jz1);
1982 dx12 = _mm256_sub_ps(ix1,jx2);
1983 dy12 = _mm256_sub_ps(iy1,jy2);
1984 dz12 = _mm256_sub_ps(iz1,jz2);
1985 dx13 = _mm256_sub_ps(ix1,jx3);
1986 dy13 = _mm256_sub_ps(iy1,jy3);
1987 dz13 = _mm256_sub_ps(iz1,jz3);
1988 dx21 = _mm256_sub_ps(ix2,jx1);
1989 dy21 = _mm256_sub_ps(iy2,jy1);
1990 dz21 = _mm256_sub_ps(iz2,jz1);
1991 dx22 = _mm256_sub_ps(ix2,jx2);
1992 dy22 = _mm256_sub_ps(iy2,jy2);
1993 dz22 = _mm256_sub_ps(iz2,jz2);
1994 dx23 = _mm256_sub_ps(ix2,jx3);
1995 dy23 = _mm256_sub_ps(iy2,jy3);
1996 dz23 = _mm256_sub_ps(iz2,jz3);
1997 dx31 = _mm256_sub_ps(ix3,jx1);
1998 dy31 = _mm256_sub_ps(iy3,jy1);
1999 dz31 = _mm256_sub_ps(iz3,jz1);
2000 dx32 = _mm256_sub_ps(ix3,jx2);
2001 dy32 = _mm256_sub_ps(iy3,jy2);
2002 dz32 = _mm256_sub_ps(iz3,jz2);
2003 dx33 = _mm256_sub_ps(ix3,jx3);
2004 dy33 = _mm256_sub_ps(iy3,jy3);
2005 dz33 = _mm256_sub_ps(iz3,jz3);
2007 /* Calculate squared distance and things based on it */
2008 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2009 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2010 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2011 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2012 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2013 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2014 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2015 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2016 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2017 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2019 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2020 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2021 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2022 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2023 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2024 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2025 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2026 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2027 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2028 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2030 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2031 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2032 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2033 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2034 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2035 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2036 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2037 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2038 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2039 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2041 fjx0 = _mm256_setzero_ps();
2042 fjy0 = _mm256_setzero_ps();
2043 fjz0 = _mm256_setzero_ps();
2044 fjx1 = _mm256_setzero_ps();
2045 fjy1 = _mm256_setzero_ps();
2046 fjz1 = _mm256_setzero_ps();
2047 fjx2 = _mm256_setzero_ps();
2048 fjy2 = _mm256_setzero_ps();
2049 fjz2 = _mm256_setzero_ps();
2050 fjx3 = _mm256_setzero_ps();
2051 fjy3 = _mm256_setzero_ps();
2052 fjz3 = _mm256_setzero_ps();
2054 /**************************
2055 * CALCULATE INTERACTIONS *
2056 **************************/
2058 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2061 r00 = _mm256_mul_ps(rsq00,rinv00);
2063 /* LENNARD-JONES DISPERSION/REPULSION */
2065 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2066 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
2067 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
2068 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
2069 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
2071 d = _mm256_sub_ps(r00,rswitch);
2072 d = _mm256_max_ps(d,_mm256_setzero_ps());
2073 d2 = _mm256_mul_ps(d,d);
2074 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2076 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2078 /* Evaluate switch function */
2079 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2080 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
2081 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2085 fscal = _mm256_and_ps(fscal,cutoff_mask);
2087 /* Calculate temporary vectorial force */
2088 tx = _mm256_mul_ps(fscal,dx00);
2089 ty = _mm256_mul_ps(fscal,dy00);
2090 tz = _mm256_mul_ps(fscal,dz00);
2092 /* Update vectorial force */
2093 fix0 = _mm256_add_ps(fix0,tx);
2094 fiy0 = _mm256_add_ps(fiy0,ty);
2095 fiz0 = _mm256_add_ps(fiz0,tz);
2097 fjx0 = _mm256_add_ps(fjx0,tx);
2098 fjy0 = _mm256_add_ps(fjy0,ty);
2099 fjz0 = _mm256_add_ps(fjz0,tz);
2103 /**************************
2104 * CALCULATE INTERACTIONS *
2105 **************************/
2107 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2110 r11 = _mm256_mul_ps(rsq11,rinv11);
2112 /* EWALD ELECTROSTATICS */
2114 /* Analytical PME correction */
2115 zeta2 = _mm256_mul_ps(beta2,rsq11);
2116 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2117 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2118 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2119 felec = _mm256_mul_ps(qq11,felec);
2120 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2121 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2122 velec = _mm256_sub_ps(rinv11,pmecorrV);
2123 velec = _mm256_mul_ps(qq11,velec);
2125 d = _mm256_sub_ps(r11,rswitch);
2126 d = _mm256_max_ps(d,_mm256_setzero_ps());
2127 d2 = _mm256_mul_ps(d,d);
2128 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2130 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2132 /* Evaluate switch function */
2133 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2134 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2135 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2139 fscal = _mm256_and_ps(fscal,cutoff_mask);
2141 /* Calculate temporary vectorial force */
2142 tx = _mm256_mul_ps(fscal,dx11);
2143 ty = _mm256_mul_ps(fscal,dy11);
2144 tz = _mm256_mul_ps(fscal,dz11);
2146 /* Update vectorial force */
2147 fix1 = _mm256_add_ps(fix1,tx);
2148 fiy1 = _mm256_add_ps(fiy1,ty);
2149 fiz1 = _mm256_add_ps(fiz1,tz);
2151 fjx1 = _mm256_add_ps(fjx1,tx);
2152 fjy1 = _mm256_add_ps(fjy1,ty);
2153 fjz1 = _mm256_add_ps(fjz1,tz);
2157 /**************************
2158 * CALCULATE INTERACTIONS *
2159 **************************/
2161 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2164 r12 = _mm256_mul_ps(rsq12,rinv12);
2166 /* EWALD ELECTROSTATICS */
2168 /* Analytical PME correction */
2169 zeta2 = _mm256_mul_ps(beta2,rsq12);
2170 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2171 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2172 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2173 felec = _mm256_mul_ps(qq12,felec);
2174 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2175 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2176 velec = _mm256_sub_ps(rinv12,pmecorrV);
2177 velec = _mm256_mul_ps(qq12,velec);
2179 d = _mm256_sub_ps(r12,rswitch);
2180 d = _mm256_max_ps(d,_mm256_setzero_ps());
2181 d2 = _mm256_mul_ps(d,d);
2182 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2184 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2186 /* Evaluate switch function */
2187 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2188 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2189 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2193 fscal = _mm256_and_ps(fscal,cutoff_mask);
2195 /* Calculate temporary vectorial force */
2196 tx = _mm256_mul_ps(fscal,dx12);
2197 ty = _mm256_mul_ps(fscal,dy12);
2198 tz = _mm256_mul_ps(fscal,dz12);
2200 /* Update vectorial force */
2201 fix1 = _mm256_add_ps(fix1,tx);
2202 fiy1 = _mm256_add_ps(fiy1,ty);
2203 fiz1 = _mm256_add_ps(fiz1,tz);
2205 fjx2 = _mm256_add_ps(fjx2,tx);
2206 fjy2 = _mm256_add_ps(fjy2,ty);
2207 fjz2 = _mm256_add_ps(fjz2,tz);
2211 /**************************
2212 * CALCULATE INTERACTIONS *
2213 **************************/
2215 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2218 r13 = _mm256_mul_ps(rsq13,rinv13);
2220 /* EWALD ELECTROSTATICS */
2222 /* Analytical PME correction */
2223 zeta2 = _mm256_mul_ps(beta2,rsq13);
2224 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2225 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2226 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2227 felec = _mm256_mul_ps(qq13,felec);
2228 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2229 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2230 velec = _mm256_sub_ps(rinv13,pmecorrV);
2231 velec = _mm256_mul_ps(qq13,velec);
2233 d = _mm256_sub_ps(r13,rswitch);
2234 d = _mm256_max_ps(d,_mm256_setzero_ps());
2235 d2 = _mm256_mul_ps(d,d);
2236 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2238 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2240 /* Evaluate switch function */
2241 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2242 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2243 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2247 fscal = _mm256_and_ps(fscal,cutoff_mask);
2249 /* Calculate temporary vectorial force */
2250 tx = _mm256_mul_ps(fscal,dx13);
2251 ty = _mm256_mul_ps(fscal,dy13);
2252 tz = _mm256_mul_ps(fscal,dz13);
2254 /* Update vectorial force */
2255 fix1 = _mm256_add_ps(fix1,tx);
2256 fiy1 = _mm256_add_ps(fiy1,ty);
2257 fiz1 = _mm256_add_ps(fiz1,tz);
2259 fjx3 = _mm256_add_ps(fjx3,tx);
2260 fjy3 = _mm256_add_ps(fjy3,ty);
2261 fjz3 = _mm256_add_ps(fjz3,tz);
2265 /**************************
2266 * CALCULATE INTERACTIONS *
2267 **************************/
2269 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2272 r21 = _mm256_mul_ps(rsq21,rinv21);
2274 /* EWALD ELECTROSTATICS */
2276 /* Analytical PME correction */
2277 zeta2 = _mm256_mul_ps(beta2,rsq21);
2278 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2279 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2280 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2281 felec = _mm256_mul_ps(qq21,felec);
2282 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2283 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2284 velec = _mm256_sub_ps(rinv21,pmecorrV);
2285 velec = _mm256_mul_ps(qq21,velec);
2287 d = _mm256_sub_ps(r21,rswitch);
2288 d = _mm256_max_ps(d,_mm256_setzero_ps());
2289 d2 = _mm256_mul_ps(d,d);
2290 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2292 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2294 /* Evaluate switch function */
2295 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2296 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2297 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2301 fscal = _mm256_and_ps(fscal,cutoff_mask);
2303 /* Calculate temporary vectorial force */
2304 tx = _mm256_mul_ps(fscal,dx21);
2305 ty = _mm256_mul_ps(fscal,dy21);
2306 tz = _mm256_mul_ps(fscal,dz21);
2308 /* Update vectorial force */
2309 fix2 = _mm256_add_ps(fix2,tx);
2310 fiy2 = _mm256_add_ps(fiy2,ty);
2311 fiz2 = _mm256_add_ps(fiz2,tz);
2313 fjx1 = _mm256_add_ps(fjx1,tx);
2314 fjy1 = _mm256_add_ps(fjy1,ty);
2315 fjz1 = _mm256_add_ps(fjz1,tz);
2319 /**************************
2320 * CALCULATE INTERACTIONS *
2321 **************************/
2323 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2326 r22 = _mm256_mul_ps(rsq22,rinv22);
2328 /* EWALD ELECTROSTATICS */
2330 /* Analytical PME correction */
2331 zeta2 = _mm256_mul_ps(beta2,rsq22);
2332 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2333 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2334 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2335 felec = _mm256_mul_ps(qq22,felec);
2336 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2337 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2338 velec = _mm256_sub_ps(rinv22,pmecorrV);
2339 velec = _mm256_mul_ps(qq22,velec);
2341 d = _mm256_sub_ps(r22,rswitch);
2342 d = _mm256_max_ps(d,_mm256_setzero_ps());
2343 d2 = _mm256_mul_ps(d,d);
2344 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2346 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2348 /* Evaluate switch function */
2349 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2350 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2351 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2355 fscal = _mm256_and_ps(fscal,cutoff_mask);
2357 /* Calculate temporary vectorial force */
2358 tx = _mm256_mul_ps(fscal,dx22);
2359 ty = _mm256_mul_ps(fscal,dy22);
2360 tz = _mm256_mul_ps(fscal,dz22);
2362 /* Update vectorial force */
2363 fix2 = _mm256_add_ps(fix2,tx);
2364 fiy2 = _mm256_add_ps(fiy2,ty);
2365 fiz2 = _mm256_add_ps(fiz2,tz);
2367 fjx2 = _mm256_add_ps(fjx2,tx);
2368 fjy2 = _mm256_add_ps(fjy2,ty);
2369 fjz2 = _mm256_add_ps(fjz2,tz);
2373 /**************************
2374 * CALCULATE INTERACTIONS *
2375 **************************/
2377 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2380 r23 = _mm256_mul_ps(rsq23,rinv23);
2382 /* EWALD ELECTROSTATICS */
2384 /* Analytical PME correction */
2385 zeta2 = _mm256_mul_ps(beta2,rsq23);
2386 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2387 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2388 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2389 felec = _mm256_mul_ps(qq23,felec);
2390 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2391 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2392 velec = _mm256_sub_ps(rinv23,pmecorrV);
2393 velec = _mm256_mul_ps(qq23,velec);
2395 d = _mm256_sub_ps(r23,rswitch);
2396 d = _mm256_max_ps(d,_mm256_setzero_ps());
2397 d2 = _mm256_mul_ps(d,d);
2398 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2400 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2402 /* Evaluate switch function */
2403 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2404 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2405 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2409 fscal = _mm256_and_ps(fscal,cutoff_mask);
2411 /* Calculate temporary vectorial force */
2412 tx = _mm256_mul_ps(fscal,dx23);
2413 ty = _mm256_mul_ps(fscal,dy23);
2414 tz = _mm256_mul_ps(fscal,dz23);
2416 /* Update vectorial force */
2417 fix2 = _mm256_add_ps(fix2,tx);
2418 fiy2 = _mm256_add_ps(fiy2,ty);
2419 fiz2 = _mm256_add_ps(fiz2,tz);
2421 fjx3 = _mm256_add_ps(fjx3,tx);
2422 fjy3 = _mm256_add_ps(fjy3,ty);
2423 fjz3 = _mm256_add_ps(fjz3,tz);
2427 /**************************
2428 * CALCULATE INTERACTIONS *
2429 **************************/
2431 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2434 r31 = _mm256_mul_ps(rsq31,rinv31);
2436 /* EWALD ELECTROSTATICS */
2438 /* Analytical PME correction */
2439 zeta2 = _mm256_mul_ps(beta2,rsq31);
2440 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2441 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2442 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2443 felec = _mm256_mul_ps(qq31,felec);
2444 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2445 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2446 velec = _mm256_sub_ps(rinv31,pmecorrV);
2447 velec = _mm256_mul_ps(qq31,velec);
2449 d = _mm256_sub_ps(r31,rswitch);
2450 d = _mm256_max_ps(d,_mm256_setzero_ps());
2451 d2 = _mm256_mul_ps(d,d);
2452 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2454 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2456 /* Evaluate switch function */
2457 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2458 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2459 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2463 fscal = _mm256_and_ps(fscal,cutoff_mask);
2465 /* Calculate temporary vectorial force */
2466 tx = _mm256_mul_ps(fscal,dx31);
2467 ty = _mm256_mul_ps(fscal,dy31);
2468 tz = _mm256_mul_ps(fscal,dz31);
2470 /* Update vectorial force */
2471 fix3 = _mm256_add_ps(fix3,tx);
2472 fiy3 = _mm256_add_ps(fiy3,ty);
2473 fiz3 = _mm256_add_ps(fiz3,tz);
2475 fjx1 = _mm256_add_ps(fjx1,tx);
2476 fjy1 = _mm256_add_ps(fjy1,ty);
2477 fjz1 = _mm256_add_ps(fjz1,tz);
2481 /**************************
2482 * CALCULATE INTERACTIONS *
2483 **************************/
2485 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2488 r32 = _mm256_mul_ps(rsq32,rinv32);
2490 /* EWALD ELECTROSTATICS */
2492 /* Analytical PME correction */
2493 zeta2 = _mm256_mul_ps(beta2,rsq32);
2494 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2495 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2496 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2497 felec = _mm256_mul_ps(qq32,felec);
2498 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2499 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2500 velec = _mm256_sub_ps(rinv32,pmecorrV);
2501 velec = _mm256_mul_ps(qq32,velec);
2503 d = _mm256_sub_ps(r32,rswitch);
2504 d = _mm256_max_ps(d,_mm256_setzero_ps());
2505 d2 = _mm256_mul_ps(d,d);
2506 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2508 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2510 /* Evaluate switch function */
2511 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2512 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2513 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2517 fscal = _mm256_and_ps(fscal,cutoff_mask);
2519 /* Calculate temporary vectorial force */
2520 tx = _mm256_mul_ps(fscal,dx32);
2521 ty = _mm256_mul_ps(fscal,dy32);
2522 tz = _mm256_mul_ps(fscal,dz32);
2524 /* Update vectorial force */
2525 fix3 = _mm256_add_ps(fix3,tx);
2526 fiy3 = _mm256_add_ps(fiy3,ty);
2527 fiz3 = _mm256_add_ps(fiz3,tz);
2529 fjx2 = _mm256_add_ps(fjx2,tx);
2530 fjy2 = _mm256_add_ps(fjy2,ty);
2531 fjz2 = _mm256_add_ps(fjz2,tz);
2535 /**************************
2536 * CALCULATE INTERACTIONS *
2537 **************************/
2539 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2542 r33 = _mm256_mul_ps(rsq33,rinv33);
2544 /* EWALD ELECTROSTATICS */
2546 /* Analytical PME correction */
2547 zeta2 = _mm256_mul_ps(beta2,rsq33);
2548 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2549 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2550 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2551 felec = _mm256_mul_ps(qq33,felec);
2552 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2553 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2554 velec = _mm256_sub_ps(rinv33,pmecorrV);
2555 velec = _mm256_mul_ps(qq33,velec);
2557 d = _mm256_sub_ps(r33,rswitch);
2558 d = _mm256_max_ps(d,_mm256_setzero_ps());
2559 d2 = _mm256_mul_ps(d,d);
2560 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2562 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2564 /* Evaluate switch function */
2565 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2566 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2567 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2571 fscal = _mm256_and_ps(fscal,cutoff_mask);
2573 /* Calculate temporary vectorial force */
2574 tx = _mm256_mul_ps(fscal,dx33);
2575 ty = _mm256_mul_ps(fscal,dy33);
2576 tz = _mm256_mul_ps(fscal,dz33);
2578 /* Update vectorial force */
2579 fix3 = _mm256_add_ps(fix3,tx);
2580 fiy3 = _mm256_add_ps(fiy3,ty);
2581 fiz3 = _mm256_add_ps(fiz3,tz);
2583 fjx3 = _mm256_add_ps(fjx3,tx);
2584 fjy3 = _mm256_add_ps(fjy3,ty);
2585 fjz3 = _mm256_add_ps(fjz3,tz);
2589 fjptrA = f+j_coord_offsetA;
2590 fjptrB = f+j_coord_offsetB;
2591 fjptrC = f+j_coord_offsetC;
2592 fjptrD = f+j_coord_offsetD;
2593 fjptrE = f+j_coord_offsetE;
2594 fjptrF = f+j_coord_offsetF;
2595 fjptrG = f+j_coord_offsetG;
2596 fjptrH = f+j_coord_offsetH;
2598 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2599 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2600 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2602 /* Inner loop uses 1004 flops */
2605 if(jidx<j_index_end)
2608 /* Get j neighbor index, and coordinate index */
2609 jnrlistA = jjnr[jidx];
2610 jnrlistB = jjnr[jidx+1];
2611 jnrlistC = jjnr[jidx+2];
2612 jnrlistD = jjnr[jidx+3];
2613 jnrlistE = jjnr[jidx+4];
2614 jnrlistF = jjnr[jidx+5];
2615 jnrlistG = jjnr[jidx+6];
2616 jnrlistH = jjnr[jidx+7];
2617 /* Sign of each element will be negative for non-real atoms.
2618 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2619 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2621 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2622 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2624 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2625 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2626 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2627 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2628 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2629 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2630 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2631 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2632 j_coord_offsetA = DIM*jnrA;
2633 j_coord_offsetB = DIM*jnrB;
2634 j_coord_offsetC = DIM*jnrC;
2635 j_coord_offsetD = DIM*jnrD;
2636 j_coord_offsetE = DIM*jnrE;
2637 j_coord_offsetF = DIM*jnrF;
2638 j_coord_offsetG = DIM*jnrG;
2639 j_coord_offsetH = DIM*jnrH;
2641 /* load j atom coordinates */
2642 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2643 x+j_coord_offsetC,x+j_coord_offsetD,
2644 x+j_coord_offsetE,x+j_coord_offsetF,
2645 x+j_coord_offsetG,x+j_coord_offsetH,
2646 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2647 &jy2,&jz2,&jx3,&jy3,&jz3);
2649 /* Calculate displacement vector */
2650 dx00 = _mm256_sub_ps(ix0,jx0);
2651 dy00 = _mm256_sub_ps(iy0,jy0);
2652 dz00 = _mm256_sub_ps(iz0,jz0);
2653 dx11 = _mm256_sub_ps(ix1,jx1);
2654 dy11 = _mm256_sub_ps(iy1,jy1);
2655 dz11 = _mm256_sub_ps(iz1,jz1);
2656 dx12 = _mm256_sub_ps(ix1,jx2);
2657 dy12 = _mm256_sub_ps(iy1,jy2);
2658 dz12 = _mm256_sub_ps(iz1,jz2);
2659 dx13 = _mm256_sub_ps(ix1,jx3);
2660 dy13 = _mm256_sub_ps(iy1,jy3);
2661 dz13 = _mm256_sub_ps(iz1,jz3);
2662 dx21 = _mm256_sub_ps(ix2,jx1);
2663 dy21 = _mm256_sub_ps(iy2,jy1);
2664 dz21 = _mm256_sub_ps(iz2,jz1);
2665 dx22 = _mm256_sub_ps(ix2,jx2);
2666 dy22 = _mm256_sub_ps(iy2,jy2);
2667 dz22 = _mm256_sub_ps(iz2,jz2);
2668 dx23 = _mm256_sub_ps(ix2,jx3);
2669 dy23 = _mm256_sub_ps(iy2,jy3);
2670 dz23 = _mm256_sub_ps(iz2,jz3);
2671 dx31 = _mm256_sub_ps(ix3,jx1);
2672 dy31 = _mm256_sub_ps(iy3,jy1);
2673 dz31 = _mm256_sub_ps(iz3,jz1);
2674 dx32 = _mm256_sub_ps(ix3,jx2);
2675 dy32 = _mm256_sub_ps(iy3,jy2);
2676 dz32 = _mm256_sub_ps(iz3,jz2);
2677 dx33 = _mm256_sub_ps(ix3,jx3);
2678 dy33 = _mm256_sub_ps(iy3,jy3);
2679 dz33 = _mm256_sub_ps(iz3,jz3);
2681 /* Calculate squared distance and things based on it */
2682 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2683 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2684 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2685 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2686 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2687 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2688 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2689 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2690 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2691 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2693 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2694 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2695 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2696 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2697 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2698 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2699 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2700 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2701 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2702 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2704 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2705 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2706 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2707 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2708 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2709 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2710 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2711 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2712 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2713 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2715 fjx0 = _mm256_setzero_ps();
2716 fjy0 = _mm256_setzero_ps();
2717 fjz0 = _mm256_setzero_ps();
2718 fjx1 = _mm256_setzero_ps();
2719 fjy1 = _mm256_setzero_ps();
2720 fjz1 = _mm256_setzero_ps();
2721 fjx2 = _mm256_setzero_ps();
2722 fjy2 = _mm256_setzero_ps();
2723 fjz2 = _mm256_setzero_ps();
2724 fjx3 = _mm256_setzero_ps();
2725 fjy3 = _mm256_setzero_ps();
2726 fjz3 = _mm256_setzero_ps();
/* Lennard-Jones 6-12 interaction between i-atom 0 and j-atom 0, with a
 * fifth-order polynomial potential-switch applied between rswitch and the
 * cutoff. Runs only if any of the 8 SIMD lanes has rsq00 < rcutoff2. */
2728 /**************************
2729 * CALCULATE INTERACTIONS *
2730 **************************/
2732 if (gmx_mm256_any_lt(rsq00,rcutoff2))
/* r = rsq * (1/r); dummy_mask clears lanes that hold padding j-atoms so
 * the switch polynomial below does not operate on garbage distances. */
2735 r00 = _mm256_mul_ps(rsq00,rinv00);
2736 r00 = _mm256_andnot_ps(dummy_mask,r00);
2738 /* LENNARD-JONES DISPERSION/REPULSION */
/* rinvsix = 1/r^6; Vvdw = c12/12 * r^-12 - c6/6 * r^-6 (shifted into the
 * 1/12, 1/6 convention); fvdw = (c12 r^-12 - c6 r^-6) / r^2 = -dV/dr / r. */
2740 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2741 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
2742 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
2743 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
2744 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
/* Switch variable d = max(r - rswitch, 0); sw = 1 + d^3*(swV3 + d*(swV4 + d*swV5))
 * is the switch value, dsw = d^2*(swF2 + d*(swF3 + d*swF4)) its derivative. */
2746 d = _mm256_sub_ps(r00,rswitch);
2747 d = _mm256_max_ps(d,_mm256_setzero_ps());
2748 d2 = _mm256_mul_ps(d,d);
2749 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2751 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2753 /* Evaluate switch function */
2754 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2755 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
2756 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): the assignment of fscal from fvdw sits on lines elided
 * from this view (orig. 2757-2759) — confirm against the full file.
 * Below, lanes beyond the cutoff and padding lanes are zeroed. */
2760 fscal = _mm256_and_ps(fscal,cutoff_mask);
2762 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2764 /* Calculate temporary vectorial force */
2765 tx = _mm256_mul_ps(fscal,dx00);
2766 ty = _mm256_mul_ps(fscal,dy00);
2767 tz = _mm256_mul_ps(fscal,dz00);
2769 /* Update vectorial force */
/* Newton's third law: add to i-atom 0, and accumulate the same force for
 * j-atom 0 (subtracted later by the decrement-swizzle store). */
2770 fix0 = _mm256_add_ps(fix0,tx);
2771 fiy0 = _mm256_add_ps(fiy0,ty);
2772 fiz0 = _mm256_add_ps(fiz0,tz);
2774 fjx0 = _mm256_add_ps(fjx0,tx);
2775 fjy0 = _mm256_add_ps(fjy0,ty);
2776 fjz0 = _mm256_add_ps(fjz0,tz);
/* Ewald (PME) electrostatics between i-atom 1 and j-atom 1, using the
 * analytical PME correction polynomials, with the same potential-switch
 * treatment as the LJ block above. */
2780 /**************************
2781 * CALCULATE INTERACTIONS *
2782 **************************/
2784 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2787 r11 = _mm256_mul_ps(rsq11,rinv11);
2788 r11 = _mm256_andnot_ps(dummy_mask,r11);
2790 /* EWALD ELECTROSTATICS */
2792 /* Analytical PME correction */
/* zeta2 = (beta*r)^2; felec = qq * (pmecorrF*beta^3 + 1/r^3) is the scalar
 * force over r; velec = qq * (1/r - beta*pmecorrV) is the corrected
 * real-space potential. */
2793 zeta2 = _mm256_mul_ps(beta2,rsq11);
2794 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2795 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2796 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2797 felec = _mm256_mul_ps(qq11,felec);
2798 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2799 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2800 velec = _mm256_sub_ps(rinv11,pmecorrV);
2801 velec = _mm256_mul_ps(qq11,velec);
/* Potential-switch polynomial: sw(r) and its derivative dsw(r) in d = r - rswitch. */
2803 d = _mm256_sub_ps(r11,rswitch);
2804 d = _mm256_max_ps(d,_mm256_setzero_ps());
2805 d2 = _mm256_mul_ps(d,d);
2806 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2808 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2810 /* Evaluate switch function */
2811 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2812 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2813 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): fscal = felec presumably lives on elided lines
 * (orig. 2814-2816) — confirm in the full file. Cutoff and padding
 * lanes are masked to zero before use. */
2817 fscal = _mm256_and_ps(fscal,cutoff_mask);
2819 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2821 /* Calculate temporary vectorial force */
2822 tx = _mm256_mul_ps(fscal,dx11);
2823 ty = _mm256_mul_ps(fscal,dy11);
2824 tz = _mm256_mul_ps(fscal,dz11);
2826 /* Update vectorial force */
2827 fix1 = _mm256_add_ps(fix1,tx);
2828 fiy1 = _mm256_add_ps(fiy1,ty);
2829 fiz1 = _mm256_add_ps(fiz1,tz);
2831 fjx1 = _mm256_add_ps(fjx1,tx);
2832 fjy1 = _mm256_add_ps(fjy1,ty);
2833 fjz1 = _mm256_add_ps(fjz1,tz);
/* Ewald (PME) electrostatics between i-atom 1 and j-atom 2; identical
 * structure to the i1-j1 block, operating on the 1-2 pair quantities. */
2837 /**************************
2838 * CALCULATE INTERACTIONS *
2839 **************************/
2841 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2844 r12 = _mm256_mul_ps(rsq12,rinv12);
2845 r12 = _mm256_andnot_ps(dummy_mask,r12);
2847 /* EWALD ELECTROSTATICS */
2849 /* Analytical PME correction */
2850 zeta2 = _mm256_mul_ps(beta2,rsq12);
2851 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2852 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2853 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2854 felec = _mm256_mul_ps(qq12,felec);
2855 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2856 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2857 velec = _mm256_sub_ps(rinv12,pmecorrV);
2858 velec = _mm256_mul_ps(qq12,velec);
/* Potential-switch polynomial in d = r - rswitch. */
2860 d = _mm256_sub_ps(r12,rswitch);
2861 d = _mm256_max_ps(d,_mm256_setzero_ps());
2862 d2 = _mm256_mul_ps(d,d);
2863 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2865 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2867 /* Evaluate switch function */
2868 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2869 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2870 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): fscal = felec presumably on elided lines (orig. 2871-2873). */
2874 fscal = _mm256_and_ps(fscal,cutoff_mask);
2876 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2878 /* Calculate temporary vectorial force */
2879 tx = _mm256_mul_ps(fscal,dx12);
2880 ty = _mm256_mul_ps(fscal,dy12);
2881 tz = _mm256_mul_ps(fscal,dz12);
2883 /* Update vectorial force */
/* Force adds to i-atom 1 and j-atom 2. */
2884 fix1 = _mm256_add_ps(fix1,tx);
2885 fiy1 = _mm256_add_ps(fiy1,ty);
2886 fiz1 = _mm256_add_ps(fiz1,tz);
2888 fjx2 = _mm256_add_ps(fjx2,tx);
2889 fjy2 = _mm256_add_ps(fjy2,ty);
2890 fjz2 = _mm256_add_ps(fjz2,tz);
/* Ewald (PME) electrostatics between i-atom 1 and j-atom 3; same pattern
 * as the preceding pair blocks, on the 1-3 pair quantities. */
2894 /**************************
2895 * CALCULATE INTERACTIONS *
2896 **************************/
2898 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2901 r13 = _mm256_mul_ps(rsq13,rinv13);
2902 r13 = _mm256_andnot_ps(dummy_mask,r13);
2904 /* EWALD ELECTROSTATICS */
2906 /* Analytical PME correction */
2907 zeta2 = _mm256_mul_ps(beta2,rsq13);
2908 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2909 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2910 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2911 felec = _mm256_mul_ps(qq13,felec);
2912 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2913 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2914 velec = _mm256_sub_ps(rinv13,pmecorrV);
2915 velec = _mm256_mul_ps(qq13,velec);
/* Potential-switch polynomial in d = r - rswitch. */
2917 d = _mm256_sub_ps(r13,rswitch);
2918 d = _mm256_max_ps(d,_mm256_setzero_ps());
2919 d2 = _mm256_mul_ps(d,d);
2920 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2922 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2924 /* Evaluate switch function */
2925 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2926 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2927 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): fscal = felec presumably on elided lines (orig. 2928-2930). */
2931 fscal = _mm256_and_ps(fscal,cutoff_mask);
2933 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2935 /* Calculate temporary vectorial force */
2936 tx = _mm256_mul_ps(fscal,dx13);
2937 ty = _mm256_mul_ps(fscal,dy13);
2938 tz = _mm256_mul_ps(fscal,dz13);
2940 /* Update vectorial force */
/* Force adds to i-atom 1 and j-atom 3. */
2941 fix1 = _mm256_add_ps(fix1,tx);
2942 fiy1 = _mm256_add_ps(fiy1,ty);
2943 fiz1 = _mm256_add_ps(fiz1,tz);
2945 fjx3 = _mm256_add_ps(fjx3,tx);
2946 fjy3 = _mm256_add_ps(fjy3,ty);
2947 fjz3 = _mm256_add_ps(fjz3,tz);
/* Ewald (PME) electrostatics between i-atom 2 and j-atom 1; same pattern,
 * on the 2-1 pair quantities. */
2951 /**************************
2952 * CALCULATE INTERACTIONS *
2953 **************************/
2955 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2958 r21 = _mm256_mul_ps(rsq21,rinv21);
2959 r21 = _mm256_andnot_ps(dummy_mask,r21);
2961 /* EWALD ELECTROSTATICS */
2963 /* Analytical PME correction */
2964 zeta2 = _mm256_mul_ps(beta2,rsq21);
2965 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2966 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2967 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2968 felec = _mm256_mul_ps(qq21,felec);
2969 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2970 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2971 velec = _mm256_sub_ps(rinv21,pmecorrV);
2972 velec = _mm256_mul_ps(qq21,velec);
/* Potential-switch polynomial in d = r - rswitch. */
2974 d = _mm256_sub_ps(r21,rswitch);
2975 d = _mm256_max_ps(d,_mm256_setzero_ps());
2976 d2 = _mm256_mul_ps(d,d);
2977 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2979 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2981 /* Evaluate switch function */
2982 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2983 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2984 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): fscal = felec presumably on elided lines (orig. 2985-2987). */
2988 fscal = _mm256_and_ps(fscal,cutoff_mask);
2990 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2992 /* Calculate temporary vectorial force */
2993 tx = _mm256_mul_ps(fscal,dx21);
2994 ty = _mm256_mul_ps(fscal,dy21);
2995 tz = _mm256_mul_ps(fscal,dz21);
2997 /* Update vectorial force */
/* Force adds to i-atom 2 and j-atom 1. */
2998 fix2 = _mm256_add_ps(fix2,tx);
2999 fiy2 = _mm256_add_ps(fiy2,ty);
3000 fiz2 = _mm256_add_ps(fiz2,tz);
3002 fjx1 = _mm256_add_ps(fjx1,tx);
3003 fjy1 = _mm256_add_ps(fjy1,ty);
3004 fjz1 = _mm256_add_ps(fjz1,tz);
/* Ewald (PME) electrostatics between i-atom 2 and j-atom 2; same pattern,
 * on the 2-2 pair quantities. */
3008 /**************************
3009 * CALCULATE INTERACTIONS *
3010 **************************/
3012 if (gmx_mm256_any_lt(rsq22,rcutoff2))
3015 r22 = _mm256_mul_ps(rsq22,rinv22);
3016 r22 = _mm256_andnot_ps(dummy_mask,r22);
3018 /* EWALD ELECTROSTATICS */
3020 /* Analytical PME correction */
3021 zeta2 = _mm256_mul_ps(beta2,rsq22);
3022 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
3023 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3024 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3025 felec = _mm256_mul_ps(qq22,felec);
3026 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3027 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3028 velec = _mm256_sub_ps(rinv22,pmecorrV);
3029 velec = _mm256_mul_ps(qq22,velec);
/* Potential-switch polynomial in d = r - rswitch. */
3031 d = _mm256_sub_ps(r22,rswitch);
3032 d = _mm256_max_ps(d,_mm256_setzero_ps());
3033 d2 = _mm256_mul_ps(d,d);
3034 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3036 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3038 /* Evaluate switch function */
3039 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3040 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
3041 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): fscal = felec presumably on elided lines (orig. 3042-3044). */
3045 fscal = _mm256_and_ps(fscal,cutoff_mask);
3047 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3049 /* Calculate temporary vectorial force */
3050 tx = _mm256_mul_ps(fscal,dx22);
3051 ty = _mm256_mul_ps(fscal,dy22);
3052 tz = _mm256_mul_ps(fscal,dz22);
3054 /* Update vectorial force */
/* Force adds to i-atom 2 and j-atom 2. */
3055 fix2 = _mm256_add_ps(fix2,tx);
3056 fiy2 = _mm256_add_ps(fiy2,ty);
3057 fiz2 = _mm256_add_ps(fiz2,tz);
3059 fjx2 = _mm256_add_ps(fjx2,tx);
3060 fjy2 = _mm256_add_ps(fjy2,ty);
3061 fjz2 = _mm256_add_ps(fjz2,tz);
/* Ewald (PME) electrostatics between i-atom 2 and j-atom 3; same pattern,
 * on the 2-3 pair quantities. */
3065 /**************************
3066 * CALCULATE INTERACTIONS *
3067 **************************/
3069 if (gmx_mm256_any_lt(rsq23,rcutoff2))
3072 r23 = _mm256_mul_ps(rsq23,rinv23);
3073 r23 = _mm256_andnot_ps(dummy_mask,r23);
3075 /* EWALD ELECTROSTATICS */
3077 /* Analytical PME correction */
3078 zeta2 = _mm256_mul_ps(beta2,rsq23);
3079 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
3080 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3081 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3082 felec = _mm256_mul_ps(qq23,felec);
3083 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3084 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3085 velec = _mm256_sub_ps(rinv23,pmecorrV);
3086 velec = _mm256_mul_ps(qq23,velec);
/* Potential-switch polynomial in d = r - rswitch. */
3088 d = _mm256_sub_ps(r23,rswitch);
3089 d = _mm256_max_ps(d,_mm256_setzero_ps());
3090 d2 = _mm256_mul_ps(d,d);
3091 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3093 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3095 /* Evaluate switch function */
3096 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3097 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
3098 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): fscal = felec presumably on elided lines (orig. 3099-3101). */
3102 fscal = _mm256_and_ps(fscal,cutoff_mask);
3104 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3106 /* Calculate temporary vectorial force */
3107 tx = _mm256_mul_ps(fscal,dx23);
3108 ty = _mm256_mul_ps(fscal,dy23);
3109 tz = _mm256_mul_ps(fscal,dz23);
3111 /* Update vectorial force */
/* Force adds to i-atom 2 and j-atom 3. */
3112 fix2 = _mm256_add_ps(fix2,tx);
3113 fiy2 = _mm256_add_ps(fiy2,ty);
3114 fiz2 = _mm256_add_ps(fiz2,tz);
3116 fjx3 = _mm256_add_ps(fjx3,tx);
3117 fjy3 = _mm256_add_ps(fjy3,ty);
3118 fjz3 = _mm256_add_ps(fjz3,tz);
/* Ewald (PME) electrostatics between i-atom 3 and j-atom 1; same pattern,
 * on the 3-1 pair quantities. */
3122 /**************************
3123 * CALCULATE INTERACTIONS *
3124 **************************/
3126 if (gmx_mm256_any_lt(rsq31,rcutoff2))
3129 r31 = _mm256_mul_ps(rsq31,rinv31);
3130 r31 = _mm256_andnot_ps(dummy_mask,r31);
3132 /* EWALD ELECTROSTATICS */
3134 /* Analytical PME correction */
3135 zeta2 = _mm256_mul_ps(beta2,rsq31);
3136 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
3137 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3138 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3139 felec = _mm256_mul_ps(qq31,felec);
3140 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3141 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3142 velec = _mm256_sub_ps(rinv31,pmecorrV);
3143 velec = _mm256_mul_ps(qq31,velec);
/* Potential-switch polynomial in d = r - rswitch. */
3145 d = _mm256_sub_ps(r31,rswitch);
3146 d = _mm256_max_ps(d,_mm256_setzero_ps());
3147 d2 = _mm256_mul_ps(d,d);
3148 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3150 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3152 /* Evaluate switch function */
3153 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3154 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
3155 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): fscal = felec presumably on elided lines (orig. 3156-3158). */
3159 fscal = _mm256_and_ps(fscal,cutoff_mask);
3161 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3163 /* Calculate temporary vectorial force */
3164 tx = _mm256_mul_ps(fscal,dx31);
3165 ty = _mm256_mul_ps(fscal,dy31);
3166 tz = _mm256_mul_ps(fscal,dz31);
3168 /* Update vectorial force */
/* Force adds to i-atom 3 and j-atom 1. */
3169 fix3 = _mm256_add_ps(fix3,tx);
3170 fiy3 = _mm256_add_ps(fiy3,ty);
3171 fiz3 = _mm256_add_ps(fiz3,tz);
3173 fjx1 = _mm256_add_ps(fjx1,tx);
3174 fjy1 = _mm256_add_ps(fjy1,ty);
3175 fjz1 = _mm256_add_ps(fjz1,tz);
/* Ewald (PME) electrostatics between i-atom 3 and j-atom 2; same pattern,
 * on the 3-2 pair quantities. */
3179 /**************************
3180 * CALCULATE INTERACTIONS *
3181 **************************/
3183 if (gmx_mm256_any_lt(rsq32,rcutoff2))
3186 r32 = _mm256_mul_ps(rsq32,rinv32);
3187 r32 = _mm256_andnot_ps(dummy_mask,r32);
3189 /* EWALD ELECTROSTATICS */
3191 /* Analytical PME correction */
3192 zeta2 = _mm256_mul_ps(beta2,rsq32);
3193 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
3194 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3195 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3196 felec = _mm256_mul_ps(qq32,felec);
3197 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3198 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3199 velec = _mm256_sub_ps(rinv32,pmecorrV);
3200 velec = _mm256_mul_ps(qq32,velec);
/* Potential-switch polynomial in d = r - rswitch. */
3202 d = _mm256_sub_ps(r32,rswitch);
3203 d = _mm256_max_ps(d,_mm256_setzero_ps());
3204 d2 = _mm256_mul_ps(d,d);
3205 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3207 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3209 /* Evaluate switch function */
3210 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3211 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
3212 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): fscal = felec presumably on elided lines (orig. 3213-3215). */
3216 fscal = _mm256_and_ps(fscal,cutoff_mask);
3218 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3220 /* Calculate temporary vectorial force */
3221 tx = _mm256_mul_ps(fscal,dx32);
3222 ty = _mm256_mul_ps(fscal,dy32);
3223 tz = _mm256_mul_ps(fscal,dz32);
3225 /* Update vectorial force */
/* Force adds to i-atom 3 and j-atom 2. */
3226 fix3 = _mm256_add_ps(fix3,tx);
3227 fiy3 = _mm256_add_ps(fiy3,ty);
3228 fiz3 = _mm256_add_ps(fiz3,tz);
3230 fjx2 = _mm256_add_ps(fjx2,tx);
3231 fjy2 = _mm256_add_ps(fjy2,ty);
3232 fjz2 = _mm256_add_ps(fjz2,tz);
/* Ewald (PME) electrostatics between i-atom 3 and j-atom 3 — last of the
 * pairwise interaction blocks for this W4-W4 water pair. */
3236 /**************************
3237 * CALCULATE INTERACTIONS *
3238 **************************/
3240 if (gmx_mm256_any_lt(rsq33,rcutoff2))
3243 r33 = _mm256_mul_ps(rsq33,rinv33);
3244 r33 = _mm256_andnot_ps(dummy_mask,r33);
3246 /* EWALD ELECTROSTATICS */
3248 /* Analytical PME correction */
3249 zeta2 = _mm256_mul_ps(beta2,rsq33);
3250 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
3251 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3252 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3253 felec = _mm256_mul_ps(qq33,felec);
3254 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3255 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3256 velec = _mm256_sub_ps(rinv33,pmecorrV);
3257 velec = _mm256_mul_ps(qq33,velec);
/* Potential-switch polynomial in d = r - rswitch. */
3259 d = _mm256_sub_ps(r33,rswitch);
3260 d = _mm256_max_ps(d,_mm256_setzero_ps());
3261 d2 = _mm256_mul_ps(d,d);
3262 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3264 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3266 /* Evaluate switch function */
3267 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3268 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
3269 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): fscal = felec presumably on elided lines (orig. 3270-3272). */
3273 fscal = _mm256_and_ps(fscal,cutoff_mask);
3275 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3277 /* Calculate temporary vectorial force */
3278 tx = _mm256_mul_ps(fscal,dx33);
3279 ty = _mm256_mul_ps(fscal,dy33);
3280 tz = _mm256_mul_ps(fscal,dz33);
3282 /* Update vectorial force */
/* Force adds to i-atom 3 and j-atom 3. */
3283 fix3 = _mm256_add_ps(fix3,tx);
3284 fiy3 = _mm256_add_ps(fiy3,ty);
3285 fiz3 = _mm256_add_ps(fiz3,tz);
3287 fjx3 = _mm256_add_ps(fjx3,tx);
3288 fjy3 = _mm256_add_ps(fjy3,ty);
3289 fjz3 = _mm256_add_ps(fjz3,tz);
/* Scatter the accumulated j forces back to memory. Lanes whose j index is
 * negative (padding entries in the neighbor list) are redirected to the
 * scratch buffer so the swizzled store stays branch-free. */
3293 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3294 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3295 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3296 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3297 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3298 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3299 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3300 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
/* Transpose (swizzle) the SIMD accumulators and subtract them from the
 * per-atom force arrays of all 8 j waters (4 atoms each). */
3302 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
3303 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3304 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3306 /* Inner loop uses 1014 flops */
3309 /* End of innermost loop */
/* Reduce the i-atom force accumulators into f[] and the shift-force array
 * for this i water (4 atoms). */
3311 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3312 f+i_coord_offset,fshift+i_shift_offset);
3314 /* Increment number of inner iterations */
3315 inneriter += j_index_end - j_index_start;
3317 /* Outer loop uses 24 flops */
3320 /* Increment number of outer iterations */
3322 /* Update outer/inner flops */
/* Record flop counts for this kernel invocation in the nrnb accounting
 * structure; 24 flops per outer iteration, 1014 per inner iteration match
 * the per-loop comments above. */
3325 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*1014);