2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr0;
89 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90 real * vdwioffsetptr1;
91 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92 real * vdwioffsetptr2;
93 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94 real * vdwioffsetptr3;
95 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
96 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
97 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
98 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
99 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
100 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
101 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
102 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
103 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
104 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
105 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
106 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
107 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
108 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
109 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
110 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
111 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
112 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
113 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
114 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
117 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
120 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
121 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
123 __m128i ewitab_lo,ewitab_hi;
124 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
125 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
127 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
128 real rswitch_scalar,d_scalar;
129 __m256 dummy_mask,cutoff_mask;
130 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
131 __m256 one = _mm256_set1_ps(1.0);
132 __m256 two = _mm256_set1_ps(2.0);
138 jindex = nlist->jindex;
140 shiftidx = nlist->shift;
142 shiftvec = fr->shift_vec[0];
143 fshift = fr->fshift[0];
144 facel = _mm256_set1_ps(fr->epsfac);
145 charge = mdatoms->chargeA;
146 nvdwtype = fr->ntype;
148 vdwtype = mdatoms->typeA;
150 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
151 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
152 beta2 = _mm256_mul_ps(beta,beta);
153 beta3 = _mm256_mul_ps(beta,beta2);
155 ewtab = fr->ic->tabq_coul_FDV0;
156 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
157 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
159 /* Setup water-specific parameters */
160 inr = nlist->iinr[0];
161 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
162 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
163 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
164 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
166 jq1 = _mm256_set1_ps(charge[inr+1]);
167 jq2 = _mm256_set1_ps(charge[inr+2]);
168 jq3 = _mm256_set1_ps(charge[inr+3]);
169 vdwjidx0A = 2*vdwtype[inr+0];
170 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
171 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
172 qq11 = _mm256_mul_ps(iq1,jq1);
173 qq12 = _mm256_mul_ps(iq1,jq2);
174 qq13 = _mm256_mul_ps(iq1,jq3);
175 qq21 = _mm256_mul_ps(iq2,jq1);
176 qq22 = _mm256_mul_ps(iq2,jq2);
177 qq23 = _mm256_mul_ps(iq2,jq3);
178 qq31 = _mm256_mul_ps(iq3,jq1);
179 qq32 = _mm256_mul_ps(iq3,jq2);
180 qq33 = _mm256_mul_ps(iq3,jq3);
182 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
183 rcutoff_scalar = fr->rcoulomb;
184 rcutoff = _mm256_set1_ps(rcutoff_scalar);
185 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
187 rswitch_scalar = fr->rcoulomb_switch;
188 rswitch = _mm256_set1_ps(rswitch_scalar);
189 /* Setup switch parameters */
190 d_scalar = rcutoff_scalar-rswitch_scalar;
191 d = _mm256_set1_ps(d_scalar);
192 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
193 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
194 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
195 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
196 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
197 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
199 /* Avoid stupid compiler warnings */
200 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
213 for(iidx=0;iidx<4*DIM;iidx++)
218 /* Start outer loop over neighborlists */
219 for(iidx=0; iidx<nri; iidx++)
221 /* Load shift vector for this list */
222 i_shift_offset = DIM*shiftidx[iidx];
224 /* Load limits for loop over neighbors */
225 j_index_start = jindex[iidx];
226 j_index_end = jindex[iidx+1];
228 /* Get outer coordinate index */
230 i_coord_offset = DIM*inr;
232 /* Load i particle coords and add shift vector */
233 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
234 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
236 fix0 = _mm256_setzero_ps();
237 fiy0 = _mm256_setzero_ps();
238 fiz0 = _mm256_setzero_ps();
239 fix1 = _mm256_setzero_ps();
240 fiy1 = _mm256_setzero_ps();
241 fiz1 = _mm256_setzero_ps();
242 fix2 = _mm256_setzero_ps();
243 fiy2 = _mm256_setzero_ps();
244 fiz2 = _mm256_setzero_ps();
245 fix3 = _mm256_setzero_ps();
246 fiy3 = _mm256_setzero_ps();
247 fiz3 = _mm256_setzero_ps();
249 /* Reset potential sums */
250 velecsum = _mm256_setzero_ps();
251 vvdwsum = _mm256_setzero_ps();
253 /* Start inner kernel loop */
254 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
257 /* Get j neighbor index, and coordinate index */
266 j_coord_offsetA = DIM*jnrA;
267 j_coord_offsetB = DIM*jnrB;
268 j_coord_offsetC = DIM*jnrC;
269 j_coord_offsetD = DIM*jnrD;
270 j_coord_offsetE = DIM*jnrE;
271 j_coord_offsetF = DIM*jnrF;
272 j_coord_offsetG = DIM*jnrG;
273 j_coord_offsetH = DIM*jnrH;
275 /* load j atom coordinates */
276 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
277 x+j_coord_offsetC,x+j_coord_offsetD,
278 x+j_coord_offsetE,x+j_coord_offsetF,
279 x+j_coord_offsetG,x+j_coord_offsetH,
280 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
281 &jy2,&jz2,&jx3,&jy3,&jz3);
283 /* Calculate displacement vector */
284 dx00 = _mm256_sub_ps(ix0,jx0);
285 dy00 = _mm256_sub_ps(iy0,jy0);
286 dz00 = _mm256_sub_ps(iz0,jz0);
287 dx11 = _mm256_sub_ps(ix1,jx1);
288 dy11 = _mm256_sub_ps(iy1,jy1);
289 dz11 = _mm256_sub_ps(iz1,jz1);
290 dx12 = _mm256_sub_ps(ix1,jx2);
291 dy12 = _mm256_sub_ps(iy1,jy2);
292 dz12 = _mm256_sub_ps(iz1,jz2);
293 dx13 = _mm256_sub_ps(ix1,jx3);
294 dy13 = _mm256_sub_ps(iy1,jy3);
295 dz13 = _mm256_sub_ps(iz1,jz3);
296 dx21 = _mm256_sub_ps(ix2,jx1);
297 dy21 = _mm256_sub_ps(iy2,jy1);
298 dz21 = _mm256_sub_ps(iz2,jz1);
299 dx22 = _mm256_sub_ps(ix2,jx2);
300 dy22 = _mm256_sub_ps(iy2,jy2);
301 dz22 = _mm256_sub_ps(iz2,jz2);
302 dx23 = _mm256_sub_ps(ix2,jx3);
303 dy23 = _mm256_sub_ps(iy2,jy3);
304 dz23 = _mm256_sub_ps(iz2,jz3);
305 dx31 = _mm256_sub_ps(ix3,jx1);
306 dy31 = _mm256_sub_ps(iy3,jy1);
307 dz31 = _mm256_sub_ps(iz3,jz1);
308 dx32 = _mm256_sub_ps(ix3,jx2);
309 dy32 = _mm256_sub_ps(iy3,jy2);
310 dz32 = _mm256_sub_ps(iz3,jz2);
311 dx33 = _mm256_sub_ps(ix3,jx3);
312 dy33 = _mm256_sub_ps(iy3,jy3);
313 dz33 = _mm256_sub_ps(iz3,jz3);
315 /* Calculate squared distance and things based on it */
316 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
317 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
318 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
319 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
320 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
321 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
322 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
323 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
324 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
325 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
327 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
328 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
329 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
330 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
331 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
332 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
333 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
334 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
335 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
336 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
338 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
339 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
340 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
341 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
342 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
343 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
344 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
345 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
346 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
347 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
349 fjx0 = _mm256_setzero_ps();
350 fjy0 = _mm256_setzero_ps();
351 fjz0 = _mm256_setzero_ps();
352 fjx1 = _mm256_setzero_ps();
353 fjy1 = _mm256_setzero_ps();
354 fjz1 = _mm256_setzero_ps();
355 fjx2 = _mm256_setzero_ps();
356 fjy2 = _mm256_setzero_ps();
357 fjz2 = _mm256_setzero_ps();
358 fjx3 = _mm256_setzero_ps();
359 fjy3 = _mm256_setzero_ps();
360 fjz3 = _mm256_setzero_ps();
362 /**************************
363 * CALCULATE INTERACTIONS *
364 **************************/
366 if (gmx_mm256_any_lt(rsq00,rcutoff2))
369 r00 = _mm256_mul_ps(rsq00,rinv00);
371 /* LENNARD-JONES DISPERSION/REPULSION */
373 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
374 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
375 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
376 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
377 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
379 d = _mm256_sub_ps(r00,rswitch);
380 d = _mm256_max_ps(d,_mm256_setzero_ps());
381 d2 = _mm256_mul_ps(d,d);
382 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
384 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
386 /* Evaluate switch function */
387 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
388 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
389 vvdw = _mm256_mul_ps(vvdw,sw);
390 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
392 /* Update potential sum for this i atom from the interaction with this j atom. */
393 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
394 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
398 fscal = _mm256_and_ps(fscal,cutoff_mask);
400 /* Calculate temporary vectorial force */
401 tx = _mm256_mul_ps(fscal,dx00);
402 ty = _mm256_mul_ps(fscal,dy00);
403 tz = _mm256_mul_ps(fscal,dz00);
405 /* Update vectorial force */
406 fix0 = _mm256_add_ps(fix0,tx);
407 fiy0 = _mm256_add_ps(fiy0,ty);
408 fiz0 = _mm256_add_ps(fiz0,tz);
410 fjx0 = _mm256_add_ps(fjx0,tx);
411 fjy0 = _mm256_add_ps(fjy0,ty);
412 fjz0 = _mm256_add_ps(fjz0,tz);
416 /**************************
417 * CALCULATE INTERACTIONS *
418 **************************/
420 if (gmx_mm256_any_lt(rsq11,rcutoff2))
423 r11 = _mm256_mul_ps(rsq11,rinv11);
425 /* EWALD ELECTROSTATICS */
427 /* Analytical PME correction */
428 zeta2 = _mm256_mul_ps(beta2,rsq11);
429 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
430 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
431 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
432 felec = _mm256_mul_ps(qq11,felec);
433 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
434 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
435 velec = _mm256_sub_ps(rinv11,pmecorrV);
436 velec = _mm256_mul_ps(qq11,velec);
438 d = _mm256_sub_ps(r11,rswitch);
439 d = _mm256_max_ps(d,_mm256_setzero_ps());
440 d2 = _mm256_mul_ps(d,d);
441 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
443 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
445 /* Evaluate switch function */
446 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
447 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
448 velec = _mm256_mul_ps(velec,sw);
449 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
451 /* Update potential sum for this i atom from the interaction with this j atom. */
452 velec = _mm256_and_ps(velec,cutoff_mask);
453 velecsum = _mm256_add_ps(velecsum,velec);
457 fscal = _mm256_and_ps(fscal,cutoff_mask);
459 /* Calculate temporary vectorial force */
460 tx = _mm256_mul_ps(fscal,dx11);
461 ty = _mm256_mul_ps(fscal,dy11);
462 tz = _mm256_mul_ps(fscal,dz11);
464 /* Update vectorial force */
465 fix1 = _mm256_add_ps(fix1,tx);
466 fiy1 = _mm256_add_ps(fiy1,ty);
467 fiz1 = _mm256_add_ps(fiz1,tz);
469 fjx1 = _mm256_add_ps(fjx1,tx);
470 fjy1 = _mm256_add_ps(fjy1,ty);
471 fjz1 = _mm256_add_ps(fjz1,tz);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 if (gmx_mm256_any_lt(rsq12,rcutoff2))
482 r12 = _mm256_mul_ps(rsq12,rinv12);
484 /* EWALD ELECTROSTATICS */
486 /* Analytical PME correction */
487 zeta2 = _mm256_mul_ps(beta2,rsq12);
488 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
489 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
490 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
491 felec = _mm256_mul_ps(qq12,felec);
492 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
493 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
494 velec = _mm256_sub_ps(rinv12,pmecorrV);
495 velec = _mm256_mul_ps(qq12,velec);
497 d = _mm256_sub_ps(r12,rswitch);
498 d = _mm256_max_ps(d,_mm256_setzero_ps());
499 d2 = _mm256_mul_ps(d,d);
500 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
502 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
504 /* Evaluate switch function */
505 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
506 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
507 velec = _mm256_mul_ps(velec,sw);
508 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velec = _mm256_and_ps(velec,cutoff_mask);
512 velecsum = _mm256_add_ps(velecsum,velec);
516 fscal = _mm256_and_ps(fscal,cutoff_mask);
518 /* Calculate temporary vectorial force */
519 tx = _mm256_mul_ps(fscal,dx12);
520 ty = _mm256_mul_ps(fscal,dy12);
521 tz = _mm256_mul_ps(fscal,dz12);
523 /* Update vectorial force */
524 fix1 = _mm256_add_ps(fix1,tx);
525 fiy1 = _mm256_add_ps(fiy1,ty);
526 fiz1 = _mm256_add_ps(fiz1,tz);
528 fjx2 = _mm256_add_ps(fjx2,tx);
529 fjy2 = _mm256_add_ps(fjy2,ty);
530 fjz2 = _mm256_add_ps(fjz2,tz);
534 /**************************
535 * CALCULATE INTERACTIONS *
536 **************************/
538 if (gmx_mm256_any_lt(rsq13,rcutoff2))
541 r13 = _mm256_mul_ps(rsq13,rinv13);
543 /* EWALD ELECTROSTATICS */
545 /* Analytical PME correction */
546 zeta2 = _mm256_mul_ps(beta2,rsq13);
547 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
548 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
549 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
550 felec = _mm256_mul_ps(qq13,felec);
551 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
552 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
553 velec = _mm256_sub_ps(rinv13,pmecorrV);
554 velec = _mm256_mul_ps(qq13,velec);
556 d = _mm256_sub_ps(r13,rswitch);
557 d = _mm256_max_ps(d,_mm256_setzero_ps());
558 d2 = _mm256_mul_ps(d,d);
559 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
561 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
563 /* Evaluate switch function */
564 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
565 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
566 velec = _mm256_mul_ps(velec,sw);
567 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velec = _mm256_and_ps(velec,cutoff_mask);
571 velecsum = _mm256_add_ps(velecsum,velec);
575 fscal = _mm256_and_ps(fscal,cutoff_mask);
577 /* Calculate temporary vectorial force */
578 tx = _mm256_mul_ps(fscal,dx13);
579 ty = _mm256_mul_ps(fscal,dy13);
580 tz = _mm256_mul_ps(fscal,dz13);
582 /* Update vectorial force */
583 fix1 = _mm256_add_ps(fix1,tx);
584 fiy1 = _mm256_add_ps(fiy1,ty);
585 fiz1 = _mm256_add_ps(fiz1,tz);
587 fjx3 = _mm256_add_ps(fjx3,tx);
588 fjy3 = _mm256_add_ps(fjy3,ty);
589 fjz3 = _mm256_add_ps(fjz3,tz);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 if (gmx_mm256_any_lt(rsq21,rcutoff2))
600 r21 = _mm256_mul_ps(rsq21,rinv21);
602 /* EWALD ELECTROSTATICS */
604 /* Analytical PME correction */
605 zeta2 = _mm256_mul_ps(beta2,rsq21);
606 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
607 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
608 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
609 felec = _mm256_mul_ps(qq21,felec);
610 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
611 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
612 velec = _mm256_sub_ps(rinv21,pmecorrV);
613 velec = _mm256_mul_ps(qq21,velec);
615 d = _mm256_sub_ps(r21,rswitch);
616 d = _mm256_max_ps(d,_mm256_setzero_ps());
617 d2 = _mm256_mul_ps(d,d);
618 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
620 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
622 /* Evaluate switch function */
623 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
624 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
625 velec = _mm256_mul_ps(velec,sw);
626 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
628 /* Update potential sum for this i atom from the interaction with this j atom. */
629 velec = _mm256_and_ps(velec,cutoff_mask);
630 velecsum = _mm256_add_ps(velecsum,velec);
634 fscal = _mm256_and_ps(fscal,cutoff_mask);
636 /* Calculate temporary vectorial force */
637 tx = _mm256_mul_ps(fscal,dx21);
638 ty = _mm256_mul_ps(fscal,dy21);
639 tz = _mm256_mul_ps(fscal,dz21);
641 /* Update vectorial force */
642 fix2 = _mm256_add_ps(fix2,tx);
643 fiy2 = _mm256_add_ps(fiy2,ty);
644 fiz2 = _mm256_add_ps(fiz2,tz);
646 fjx1 = _mm256_add_ps(fjx1,tx);
647 fjy1 = _mm256_add_ps(fjy1,ty);
648 fjz1 = _mm256_add_ps(fjz1,tz);
652 /**************************
653 * CALCULATE INTERACTIONS *
654 **************************/
656 if (gmx_mm256_any_lt(rsq22,rcutoff2))
659 r22 = _mm256_mul_ps(rsq22,rinv22);
661 /* EWALD ELECTROSTATICS */
663 /* Analytical PME correction */
664 zeta2 = _mm256_mul_ps(beta2,rsq22);
665 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
666 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
667 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
668 felec = _mm256_mul_ps(qq22,felec);
669 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
670 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
671 velec = _mm256_sub_ps(rinv22,pmecorrV);
672 velec = _mm256_mul_ps(qq22,velec);
674 d = _mm256_sub_ps(r22,rswitch);
675 d = _mm256_max_ps(d,_mm256_setzero_ps());
676 d2 = _mm256_mul_ps(d,d);
677 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
679 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
681 /* Evaluate switch function */
682 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
683 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
684 velec = _mm256_mul_ps(velec,sw);
685 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velec = _mm256_and_ps(velec,cutoff_mask);
689 velecsum = _mm256_add_ps(velecsum,velec);
693 fscal = _mm256_and_ps(fscal,cutoff_mask);
695 /* Calculate temporary vectorial force */
696 tx = _mm256_mul_ps(fscal,dx22);
697 ty = _mm256_mul_ps(fscal,dy22);
698 tz = _mm256_mul_ps(fscal,dz22);
700 /* Update vectorial force */
701 fix2 = _mm256_add_ps(fix2,tx);
702 fiy2 = _mm256_add_ps(fiy2,ty);
703 fiz2 = _mm256_add_ps(fiz2,tz);
705 fjx2 = _mm256_add_ps(fjx2,tx);
706 fjy2 = _mm256_add_ps(fjy2,ty);
707 fjz2 = _mm256_add_ps(fjz2,tz);
711 /**************************
712 * CALCULATE INTERACTIONS *
713 **************************/
715 if (gmx_mm256_any_lt(rsq23,rcutoff2))
718 r23 = _mm256_mul_ps(rsq23,rinv23);
720 /* EWALD ELECTROSTATICS */
722 /* Analytical PME correction */
723 zeta2 = _mm256_mul_ps(beta2,rsq23);
724 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
725 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
726 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
727 felec = _mm256_mul_ps(qq23,felec);
728 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
729 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
730 velec = _mm256_sub_ps(rinv23,pmecorrV);
731 velec = _mm256_mul_ps(qq23,velec);
733 d = _mm256_sub_ps(r23,rswitch);
734 d = _mm256_max_ps(d,_mm256_setzero_ps());
735 d2 = _mm256_mul_ps(d,d);
736 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
738 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
740 /* Evaluate switch function */
741 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
742 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
743 velec = _mm256_mul_ps(velec,sw);
744 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
746 /* Update potential sum for this i atom from the interaction with this j atom. */
747 velec = _mm256_and_ps(velec,cutoff_mask);
748 velecsum = _mm256_add_ps(velecsum,velec);
752 fscal = _mm256_and_ps(fscal,cutoff_mask);
754 /* Calculate temporary vectorial force */
755 tx = _mm256_mul_ps(fscal,dx23);
756 ty = _mm256_mul_ps(fscal,dy23);
757 tz = _mm256_mul_ps(fscal,dz23);
759 /* Update vectorial force */
760 fix2 = _mm256_add_ps(fix2,tx);
761 fiy2 = _mm256_add_ps(fiy2,ty);
762 fiz2 = _mm256_add_ps(fiz2,tz);
764 fjx3 = _mm256_add_ps(fjx3,tx);
765 fjy3 = _mm256_add_ps(fjy3,ty);
766 fjz3 = _mm256_add_ps(fjz3,tz);
770 /**************************
771 * CALCULATE INTERACTIONS *
772 **************************/
774 if (gmx_mm256_any_lt(rsq31,rcutoff2))
777 r31 = _mm256_mul_ps(rsq31,rinv31);
779 /* EWALD ELECTROSTATICS */
781 /* Analytical PME correction */
782 zeta2 = _mm256_mul_ps(beta2,rsq31);
783 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
784 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
785 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
786 felec = _mm256_mul_ps(qq31,felec);
787 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
788 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
789 velec = _mm256_sub_ps(rinv31,pmecorrV);
790 velec = _mm256_mul_ps(qq31,velec);
792 d = _mm256_sub_ps(r31,rswitch);
793 d = _mm256_max_ps(d,_mm256_setzero_ps());
794 d2 = _mm256_mul_ps(d,d);
795 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
797 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
799 /* Evaluate switch function */
800 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
801 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
802 velec = _mm256_mul_ps(velec,sw);
803 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
805 /* Update potential sum for this i atom from the interaction with this j atom. */
806 velec = _mm256_and_ps(velec,cutoff_mask);
807 velecsum = _mm256_add_ps(velecsum,velec);
811 fscal = _mm256_and_ps(fscal,cutoff_mask);
813 /* Calculate temporary vectorial force */
814 tx = _mm256_mul_ps(fscal,dx31);
815 ty = _mm256_mul_ps(fscal,dy31);
816 tz = _mm256_mul_ps(fscal,dz31);
818 /* Update vectorial force */
819 fix3 = _mm256_add_ps(fix3,tx);
820 fiy3 = _mm256_add_ps(fiy3,ty);
821 fiz3 = _mm256_add_ps(fiz3,tz);
823 fjx1 = _mm256_add_ps(fjx1,tx);
824 fjy1 = _mm256_add_ps(fjy1,ty);
825 fjz1 = _mm256_add_ps(fjz1,tz);
829 /**************************
830 * CALCULATE INTERACTIONS *
831 **************************/
833 if (gmx_mm256_any_lt(rsq32,rcutoff2))
836 r32 = _mm256_mul_ps(rsq32,rinv32);
838 /* EWALD ELECTROSTATICS */
840 /* Analytical PME correction */
841 zeta2 = _mm256_mul_ps(beta2,rsq32);
842 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
843 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
844 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
845 felec = _mm256_mul_ps(qq32,felec);
846 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
847 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
848 velec = _mm256_sub_ps(rinv32,pmecorrV);
849 velec = _mm256_mul_ps(qq32,velec);
851 d = _mm256_sub_ps(r32,rswitch);
852 d = _mm256_max_ps(d,_mm256_setzero_ps());
853 d2 = _mm256_mul_ps(d,d);
854 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
856 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
858 /* Evaluate switch function */
859 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
860 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
861 velec = _mm256_mul_ps(velec,sw);
862 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
864 /* Update potential sum for this i atom from the interaction with this j atom. */
865 velec = _mm256_and_ps(velec,cutoff_mask);
866 velecsum = _mm256_add_ps(velecsum,velec);
870 fscal = _mm256_and_ps(fscal,cutoff_mask);
872 /* Calculate temporary vectorial force */
873 tx = _mm256_mul_ps(fscal,dx32);
874 ty = _mm256_mul_ps(fscal,dy32);
875 tz = _mm256_mul_ps(fscal,dz32);
877 /* Update vectorial force */
878 fix3 = _mm256_add_ps(fix3,tx);
879 fiy3 = _mm256_add_ps(fiy3,ty);
880 fiz3 = _mm256_add_ps(fiz3,tz);
882 fjx2 = _mm256_add_ps(fjx2,tx);
883 fjy2 = _mm256_add_ps(fjy2,ty);
884 fjz2 = _mm256_add_ps(fjz2,tz);
888 /**************************
889 * CALCULATE INTERACTIONS *
890 **************************/
892 if (gmx_mm256_any_lt(rsq33,rcutoff2))
895 r33 = _mm256_mul_ps(rsq33,rinv33);
897 /* EWALD ELECTROSTATICS */
899 /* Analytical PME correction */
900 zeta2 = _mm256_mul_ps(beta2,rsq33);
901 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
902 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
903 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
904 felec = _mm256_mul_ps(qq33,felec);
905 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
906 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
907 velec = _mm256_sub_ps(rinv33,pmecorrV);
908 velec = _mm256_mul_ps(qq33,velec);
910 d = _mm256_sub_ps(r33,rswitch);
911 d = _mm256_max_ps(d,_mm256_setzero_ps());
912 d2 = _mm256_mul_ps(d,d);
913 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
915 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
917 /* Evaluate switch function */
918 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
919 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
920 velec = _mm256_mul_ps(velec,sw);
921 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
923 /* Update potential sum for this i atom from the interaction with this j atom. */
924 velec = _mm256_and_ps(velec,cutoff_mask);
925 velecsum = _mm256_add_ps(velecsum,velec);
929 fscal = _mm256_and_ps(fscal,cutoff_mask);
931 /* Calculate temporary vectorial force */
932 tx = _mm256_mul_ps(fscal,dx33);
933 ty = _mm256_mul_ps(fscal,dy33);
934 tz = _mm256_mul_ps(fscal,dz33);
936 /* Update vectorial force */
937 fix3 = _mm256_add_ps(fix3,tx);
938 fiy3 = _mm256_add_ps(fiy3,ty);
939 fiz3 = _mm256_add_ps(fiz3,tz);
941 fjx3 = _mm256_add_ps(fjx3,tx);
942 fjy3 = _mm256_add_ps(fjy3,ty);
943 fjz3 = _mm256_add_ps(fjz3,tz);
947 fjptrA = f+j_coord_offsetA;
948 fjptrB = f+j_coord_offsetB;
949 fjptrC = f+j_coord_offsetC;
950 fjptrD = f+j_coord_offsetD;
951 fjptrE = f+j_coord_offsetE;
952 fjptrF = f+j_coord_offsetF;
953 fjptrG = f+j_coord_offsetG;
954 fjptrH = f+j_coord_offsetH;
956 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
957 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
958 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
960 /* Inner loop uses 1034 flops */
966 /* Get j neighbor index, and coordinate index */
967 jnrlistA = jjnr[jidx];
968 jnrlistB = jjnr[jidx+1];
969 jnrlistC = jjnr[jidx+2];
970 jnrlistD = jjnr[jidx+3];
971 jnrlistE = jjnr[jidx+4];
972 jnrlistF = jjnr[jidx+5];
973 jnrlistG = jjnr[jidx+6];
974 jnrlistH = jjnr[jidx+7];
975 /* Sign of each element will be negative for non-real atoms.
976 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
977 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
978 */
979 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
980 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
982 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
983 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
984 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
985 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
986 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
987 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
988 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
989 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
990 j_coord_offsetA = DIM*jnrA;
991 j_coord_offsetB = DIM*jnrB;
992 j_coord_offsetC = DIM*jnrC;
993 j_coord_offsetD = DIM*jnrD;
994 j_coord_offsetE = DIM*jnrE;
995 j_coord_offsetF = DIM*jnrF;
996 j_coord_offsetG = DIM*jnrG;
997 j_coord_offsetH = DIM*jnrH;
999 /* load j atom coordinates */
1000 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1001 x+j_coord_offsetC,x+j_coord_offsetD,
1002 x+j_coord_offsetE,x+j_coord_offsetF,
1003 x+j_coord_offsetG,x+j_coord_offsetH,
1004 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1005 &jy2,&jz2,&jx3,&jy3,&jz3);
1007 /* Calculate displacement vector */
1008 dx00 = _mm256_sub_ps(ix0,jx0);
1009 dy00 = _mm256_sub_ps(iy0,jy0);
1010 dz00 = _mm256_sub_ps(iz0,jz0);
1011 dx11 = _mm256_sub_ps(ix1,jx1);
1012 dy11 = _mm256_sub_ps(iy1,jy1);
1013 dz11 = _mm256_sub_ps(iz1,jz1);
1014 dx12 = _mm256_sub_ps(ix1,jx2);
1015 dy12 = _mm256_sub_ps(iy1,jy2);
1016 dz12 = _mm256_sub_ps(iz1,jz2);
1017 dx13 = _mm256_sub_ps(ix1,jx3);
1018 dy13 = _mm256_sub_ps(iy1,jy3);
1019 dz13 = _mm256_sub_ps(iz1,jz3);
1020 dx21 = _mm256_sub_ps(ix2,jx1);
1021 dy21 = _mm256_sub_ps(iy2,jy1);
1022 dz21 = _mm256_sub_ps(iz2,jz1);
1023 dx22 = _mm256_sub_ps(ix2,jx2);
1024 dy22 = _mm256_sub_ps(iy2,jy2);
1025 dz22 = _mm256_sub_ps(iz2,jz2);
1026 dx23 = _mm256_sub_ps(ix2,jx3);
1027 dy23 = _mm256_sub_ps(iy2,jy3);
1028 dz23 = _mm256_sub_ps(iz2,jz3);
1029 dx31 = _mm256_sub_ps(ix3,jx1);
1030 dy31 = _mm256_sub_ps(iy3,jy1);
1031 dz31 = _mm256_sub_ps(iz3,jz1);
1032 dx32 = _mm256_sub_ps(ix3,jx2);
1033 dy32 = _mm256_sub_ps(iy3,jy2);
1034 dz32 = _mm256_sub_ps(iz3,jz2);
1035 dx33 = _mm256_sub_ps(ix3,jx3);
1036 dy33 = _mm256_sub_ps(iy3,jy3);
1037 dz33 = _mm256_sub_ps(iz3,jz3);
1039 /* Calculate squared distance and things based on it */
1040 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1041 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1042 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1043 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1044 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1045 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1046 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1047 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1048 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1049 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1051 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1052 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1053 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1054 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1055 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1056 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1057 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1058 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1059 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1060 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1062 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1063 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1064 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1065 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1066 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1067 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1068 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1069 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1070 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1071 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1073 fjx0 = _mm256_setzero_ps();
1074 fjy0 = _mm256_setzero_ps();
1075 fjz0 = _mm256_setzero_ps();
1076 fjx1 = _mm256_setzero_ps();
1077 fjy1 = _mm256_setzero_ps();
1078 fjz1 = _mm256_setzero_ps();
1079 fjx2 = _mm256_setzero_ps();
1080 fjy2 = _mm256_setzero_ps();
1081 fjz2 = _mm256_setzero_ps();
1082 fjx3 = _mm256_setzero_ps();
1083 fjy3 = _mm256_setzero_ps();
1084 fjz3 = _mm256_setzero_ps();
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1093 r00 = _mm256_mul_ps(rsq00,rinv00);
1094 r00 = _mm256_andnot_ps(dummy_mask,r00);
1096 /* LENNARD-JONES DISPERSION/REPULSION */
1098 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1099 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
1100 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1101 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1102 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1104 d = _mm256_sub_ps(r00,rswitch);
1105 d = _mm256_max_ps(d,_mm256_setzero_ps());
1106 d2 = _mm256_mul_ps(d,d);
1107 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1109 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1111 /* Evaluate switch function */
1112 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1113 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1114 vvdw = _mm256_mul_ps(vvdw,sw);
1115 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1117 /* Update potential sum for this i atom from the interaction with this j atom. */
1118 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
1119 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
1120 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1124 fscal = _mm256_and_ps(fscal,cutoff_mask);
1126 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1128 /* Calculate temporary vectorial force */
1129 tx = _mm256_mul_ps(fscal,dx00);
1130 ty = _mm256_mul_ps(fscal,dy00);
1131 tz = _mm256_mul_ps(fscal,dz00);
1133 /* Update vectorial force */
1134 fix0 = _mm256_add_ps(fix0,tx);
1135 fiy0 = _mm256_add_ps(fiy0,ty);
1136 fiz0 = _mm256_add_ps(fiz0,tz);
1138 fjx0 = _mm256_add_ps(fjx0,tx);
1139 fjy0 = _mm256_add_ps(fjy0,ty);
1140 fjz0 = _mm256_add_ps(fjz0,tz);
1144 /**************************
1145 * CALCULATE INTERACTIONS *
1146 **************************/
1148 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1151 r11 = _mm256_mul_ps(rsq11,rinv11);
1152 r11 = _mm256_andnot_ps(dummy_mask,r11);
1154 /* EWALD ELECTROSTATICS */
1156 /* Analytical PME correction */
1157 zeta2 = _mm256_mul_ps(beta2,rsq11);
1158 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1159 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1160 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1161 felec = _mm256_mul_ps(qq11,felec);
1162 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1163 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1164 velec = _mm256_sub_ps(rinv11,pmecorrV);
1165 velec = _mm256_mul_ps(qq11,velec);
1167 d = _mm256_sub_ps(r11,rswitch);
1168 d = _mm256_max_ps(d,_mm256_setzero_ps());
1169 d2 = _mm256_mul_ps(d,d);
1170 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1172 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1174 /* Evaluate switch function */
1175 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1176 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1177 velec = _mm256_mul_ps(velec,sw);
1178 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1180 /* Update potential sum for this i atom from the interaction with this j atom. */
1181 velec = _mm256_and_ps(velec,cutoff_mask);
1182 velec = _mm256_andnot_ps(dummy_mask,velec);
1183 velecsum = _mm256_add_ps(velecsum,velec);
1187 fscal = _mm256_and_ps(fscal,cutoff_mask);
1189 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1191 /* Calculate temporary vectorial force */
1192 tx = _mm256_mul_ps(fscal,dx11);
1193 ty = _mm256_mul_ps(fscal,dy11);
1194 tz = _mm256_mul_ps(fscal,dz11);
1196 /* Update vectorial force */
1197 fix1 = _mm256_add_ps(fix1,tx);
1198 fiy1 = _mm256_add_ps(fiy1,ty);
1199 fiz1 = _mm256_add_ps(fiz1,tz);
1201 fjx1 = _mm256_add_ps(fjx1,tx);
1202 fjy1 = _mm256_add_ps(fjy1,ty);
1203 fjz1 = _mm256_add_ps(fjz1,tz);
1207 /**************************
1208 * CALCULATE INTERACTIONS *
1209 **************************/
1211 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1214 r12 = _mm256_mul_ps(rsq12,rinv12);
1215 r12 = _mm256_andnot_ps(dummy_mask,r12);
1217 /* EWALD ELECTROSTATICS */
1219 /* Analytical PME correction */
1220 zeta2 = _mm256_mul_ps(beta2,rsq12);
1221 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1222 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1223 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1224 felec = _mm256_mul_ps(qq12,felec);
1225 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1226 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1227 velec = _mm256_sub_ps(rinv12,pmecorrV);
1228 velec = _mm256_mul_ps(qq12,velec);
1230 d = _mm256_sub_ps(r12,rswitch);
1231 d = _mm256_max_ps(d,_mm256_setzero_ps());
1232 d2 = _mm256_mul_ps(d,d);
1233 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1235 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1237 /* Evaluate switch function */
1238 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1239 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1240 velec = _mm256_mul_ps(velec,sw);
1241 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1243 /* Update potential sum for this i atom from the interaction with this j atom. */
1244 velec = _mm256_and_ps(velec,cutoff_mask);
1245 velec = _mm256_andnot_ps(dummy_mask,velec);
1246 velecsum = _mm256_add_ps(velecsum,velec);
1250 fscal = _mm256_and_ps(fscal,cutoff_mask);
1252 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1254 /* Calculate temporary vectorial force */
1255 tx = _mm256_mul_ps(fscal,dx12);
1256 ty = _mm256_mul_ps(fscal,dy12);
1257 tz = _mm256_mul_ps(fscal,dz12);
1259 /* Update vectorial force */
1260 fix1 = _mm256_add_ps(fix1,tx);
1261 fiy1 = _mm256_add_ps(fiy1,ty);
1262 fiz1 = _mm256_add_ps(fiz1,tz);
1264 fjx2 = _mm256_add_ps(fjx2,tx);
1265 fjy2 = _mm256_add_ps(fjy2,ty);
1266 fjz2 = _mm256_add_ps(fjz2,tz);
1270 /**************************
1271 * CALCULATE INTERACTIONS *
1272 **************************/
1274 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1277 r13 = _mm256_mul_ps(rsq13,rinv13);
1278 r13 = _mm256_andnot_ps(dummy_mask,r13);
1280 /* EWALD ELECTROSTATICS */
1282 /* Analytical PME correction */
1283 zeta2 = _mm256_mul_ps(beta2,rsq13);
1284 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1285 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1286 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1287 felec = _mm256_mul_ps(qq13,felec);
1288 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1289 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1290 velec = _mm256_sub_ps(rinv13,pmecorrV);
1291 velec = _mm256_mul_ps(qq13,velec);
1293 d = _mm256_sub_ps(r13,rswitch);
1294 d = _mm256_max_ps(d,_mm256_setzero_ps());
1295 d2 = _mm256_mul_ps(d,d);
1296 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1298 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1300 /* Evaluate switch function */
1301 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1302 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
1303 velec = _mm256_mul_ps(velec,sw);
1304 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1306 /* Update potential sum for this i atom from the interaction with this j atom. */
1307 velec = _mm256_and_ps(velec,cutoff_mask);
1308 velec = _mm256_andnot_ps(dummy_mask,velec);
1309 velecsum = _mm256_add_ps(velecsum,velec);
1313 fscal = _mm256_and_ps(fscal,cutoff_mask);
1315 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1317 /* Calculate temporary vectorial force */
1318 tx = _mm256_mul_ps(fscal,dx13);
1319 ty = _mm256_mul_ps(fscal,dy13);
1320 tz = _mm256_mul_ps(fscal,dz13);
1322 /* Update vectorial force */
1323 fix1 = _mm256_add_ps(fix1,tx);
1324 fiy1 = _mm256_add_ps(fiy1,ty);
1325 fiz1 = _mm256_add_ps(fiz1,tz);
1327 fjx3 = _mm256_add_ps(fjx3,tx);
1328 fjy3 = _mm256_add_ps(fjy3,ty);
1329 fjz3 = _mm256_add_ps(fjz3,tz);
1333 /**************************
1334 * CALCULATE INTERACTIONS *
1335 **************************/
1337 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1340 r21 = _mm256_mul_ps(rsq21,rinv21);
1341 r21 = _mm256_andnot_ps(dummy_mask,r21);
1343 /* EWALD ELECTROSTATICS */
1345 /* Analytical PME correction */
1346 zeta2 = _mm256_mul_ps(beta2,rsq21);
1347 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1348 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1349 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1350 felec = _mm256_mul_ps(qq21,felec);
1351 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1352 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1353 velec = _mm256_sub_ps(rinv21,pmecorrV);
1354 velec = _mm256_mul_ps(qq21,velec);
1356 d = _mm256_sub_ps(r21,rswitch);
1357 d = _mm256_max_ps(d,_mm256_setzero_ps());
1358 d2 = _mm256_mul_ps(d,d);
1359 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1361 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1363 /* Evaluate switch function */
1364 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1365 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1366 velec = _mm256_mul_ps(velec,sw);
1367 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1369 /* Update potential sum for this i atom from the interaction with this j atom. */
1370 velec = _mm256_and_ps(velec,cutoff_mask);
1371 velec = _mm256_andnot_ps(dummy_mask,velec);
1372 velecsum = _mm256_add_ps(velecsum,velec);
1376 fscal = _mm256_and_ps(fscal,cutoff_mask);
1378 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1380 /* Calculate temporary vectorial force */
1381 tx = _mm256_mul_ps(fscal,dx21);
1382 ty = _mm256_mul_ps(fscal,dy21);
1383 tz = _mm256_mul_ps(fscal,dz21);
1385 /* Update vectorial force */
1386 fix2 = _mm256_add_ps(fix2,tx);
1387 fiy2 = _mm256_add_ps(fiy2,ty);
1388 fiz2 = _mm256_add_ps(fiz2,tz);
1390 fjx1 = _mm256_add_ps(fjx1,tx);
1391 fjy1 = _mm256_add_ps(fjy1,ty);
1392 fjz1 = _mm256_add_ps(fjz1,tz);
1396 /**************************
1397 * CALCULATE INTERACTIONS *
1398 **************************/
1400 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1403 r22 = _mm256_mul_ps(rsq22,rinv22);
1404 r22 = _mm256_andnot_ps(dummy_mask,r22);
1406 /* EWALD ELECTROSTATICS */
1408 /* Analytical PME correction */
1409 zeta2 = _mm256_mul_ps(beta2,rsq22);
1410 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1411 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1412 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1413 felec = _mm256_mul_ps(qq22,felec);
1414 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1415 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1416 velec = _mm256_sub_ps(rinv22,pmecorrV);
1417 velec = _mm256_mul_ps(qq22,velec);
1419 d = _mm256_sub_ps(r22,rswitch);
1420 d = _mm256_max_ps(d,_mm256_setzero_ps());
1421 d2 = _mm256_mul_ps(d,d);
1422 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1424 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1426 /* Evaluate switch function */
1427 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1428 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1429 velec = _mm256_mul_ps(velec,sw);
1430 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1432 /* Update potential sum for this i atom from the interaction with this j atom. */
1433 velec = _mm256_and_ps(velec,cutoff_mask);
1434 velec = _mm256_andnot_ps(dummy_mask,velec);
1435 velecsum = _mm256_add_ps(velecsum,velec);
1439 fscal = _mm256_and_ps(fscal,cutoff_mask);
1441 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1443 /* Calculate temporary vectorial force */
1444 tx = _mm256_mul_ps(fscal,dx22);
1445 ty = _mm256_mul_ps(fscal,dy22);
1446 tz = _mm256_mul_ps(fscal,dz22);
1448 /* Update vectorial force */
1449 fix2 = _mm256_add_ps(fix2,tx);
1450 fiy2 = _mm256_add_ps(fiy2,ty);
1451 fiz2 = _mm256_add_ps(fiz2,tz);
1453 fjx2 = _mm256_add_ps(fjx2,tx);
1454 fjy2 = _mm256_add_ps(fjy2,ty);
1455 fjz2 = _mm256_add_ps(fjz2,tz);
1459 /**************************
1460 * CALCULATE INTERACTIONS *
1461 **************************/
1463 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1466 r23 = _mm256_mul_ps(rsq23,rinv23);
1467 r23 = _mm256_andnot_ps(dummy_mask,r23);
1469 /* EWALD ELECTROSTATICS */
1471 /* Analytical PME correction */
1472 zeta2 = _mm256_mul_ps(beta2,rsq23);
1473 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1474 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1475 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1476 felec = _mm256_mul_ps(qq23,felec);
1477 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1478 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1479 velec = _mm256_sub_ps(rinv23,pmecorrV);
1480 velec = _mm256_mul_ps(qq23,velec);
1482 d = _mm256_sub_ps(r23,rswitch);
1483 d = _mm256_max_ps(d,_mm256_setzero_ps());
1484 d2 = _mm256_mul_ps(d,d);
1485 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1487 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1489 /* Evaluate switch function */
1490 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1491 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
1492 velec = _mm256_mul_ps(velec,sw);
1493 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1495 /* Update potential sum for this i atom from the interaction with this j atom. */
1496 velec = _mm256_and_ps(velec,cutoff_mask);
1497 velec = _mm256_andnot_ps(dummy_mask,velec);
1498 velecsum = _mm256_add_ps(velecsum,velec);
1502 fscal = _mm256_and_ps(fscal,cutoff_mask);
1504 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1506 /* Calculate temporary vectorial force */
1507 tx = _mm256_mul_ps(fscal,dx23);
1508 ty = _mm256_mul_ps(fscal,dy23);
1509 tz = _mm256_mul_ps(fscal,dz23);
1511 /* Update vectorial force */
1512 fix2 = _mm256_add_ps(fix2,tx);
1513 fiy2 = _mm256_add_ps(fiy2,ty);
1514 fiz2 = _mm256_add_ps(fiz2,tz);
1516 fjx3 = _mm256_add_ps(fjx3,tx);
1517 fjy3 = _mm256_add_ps(fjy3,ty);
1518 fjz3 = _mm256_add_ps(fjz3,tz);
1522 /**************************
1523 * CALCULATE INTERACTIONS *
1524 **************************/
1526 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1529 r31 = _mm256_mul_ps(rsq31,rinv31);
1530 r31 = _mm256_andnot_ps(dummy_mask,r31);
1532 /* EWALD ELECTROSTATICS */
1534 /* Analytical PME correction */
1535 zeta2 = _mm256_mul_ps(beta2,rsq31);
1536 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1537 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1538 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1539 felec = _mm256_mul_ps(qq31,felec);
1540 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1541 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1542 velec = _mm256_sub_ps(rinv31,pmecorrV);
1543 velec = _mm256_mul_ps(qq31,velec);
1545 d = _mm256_sub_ps(r31,rswitch);
1546 d = _mm256_max_ps(d,_mm256_setzero_ps());
1547 d2 = _mm256_mul_ps(d,d);
1548 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1550 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1552 /* Evaluate switch function */
1553 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1554 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
1555 velec = _mm256_mul_ps(velec,sw);
1556 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1558 /* Update potential sum for this i atom from the interaction with this j atom. */
1559 velec = _mm256_and_ps(velec,cutoff_mask);
1560 velec = _mm256_andnot_ps(dummy_mask,velec);
1561 velecsum = _mm256_add_ps(velecsum,velec);
1565 fscal = _mm256_and_ps(fscal,cutoff_mask);
1567 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1569 /* Calculate temporary vectorial force */
1570 tx = _mm256_mul_ps(fscal,dx31);
1571 ty = _mm256_mul_ps(fscal,dy31);
1572 tz = _mm256_mul_ps(fscal,dz31);
1574 /* Update vectorial force */
1575 fix3 = _mm256_add_ps(fix3,tx);
1576 fiy3 = _mm256_add_ps(fiy3,ty);
1577 fiz3 = _mm256_add_ps(fiz3,tz);
1579 fjx1 = _mm256_add_ps(fjx1,tx);
1580 fjy1 = _mm256_add_ps(fjy1,ty);
1581 fjz1 = _mm256_add_ps(fjz1,tz);
1585 /**************************
1586 * CALCULATE INTERACTIONS *
1587 **************************/
1589 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1592 r32 = _mm256_mul_ps(rsq32,rinv32);
1593 r32 = _mm256_andnot_ps(dummy_mask,r32);
1595 /* EWALD ELECTROSTATICS */
1597 /* Analytical PME correction */
1598 zeta2 = _mm256_mul_ps(beta2,rsq32);
1599 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1600 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1601 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1602 felec = _mm256_mul_ps(qq32,felec);
1603 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1604 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1605 velec = _mm256_sub_ps(rinv32,pmecorrV);
1606 velec = _mm256_mul_ps(qq32,velec);
1608 d = _mm256_sub_ps(r32,rswitch);
1609 d = _mm256_max_ps(d,_mm256_setzero_ps());
1610 d2 = _mm256_mul_ps(d,d);
1611 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1613 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1615 /* Evaluate switch function */
1616 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1617 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
1618 velec = _mm256_mul_ps(velec,sw);
1619 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1621 /* Update potential sum for this i atom from the interaction with this j atom. */
1622 velec = _mm256_and_ps(velec,cutoff_mask);
1623 velec = _mm256_andnot_ps(dummy_mask,velec);
1624 velecsum = _mm256_add_ps(velecsum,velec);
1628 fscal = _mm256_and_ps(fscal,cutoff_mask);
1630 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1632 /* Calculate temporary vectorial force */
1633 tx = _mm256_mul_ps(fscal,dx32);
1634 ty = _mm256_mul_ps(fscal,dy32);
1635 tz = _mm256_mul_ps(fscal,dz32);
1637 /* Update vectorial force */
1638 fix3 = _mm256_add_ps(fix3,tx);
1639 fiy3 = _mm256_add_ps(fiy3,ty);
1640 fiz3 = _mm256_add_ps(fiz3,tz);
1642 fjx2 = _mm256_add_ps(fjx2,tx);
1643 fjy2 = _mm256_add_ps(fjy2,ty);
1644 fjz2 = _mm256_add_ps(fjz2,tz);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1655 r33 = _mm256_mul_ps(rsq33,rinv33);
1656 r33 = _mm256_andnot_ps(dummy_mask,r33);
1658 /* EWALD ELECTROSTATICS */
1660 /* Analytical PME correction */
1661 zeta2 = _mm256_mul_ps(beta2,rsq33);
1662 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1663 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1664 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1665 felec = _mm256_mul_ps(qq33,felec);
1666 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1667 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1668 velec = _mm256_sub_ps(rinv33,pmecorrV);
1669 velec = _mm256_mul_ps(qq33,velec);
1671 d = _mm256_sub_ps(r33,rswitch);
1672 d = _mm256_max_ps(d,_mm256_setzero_ps());
1673 d2 = _mm256_mul_ps(d,d);
1674 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1676 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1678 /* Evaluate switch function */
1679 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1680 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
1681 velec = _mm256_mul_ps(velec,sw);
1682 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1684 /* Update potential sum for this i atom from the interaction with this j atom. */
1685 velec = _mm256_and_ps(velec,cutoff_mask);
1686 velec = _mm256_andnot_ps(dummy_mask,velec);
1687 velecsum = _mm256_add_ps(velecsum,velec);
1691 fscal = _mm256_and_ps(fscal,cutoff_mask);
1693 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1695 /* Calculate temporary vectorial force */
1696 tx = _mm256_mul_ps(fscal,dx33);
1697 ty = _mm256_mul_ps(fscal,dy33);
1698 tz = _mm256_mul_ps(fscal,dz33);
1700 /* Update vectorial force */
1701 fix3 = _mm256_add_ps(fix3,tx);
1702 fiy3 = _mm256_add_ps(fiy3,ty);
1703 fiz3 = _mm256_add_ps(fiz3,tz);
1705 fjx3 = _mm256_add_ps(fjx3,tx);
1706 fjy3 = _mm256_add_ps(fjy3,ty);
1707 fjz3 = _mm256_add_ps(fjz3,tz);
1711 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1712 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1713 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1714 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1715 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1716 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1717 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1718 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1720 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1721 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1722 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1724 /* Inner loop uses 1044 flops */
1727 /* End of innermost loop */
1729 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1730 f+i_coord_offset,fshift+i_shift_offset);
1733 /* Update potential energies */
1734 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1735 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1737 /* Increment number of inner iterations */
1738 inneriter += j_index_end - j_index_start;
1740 /* Outer loop uses 26 flops */
1743 /* Increment number of outer iterations */
1746 /* Update outer/inner flops */
1748 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*1044);
1750 /*
1751 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_single
1752 * Electrostatics interaction: Ewald
1753 * VdW interaction: LennardJones
1754 * Geometry: Water4-Water4
1755 * Calculate force/pot: Force
1756 */
1757 void
1758 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_single
1759 (t_nblist * gmx_restrict nlist,
1760 rvec * gmx_restrict xx,
1761 rvec * gmx_restrict ff,
1762 t_forcerec * gmx_restrict fr,
1763 t_mdatoms * gmx_restrict mdatoms,
1764 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1765 t_nrnb * gmx_restrict nrnb)
1767 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1768 * just 0 for non-waters.
1769 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1770 * jnr indices corresponding to data put in the four positions in the SIMD register.
1772 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1773 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1774 int jnrA,jnrB,jnrC,jnrD;
1775 int jnrE,jnrF,jnrG,jnrH;
1776 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1777 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1778 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1779 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1780 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1781 real rcutoff_scalar;
1782 real *shiftvec,*fshift,*x,*f;
1783 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1784 real scratch[4*DIM];
1785 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1786 real * vdwioffsetptr0;
1787 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1788 real * vdwioffsetptr1;
1789 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1790 real * vdwioffsetptr2;
1791 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1792 real * vdwioffsetptr3;
1793 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1794 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1795 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1796 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1797 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1798 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1799 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1800 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1801 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1802 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1803 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1804 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1805 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1806 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1807 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1808 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1809 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1810 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1811 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1812 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1815 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1818 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1819 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1821 __m128i ewitab_lo,ewitab_hi;
1822 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1823 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1825 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1826 real rswitch_scalar,d_scalar;
1827 __m256 dummy_mask,cutoff_mask;
1828 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1829 __m256 one = _mm256_set1_ps(1.0);
1830 __m256 two = _mm256_set1_ps(2.0);
1836 jindex = nlist->jindex;
1838 shiftidx = nlist->shift;
1840 shiftvec = fr->shift_vec[0];
1841 fshift = fr->fshift[0];
1842 facel = _mm256_set1_ps(fr->epsfac);
1843 charge = mdatoms->chargeA;
1844 nvdwtype = fr->ntype;
1845 vdwparam = fr->nbfp;
1846 vdwtype = mdatoms->typeA;
1848 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1849 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1850 beta2 = _mm256_mul_ps(beta,beta);
1851 beta3 = _mm256_mul_ps(beta,beta2);
1853 ewtab = fr->ic->tabq_coul_FDV0;
1854 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1855 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1857 /* Setup water-specific parameters */
1858 inr = nlist->iinr[0];
1859 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1860 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1861 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1862 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1864 jq1 = _mm256_set1_ps(charge[inr+1]);
1865 jq2 = _mm256_set1_ps(charge[inr+2]);
1866 jq3 = _mm256_set1_ps(charge[inr+3]);
1867 vdwjidx0A = 2*vdwtype[inr+0];
1868 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1869 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1870 qq11 = _mm256_mul_ps(iq1,jq1);
1871 qq12 = _mm256_mul_ps(iq1,jq2);
1872 qq13 = _mm256_mul_ps(iq1,jq3);
1873 qq21 = _mm256_mul_ps(iq2,jq1);
1874 qq22 = _mm256_mul_ps(iq2,jq2);
1875 qq23 = _mm256_mul_ps(iq2,jq3);
1876 qq31 = _mm256_mul_ps(iq3,jq1);
1877 qq32 = _mm256_mul_ps(iq3,jq2);
1878 qq33 = _mm256_mul_ps(iq3,jq3);
1880 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1881 rcutoff_scalar = fr->rcoulomb;
1882 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1883 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1885 rswitch_scalar = fr->rcoulomb_switch;
1886 rswitch = _mm256_set1_ps(rswitch_scalar);
1887 /* Setup switch parameters */
1888 d_scalar = rcutoff_scalar-rswitch_scalar;
1889 d = _mm256_set1_ps(d_scalar);
1890 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1891 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1892 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1893 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1894 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1895 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1897 /* Avoid stupid compiler warnings */
1898 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1899 j_coord_offsetA = 0;
1900 j_coord_offsetB = 0;
1901 j_coord_offsetC = 0;
1902 j_coord_offsetD = 0;
1903 j_coord_offsetE = 0;
1904 j_coord_offsetF = 0;
1905 j_coord_offsetG = 0;
1906 j_coord_offsetH = 0;
1911 for(iidx=0;iidx<4*DIM;iidx++)
1913 scratch[iidx] = 0.0;
1916 /* Start outer loop over neighborlists */
1917 for(iidx=0; iidx<nri; iidx++)
1919 /* Load shift vector for this list */
1920 i_shift_offset = DIM*shiftidx[iidx];
1922 /* Load limits for loop over neighbors */
1923 j_index_start = jindex[iidx];
1924 j_index_end = jindex[iidx+1];
1926 /* Get outer coordinate index */
1928 i_coord_offset = DIM*inr;
1930 /* Load i particle coords and add shift vector */
1931 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1932 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1934 fix0 = _mm256_setzero_ps();
1935 fiy0 = _mm256_setzero_ps();
1936 fiz0 = _mm256_setzero_ps();
1937 fix1 = _mm256_setzero_ps();
1938 fiy1 = _mm256_setzero_ps();
1939 fiz1 = _mm256_setzero_ps();
1940 fix2 = _mm256_setzero_ps();
1941 fiy2 = _mm256_setzero_ps();
1942 fiz2 = _mm256_setzero_ps();
1943 fix3 = _mm256_setzero_ps();
1944 fiy3 = _mm256_setzero_ps();
1945 fiz3 = _mm256_setzero_ps();
1947 /* Start inner kernel loop */
1948 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1951 /* Get j neighbor index, and coordinate index */
1953 jnrB = jjnr[jidx+1];
1954 jnrC = jjnr[jidx+2];
1955 jnrD = jjnr[jidx+3];
1956 jnrE = jjnr[jidx+4];
1957 jnrF = jjnr[jidx+5];
1958 jnrG = jjnr[jidx+6];
1959 jnrH = jjnr[jidx+7];
1960 j_coord_offsetA = DIM*jnrA;
1961 j_coord_offsetB = DIM*jnrB;
1962 j_coord_offsetC = DIM*jnrC;
1963 j_coord_offsetD = DIM*jnrD;
1964 j_coord_offsetE = DIM*jnrE;
1965 j_coord_offsetF = DIM*jnrF;
1966 j_coord_offsetG = DIM*jnrG;
1967 j_coord_offsetH = DIM*jnrH;
1969 /* load j atom coordinates */
1970 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1971 x+j_coord_offsetC,x+j_coord_offsetD,
1972 x+j_coord_offsetE,x+j_coord_offsetF,
1973 x+j_coord_offsetG,x+j_coord_offsetH,
1974 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1975 &jy2,&jz2,&jx3,&jy3,&jz3);
1977 /* Calculate displacement vector */
1978 dx00 = _mm256_sub_ps(ix0,jx0);
1979 dy00 = _mm256_sub_ps(iy0,jy0);
1980 dz00 = _mm256_sub_ps(iz0,jz0);
1981 dx11 = _mm256_sub_ps(ix1,jx1);
1982 dy11 = _mm256_sub_ps(iy1,jy1);
1983 dz11 = _mm256_sub_ps(iz1,jz1);
1984 dx12 = _mm256_sub_ps(ix1,jx2);
1985 dy12 = _mm256_sub_ps(iy1,jy2);
1986 dz12 = _mm256_sub_ps(iz1,jz2);
1987 dx13 = _mm256_sub_ps(ix1,jx3);
1988 dy13 = _mm256_sub_ps(iy1,jy3);
1989 dz13 = _mm256_sub_ps(iz1,jz3);
1990 dx21 = _mm256_sub_ps(ix2,jx1);
1991 dy21 = _mm256_sub_ps(iy2,jy1);
1992 dz21 = _mm256_sub_ps(iz2,jz1);
1993 dx22 = _mm256_sub_ps(ix2,jx2);
1994 dy22 = _mm256_sub_ps(iy2,jy2);
1995 dz22 = _mm256_sub_ps(iz2,jz2);
1996 dx23 = _mm256_sub_ps(ix2,jx3);
1997 dy23 = _mm256_sub_ps(iy2,jy3);
1998 dz23 = _mm256_sub_ps(iz2,jz3);
1999 dx31 = _mm256_sub_ps(ix3,jx1);
2000 dy31 = _mm256_sub_ps(iy3,jy1);
2001 dz31 = _mm256_sub_ps(iz3,jz1);
2002 dx32 = _mm256_sub_ps(ix3,jx2);
2003 dy32 = _mm256_sub_ps(iy3,jy2);
2004 dz32 = _mm256_sub_ps(iz3,jz2);
2005 dx33 = _mm256_sub_ps(ix3,jx3);
2006 dy33 = _mm256_sub_ps(iy3,jy3);
2007 dz33 = _mm256_sub_ps(iz3,jz3);
2009 /* Calculate squared distance and things based on it */
2010 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2011 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2012 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2013 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2014 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2015 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2016 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2017 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2018 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2019 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2021 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2022 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2023 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2024 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2025 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2026 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2027 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2028 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2029 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2030 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2032 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2033 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2034 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2035 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2036 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2037 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2038 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2039 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2040 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2041 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2043 fjx0 = _mm256_setzero_ps();
2044 fjy0 = _mm256_setzero_ps();
2045 fjz0 = _mm256_setzero_ps();
2046 fjx1 = _mm256_setzero_ps();
2047 fjy1 = _mm256_setzero_ps();
2048 fjz1 = _mm256_setzero_ps();
2049 fjx2 = _mm256_setzero_ps();
2050 fjy2 = _mm256_setzero_ps();
2051 fjz2 = _mm256_setzero_ps();
2052 fjx3 = _mm256_setzero_ps();
2053 fjy3 = _mm256_setzero_ps();
2054 fjz3 = _mm256_setzero_ps();
2056 /**************************
2057 * CALCULATE INTERACTIONS *
2058 **************************/
2060 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2063 r00 = _mm256_mul_ps(rsq00,rinv00);
2065 /* LENNARD-JONES DISPERSION/REPULSION */
2067 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2068 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
2069 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
2070 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
2071 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
2073 d = _mm256_sub_ps(r00,rswitch);
2074 d = _mm256_max_ps(d,_mm256_setzero_ps());
2075 d2 = _mm256_mul_ps(d,d);
2076 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2078 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2080 /* Evaluate switch function */
2081 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2082 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
2083 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2087 fscal = _mm256_and_ps(fscal,cutoff_mask);
2089 /* Calculate temporary vectorial force */
2090 tx = _mm256_mul_ps(fscal,dx00);
2091 ty = _mm256_mul_ps(fscal,dy00);
2092 tz = _mm256_mul_ps(fscal,dz00);
2094 /* Update vectorial force */
2095 fix0 = _mm256_add_ps(fix0,tx);
2096 fiy0 = _mm256_add_ps(fiy0,ty);
2097 fiz0 = _mm256_add_ps(fiz0,tz);
2099 fjx0 = _mm256_add_ps(fjx0,tx);
2100 fjy0 = _mm256_add_ps(fjy0,ty);
2101 fjz0 = _mm256_add_ps(fjz0,tz);
2105 /**************************
2106 * CALCULATE INTERACTIONS *
2107 **************************/
2109 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2112 r11 = _mm256_mul_ps(rsq11,rinv11);
2114 /* EWALD ELECTROSTATICS */
2116 /* Analytical PME correction */
2117 zeta2 = _mm256_mul_ps(beta2,rsq11);
2118 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2119 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2120 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2121 felec = _mm256_mul_ps(qq11,felec);
2122 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2123 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2124 velec = _mm256_sub_ps(rinv11,pmecorrV);
2125 velec = _mm256_mul_ps(qq11,velec);
2127 d = _mm256_sub_ps(r11,rswitch);
2128 d = _mm256_max_ps(d,_mm256_setzero_ps());
2129 d2 = _mm256_mul_ps(d,d);
2130 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2132 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2134 /* Evaluate switch function */
2135 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2136 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2137 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2141 fscal = _mm256_and_ps(fscal,cutoff_mask);
2143 /* Calculate temporary vectorial force */
2144 tx = _mm256_mul_ps(fscal,dx11);
2145 ty = _mm256_mul_ps(fscal,dy11);
2146 tz = _mm256_mul_ps(fscal,dz11);
2148 /* Update vectorial force */
2149 fix1 = _mm256_add_ps(fix1,tx);
2150 fiy1 = _mm256_add_ps(fiy1,ty);
2151 fiz1 = _mm256_add_ps(fiz1,tz);
2153 fjx1 = _mm256_add_ps(fjx1,tx);
2154 fjy1 = _mm256_add_ps(fjy1,ty);
2155 fjz1 = _mm256_add_ps(fjz1,tz);
2159 /**************************
2160 * CALCULATE INTERACTIONS *
2161 **************************/
2163 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2166 r12 = _mm256_mul_ps(rsq12,rinv12);
2168 /* EWALD ELECTROSTATICS */
2170 /* Analytical PME correction */
2171 zeta2 = _mm256_mul_ps(beta2,rsq12);
2172 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2173 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2174 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2175 felec = _mm256_mul_ps(qq12,felec);
2176 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2177 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2178 velec = _mm256_sub_ps(rinv12,pmecorrV);
2179 velec = _mm256_mul_ps(qq12,velec);
2181 d = _mm256_sub_ps(r12,rswitch);
2182 d = _mm256_max_ps(d,_mm256_setzero_ps());
2183 d2 = _mm256_mul_ps(d,d);
2184 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2186 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2188 /* Evaluate switch function */
2189 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2190 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2191 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2195 fscal = _mm256_and_ps(fscal,cutoff_mask);
2197 /* Calculate temporary vectorial force */
2198 tx = _mm256_mul_ps(fscal,dx12);
2199 ty = _mm256_mul_ps(fscal,dy12);
2200 tz = _mm256_mul_ps(fscal,dz12);
2202 /* Update vectorial force */
2203 fix1 = _mm256_add_ps(fix1,tx);
2204 fiy1 = _mm256_add_ps(fiy1,ty);
2205 fiz1 = _mm256_add_ps(fiz1,tz);
2207 fjx2 = _mm256_add_ps(fjx2,tx);
2208 fjy2 = _mm256_add_ps(fjy2,ty);
2209 fjz2 = _mm256_add_ps(fjz2,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2220 r13 = _mm256_mul_ps(rsq13,rinv13);
2222 /* EWALD ELECTROSTATICS */
2224 /* Analytical PME correction */
2225 zeta2 = _mm256_mul_ps(beta2,rsq13);
2226 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2227 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2228 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2229 felec = _mm256_mul_ps(qq13,felec);
2230 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2231 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2232 velec = _mm256_sub_ps(rinv13,pmecorrV);
2233 velec = _mm256_mul_ps(qq13,velec);
2235 d = _mm256_sub_ps(r13,rswitch);
2236 d = _mm256_max_ps(d,_mm256_setzero_ps());
2237 d2 = _mm256_mul_ps(d,d);
2238 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2240 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2242 /* Evaluate switch function */
2243 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2244 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2245 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2249 fscal = _mm256_and_ps(fscal,cutoff_mask);
2251 /* Calculate temporary vectorial force */
2252 tx = _mm256_mul_ps(fscal,dx13);
2253 ty = _mm256_mul_ps(fscal,dy13);
2254 tz = _mm256_mul_ps(fscal,dz13);
2256 /* Update vectorial force */
2257 fix1 = _mm256_add_ps(fix1,tx);
2258 fiy1 = _mm256_add_ps(fiy1,ty);
2259 fiz1 = _mm256_add_ps(fiz1,tz);
2261 fjx3 = _mm256_add_ps(fjx3,tx);
2262 fjy3 = _mm256_add_ps(fjy3,ty);
2263 fjz3 = _mm256_add_ps(fjz3,tz);
2267 /**************************
2268 * CALCULATE INTERACTIONS *
2269 **************************/
2271 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2274 r21 = _mm256_mul_ps(rsq21,rinv21);
2276 /* EWALD ELECTROSTATICS */
2278 /* Analytical PME correction */
2279 zeta2 = _mm256_mul_ps(beta2,rsq21);
2280 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2281 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2282 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2283 felec = _mm256_mul_ps(qq21,felec);
2284 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2285 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2286 velec = _mm256_sub_ps(rinv21,pmecorrV);
2287 velec = _mm256_mul_ps(qq21,velec);
2289 d = _mm256_sub_ps(r21,rswitch);
2290 d = _mm256_max_ps(d,_mm256_setzero_ps());
2291 d2 = _mm256_mul_ps(d,d);
2292 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2294 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2296 /* Evaluate switch function */
2297 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2298 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2299 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2303 fscal = _mm256_and_ps(fscal,cutoff_mask);
2305 /* Calculate temporary vectorial force */
2306 tx = _mm256_mul_ps(fscal,dx21);
2307 ty = _mm256_mul_ps(fscal,dy21);
2308 tz = _mm256_mul_ps(fscal,dz21);
2310 /* Update vectorial force */
2311 fix2 = _mm256_add_ps(fix2,tx);
2312 fiy2 = _mm256_add_ps(fiy2,ty);
2313 fiz2 = _mm256_add_ps(fiz2,tz);
2315 fjx1 = _mm256_add_ps(fjx1,tx);
2316 fjy1 = _mm256_add_ps(fjy1,ty);
2317 fjz1 = _mm256_add_ps(fjz1,tz);
2321 /**************************
2322 * CALCULATE INTERACTIONS *
2323 **************************/
2325 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2328 r22 = _mm256_mul_ps(rsq22,rinv22);
2330 /* EWALD ELECTROSTATICS */
2332 /* Analytical PME correction */
2333 zeta2 = _mm256_mul_ps(beta2,rsq22);
2334 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2335 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2336 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2337 felec = _mm256_mul_ps(qq22,felec);
2338 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2339 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2340 velec = _mm256_sub_ps(rinv22,pmecorrV);
2341 velec = _mm256_mul_ps(qq22,velec);
2343 d = _mm256_sub_ps(r22,rswitch);
2344 d = _mm256_max_ps(d,_mm256_setzero_ps());
2345 d2 = _mm256_mul_ps(d,d);
2346 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2348 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2350 /* Evaluate switch function */
2351 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2352 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2353 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2357 fscal = _mm256_and_ps(fscal,cutoff_mask);
2359 /* Calculate temporary vectorial force */
2360 tx = _mm256_mul_ps(fscal,dx22);
2361 ty = _mm256_mul_ps(fscal,dy22);
2362 tz = _mm256_mul_ps(fscal,dz22);
2364 /* Update vectorial force */
2365 fix2 = _mm256_add_ps(fix2,tx);
2366 fiy2 = _mm256_add_ps(fiy2,ty);
2367 fiz2 = _mm256_add_ps(fiz2,tz);
2369 fjx2 = _mm256_add_ps(fjx2,tx);
2370 fjy2 = _mm256_add_ps(fjy2,ty);
2371 fjz2 = _mm256_add_ps(fjz2,tz);
2375 /**************************
2376 * CALCULATE INTERACTIONS *
2377 **************************/
2379 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2382 r23 = _mm256_mul_ps(rsq23,rinv23);
2384 /* EWALD ELECTROSTATICS */
2386 /* Analytical PME correction */
2387 zeta2 = _mm256_mul_ps(beta2,rsq23);
2388 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2389 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2390 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2391 felec = _mm256_mul_ps(qq23,felec);
2392 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2393 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2394 velec = _mm256_sub_ps(rinv23,pmecorrV);
2395 velec = _mm256_mul_ps(qq23,velec);
2397 d = _mm256_sub_ps(r23,rswitch);
2398 d = _mm256_max_ps(d,_mm256_setzero_ps());
2399 d2 = _mm256_mul_ps(d,d);
2400 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2402 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2404 /* Evaluate switch function */
2405 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2406 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2407 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2411 fscal = _mm256_and_ps(fscal,cutoff_mask);
2413 /* Calculate temporary vectorial force */
2414 tx = _mm256_mul_ps(fscal,dx23);
2415 ty = _mm256_mul_ps(fscal,dy23);
2416 tz = _mm256_mul_ps(fscal,dz23);
2418 /* Update vectorial force */
2419 fix2 = _mm256_add_ps(fix2,tx);
2420 fiy2 = _mm256_add_ps(fiy2,ty);
2421 fiz2 = _mm256_add_ps(fiz2,tz);
2423 fjx3 = _mm256_add_ps(fjx3,tx);
2424 fjy3 = _mm256_add_ps(fjy3,ty);
2425 fjz3 = _mm256_add_ps(fjz3,tz);
2429 /**************************
2430 * CALCULATE INTERACTIONS *
2431 **************************/
2433 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2436 r31 = _mm256_mul_ps(rsq31,rinv31);
2438 /* EWALD ELECTROSTATICS */
2440 /* Analytical PME correction */
2441 zeta2 = _mm256_mul_ps(beta2,rsq31);
2442 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2443 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2444 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2445 felec = _mm256_mul_ps(qq31,felec);
2446 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2447 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2448 velec = _mm256_sub_ps(rinv31,pmecorrV);
2449 velec = _mm256_mul_ps(qq31,velec);
2451 d = _mm256_sub_ps(r31,rswitch);
2452 d = _mm256_max_ps(d,_mm256_setzero_ps());
2453 d2 = _mm256_mul_ps(d,d);
2454 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2456 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2458 /* Evaluate switch function */
2459 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2460 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2461 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2465 fscal = _mm256_and_ps(fscal,cutoff_mask);
2467 /* Calculate temporary vectorial force */
2468 tx = _mm256_mul_ps(fscal,dx31);
2469 ty = _mm256_mul_ps(fscal,dy31);
2470 tz = _mm256_mul_ps(fscal,dz31);
2472 /* Update vectorial force */
2473 fix3 = _mm256_add_ps(fix3,tx);
2474 fiy3 = _mm256_add_ps(fiy3,ty);
2475 fiz3 = _mm256_add_ps(fiz3,tz);
2477 fjx1 = _mm256_add_ps(fjx1,tx);
2478 fjy1 = _mm256_add_ps(fjy1,ty);
2479 fjz1 = _mm256_add_ps(fjz1,tz);
2483 /**************************
2484 * CALCULATE INTERACTIONS *
2485 **************************/
2487 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2490 r32 = _mm256_mul_ps(rsq32,rinv32);
2492 /* EWALD ELECTROSTATICS */
2494 /* Analytical PME correction */
2495 zeta2 = _mm256_mul_ps(beta2,rsq32);
2496 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2497 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2498 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2499 felec = _mm256_mul_ps(qq32,felec);
2500 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2501 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2502 velec = _mm256_sub_ps(rinv32,pmecorrV);
2503 velec = _mm256_mul_ps(qq32,velec);
2505 d = _mm256_sub_ps(r32,rswitch);
2506 d = _mm256_max_ps(d,_mm256_setzero_ps());
2507 d2 = _mm256_mul_ps(d,d);
2508 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2510 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2512 /* Evaluate switch function */
2513 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2514 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2515 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2519 fscal = _mm256_and_ps(fscal,cutoff_mask);
2521 /* Calculate temporary vectorial force */
2522 tx = _mm256_mul_ps(fscal,dx32);
2523 ty = _mm256_mul_ps(fscal,dy32);
2524 tz = _mm256_mul_ps(fscal,dz32);
2526 /* Update vectorial force */
2527 fix3 = _mm256_add_ps(fix3,tx);
2528 fiy3 = _mm256_add_ps(fiy3,ty);
2529 fiz3 = _mm256_add_ps(fiz3,tz);
2531 fjx2 = _mm256_add_ps(fjx2,tx);
2532 fjy2 = _mm256_add_ps(fjy2,ty);
2533 fjz2 = _mm256_add_ps(fjz2,tz);
2537 /**************************
2538 * CALCULATE INTERACTIONS *
2539 **************************/
2541 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2544 r33 = _mm256_mul_ps(rsq33,rinv33);
2546 /* EWALD ELECTROSTATICS */
2548 /* Analytical PME correction */
2549 zeta2 = _mm256_mul_ps(beta2,rsq33);
2550 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2551 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2552 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2553 felec = _mm256_mul_ps(qq33,felec);
2554 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2555 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2556 velec = _mm256_sub_ps(rinv33,pmecorrV);
2557 velec = _mm256_mul_ps(qq33,velec);
2559 d = _mm256_sub_ps(r33,rswitch);
2560 d = _mm256_max_ps(d,_mm256_setzero_ps());
2561 d2 = _mm256_mul_ps(d,d);
2562 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2564 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2566 /* Evaluate switch function */
2567 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2568 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2569 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2573 fscal = _mm256_and_ps(fscal,cutoff_mask);
2575 /* Calculate temporary vectorial force */
2576 tx = _mm256_mul_ps(fscal,dx33);
2577 ty = _mm256_mul_ps(fscal,dy33);
2578 tz = _mm256_mul_ps(fscal,dz33);
2580 /* Update vectorial force */
2581 fix3 = _mm256_add_ps(fix3,tx);
2582 fiy3 = _mm256_add_ps(fiy3,ty);
2583 fiz3 = _mm256_add_ps(fiz3,tz);
2585 fjx3 = _mm256_add_ps(fjx3,tx);
2586 fjy3 = _mm256_add_ps(fjy3,ty);
2587 fjz3 = _mm256_add_ps(fjz3,tz);
2591 fjptrA = f+j_coord_offsetA;
2592 fjptrB = f+j_coord_offsetB;
2593 fjptrC = f+j_coord_offsetC;
2594 fjptrD = f+j_coord_offsetD;
2595 fjptrE = f+j_coord_offsetE;
2596 fjptrF = f+j_coord_offsetF;
2597 fjptrG = f+j_coord_offsetG;
2598 fjptrH = f+j_coord_offsetH;
2600 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2601 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2602 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2604 /* Inner loop uses 1004 flops */
2607 if(jidx<j_index_end)
2610 /* Get j neighbor index, and coordinate index */
2611 jnrlistA = jjnr[jidx];
2612 jnrlistB = jjnr[jidx+1];
2613 jnrlistC = jjnr[jidx+2];
2614 jnrlistD = jjnr[jidx+3];
2615 jnrlistE = jjnr[jidx+4];
2616 jnrlistF = jjnr[jidx+5];
2617 jnrlistG = jjnr[jidx+6];
2618 jnrlistH = jjnr[jidx+7];
2619 /* Sign of each element will be negative for non-real atoms.
2620 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2621 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2623 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2624 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2626 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2627 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2628 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2629 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2630 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2631 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2632 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2633 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2634 j_coord_offsetA = DIM*jnrA;
2635 j_coord_offsetB = DIM*jnrB;
2636 j_coord_offsetC = DIM*jnrC;
2637 j_coord_offsetD = DIM*jnrD;
2638 j_coord_offsetE = DIM*jnrE;
2639 j_coord_offsetF = DIM*jnrF;
2640 j_coord_offsetG = DIM*jnrG;
2641 j_coord_offsetH = DIM*jnrH;
2643 /* load j atom coordinates */
2644 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2645 x+j_coord_offsetC,x+j_coord_offsetD,
2646 x+j_coord_offsetE,x+j_coord_offsetF,
2647 x+j_coord_offsetG,x+j_coord_offsetH,
2648 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2649 &jy2,&jz2,&jx3,&jy3,&jz3);
2651 /* Calculate displacement vector */
2652 dx00 = _mm256_sub_ps(ix0,jx0);
2653 dy00 = _mm256_sub_ps(iy0,jy0);
2654 dz00 = _mm256_sub_ps(iz0,jz0);
2655 dx11 = _mm256_sub_ps(ix1,jx1);
2656 dy11 = _mm256_sub_ps(iy1,jy1);
2657 dz11 = _mm256_sub_ps(iz1,jz1);
2658 dx12 = _mm256_sub_ps(ix1,jx2);
2659 dy12 = _mm256_sub_ps(iy1,jy2);
2660 dz12 = _mm256_sub_ps(iz1,jz2);
2661 dx13 = _mm256_sub_ps(ix1,jx3);
2662 dy13 = _mm256_sub_ps(iy1,jy3);
2663 dz13 = _mm256_sub_ps(iz1,jz3);
2664 dx21 = _mm256_sub_ps(ix2,jx1);
2665 dy21 = _mm256_sub_ps(iy2,jy1);
2666 dz21 = _mm256_sub_ps(iz2,jz1);
2667 dx22 = _mm256_sub_ps(ix2,jx2);
2668 dy22 = _mm256_sub_ps(iy2,jy2);
2669 dz22 = _mm256_sub_ps(iz2,jz2);
2670 dx23 = _mm256_sub_ps(ix2,jx3);
2671 dy23 = _mm256_sub_ps(iy2,jy3);
2672 dz23 = _mm256_sub_ps(iz2,jz3);
2673 dx31 = _mm256_sub_ps(ix3,jx1);
2674 dy31 = _mm256_sub_ps(iy3,jy1);
2675 dz31 = _mm256_sub_ps(iz3,jz1);
2676 dx32 = _mm256_sub_ps(ix3,jx2);
2677 dy32 = _mm256_sub_ps(iy3,jy2);
2678 dz32 = _mm256_sub_ps(iz3,jz2);
2679 dx33 = _mm256_sub_ps(ix3,jx3);
2680 dy33 = _mm256_sub_ps(iy3,jy3);
2681 dz33 = _mm256_sub_ps(iz3,jz3);
2683 /* Calculate squared distance and things based on it */
2684 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2685 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2686 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2687 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2688 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2689 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2690 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2691 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2692 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2693 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2695 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2696 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2697 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2698 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2699 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2700 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2701 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2702 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2703 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2704 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2706 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2707 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2708 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2709 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2710 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2711 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2712 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2713 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2714 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2715 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2717 fjx0 = _mm256_setzero_ps();
2718 fjy0 = _mm256_setzero_ps();
2719 fjz0 = _mm256_setzero_ps();
2720 fjx1 = _mm256_setzero_ps();
2721 fjy1 = _mm256_setzero_ps();
2722 fjz1 = _mm256_setzero_ps();
2723 fjx2 = _mm256_setzero_ps();
2724 fjy2 = _mm256_setzero_ps();
2725 fjz2 = _mm256_setzero_ps();
2726 fjx3 = _mm256_setzero_ps();
2727 fjy3 = _mm256_setzero_ps();
2728 fjz3 = _mm256_setzero_ps();
2730 /**************************
2731 * CALCULATE INTERACTIONS *
2732 **************************/
2734 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2737 r00 = _mm256_mul_ps(rsq00,rinv00);
2738 r00 = _mm256_andnot_ps(dummy_mask,r00);
2740 /* LENNARD-JONES DISPERSION/REPULSION */
2742 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2743 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
2744 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
2745 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
2746 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
2748 d = _mm256_sub_ps(r00,rswitch);
2749 d = _mm256_max_ps(d,_mm256_setzero_ps());
2750 d2 = _mm256_mul_ps(d,d);
2751 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2753 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2755 /* Evaluate switch function */
2756 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2757 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
2758 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2762 fscal = _mm256_and_ps(fscal,cutoff_mask);
2764 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2766 /* Calculate temporary vectorial force */
2767 tx = _mm256_mul_ps(fscal,dx00);
2768 ty = _mm256_mul_ps(fscal,dy00);
2769 tz = _mm256_mul_ps(fscal,dz00);
2771 /* Update vectorial force */
2772 fix0 = _mm256_add_ps(fix0,tx);
2773 fiy0 = _mm256_add_ps(fiy0,ty);
2774 fiz0 = _mm256_add_ps(fiz0,tz);
2776 fjx0 = _mm256_add_ps(fjx0,tx);
2777 fjy0 = _mm256_add_ps(fjy0,ty);
2778 fjz0 = _mm256_add_ps(fjz0,tz);
2782 /**************************
2783 * CALCULATE INTERACTIONS *
2784 **************************/
2786 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2789 r11 = _mm256_mul_ps(rsq11,rinv11);
2790 r11 = _mm256_andnot_ps(dummy_mask,r11);
2792 /* EWALD ELECTROSTATICS */
2794 /* Analytical PME correction */
2795 zeta2 = _mm256_mul_ps(beta2,rsq11);
2796 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2797 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2798 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2799 felec = _mm256_mul_ps(qq11,felec);
2800 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2801 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2802 velec = _mm256_sub_ps(rinv11,pmecorrV);
2803 velec = _mm256_mul_ps(qq11,velec);
2805 d = _mm256_sub_ps(r11,rswitch);
2806 d = _mm256_max_ps(d,_mm256_setzero_ps());
2807 d2 = _mm256_mul_ps(d,d);
2808 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2810 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2812 /* Evaluate switch function */
2813 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2814 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2815 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2819 fscal = _mm256_and_ps(fscal,cutoff_mask);
2821 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2823 /* Calculate temporary vectorial force */
2824 tx = _mm256_mul_ps(fscal,dx11);
2825 ty = _mm256_mul_ps(fscal,dy11);
2826 tz = _mm256_mul_ps(fscal,dz11);
2828 /* Update vectorial force */
2829 fix1 = _mm256_add_ps(fix1,tx);
2830 fiy1 = _mm256_add_ps(fiy1,ty);
2831 fiz1 = _mm256_add_ps(fiz1,tz);
2833 fjx1 = _mm256_add_ps(fjx1,tx);
2834 fjy1 = _mm256_add_ps(fjy1,ty);
2835 fjz1 = _mm256_add_ps(fjz1,tz);
2839 /**************************
2840 * CALCULATE INTERACTIONS *
2841 **************************/
2843 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2846 r12 = _mm256_mul_ps(rsq12,rinv12);
2847 r12 = _mm256_andnot_ps(dummy_mask,r12);
2849 /* EWALD ELECTROSTATICS */
2851 /* Analytical PME correction */
2852 zeta2 = _mm256_mul_ps(beta2,rsq12);
2853 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2854 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2855 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2856 felec = _mm256_mul_ps(qq12,felec);
2857 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2858 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2859 velec = _mm256_sub_ps(rinv12,pmecorrV);
2860 velec = _mm256_mul_ps(qq12,velec);
2862 d = _mm256_sub_ps(r12,rswitch);
2863 d = _mm256_max_ps(d,_mm256_setzero_ps());
2864 d2 = _mm256_mul_ps(d,d);
2865 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2867 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2869 /* Evaluate switch function */
2870 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2871 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2872 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2876 fscal = _mm256_and_ps(fscal,cutoff_mask);
2878 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2880 /* Calculate temporary vectorial force */
2881 tx = _mm256_mul_ps(fscal,dx12);
2882 ty = _mm256_mul_ps(fscal,dy12);
2883 tz = _mm256_mul_ps(fscal,dz12);
2885 /* Update vectorial force */
2886 fix1 = _mm256_add_ps(fix1,tx);
2887 fiy1 = _mm256_add_ps(fiy1,ty);
2888 fiz1 = _mm256_add_ps(fiz1,tz);
2890 fjx2 = _mm256_add_ps(fjx2,tx);
2891 fjy2 = _mm256_add_ps(fjy2,ty);
2892 fjz2 = _mm256_add_ps(fjz2,tz);
2896 /**************************
2897 * CALCULATE INTERACTIONS *
2898 **************************/
2900 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2903 r13 = _mm256_mul_ps(rsq13,rinv13);
2904 r13 = _mm256_andnot_ps(dummy_mask,r13);
2906 /* EWALD ELECTROSTATICS */
2908 /* Analytical PME correction */
2909 zeta2 = _mm256_mul_ps(beta2,rsq13);
2910 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2911 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2912 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2913 felec = _mm256_mul_ps(qq13,felec);
2914 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2915 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2916 velec = _mm256_sub_ps(rinv13,pmecorrV);
2917 velec = _mm256_mul_ps(qq13,velec);
2919 d = _mm256_sub_ps(r13,rswitch);
2920 d = _mm256_max_ps(d,_mm256_setzero_ps());
2921 d2 = _mm256_mul_ps(d,d);
2922 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2924 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2926 /* Evaluate switch function */
2927 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2928 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2929 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2933 fscal = _mm256_and_ps(fscal,cutoff_mask);
2935 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2937 /* Calculate temporary vectorial force */
2938 tx = _mm256_mul_ps(fscal,dx13);
2939 ty = _mm256_mul_ps(fscal,dy13);
2940 tz = _mm256_mul_ps(fscal,dz13);
2942 /* Update vectorial force */
2943 fix1 = _mm256_add_ps(fix1,tx);
2944 fiy1 = _mm256_add_ps(fiy1,ty);
2945 fiz1 = _mm256_add_ps(fiz1,tz);
2947 fjx3 = _mm256_add_ps(fjx3,tx);
2948 fjy3 = _mm256_add_ps(fjy3,ty);
2949 fjz3 = _mm256_add_ps(fjz3,tz);
2953 /**************************
2954 * CALCULATE INTERACTIONS *
2955 **************************/
2957 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2960 r21 = _mm256_mul_ps(rsq21,rinv21);
2961 r21 = _mm256_andnot_ps(dummy_mask,r21);
2963 /* EWALD ELECTROSTATICS */
2965 /* Analytical PME correction */
2966 zeta2 = _mm256_mul_ps(beta2,rsq21);
2967 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2968 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2969 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2970 felec = _mm256_mul_ps(qq21,felec);
2971 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2972 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2973 velec = _mm256_sub_ps(rinv21,pmecorrV);
2974 velec = _mm256_mul_ps(qq21,velec);
2976 d = _mm256_sub_ps(r21,rswitch);
2977 d = _mm256_max_ps(d,_mm256_setzero_ps());
2978 d2 = _mm256_mul_ps(d,d);
2979 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2981 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2983 /* Evaluate switch function */
2984 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2985 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2986 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2990 fscal = _mm256_and_ps(fscal,cutoff_mask);
2992 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2994 /* Calculate temporary vectorial force */
2995 tx = _mm256_mul_ps(fscal,dx21);
2996 ty = _mm256_mul_ps(fscal,dy21);
2997 tz = _mm256_mul_ps(fscal,dz21);
2999 /* Update vectorial force */
3000 fix2 = _mm256_add_ps(fix2,tx);
3001 fiy2 = _mm256_add_ps(fiy2,ty);
3002 fiz2 = _mm256_add_ps(fiz2,tz);
3004 fjx1 = _mm256_add_ps(fjx1,tx);
3005 fjy1 = _mm256_add_ps(fjy1,ty);
3006 fjz1 = _mm256_add_ps(fjz1,tz);
3010 /**************************
3011 * CALCULATE INTERACTIONS *
3012 **************************/
3014 if (gmx_mm256_any_lt(rsq22,rcutoff2))
3017 r22 = _mm256_mul_ps(rsq22,rinv22);
3018 r22 = _mm256_andnot_ps(dummy_mask,r22);
3020 /* EWALD ELECTROSTATICS */
3022 /* Analytical PME correction */
3023 zeta2 = _mm256_mul_ps(beta2,rsq22);
3024 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
3025 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3026 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3027 felec = _mm256_mul_ps(qq22,felec);
3028 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3029 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3030 velec = _mm256_sub_ps(rinv22,pmecorrV);
3031 velec = _mm256_mul_ps(qq22,velec);
3033 d = _mm256_sub_ps(r22,rswitch);
3034 d = _mm256_max_ps(d,_mm256_setzero_ps());
3035 d2 = _mm256_mul_ps(d,d);
3036 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3038 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3040 /* Evaluate switch function */
3041 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3042 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
3043 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
3047 fscal = _mm256_and_ps(fscal,cutoff_mask);
3049 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3051 /* Calculate temporary vectorial force */
3052 tx = _mm256_mul_ps(fscal,dx22);
3053 ty = _mm256_mul_ps(fscal,dy22);
3054 tz = _mm256_mul_ps(fscal,dz22);
3056 /* Update vectorial force */
3057 fix2 = _mm256_add_ps(fix2,tx);
3058 fiy2 = _mm256_add_ps(fiy2,ty);
3059 fiz2 = _mm256_add_ps(fiz2,tz);
3061 fjx2 = _mm256_add_ps(fjx2,tx);
3062 fjy2 = _mm256_add_ps(fjy2,ty);
3063 fjz2 = _mm256_add_ps(fjz2,tz);
3067 /**************************
3068 * CALCULATE INTERACTIONS *
3069 **************************/
3071 if (gmx_mm256_any_lt(rsq23,rcutoff2))
3074 r23 = _mm256_mul_ps(rsq23,rinv23);
3075 r23 = _mm256_andnot_ps(dummy_mask,r23);
3077 /* EWALD ELECTROSTATICS */
3079 /* Analytical PME correction */
3080 zeta2 = _mm256_mul_ps(beta2,rsq23);
3081 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
3082 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3083 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3084 felec = _mm256_mul_ps(qq23,felec);
3085 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3086 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3087 velec = _mm256_sub_ps(rinv23,pmecorrV);
3088 velec = _mm256_mul_ps(qq23,velec);
3090 d = _mm256_sub_ps(r23,rswitch);
3091 d = _mm256_max_ps(d,_mm256_setzero_ps());
3092 d2 = _mm256_mul_ps(d,d);
3093 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3095 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3097 /* Evaluate switch function */
3098 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3099 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
3100 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
3104 fscal = _mm256_and_ps(fscal,cutoff_mask);
3106 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3108 /* Calculate temporary vectorial force */
3109 tx = _mm256_mul_ps(fscal,dx23);
3110 ty = _mm256_mul_ps(fscal,dy23);
3111 tz = _mm256_mul_ps(fscal,dz23);
3113 /* Update vectorial force */
3114 fix2 = _mm256_add_ps(fix2,tx);
3115 fiy2 = _mm256_add_ps(fiy2,ty);
3116 fiz2 = _mm256_add_ps(fiz2,tz);
3118 fjx3 = _mm256_add_ps(fjx3,tx);
3119 fjy3 = _mm256_add_ps(fjy3,ty);
3120 fjz3 = _mm256_add_ps(fjz3,tz);
3124 /**************************
3125 * CALCULATE INTERACTIONS *
3126 **************************/
3128 if (gmx_mm256_any_lt(rsq31,rcutoff2))
3131 r31 = _mm256_mul_ps(rsq31,rinv31);
3132 r31 = _mm256_andnot_ps(dummy_mask,r31);
3134 /* EWALD ELECTROSTATICS */
3136 /* Analytical PME correction */
3137 zeta2 = _mm256_mul_ps(beta2,rsq31);
3138 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
3139 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3140 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3141 felec = _mm256_mul_ps(qq31,felec);
3142 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3143 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3144 velec = _mm256_sub_ps(rinv31,pmecorrV);
3145 velec = _mm256_mul_ps(qq31,velec);
3147 d = _mm256_sub_ps(r31,rswitch);
3148 d = _mm256_max_ps(d,_mm256_setzero_ps());
3149 d2 = _mm256_mul_ps(d,d);
3150 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3152 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3154 /* Evaluate switch function */
3155 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3156 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
3157 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
3161 fscal = _mm256_and_ps(fscal,cutoff_mask);
3163 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3165 /* Calculate temporary vectorial force */
3166 tx = _mm256_mul_ps(fscal,dx31);
3167 ty = _mm256_mul_ps(fscal,dy31);
3168 tz = _mm256_mul_ps(fscal,dz31);
3170 /* Update vectorial force */
3171 fix3 = _mm256_add_ps(fix3,tx);
3172 fiy3 = _mm256_add_ps(fiy3,ty);
3173 fiz3 = _mm256_add_ps(fiz3,tz);
3175 fjx1 = _mm256_add_ps(fjx1,tx);
3176 fjy1 = _mm256_add_ps(fjy1,ty);
3177 fjz1 = _mm256_add_ps(fjz1,tz);
3181 /**************************
3182 * CALCULATE INTERACTIONS *
3183 **************************/
3185 if (gmx_mm256_any_lt(rsq32,rcutoff2))
3188 r32 = _mm256_mul_ps(rsq32,rinv32);
3189 r32 = _mm256_andnot_ps(dummy_mask,r32);
3191 /* EWALD ELECTROSTATICS */
3193 /* Analytical PME correction */
3194 zeta2 = _mm256_mul_ps(beta2,rsq32);
3195 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
3196 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3197 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3198 felec = _mm256_mul_ps(qq32,felec);
3199 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3200 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3201 velec = _mm256_sub_ps(rinv32,pmecorrV);
3202 velec = _mm256_mul_ps(qq32,velec);
3204 d = _mm256_sub_ps(r32,rswitch);
3205 d = _mm256_max_ps(d,_mm256_setzero_ps());
3206 d2 = _mm256_mul_ps(d,d);
3207 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3209 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3211 /* Evaluate switch function */
3212 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3213 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
3214 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
3218 fscal = _mm256_and_ps(fscal,cutoff_mask);
3220 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3222 /* Calculate temporary vectorial force */
3223 tx = _mm256_mul_ps(fscal,dx32);
3224 ty = _mm256_mul_ps(fscal,dy32);
3225 tz = _mm256_mul_ps(fscal,dz32);
3227 /* Update vectorial force */
3228 fix3 = _mm256_add_ps(fix3,tx);
3229 fiy3 = _mm256_add_ps(fiy3,ty);
3230 fiz3 = _mm256_add_ps(fiz3,tz);
3232 fjx2 = _mm256_add_ps(fjx2,tx);
3233 fjy2 = _mm256_add_ps(fjy2,ty);
3234 fjz2 = _mm256_add_ps(fjz2,tz);
3238 /**************************
3239 * CALCULATE INTERACTIONS *
3240 **************************/
3242 if (gmx_mm256_any_lt(rsq33,rcutoff2))
3245 r33 = _mm256_mul_ps(rsq33,rinv33);
3246 r33 = _mm256_andnot_ps(dummy_mask,r33);
3248 /* EWALD ELECTROSTATICS */
3250 /* Analytical PME correction */
3251 zeta2 = _mm256_mul_ps(beta2,rsq33);
3252 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
3253 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3254 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3255 felec = _mm256_mul_ps(qq33,felec);
3256 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3257 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3258 velec = _mm256_sub_ps(rinv33,pmecorrV);
3259 velec = _mm256_mul_ps(qq33,velec);
3261 d = _mm256_sub_ps(r33,rswitch);
3262 d = _mm256_max_ps(d,_mm256_setzero_ps());
3263 d2 = _mm256_mul_ps(d,d);
3264 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3266 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3268 /* Evaluate switch function */
3269 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3270 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
3271 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
3275 fscal = _mm256_and_ps(fscal,cutoff_mask);
3277 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3279 /* Calculate temporary vectorial force */
3280 tx = _mm256_mul_ps(fscal,dx33);
3281 ty = _mm256_mul_ps(fscal,dy33);
3282 tz = _mm256_mul_ps(fscal,dz33);
3284 /* Update vectorial force */
3285 fix3 = _mm256_add_ps(fix3,tx);
3286 fiy3 = _mm256_add_ps(fiy3,ty);
3287 fiz3 = _mm256_add_ps(fiz3,tz);
3289 fjx3 = _mm256_add_ps(fjx3,tx);
3290 fjy3 = _mm256_add_ps(fjy3,ty);
3291 fjz3 = _mm256_add_ps(fjz3,tz);
3295 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3296 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3297 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3298 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3299 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3300 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3301 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3302 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3304 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
3305 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3306 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3308 /* Inner loop uses 1014 flops */
3311 /* End of innermost loop */
3313 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3314 f+i_coord_offset,fshift+i_shift_offset);
3316 /* Increment number of inner iterations */
3317 inneriter += j_index_end - j_index_start;
3319 /* Outer loop uses 24 flops */
3322 /* Increment number of outer iterations */
3325 /* Update outer/inner flops */
3327 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*1014);