2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gmx_math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the eight positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr0;
89 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90 real * vdwioffsetptr1;
91 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92 real * vdwioffsetptr2;
93 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
102 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
103 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
104 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
107 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
116 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
118 __m128i ewitab_lo,ewitab_hi;
119 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
120 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
122 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
123 real rswitch_scalar,d_scalar;
124 __m256 dummy_mask,cutoff_mask;
125 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
126 __m256 one = _mm256_set1_ps(1.0);
127 __m256 two = _mm256_set1_ps(2.0);
133 jindex = nlist->jindex;
135 shiftidx = nlist->shift;
137 shiftvec = fr->shift_vec[0];
138 fshift = fr->fshift[0];
139 facel = _mm256_set1_ps(fr->epsfac);
140 charge = mdatoms->chargeA;
141 nvdwtype = fr->ntype;
143 vdwtype = mdatoms->typeA;
145 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
146 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
147 beta2 = _mm256_mul_ps(beta,beta);
148 beta3 = _mm256_mul_ps(beta,beta2);
150 ewtab = fr->ic->tabq_coul_FDV0;
151 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
152 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
154 /* Setup water-specific parameters */
155 inr = nlist->iinr[0];
156 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
157 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
158 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
159 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
161 jq0 = _mm256_set1_ps(charge[inr+0]);
162 jq1 = _mm256_set1_ps(charge[inr+1]);
163 jq2 = _mm256_set1_ps(charge[inr+2]);
164 vdwjidx0A = 2*vdwtype[inr+0];
165 qq00 = _mm256_mul_ps(iq0,jq0);
166 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
167 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
168 qq01 = _mm256_mul_ps(iq0,jq1);
169 qq02 = _mm256_mul_ps(iq0,jq2);
170 qq10 = _mm256_mul_ps(iq1,jq0);
171 qq11 = _mm256_mul_ps(iq1,jq1);
172 qq12 = _mm256_mul_ps(iq1,jq2);
173 qq20 = _mm256_mul_ps(iq2,jq0);
174 qq21 = _mm256_mul_ps(iq2,jq1);
175 qq22 = _mm256_mul_ps(iq2,jq2);
177 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
178 rcutoff_scalar = fr->rcoulomb;
179 rcutoff = _mm256_set1_ps(rcutoff_scalar);
180 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
182 rswitch_scalar = fr->rcoulomb_switch;
183 rswitch = _mm256_set1_ps(rswitch_scalar);
184 /* Setup switch parameters */
185 d_scalar = rcutoff_scalar-rswitch_scalar;
186 d = _mm256_set1_ps(d_scalar);
187 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
188 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
189 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
190 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
191 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
192 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
194 /* Avoid stupid compiler warnings */
195 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
208 for(iidx=0;iidx<4*DIM;iidx++)
213 /* Start outer loop over neighborlists */
214 for(iidx=0; iidx<nri; iidx++)
216 /* Load shift vector for this list */
217 i_shift_offset = DIM*shiftidx[iidx];
219 /* Load limits for loop over neighbors */
220 j_index_start = jindex[iidx];
221 j_index_end = jindex[iidx+1];
223 /* Get outer coordinate index */
225 i_coord_offset = DIM*inr;
227 /* Load i particle coords and add shift vector */
228 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
229 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
231 fix0 = _mm256_setzero_ps();
232 fiy0 = _mm256_setzero_ps();
233 fiz0 = _mm256_setzero_ps();
234 fix1 = _mm256_setzero_ps();
235 fiy1 = _mm256_setzero_ps();
236 fiz1 = _mm256_setzero_ps();
237 fix2 = _mm256_setzero_ps();
238 fiy2 = _mm256_setzero_ps();
239 fiz2 = _mm256_setzero_ps();
241 /* Reset potential sums */
242 velecsum = _mm256_setzero_ps();
243 vvdwsum = _mm256_setzero_ps();
245 /* Start inner kernel loop */
246 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
249 /* Get j neighbor index, and coordinate index */
258 j_coord_offsetA = DIM*jnrA;
259 j_coord_offsetB = DIM*jnrB;
260 j_coord_offsetC = DIM*jnrC;
261 j_coord_offsetD = DIM*jnrD;
262 j_coord_offsetE = DIM*jnrE;
263 j_coord_offsetF = DIM*jnrF;
264 j_coord_offsetG = DIM*jnrG;
265 j_coord_offsetH = DIM*jnrH;
267 /* load j atom coordinates */
268 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
269 x+j_coord_offsetC,x+j_coord_offsetD,
270 x+j_coord_offsetE,x+j_coord_offsetF,
271 x+j_coord_offsetG,x+j_coord_offsetH,
272 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
274 /* Calculate displacement vector */
275 dx00 = _mm256_sub_ps(ix0,jx0);
276 dy00 = _mm256_sub_ps(iy0,jy0);
277 dz00 = _mm256_sub_ps(iz0,jz0);
278 dx01 = _mm256_sub_ps(ix0,jx1);
279 dy01 = _mm256_sub_ps(iy0,jy1);
280 dz01 = _mm256_sub_ps(iz0,jz1);
281 dx02 = _mm256_sub_ps(ix0,jx2);
282 dy02 = _mm256_sub_ps(iy0,jy2);
283 dz02 = _mm256_sub_ps(iz0,jz2);
284 dx10 = _mm256_sub_ps(ix1,jx0);
285 dy10 = _mm256_sub_ps(iy1,jy0);
286 dz10 = _mm256_sub_ps(iz1,jz0);
287 dx11 = _mm256_sub_ps(ix1,jx1);
288 dy11 = _mm256_sub_ps(iy1,jy1);
289 dz11 = _mm256_sub_ps(iz1,jz1);
290 dx12 = _mm256_sub_ps(ix1,jx2);
291 dy12 = _mm256_sub_ps(iy1,jy2);
292 dz12 = _mm256_sub_ps(iz1,jz2);
293 dx20 = _mm256_sub_ps(ix2,jx0);
294 dy20 = _mm256_sub_ps(iy2,jy0);
295 dz20 = _mm256_sub_ps(iz2,jz0);
296 dx21 = _mm256_sub_ps(ix2,jx1);
297 dy21 = _mm256_sub_ps(iy2,jy1);
298 dz21 = _mm256_sub_ps(iz2,jz1);
299 dx22 = _mm256_sub_ps(ix2,jx2);
300 dy22 = _mm256_sub_ps(iy2,jy2);
301 dz22 = _mm256_sub_ps(iz2,jz2);
303 /* Calculate squared distance and things based on it */
304 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
305 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
306 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
307 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
308 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
309 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
310 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
311 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
312 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
314 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
315 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
316 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
317 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
318 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
319 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
320 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
321 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
322 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
324 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
325 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
326 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
327 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
328 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
329 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
330 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
331 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
332 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
334 fjx0 = _mm256_setzero_ps();
335 fjy0 = _mm256_setzero_ps();
336 fjz0 = _mm256_setzero_ps();
337 fjx1 = _mm256_setzero_ps();
338 fjy1 = _mm256_setzero_ps();
339 fjz1 = _mm256_setzero_ps();
340 fjx2 = _mm256_setzero_ps();
341 fjy2 = _mm256_setzero_ps();
342 fjz2 = _mm256_setzero_ps();
344 /**************************
345 * CALCULATE INTERACTIONS *
346 **************************/
348 if (gmx_mm256_any_lt(rsq00,rcutoff2))
351 r00 = _mm256_mul_ps(rsq00,rinv00);
353 /* EWALD ELECTROSTATICS */
355 /* Analytical PME correction */
356 zeta2 = _mm256_mul_ps(beta2,rsq00);
357 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
358 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
359 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
360 felec = _mm256_mul_ps(qq00,felec);
361 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
362 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
363 velec = _mm256_sub_ps(rinv00,pmecorrV);
364 velec = _mm256_mul_ps(qq00,velec);
366 /* LENNARD-JONES DISPERSION/REPULSION */
368 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
369 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
370 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
371 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
372 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
374 d = _mm256_sub_ps(r00,rswitch);
375 d = _mm256_max_ps(d,_mm256_setzero_ps());
376 d2 = _mm256_mul_ps(d,d);
377 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
379 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
381 /* Evaluate switch function */
382 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
383 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
384 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
385 velec = _mm256_mul_ps(velec,sw);
386 vvdw = _mm256_mul_ps(vvdw,sw);
387 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velec = _mm256_and_ps(velec,cutoff_mask);
391 velecsum = _mm256_add_ps(velecsum,velec);
392 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
393 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
395 fscal = _mm256_add_ps(felec,fvdw);
397 fscal = _mm256_and_ps(fscal,cutoff_mask);
399 /* Calculate temporary vectorial force */
400 tx = _mm256_mul_ps(fscal,dx00);
401 ty = _mm256_mul_ps(fscal,dy00);
402 tz = _mm256_mul_ps(fscal,dz00);
404 /* Update vectorial force */
405 fix0 = _mm256_add_ps(fix0,tx);
406 fiy0 = _mm256_add_ps(fiy0,ty);
407 fiz0 = _mm256_add_ps(fiz0,tz);
409 fjx0 = _mm256_add_ps(fjx0,tx);
410 fjy0 = _mm256_add_ps(fjy0,ty);
411 fjz0 = _mm256_add_ps(fjz0,tz);
415 /**************************
416 * CALCULATE INTERACTIONS *
417 **************************/
419 if (gmx_mm256_any_lt(rsq01,rcutoff2))
422 r01 = _mm256_mul_ps(rsq01,rinv01);
424 /* EWALD ELECTROSTATICS */
426 /* Analytical PME correction */
427 zeta2 = _mm256_mul_ps(beta2,rsq01);
428 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
429 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
430 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
431 felec = _mm256_mul_ps(qq01,felec);
432 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
433 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
434 velec = _mm256_sub_ps(rinv01,pmecorrV);
435 velec = _mm256_mul_ps(qq01,velec);
437 d = _mm256_sub_ps(r01,rswitch);
438 d = _mm256_max_ps(d,_mm256_setzero_ps());
439 d2 = _mm256_mul_ps(d,d);
440 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
442 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
444 /* Evaluate switch function */
445 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
446 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
447 velec = _mm256_mul_ps(velec,sw);
448 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velec = _mm256_and_ps(velec,cutoff_mask);
452 velecsum = _mm256_add_ps(velecsum,velec);
456 fscal = _mm256_and_ps(fscal,cutoff_mask);
458 /* Calculate temporary vectorial force */
459 tx = _mm256_mul_ps(fscal,dx01);
460 ty = _mm256_mul_ps(fscal,dy01);
461 tz = _mm256_mul_ps(fscal,dz01);
463 /* Update vectorial force */
464 fix0 = _mm256_add_ps(fix0,tx);
465 fiy0 = _mm256_add_ps(fiy0,ty);
466 fiz0 = _mm256_add_ps(fiz0,tz);
468 fjx1 = _mm256_add_ps(fjx1,tx);
469 fjy1 = _mm256_add_ps(fjy1,ty);
470 fjz1 = _mm256_add_ps(fjz1,tz);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 if (gmx_mm256_any_lt(rsq02,rcutoff2))
481 r02 = _mm256_mul_ps(rsq02,rinv02);
483 /* EWALD ELECTROSTATICS */
485 /* Analytical PME correction */
486 zeta2 = _mm256_mul_ps(beta2,rsq02);
487 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
488 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
489 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
490 felec = _mm256_mul_ps(qq02,felec);
491 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
492 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
493 velec = _mm256_sub_ps(rinv02,pmecorrV);
494 velec = _mm256_mul_ps(qq02,velec);
496 d = _mm256_sub_ps(r02,rswitch);
497 d = _mm256_max_ps(d,_mm256_setzero_ps());
498 d2 = _mm256_mul_ps(d,d);
499 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
501 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
503 /* Evaluate switch function */
504 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
505 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
506 velec = _mm256_mul_ps(velec,sw);
507 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velec = _mm256_and_ps(velec,cutoff_mask);
511 velecsum = _mm256_add_ps(velecsum,velec);
515 fscal = _mm256_and_ps(fscal,cutoff_mask);
517 /* Calculate temporary vectorial force */
518 tx = _mm256_mul_ps(fscal,dx02);
519 ty = _mm256_mul_ps(fscal,dy02);
520 tz = _mm256_mul_ps(fscal,dz02);
522 /* Update vectorial force */
523 fix0 = _mm256_add_ps(fix0,tx);
524 fiy0 = _mm256_add_ps(fiy0,ty);
525 fiz0 = _mm256_add_ps(fiz0,tz);
527 fjx2 = _mm256_add_ps(fjx2,tx);
528 fjy2 = _mm256_add_ps(fjy2,ty);
529 fjz2 = _mm256_add_ps(fjz2,tz);
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 if (gmx_mm256_any_lt(rsq10,rcutoff2))
540 r10 = _mm256_mul_ps(rsq10,rinv10);
542 /* EWALD ELECTROSTATICS */
544 /* Analytical PME correction */
545 zeta2 = _mm256_mul_ps(beta2,rsq10);
546 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
547 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
548 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
549 felec = _mm256_mul_ps(qq10,felec);
550 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
551 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
552 velec = _mm256_sub_ps(rinv10,pmecorrV);
553 velec = _mm256_mul_ps(qq10,velec);
555 d = _mm256_sub_ps(r10,rswitch);
556 d = _mm256_max_ps(d,_mm256_setzero_ps());
557 d2 = _mm256_mul_ps(d,d);
558 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
560 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
562 /* Evaluate switch function */
563 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
564 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
565 velec = _mm256_mul_ps(velec,sw);
566 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velec = _mm256_and_ps(velec,cutoff_mask);
570 velecsum = _mm256_add_ps(velecsum,velec);
574 fscal = _mm256_and_ps(fscal,cutoff_mask);
576 /* Calculate temporary vectorial force */
577 tx = _mm256_mul_ps(fscal,dx10);
578 ty = _mm256_mul_ps(fscal,dy10);
579 tz = _mm256_mul_ps(fscal,dz10);
581 /* Update vectorial force */
582 fix1 = _mm256_add_ps(fix1,tx);
583 fiy1 = _mm256_add_ps(fiy1,ty);
584 fiz1 = _mm256_add_ps(fiz1,tz);
586 fjx0 = _mm256_add_ps(fjx0,tx);
587 fjy0 = _mm256_add_ps(fjy0,ty);
588 fjz0 = _mm256_add_ps(fjz0,tz);
592 /**************************
593 * CALCULATE INTERACTIONS *
594 **************************/
596 if (gmx_mm256_any_lt(rsq11,rcutoff2))
599 r11 = _mm256_mul_ps(rsq11,rinv11);
601 /* EWALD ELECTROSTATICS */
603 /* Analytical PME correction */
604 zeta2 = _mm256_mul_ps(beta2,rsq11);
605 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
606 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
607 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
608 felec = _mm256_mul_ps(qq11,felec);
609 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
610 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
611 velec = _mm256_sub_ps(rinv11,pmecorrV);
612 velec = _mm256_mul_ps(qq11,velec);
614 d = _mm256_sub_ps(r11,rswitch);
615 d = _mm256_max_ps(d,_mm256_setzero_ps());
616 d2 = _mm256_mul_ps(d,d);
617 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
619 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
621 /* Evaluate switch function */
622 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
623 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
624 velec = _mm256_mul_ps(velec,sw);
625 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
627 /* Update potential sum for this i atom from the interaction with this j atom. */
628 velec = _mm256_and_ps(velec,cutoff_mask);
629 velecsum = _mm256_add_ps(velecsum,velec);
633 fscal = _mm256_and_ps(fscal,cutoff_mask);
635 /* Calculate temporary vectorial force */
636 tx = _mm256_mul_ps(fscal,dx11);
637 ty = _mm256_mul_ps(fscal,dy11);
638 tz = _mm256_mul_ps(fscal,dz11);
640 /* Update vectorial force */
641 fix1 = _mm256_add_ps(fix1,tx);
642 fiy1 = _mm256_add_ps(fiy1,ty);
643 fiz1 = _mm256_add_ps(fiz1,tz);
645 fjx1 = _mm256_add_ps(fjx1,tx);
646 fjy1 = _mm256_add_ps(fjy1,ty);
647 fjz1 = _mm256_add_ps(fjz1,tz);
651 /**************************
652 * CALCULATE INTERACTIONS *
653 **************************/
655 if (gmx_mm256_any_lt(rsq12,rcutoff2))
658 r12 = _mm256_mul_ps(rsq12,rinv12);
660 /* EWALD ELECTROSTATICS */
662 /* Analytical PME correction */
663 zeta2 = _mm256_mul_ps(beta2,rsq12);
664 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
665 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
666 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
667 felec = _mm256_mul_ps(qq12,felec);
668 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
669 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
670 velec = _mm256_sub_ps(rinv12,pmecorrV);
671 velec = _mm256_mul_ps(qq12,velec);
673 d = _mm256_sub_ps(r12,rswitch);
674 d = _mm256_max_ps(d,_mm256_setzero_ps());
675 d2 = _mm256_mul_ps(d,d);
676 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
678 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
680 /* Evaluate switch function */
681 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
682 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
683 velec = _mm256_mul_ps(velec,sw);
684 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
686 /* Update potential sum for this i atom from the interaction with this j atom. */
687 velec = _mm256_and_ps(velec,cutoff_mask);
688 velecsum = _mm256_add_ps(velecsum,velec);
692 fscal = _mm256_and_ps(fscal,cutoff_mask);
694 /* Calculate temporary vectorial force */
695 tx = _mm256_mul_ps(fscal,dx12);
696 ty = _mm256_mul_ps(fscal,dy12);
697 tz = _mm256_mul_ps(fscal,dz12);
699 /* Update vectorial force */
700 fix1 = _mm256_add_ps(fix1,tx);
701 fiy1 = _mm256_add_ps(fiy1,ty);
702 fiz1 = _mm256_add_ps(fiz1,tz);
704 fjx2 = _mm256_add_ps(fjx2,tx);
705 fjy2 = _mm256_add_ps(fjy2,ty);
706 fjz2 = _mm256_add_ps(fjz2,tz);
710 /**************************
711 * CALCULATE INTERACTIONS *
712 **************************/
714 if (gmx_mm256_any_lt(rsq20,rcutoff2))
717 r20 = _mm256_mul_ps(rsq20,rinv20);
719 /* EWALD ELECTROSTATICS */
721 /* Analytical PME correction */
722 zeta2 = _mm256_mul_ps(beta2,rsq20);
723 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
724 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
725 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
726 felec = _mm256_mul_ps(qq20,felec);
727 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
728 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
729 velec = _mm256_sub_ps(rinv20,pmecorrV);
730 velec = _mm256_mul_ps(qq20,velec);
732 d = _mm256_sub_ps(r20,rswitch);
733 d = _mm256_max_ps(d,_mm256_setzero_ps());
734 d2 = _mm256_mul_ps(d,d);
735 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
737 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
739 /* Evaluate switch function */
740 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
741 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
742 velec = _mm256_mul_ps(velec,sw);
743 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
745 /* Update potential sum for this i atom from the interaction with this j atom. */
746 velec = _mm256_and_ps(velec,cutoff_mask);
747 velecsum = _mm256_add_ps(velecsum,velec);
751 fscal = _mm256_and_ps(fscal,cutoff_mask);
753 /* Calculate temporary vectorial force */
754 tx = _mm256_mul_ps(fscal,dx20);
755 ty = _mm256_mul_ps(fscal,dy20);
756 tz = _mm256_mul_ps(fscal,dz20);
758 /* Update vectorial force */
759 fix2 = _mm256_add_ps(fix2,tx);
760 fiy2 = _mm256_add_ps(fiy2,ty);
761 fiz2 = _mm256_add_ps(fiz2,tz);
763 fjx0 = _mm256_add_ps(fjx0,tx);
764 fjy0 = _mm256_add_ps(fjy0,ty);
765 fjz0 = _mm256_add_ps(fjz0,tz);
769 /**************************
770 * CALCULATE INTERACTIONS *
771 **************************/
773 if (gmx_mm256_any_lt(rsq21,rcutoff2))
776 r21 = _mm256_mul_ps(rsq21,rinv21);
778 /* EWALD ELECTROSTATICS */
780 /* Analytical PME correction */
781 zeta2 = _mm256_mul_ps(beta2,rsq21);
782 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
783 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
784 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
785 felec = _mm256_mul_ps(qq21,felec);
786 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
787 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
788 velec = _mm256_sub_ps(rinv21,pmecorrV);
789 velec = _mm256_mul_ps(qq21,velec);
791 d = _mm256_sub_ps(r21,rswitch);
792 d = _mm256_max_ps(d,_mm256_setzero_ps());
793 d2 = _mm256_mul_ps(d,d);
794 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
796 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
798 /* Evaluate switch function */
799 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
800 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
801 velec = _mm256_mul_ps(velec,sw);
802 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
804 /* Update potential sum for this i atom from the interaction with this j atom. */
805 velec = _mm256_and_ps(velec,cutoff_mask);
806 velecsum = _mm256_add_ps(velecsum,velec);
810 fscal = _mm256_and_ps(fscal,cutoff_mask);
812 /* Calculate temporary vectorial force */
813 tx = _mm256_mul_ps(fscal,dx21);
814 ty = _mm256_mul_ps(fscal,dy21);
815 tz = _mm256_mul_ps(fscal,dz21);
817 /* Update vectorial force */
818 fix2 = _mm256_add_ps(fix2,tx);
819 fiy2 = _mm256_add_ps(fiy2,ty);
820 fiz2 = _mm256_add_ps(fiz2,tz);
822 fjx1 = _mm256_add_ps(fjx1,tx);
823 fjy1 = _mm256_add_ps(fjy1,ty);
824 fjz1 = _mm256_add_ps(fjz1,tz);
828 /**************************
829 * CALCULATE INTERACTIONS *
830 **************************/
832 if (gmx_mm256_any_lt(rsq22,rcutoff2))
835 r22 = _mm256_mul_ps(rsq22,rinv22);
837 /* EWALD ELECTROSTATICS */
839 /* Analytical PME correction */
840 zeta2 = _mm256_mul_ps(beta2,rsq22);
841 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
842 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
843 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
844 felec = _mm256_mul_ps(qq22,felec);
845 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
846 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
847 velec = _mm256_sub_ps(rinv22,pmecorrV);
848 velec = _mm256_mul_ps(qq22,velec);
850 d = _mm256_sub_ps(r22,rswitch);
851 d = _mm256_max_ps(d,_mm256_setzero_ps());
852 d2 = _mm256_mul_ps(d,d);
853 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
855 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
857 /* Evaluate switch function */
858 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
859 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
860 velec = _mm256_mul_ps(velec,sw);
861 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
863 /* Update potential sum for this i atom from the interaction with this j atom. */
864 velec = _mm256_and_ps(velec,cutoff_mask);
865 velecsum = _mm256_add_ps(velecsum,velec);
869 fscal = _mm256_and_ps(fscal,cutoff_mask);
871 /* Calculate temporary vectorial force */
872 tx = _mm256_mul_ps(fscal,dx22);
873 ty = _mm256_mul_ps(fscal,dy22);
874 tz = _mm256_mul_ps(fscal,dz22);
876 /* Update vectorial force */
877 fix2 = _mm256_add_ps(fix2,tx);
878 fiy2 = _mm256_add_ps(fiy2,ty);
879 fiz2 = _mm256_add_ps(fiz2,tz);
881 fjx2 = _mm256_add_ps(fjx2,tx);
882 fjy2 = _mm256_add_ps(fjy2,ty);
883 fjz2 = _mm256_add_ps(fjz2,tz);
887 fjptrA = f+j_coord_offsetA;
888 fjptrB = f+j_coord_offsetB;
889 fjptrC = f+j_coord_offsetC;
890 fjptrD = f+j_coord_offsetD;
891 fjptrE = f+j_coord_offsetE;
892 fjptrF = f+j_coord_offsetF;
893 fjptrG = f+j_coord_offsetG;
894 fjptrH = f+j_coord_offsetH;
896 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
897 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
899 /* Inner loop uses 990 flops */
905 /* Get j neighbor index, and coordinate index */
906 jnrlistA = jjnr[jidx];
907 jnrlistB = jjnr[jidx+1];
908 jnrlistC = jjnr[jidx+2];
909 jnrlistD = jjnr[jidx+3];
910 jnrlistE = jjnr[jidx+4];
911 jnrlistF = jjnr[jidx+5];
912 jnrlistG = jjnr[jidx+6];
913 jnrlistH = jjnr[jidx+7];
914 /* Sign of each element will be negative for non-real atoms.
915 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
916 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
918 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
919 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
921 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
922 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
923 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
924 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
925 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
926 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
927 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
928 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
929 j_coord_offsetA = DIM*jnrA;
930 j_coord_offsetB = DIM*jnrB;
931 j_coord_offsetC = DIM*jnrC;
932 j_coord_offsetD = DIM*jnrD;
933 j_coord_offsetE = DIM*jnrE;
934 j_coord_offsetF = DIM*jnrF;
935 j_coord_offsetG = DIM*jnrG;
936 j_coord_offsetH = DIM*jnrH;
938 /* load j atom coordinates */
939 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
940 x+j_coord_offsetC,x+j_coord_offsetD,
941 x+j_coord_offsetE,x+j_coord_offsetF,
942 x+j_coord_offsetG,x+j_coord_offsetH,
943 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
945 /* Calculate displacement vector */
946 dx00 = _mm256_sub_ps(ix0,jx0);
947 dy00 = _mm256_sub_ps(iy0,jy0);
948 dz00 = _mm256_sub_ps(iz0,jz0);
949 dx01 = _mm256_sub_ps(ix0,jx1);
950 dy01 = _mm256_sub_ps(iy0,jy1);
951 dz01 = _mm256_sub_ps(iz0,jz1);
952 dx02 = _mm256_sub_ps(ix0,jx2);
953 dy02 = _mm256_sub_ps(iy0,jy2);
954 dz02 = _mm256_sub_ps(iz0,jz2);
955 dx10 = _mm256_sub_ps(ix1,jx0);
956 dy10 = _mm256_sub_ps(iy1,jy0);
957 dz10 = _mm256_sub_ps(iz1,jz0);
958 dx11 = _mm256_sub_ps(ix1,jx1);
959 dy11 = _mm256_sub_ps(iy1,jy1);
960 dz11 = _mm256_sub_ps(iz1,jz1);
961 dx12 = _mm256_sub_ps(ix1,jx2);
962 dy12 = _mm256_sub_ps(iy1,jy2);
963 dz12 = _mm256_sub_ps(iz1,jz2);
964 dx20 = _mm256_sub_ps(ix2,jx0);
965 dy20 = _mm256_sub_ps(iy2,jy0);
966 dz20 = _mm256_sub_ps(iz2,jz0);
967 dx21 = _mm256_sub_ps(ix2,jx1);
968 dy21 = _mm256_sub_ps(iy2,jy1);
969 dz21 = _mm256_sub_ps(iz2,jz1);
970 dx22 = _mm256_sub_ps(ix2,jx2);
971 dy22 = _mm256_sub_ps(iy2,jy2);
972 dz22 = _mm256_sub_ps(iz2,jz2);
974 /* Calculate squared distance and things based on it */
975 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
976 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
977 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
978 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
979 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
980 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
981 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
982 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
983 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
985 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
986 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
987 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
988 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
989 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
990 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
991 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
992 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
993 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
995 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
996 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
997 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
998 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
999 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1000 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1001 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1002 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1003 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1005 fjx0 = _mm256_setzero_ps();
1006 fjy0 = _mm256_setzero_ps();
1007 fjz0 = _mm256_setzero_ps();
1008 fjx1 = _mm256_setzero_ps();
1009 fjy1 = _mm256_setzero_ps();
1010 fjz1 = _mm256_setzero_ps();
1011 fjx2 = _mm256_setzero_ps();
1012 fjy2 = _mm256_setzero_ps();
1013 fjz2 = _mm256_setzero_ps();
1015 /**************************
1016 * CALCULATE INTERACTIONS *
1017 **************************/
1019 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1022 r00 = _mm256_mul_ps(rsq00,rinv00);
1023 r00 = _mm256_andnot_ps(dummy_mask,r00);
1025 /* EWALD ELECTROSTATICS */
1027 /* Analytical PME correction */
1028 zeta2 = _mm256_mul_ps(beta2,rsq00);
1029 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
1030 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1031 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1032 felec = _mm256_mul_ps(qq00,felec);
1033 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1034 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1035 velec = _mm256_sub_ps(rinv00,pmecorrV);
1036 velec = _mm256_mul_ps(qq00,velec);
1038 /* LENNARD-JONES DISPERSION/REPULSION */
1040 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1041 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
1042 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1043 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1044 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1046 d = _mm256_sub_ps(r00,rswitch);
1047 d = _mm256_max_ps(d,_mm256_setzero_ps());
1048 d2 = _mm256_mul_ps(d,d);
1049 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1051 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1053 /* Evaluate switch function */
1054 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1055 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1056 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1057 velec = _mm256_mul_ps(velec,sw);
1058 vvdw = _mm256_mul_ps(vvdw,sw);
1059 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1061 /* Update potential sum for this i atom from the interaction with this j atom. */
1062 velec = _mm256_and_ps(velec,cutoff_mask);
1063 velec = _mm256_andnot_ps(dummy_mask,velec);
1064 velecsum = _mm256_add_ps(velecsum,velec);
1065 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
1066 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
1067 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1069 fscal = _mm256_add_ps(felec,fvdw);
1071 fscal = _mm256_and_ps(fscal,cutoff_mask);
1073 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1075 /* Calculate temporary vectorial force */
1076 tx = _mm256_mul_ps(fscal,dx00);
1077 ty = _mm256_mul_ps(fscal,dy00);
1078 tz = _mm256_mul_ps(fscal,dz00);
1080 /* Update vectorial force */
1081 fix0 = _mm256_add_ps(fix0,tx);
1082 fiy0 = _mm256_add_ps(fiy0,ty);
1083 fiz0 = _mm256_add_ps(fiz0,tz);
1085 fjx0 = _mm256_add_ps(fjx0,tx);
1086 fjy0 = _mm256_add_ps(fjy0,ty);
1087 fjz0 = _mm256_add_ps(fjz0,tz);
1091 /**************************
1092 * CALCULATE INTERACTIONS *
1093 **************************/
1095 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1098 r01 = _mm256_mul_ps(rsq01,rinv01);
1099 r01 = _mm256_andnot_ps(dummy_mask,r01);
1101 /* EWALD ELECTROSTATICS */
1103 /* Analytical PME correction */
1104 zeta2 = _mm256_mul_ps(beta2,rsq01);
1105 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
1106 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1107 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1108 felec = _mm256_mul_ps(qq01,felec);
1109 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1110 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1111 velec = _mm256_sub_ps(rinv01,pmecorrV);
1112 velec = _mm256_mul_ps(qq01,velec);
1114 d = _mm256_sub_ps(r01,rswitch);
1115 d = _mm256_max_ps(d,_mm256_setzero_ps());
1116 d2 = _mm256_mul_ps(d,d);
1117 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1119 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1121 /* Evaluate switch function */
1122 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1123 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
1124 velec = _mm256_mul_ps(velec,sw);
1125 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1127 /* Update potential sum for this i atom from the interaction with this j atom. */
1128 velec = _mm256_and_ps(velec,cutoff_mask);
1129 velec = _mm256_andnot_ps(dummy_mask,velec);
1130 velecsum = _mm256_add_ps(velecsum,velec);
1134 fscal = _mm256_and_ps(fscal,cutoff_mask);
1136 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1138 /* Calculate temporary vectorial force */
1139 tx = _mm256_mul_ps(fscal,dx01);
1140 ty = _mm256_mul_ps(fscal,dy01);
1141 tz = _mm256_mul_ps(fscal,dz01);
1143 /* Update vectorial force */
1144 fix0 = _mm256_add_ps(fix0,tx);
1145 fiy0 = _mm256_add_ps(fiy0,ty);
1146 fiz0 = _mm256_add_ps(fiz0,tz);
1148 fjx1 = _mm256_add_ps(fjx1,tx);
1149 fjy1 = _mm256_add_ps(fjy1,ty);
1150 fjz1 = _mm256_add_ps(fjz1,tz);
1154 /**************************
1155 * CALCULATE INTERACTIONS *
1156 **************************/
1158 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1161 r02 = _mm256_mul_ps(rsq02,rinv02);
1162 r02 = _mm256_andnot_ps(dummy_mask,r02);
1164 /* EWALD ELECTROSTATICS */
1166 /* Analytical PME correction */
1167 zeta2 = _mm256_mul_ps(beta2,rsq02);
1168 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
1169 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1170 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1171 felec = _mm256_mul_ps(qq02,felec);
1172 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1173 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1174 velec = _mm256_sub_ps(rinv02,pmecorrV);
1175 velec = _mm256_mul_ps(qq02,velec);
1177 d = _mm256_sub_ps(r02,rswitch);
1178 d = _mm256_max_ps(d,_mm256_setzero_ps());
1179 d2 = _mm256_mul_ps(d,d);
1180 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1182 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1184 /* Evaluate switch function */
1185 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1186 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
1187 velec = _mm256_mul_ps(velec,sw);
1188 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1190 /* Update potential sum for this i atom from the interaction with this j atom. */
1191 velec = _mm256_and_ps(velec,cutoff_mask);
1192 velec = _mm256_andnot_ps(dummy_mask,velec);
1193 velecsum = _mm256_add_ps(velecsum,velec);
1197 fscal = _mm256_and_ps(fscal,cutoff_mask);
1199 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1201 /* Calculate temporary vectorial force */
1202 tx = _mm256_mul_ps(fscal,dx02);
1203 ty = _mm256_mul_ps(fscal,dy02);
1204 tz = _mm256_mul_ps(fscal,dz02);
1206 /* Update vectorial force */
1207 fix0 = _mm256_add_ps(fix0,tx);
1208 fiy0 = _mm256_add_ps(fiy0,ty);
1209 fiz0 = _mm256_add_ps(fiz0,tz);
1211 fjx2 = _mm256_add_ps(fjx2,tx);
1212 fjy2 = _mm256_add_ps(fjy2,ty);
1213 fjz2 = _mm256_add_ps(fjz2,tz);
1217 /**************************
1218 * CALCULATE INTERACTIONS *
1219 **************************/
1221 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1224 r10 = _mm256_mul_ps(rsq10,rinv10);
1225 r10 = _mm256_andnot_ps(dummy_mask,r10);
1227 /* EWALD ELECTROSTATICS */
1229 /* Analytical PME correction */
1230 zeta2 = _mm256_mul_ps(beta2,rsq10);
1231 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
1232 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1233 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1234 felec = _mm256_mul_ps(qq10,felec);
1235 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1236 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1237 velec = _mm256_sub_ps(rinv10,pmecorrV);
1238 velec = _mm256_mul_ps(qq10,velec);
1240 d = _mm256_sub_ps(r10,rswitch);
1241 d = _mm256_max_ps(d,_mm256_setzero_ps());
1242 d2 = _mm256_mul_ps(d,d);
1243 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1245 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1247 /* Evaluate switch function */
1248 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1249 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1250 velec = _mm256_mul_ps(velec,sw);
1251 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1253 /* Update potential sum for this i atom from the interaction with this j atom. */
1254 velec = _mm256_and_ps(velec,cutoff_mask);
1255 velec = _mm256_andnot_ps(dummy_mask,velec);
1256 velecsum = _mm256_add_ps(velecsum,velec);
1260 fscal = _mm256_and_ps(fscal,cutoff_mask);
1262 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1264 /* Calculate temporary vectorial force */
1265 tx = _mm256_mul_ps(fscal,dx10);
1266 ty = _mm256_mul_ps(fscal,dy10);
1267 tz = _mm256_mul_ps(fscal,dz10);
1269 /* Update vectorial force */
1270 fix1 = _mm256_add_ps(fix1,tx);
1271 fiy1 = _mm256_add_ps(fiy1,ty);
1272 fiz1 = _mm256_add_ps(fiz1,tz);
1274 fjx0 = _mm256_add_ps(fjx0,tx);
1275 fjy0 = _mm256_add_ps(fjy0,ty);
1276 fjz0 = _mm256_add_ps(fjz0,tz);
1280 /**************************
1281 * CALCULATE INTERACTIONS *
1282 **************************/
1284 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1287 r11 = _mm256_mul_ps(rsq11,rinv11);
1288 r11 = _mm256_andnot_ps(dummy_mask,r11);
1290 /* EWALD ELECTROSTATICS */
1292 /* Analytical PME correction */
1293 zeta2 = _mm256_mul_ps(beta2,rsq11);
1294 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1295 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1296 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1297 felec = _mm256_mul_ps(qq11,felec);
1298 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1299 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1300 velec = _mm256_sub_ps(rinv11,pmecorrV);
1301 velec = _mm256_mul_ps(qq11,velec);
1303 d = _mm256_sub_ps(r11,rswitch);
1304 d = _mm256_max_ps(d,_mm256_setzero_ps());
1305 d2 = _mm256_mul_ps(d,d);
1306 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1308 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1310 /* Evaluate switch function */
1311 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1312 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1313 velec = _mm256_mul_ps(velec,sw);
1314 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1316 /* Update potential sum for this i atom from the interaction with this j atom. */
1317 velec = _mm256_and_ps(velec,cutoff_mask);
1318 velec = _mm256_andnot_ps(dummy_mask,velec);
1319 velecsum = _mm256_add_ps(velecsum,velec);
1323 fscal = _mm256_and_ps(fscal,cutoff_mask);
1325 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1327 /* Calculate temporary vectorial force */
1328 tx = _mm256_mul_ps(fscal,dx11);
1329 ty = _mm256_mul_ps(fscal,dy11);
1330 tz = _mm256_mul_ps(fscal,dz11);
1332 /* Update vectorial force */
1333 fix1 = _mm256_add_ps(fix1,tx);
1334 fiy1 = _mm256_add_ps(fiy1,ty);
1335 fiz1 = _mm256_add_ps(fiz1,tz);
1337 fjx1 = _mm256_add_ps(fjx1,tx);
1338 fjy1 = _mm256_add_ps(fjy1,ty);
1339 fjz1 = _mm256_add_ps(fjz1,tz);
1343 /**************************
1344 * CALCULATE INTERACTIONS *
1345 **************************/
1347 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1350 r12 = _mm256_mul_ps(rsq12,rinv12);
1351 r12 = _mm256_andnot_ps(dummy_mask,r12);
1353 /* EWALD ELECTROSTATICS */
1355 /* Analytical PME correction */
1356 zeta2 = _mm256_mul_ps(beta2,rsq12);
1357 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1358 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1359 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1360 felec = _mm256_mul_ps(qq12,felec);
1361 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1362 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1363 velec = _mm256_sub_ps(rinv12,pmecorrV);
1364 velec = _mm256_mul_ps(qq12,velec);
1366 d = _mm256_sub_ps(r12,rswitch);
1367 d = _mm256_max_ps(d,_mm256_setzero_ps());
1368 d2 = _mm256_mul_ps(d,d);
1369 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1371 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1373 /* Evaluate switch function */
1374 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1375 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1376 velec = _mm256_mul_ps(velec,sw);
1377 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1379 /* Update potential sum for this i atom from the interaction with this j atom. */
1380 velec = _mm256_and_ps(velec,cutoff_mask);
1381 velec = _mm256_andnot_ps(dummy_mask,velec);
1382 velecsum = _mm256_add_ps(velecsum,velec);
1386 fscal = _mm256_and_ps(fscal,cutoff_mask);
1388 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1390 /* Calculate temporary vectorial force */
1391 tx = _mm256_mul_ps(fscal,dx12);
1392 ty = _mm256_mul_ps(fscal,dy12);
1393 tz = _mm256_mul_ps(fscal,dz12);
1395 /* Update vectorial force */
1396 fix1 = _mm256_add_ps(fix1,tx);
1397 fiy1 = _mm256_add_ps(fiy1,ty);
1398 fiz1 = _mm256_add_ps(fiz1,tz);
1400 fjx2 = _mm256_add_ps(fjx2,tx);
1401 fjy2 = _mm256_add_ps(fjy2,ty);
1402 fjz2 = _mm256_add_ps(fjz2,tz);
1406 /**************************
1407 * CALCULATE INTERACTIONS *
1408 **************************/
1410 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1413 r20 = _mm256_mul_ps(rsq20,rinv20);
1414 r20 = _mm256_andnot_ps(dummy_mask,r20);
1416 /* EWALD ELECTROSTATICS */
1418 /* Analytical PME correction */
1419 zeta2 = _mm256_mul_ps(beta2,rsq20);
1420 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1421 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1422 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1423 felec = _mm256_mul_ps(qq20,felec);
1424 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1425 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1426 velec = _mm256_sub_ps(rinv20,pmecorrV);
1427 velec = _mm256_mul_ps(qq20,velec);
1429 d = _mm256_sub_ps(r20,rswitch);
1430 d = _mm256_max_ps(d,_mm256_setzero_ps());
1431 d2 = _mm256_mul_ps(d,d);
1432 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1434 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1436 /* Evaluate switch function */
1437 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1438 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1439 velec = _mm256_mul_ps(velec,sw);
1440 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1442 /* Update potential sum for this i atom from the interaction with this j atom. */
1443 velec = _mm256_and_ps(velec,cutoff_mask);
1444 velec = _mm256_andnot_ps(dummy_mask,velec);
1445 velecsum = _mm256_add_ps(velecsum,velec);
1449 fscal = _mm256_and_ps(fscal,cutoff_mask);
1451 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1453 /* Calculate temporary vectorial force */
1454 tx = _mm256_mul_ps(fscal,dx20);
1455 ty = _mm256_mul_ps(fscal,dy20);
1456 tz = _mm256_mul_ps(fscal,dz20);
1458 /* Update vectorial force */
1459 fix2 = _mm256_add_ps(fix2,tx);
1460 fiy2 = _mm256_add_ps(fiy2,ty);
1461 fiz2 = _mm256_add_ps(fiz2,tz);
1463 fjx0 = _mm256_add_ps(fjx0,tx);
1464 fjy0 = _mm256_add_ps(fjy0,ty);
1465 fjz0 = _mm256_add_ps(fjz0,tz);
1469 /**************************
1470 * CALCULATE INTERACTIONS *
1471 **************************/
1473 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1476 r21 = _mm256_mul_ps(rsq21,rinv21);
1477 r21 = _mm256_andnot_ps(dummy_mask,r21);
1479 /* EWALD ELECTROSTATICS */
1481 /* Analytical PME correction */
1482 zeta2 = _mm256_mul_ps(beta2,rsq21);
1483 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1484 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1485 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1486 felec = _mm256_mul_ps(qq21,felec);
1487 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1488 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1489 velec = _mm256_sub_ps(rinv21,pmecorrV);
1490 velec = _mm256_mul_ps(qq21,velec);
1492 d = _mm256_sub_ps(r21,rswitch);
1493 d = _mm256_max_ps(d,_mm256_setzero_ps());
1494 d2 = _mm256_mul_ps(d,d);
1495 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1497 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1499 /* Evaluate switch function */
1500 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1501 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1502 velec = _mm256_mul_ps(velec,sw);
1503 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1505 /* Update potential sum for this i atom from the interaction with this j atom. */
1506 velec = _mm256_and_ps(velec,cutoff_mask);
1507 velec = _mm256_andnot_ps(dummy_mask,velec);
1508 velecsum = _mm256_add_ps(velecsum,velec);
1512 fscal = _mm256_and_ps(fscal,cutoff_mask);
1514 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1516 /* Calculate temporary vectorial force */
1517 tx = _mm256_mul_ps(fscal,dx21);
1518 ty = _mm256_mul_ps(fscal,dy21);
1519 tz = _mm256_mul_ps(fscal,dz21);
1521 /* Update vectorial force */
1522 fix2 = _mm256_add_ps(fix2,tx);
1523 fiy2 = _mm256_add_ps(fiy2,ty);
1524 fiz2 = _mm256_add_ps(fiz2,tz);
1526 fjx1 = _mm256_add_ps(fjx1,tx);
1527 fjy1 = _mm256_add_ps(fjy1,ty);
1528 fjz1 = _mm256_add_ps(fjz1,tz);
1532 /**************************
1533 * CALCULATE INTERACTIONS *
1534 **************************/
1536 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1539 r22 = _mm256_mul_ps(rsq22,rinv22);
1540 r22 = _mm256_andnot_ps(dummy_mask,r22);
1542 /* EWALD ELECTROSTATICS */
1544 /* Analytical PME correction */
1545 zeta2 = _mm256_mul_ps(beta2,rsq22);
1546 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1547 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1548 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1549 felec = _mm256_mul_ps(qq22,felec);
1550 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1551 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1552 velec = _mm256_sub_ps(rinv22,pmecorrV);
1553 velec = _mm256_mul_ps(qq22,velec);
1555 d = _mm256_sub_ps(r22,rswitch);
1556 d = _mm256_max_ps(d,_mm256_setzero_ps());
1557 d2 = _mm256_mul_ps(d,d);
1558 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1560 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1562 /* Evaluate switch function */
1563 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1564 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1565 velec = _mm256_mul_ps(velec,sw);
1566 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1568 /* Update potential sum for this i atom from the interaction with this j atom. */
1569 velec = _mm256_and_ps(velec,cutoff_mask);
1570 velec = _mm256_andnot_ps(dummy_mask,velec);
1571 velecsum = _mm256_add_ps(velecsum,velec);
1575 fscal = _mm256_and_ps(fscal,cutoff_mask);
1577 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1579 /* Calculate temporary vectorial force */
1580 tx = _mm256_mul_ps(fscal,dx22);
1581 ty = _mm256_mul_ps(fscal,dy22);
1582 tz = _mm256_mul_ps(fscal,dz22);
1584 /* Update vectorial force */
1585 fix2 = _mm256_add_ps(fix2,tx);
1586 fiy2 = _mm256_add_ps(fiy2,ty);
1587 fiz2 = _mm256_add_ps(fiz2,tz);
1589 fjx2 = _mm256_add_ps(fjx2,tx);
1590 fjy2 = _mm256_add_ps(fjy2,ty);
1591 fjz2 = _mm256_add_ps(fjz2,tz);
1595 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1596 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1597 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1598 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1599 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1600 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1601 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1602 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1604 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1605 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1607 /* Inner loop uses 999 flops */
1610 /* End of innermost loop */
1612 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1613 f+i_coord_offset,fshift+i_shift_offset);
1616 /* Update potential energies */
1617 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1618 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1620 /* Increment number of inner iterations */
1621 inneriter += j_index_end - j_index_start;
1623 /* Outer loop uses 20 flops */
1626 /* Increment number of outer iterations */
1629 /* Update outer/inner flops */
1631 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*999);
1634 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_single
1635 * Electrostatics interaction: Ewald
1636 * VdW interaction: LennardJones
1637 * Geometry: Water3-Water3
1638 * Calculate force/pot: Force
1641 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_single
1642 (t_nblist * gmx_restrict nlist,
1643 rvec * gmx_restrict xx,
1644 rvec * gmx_restrict ff,
1645 t_forcerec * gmx_restrict fr,
1646 t_mdatoms * gmx_restrict mdatoms,
1647 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1648 t_nrnb * gmx_restrict nrnb)
1650 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1651 * just 0 for non-waters.
1652 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1653 * jnr indices corresponding to data put in the four positions in the SIMD register.
1655 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1656 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1657 int jnrA,jnrB,jnrC,jnrD;
1658 int jnrE,jnrF,jnrG,jnrH;
1659 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1660 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1661 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1662 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1663 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1664 real rcutoff_scalar;
1665 real *shiftvec,*fshift,*x,*f;
1666 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1667 real scratch[4*DIM];
1668 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1669 real * vdwioffsetptr0;
1670 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1671 real * vdwioffsetptr1;
1672 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1673 real * vdwioffsetptr2;
1674 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1675 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1676 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1677 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1678 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1679 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1680 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1681 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1682 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1683 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1684 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1685 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1686 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1687 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1688 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1689 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1690 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1693 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1696 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1697 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1699 __m128i ewitab_lo,ewitab_hi;
1700 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1701 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1703 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1704 real rswitch_scalar,d_scalar;
1705 __m256 dummy_mask,cutoff_mask;
1706 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1707 __m256 one = _mm256_set1_ps(1.0);
1708 __m256 two = _mm256_set1_ps(2.0);
1714 jindex = nlist->jindex;
1716 shiftidx = nlist->shift;
1718 shiftvec = fr->shift_vec[0];
1719 fshift = fr->fshift[0];
1720 facel = _mm256_set1_ps(fr->epsfac);
1721 charge = mdatoms->chargeA;
1722 nvdwtype = fr->ntype;
1723 vdwparam = fr->nbfp;
1724 vdwtype = mdatoms->typeA;
1726 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1727 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
1728 beta2 = _mm256_mul_ps(beta,beta);
1729 beta3 = _mm256_mul_ps(beta,beta2);
1731 ewtab = fr->ic->tabq_coul_FDV0;
1732 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1733 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1735 /* Setup water-specific parameters */
1736 inr = nlist->iinr[0];
1737 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1738 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1739 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1740 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1742 jq0 = _mm256_set1_ps(charge[inr+0]);
1743 jq1 = _mm256_set1_ps(charge[inr+1]);
1744 jq2 = _mm256_set1_ps(charge[inr+2]);
1745 vdwjidx0A = 2*vdwtype[inr+0];
1746 qq00 = _mm256_mul_ps(iq0,jq0);
1747 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1748 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1749 qq01 = _mm256_mul_ps(iq0,jq1);
1750 qq02 = _mm256_mul_ps(iq0,jq2);
1751 qq10 = _mm256_mul_ps(iq1,jq0);
1752 qq11 = _mm256_mul_ps(iq1,jq1);
1753 qq12 = _mm256_mul_ps(iq1,jq2);
1754 qq20 = _mm256_mul_ps(iq2,jq0);
1755 qq21 = _mm256_mul_ps(iq2,jq1);
1756 qq22 = _mm256_mul_ps(iq2,jq2);
1758 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1759 rcutoff_scalar = fr->rcoulomb;
1760 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1761 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1763 rswitch_scalar = fr->rcoulomb_switch;
1764 rswitch = _mm256_set1_ps(rswitch_scalar);
1765 /* Setup switch parameters */
1766 d_scalar = rcutoff_scalar-rswitch_scalar;
1767 d = _mm256_set1_ps(d_scalar);
1768 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1769 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1770 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1771 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1772 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1773 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1775 /* Avoid stupid compiler warnings */
1776 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1777 j_coord_offsetA = 0;
1778 j_coord_offsetB = 0;
1779 j_coord_offsetC = 0;
1780 j_coord_offsetD = 0;
1781 j_coord_offsetE = 0;
1782 j_coord_offsetF = 0;
1783 j_coord_offsetG = 0;
1784 j_coord_offsetH = 0;
1789 for(iidx=0;iidx<4*DIM;iidx++)
1791 scratch[iidx] = 0.0;
1794 /* Start outer loop over neighborlists */
1795 for(iidx=0; iidx<nri; iidx++)
1797 /* Load shift vector for this list */
1798 i_shift_offset = DIM*shiftidx[iidx];
1800 /* Load limits for loop over neighbors */
1801 j_index_start = jindex[iidx];
1802 j_index_end = jindex[iidx+1];
1804 /* Get outer coordinate index */
1806 i_coord_offset = DIM*inr;
1808 /* Load i particle coords and add shift vector */
1809 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1810 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1812 fix0 = _mm256_setzero_ps();
1813 fiy0 = _mm256_setzero_ps();
1814 fiz0 = _mm256_setzero_ps();
1815 fix1 = _mm256_setzero_ps();
1816 fiy1 = _mm256_setzero_ps();
1817 fiz1 = _mm256_setzero_ps();
1818 fix2 = _mm256_setzero_ps();
1819 fiy2 = _mm256_setzero_ps();
1820 fiz2 = _mm256_setzero_ps();
1822 /* Start inner kernel loop */
1823 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1826 /* Get j neighbor index, and coordinate index */
1828 jnrB = jjnr[jidx+1];
1829 jnrC = jjnr[jidx+2];
1830 jnrD = jjnr[jidx+3];
1831 jnrE = jjnr[jidx+4];
1832 jnrF = jjnr[jidx+5];
1833 jnrG = jjnr[jidx+6];
1834 jnrH = jjnr[jidx+7];
1835 j_coord_offsetA = DIM*jnrA;
1836 j_coord_offsetB = DIM*jnrB;
1837 j_coord_offsetC = DIM*jnrC;
1838 j_coord_offsetD = DIM*jnrD;
1839 j_coord_offsetE = DIM*jnrE;
1840 j_coord_offsetF = DIM*jnrF;
1841 j_coord_offsetG = DIM*jnrG;
1842 j_coord_offsetH = DIM*jnrH;
1844 /* load j atom coordinates */
1845 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1846 x+j_coord_offsetC,x+j_coord_offsetD,
1847 x+j_coord_offsetE,x+j_coord_offsetF,
1848 x+j_coord_offsetG,x+j_coord_offsetH,
1849 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1851 /* Calculate displacement vector */
1852 dx00 = _mm256_sub_ps(ix0,jx0);
1853 dy00 = _mm256_sub_ps(iy0,jy0);
1854 dz00 = _mm256_sub_ps(iz0,jz0);
1855 dx01 = _mm256_sub_ps(ix0,jx1);
1856 dy01 = _mm256_sub_ps(iy0,jy1);
1857 dz01 = _mm256_sub_ps(iz0,jz1);
1858 dx02 = _mm256_sub_ps(ix0,jx2);
1859 dy02 = _mm256_sub_ps(iy0,jy2);
1860 dz02 = _mm256_sub_ps(iz0,jz2);
1861 dx10 = _mm256_sub_ps(ix1,jx0);
1862 dy10 = _mm256_sub_ps(iy1,jy0);
1863 dz10 = _mm256_sub_ps(iz1,jz0);
1864 dx11 = _mm256_sub_ps(ix1,jx1);
1865 dy11 = _mm256_sub_ps(iy1,jy1);
1866 dz11 = _mm256_sub_ps(iz1,jz1);
1867 dx12 = _mm256_sub_ps(ix1,jx2);
1868 dy12 = _mm256_sub_ps(iy1,jy2);
1869 dz12 = _mm256_sub_ps(iz1,jz2);
1870 dx20 = _mm256_sub_ps(ix2,jx0);
1871 dy20 = _mm256_sub_ps(iy2,jy0);
1872 dz20 = _mm256_sub_ps(iz2,jz0);
1873 dx21 = _mm256_sub_ps(ix2,jx1);
1874 dy21 = _mm256_sub_ps(iy2,jy1);
1875 dz21 = _mm256_sub_ps(iz2,jz1);
1876 dx22 = _mm256_sub_ps(ix2,jx2);
1877 dy22 = _mm256_sub_ps(iy2,jy2);
1878 dz22 = _mm256_sub_ps(iz2,jz2);
1880 /* Calculate squared distance and things based on it */
1881 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1882 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1883 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1884 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1885 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1886 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1887 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1888 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1889 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1891 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1892 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1893 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1894 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1895 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1896 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1897 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1898 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1899 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1901 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1902 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1903 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1904 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1905 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1906 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1907 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1908 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1909 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1911 fjx0 = _mm256_setzero_ps();
1912 fjy0 = _mm256_setzero_ps();
1913 fjz0 = _mm256_setzero_ps();
1914 fjx1 = _mm256_setzero_ps();
1915 fjy1 = _mm256_setzero_ps();
1916 fjz1 = _mm256_setzero_ps();
1917 fjx2 = _mm256_setzero_ps();
1918 fjy2 = _mm256_setzero_ps();
1919 fjz2 = _mm256_setzero_ps();
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1925 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1928 r00 = _mm256_mul_ps(rsq00,rinv00);
1930 /* EWALD ELECTROSTATICS */
1932 /* Analytical PME correction */
1933 zeta2 = _mm256_mul_ps(beta2,rsq00);
1934 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
1935 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1936 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1937 felec = _mm256_mul_ps(qq00,felec);
1938 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1939 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1940 velec = _mm256_sub_ps(rinv00,pmecorrV);
1941 velec = _mm256_mul_ps(qq00,velec);
1943 /* LENNARD-JONES DISPERSION/REPULSION */
1945 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1946 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
1947 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1948 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1949 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1951 d = _mm256_sub_ps(r00,rswitch);
1952 d = _mm256_max_ps(d,_mm256_setzero_ps());
1953 d2 = _mm256_mul_ps(d,d);
1954 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1956 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1958 /* Evaluate switch function */
1959 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1960 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1961 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1962 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1964 fscal = _mm256_add_ps(felec,fvdw);
1966 fscal = _mm256_and_ps(fscal,cutoff_mask);
1968 /* Calculate temporary vectorial force */
1969 tx = _mm256_mul_ps(fscal,dx00);
1970 ty = _mm256_mul_ps(fscal,dy00);
1971 tz = _mm256_mul_ps(fscal,dz00);
1973 /* Update vectorial force */
1974 fix0 = _mm256_add_ps(fix0,tx);
1975 fiy0 = _mm256_add_ps(fiy0,ty);
1976 fiz0 = _mm256_add_ps(fiz0,tz);
1978 fjx0 = _mm256_add_ps(fjx0,tx);
1979 fjy0 = _mm256_add_ps(fjy0,ty);
1980 fjz0 = _mm256_add_ps(fjz0,tz);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1991 r01 = _mm256_mul_ps(rsq01,rinv01);
1993 /* EWALD ELECTROSTATICS */
1995 /* Analytical PME correction */
1996 zeta2 = _mm256_mul_ps(beta2,rsq01);
1997 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
1998 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1999 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2000 felec = _mm256_mul_ps(qq01,felec);
2001 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2002 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2003 velec = _mm256_sub_ps(rinv01,pmecorrV);
2004 velec = _mm256_mul_ps(qq01,velec);
2006 d = _mm256_sub_ps(r01,rswitch);
2007 d = _mm256_max_ps(d,_mm256_setzero_ps());
2008 d2 = _mm256_mul_ps(d,d);
2009 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2011 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2013 /* Evaluate switch function */
2014 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2015 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
2016 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2020 fscal = _mm256_and_ps(fscal,cutoff_mask);
2022 /* Calculate temporary vectorial force */
2023 tx = _mm256_mul_ps(fscal,dx01);
2024 ty = _mm256_mul_ps(fscal,dy01);
2025 tz = _mm256_mul_ps(fscal,dz01);
2027 /* Update vectorial force */
2028 fix0 = _mm256_add_ps(fix0,tx);
2029 fiy0 = _mm256_add_ps(fiy0,ty);
2030 fiz0 = _mm256_add_ps(fiz0,tz);
2032 fjx1 = _mm256_add_ps(fjx1,tx);
2033 fjy1 = _mm256_add_ps(fjy1,ty);
2034 fjz1 = _mm256_add_ps(fjz1,tz);
2038 /**************************
2039 * CALCULATE INTERACTIONS *
2040 **************************/
2042 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2045 r02 = _mm256_mul_ps(rsq02,rinv02);
2047 /* EWALD ELECTROSTATICS */
2049 /* Analytical PME correction */
2050 zeta2 = _mm256_mul_ps(beta2,rsq02);
2051 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
2052 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2053 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2054 felec = _mm256_mul_ps(qq02,felec);
2055 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2056 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2057 velec = _mm256_sub_ps(rinv02,pmecorrV);
2058 velec = _mm256_mul_ps(qq02,velec);
2060 d = _mm256_sub_ps(r02,rswitch);
2061 d = _mm256_max_ps(d,_mm256_setzero_ps());
2062 d2 = _mm256_mul_ps(d,d);
2063 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2065 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2067 /* Evaluate switch function */
2068 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2069 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
2070 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2074 fscal = _mm256_and_ps(fscal,cutoff_mask);
2076 /* Calculate temporary vectorial force */
2077 tx = _mm256_mul_ps(fscal,dx02);
2078 ty = _mm256_mul_ps(fscal,dy02);
2079 tz = _mm256_mul_ps(fscal,dz02);
2081 /* Update vectorial force */
2082 fix0 = _mm256_add_ps(fix0,tx);
2083 fiy0 = _mm256_add_ps(fiy0,ty);
2084 fiz0 = _mm256_add_ps(fiz0,tz);
2086 fjx2 = _mm256_add_ps(fjx2,tx);
2087 fjy2 = _mm256_add_ps(fjy2,ty);
2088 fjz2 = _mm256_add_ps(fjz2,tz);
2092 /**************************
2093 * CALCULATE INTERACTIONS *
2094 **************************/
2096 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2099 r10 = _mm256_mul_ps(rsq10,rinv10);
2101 /* EWALD ELECTROSTATICS */
2103 /* Analytical PME correction */
2104 zeta2 = _mm256_mul_ps(beta2,rsq10);
2105 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
2106 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2107 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2108 felec = _mm256_mul_ps(qq10,felec);
2109 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2110 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2111 velec = _mm256_sub_ps(rinv10,pmecorrV);
2112 velec = _mm256_mul_ps(qq10,velec);
2114 d = _mm256_sub_ps(r10,rswitch);
2115 d = _mm256_max_ps(d,_mm256_setzero_ps());
2116 d2 = _mm256_mul_ps(d,d);
2117 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2119 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2121 /* Evaluate switch function */
2122 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2123 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2124 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2128 fscal = _mm256_and_ps(fscal,cutoff_mask);
2130 /* Calculate temporary vectorial force */
2131 tx = _mm256_mul_ps(fscal,dx10);
2132 ty = _mm256_mul_ps(fscal,dy10);
2133 tz = _mm256_mul_ps(fscal,dz10);
2135 /* Update vectorial force */
2136 fix1 = _mm256_add_ps(fix1,tx);
2137 fiy1 = _mm256_add_ps(fiy1,ty);
2138 fiz1 = _mm256_add_ps(fiz1,tz);
2140 fjx0 = _mm256_add_ps(fjx0,tx);
2141 fjy0 = _mm256_add_ps(fjy0,ty);
2142 fjz0 = _mm256_add_ps(fjz0,tz);
2146 /**************************
2147 * CALCULATE INTERACTIONS *
2148 **************************/
2150 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2153 r11 = _mm256_mul_ps(rsq11,rinv11);
2155 /* EWALD ELECTROSTATICS */
2157 /* Analytical PME correction */
2158 zeta2 = _mm256_mul_ps(beta2,rsq11);
2159 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2160 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2161 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2162 felec = _mm256_mul_ps(qq11,felec);
2163 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2164 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2165 velec = _mm256_sub_ps(rinv11,pmecorrV);
2166 velec = _mm256_mul_ps(qq11,velec);
2168 d = _mm256_sub_ps(r11,rswitch);
2169 d = _mm256_max_ps(d,_mm256_setzero_ps());
2170 d2 = _mm256_mul_ps(d,d);
2171 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2173 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2175 /* Evaluate switch function */
2176 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2177 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2178 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2182 fscal = _mm256_and_ps(fscal,cutoff_mask);
2184 /* Calculate temporary vectorial force */
2185 tx = _mm256_mul_ps(fscal,dx11);
2186 ty = _mm256_mul_ps(fscal,dy11);
2187 tz = _mm256_mul_ps(fscal,dz11);
2189 /* Update vectorial force */
2190 fix1 = _mm256_add_ps(fix1,tx);
2191 fiy1 = _mm256_add_ps(fiy1,ty);
2192 fiz1 = _mm256_add_ps(fiz1,tz);
2194 fjx1 = _mm256_add_ps(fjx1,tx);
2195 fjy1 = _mm256_add_ps(fjy1,ty);
2196 fjz1 = _mm256_add_ps(fjz1,tz);
2200 /**************************
2201 * CALCULATE INTERACTIONS *
2202 **************************/
2204 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2207 r12 = _mm256_mul_ps(rsq12,rinv12);
2209 /* EWALD ELECTROSTATICS */
2211 /* Analytical PME correction */
2212 zeta2 = _mm256_mul_ps(beta2,rsq12);
2213 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2214 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2215 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2216 felec = _mm256_mul_ps(qq12,felec);
2217 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2218 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2219 velec = _mm256_sub_ps(rinv12,pmecorrV);
2220 velec = _mm256_mul_ps(qq12,velec);
2222 d = _mm256_sub_ps(r12,rswitch);
2223 d = _mm256_max_ps(d,_mm256_setzero_ps());
2224 d2 = _mm256_mul_ps(d,d);
2225 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2227 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2229 /* Evaluate switch function */
2230 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2231 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2232 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2236 fscal = _mm256_and_ps(fscal,cutoff_mask);
2238 /* Calculate temporary vectorial force */
2239 tx = _mm256_mul_ps(fscal,dx12);
2240 ty = _mm256_mul_ps(fscal,dy12);
2241 tz = _mm256_mul_ps(fscal,dz12);
2243 /* Update vectorial force */
2244 fix1 = _mm256_add_ps(fix1,tx);
2245 fiy1 = _mm256_add_ps(fiy1,ty);
2246 fiz1 = _mm256_add_ps(fiz1,tz);
2248 fjx2 = _mm256_add_ps(fjx2,tx);
2249 fjy2 = _mm256_add_ps(fjy2,ty);
2250 fjz2 = _mm256_add_ps(fjz2,tz);
2254 /**************************
2255 * CALCULATE INTERACTIONS *
2256 **************************/
2258 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2261 r20 = _mm256_mul_ps(rsq20,rinv20);
2263 /* EWALD ELECTROSTATICS */
2265 /* Analytical PME correction */
2266 zeta2 = _mm256_mul_ps(beta2,rsq20);
2267 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
2268 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2269 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2270 felec = _mm256_mul_ps(qq20,felec);
2271 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2272 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2273 velec = _mm256_sub_ps(rinv20,pmecorrV);
2274 velec = _mm256_mul_ps(qq20,velec);
2276 d = _mm256_sub_ps(r20,rswitch);
2277 d = _mm256_max_ps(d,_mm256_setzero_ps());
2278 d2 = _mm256_mul_ps(d,d);
2279 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2281 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2283 /* Evaluate switch function */
2284 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2285 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
2286 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2290 fscal = _mm256_and_ps(fscal,cutoff_mask);
2292 /* Calculate temporary vectorial force */
2293 tx = _mm256_mul_ps(fscal,dx20);
2294 ty = _mm256_mul_ps(fscal,dy20);
2295 tz = _mm256_mul_ps(fscal,dz20);
2297 /* Update vectorial force */
2298 fix2 = _mm256_add_ps(fix2,tx);
2299 fiy2 = _mm256_add_ps(fiy2,ty);
2300 fiz2 = _mm256_add_ps(fiz2,tz);
2302 fjx0 = _mm256_add_ps(fjx0,tx);
2303 fjy0 = _mm256_add_ps(fjy0,ty);
2304 fjz0 = _mm256_add_ps(fjz0,tz);
2308 /**************************
2309 * CALCULATE INTERACTIONS *
2310 **************************/
2312 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2315 r21 = _mm256_mul_ps(rsq21,rinv21);
2317 /* EWALD ELECTROSTATICS */
2319 /* Analytical PME correction */
2320 zeta2 = _mm256_mul_ps(beta2,rsq21);
2321 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2322 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2323 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2324 felec = _mm256_mul_ps(qq21,felec);
2325 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2326 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2327 velec = _mm256_sub_ps(rinv21,pmecorrV);
2328 velec = _mm256_mul_ps(qq21,velec);
2330 d = _mm256_sub_ps(r21,rswitch);
2331 d = _mm256_max_ps(d,_mm256_setzero_ps());
2332 d2 = _mm256_mul_ps(d,d);
2333 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2335 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2337 /* Evaluate switch function */
2338 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2339 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2340 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2344 fscal = _mm256_and_ps(fscal,cutoff_mask);
2346 /* Calculate temporary vectorial force */
2347 tx = _mm256_mul_ps(fscal,dx21);
2348 ty = _mm256_mul_ps(fscal,dy21);
2349 tz = _mm256_mul_ps(fscal,dz21);
2351 /* Update vectorial force */
2352 fix2 = _mm256_add_ps(fix2,tx);
2353 fiy2 = _mm256_add_ps(fiy2,ty);
2354 fiz2 = _mm256_add_ps(fiz2,tz);
2356 fjx1 = _mm256_add_ps(fjx1,tx);
2357 fjy1 = _mm256_add_ps(fjy1,ty);
2358 fjz1 = _mm256_add_ps(fjz1,tz);
2362 /**************************
2363 * CALCULATE INTERACTIONS *
2364 **************************/
2366 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2369 r22 = _mm256_mul_ps(rsq22,rinv22);
2371 /* EWALD ELECTROSTATICS */
2373 /* Analytical PME correction */
2374 zeta2 = _mm256_mul_ps(beta2,rsq22);
2375 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2376 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2377 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2378 felec = _mm256_mul_ps(qq22,felec);
2379 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2380 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2381 velec = _mm256_sub_ps(rinv22,pmecorrV);
2382 velec = _mm256_mul_ps(qq22,velec);
2384 d = _mm256_sub_ps(r22,rswitch);
2385 d = _mm256_max_ps(d,_mm256_setzero_ps());
2386 d2 = _mm256_mul_ps(d,d);
2387 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2389 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2391 /* Evaluate switch function */
2392 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2393 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2394 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2398 fscal = _mm256_and_ps(fscal,cutoff_mask);
2400 /* Calculate temporary vectorial force */
2401 tx = _mm256_mul_ps(fscal,dx22);
2402 ty = _mm256_mul_ps(fscal,dy22);
2403 tz = _mm256_mul_ps(fscal,dz22);
2405 /* Update vectorial force */
2406 fix2 = _mm256_add_ps(fix2,tx);
2407 fiy2 = _mm256_add_ps(fiy2,ty);
2408 fiz2 = _mm256_add_ps(fiz2,tz);
2410 fjx2 = _mm256_add_ps(fjx2,tx);
2411 fjy2 = _mm256_add_ps(fjy2,ty);
2412 fjz2 = _mm256_add_ps(fjz2,tz);
2416 fjptrA = f+j_coord_offsetA;
2417 fjptrB = f+j_coord_offsetB;
2418 fjptrC = f+j_coord_offsetC;
2419 fjptrD = f+j_coord_offsetD;
2420 fjptrE = f+j_coord_offsetE;
2421 fjptrF = f+j_coord_offsetF;
2422 fjptrG = f+j_coord_offsetG;
2423 fjptrH = f+j_coord_offsetH;
2425 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2426 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2428 /* Inner loop uses 960 flops */
2431 if(jidx<j_index_end)
2434 /* Get j neighbor index, and coordinate index */
2435 jnrlistA = jjnr[jidx];
2436 jnrlistB = jjnr[jidx+1];
2437 jnrlistC = jjnr[jidx+2];
2438 jnrlistD = jjnr[jidx+3];
2439 jnrlistE = jjnr[jidx+4];
2440 jnrlistF = jjnr[jidx+5];
2441 jnrlistG = jjnr[jidx+6];
2442 jnrlistH = jjnr[jidx+7];
2443 /* Sign of each element will be negative for non-real atoms.
2444 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2445 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2447 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2448 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2450 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2451 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2452 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2453 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2454 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2455 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2456 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2457 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2458 j_coord_offsetA = DIM*jnrA;
2459 j_coord_offsetB = DIM*jnrB;
2460 j_coord_offsetC = DIM*jnrC;
2461 j_coord_offsetD = DIM*jnrD;
2462 j_coord_offsetE = DIM*jnrE;
2463 j_coord_offsetF = DIM*jnrF;
2464 j_coord_offsetG = DIM*jnrG;
2465 j_coord_offsetH = DIM*jnrH;
2467 /* load j atom coordinates */
2468 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2469 x+j_coord_offsetC,x+j_coord_offsetD,
2470 x+j_coord_offsetE,x+j_coord_offsetF,
2471 x+j_coord_offsetG,x+j_coord_offsetH,
2472 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2474 /* Calculate displacement vector */
2475 dx00 = _mm256_sub_ps(ix0,jx0);
2476 dy00 = _mm256_sub_ps(iy0,jy0);
2477 dz00 = _mm256_sub_ps(iz0,jz0);
2478 dx01 = _mm256_sub_ps(ix0,jx1);
2479 dy01 = _mm256_sub_ps(iy0,jy1);
2480 dz01 = _mm256_sub_ps(iz0,jz1);
2481 dx02 = _mm256_sub_ps(ix0,jx2);
2482 dy02 = _mm256_sub_ps(iy0,jy2);
2483 dz02 = _mm256_sub_ps(iz0,jz2);
2484 dx10 = _mm256_sub_ps(ix1,jx0);
2485 dy10 = _mm256_sub_ps(iy1,jy0);
2486 dz10 = _mm256_sub_ps(iz1,jz0);
2487 dx11 = _mm256_sub_ps(ix1,jx1);
2488 dy11 = _mm256_sub_ps(iy1,jy1);
2489 dz11 = _mm256_sub_ps(iz1,jz1);
2490 dx12 = _mm256_sub_ps(ix1,jx2);
2491 dy12 = _mm256_sub_ps(iy1,jy2);
2492 dz12 = _mm256_sub_ps(iz1,jz2);
2493 dx20 = _mm256_sub_ps(ix2,jx0);
2494 dy20 = _mm256_sub_ps(iy2,jy0);
2495 dz20 = _mm256_sub_ps(iz2,jz0);
2496 dx21 = _mm256_sub_ps(ix2,jx1);
2497 dy21 = _mm256_sub_ps(iy2,jy1);
2498 dz21 = _mm256_sub_ps(iz2,jz1);
2499 dx22 = _mm256_sub_ps(ix2,jx2);
2500 dy22 = _mm256_sub_ps(iy2,jy2);
2501 dz22 = _mm256_sub_ps(iz2,jz2);
2503 /* Calculate squared distance and things based on it */
2504 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2505 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2506 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2507 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2508 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2509 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2510 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2511 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2512 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2514 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2515 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2516 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2517 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2518 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2519 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2520 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2521 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2522 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2524 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2525 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
2526 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
2527 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
2528 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2529 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2530 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
2531 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2532 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2534 fjx0 = _mm256_setzero_ps();
2535 fjy0 = _mm256_setzero_ps();
2536 fjz0 = _mm256_setzero_ps();
2537 fjx1 = _mm256_setzero_ps();
2538 fjy1 = _mm256_setzero_ps();
2539 fjz1 = _mm256_setzero_ps();
2540 fjx2 = _mm256_setzero_ps();
2541 fjy2 = _mm256_setzero_ps();
2542 fjz2 = _mm256_setzero_ps();
2544 /**************************
2545 * CALCULATE INTERACTIONS *
2546 **************************/
2548 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2551 r00 = _mm256_mul_ps(rsq00,rinv00);
2552 r00 = _mm256_andnot_ps(dummy_mask,r00);
2554 /* EWALD ELECTROSTATICS */
2556 /* Analytical PME correction */
2557 zeta2 = _mm256_mul_ps(beta2,rsq00);
2558 rinv3 = _mm256_mul_ps(rinvsq00,rinv00);
2559 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2560 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2561 felec = _mm256_mul_ps(qq00,felec);
2562 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2563 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2564 velec = _mm256_sub_ps(rinv00,pmecorrV);
2565 velec = _mm256_mul_ps(qq00,velec);
2567 /* LENNARD-JONES DISPERSION/REPULSION */
2569 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2570 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
2571 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
2572 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
2573 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
2575 d = _mm256_sub_ps(r00,rswitch);
2576 d = _mm256_max_ps(d,_mm256_setzero_ps());
2577 d2 = _mm256_mul_ps(d,d);
2578 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2580 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2582 /* Evaluate switch function */
2583 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2584 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
2585 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
2586 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2588 fscal = _mm256_add_ps(felec,fvdw);
2590 fscal = _mm256_and_ps(fscal,cutoff_mask);
2592 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2594 /* Calculate temporary vectorial force */
2595 tx = _mm256_mul_ps(fscal,dx00);
2596 ty = _mm256_mul_ps(fscal,dy00);
2597 tz = _mm256_mul_ps(fscal,dz00);
2599 /* Update vectorial force */
2600 fix0 = _mm256_add_ps(fix0,tx);
2601 fiy0 = _mm256_add_ps(fiy0,ty);
2602 fiz0 = _mm256_add_ps(fiz0,tz);
2604 fjx0 = _mm256_add_ps(fjx0,tx);
2605 fjy0 = _mm256_add_ps(fjy0,ty);
2606 fjz0 = _mm256_add_ps(fjz0,tz);
2610 /**************************
2611 * CALCULATE INTERACTIONS *
2612 **************************/
2614 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2617 r01 = _mm256_mul_ps(rsq01,rinv01);
2618 r01 = _mm256_andnot_ps(dummy_mask,r01);
2620 /* EWALD ELECTROSTATICS */
2622 /* Analytical PME correction */
2623 zeta2 = _mm256_mul_ps(beta2,rsq01);
2624 rinv3 = _mm256_mul_ps(rinvsq01,rinv01);
2625 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2626 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2627 felec = _mm256_mul_ps(qq01,felec);
2628 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2629 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2630 velec = _mm256_sub_ps(rinv01,pmecorrV);
2631 velec = _mm256_mul_ps(qq01,velec);
2633 d = _mm256_sub_ps(r01,rswitch);
2634 d = _mm256_max_ps(d,_mm256_setzero_ps());
2635 d2 = _mm256_mul_ps(d,d);
2636 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2638 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2640 /* Evaluate switch function */
2641 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2642 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
2643 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2647 fscal = _mm256_and_ps(fscal,cutoff_mask);
2649 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2651 /* Calculate temporary vectorial force */
2652 tx = _mm256_mul_ps(fscal,dx01);
2653 ty = _mm256_mul_ps(fscal,dy01);
2654 tz = _mm256_mul_ps(fscal,dz01);
2656 /* Update vectorial force */
2657 fix0 = _mm256_add_ps(fix0,tx);
2658 fiy0 = _mm256_add_ps(fiy0,ty);
2659 fiz0 = _mm256_add_ps(fiz0,tz);
2661 fjx1 = _mm256_add_ps(fjx1,tx);
2662 fjy1 = _mm256_add_ps(fjy1,ty);
2663 fjz1 = _mm256_add_ps(fjz1,tz);
2667 /**************************
2668 * CALCULATE INTERACTIONS *
2669 **************************/
2671 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2674 r02 = _mm256_mul_ps(rsq02,rinv02);
2675 r02 = _mm256_andnot_ps(dummy_mask,r02);
2677 /* EWALD ELECTROSTATICS */
2679 /* Analytical PME correction */
2680 zeta2 = _mm256_mul_ps(beta2,rsq02);
2681 rinv3 = _mm256_mul_ps(rinvsq02,rinv02);
2682 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2683 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2684 felec = _mm256_mul_ps(qq02,felec);
2685 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2686 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2687 velec = _mm256_sub_ps(rinv02,pmecorrV);
2688 velec = _mm256_mul_ps(qq02,velec);
2690 d = _mm256_sub_ps(r02,rswitch);
2691 d = _mm256_max_ps(d,_mm256_setzero_ps());
2692 d2 = _mm256_mul_ps(d,d);
2693 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2695 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2697 /* Evaluate switch function */
2698 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2699 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
2700 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2704 fscal = _mm256_and_ps(fscal,cutoff_mask);
2706 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2708 /* Calculate temporary vectorial force */
2709 tx = _mm256_mul_ps(fscal,dx02);
2710 ty = _mm256_mul_ps(fscal,dy02);
2711 tz = _mm256_mul_ps(fscal,dz02);
2713 /* Update vectorial force */
2714 fix0 = _mm256_add_ps(fix0,tx);
2715 fiy0 = _mm256_add_ps(fiy0,ty);
2716 fiz0 = _mm256_add_ps(fiz0,tz);
2718 fjx2 = _mm256_add_ps(fjx2,tx);
2719 fjy2 = _mm256_add_ps(fjy2,ty);
2720 fjz2 = _mm256_add_ps(fjz2,tz);
2724 /**************************
2725 * CALCULATE INTERACTIONS *
2726 **************************/
/* i-atom 1 / j-atom 0 pair: Ewald electrostatics with a potential-switch
 * cutoff. (W3W3 geometry: atom 0 is presumably the water oxygen and atoms
 * 1-2 the hydrogens - TODO confirm against the water model in use.)
 * Skip the whole section if no SIMD lane is inside the cutoff. */
2728 if (gmx_mm256_any_lt(rsq10,rcutoff2))
/* r = r^2 * (1/r); dummy_mask zeroes padded lanes so they cannot feed
 * garbage into the switch polynomial below. */
2731 r10 = _mm256_mul_ps(rsq10,rinv10);
2732 r10 = _mm256_andnot_ps(dummy_mask,r10);
2734 /* EWALD ELECTROSTATICS */
2736 /* Analytical PME correction */
/* Force part: felec = qq * (pmecorrF(beta^2*r^2)*beta^3 + 1/r^3). */
2737 zeta2 = _mm256_mul_ps(beta2,rsq10);
2738 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
2739 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2740 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2741 felec = _mm256_mul_ps(qq10,felec);
/* Potential part: velec = qq * (1/r - beta*pmecorrV(beta^2*r^2)). */
2742 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2743 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2744 velec = _mm256_sub_ps(rinv10,pmecorrV);
2745 velec = _mm256_mul_ps(qq10,velec);
/* Potential-switch: d = max(r - rswitch, 0); sw is the quintic switch
 * polynomial in d, dsw its derivative (Horner form). */
2747 d = _mm256_sub_ps(r10,rswitch);
2748 d = _mm256_max_ps(d,_mm256_setzero_ps());
2749 d2 = _mm256_mul_ps(d,d);
2750 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2752 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2754 /* Evaluate switch function */
2755 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2756 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2757 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): the embedded original-line numbering jumps 2757->2761 here;
 * the generated kernel assigns fscal = felec at that point, and that line
 * appears to have been dropped from this extract - verify against the full
 * source before using this text as code. */
2761 fscal = _mm256_and_ps(fscal,cutoff_mask);
2763 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2765 /* Calculate temporary vectorial force */
2766 tx = _mm256_mul_ps(fscal,dx10);
2767 ty = _mm256_mul_ps(fscal,dy10);
2768 tz = _mm256_mul_ps(fscal,dz10);
2770 /* Update vectorial force */
2771 fix1 = _mm256_add_ps(fix1,tx);
2772 fiy1 = _mm256_add_ps(fiy1,ty);
2773 fiz1 = _mm256_add_ps(fiz1,tz);
2775 fjx0 = _mm256_add_ps(fjx0,tx);
2776 fjy0 = _mm256_add_ps(fjy0,ty);
2777 fjz0 = _mm256_add_ps(fjz0,tz);
2781 /**************************
2782 * CALCULATE INTERACTIONS *
2783 **************************/
/* i-atom 1 / j-atom 1 pair: same Ewald + potential-switch electrostatics
 * as the other eight W3W3 pair sections, using distance 1-1 and charge
 * product qq11. */
2785 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2788 r11 = _mm256_mul_ps(rsq11,rinv11);
/* dummy_mask zeroes padded SIMD lanes before the switch polynomial. */
2789 r11 = _mm256_andnot_ps(dummy_mask,r11);
2791 /* EWALD ELECTROSTATICS */
2793 /* Analytical PME correction */
2794 zeta2 = _mm256_mul_ps(beta2,rsq11);
2795 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2796 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2797 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2798 felec = _mm256_mul_ps(qq11,felec);
2799 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2800 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2801 velec = _mm256_sub_ps(rinv11,pmecorrV);
2802 velec = _mm256_mul_ps(qq11,velec);
/* Quintic potential-switch polynomial and its derivative in d = r - rswitch. */
2804 d = _mm256_sub_ps(r11,rswitch);
2805 d = _mm256_max_ps(d,_mm256_setzero_ps());
2806 d2 = _mm256_mul_ps(d,d);
2807 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2809 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2811 /* Evaluate switch function */
2812 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2813 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2814 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): numbering gap 2814->2818 suggests fscal = felec was dropped
 * from this extract - confirm against the full generated source. */
2818 fscal = _mm256_and_ps(fscal,cutoff_mask);
2820 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2822 /* Calculate temporary vectorial force */
2823 tx = _mm256_mul_ps(fscal,dx11);
2824 ty = _mm256_mul_ps(fscal,dy11);
2825 tz = _mm256_mul_ps(fscal,dz11);
2827 /* Update vectorial force */
2828 fix1 = _mm256_add_ps(fix1,tx);
2829 fiy1 = _mm256_add_ps(fiy1,ty);
2830 fiz1 = _mm256_add_ps(fiz1,tz);
2832 fjx1 = _mm256_add_ps(fjx1,tx);
2833 fjy1 = _mm256_add_ps(fjy1,ty);
2834 fjz1 = _mm256_add_ps(fjz1,tz);
2838 /**************************
2839 * CALCULATE INTERACTIONS *
2840 **************************/
/* i-atom 1 / j-atom 2 pair: Ewald + potential-switch electrostatics on
 * distance 1-2 with charge product qq12. */
2842 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2845 r12 = _mm256_mul_ps(rsq12,rinv12);
/* dummy_mask zeroes padded SIMD lanes before the switch polynomial. */
2846 r12 = _mm256_andnot_ps(dummy_mask,r12);
2848 /* EWALD ELECTROSTATICS */
2850 /* Analytical PME correction */
2851 zeta2 = _mm256_mul_ps(beta2,rsq12);
2852 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2853 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2854 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2855 felec = _mm256_mul_ps(qq12,felec);
2856 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2857 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2858 velec = _mm256_sub_ps(rinv12,pmecorrV);
2859 velec = _mm256_mul_ps(qq12,velec);
/* Quintic potential-switch polynomial and its derivative in d = r - rswitch. */
2861 d = _mm256_sub_ps(r12,rswitch);
2862 d = _mm256_max_ps(d,_mm256_setzero_ps());
2863 d2 = _mm256_mul_ps(d,d);
2864 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2866 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2868 /* Evaluate switch function */
2869 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2870 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2871 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): numbering gap 2871->2875 suggests fscal = felec was dropped
 * from this extract - confirm against the full generated source. */
2875 fscal = _mm256_and_ps(fscal,cutoff_mask);
2877 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2879 /* Calculate temporary vectorial force */
2880 tx = _mm256_mul_ps(fscal,dx12);
2881 ty = _mm256_mul_ps(fscal,dy12);
2882 tz = _mm256_mul_ps(fscal,dz12);
2884 /* Update vectorial force */
2885 fix1 = _mm256_add_ps(fix1,tx);
2886 fiy1 = _mm256_add_ps(fiy1,ty);
2887 fiz1 = _mm256_add_ps(fiz1,tz);
2889 fjx2 = _mm256_add_ps(fjx2,tx);
2890 fjy2 = _mm256_add_ps(fjy2,ty);
2891 fjz2 = _mm256_add_ps(fjz2,tz);
2895 /**************************
2896 * CALCULATE INTERACTIONS *
2897 **************************/
/* i-atom 2 / j-atom 0 pair: Ewald + potential-switch electrostatics on
 * distance 2-0 with charge product qq20. */
2899 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2902 r20 = _mm256_mul_ps(rsq20,rinv20);
/* dummy_mask zeroes padded SIMD lanes before the switch polynomial. */
2903 r20 = _mm256_andnot_ps(dummy_mask,r20);
2905 /* EWALD ELECTROSTATICS */
2907 /* Analytical PME correction */
2908 zeta2 = _mm256_mul_ps(beta2,rsq20);
2909 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
2910 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2911 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2912 felec = _mm256_mul_ps(qq20,felec);
2913 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2914 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2915 velec = _mm256_sub_ps(rinv20,pmecorrV);
2916 velec = _mm256_mul_ps(qq20,velec);
/* Quintic potential-switch polynomial and its derivative in d = r - rswitch. */
2918 d = _mm256_sub_ps(r20,rswitch);
2919 d = _mm256_max_ps(d,_mm256_setzero_ps());
2920 d2 = _mm256_mul_ps(d,d);
2921 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2923 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2925 /* Evaluate switch function */
2926 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2927 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
2928 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): numbering gap 2928->2932 suggests fscal = felec was dropped
 * from this extract - confirm against the full generated source. */
2932 fscal = _mm256_and_ps(fscal,cutoff_mask);
2934 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2936 /* Calculate temporary vectorial force */
2937 tx = _mm256_mul_ps(fscal,dx20);
2938 ty = _mm256_mul_ps(fscal,dy20);
2939 tz = _mm256_mul_ps(fscal,dz20);
2941 /* Update vectorial force */
2942 fix2 = _mm256_add_ps(fix2,tx);
2943 fiy2 = _mm256_add_ps(fiy2,ty);
2944 fiz2 = _mm256_add_ps(fiz2,tz);
2946 fjx0 = _mm256_add_ps(fjx0,tx);
2947 fjy0 = _mm256_add_ps(fjy0,ty);
2948 fjz0 = _mm256_add_ps(fjz0,tz);
2952 /**************************
2953 * CALCULATE INTERACTIONS *
2954 **************************/
/* i-atom 2 / j-atom 1 pair: Ewald + potential-switch electrostatics on
 * distance 2-1 with charge product qq21. */
2956 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2959 r21 = _mm256_mul_ps(rsq21,rinv21);
/* dummy_mask zeroes padded SIMD lanes before the switch polynomial. */
2960 r21 = _mm256_andnot_ps(dummy_mask,r21);
2962 /* EWALD ELECTROSTATICS */
2964 /* Analytical PME correction */
2965 zeta2 = _mm256_mul_ps(beta2,rsq21);
2966 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2967 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2968 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2969 felec = _mm256_mul_ps(qq21,felec);
2970 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2971 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2972 velec = _mm256_sub_ps(rinv21,pmecorrV);
2973 velec = _mm256_mul_ps(qq21,velec);
/* Quintic potential-switch polynomial and its derivative in d = r - rswitch. */
2975 d = _mm256_sub_ps(r21,rswitch);
2976 d = _mm256_max_ps(d,_mm256_setzero_ps());
2977 d2 = _mm256_mul_ps(d,d);
2978 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2980 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2982 /* Evaluate switch function */
2983 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2984 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2985 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): numbering gap 2985->2989 suggests fscal = felec was dropped
 * from this extract - confirm against the full generated source. */
2989 fscal = _mm256_and_ps(fscal,cutoff_mask);
2991 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2993 /* Calculate temporary vectorial force */
2994 tx = _mm256_mul_ps(fscal,dx21);
2995 ty = _mm256_mul_ps(fscal,dy21);
2996 tz = _mm256_mul_ps(fscal,dz21);
2998 /* Update vectorial force */
2999 fix2 = _mm256_add_ps(fix2,tx);
3000 fiy2 = _mm256_add_ps(fiy2,ty);
3001 fiz2 = _mm256_add_ps(fiz2,tz);
3003 fjx1 = _mm256_add_ps(fjx1,tx);
3004 fjy1 = _mm256_add_ps(fjy1,ty);
3005 fjz1 = _mm256_add_ps(fjz1,tz);
3009 /**************************
3010 * CALCULATE INTERACTIONS *
3011 **************************/
/* i-atom 2 / j-atom 2 pair: Ewald + potential-switch electrostatics on
 * distance 2-2 with charge product qq22 (last of the nine W3W3 pairs). */
3013 if (gmx_mm256_any_lt(rsq22,rcutoff2))
3016 r22 = _mm256_mul_ps(rsq22,rinv22);
/* dummy_mask zeroes padded SIMD lanes before the switch polynomial. */
3017 r22 = _mm256_andnot_ps(dummy_mask,r22);
3019 /* EWALD ELECTROSTATICS */
3021 /* Analytical PME correction */
3022 zeta2 = _mm256_mul_ps(beta2,rsq22);
3023 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
3024 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
3025 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3026 felec = _mm256_mul_ps(qq22,felec);
3027 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
3028 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
3029 velec = _mm256_sub_ps(rinv22,pmecorrV);
3030 velec = _mm256_mul_ps(qq22,velec);
/* Quintic potential-switch polynomial and its derivative in d = r - rswitch. */
3032 d = _mm256_sub_ps(r22,rswitch);
3033 d = _mm256_max_ps(d,_mm256_setzero_ps());
3034 d2 = _mm256_mul_ps(d,d);
3035 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3037 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3039 /* Evaluate switch function */
3040 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3041 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
3042 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
/* NOTE(review): numbering gap 3042->3046 suggests fscal = felec was dropped
 * from this extract - confirm against the full generated source. */
3046 fscal = _mm256_and_ps(fscal,cutoff_mask);
3048 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3050 /* Calculate temporary vectorial force */
3051 tx = _mm256_mul_ps(fscal,dx22);
3052 ty = _mm256_mul_ps(fscal,dy22);
3053 tz = _mm256_mul_ps(fscal,dz22);
3055 /* Update vectorial force */
3056 fix2 = _mm256_add_ps(fix2,tx);
3057 fiy2 = _mm256_add_ps(fiy2,ty);
3058 fiz2 = _mm256_add_ps(fiz2,tz);
3060 fjx2 = _mm256_add_ps(fjx2,tx);
3061 fjy2 = _mm256_add_ps(fjy2,ty);
3062 fjz2 = _mm256_add_ps(fjz2,tz);
/* Epilogue of the (masked) inner loop: pick a destination pointer per SIMD
 * lane - the real force array for valid j entries, a scratch buffer for
 * padded lanes (jnrlist < 0) so their writes are harmlessly discarded. */
3066 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3067 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3068 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3069 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3070 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3071 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3072 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3073 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
/* Transpose the 3 accumulated j-atom force vectors out of SIMD layout and
 * subtract them from the 8 per-lane force arrays (Newton's third law). */
3075 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
3076 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3078 /* Inner loop uses 969 flops */
3081 /* End of innermost loop */
/* Reduce the i-atom force accumulators across lanes and add them to the
 * force and shift-force arrays for this i entry. */
3083 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3084 f+i_coord_offset,fshift+i_shift_offset);
3086 /* Increment number of inner iterations */
3087 inneriter += j_index_end - j_index_start;
3089 /* Outer loop uses 18 flops */
3092 /* Increment number of outer iterations */
3095 /* Update outer/inner flops */
/* Record flop counts for performance accounting (nrnb bookkeeping). */
3097 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*969);