2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Particle
57 * Calculate force/pot: PotentialAndForce
/* Nonbonded kernel that accumulates both potentials and forces for a four-site
 * water i-molecule interacting with single j particles, eight j particles at a
 * time in 256-bit single-precision registers.
 *
 * Site 0 of the i-water takes part only in the Lennard-Jones interaction and
 * sites 1-3 only in the Ewald electrostatics. Every pair term is multiplied by
 * the same switch polynomial in d = max(r - fr->rcoulomb_switch, 0) and masked
 * to zero where rsq is not below fr->rcoulomb squared.
 *
 * nlist       - neighbor list; jindex, shift and iinr are read from it.
 * xx, ff      - coordinate and force arrays. NOTE(review): the statements binding
 *               them to the local x and f pointers are not shown here - confirm.
 * fr          - supplies shift_vec, fshift, epsfac, ntype, rcoulomb,
 *               rcoulomb_switch and the Ewald constants in fr->ic.
 * mdatoms     - supplies chargeA and typeA.
 * kernel_data - energygrp_elec+ggid and energygrp_vdw+ggid are passed to
 *               gmx_mm256_update_1pot_ps together with the summed potentials.
 * nrnb        - flop accounting through inc_nrnb().
 */
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the eight positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr0;
89 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90 real * vdwioffsetptr1;
91 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92 real * vdwioffsetptr2;
93 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94 real * vdwioffsetptr3;
95 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
96 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
97 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
98 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
102 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
105 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
109 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
111 __m128i ewitab_lo,ewitab_hi;
112 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
113 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
115 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
116 real rswitch_scalar,d_scalar;
117 __m256 dummy_mask,cutoff_mask;
118 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
119 __m256 one = _mm256_set1_ps(1.0);
120 __m256 two = _mm256_set1_ps(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm256_set1_ps(fr->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
136 vdwtype = mdatoms->typeA;
138 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
139 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
140 beta2 = _mm256_mul_ps(beta,beta);
141 beta3 = _mm256_mul_ps(beta,beta2);
143 ewtab = fr->ic->tabq_coul_FDV0;
144 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
145 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
147 /* Setup water-specific parameters: site 1-3 charges scaled by facel and the site 0 offset into vdwparam, all taken from the first i entry */
148 inr = nlist->iinr[0];
149 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
150 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
151 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
152 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->rcoulomb;
156 rcutoff = _mm256_set1_ps(rcutoff_scalar);
157 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
159 rswitch_scalar = fr->rcoulomb_switch;
160 rswitch = _mm256_set1_ps(rswitch_scalar);
161 /* Setup switch parameters */
/* swV3..swV5 and swF2..swF4 are the coefficients of the sw and dsw polynomials evaluated in the
 * inner loops; each one is a constant divided by a power of d_scalar = rcutoff_scalar-rswitch_scalar.
 */
162 d_scalar = rcutoff_scalar-rswitch_scalar;
163 d = _mm256_set1_ps(d_scalar);
164 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
165 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
166 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
167 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
168 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
171 /* Give the j indices a defined value up front to avoid compiler warnings */
172 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
/* NOTE(review): the statement controlled by the next loop is not shown; presumably it zeroes
 * the scratch buffer that receives the forces of dummy j entries - confirm.
 */
185 for(iidx=0;iidx<4*DIM;iidx++)
190 /* Start outer loop over neighborlists */
191 for(iidx=0; iidx<nri; iidx++)
193 /* Load shift vector for this list */
194 i_shift_offset = DIM*shiftidx[iidx];
196 /* Load limits for loop over neighbors */
197 j_index_start = jindex[iidx];
198 j_index_end = jindex[iidx+1];
200 /* Get outer coordinate index */
202 i_coord_offset = DIM*inr;
204 /* Load i particle coords and add shift vector */
205 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
206 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Clear the force accumulators of the four i sites for this list entry */
208 fix0 = _mm256_setzero_ps();
209 fiy0 = _mm256_setzero_ps();
210 fiz0 = _mm256_setzero_ps();
211 fix1 = _mm256_setzero_ps();
212 fiy1 = _mm256_setzero_ps();
213 fiz1 = _mm256_setzero_ps();
214 fix2 = _mm256_setzero_ps();
215 fiy2 = _mm256_setzero_ps();
216 fiz2 = _mm256_setzero_ps();
217 fix3 = _mm256_setzero_ps();
218 fiy3 = _mm256_setzero_ps();
219 fiz3 = _mm256_setzero_ps();
221 /* Reset potential sums */
222 velecsum = _mm256_setzero_ps();
223 vvdwsum = _mm256_setzero_ps();
225 /* Start inner kernel loop */
/* Full-width pass: advances eight j entries per iteration and continues only while the
 * eighth entry of the current group is non-negative, so no dummy masking is applied here.
 */
226 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
229 /* Get j neighbor index, and coordinate index */
238 j_coord_offsetA = DIM*jnrA;
239 j_coord_offsetB = DIM*jnrB;
240 j_coord_offsetC = DIM*jnrC;
241 j_coord_offsetD = DIM*jnrD;
242 j_coord_offsetE = DIM*jnrE;
243 j_coord_offsetF = DIM*jnrF;
244 j_coord_offsetG = DIM*jnrG;
245 j_coord_offsetH = DIM*jnrH;
247 /* load j atom coordinates */
248 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
249 x+j_coord_offsetC,x+j_coord_offsetD,
250 x+j_coord_offsetE,x+j_coord_offsetF,
251 x+j_coord_offsetG,x+j_coord_offsetH,
254 /* Calculate displacement vector */
255 dx00 = _mm256_sub_ps(ix0,jx0);
256 dy00 = _mm256_sub_ps(iy0,jy0);
257 dz00 = _mm256_sub_ps(iz0,jz0);
258 dx10 = _mm256_sub_ps(ix1,jx0);
259 dy10 = _mm256_sub_ps(iy1,jy0);
260 dz10 = _mm256_sub_ps(iz1,jz0);
261 dx20 = _mm256_sub_ps(ix2,jx0);
262 dy20 = _mm256_sub_ps(iy2,jy0);
263 dz20 = _mm256_sub_ps(iz2,jz0);
264 dx30 = _mm256_sub_ps(ix3,jx0);
265 dy30 = _mm256_sub_ps(iy3,jy0);
266 dz30 = _mm256_sub_ps(iz3,jz0);
268 /* Calculate squared distance and things based on it */
269 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
270 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
271 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
272 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
274 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
275 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
276 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
277 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
279 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
280 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
281 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
282 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
284 /* Load parameters for j particles */
285 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
286 charge+jnrC+0,charge+jnrD+0,
287 charge+jnrE+0,charge+jnrF+0,
288 charge+jnrG+0,charge+jnrH+0);
289 vdwjidx0A = 2*vdwtype[jnrA+0];
290 vdwjidx0B = 2*vdwtype[jnrB+0];
291 vdwjidx0C = 2*vdwtype[jnrC+0];
292 vdwjidx0D = 2*vdwtype[jnrD+0];
293 vdwjidx0E = 2*vdwtype[jnrE+0];
294 vdwjidx0F = 2*vdwtype[jnrF+0];
295 vdwjidx0G = 2*vdwtype[jnrG+0];
296 vdwjidx0H = 2*vdwtype[jnrH+0];
/* The j force accumulators collect the contributions of all four i sites */
298 fjx0 = _mm256_setzero_ps();
299 fjy0 = _mm256_setzero_ps();
300 fjz0 = _mm256_setzero_ps();
302 /**************************
303 * CALCULATE INTERACTIONS *
304 **************************/
/* i site 0 with j: Lennard-Jones only.
 * NOTE(review): gmx_mm256_any_lt presumably reports whether any lane has rsq00 below
 * rcutoff2, letting the block be skipped when no lane is in range - confirm.
 */
306 if (gmx_mm256_any_lt(rsq00,rcutoff2))
309 r00 = _mm256_mul_ps(rsq00,rinv00);
311 /* Compute parameters for interactions between i and j atoms */
312 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
313 vdwioffsetptr0+vdwjidx0B,
314 vdwioffsetptr0+vdwjidx0C,
315 vdwioffsetptr0+vdwjidx0D,
316 vdwioffsetptr0+vdwjidx0E,
317 vdwioffsetptr0+vdwjidx0F,
318 vdwioffsetptr0+vdwjidx0G,
319 vdwioffsetptr0+vdwjidx0H,
322 /* LENNARD-JONES DISPERSION/REPULSION */
/* vvdw = c12_00*rinvsix^2/12 - c6_00*rinvsix/6 and fvdw = (c12_00*rinvsix^2 - c6_00*rinvsix)*rinvsq00 */
324 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
325 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
326 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
327 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
328 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
/* Switch polynomial: d = max(r - rswitch, 0),
 * sw = 1 + d^3*(swV3 + d*(swV4 + d*swV5)), dsw = d^2*(swF2 + d*(swF3 + d*swF4)).
 * The clamp makes d zero, hence sw one and dsw zero, for r not above rswitch.
 */
330 d = _mm256_sub_ps(r00,rswitch);
331 d = _mm256_max_ps(d,_mm256_setzero_ps());
332 d2 = _mm256_mul_ps(d,d);
333 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
335 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
337 /* Evaluate switch function */
338 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
/* The force is updated before the potential because it needs the unswitched vvdw */
339 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
340 vvdw = _mm256_mul_ps(vvdw,sw);
/* All-ones in lanes where rsq00 < rcutoff2 (ordered, non-signaling compare), zero elsewhere */
341 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
345 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
/* Zero the scalar force in lanes outside the cutoff.
 * NOTE(review): the statement that sets fscal for this pair is not shown - confirm it is fvdw.
 */
349 fscal = _mm256_and_ps(fscal,cutoff_mask);
351 /* Calculate temporary vectorial force */
352 tx = _mm256_mul_ps(fscal,dx00);
353 ty = _mm256_mul_ps(fscal,dy00);
354 tz = _mm256_mul_ps(fscal,dz00);
356 /* Update vectorial force */
357 fix0 = _mm256_add_ps(fix0,tx);
358 fiy0 = _mm256_add_ps(fiy0,ty);
359 fiz0 = _mm256_add_ps(fiz0,tz);
/* The same vector is accumulated for j; the total is handed to
 * gmx_mm256_decrement_1rvec_8ptr_swizzle_ps once all four i sites are done.
 */
361 fjx0 = _mm256_add_ps(fjx0,tx);
362 fjy0 = _mm256_add_ps(fjy0,ty);
363 fjz0 = _mm256_add_ps(fjz0,tz);
367 /**************************
368 * CALCULATE INTERACTIONS *
369 **************************/
/* i site 1 with j: Ewald electrostatics only */
371 if (gmx_mm256_any_lt(rsq10,rcutoff2))
374 r10 = _mm256_mul_ps(rsq10,rinv10);
376 /* Compute parameters for interactions between i and j atoms */
377 qq10 = _mm256_mul_ps(iq1,jq0);
379 /* EWALD ELECTROSTATICS */
381 /* Analytical PME correction */
/* With zeta2 = beta2*rsq: felec = qq*(pmecorrF(zeta2)*beta3 + rinv^3) and
 * velec = qq*(rinv - pmecorrV(zeta2)*beta).
 */
382 zeta2 = _mm256_mul_ps(beta2,rsq10);
383 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
384 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
385 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
386 felec = _mm256_mul_ps(qq10,felec);
387 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
388 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
389 velec = _mm256_sub_ps(rinv10,pmecorrV);
390 velec = _mm256_mul_ps(qq10,velec);
/* Same switch polynomial as used for the Lennard-Jones term, now in r10 */
392 d = _mm256_sub_ps(r10,rswitch);
393 d = _mm256_max_ps(d,_mm256_setzero_ps());
394 d2 = _mm256_mul_ps(d,d);
395 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
397 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
399 /* Evaluate switch function */
400 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
401 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
402 velec = _mm256_mul_ps(velec,sw);
403 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velec = _mm256_and_ps(velec,cutoff_mask);
407 velecsum = _mm256_add_ps(velecsum,velec);
/* NOTE(review): the statement that sets fscal for this pair is not shown - confirm it is felec */
411 fscal = _mm256_and_ps(fscal,cutoff_mask);
413 /* Calculate temporary vectorial force */
414 tx = _mm256_mul_ps(fscal,dx10);
415 ty = _mm256_mul_ps(fscal,dy10);
416 tz = _mm256_mul_ps(fscal,dz10);
418 /* Update vectorial force */
419 fix1 = _mm256_add_ps(fix1,tx);
420 fiy1 = _mm256_add_ps(fiy1,ty);
421 fiz1 = _mm256_add_ps(fiz1,tz);
423 fjx0 = _mm256_add_ps(fjx0,tx);
424 fjy0 = _mm256_add_ps(fjy0,ty);
425 fjz0 = _mm256_add_ps(fjz0,tz);
429 /**************************
430 * CALCULATE INTERACTIONS *
431 **************************/
/* i site 2 with j: same electrostatics sequence as for site 1 */
433 if (gmx_mm256_any_lt(rsq20,rcutoff2))
436 r20 = _mm256_mul_ps(rsq20,rinv20);
438 /* Compute parameters for interactions between i and j atoms */
439 qq20 = _mm256_mul_ps(iq2,jq0);
441 /* EWALD ELECTROSTATICS */
443 /* Analytical PME correction */
444 zeta2 = _mm256_mul_ps(beta2,rsq20);
445 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
446 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
447 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
448 felec = _mm256_mul_ps(qq20,felec);
449 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
450 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
451 velec = _mm256_sub_ps(rinv20,pmecorrV);
452 velec = _mm256_mul_ps(qq20,velec);
454 d = _mm256_sub_ps(r20,rswitch);
455 d = _mm256_max_ps(d,_mm256_setzero_ps());
456 d2 = _mm256_mul_ps(d,d);
457 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
459 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
461 /* Evaluate switch function */
462 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
463 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
464 velec = _mm256_mul_ps(velec,sw);
465 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velec = _mm256_and_ps(velec,cutoff_mask);
469 velecsum = _mm256_add_ps(velecsum,velec);
473 fscal = _mm256_and_ps(fscal,cutoff_mask);
475 /* Calculate temporary vectorial force */
476 tx = _mm256_mul_ps(fscal,dx20);
477 ty = _mm256_mul_ps(fscal,dy20);
478 tz = _mm256_mul_ps(fscal,dz20);
480 /* Update vectorial force */
481 fix2 = _mm256_add_ps(fix2,tx);
482 fiy2 = _mm256_add_ps(fiy2,ty);
483 fiz2 = _mm256_add_ps(fiz2,tz);
485 fjx0 = _mm256_add_ps(fjx0,tx);
486 fjy0 = _mm256_add_ps(fjy0,ty);
487 fjz0 = _mm256_add_ps(fjz0,tz);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
/* i site 3 with j: same electrostatics sequence as for site 1 */
495 if (gmx_mm256_any_lt(rsq30,rcutoff2))
498 r30 = _mm256_mul_ps(rsq30,rinv30);
500 /* Compute parameters for interactions between i and j atoms */
501 qq30 = _mm256_mul_ps(iq3,jq0);
503 /* EWALD ELECTROSTATICS */
505 /* Analytical PME correction */
506 zeta2 = _mm256_mul_ps(beta2,rsq30);
507 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
508 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
509 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
510 felec = _mm256_mul_ps(qq30,felec);
511 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
512 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
513 velec = _mm256_sub_ps(rinv30,pmecorrV);
514 velec = _mm256_mul_ps(qq30,velec);
516 d = _mm256_sub_ps(r30,rswitch);
517 d = _mm256_max_ps(d,_mm256_setzero_ps());
518 d2 = _mm256_mul_ps(d,d);
519 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
521 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
523 /* Evaluate switch function */
524 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
525 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
526 velec = _mm256_mul_ps(velec,sw);
527 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
529 /* Update potential sum for this i atom from the interaction with this j atom. */
530 velec = _mm256_and_ps(velec,cutoff_mask);
531 velecsum = _mm256_add_ps(velecsum,velec);
535 fscal = _mm256_and_ps(fscal,cutoff_mask);
537 /* Calculate temporary vectorial force */
538 tx = _mm256_mul_ps(fscal,dx30);
539 ty = _mm256_mul_ps(fscal,dy30);
540 tz = _mm256_mul_ps(fscal,dz30);
542 /* Update vectorial force */
543 fix3 = _mm256_add_ps(fix3,tx);
544 fiy3 = _mm256_add_ps(fiy3,ty);
545 fiz3 = _mm256_add_ps(fiz3,tz);
547 fjx0 = _mm256_add_ps(fjx0,tx);
548 fjy0 = _mm256_add_ps(fjy0,ty);
549 fjz0 = _mm256_add_ps(fjz0,tz);
/* Hand the summed j contributions to the decrement helper, one force pointer per lane */
553 fjptrA = f+j_coord_offsetA;
554 fjptrB = f+j_coord_offsetB;
555 fjptrC = f+j_coord_offsetC;
556 fjptrD = f+j_coord_offsetD;
557 fjptrE = f+j_coord_offsetE;
558 fjptrF = f+j_coord_offsetF;
559 fjptrG = f+j_coord_offsetG;
560 fjptrH = f+j_coord_offsetH;
562 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
564 /* Inner loop uses 386 flops */
/* Epilogue after the full-width loop: the same interactions for a group of eight j entries
 * that may contain negative (dummy) indices, which are masked out through dummy_mask.
 */
570 /* Get j neighbor index, and coordinate index */
571 jnrlistA = jjnr[jidx];
572 jnrlistB = jjnr[jidx+1];
573 jnrlistC = jjnr[jidx+2];
574 jnrlistD = jjnr[jidx+3];
575 jnrlistE = jjnr[jidx+4];
576 jnrlistF = jjnr[jidx+5];
577 jnrlistG = jjnr[jidx+6];
578 jnrlistH = jjnr[jidx+7];
579 /* Sign of each element will be negative for non-real atoms.
580 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
581 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
583 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
584 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
586 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
587 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
588 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
589 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
590 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
591 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
592 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
593 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
594 j_coord_offsetA = DIM*jnrA;
595 j_coord_offsetB = DIM*jnrB;
596 j_coord_offsetC = DIM*jnrC;
597 j_coord_offsetD = DIM*jnrD;
598 j_coord_offsetE = DIM*jnrE;
599 j_coord_offsetF = DIM*jnrF;
600 j_coord_offsetG = DIM*jnrG;
601 j_coord_offsetH = DIM*jnrH;
603 /* load j atom coordinates */
/* Dummy lanes load the data of index 0 (see the clamping of jnrA..jnrH); their results are masked below */
604 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
605 x+j_coord_offsetC,x+j_coord_offsetD,
606 x+j_coord_offsetE,x+j_coord_offsetF,
607 x+j_coord_offsetG,x+j_coord_offsetH,
610 /* Calculate displacement vector */
611 dx00 = _mm256_sub_ps(ix0,jx0);
612 dy00 = _mm256_sub_ps(iy0,jy0);
613 dz00 = _mm256_sub_ps(iz0,jz0);
614 dx10 = _mm256_sub_ps(ix1,jx0);
615 dy10 = _mm256_sub_ps(iy1,jy0);
616 dz10 = _mm256_sub_ps(iz1,jz0);
617 dx20 = _mm256_sub_ps(ix2,jx0);
618 dy20 = _mm256_sub_ps(iy2,jy0);
619 dz20 = _mm256_sub_ps(iz2,jz0);
620 dx30 = _mm256_sub_ps(ix3,jx0);
621 dy30 = _mm256_sub_ps(iy3,jy0);
622 dz30 = _mm256_sub_ps(iz3,jz0);
624 /* Calculate squared distance and things based on it */
625 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
626 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
627 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
628 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
630 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
631 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
632 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
633 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
635 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
636 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
637 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
638 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
640 /* Load parameters for j particles */
641 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
642 charge+jnrC+0,charge+jnrD+0,
643 charge+jnrE+0,charge+jnrF+0,
644 charge+jnrG+0,charge+jnrH+0);
645 vdwjidx0A = 2*vdwtype[jnrA+0];
646 vdwjidx0B = 2*vdwtype[jnrB+0];
647 vdwjidx0C = 2*vdwtype[jnrC+0];
648 vdwjidx0D = 2*vdwtype[jnrD+0];
649 vdwjidx0E = 2*vdwtype[jnrE+0];
650 vdwjidx0F = 2*vdwtype[jnrF+0];
651 vdwjidx0G = 2*vdwtype[jnrG+0];
652 vdwjidx0H = 2*vdwtype[jnrH+0];
654 fjx0 = _mm256_setzero_ps();
655 fjy0 = _mm256_setzero_ps();
656 fjz0 = _mm256_setzero_ps();
658 /**************************
659 * CALCULATE INTERACTIONS *
660 **************************/
/* i site 0 with j: Lennard-Jones only, with dummy lanes cleared */
662 if (gmx_mm256_any_lt(rsq00,rcutoff2))
665 r00 = _mm256_mul_ps(rsq00,rinv00);
/* Clear r in dummy lanes before it enters the switch polynomial */
666 r00 = _mm256_andnot_ps(dummy_mask,r00);
668 /* Compute parameters for interactions between i and j atoms */
669 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
670 vdwioffsetptr0+vdwjidx0B,
671 vdwioffsetptr0+vdwjidx0C,
672 vdwioffsetptr0+vdwjidx0D,
673 vdwioffsetptr0+vdwjidx0E,
674 vdwioffsetptr0+vdwjidx0F,
675 vdwioffsetptr0+vdwjidx0G,
676 vdwioffsetptr0+vdwjidx0H,
679 /* LENNARD-JONES DISPERSION/REPULSION */
681 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
682 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
683 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
684 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
685 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
687 d = _mm256_sub_ps(r00,rswitch);
688 d = _mm256_max_ps(d,_mm256_setzero_ps());
689 d2 = _mm256_mul_ps(d,d);
690 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
692 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
694 /* Evaluate switch function */
695 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
696 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
697 vvdw = _mm256_mul_ps(vvdw,sw);
698 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
700 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Two masks are applied: the cutoff mask keeps lanes in range, the dummy mask removes padding lanes */
701 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
702 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
703 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
707 fscal = _mm256_and_ps(fscal,cutoff_mask);
709 fscal = _mm256_andnot_ps(dummy_mask,fscal);
711 /* Calculate temporary vectorial force */
712 tx = _mm256_mul_ps(fscal,dx00);
713 ty = _mm256_mul_ps(fscal,dy00);
714 tz = _mm256_mul_ps(fscal,dz00);
716 /* Update vectorial force */
717 fix0 = _mm256_add_ps(fix0,tx);
718 fiy0 = _mm256_add_ps(fiy0,ty);
719 fiz0 = _mm256_add_ps(fiz0,tz);
721 fjx0 = _mm256_add_ps(fjx0,tx);
722 fjy0 = _mm256_add_ps(fjy0,ty);
723 fjz0 = _mm256_add_ps(fjz0,tz);
727 /**************************
728 * CALCULATE INTERACTIONS *
729 **************************/
/* i site 1 with j: Ewald electrostatics only, with dummy lanes cleared */
731 if (gmx_mm256_any_lt(rsq10,rcutoff2))
734 r10 = _mm256_mul_ps(rsq10,rinv10);
735 r10 = _mm256_andnot_ps(dummy_mask,r10);
737 /* Compute parameters for interactions between i and j atoms */
738 qq10 = _mm256_mul_ps(iq1,jq0);
740 /* EWALD ELECTROSTATICS */
742 /* Analytical PME correction */
743 zeta2 = _mm256_mul_ps(beta2,rsq10);
744 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
745 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
746 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
747 felec = _mm256_mul_ps(qq10,felec);
748 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
749 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
750 velec = _mm256_sub_ps(rinv10,pmecorrV);
751 velec = _mm256_mul_ps(qq10,velec);
753 d = _mm256_sub_ps(r10,rswitch);
754 d = _mm256_max_ps(d,_mm256_setzero_ps());
755 d2 = _mm256_mul_ps(d,d);
756 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
758 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
760 /* Evaluate switch function */
761 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
762 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
763 velec = _mm256_mul_ps(velec,sw);
764 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
766 /* Update potential sum for this i atom from the interaction with this j atom. */
767 velec = _mm256_and_ps(velec,cutoff_mask);
768 velec = _mm256_andnot_ps(dummy_mask,velec);
769 velecsum = _mm256_add_ps(velecsum,velec);
773 fscal = _mm256_and_ps(fscal,cutoff_mask);
775 fscal = _mm256_andnot_ps(dummy_mask,fscal);
777 /* Calculate temporary vectorial force */
778 tx = _mm256_mul_ps(fscal,dx10);
779 ty = _mm256_mul_ps(fscal,dy10);
780 tz = _mm256_mul_ps(fscal,dz10);
782 /* Update vectorial force */
783 fix1 = _mm256_add_ps(fix1,tx);
784 fiy1 = _mm256_add_ps(fiy1,ty);
785 fiz1 = _mm256_add_ps(fiz1,tz);
787 fjx0 = _mm256_add_ps(fjx0,tx);
788 fjy0 = _mm256_add_ps(fjy0,ty);
789 fjz0 = _mm256_add_ps(fjz0,tz);
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
/* i site 2 with j: same masked electrostatics sequence as for site 1 */
797 if (gmx_mm256_any_lt(rsq20,rcutoff2))
800 r20 = _mm256_mul_ps(rsq20,rinv20);
801 r20 = _mm256_andnot_ps(dummy_mask,r20);
803 /* Compute parameters for interactions between i and j atoms */
804 qq20 = _mm256_mul_ps(iq2,jq0);
806 /* EWALD ELECTROSTATICS */
808 /* Analytical PME correction */
809 zeta2 = _mm256_mul_ps(beta2,rsq20);
810 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
811 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
812 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
813 felec = _mm256_mul_ps(qq20,felec);
814 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
815 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
816 velec = _mm256_sub_ps(rinv20,pmecorrV);
817 velec = _mm256_mul_ps(qq20,velec);
819 d = _mm256_sub_ps(r20,rswitch);
820 d = _mm256_max_ps(d,_mm256_setzero_ps());
821 d2 = _mm256_mul_ps(d,d);
822 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
824 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
826 /* Evaluate switch function */
827 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
828 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
829 velec = _mm256_mul_ps(velec,sw);
830 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
832 /* Update potential sum for this i atom from the interaction with this j atom. */
833 velec = _mm256_and_ps(velec,cutoff_mask);
834 velec = _mm256_andnot_ps(dummy_mask,velec);
835 velecsum = _mm256_add_ps(velecsum,velec);
839 fscal = _mm256_and_ps(fscal,cutoff_mask);
841 fscal = _mm256_andnot_ps(dummy_mask,fscal);
843 /* Calculate temporary vectorial force */
844 tx = _mm256_mul_ps(fscal,dx20);
845 ty = _mm256_mul_ps(fscal,dy20);
846 tz = _mm256_mul_ps(fscal,dz20);
848 /* Update vectorial force */
849 fix2 = _mm256_add_ps(fix2,tx);
850 fiy2 = _mm256_add_ps(fiy2,ty);
851 fiz2 = _mm256_add_ps(fiz2,tz);
853 fjx0 = _mm256_add_ps(fjx0,tx);
854 fjy0 = _mm256_add_ps(fjy0,ty);
855 fjz0 = _mm256_add_ps(fjz0,tz);
859 /**************************
860 * CALCULATE INTERACTIONS *
861 **************************/
/* i site 3 with j: same masked electrostatics sequence as for site 1 */
863 if (gmx_mm256_any_lt(rsq30,rcutoff2))
866 r30 = _mm256_mul_ps(rsq30,rinv30);
867 r30 = _mm256_andnot_ps(dummy_mask,r30);
869 /* Compute parameters for interactions between i and j atoms */
870 qq30 = _mm256_mul_ps(iq3,jq0);
872 /* EWALD ELECTROSTATICS */
874 /* Analytical PME correction */
875 zeta2 = _mm256_mul_ps(beta2,rsq30);
876 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
877 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
878 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
879 felec = _mm256_mul_ps(qq30,felec);
880 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
881 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
882 velec = _mm256_sub_ps(rinv30,pmecorrV);
883 velec = _mm256_mul_ps(qq30,velec);
885 d = _mm256_sub_ps(r30,rswitch);
886 d = _mm256_max_ps(d,_mm256_setzero_ps());
887 d2 = _mm256_mul_ps(d,d);
888 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
890 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
892 /* Evaluate switch function */
893 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
894 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
895 velec = _mm256_mul_ps(velec,sw);
896 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
898 /* Update potential sum for this i atom from the interaction with this j atom. */
899 velec = _mm256_and_ps(velec,cutoff_mask);
900 velec = _mm256_andnot_ps(dummy_mask,velec);
901 velecsum = _mm256_add_ps(velecsum,velec);
905 fscal = _mm256_and_ps(fscal,cutoff_mask);
907 fscal = _mm256_andnot_ps(dummy_mask,fscal);
909 /* Calculate temporary vectorial force */
910 tx = _mm256_mul_ps(fscal,dx30);
911 ty = _mm256_mul_ps(fscal,dy30);
912 tz = _mm256_mul_ps(fscal,dz30);
914 /* Update vectorial force */
915 fix3 = _mm256_add_ps(fix3,tx);
916 fiy3 = _mm256_add_ps(fiy3,ty);
917 fiz3 = _mm256_add_ps(fiz3,tz);
919 fjx0 = _mm256_add_ps(fjx0,tx);
920 fjy0 = _mm256_add_ps(fjy0,ty);
921 fjz0 = _mm256_add_ps(fjz0,tz);
/* Dummy lanes are pointed at scratch instead of f, so the force entry of index 0
 * (the clamped load index) is not touched on their behalf.
 */
925 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
926 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
927 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
928 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
929 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
930 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
931 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
932 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
934 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
936 /* Inner loop uses 390 flops */
939 /* End of innermost loop */
/* Pass the accumulated forces of the four i sites on, together with the i force and shift force destinations */
941 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
942 f+i_coord_offset,fshift+i_shift_offset);
945 /* Update potential energies */
946 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
947 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
949 /* Increment number of inner iterations */
/* Counts every j entry of this list, not the number of eight-wide passes */
950 inneriter += j_index_end - j_index_start;
952 /* Outer loop uses 26 flops */
955 /* Increment number of outer iterations */
958 /* Update outer/inner flops */
/* Reports the flop estimate using the per-iteration costs of 26 (outer) and 390 (inner) */
960 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*390);
963 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_single
964 * Electrostatics interaction: Ewald
965 * VdW interaction: LennardJones
966 * Geometry: Water4-Particle
967 * Calculate force/pot: Force
970 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_single
971 (t_nblist * gmx_restrict nlist,
972 rvec * gmx_restrict xx,
973 rvec * gmx_restrict ff,
974 t_forcerec * gmx_restrict fr,
975 t_mdatoms * gmx_restrict mdatoms,
976 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
977 t_nrnb * gmx_restrict nrnb)
979 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
980 * just 0 for non-waters.
981 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
982 * jnr indices corresponding to data put in the eight positions in the SIMD register.
984 int i_shift_offset,i_coord_offset,outeriter,inneriter;
985 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
986 int jnrA,jnrB,jnrC,jnrD;
987 int jnrE,jnrF,jnrG,jnrH;
988 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
989 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
990 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
991 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
992 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
994 real *shiftvec,*fshift,*x,*f;
995 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
997 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
998 real * vdwioffsetptr0;
999 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1000 real * vdwioffsetptr1;
1001 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1002 real * vdwioffsetptr2;
1003 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1004 real * vdwioffsetptr3;
1005 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1006 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1007 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1008 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1009 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1010 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1011 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
1012 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1015 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1018 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1019 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1021 __m128i ewitab_lo,ewitab_hi;
1022 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1023 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1025 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1026 real rswitch_scalar,d_scalar;
1027 __m256 dummy_mask,cutoff_mask;
1028 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1029 __m256 one = _mm256_set1_ps(1.0);
1030 __m256 two = _mm256_set1_ps(2.0);
1036 jindex = nlist->jindex;
1038 shiftidx = nlist->shift;
1040 shiftvec = fr->shift_vec[0];
1041 fshift = fr->fshift[0];
1042 facel = _mm256_set1_ps(fr->epsfac);
1043 charge = mdatoms->chargeA;
1044 nvdwtype = fr->ntype;
1045 vdwparam = fr->nbfp;
1046 vdwtype = mdatoms->typeA;
1048 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1049 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1050 beta2 = _mm256_mul_ps(beta,beta);
1051 beta3 = _mm256_mul_ps(beta,beta2);
1053 ewtab = fr->ic->tabq_coul_FDV0;
1054 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1055 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1057 /* Setup water-specific parameters */
1058 inr = nlist->iinr[0];
1059 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1060 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1061 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1062 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1064 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1065 rcutoff_scalar = fr->rcoulomb;
1066 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1067 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1069 rswitch_scalar = fr->rcoulomb_switch;
1070 rswitch = _mm256_set1_ps(rswitch_scalar);
1071 /* Setup switch parameters */
1072 d_scalar = rcutoff_scalar-rswitch_scalar;
1073 d = _mm256_set1_ps(d_scalar);
1074 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1075 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1076 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1077 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1078 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1079 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1081 /* Zero the j indices and coordinate offsets up front to avoid uninitialized-variable compiler warnings */
1082 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1083 j_coord_offsetA = 0;
1084 j_coord_offsetB = 0;
1085 j_coord_offsetC = 0;
1086 j_coord_offsetD = 0;
1087 j_coord_offsetE = 0;
1088 j_coord_offsetF = 0;
1089 j_coord_offsetG = 0;
1090 j_coord_offsetH = 0;
1095 for(iidx=0;iidx<4*DIM;iidx++)
1097 scratch[iidx] = 0.0;
1100 /* Start outer loop over neighborlists */
1101 for(iidx=0; iidx<nri; iidx++)
1103 /* Load shift vector for this list */
1104 i_shift_offset = DIM*shiftidx[iidx];
1106 /* Load limits for loop over neighbors */
1107 j_index_start = jindex[iidx];
1108 j_index_end = jindex[iidx+1];
1110 /* Get outer coordinate index */
1112 i_coord_offset = DIM*inr;
1114 /* Load i particle coords and add shift vector */
1115 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1116 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1118 fix0 = _mm256_setzero_ps();
1119 fiy0 = _mm256_setzero_ps();
1120 fiz0 = _mm256_setzero_ps();
1121 fix1 = _mm256_setzero_ps();
1122 fiy1 = _mm256_setzero_ps();
1123 fiz1 = _mm256_setzero_ps();
1124 fix2 = _mm256_setzero_ps();
1125 fiy2 = _mm256_setzero_ps();
1126 fiz2 = _mm256_setzero_ps();
1127 fix3 = _mm256_setzero_ps();
1128 fiy3 = _mm256_setzero_ps();
1129 fiz3 = _mm256_setzero_ps();
1131 /* Start inner kernel loop */
1132 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1135 /* Get j neighbor index, and coordinate index */
1137 jnrB = jjnr[jidx+1];
1138 jnrC = jjnr[jidx+2];
1139 jnrD = jjnr[jidx+3];
1140 jnrE = jjnr[jidx+4];
1141 jnrF = jjnr[jidx+5];
1142 jnrG = jjnr[jidx+6];
1143 jnrH = jjnr[jidx+7];
1144 j_coord_offsetA = DIM*jnrA;
1145 j_coord_offsetB = DIM*jnrB;
1146 j_coord_offsetC = DIM*jnrC;
1147 j_coord_offsetD = DIM*jnrD;
1148 j_coord_offsetE = DIM*jnrE;
1149 j_coord_offsetF = DIM*jnrF;
1150 j_coord_offsetG = DIM*jnrG;
1151 j_coord_offsetH = DIM*jnrH;
1153 /* load j atom coordinates */
1154 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1155 x+j_coord_offsetC,x+j_coord_offsetD,
1156 x+j_coord_offsetE,x+j_coord_offsetF,
1157 x+j_coord_offsetG,x+j_coord_offsetH,
1160 /* Calculate displacement vector */
1161 dx00 = _mm256_sub_ps(ix0,jx0);
1162 dy00 = _mm256_sub_ps(iy0,jy0);
1163 dz00 = _mm256_sub_ps(iz0,jz0);
1164 dx10 = _mm256_sub_ps(ix1,jx0);
1165 dy10 = _mm256_sub_ps(iy1,jy0);
1166 dz10 = _mm256_sub_ps(iz1,jz0);
1167 dx20 = _mm256_sub_ps(ix2,jx0);
1168 dy20 = _mm256_sub_ps(iy2,jy0);
1169 dz20 = _mm256_sub_ps(iz2,jz0);
1170 dx30 = _mm256_sub_ps(ix3,jx0);
1171 dy30 = _mm256_sub_ps(iy3,jy0);
1172 dz30 = _mm256_sub_ps(iz3,jz0);
1174 /* Calculate squared distance and things based on it */
1175 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1176 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1177 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1178 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1180 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1181 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1182 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1183 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
1185 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1186 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1187 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1188 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
1190 /* Load parameters for j particles */
1191 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1192 charge+jnrC+0,charge+jnrD+0,
1193 charge+jnrE+0,charge+jnrF+0,
1194 charge+jnrG+0,charge+jnrH+0);
1195 vdwjidx0A = 2*vdwtype[jnrA+0];
1196 vdwjidx0B = 2*vdwtype[jnrB+0];
1197 vdwjidx0C = 2*vdwtype[jnrC+0];
1198 vdwjidx0D = 2*vdwtype[jnrD+0];
1199 vdwjidx0E = 2*vdwtype[jnrE+0];
1200 vdwjidx0F = 2*vdwtype[jnrF+0];
1201 vdwjidx0G = 2*vdwtype[jnrG+0];
1202 vdwjidx0H = 2*vdwtype[jnrH+0];
1204 fjx0 = _mm256_setzero_ps();
1205 fjy0 = _mm256_setzero_ps();
1206 fjz0 = _mm256_setzero_ps();
1208 /**************************
1209 * CALCULATE INTERACTIONS *
1210 **************************/
1212 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1215 r00 = _mm256_mul_ps(rsq00,rinv00);
1217 /* Compute parameters for interactions between i and j atoms */
1218 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1219 vdwioffsetptr0+vdwjidx0B,
1220 vdwioffsetptr0+vdwjidx0C,
1221 vdwioffsetptr0+vdwjidx0D,
1222 vdwioffsetptr0+vdwjidx0E,
1223 vdwioffsetptr0+vdwjidx0F,
1224 vdwioffsetptr0+vdwjidx0G,
1225 vdwioffsetptr0+vdwjidx0H,
1228 /* LENNARD-JONES DISPERSION/REPULSION */
1230 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1231 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
1232 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1233 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1234 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1236 d = _mm256_sub_ps(r00,rswitch);
1237 d = _mm256_max_ps(d,_mm256_setzero_ps());
1238 d2 = _mm256_mul_ps(d,d);
1239 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1241 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1243 /* Evaluate switch function */
1244 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1245 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1246 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1250 fscal = _mm256_and_ps(fscal,cutoff_mask);
1252 /* Calculate temporary vectorial force */
1253 tx = _mm256_mul_ps(fscal,dx00);
1254 ty = _mm256_mul_ps(fscal,dy00);
1255 tz = _mm256_mul_ps(fscal,dz00);
1257 /* Update vectorial force */
1258 fix0 = _mm256_add_ps(fix0,tx);
1259 fiy0 = _mm256_add_ps(fiy0,ty);
1260 fiz0 = _mm256_add_ps(fiz0,tz);
1262 fjx0 = _mm256_add_ps(fjx0,tx);
1263 fjy0 = _mm256_add_ps(fjy0,ty);
1264 fjz0 = _mm256_add_ps(fjz0,tz);
1268 /**************************
1269 * CALCULATE INTERACTIONS *
1270 **************************/
1272 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1275 r10 = _mm256_mul_ps(rsq10,rinv10);
1277 /* Compute parameters for interactions between i and j atoms */
1278 qq10 = _mm256_mul_ps(iq1,jq0);
1280 /* EWALD ELECTROSTATICS */
1282 /* Analytical PME correction */
1283 zeta2 = _mm256_mul_ps(beta2,rsq10);
1284 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
1285 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1286 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1287 felec = _mm256_mul_ps(qq10,felec);
1288 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1289 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1290 velec = _mm256_sub_ps(rinv10,pmecorrV);
1291 velec = _mm256_mul_ps(qq10,velec);
1293 d = _mm256_sub_ps(r10,rswitch);
1294 d = _mm256_max_ps(d,_mm256_setzero_ps());
1295 d2 = _mm256_mul_ps(d,d);
1296 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1298 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1300 /* Evaluate switch function */
1301 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1302 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1303 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1307 fscal = _mm256_and_ps(fscal,cutoff_mask);
1309 /* Calculate temporary vectorial force */
1310 tx = _mm256_mul_ps(fscal,dx10);
1311 ty = _mm256_mul_ps(fscal,dy10);
1312 tz = _mm256_mul_ps(fscal,dz10);
1314 /* Update vectorial force */
1315 fix1 = _mm256_add_ps(fix1,tx);
1316 fiy1 = _mm256_add_ps(fiy1,ty);
1317 fiz1 = _mm256_add_ps(fiz1,tz);
1319 fjx0 = _mm256_add_ps(fjx0,tx);
1320 fjy0 = _mm256_add_ps(fjy0,ty);
1321 fjz0 = _mm256_add_ps(fjz0,tz);
1325 /**************************
1326 * CALCULATE INTERACTIONS *
1327 **************************/
1329 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1332 r20 = _mm256_mul_ps(rsq20,rinv20);
1334 /* Compute parameters for interactions between i and j atoms */
1335 qq20 = _mm256_mul_ps(iq2,jq0);
1337 /* EWALD ELECTROSTATICS */
1339 /* Analytical PME correction */
1340 zeta2 = _mm256_mul_ps(beta2,rsq20);
1341 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1342 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1343 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1344 felec = _mm256_mul_ps(qq20,felec);
1345 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1346 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1347 velec = _mm256_sub_ps(rinv20,pmecorrV);
1348 velec = _mm256_mul_ps(qq20,velec);
1350 d = _mm256_sub_ps(r20,rswitch);
1351 d = _mm256_max_ps(d,_mm256_setzero_ps());
1352 d2 = _mm256_mul_ps(d,d);
1353 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1355 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1357 /* Evaluate switch function */
1358 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1359 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1360 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1364 fscal = _mm256_and_ps(fscal,cutoff_mask);
1366 /* Calculate temporary vectorial force */
1367 tx = _mm256_mul_ps(fscal,dx20);
1368 ty = _mm256_mul_ps(fscal,dy20);
1369 tz = _mm256_mul_ps(fscal,dz20);
1371 /* Update vectorial force */
1372 fix2 = _mm256_add_ps(fix2,tx);
1373 fiy2 = _mm256_add_ps(fiy2,ty);
1374 fiz2 = _mm256_add_ps(fiz2,tz);
1376 fjx0 = _mm256_add_ps(fjx0,tx);
1377 fjy0 = _mm256_add_ps(fjy0,ty);
1378 fjz0 = _mm256_add_ps(fjz0,tz);
1382 /**************************
1383 * CALCULATE INTERACTIONS *
1384 **************************/
1386 if (gmx_mm256_any_lt(rsq30,rcutoff2))
1389 r30 = _mm256_mul_ps(rsq30,rinv30);
1391 /* Compute parameters for interactions between i and j atoms */
1392 qq30 = _mm256_mul_ps(iq3,jq0);
1394 /* EWALD ELECTROSTATICS */
1396 /* Analytical PME correction */
1397 zeta2 = _mm256_mul_ps(beta2,rsq30);
1398 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
1399 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1400 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1401 felec = _mm256_mul_ps(qq30,felec);
1402 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1403 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1404 velec = _mm256_sub_ps(rinv30,pmecorrV);
1405 velec = _mm256_mul_ps(qq30,velec);
1407 d = _mm256_sub_ps(r30,rswitch);
1408 d = _mm256_max_ps(d,_mm256_setzero_ps());
1409 d2 = _mm256_mul_ps(d,d);
1410 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1412 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1414 /* Evaluate switch function */
1415 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1416 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
1417 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1421 fscal = _mm256_and_ps(fscal,cutoff_mask);
1423 /* Calculate temporary vectorial force */
1424 tx = _mm256_mul_ps(fscal,dx30);
1425 ty = _mm256_mul_ps(fscal,dy30);
1426 tz = _mm256_mul_ps(fscal,dz30);
1428 /* Update vectorial force */
1429 fix3 = _mm256_add_ps(fix3,tx);
1430 fiy3 = _mm256_add_ps(fiy3,ty);
1431 fiz3 = _mm256_add_ps(fiz3,tz);
1433 fjx0 = _mm256_add_ps(fjx0,tx);
1434 fjy0 = _mm256_add_ps(fjy0,ty);
1435 fjz0 = _mm256_add_ps(fjz0,tz);
1439 fjptrA = f+j_coord_offsetA;
1440 fjptrB = f+j_coord_offsetB;
1441 fjptrC = f+j_coord_offsetC;
1442 fjptrD = f+j_coord_offsetD;
1443 fjptrE = f+j_coord_offsetE;
1444 fjptrF = f+j_coord_offsetF;
1445 fjptrG = f+j_coord_offsetG;
1446 fjptrH = f+j_coord_offsetH;
1448 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1450 /* Inner loop uses 374 flops */
1453 if(jidx<j_index_end)
1456 /* Get j neighbor index, and coordinate index */
1457 jnrlistA = jjnr[jidx];
1458 jnrlistB = jjnr[jidx+1];
1459 jnrlistC = jjnr[jidx+2];
1460 jnrlistD = jjnr[jidx+3];
1461 jnrlistE = jjnr[jidx+4];
1462 jnrlistF = jjnr[jidx+5];
1463 jnrlistG = jjnr[jidx+6];
1464 jnrlistH = jjnr[jidx+7];
1465 /* Sign of each element will be negative for non-real atoms.
1466 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1467 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1469 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1470 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1472 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1473 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1474 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1475 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1476 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1477 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1478 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1479 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1480 j_coord_offsetA = DIM*jnrA;
1481 j_coord_offsetB = DIM*jnrB;
1482 j_coord_offsetC = DIM*jnrC;
1483 j_coord_offsetD = DIM*jnrD;
1484 j_coord_offsetE = DIM*jnrE;
1485 j_coord_offsetF = DIM*jnrF;
1486 j_coord_offsetG = DIM*jnrG;
1487 j_coord_offsetH = DIM*jnrH;
1489 /* load j atom coordinates */
1490 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1491 x+j_coord_offsetC,x+j_coord_offsetD,
1492 x+j_coord_offsetE,x+j_coord_offsetF,
1493 x+j_coord_offsetG,x+j_coord_offsetH,
1496 /* Calculate displacement vector */
1497 dx00 = _mm256_sub_ps(ix0,jx0);
1498 dy00 = _mm256_sub_ps(iy0,jy0);
1499 dz00 = _mm256_sub_ps(iz0,jz0);
1500 dx10 = _mm256_sub_ps(ix1,jx0);
1501 dy10 = _mm256_sub_ps(iy1,jy0);
1502 dz10 = _mm256_sub_ps(iz1,jz0);
1503 dx20 = _mm256_sub_ps(ix2,jx0);
1504 dy20 = _mm256_sub_ps(iy2,jy0);
1505 dz20 = _mm256_sub_ps(iz2,jz0);
1506 dx30 = _mm256_sub_ps(ix3,jx0);
1507 dy30 = _mm256_sub_ps(iy3,jy0);
1508 dz30 = _mm256_sub_ps(iz3,jz0);
1510 /* Calculate squared distance and things based on it */
1511 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1512 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1513 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1514 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1516 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1517 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1518 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1519 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
1521 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1522 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1523 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1524 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
1526 /* Load parameters for j particles */
1527 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1528 charge+jnrC+0,charge+jnrD+0,
1529 charge+jnrE+0,charge+jnrF+0,
1530 charge+jnrG+0,charge+jnrH+0);
1531 vdwjidx0A = 2*vdwtype[jnrA+0];
1532 vdwjidx0B = 2*vdwtype[jnrB+0];
1533 vdwjidx0C = 2*vdwtype[jnrC+0];
1534 vdwjidx0D = 2*vdwtype[jnrD+0];
1535 vdwjidx0E = 2*vdwtype[jnrE+0];
1536 vdwjidx0F = 2*vdwtype[jnrF+0];
1537 vdwjidx0G = 2*vdwtype[jnrG+0];
1538 vdwjidx0H = 2*vdwtype[jnrH+0];
1540 fjx0 = _mm256_setzero_ps();
1541 fjy0 = _mm256_setzero_ps();
1542 fjz0 = _mm256_setzero_ps();
1544 /**************************
1545 * CALCULATE INTERACTIONS *
1546 **************************/
1548 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1551 r00 = _mm256_mul_ps(rsq00,rinv00);
1552 r00 = _mm256_andnot_ps(dummy_mask,r00);
1554 /* Compute parameters for interactions between i and j atoms */
1555 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1556 vdwioffsetptr0+vdwjidx0B,
1557 vdwioffsetptr0+vdwjidx0C,
1558 vdwioffsetptr0+vdwjidx0D,
1559 vdwioffsetptr0+vdwjidx0E,
1560 vdwioffsetptr0+vdwjidx0F,
1561 vdwioffsetptr0+vdwjidx0G,
1562 vdwioffsetptr0+vdwjidx0H,
1565 /* LENNARD-JONES DISPERSION/REPULSION */
1567 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1568 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
1569 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1570 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1571 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1573 d = _mm256_sub_ps(r00,rswitch);
1574 d = _mm256_max_ps(d,_mm256_setzero_ps());
1575 d2 = _mm256_mul_ps(d,d);
1576 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1578 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1580 /* Evaluate switch function */
1581 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1582 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1583 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1587 fscal = _mm256_and_ps(fscal,cutoff_mask);
1589 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1591 /* Calculate temporary vectorial force */
1592 tx = _mm256_mul_ps(fscal,dx00);
1593 ty = _mm256_mul_ps(fscal,dy00);
1594 tz = _mm256_mul_ps(fscal,dz00);
1596 /* Update vectorial force */
1597 fix0 = _mm256_add_ps(fix0,tx);
1598 fiy0 = _mm256_add_ps(fiy0,ty);
1599 fiz0 = _mm256_add_ps(fiz0,tz);
1601 fjx0 = _mm256_add_ps(fjx0,tx);
1602 fjy0 = _mm256_add_ps(fjy0,ty);
1603 fjz0 = _mm256_add_ps(fjz0,tz);
1607 /**************************
1608 * CALCULATE INTERACTIONS *
1609 **************************/
1611 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1614 r10 = _mm256_mul_ps(rsq10,rinv10);
1615 r10 = _mm256_andnot_ps(dummy_mask,r10);
1617 /* Compute parameters for interactions between i and j atoms */
1618 qq10 = _mm256_mul_ps(iq1,jq0);
1620 /* EWALD ELECTROSTATICS */
1622 /* Analytical PME correction */
1623 zeta2 = _mm256_mul_ps(beta2,rsq10);
1624 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
1625 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1626 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1627 felec = _mm256_mul_ps(qq10,felec);
1628 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1629 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1630 velec = _mm256_sub_ps(rinv10,pmecorrV);
1631 velec = _mm256_mul_ps(qq10,velec);
1633 d = _mm256_sub_ps(r10,rswitch);
1634 d = _mm256_max_ps(d,_mm256_setzero_ps());
1635 d2 = _mm256_mul_ps(d,d);
1636 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1638 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1640 /* Evaluate switch function */
1641 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1642 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1643 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1647 fscal = _mm256_and_ps(fscal,cutoff_mask);
1649 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1651 /* Calculate temporary vectorial force */
1652 tx = _mm256_mul_ps(fscal,dx10);
1653 ty = _mm256_mul_ps(fscal,dy10);
1654 tz = _mm256_mul_ps(fscal,dz10);
1656 /* Update vectorial force */
1657 fix1 = _mm256_add_ps(fix1,tx);
1658 fiy1 = _mm256_add_ps(fiy1,ty);
1659 fiz1 = _mm256_add_ps(fiz1,tz);
1661 fjx0 = _mm256_add_ps(fjx0,tx);
1662 fjy0 = _mm256_add_ps(fjy0,ty);
1663 fjz0 = _mm256_add_ps(fjz0,tz);
1667 /**************************
1668 * CALCULATE INTERACTIONS *
1669 **************************/
1671 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1674 r20 = _mm256_mul_ps(rsq20,rinv20);
1675 r20 = _mm256_andnot_ps(dummy_mask,r20);
1677 /* Compute parameters for interactions between i and j atoms */
1678 qq20 = _mm256_mul_ps(iq2,jq0);
1680 /* EWALD ELECTROSTATICS */
1682 /* Analytical PME correction */
1683 zeta2 = _mm256_mul_ps(beta2,rsq20);
1684 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1685 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1686 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1687 felec = _mm256_mul_ps(qq20,felec);
1688 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1689 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1690 velec = _mm256_sub_ps(rinv20,pmecorrV);
1691 velec = _mm256_mul_ps(qq20,velec);
1693 d = _mm256_sub_ps(r20,rswitch);
1694 d = _mm256_max_ps(d,_mm256_setzero_ps());
1695 d2 = _mm256_mul_ps(d,d);
1696 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1698 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1700 /* Evaluate switch function */
1701 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1702 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1703 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1707 fscal = _mm256_and_ps(fscal,cutoff_mask);
1709 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1711 /* Calculate temporary vectorial force */
1712 tx = _mm256_mul_ps(fscal,dx20);
1713 ty = _mm256_mul_ps(fscal,dy20);
1714 tz = _mm256_mul_ps(fscal,dz20);
1716 /* Update vectorial force */
1717 fix2 = _mm256_add_ps(fix2,tx);
1718 fiy2 = _mm256_add_ps(fiy2,ty);
1719 fiz2 = _mm256_add_ps(fiz2,tz);
1721 fjx0 = _mm256_add_ps(fjx0,tx);
1722 fjy0 = _mm256_add_ps(fjy0,ty);
1723 fjz0 = _mm256_add_ps(fjz0,tz);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 if (gmx_mm256_any_lt(rsq30,rcutoff2))
1734 r30 = _mm256_mul_ps(rsq30,rinv30);
1735 r30 = _mm256_andnot_ps(dummy_mask,r30);
1737 /* Compute parameters for interactions between i and j atoms */
1738 qq30 = _mm256_mul_ps(iq3,jq0);
1740 /* EWALD ELECTROSTATICS */
1742 /* Analytical PME correction */
1743 zeta2 = _mm256_mul_ps(beta2,rsq30);
1744 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
1745 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1746 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1747 felec = _mm256_mul_ps(qq30,felec);
1748 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1749 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1750 velec = _mm256_sub_ps(rinv30,pmecorrV);
1751 velec = _mm256_mul_ps(qq30,velec);
1753 d = _mm256_sub_ps(r30,rswitch);
1754 d = _mm256_max_ps(d,_mm256_setzero_ps());
1755 d2 = _mm256_mul_ps(d,d);
1756 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1758 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1760 /* Evaluate switch function */
1761 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1762 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
1763 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1767 fscal = _mm256_and_ps(fscal,cutoff_mask);
1769 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1771 /* Calculate temporary vectorial force */
1772 tx = _mm256_mul_ps(fscal,dx30);
1773 ty = _mm256_mul_ps(fscal,dy30);
1774 tz = _mm256_mul_ps(fscal,dz30);
1776 /* Update vectorial force */
1777 fix3 = _mm256_add_ps(fix3,tx);
1778 fiy3 = _mm256_add_ps(fiy3,ty);
1779 fiz3 = _mm256_add_ps(fiz3,tz);
1781 fjx0 = _mm256_add_ps(fjx0,tx);
1782 fjy0 = _mm256_add_ps(fjy0,ty);
1783 fjz0 = _mm256_add_ps(fjz0,tz);
1787 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1788 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1789 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1790 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1791 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1792 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1793 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1794 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1796 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1798 /* Inner loop uses 378 flops */
1801 /* End of innermost loop */
1803 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1804 f+i_coord_offset,fshift+i_shift_offset);
1806 /* Increment number of inner iterations */
1807 inneriter += j_index_end - j_index_start;
1809 /* Outer loop uses 24 flops */
1812 /* Increment number of outer iterations */
1815 /* Update outer/inner flops */
1817 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*378);