2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: None
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr1;
89 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 real * vdwioffsetptr3;
93 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
99 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
100 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128i ewitab_lo,ewitab_hi;
113 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
116 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
117 real rswitch_scalar,d_scalar;
118 __m256 dummy_mask,cutoff_mask;
119 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
120 __m256 one = _mm256_set1_ps(1.0);
121 __m256 two = _mm256_set1_ps(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm256_set1_ps(fr->epsfac);
134 charge = mdatoms->chargeA;
136 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
137 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
138 beta2 = _mm256_mul_ps(beta,beta);
139 beta3 = _mm256_mul_ps(beta,beta2);
141 ewtab = fr->ic->tabq_coul_FDV0;
142 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
143 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
148 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
149 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
151 jq1 = _mm256_set1_ps(charge[inr+1]);
152 jq2 = _mm256_set1_ps(charge[inr+2]);
153 jq3 = _mm256_set1_ps(charge[inr+3]);
154 qq11 = _mm256_mul_ps(iq1,jq1);
155 qq12 = _mm256_mul_ps(iq1,jq2);
156 qq13 = _mm256_mul_ps(iq1,jq3);
157 qq21 = _mm256_mul_ps(iq2,jq1);
158 qq22 = _mm256_mul_ps(iq2,jq2);
159 qq23 = _mm256_mul_ps(iq2,jq3);
160 qq31 = _mm256_mul_ps(iq3,jq1);
161 qq32 = _mm256_mul_ps(iq3,jq2);
162 qq33 = _mm256_mul_ps(iq3,jq3);
164 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
165 rcutoff_scalar = fr->rcoulomb;
166 rcutoff = _mm256_set1_ps(rcutoff_scalar);
167 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
169 rswitch_scalar = fr->rcoulomb_switch;
170 rswitch = _mm256_set1_ps(rswitch_scalar);
171 /* Setup switch parameters */
172 d_scalar = rcutoff_scalar-rswitch_scalar;
173 d = _mm256_set1_ps(d_scalar);
174 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
175 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
176 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
177 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
178 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
179 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181 /* Avoid stupid compiler warnings */
182 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
195 for(iidx=0;iidx<4*DIM;iidx++)
200 /* Start outer loop over neighborlists */
201 for(iidx=0; iidx<nri; iidx++)
203 /* Load shift vector for this list */
204 i_shift_offset = DIM*shiftidx[iidx];
206 /* Load limits for loop over neighbors */
207 j_index_start = jindex[iidx];
208 j_index_end = jindex[iidx+1];
210 /* Get outer coordinate index */
212 i_coord_offset = DIM*inr;
214 /* Load i particle coords and add shift vector */
215 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
216 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
218 fix1 = _mm256_setzero_ps();
219 fiy1 = _mm256_setzero_ps();
220 fiz1 = _mm256_setzero_ps();
221 fix2 = _mm256_setzero_ps();
222 fiy2 = _mm256_setzero_ps();
223 fiz2 = _mm256_setzero_ps();
224 fix3 = _mm256_setzero_ps();
225 fiy3 = _mm256_setzero_ps();
226 fiz3 = _mm256_setzero_ps();
228 /* Reset potential sums */
229 velecsum = _mm256_setzero_ps();
231 /* Start inner kernel loop */
232 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
235 /* Get j neighbor index, and coordinate index */
244 j_coord_offsetA = DIM*jnrA;
245 j_coord_offsetB = DIM*jnrB;
246 j_coord_offsetC = DIM*jnrC;
247 j_coord_offsetD = DIM*jnrD;
248 j_coord_offsetE = DIM*jnrE;
249 j_coord_offsetF = DIM*jnrF;
250 j_coord_offsetG = DIM*jnrG;
251 j_coord_offsetH = DIM*jnrH;
253 /* load j atom coordinates */
254 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
255 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
256 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
257 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
258 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
260 /* Calculate displacement vector */
261 dx11 = _mm256_sub_ps(ix1,jx1);
262 dy11 = _mm256_sub_ps(iy1,jy1);
263 dz11 = _mm256_sub_ps(iz1,jz1);
264 dx12 = _mm256_sub_ps(ix1,jx2);
265 dy12 = _mm256_sub_ps(iy1,jy2);
266 dz12 = _mm256_sub_ps(iz1,jz2);
267 dx13 = _mm256_sub_ps(ix1,jx3);
268 dy13 = _mm256_sub_ps(iy1,jy3);
269 dz13 = _mm256_sub_ps(iz1,jz3);
270 dx21 = _mm256_sub_ps(ix2,jx1);
271 dy21 = _mm256_sub_ps(iy2,jy1);
272 dz21 = _mm256_sub_ps(iz2,jz1);
273 dx22 = _mm256_sub_ps(ix2,jx2);
274 dy22 = _mm256_sub_ps(iy2,jy2);
275 dz22 = _mm256_sub_ps(iz2,jz2);
276 dx23 = _mm256_sub_ps(ix2,jx3);
277 dy23 = _mm256_sub_ps(iy2,jy3);
278 dz23 = _mm256_sub_ps(iz2,jz3);
279 dx31 = _mm256_sub_ps(ix3,jx1);
280 dy31 = _mm256_sub_ps(iy3,jy1);
281 dz31 = _mm256_sub_ps(iz3,jz1);
282 dx32 = _mm256_sub_ps(ix3,jx2);
283 dy32 = _mm256_sub_ps(iy3,jy2);
284 dz32 = _mm256_sub_ps(iz3,jz2);
285 dx33 = _mm256_sub_ps(ix3,jx3);
286 dy33 = _mm256_sub_ps(iy3,jy3);
287 dz33 = _mm256_sub_ps(iz3,jz3);
289 /* Calculate squared distance and things based on it */
290 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
291 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
292 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
293 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
294 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
295 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
296 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
297 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
298 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
300 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
301 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
302 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
303 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
304 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
305 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
306 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
307 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
308 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
310 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
311 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
312 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
313 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
314 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
315 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
316 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
317 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
318 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
320 fjx1 = _mm256_setzero_ps();
321 fjy1 = _mm256_setzero_ps();
322 fjz1 = _mm256_setzero_ps();
323 fjx2 = _mm256_setzero_ps();
324 fjy2 = _mm256_setzero_ps();
325 fjz2 = _mm256_setzero_ps();
326 fjx3 = _mm256_setzero_ps();
327 fjy3 = _mm256_setzero_ps();
328 fjz3 = _mm256_setzero_ps();
330 /**************************
331 * CALCULATE INTERACTIONS *
332 **************************/
334 if (gmx_mm256_any_lt(rsq11,rcutoff2))
337 r11 = _mm256_mul_ps(rsq11,rinv11);
339 /* EWALD ELECTROSTATICS */
341 /* Analytical PME correction */
342 zeta2 = _mm256_mul_ps(beta2,rsq11);
343 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
344 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
345 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
346 felec = _mm256_mul_ps(qq11,felec);
347 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
348 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
349 velec = _mm256_sub_ps(rinv11,pmecorrV);
350 velec = _mm256_mul_ps(qq11,velec);
352 d = _mm256_sub_ps(r11,rswitch);
353 d = _mm256_max_ps(d,_mm256_setzero_ps());
354 d2 = _mm256_mul_ps(d,d);
355 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
357 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
359 /* Evaluate switch function */
360 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
361 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
362 velec = _mm256_mul_ps(velec,sw);
363 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
365 /* Update potential sum for this i atom from the interaction with this j atom. */
366 velec = _mm256_and_ps(velec,cutoff_mask);
367 velecsum = _mm256_add_ps(velecsum,velec);
371 fscal = _mm256_and_ps(fscal,cutoff_mask);
373 /* Calculate temporary vectorial force */
374 tx = _mm256_mul_ps(fscal,dx11);
375 ty = _mm256_mul_ps(fscal,dy11);
376 tz = _mm256_mul_ps(fscal,dz11);
378 /* Update vectorial force */
379 fix1 = _mm256_add_ps(fix1,tx);
380 fiy1 = _mm256_add_ps(fiy1,ty);
381 fiz1 = _mm256_add_ps(fiz1,tz);
383 fjx1 = _mm256_add_ps(fjx1,tx);
384 fjy1 = _mm256_add_ps(fjy1,ty);
385 fjz1 = _mm256_add_ps(fjz1,tz);
389 /**************************
390 * CALCULATE INTERACTIONS *
391 **************************/
393 if (gmx_mm256_any_lt(rsq12,rcutoff2))
396 r12 = _mm256_mul_ps(rsq12,rinv12);
398 /* EWALD ELECTROSTATICS */
400 /* Analytical PME correction */
401 zeta2 = _mm256_mul_ps(beta2,rsq12);
402 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
403 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
404 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
405 felec = _mm256_mul_ps(qq12,felec);
406 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
407 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
408 velec = _mm256_sub_ps(rinv12,pmecorrV);
409 velec = _mm256_mul_ps(qq12,velec);
411 d = _mm256_sub_ps(r12,rswitch);
412 d = _mm256_max_ps(d,_mm256_setzero_ps());
413 d2 = _mm256_mul_ps(d,d);
414 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
416 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
418 /* Evaluate switch function */
419 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
420 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
421 velec = _mm256_mul_ps(velec,sw);
422 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
424 /* Update potential sum for this i atom from the interaction with this j atom. */
425 velec = _mm256_and_ps(velec,cutoff_mask);
426 velecsum = _mm256_add_ps(velecsum,velec);
430 fscal = _mm256_and_ps(fscal,cutoff_mask);
432 /* Calculate temporary vectorial force */
433 tx = _mm256_mul_ps(fscal,dx12);
434 ty = _mm256_mul_ps(fscal,dy12);
435 tz = _mm256_mul_ps(fscal,dz12);
437 /* Update vectorial force */
438 fix1 = _mm256_add_ps(fix1,tx);
439 fiy1 = _mm256_add_ps(fiy1,ty);
440 fiz1 = _mm256_add_ps(fiz1,tz);
442 fjx2 = _mm256_add_ps(fjx2,tx);
443 fjy2 = _mm256_add_ps(fjy2,ty);
444 fjz2 = _mm256_add_ps(fjz2,tz);
448 /**************************
449 * CALCULATE INTERACTIONS *
450 **************************/
452 if (gmx_mm256_any_lt(rsq13,rcutoff2))
455 r13 = _mm256_mul_ps(rsq13,rinv13);
457 /* EWALD ELECTROSTATICS */
459 /* Analytical PME correction */
460 zeta2 = _mm256_mul_ps(beta2,rsq13);
461 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
462 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
463 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
464 felec = _mm256_mul_ps(qq13,felec);
465 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
466 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
467 velec = _mm256_sub_ps(rinv13,pmecorrV);
468 velec = _mm256_mul_ps(qq13,velec);
470 d = _mm256_sub_ps(r13,rswitch);
471 d = _mm256_max_ps(d,_mm256_setzero_ps());
472 d2 = _mm256_mul_ps(d,d);
473 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
475 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
477 /* Evaluate switch function */
478 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
479 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
480 velec = _mm256_mul_ps(velec,sw);
481 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velec = _mm256_and_ps(velec,cutoff_mask);
485 velecsum = _mm256_add_ps(velecsum,velec);
489 fscal = _mm256_and_ps(fscal,cutoff_mask);
491 /* Calculate temporary vectorial force */
492 tx = _mm256_mul_ps(fscal,dx13);
493 ty = _mm256_mul_ps(fscal,dy13);
494 tz = _mm256_mul_ps(fscal,dz13);
496 /* Update vectorial force */
497 fix1 = _mm256_add_ps(fix1,tx);
498 fiy1 = _mm256_add_ps(fiy1,ty);
499 fiz1 = _mm256_add_ps(fiz1,tz);
501 fjx3 = _mm256_add_ps(fjx3,tx);
502 fjy3 = _mm256_add_ps(fjy3,ty);
503 fjz3 = _mm256_add_ps(fjz3,tz);
507 /**************************
508 * CALCULATE INTERACTIONS *
509 **************************/
511 if (gmx_mm256_any_lt(rsq21,rcutoff2))
514 r21 = _mm256_mul_ps(rsq21,rinv21);
516 /* EWALD ELECTROSTATICS */
518 /* Analytical PME correction */
519 zeta2 = _mm256_mul_ps(beta2,rsq21);
520 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
521 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
522 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
523 felec = _mm256_mul_ps(qq21,felec);
524 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
525 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
526 velec = _mm256_sub_ps(rinv21,pmecorrV);
527 velec = _mm256_mul_ps(qq21,velec);
529 d = _mm256_sub_ps(r21,rswitch);
530 d = _mm256_max_ps(d,_mm256_setzero_ps());
531 d2 = _mm256_mul_ps(d,d);
532 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
534 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
536 /* Evaluate switch function */
537 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
538 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
539 velec = _mm256_mul_ps(velec,sw);
540 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velec = _mm256_and_ps(velec,cutoff_mask);
544 velecsum = _mm256_add_ps(velecsum,velec);
548 fscal = _mm256_and_ps(fscal,cutoff_mask);
550 /* Calculate temporary vectorial force */
551 tx = _mm256_mul_ps(fscal,dx21);
552 ty = _mm256_mul_ps(fscal,dy21);
553 tz = _mm256_mul_ps(fscal,dz21);
555 /* Update vectorial force */
556 fix2 = _mm256_add_ps(fix2,tx);
557 fiy2 = _mm256_add_ps(fiy2,ty);
558 fiz2 = _mm256_add_ps(fiz2,tz);
560 fjx1 = _mm256_add_ps(fjx1,tx);
561 fjy1 = _mm256_add_ps(fjy1,ty);
562 fjz1 = _mm256_add_ps(fjz1,tz);
566 /**************************
567 * CALCULATE INTERACTIONS *
568 **************************/
570 if (gmx_mm256_any_lt(rsq22,rcutoff2))
573 r22 = _mm256_mul_ps(rsq22,rinv22);
575 /* EWALD ELECTROSTATICS */
577 /* Analytical PME correction */
578 zeta2 = _mm256_mul_ps(beta2,rsq22);
579 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
580 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
581 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
582 felec = _mm256_mul_ps(qq22,felec);
583 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
584 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
585 velec = _mm256_sub_ps(rinv22,pmecorrV);
586 velec = _mm256_mul_ps(qq22,velec);
588 d = _mm256_sub_ps(r22,rswitch);
589 d = _mm256_max_ps(d,_mm256_setzero_ps());
590 d2 = _mm256_mul_ps(d,d);
591 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
593 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
595 /* Evaluate switch function */
596 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
597 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
598 velec = _mm256_mul_ps(velec,sw);
599 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
601 /* Update potential sum for this i atom from the interaction with this j atom. */
602 velec = _mm256_and_ps(velec,cutoff_mask);
603 velecsum = _mm256_add_ps(velecsum,velec);
607 fscal = _mm256_and_ps(fscal,cutoff_mask);
609 /* Calculate temporary vectorial force */
610 tx = _mm256_mul_ps(fscal,dx22);
611 ty = _mm256_mul_ps(fscal,dy22);
612 tz = _mm256_mul_ps(fscal,dz22);
614 /* Update vectorial force */
615 fix2 = _mm256_add_ps(fix2,tx);
616 fiy2 = _mm256_add_ps(fiy2,ty);
617 fiz2 = _mm256_add_ps(fiz2,tz);
619 fjx2 = _mm256_add_ps(fjx2,tx);
620 fjy2 = _mm256_add_ps(fjy2,ty);
621 fjz2 = _mm256_add_ps(fjz2,tz);
625 /**************************
626 * CALCULATE INTERACTIONS *
627 **************************/
629 if (gmx_mm256_any_lt(rsq23,rcutoff2))
632 r23 = _mm256_mul_ps(rsq23,rinv23);
634 /* EWALD ELECTROSTATICS */
636 /* Analytical PME correction */
637 zeta2 = _mm256_mul_ps(beta2,rsq23);
638 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
639 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
640 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
641 felec = _mm256_mul_ps(qq23,felec);
642 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
643 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
644 velec = _mm256_sub_ps(rinv23,pmecorrV);
645 velec = _mm256_mul_ps(qq23,velec);
647 d = _mm256_sub_ps(r23,rswitch);
648 d = _mm256_max_ps(d,_mm256_setzero_ps());
649 d2 = _mm256_mul_ps(d,d);
650 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
652 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
654 /* Evaluate switch function */
655 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
656 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
657 velec = _mm256_mul_ps(velec,sw);
658 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
660 /* Update potential sum for this i atom from the interaction with this j atom. */
661 velec = _mm256_and_ps(velec,cutoff_mask);
662 velecsum = _mm256_add_ps(velecsum,velec);
666 fscal = _mm256_and_ps(fscal,cutoff_mask);
668 /* Calculate temporary vectorial force */
669 tx = _mm256_mul_ps(fscal,dx23);
670 ty = _mm256_mul_ps(fscal,dy23);
671 tz = _mm256_mul_ps(fscal,dz23);
673 /* Update vectorial force */
674 fix2 = _mm256_add_ps(fix2,tx);
675 fiy2 = _mm256_add_ps(fiy2,ty);
676 fiz2 = _mm256_add_ps(fiz2,tz);
678 fjx3 = _mm256_add_ps(fjx3,tx);
679 fjy3 = _mm256_add_ps(fjy3,ty);
680 fjz3 = _mm256_add_ps(fjz3,tz);
684 /**************************
685 * CALCULATE INTERACTIONS *
686 **************************/
688 if (gmx_mm256_any_lt(rsq31,rcutoff2))
691 r31 = _mm256_mul_ps(rsq31,rinv31);
693 /* EWALD ELECTROSTATICS */
695 /* Analytical PME correction */
696 zeta2 = _mm256_mul_ps(beta2,rsq31);
697 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
698 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
699 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
700 felec = _mm256_mul_ps(qq31,felec);
701 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
702 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
703 velec = _mm256_sub_ps(rinv31,pmecorrV);
704 velec = _mm256_mul_ps(qq31,velec);
706 d = _mm256_sub_ps(r31,rswitch);
707 d = _mm256_max_ps(d,_mm256_setzero_ps());
708 d2 = _mm256_mul_ps(d,d);
709 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
711 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
713 /* Evaluate switch function */
714 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
715 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
716 velec = _mm256_mul_ps(velec,sw);
717 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
719 /* Update potential sum for this i atom from the interaction with this j atom. */
720 velec = _mm256_and_ps(velec,cutoff_mask);
721 velecsum = _mm256_add_ps(velecsum,velec);
725 fscal = _mm256_and_ps(fscal,cutoff_mask);
727 /* Calculate temporary vectorial force */
728 tx = _mm256_mul_ps(fscal,dx31);
729 ty = _mm256_mul_ps(fscal,dy31);
730 tz = _mm256_mul_ps(fscal,dz31);
732 /* Update vectorial force */
733 fix3 = _mm256_add_ps(fix3,tx);
734 fiy3 = _mm256_add_ps(fiy3,ty);
735 fiz3 = _mm256_add_ps(fiz3,tz);
737 fjx1 = _mm256_add_ps(fjx1,tx);
738 fjy1 = _mm256_add_ps(fjy1,ty);
739 fjz1 = _mm256_add_ps(fjz1,tz);
743 /**************************
744 * CALCULATE INTERACTIONS *
745 **************************/
747 if (gmx_mm256_any_lt(rsq32,rcutoff2))
750 r32 = _mm256_mul_ps(rsq32,rinv32);
752 /* EWALD ELECTROSTATICS */
754 /* Analytical PME correction */
755 zeta2 = _mm256_mul_ps(beta2,rsq32);
756 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
757 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
758 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
759 felec = _mm256_mul_ps(qq32,felec);
760 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
761 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
762 velec = _mm256_sub_ps(rinv32,pmecorrV);
763 velec = _mm256_mul_ps(qq32,velec);
765 d = _mm256_sub_ps(r32,rswitch);
766 d = _mm256_max_ps(d,_mm256_setzero_ps());
767 d2 = _mm256_mul_ps(d,d);
768 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
770 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
772 /* Evaluate switch function */
773 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
774 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
775 velec = _mm256_mul_ps(velec,sw);
776 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
778 /* Update potential sum for this i atom from the interaction with this j atom. */
779 velec = _mm256_and_ps(velec,cutoff_mask);
780 velecsum = _mm256_add_ps(velecsum,velec);
784 fscal = _mm256_and_ps(fscal,cutoff_mask);
786 /* Calculate temporary vectorial force */
787 tx = _mm256_mul_ps(fscal,dx32);
788 ty = _mm256_mul_ps(fscal,dy32);
789 tz = _mm256_mul_ps(fscal,dz32);
791 /* Update vectorial force */
792 fix3 = _mm256_add_ps(fix3,tx);
793 fiy3 = _mm256_add_ps(fiy3,ty);
794 fiz3 = _mm256_add_ps(fiz3,tz);
796 fjx2 = _mm256_add_ps(fjx2,tx);
797 fjy2 = _mm256_add_ps(fjy2,ty);
798 fjz2 = _mm256_add_ps(fjz2,tz);
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 if (gmx_mm256_any_lt(rsq33,rcutoff2))
809 r33 = _mm256_mul_ps(rsq33,rinv33);
811 /* EWALD ELECTROSTATICS */
813 /* Analytical PME correction */
814 zeta2 = _mm256_mul_ps(beta2,rsq33);
815 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
816 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
817 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
818 felec = _mm256_mul_ps(qq33,felec);
819 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
820 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
821 velec = _mm256_sub_ps(rinv33,pmecorrV);
822 velec = _mm256_mul_ps(qq33,velec);
824 d = _mm256_sub_ps(r33,rswitch);
825 d = _mm256_max_ps(d,_mm256_setzero_ps());
826 d2 = _mm256_mul_ps(d,d);
827 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
829 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
831 /* Evaluate switch function */
832 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
833 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
834 velec = _mm256_mul_ps(velec,sw);
835 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 velec = _mm256_and_ps(velec,cutoff_mask);
839 velecsum = _mm256_add_ps(velecsum,velec);
843 fscal = _mm256_and_ps(fscal,cutoff_mask);
845 /* Calculate temporary vectorial force */
846 tx = _mm256_mul_ps(fscal,dx33);
847 ty = _mm256_mul_ps(fscal,dy33);
848 tz = _mm256_mul_ps(fscal,dz33);
850 /* Update vectorial force */
851 fix3 = _mm256_add_ps(fix3,tx);
852 fiy3 = _mm256_add_ps(fiy3,ty);
853 fiz3 = _mm256_add_ps(fiz3,tz);
855 fjx3 = _mm256_add_ps(fjx3,tx);
856 fjy3 = _mm256_add_ps(fjy3,ty);
857 fjz3 = _mm256_add_ps(fjz3,tz);
861 fjptrA = f+j_coord_offsetA;
862 fjptrB = f+j_coord_offsetB;
863 fjptrC = f+j_coord_offsetC;
864 fjptrD = f+j_coord_offsetD;
865 fjptrE = f+j_coord_offsetE;
866 fjptrF = f+j_coord_offsetF;
867 fjptrG = f+j_coord_offsetG;
868 fjptrH = f+j_coord_offsetH;
870 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
871 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
872 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
874 /* Inner loop uses 972 flops */
880 /* Get j neighbor index, and coordinate index */
881 jnrlistA = jjnr[jidx];
882 jnrlistB = jjnr[jidx+1];
883 jnrlistC = jjnr[jidx+2];
884 jnrlistD = jjnr[jidx+3];
885 jnrlistE = jjnr[jidx+4];
886 jnrlistF = jjnr[jidx+5];
887 jnrlistG = jjnr[jidx+6];
888 jnrlistH = jjnr[jidx+7];
889 /* Sign of each element will be negative for non-real atoms.
890 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
891 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
893 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
894 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
896 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
897 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
898 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
899 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
900 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
901 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
902 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
903 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
904 j_coord_offsetA = DIM*jnrA;
905 j_coord_offsetB = DIM*jnrB;
906 j_coord_offsetC = DIM*jnrC;
907 j_coord_offsetD = DIM*jnrD;
908 j_coord_offsetE = DIM*jnrE;
909 j_coord_offsetF = DIM*jnrF;
910 j_coord_offsetG = DIM*jnrG;
911 j_coord_offsetH = DIM*jnrH;
913 /* load j atom coordinates */
914 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
915 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
916 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
917 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
918 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
920 /* Calculate displacement vector */
921 dx11 = _mm256_sub_ps(ix1,jx1);
922 dy11 = _mm256_sub_ps(iy1,jy1);
923 dz11 = _mm256_sub_ps(iz1,jz1);
924 dx12 = _mm256_sub_ps(ix1,jx2);
925 dy12 = _mm256_sub_ps(iy1,jy2);
926 dz12 = _mm256_sub_ps(iz1,jz2);
927 dx13 = _mm256_sub_ps(ix1,jx3);
928 dy13 = _mm256_sub_ps(iy1,jy3);
929 dz13 = _mm256_sub_ps(iz1,jz3);
930 dx21 = _mm256_sub_ps(ix2,jx1);
931 dy21 = _mm256_sub_ps(iy2,jy1);
932 dz21 = _mm256_sub_ps(iz2,jz1);
933 dx22 = _mm256_sub_ps(ix2,jx2);
934 dy22 = _mm256_sub_ps(iy2,jy2);
935 dz22 = _mm256_sub_ps(iz2,jz2);
936 dx23 = _mm256_sub_ps(ix2,jx3);
937 dy23 = _mm256_sub_ps(iy2,jy3);
938 dz23 = _mm256_sub_ps(iz2,jz3);
939 dx31 = _mm256_sub_ps(ix3,jx1);
940 dy31 = _mm256_sub_ps(iy3,jy1);
941 dz31 = _mm256_sub_ps(iz3,jz1);
942 dx32 = _mm256_sub_ps(ix3,jx2);
943 dy32 = _mm256_sub_ps(iy3,jy2);
944 dz32 = _mm256_sub_ps(iz3,jz2);
945 dx33 = _mm256_sub_ps(ix3,jx3);
946 dy33 = _mm256_sub_ps(iy3,jy3);
947 dz33 = _mm256_sub_ps(iz3,jz3);
949 /* Calculate squared distance and things based on it */
950 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
951 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
952 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
953 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
954 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
955 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
956 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
957 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
958 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
960 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
961 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
962 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
963 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
964 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
965 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
966 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
967 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
968 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
970 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
971 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
972 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
973 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
974 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
975 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
976 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
977 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
978 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
980 fjx1 = _mm256_setzero_ps();
981 fjy1 = _mm256_setzero_ps();
982 fjz1 = _mm256_setzero_ps();
983 fjx2 = _mm256_setzero_ps();
984 fjy2 = _mm256_setzero_ps();
985 fjz2 = _mm256_setzero_ps();
986 fjx3 = _mm256_setzero_ps();
987 fjy3 = _mm256_setzero_ps();
988 fjz3 = _mm256_setzero_ps();
990 /**************************
991 * CALCULATE INTERACTIONS *
992 **************************/
994 if (gmx_mm256_any_lt(rsq11,rcutoff2))
997 r11 = _mm256_mul_ps(rsq11,rinv11);
998 r11 = _mm256_andnot_ps(dummy_mask,r11);
1000 /* EWALD ELECTROSTATICS */
1002 /* Analytical PME correction */
1003 zeta2 = _mm256_mul_ps(beta2,rsq11);
1004 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1005 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1006 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1007 felec = _mm256_mul_ps(qq11,felec);
1008 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1009 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1010 velec = _mm256_sub_ps(rinv11,pmecorrV);
1011 velec = _mm256_mul_ps(qq11,velec);
1013 d = _mm256_sub_ps(r11,rswitch);
1014 d = _mm256_max_ps(d,_mm256_setzero_ps());
1015 d2 = _mm256_mul_ps(d,d);
1016 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1018 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1020 /* Evaluate switch function */
1021 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1022 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1023 velec = _mm256_mul_ps(velec,sw);
1024 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1026 /* Update potential sum for this i atom from the interaction with this j atom. */
1027 velec = _mm256_and_ps(velec,cutoff_mask);
1028 velec = _mm256_andnot_ps(dummy_mask,velec);
1029 velecsum = _mm256_add_ps(velecsum,velec);
1033 fscal = _mm256_and_ps(fscal,cutoff_mask);
1035 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1037 /* Calculate temporary vectorial force */
1038 tx = _mm256_mul_ps(fscal,dx11);
1039 ty = _mm256_mul_ps(fscal,dy11);
1040 tz = _mm256_mul_ps(fscal,dz11);
1042 /* Update vectorial force */
1043 fix1 = _mm256_add_ps(fix1,tx);
1044 fiy1 = _mm256_add_ps(fiy1,ty);
1045 fiz1 = _mm256_add_ps(fiz1,tz);
1047 fjx1 = _mm256_add_ps(fjx1,tx);
1048 fjy1 = _mm256_add_ps(fjy1,ty);
1049 fjz1 = _mm256_add_ps(fjz1,tz);
1053 /**************************
1054 * CALCULATE INTERACTIONS *
1055 **************************/
1057 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1060 r12 = _mm256_mul_ps(rsq12,rinv12);
1061 r12 = _mm256_andnot_ps(dummy_mask,r12);
1063 /* EWALD ELECTROSTATICS */
1065 /* Analytical PME correction */
1066 zeta2 = _mm256_mul_ps(beta2,rsq12);
1067 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1068 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1069 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1070 felec = _mm256_mul_ps(qq12,felec);
1071 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1072 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1073 velec = _mm256_sub_ps(rinv12,pmecorrV);
1074 velec = _mm256_mul_ps(qq12,velec);
1076 d = _mm256_sub_ps(r12,rswitch);
1077 d = _mm256_max_ps(d,_mm256_setzero_ps());
1078 d2 = _mm256_mul_ps(d,d);
1079 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1081 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1083 /* Evaluate switch function */
1084 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1085 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1086 velec = _mm256_mul_ps(velec,sw);
1087 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1089 /* Update potential sum for this i atom from the interaction with this j atom. */
1090 velec = _mm256_and_ps(velec,cutoff_mask);
1091 velec = _mm256_andnot_ps(dummy_mask,velec);
1092 velecsum = _mm256_add_ps(velecsum,velec);
1096 fscal = _mm256_and_ps(fscal,cutoff_mask);
1098 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1100 /* Calculate temporary vectorial force */
1101 tx = _mm256_mul_ps(fscal,dx12);
1102 ty = _mm256_mul_ps(fscal,dy12);
1103 tz = _mm256_mul_ps(fscal,dz12);
1105 /* Update vectorial force */
1106 fix1 = _mm256_add_ps(fix1,tx);
1107 fiy1 = _mm256_add_ps(fiy1,ty);
1108 fiz1 = _mm256_add_ps(fiz1,tz);
1110 fjx2 = _mm256_add_ps(fjx2,tx);
1111 fjy2 = _mm256_add_ps(fjy2,ty);
1112 fjz2 = _mm256_add_ps(fjz2,tz);
1116 /**************************
1117 * CALCULATE INTERACTIONS *
1118 **************************/
1120 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1123 r13 = _mm256_mul_ps(rsq13,rinv13);
1124 r13 = _mm256_andnot_ps(dummy_mask,r13);
1126 /* EWALD ELECTROSTATICS */
1128 /* Analytical PME correction */
1129 zeta2 = _mm256_mul_ps(beta2,rsq13);
1130 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1131 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1132 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1133 felec = _mm256_mul_ps(qq13,felec);
1134 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1135 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1136 velec = _mm256_sub_ps(rinv13,pmecorrV);
1137 velec = _mm256_mul_ps(qq13,velec);
1139 d = _mm256_sub_ps(r13,rswitch);
1140 d = _mm256_max_ps(d,_mm256_setzero_ps());
1141 d2 = _mm256_mul_ps(d,d);
1142 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1144 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1146 /* Evaluate switch function */
1147 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1148 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
1149 velec = _mm256_mul_ps(velec,sw);
1150 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec = _mm256_and_ps(velec,cutoff_mask);
1154 velec = _mm256_andnot_ps(dummy_mask,velec);
1155 velecsum = _mm256_add_ps(velecsum,velec);
1159 fscal = _mm256_and_ps(fscal,cutoff_mask);
1161 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1163 /* Calculate temporary vectorial force */
1164 tx = _mm256_mul_ps(fscal,dx13);
1165 ty = _mm256_mul_ps(fscal,dy13);
1166 tz = _mm256_mul_ps(fscal,dz13);
1168 /* Update vectorial force */
1169 fix1 = _mm256_add_ps(fix1,tx);
1170 fiy1 = _mm256_add_ps(fiy1,ty);
1171 fiz1 = _mm256_add_ps(fiz1,tz);
1173 fjx3 = _mm256_add_ps(fjx3,tx);
1174 fjy3 = _mm256_add_ps(fjy3,ty);
1175 fjz3 = _mm256_add_ps(fjz3,tz);
1179 /**************************
1180 * CALCULATE INTERACTIONS *
1181 **************************/
1183 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1186 r21 = _mm256_mul_ps(rsq21,rinv21);
1187 r21 = _mm256_andnot_ps(dummy_mask,r21);
1189 /* EWALD ELECTROSTATICS */
1191 /* Analytical PME correction */
1192 zeta2 = _mm256_mul_ps(beta2,rsq21);
1193 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1194 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1195 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1196 felec = _mm256_mul_ps(qq21,felec);
1197 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1198 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1199 velec = _mm256_sub_ps(rinv21,pmecorrV);
1200 velec = _mm256_mul_ps(qq21,velec);
1202 d = _mm256_sub_ps(r21,rswitch);
1203 d = _mm256_max_ps(d,_mm256_setzero_ps());
1204 d2 = _mm256_mul_ps(d,d);
1205 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1207 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1209 /* Evaluate switch function */
1210 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1211 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1212 velec = _mm256_mul_ps(velec,sw);
1213 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1215 /* Update potential sum for this i atom from the interaction with this j atom. */
1216 velec = _mm256_and_ps(velec,cutoff_mask);
1217 velec = _mm256_andnot_ps(dummy_mask,velec);
1218 velecsum = _mm256_add_ps(velecsum,velec);
1222 fscal = _mm256_and_ps(fscal,cutoff_mask);
1224 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1226 /* Calculate temporary vectorial force */
1227 tx = _mm256_mul_ps(fscal,dx21);
1228 ty = _mm256_mul_ps(fscal,dy21);
1229 tz = _mm256_mul_ps(fscal,dz21);
1231 /* Update vectorial force */
1232 fix2 = _mm256_add_ps(fix2,tx);
1233 fiy2 = _mm256_add_ps(fiy2,ty);
1234 fiz2 = _mm256_add_ps(fiz2,tz);
1236 fjx1 = _mm256_add_ps(fjx1,tx);
1237 fjy1 = _mm256_add_ps(fjy1,ty);
1238 fjz1 = _mm256_add_ps(fjz1,tz);
1242 /**************************
1243 * CALCULATE INTERACTIONS *
1244 **************************/
1246 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1249 r22 = _mm256_mul_ps(rsq22,rinv22);
1250 r22 = _mm256_andnot_ps(dummy_mask,r22);
1252 /* EWALD ELECTROSTATICS */
1254 /* Analytical PME correction */
1255 zeta2 = _mm256_mul_ps(beta2,rsq22);
1256 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1257 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1258 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1259 felec = _mm256_mul_ps(qq22,felec);
1260 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1261 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1262 velec = _mm256_sub_ps(rinv22,pmecorrV);
1263 velec = _mm256_mul_ps(qq22,velec);
1265 d = _mm256_sub_ps(r22,rswitch);
1266 d = _mm256_max_ps(d,_mm256_setzero_ps());
1267 d2 = _mm256_mul_ps(d,d);
1268 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1270 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1272 /* Evaluate switch function */
1273 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1274 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1275 velec = _mm256_mul_ps(velec,sw);
1276 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1278 /* Update potential sum for this i atom from the interaction with this j atom. */
1279 velec = _mm256_and_ps(velec,cutoff_mask);
1280 velec = _mm256_andnot_ps(dummy_mask,velec);
1281 velecsum = _mm256_add_ps(velecsum,velec);
1285 fscal = _mm256_and_ps(fscal,cutoff_mask);
1287 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1289 /* Calculate temporary vectorial force */
1290 tx = _mm256_mul_ps(fscal,dx22);
1291 ty = _mm256_mul_ps(fscal,dy22);
1292 tz = _mm256_mul_ps(fscal,dz22);
1294 /* Update vectorial force */
1295 fix2 = _mm256_add_ps(fix2,tx);
1296 fiy2 = _mm256_add_ps(fiy2,ty);
1297 fiz2 = _mm256_add_ps(fiz2,tz);
1299 fjx2 = _mm256_add_ps(fjx2,tx);
1300 fjy2 = _mm256_add_ps(fjy2,ty);
1301 fjz2 = _mm256_add_ps(fjz2,tz);
1305 /**************************
1306 * CALCULATE INTERACTIONS *
1307 **************************/
1309 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1312 r23 = _mm256_mul_ps(rsq23,rinv23);
1313 r23 = _mm256_andnot_ps(dummy_mask,r23);
1315 /* EWALD ELECTROSTATICS */
1317 /* Analytical PME correction */
1318 zeta2 = _mm256_mul_ps(beta2,rsq23);
1319 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1320 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1321 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1322 felec = _mm256_mul_ps(qq23,felec);
1323 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1324 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1325 velec = _mm256_sub_ps(rinv23,pmecorrV);
1326 velec = _mm256_mul_ps(qq23,velec);
1328 d = _mm256_sub_ps(r23,rswitch);
1329 d = _mm256_max_ps(d,_mm256_setzero_ps());
1330 d2 = _mm256_mul_ps(d,d);
1331 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1333 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1335 /* Evaluate switch function */
1336 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1337 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
1338 velec = _mm256_mul_ps(velec,sw);
1339 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1341 /* Update potential sum for this i atom from the interaction with this j atom. */
1342 velec = _mm256_and_ps(velec,cutoff_mask);
1343 velec = _mm256_andnot_ps(dummy_mask,velec);
1344 velecsum = _mm256_add_ps(velecsum,velec);
1348 fscal = _mm256_and_ps(fscal,cutoff_mask);
1350 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1352 /* Calculate temporary vectorial force */
1353 tx = _mm256_mul_ps(fscal,dx23);
1354 ty = _mm256_mul_ps(fscal,dy23);
1355 tz = _mm256_mul_ps(fscal,dz23);
1357 /* Update vectorial force */
1358 fix2 = _mm256_add_ps(fix2,tx);
1359 fiy2 = _mm256_add_ps(fiy2,ty);
1360 fiz2 = _mm256_add_ps(fiz2,tz);
1362 fjx3 = _mm256_add_ps(fjx3,tx);
1363 fjy3 = _mm256_add_ps(fjy3,ty);
1364 fjz3 = _mm256_add_ps(fjz3,tz);
1368 /**************************
1369 * CALCULATE INTERACTIONS *
1370 **************************/
1372 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1375 r31 = _mm256_mul_ps(rsq31,rinv31);
1376 r31 = _mm256_andnot_ps(dummy_mask,r31);
1378 /* EWALD ELECTROSTATICS */
1380 /* Analytical PME correction */
1381 zeta2 = _mm256_mul_ps(beta2,rsq31);
1382 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1383 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1384 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1385 felec = _mm256_mul_ps(qq31,felec);
1386 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1387 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1388 velec = _mm256_sub_ps(rinv31,pmecorrV);
1389 velec = _mm256_mul_ps(qq31,velec);
1391 d = _mm256_sub_ps(r31,rswitch);
1392 d = _mm256_max_ps(d,_mm256_setzero_ps());
1393 d2 = _mm256_mul_ps(d,d);
1394 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1396 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1398 /* Evaluate switch function */
1399 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1400 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
1401 velec = _mm256_mul_ps(velec,sw);
1402 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1404 /* Update potential sum for this i atom from the interaction with this j atom. */
1405 velec = _mm256_and_ps(velec,cutoff_mask);
1406 velec = _mm256_andnot_ps(dummy_mask,velec);
1407 velecsum = _mm256_add_ps(velecsum,velec);
1411 fscal = _mm256_and_ps(fscal,cutoff_mask);
1413 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1415 /* Calculate temporary vectorial force */
1416 tx = _mm256_mul_ps(fscal,dx31);
1417 ty = _mm256_mul_ps(fscal,dy31);
1418 tz = _mm256_mul_ps(fscal,dz31);
1420 /* Update vectorial force */
1421 fix3 = _mm256_add_ps(fix3,tx);
1422 fiy3 = _mm256_add_ps(fiy3,ty);
1423 fiz3 = _mm256_add_ps(fiz3,tz);
1425 fjx1 = _mm256_add_ps(fjx1,tx);
1426 fjy1 = _mm256_add_ps(fjy1,ty);
1427 fjz1 = _mm256_add_ps(fjz1,tz);
1431 /**************************
1432 * CALCULATE INTERACTIONS *
1433 **************************/
1435 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1438 r32 = _mm256_mul_ps(rsq32,rinv32);
1439 r32 = _mm256_andnot_ps(dummy_mask,r32);
1441 /* EWALD ELECTROSTATICS */
1443 /* Analytical PME correction */
1444 zeta2 = _mm256_mul_ps(beta2,rsq32);
1445 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1446 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1447 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1448 felec = _mm256_mul_ps(qq32,felec);
1449 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1450 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1451 velec = _mm256_sub_ps(rinv32,pmecorrV);
1452 velec = _mm256_mul_ps(qq32,velec);
1454 d = _mm256_sub_ps(r32,rswitch);
1455 d = _mm256_max_ps(d,_mm256_setzero_ps());
1456 d2 = _mm256_mul_ps(d,d);
1457 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1459 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1461 /* Evaluate switch function */
1462 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1463 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
1464 velec = _mm256_mul_ps(velec,sw);
1465 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1467 /* Update potential sum for this i atom from the interaction with this j atom. */
1468 velec = _mm256_and_ps(velec,cutoff_mask);
1469 velec = _mm256_andnot_ps(dummy_mask,velec);
1470 velecsum = _mm256_add_ps(velecsum,velec);
1474 fscal = _mm256_and_ps(fscal,cutoff_mask);
1476 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1478 /* Calculate temporary vectorial force */
1479 tx = _mm256_mul_ps(fscal,dx32);
1480 ty = _mm256_mul_ps(fscal,dy32);
1481 tz = _mm256_mul_ps(fscal,dz32);
1483 /* Update vectorial force */
1484 fix3 = _mm256_add_ps(fix3,tx);
1485 fiy3 = _mm256_add_ps(fiy3,ty);
1486 fiz3 = _mm256_add_ps(fiz3,tz);
1488 fjx2 = _mm256_add_ps(fjx2,tx);
1489 fjy2 = _mm256_add_ps(fjy2,ty);
1490 fjz2 = _mm256_add_ps(fjz2,tz);
1494 /**************************
1495 * CALCULATE INTERACTIONS *
1496 **************************/
1498 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1501 r33 = _mm256_mul_ps(rsq33,rinv33);
1502 r33 = _mm256_andnot_ps(dummy_mask,r33);
1504 /* EWALD ELECTROSTATICS */
1506 /* Analytical PME correction */
1507 zeta2 = _mm256_mul_ps(beta2,rsq33);
1508 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1509 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1510 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1511 felec = _mm256_mul_ps(qq33,felec);
1512 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1513 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1514 velec = _mm256_sub_ps(rinv33,pmecorrV);
1515 velec = _mm256_mul_ps(qq33,velec);
1517 d = _mm256_sub_ps(r33,rswitch);
1518 d = _mm256_max_ps(d,_mm256_setzero_ps());
1519 d2 = _mm256_mul_ps(d,d);
1520 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1522 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1524 /* Evaluate switch function */
1525 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1526 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
1527 velec = _mm256_mul_ps(velec,sw);
1528 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1530 /* Update potential sum for this i atom from the interaction with this j atom. */
1531 velec = _mm256_and_ps(velec,cutoff_mask);
1532 velec = _mm256_andnot_ps(dummy_mask,velec);
1533 velecsum = _mm256_add_ps(velecsum,velec);
1537 fscal = _mm256_and_ps(fscal,cutoff_mask);
1539 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1541 /* Calculate temporary vectorial force */
1542 tx = _mm256_mul_ps(fscal,dx33);
1543 ty = _mm256_mul_ps(fscal,dy33);
1544 tz = _mm256_mul_ps(fscal,dz33);
1546 /* Update vectorial force */
1547 fix3 = _mm256_add_ps(fix3,tx);
1548 fiy3 = _mm256_add_ps(fiy3,ty);
1549 fiz3 = _mm256_add_ps(fiz3,tz);
1551 fjx3 = _mm256_add_ps(fjx3,tx);
1552 fjy3 = _mm256_add_ps(fjy3,ty);
1553 fjz3 = _mm256_add_ps(fjz3,tz);
1557 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1558 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1559 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1560 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1561 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1562 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1563 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1564 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1566 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1567 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1568 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1570 /* Inner loop uses 981 flops */
1573 /* End of innermost loop */
1575 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1576 f+i_coord_offset+DIM,fshift+i_shift_offset);
1579 /* Update potential energies */
1580 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1582 /* Increment number of inner iterations */
1583 inneriter += j_index_end - j_index_start;
1585 /* Outer loop uses 19 flops */
1588 /* Increment number of outer iterations */
1591 /* Update outer/inner flops */
1593 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*981);
1596 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1597 * Electrostatics interaction: Ewald
1598 * VdW interaction: None
1599 * Geometry: Water4-Water4
1600 * Calculate force/pot: Force
1603 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1604 (t_nblist * gmx_restrict nlist,
1605 rvec * gmx_restrict xx,
1606 rvec * gmx_restrict ff,
1607 t_forcerec * gmx_restrict fr,
1608 t_mdatoms * gmx_restrict mdatoms,
1609 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1610 t_nrnb * gmx_restrict nrnb)
1612 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1613 * just 0 for non-waters.
1614 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1615 * jnr indices corresponding to data put in the four positions in the SIMD register.
1617 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1618 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1619 int jnrA,jnrB,jnrC,jnrD;
1620 int jnrE,jnrF,jnrG,jnrH;
1621 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1622 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1623 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1624 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1625 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1626 real rcutoff_scalar;
1627 real *shiftvec,*fshift,*x,*f;
1628 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1629 real scratch[4*DIM];
1630 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1631 real * vdwioffsetptr1;
1632 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1633 real * vdwioffsetptr2;
1634 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1635 real * vdwioffsetptr3;
1636 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1637 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1638 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1639 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1640 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1641 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1642 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1643 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1644 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1645 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1646 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1647 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1648 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1649 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1650 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1651 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1652 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1655 __m128i ewitab_lo,ewitab_hi;
1656 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1657 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1659 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1660 real rswitch_scalar,d_scalar;
1661 __m256 dummy_mask,cutoff_mask;
1662 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1663 __m256 one = _mm256_set1_ps(1.0);
1664 __m256 two = _mm256_set1_ps(2.0);
1670 jindex = nlist->jindex;
1672 shiftidx = nlist->shift;
1674 shiftvec = fr->shift_vec[0];
1675 fshift = fr->fshift[0];
1676 facel = _mm256_set1_ps(fr->epsfac);
1677 charge = mdatoms->chargeA;
1679 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1680 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1681 beta2 = _mm256_mul_ps(beta,beta);
1682 beta3 = _mm256_mul_ps(beta,beta2);
1684 ewtab = fr->ic->tabq_coul_FDV0;
1685 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1686 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1688 /* Setup water-specific parameters */
1689 inr = nlist->iinr[0];
1690 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1691 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1692 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1694 jq1 = _mm256_set1_ps(charge[inr+1]);
1695 jq2 = _mm256_set1_ps(charge[inr+2]);
1696 jq3 = _mm256_set1_ps(charge[inr+3]);
1697 qq11 = _mm256_mul_ps(iq1,jq1);
1698 qq12 = _mm256_mul_ps(iq1,jq2);
1699 qq13 = _mm256_mul_ps(iq1,jq3);
1700 qq21 = _mm256_mul_ps(iq2,jq1);
1701 qq22 = _mm256_mul_ps(iq2,jq2);
1702 qq23 = _mm256_mul_ps(iq2,jq3);
1703 qq31 = _mm256_mul_ps(iq3,jq1);
1704 qq32 = _mm256_mul_ps(iq3,jq2);
1705 qq33 = _mm256_mul_ps(iq3,jq3);
1707 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1708 rcutoff_scalar = fr->rcoulomb;
1709 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1710 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1712 rswitch_scalar = fr->rcoulomb_switch;
1713 rswitch = _mm256_set1_ps(rswitch_scalar);
1714 /* Setup switch parameters */
1715 d_scalar = rcutoff_scalar-rswitch_scalar;
1716 d = _mm256_set1_ps(d_scalar);
1717 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1718 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1719 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1720 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1721 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1722 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1724 /* Avoid stupid compiler warnings */
1725 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1726 j_coord_offsetA = 0;
1727 j_coord_offsetB = 0;
1728 j_coord_offsetC = 0;
1729 j_coord_offsetD = 0;
1730 j_coord_offsetE = 0;
1731 j_coord_offsetF = 0;
1732 j_coord_offsetG = 0;
1733 j_coord_offsetH = 0;
1738 for(iidx=0;iidx<4*DIM;iidx++)
1740 scratch[iidx] = 0.0;
1743 /* Start outer loop over neighborlists */
1744 for(iidx=0; iidx<nri; iidx++)
1746 /* Load shift vector for this list */
1747 i_shift_offset = DIM*shiftidx[iidx];
1749 /* Load limits for loop over neighbors */
1750 j_index_start = jindex[iidx];
1751 j_index_end = jindex[iidx+1];
1753 /* Get outer coordinate index */
1755 i_coord_offset = DIM*inr;
1757 /* Load i particle coords and add shift vector */
1758 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1759 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1761 fix1 = _mm256_setzero_ps();
1762 fiy1 = _mm256_setzero_ps();
1763 fiz1 = _mm256_setzero_ps();
1764 fix2 = _mm256_setzero_ps();
1765 fiy2 = _mm256_setzero_ps();
1766 fiz2 = _mm256_setzero_ps();
1767 fix3 = _mm256_setzero_ps();
1768 fiy3 = _mm256_setzero_ps();
1769 fiz3 = _mm256_setzero_ps();
1771 /* Start inner kernel loop */
1772 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1775 /* Get j neighbor index, and coordinate index */
1777 jnrB = jjnr[jidx+1];
1778 jnrC = jjnr[jidx+2];
1779 jnrD = jjnr[jidx+3];
1780 jnrE = jjnr[jidx+4];
1781 jnrF = jjnr[jidx+5];
1782 jnrG = jjnr[jidx+6];
1783 jnrH = jjnr[jidx+7];
1784 j_coord_offsetA = DIM*jnrA;
1785 j_coord_offsetB = DIM*jnrB;
1786 j_coord_offsetC = DIM*jnrC;
1787 j_coord_offsetD = DIM*jnrD;
1788 j_coord_offsetE = DIM*jnrE;
1789 j_coord_offsetF = DIM*jnrF;
1790 j_coord_offsetG = DIM*jnrG;
1791 j_coord_offsetH = DIM*jnrH;
1793 /* load j atom coordinates */
1794 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1795 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1796 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1797 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1798 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1800 /* Calculate displacement vector */
1801 dx11 = _mm256_sub_ps(ix1,jx1);
1802 dy11 = _mm256_sub_ps(iy1,jy1);
1803 dz11 = _mm256_sub_ps(iz1,jz1);
1804 dx12 = _mm256_sub_ps(ix1,jx2);
1805 dy12 = _mm256_sub_ps(iy1,jy2);
1806 dz12 = _mm256_sub_ps(iz1,jz2);
1807 dx13 = _mm256_sub_ps(ix1,jx3);
1808 dy13 = _mm256_sub_ps(iy1,jy3);
1809 dz13 = _mm256_sub_ps(iz1,jz3);
1810 dx21 = _mm256_sub_ps(ix2,jx1);
1811 dy21 = _mm256_sub_ps(iy2,jy1);
1812 dz21 = _mm256_sub_ps(iz2,jz1);
1813 dx22 = _mm256_sub_ps(ix2,jx2);
1814 dy22 = _mm256_sub_ps(iy2,jy2);
1815 dz22 = _mm256_sub_ps(iz2,jz2);
1816 dx23 = _mm256_sub_ps(ix2,jx3);
1817 dy23 = _mm256_sub_ps(iy2,jy3);
1818 dz23 = _mm256_sub_ps(iz2,jz3);
1819 dx31 = _mm256_sub_ps(ix3,jx1);
1820 dy31 = _mm256_sub_ps(iy3,jy1);
1821 dz31 = _mm256_sub_ps(iz3,jz1);
1822 dx32 = _mm256_sub_ps(ix3,jx2);
1823 dy32 = _mm256_sub_ps(iy3,jy2);
1824 dz32 = _mm256_sub_ps(iz3,jz2);
1825 dx33 = _mm256_sub_ps(ix3,jx3);
1826 dy33 = _mm256_sub_ps(iy3,jy3);
1827 dz33 = _mm256_sub_ps(iz3,jz3);
1829 /* Calculate squared distance and things based on it */
1830 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1831 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1832 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1833 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1834 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1835 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1836 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1837 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1838 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1840 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1841 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1842 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1843 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1844 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1845 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1846 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1847 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1848 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1850 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1851 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1852 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1853 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1854 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1855 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1856 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1857 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1858 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1860 fjx1 = _mm256_setzero_ps();
1861 fjy1 = _mm256_setzero_ps();
1862 fjz1 = _mm256_setzero_ps();
1863 fjx2 = _mm256_setzero_ps();
1864 fjy2 = _mm256_setzero_ps();
1865 fjz2 = _mm256_setzero_ps();
1866 fjx3 = _mm256_setzero_ps();
1867 fjy3 = _mm256_setzero_ps();
1868 fjz3 = _mm256_setzero_ps();
1870 /**************************
1871 * CALCULATE INTERACTIONS *
1872 **************************/
1874 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1877 r11 = _mm256_mul_ps(rsq11,rinv11);
1879 /* EWALD ELECTROSTATICS */
1881 /* Analytical PME correction */
1882 zeta2 = _mm256_mul_ps(beta2,rsq11);
1883 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1884 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1885 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1886 felec = _mm256_mul_ps(qq11,felec);
1887 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1888 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1889 velec = _mm256_sub_ps(rinv11,pmecorrV);
1890 velec = _mm256_mul_ps(qq11,velec);
1892 d = _mm256_sub_ps(r11,rswitch);
1893 d = _mm256_max_ps(d,_mm256_setzero_ps());
1894 d2 = _mm256_mul_ps(d,d);
1895 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1897 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1899 /* Evaluate switch function */
1900 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1901 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1902 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1906 fscal = _mm256_and_ps(fscal,cutoff_mask);
1908 /* Calculate temporary vectorial force */
1909 tx = _mm256_mul_ps(fscal,dx11);
1910 ty = _mm256_mul_ps(fscal,dy11);
1911 tz = _mm256_mul_ps(fscal,dz11);
1913 /* Update vectorial force */
1914 fix1 = _mm256_add_ps(fix1,tx);
1915 fiy1 = _mm256_add_ps(fiy1,ty);
1916 fiz1 = _mm256_add_ps(fiz1,tz);
1918 fjx1 = _mm256_add_ps(fjx1,tx);
1919 fjy1 = _mm256_add_ps(fjy1,ty);
1920 fjz1 = _mm256_add_ps(fjz1,tz);
1924 /**************************
1925 * CALCULATE INTERACTIONS *
1926 **************************/
1928 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1931 r12 = _mm256_mul_ps(rsq12,rinv12);
1933 /* EWALD ELECTROSTATICS */
1935 /* Analytical PME correction */
1936 zeta2 = _mm256_mul_ps(beta2,rsq12);
1937 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1938 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1939 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1940 felec = _mm256_mul_ps(qq12,felec);
1941 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1942 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1943 velec = _mm256_sub_ps(rinv12,pmecorrV);
1944 velec = _mm256_mul_ps(qq12,velec);
1946 d = _mm256_sub_ps(r12,rswitch);
1947 d = _mm256_max_ps(d,_mm256_setzero_ps());
1948 d2 = _mm256_mul_ps(d,d);
1949 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1951 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1953 /* Evaluate switch function */
1954 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1955 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1956 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1960 fscal = _mm256_and_ps(fscal,cutoff_mask);
1962 /* Calculate temporary vectorial force */
1963 tx = _mm256_mul_ps(fscal,dx12);
1964 ty = _mm256_mul_ps(fscal,dy12);
1965 tz = _mm256_mul_ps(fscal,dz12);
1967 /* Update vectorial force */
1968 fix1 = _mm256_add_ps(fix1,tx);
1969 fiy1 = _mm256_add_ps(fiy1,ty);
1970 fiz1 = _mm256_add_ps(fiz1,tz);
1972 fjx2 = _mm256_add_ps(fjx2,tx);
1973 fjy2 = _mm256_add_ps(fjy2,ty);
1974 fjz2 = _mm256_add_ps(fjz2,tz);
1978 /**************************
1979 * CALCULATE INTERACTIONS *
1980 **************************/
1982 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1985 r13 = _mm256_mul_ps(rsq13,rinv13);
1987 /* EWALD ELECTROSTATICS */
1989 /* Analytical PME correction */
1990 zeta2 = _mm256_mul_ps(beta2,rsq13);
1991 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1992 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1993 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1994 felec = _mm256_mul_ps(qq13,felec);
1995 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1996 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1997 velec = _mm256_sub_ps(rinv13,pmecorrV);
1998 velec = _mm256_mul_ps(qq13,velec);
2000 d = _mm256_sub_ps(r13,rswitch);
2001 d = _mm256_max_ps(d,_mm256_setzero_ps());
2002 d2 = _mm256_mul_ps(d,d);
2003 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2005 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2007 /* Evaluate switch function */
2008 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2009 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2010 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2014 fscal = _mm256_and_ps(fscal,cutoff_mask);
2016 /* Calculate temporary vectorial force */
2017 tx = _mm256_mul_ps(fscal,dx13);
2018 ty = _mm256_mul_ps(fscal,dy13);
2019 tz = _mm256_mul_ps(fscal,dz13);
2021 /* Update vectorial force */
2022 fix1 = _mm256_add_ps(fix1,tx);
2023 fiy1 = _mm256_add_ps(fiy1,ty);
2024 fiz1 = _mm256_add_ps(fiz1,tz);
2026 fjx3 = _mm256_add_ps(fjx3,tx);
2027 fjy3 = _mm256_add_ps(fjy3,ty);
2028 fjz3 = _mm256_add_ps(fjz3,tz);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2039 r21 = _mm256_mul_ps(rsq21,rinv21);
2041 /* EWALD ELECTROSTATICS */
2043 /* Analytical PME correction */
2044 zeta2 = _mm256_mul_ps(beta2,rsq21);
2045 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2046 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2047 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2048 felec = _mm256_mul_ps(qq21,felec);
2049 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2050 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2051 velec = _mm256_sub_ps(rinv21,pmecorrV);
2052 velec = _mm256_mul_ps(qq21,velec);
2054 d = _mm256_sub_ps(r21,rswitch);
2055 d = _mm256_max_ps(d,_mm256_setzero_ps());
2056 d2 = _mm256_mul_ps(d,d);
2057 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2059 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2061 /* Evaluate switch function */
2062 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2063 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2064 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2068 fscal = _mm256_and_ps(fscal,cutoff_mask);
2070 /* Calculate temporary vectorial force */
2071 tx = _mm256_mul_ps(fscal,dx21);
2072 ty = _mm256_mul_ps(fscal,dy21);
2073 tz = _mm256_mul_ps(fscal,dz21);
2075 /* Update vectorial force */
2076 fix2 = _mm256_add_ps(fix2,tx);
2077 fiy2 = _mm256_add_ps(fiy2,ty);
2078 fiz2 = _mm256_add_ps(fiz2,tz);
2080 fjx1 = _mm256_add_ps(fjx1,tx);
2081 fjy1 = _mm256_add_ps(fjy1,ty);
2082 fjz1 = _mm256_add_ps(fjz1,tz);
2086 /**************************
2087 * CALCULATE INTERACTIONS *
2088 **************************/
2090 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2093 r22 = _mm256_mul_ps(rsq22,rinv22);
2095 /* EWALD ELECTROSTATICS */
2097 /* Analytical PME correction */
2098 zeta2 = _mm256_mul_ps(beta2,rsq22);
2099 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2100 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2101 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2102 felec = _mm256_mul_ps(qq22,felec);
2103 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2104 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2105 velec = _mm256_sub_ps(rinv22,pmecorrV);
2106 velec = _mm256_mul_ps(qq22,velec);
2108 d = _mm256_sub_ps(r22,rswitch);
2109 d = _mm256_max_ps(d,_mm256_setzero_ps());
2110 d2 = _mm256_mul_ps(d,d);
2111 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2113 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2115 /* Evaluate switch function */
2116 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2117 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2118 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2122 fscal = _mm256_and_ps(fscal,cutoff_mask);
2124 /* Calculate temporary vectorial force */
2125 tx = _mm256_mul_ps(fscal,dx22);
2126 ty = _mm256_mul_ps(fscal,dy22);
2127 tz = _mm256_mul_ps(fscal,dz22);
2129 /* Update vectorial force */
2130 fix2 = _mm256_add_ps(fix2,tx);
2131 fiy2 = _mm256_add_ps(fiy2,ty);
2132 fiz2 = _mm256_add_ps(fiz2,tz);
2134 fjx2 = _mm256_add_ps(fjx2,tx);
2135 fjy2 = _mm256_add_ps(fjy2,ty);
2136 fjz2 = _mm256_add_ps(fjz2,tz);
2140 /**************************
2141 * CALCULATE INTERACTIONS *
2142 **************************/
2144 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2147 r23 = _mm256_mul_ps(rsq23,rinv23);
2149 /* EWALD ELECTROSTATICS */
2151 /* Analytical PME correction */
2152 zeta2 = _mm256_mul_ps(beta2,rsq23);
2153 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2154 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2155 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2156 felec = _mm256_mul_ps(qq23,felec);
2157 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2158 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2159 velec = _mm256_sub_ps(rinv23,pmecorrV);
2160 velec = _mm256_mul_ps(qq23,velec);
2162 d = _mm256_sub_ps(r23,rswitch);
2163 d = _mm256_max_ps(d,_mm256_setzero_ps());
2164 d2 = _mm256_mul_ps(d,d);
2165 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2167 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2169 /* Evaluate switch function */
2170 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2171 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2172 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2176 fscal = _mm256_and_ps(fscal,cutoff_mask);
2178 /* Calculate temporary vectorial force */
2179 tx = _mm256_mul_ps(fscal,dx23);
2180 ty = _mm256_mul_ps(fscal,dy23);
2181 tz = _mm256_mul_ps(fscal,dz23);
2183 /* Update vectorial force */
2184 fix2 = _mm256_add_ps(fix2,tx);
2185 fiy2 = _mm256_add_ps(fiy2,ty);
2186 fiz2 = _mm256_add_ps(fiz2,tz);
2188 fjx3 = _mm256_add_ps(fjx3,tx);
2189 fjy3 = _mm256_add_ps(fjy3,ty);
2190 fjz3 = _mm256_add_ps(fjz3,tz);
2194 /**************************
2195 * CALCULATE INTERACTIONS *
2196 **************************/
2198 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2201 r31 = _mm256_mul_ps(rsq31,rinv31);
2203 /* EWALD ELECTROSTATICS */
2205 /* Analytical PME correction */
2206 zeta2 = _mm256_mul_ps(beta2,rsq31);
2207 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2208 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2209 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2210 felec = _mm256_mul_ps(qq31,felec);
2211 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2212 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2213 velec = _mm256_sub_ps(rinv31,pmecorrV);
2214 velec = _mm256_mul_ps(qq31,velec);
2216 d = _mm256_sub_ps(r31,rswitch);
2217 d = _mm256_max_ps(d,_mm256_setzero_ps());
2218 d2 = _mm256_mul_ps(d,d);
2219 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2221 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2223 /* Evaluate switch function */
2224 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2225 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2226 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2230 fscal = _mm256_and_ps(fscal,cutoff_mask);
2232 /* Calculate temporary vectorial force */
2233 tx = _mm256_mul_ps(fscal,dx31);
2234 ty = _mm256_mul_ps(fscal,dy31);
2235 tz = _mm256_mul_ps(fscal,dz31);
2237 /* Update vectorial force */
2238 fix3 = _mm256_add_ps(fix3,tx);
2239 fiy3 = _mm256_add_ps(fiy3,ty);
2240 fiz3 = _mm256_add_ps(fiz3,tz);
2242 fjx1 = _mm256_add_ps(fjx1,tx);
2243 fjy1 = _mm256_add_ps(fjy1,ty);
2244 fjz1 = _mm256_add_ps(fjz1,tz);
2248 /**************************
2249 * CALCULATE INTERACTIONS *
2250 **************************/
2252 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2255 r32 = _mm256_mul_ps(rsq32,rinv32);
2257 /* EWALD ELECTROSTATICS */
2259 /* Analytical PME correction */
2260 zeta2 = _mm256_mul_ps(beta2,rsq32);
2261 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2262 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2263 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2264 felec = _mm256_mul_ps(qq32,felec);
2265 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2266 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2267 velec = _mm256_sub_ps(rinv32,pmecorrV);
2268 velec = _mm256_mul_ps(qq32,velec);
2270 d = _mm256_sub_ps(r32,rswitch);
2271 d = _mm256_max_ps(d,_mm256_setzero_ps());
2272 d2 = _mm256_mul_ps(d,d);
2273 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2275 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2277 /* Evaluate switch function */
2278 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2279 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2280 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2284 fscal = _mm256_and_ps(fscal,cutoff_mask);
2286 /* Calculate temporary vectorial force */
2287 tx = _mm256_mul_ps(fscal,dx32);
2288 ty = _mm256_mul_ps(fscal,dy32);
2289 tz = _mm256_mul_ps(fscal,dz32);
2291 /* Update vectorial force */
2292 fix3 = _mm256_add_ps(fix3,tx);
2293 fiy3 = _mm256_add_ps(fiy3,ty);
2294 fiz3 = _mm256_add_ps(fiz3,tz);
2296 fjx2 = _mm256_add_ps(fjx2,tx);
2297 fjy2 = _mm256_add_ps(fjy2,ty);
2298 fjz2 = _mm256_add_ps(fjz2,tz);
2302 /**************************
2303 * CALCULATE INTERACTIONS *
2304 **************************/
2306 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2309 r33 = _mm256_mul_ps(rsq33,rinv33);
2311 /* EWALD ELECTROSTATICS */
2313 /* Analytical PME correction */
2314 zeta2 = _mm256_mul_ps(beta2,rsq33);
2315 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2316 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2317 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2318 felec = _mm256_mul_ps(qq33,felec);
2319 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2320 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2321 velec = _mm256_sub_ps(rinv33,pmecorrV);
2322 velec = _mm256_mul_ps(qq33,velec);
2324 d = _mm256_sub_ps(r33,rswitch);
2325 d = _mm256_max_ps(d,_mm256_setzero_ps());
2326 d2 = _mm256_mul_ps(d,d);
2327 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2329 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2331 /* Evaluate switch function */
2332 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2333 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2334 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2338 fscal = _mm256_and_ps(fscal,cutoff_mask);
2340 /* Calculate temporary vectorial force */
2341 tx = _mm256_mul_ps(fscal,dx33);
2342 ty = _mm256_mul_ps(fscal,dy33);
2343 tz = _mm256_mul_ps(fscal,dz33);
2345 /* Update vectorial force */
2346 fix3 = _mm256_add_ps(fix3,tx);
2347 fiy3 = _mm256_add_ps(fiy3,ty);
2348 fiz3 = _mm256_add_ps(fiz3,tz);
2350 fjx3 = _mm256_add_ps(fjx3,tx);
2351 fjy3 = _mm256_add_ps(fjy3,ty);
2352 fjz3 = _mm256_add_ps(fjz3,tz);
2356 fjptrA = f+j_coord_offsetA;
2357 fjptrB = f+j_coord_offsetB;
2358 fjptrC = f+j_coord_offsetC;
2359 fjptrD = f+j_coord_offsetD;
2360 fjptrE = f+j_coord_offsetE;
2361 fjptrF = f+j_coord_offsetF;
2362 fjptrG = f+j_coord_offsetG;
2363 fjptrH = f+j_coord_offsetH;
2365 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2366 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2367 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2369 /* Inner loop uses 945 flops */
2372 if(jidx<j_index_end)
2375 /* Get j neighbor index, and coordinate index */
2376 jnrlistA = jjnr[jidx];
2377 jnrlistB = jjnr[jidx+1];
2378 jnrlistC = jjnr[jidx+2];
2379 jnrlistD = jjnr[jidx+3];
2380 jnrlistE = jjnr[jidx+4];
2381 jnrlistF = jjnr[jidx+5];
2382 jnrlistG = jjnr[jidx+6];
2383 jnrlistH = jjnr[jidx+7];
2384 /* Sign of each element will be negative for non-real atoms.
2385 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2386 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2388 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2389 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2391 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2392 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2393 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2394 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2395 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2396 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2397 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2398 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2399 j_coord_offsetA = DIM*jnrA;
2400 j_coord_offsetB = DIM*jnrB;
2401 j_coord_offsetC = DIM*jnrC;
2402 j_coord_offsetD = DIM*jnrD;
2403 j_coord_offsetE = DIM*jnrE;
2404 j_coord_offsetF = DIM*jnrF;
2405 j_coord_offsetG = DIM*jnrG;
2406 j_coord_offsetH = DIM*jnrH;
2408 /* load j atom coordinates */
2409 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2410 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2411 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
2412 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
2413 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2415 /* Calculate displacement vector */
2416 dx11 = _mm256_sub_ps(ix1,jx1);
2417 dy11 = _mm256_sub_ps(iy1,jy1);
2418 dz11 = _mm256_sub_ps(iz1,jz1);
2419 dx12 = _mm256_sub_ps(ix1,jx2);
2420 dy12 = _mm256_sub_ps(iy1,jy2);
2421 dz12 = _mm256_sub_ps(iz1,jz2);
2422 dx13 = _mm256_sub_ps(ix1,jx3);
2423 dy13 = _mm256_sub_ps(iy1,jy3);
2424 dz13 = _mm256_sub_ps(iz1,jz3);
2425 dx21 = _mm256_sub_ps(ix2,jx1);
2426 dy21 = _mm256_sub_ps(iy2,jy1);
2427 dz21 = _mm256_sub_ps(iz2,jz1);
2428 dx22 = _mm256_sub_ps(ix2,jx2);
2429 dy22 = _mm256_sub_ps(iy2,jy2);
2430 dz22 = _mm256_sub_ps(iz2,jz2);
2431 dx23 = _mm256_sub_ps(ix2,jx3);
2432 dy23 = _mm256_sub_ps(iy2,jy3);
2433 dz23 = _mm256_sub_ps(iz2,jz3);
2434 dx31 = _mm256_sub_ps(ix3,jx1);
2435 dy31 = _mm256_sub_ps(iy3,jy1);
2436 dz31 = _mm256_sub_ps(iz3,jz1);
2437 dx32 = _mm256_sub_ps(ix3,jx2);
2438 dy32 = _mm256_sub_ps(iy3,jy2);
2439 dz32 = _mm256_sub_ps(iz3,jz2);
2440 dx33 = _mm256_sub_ps(ix3,jx3);
2441 dy33 = _mm256_sub_ps(iy3,jy3);
2442 dz33 = _mm256_sub_ps(iz3,jz3);
2444 /* Calculate squared distance and things based on it */
2445 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2446 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2447 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2448 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2449 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2450 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2451 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2452 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2453 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2455 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2456 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2457 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2458 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2459 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2460 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2461 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2462 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2463 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2465 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2466 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2467 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2468 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2469 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2470 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2471 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2472 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2473 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2475 fjx1 = _mm256_setzero_ps();
2476 fjy1 = _mm256_setzero_ps();
2477 fjz1 = _mm256_setzero_ps();
2478 fjx2 = _mm256_setzero_ps();
2479 fjy2 = _mm256_setzero_ps();
2480 fjz2 = _mm256_setzero_ps();
2481 fjx3 = _mm256_setzero_ps();
2482 fjy3 = _mm256_setzero_ps();
2483 fjz3 = _mm256_setzero_ps();
2485 /**************************
2486 * CALCULATE INTERACTIONS *
2487 **************************/
2489 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2492 r11 = _mm256_mul_ps(rsq11,rinv11);
2493 r11 = _mm256_andnot_ps(dummy_mask,r11);
2495 /* EWALD ELECTROSTATICS */
2497 /* Analytical PME correction */
2498 zeta2 = _mm256_mul_ps(beta2,rsq11);
2499 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2500 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2501 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2502 felec = _mm256_mul_ps(qq11,felec);
2503 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2504 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2505 velec = _mm256_sub_ps(rinv11,pmecorrV);
2506 velec = _mm256_mul_ps(qq11,velec);
2508 d = _mm256_sub_ps(r11,rswitch);
2509 d = _mm256_max_ps(d,_mm256_setzero_ps());
2510 d2 = _mm256_mul_ps(d,d);
2511 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2513 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2515 /* Evaluate switch function */
2516 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2517 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2518 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2522 fscal = _mm256_and_ps(fscal,cutoff_mask);
2524 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2526 /* Calculate temporary vectorial force */
2527 tx = _mm256_mul_ps(fscal,dx11);
2528 ty = _mm256_mul_ps(fscal,dy11);
2529 tz = _mm256_mul_ps(fscal,dz11);
2531 /* Update vectorial force */
2532 fix1 = _mm256_add_ps(fix1,tx);
2533 fiy1 = _mm256_add_ps(fiy1,ty);
2534 fiz1 = _mm256_add_ps(fiz1,tz);
2536 fjx1 = _mm256_add_ps(fjx1,tx);
2537 fjy1 = _mm256_add_ps(fjy1,ty);
2538 fjz1 = _mm256_add_ps(fjz1,tz);
2542 /**************************
2543 * CALCULATE INTERACTIONS *
2544 **************************/
2546 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2549 r12 = _mm256_mul_ps(rsq12,rinv12);
2550 r12 = _mm256_andnot_ps(dummy_mask,r12);
2552 /* EWALD ELECTROSTATICS */
2554 /* Analytical PME correction */
2555 zeta2 = _mm256_mul_ps(beta2,rsq12);
2556 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2557 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2558 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2559 felec = _mm256_mul_ps(qq12,felec);
2560 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2561 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2562 velec = _mm256_sub_ps(rinv12,pmecorrV);
2563 velec = _mm256_mul_ps(qq12,velec);
2565 d = _mm256_sub_ps(r12,rswitch);
2566 d = _mm256_max_ps(d,_mm256_setzero_ps());
2567 d2 = _mm256_mul_ps(d,d);
2568 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2570 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2572 /* Evaluate switch function */
2573 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2574 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2575 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2579 fscal = _mm256_and_ps(fscal,cutoff_mask);
2581 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2583 /* Calculate temporary vectorial force */
2584 tx = _mm256_mul_ps(fscal,dx12);
2585 ty = _mm256_mul_ps(fscal,dy12);
2586 tz = _mm256_mul_ps(fscal,dz12);
2588 /* Update vectorial force */
2589 fix1 = _mm256_add_ps(fix1,tx);
2590 fiy1 = _mm256_add_ps(fiy1,ty);
2591 fiz1 = _mm256_add_ps(fiz1,tz);
2593 fjx2 = _mm256_add_ps(fjx2,tx);
2594 fjy2 = _mm256_add_ps(fjy2,ty);
2595 fjz2 = _mm256_add_ps(fjz2,tz);
2599 /**************************
2600 * CALCULATE INTERACTIONS *
2601 **************************/
2603 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2606 r13 = _mm256_mul_ps(rsq13,rinv13);
2607 r13 = _mm256_andnot_ps(dummy_mask,r13);
2609 /* EWALD ELECTROSTATICS */
2611 /* Analytical PME correction */
2612 zeta2 = _mm256_mul_ps(beta2,rsq13);
2613 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2614 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2615 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2616 felec = _mm256_mul_ps(qq13,felec);
2617 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2618 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2619 velec = _mm256_sub_ps(rinv13,pmecorrV);
2620 velec = _mm256_mul_ps(qq13,velec);
2622 d = _mm256_sub_ps(r13,rswitch);
2623 d = _mm256_max_ps(d,_mm256_setzero_ps());
2624 d2 = _mm256_mul_ps(d,d);
2625 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2627 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2629 /* Evaluate switch function */
2630 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2631 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2632 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2636 fscal = _mm256_and_ps(fscal,cutoff_mask);
2638 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2640 /* Calculate temporary vectorial force */
2641 tx = _mm256_mul_ps(fscal,dx13);
2642 ty = _mm256_mul_ps(fscal,dy13);
2643 tz = _mm256_mul_ps(fscal,dz13);
2645 /* Update vectorial force */
2646 fix1 = _mm256_add_ps(fix1,tx);
2647 fiy1 = _mm256_add_ps(fiy1,ty);
2648 fiz1 = _mm256_add_ps(fiz1,tz);
2650 fjx3 = _mm256_add_ps(fjx3,tx);
2651 fjy3 = _mm256_add_ps(fjy3,ty);
2652 fjz3 = _mm256_add_ps(fjz3,tz);
2656 /**************************
2657 * CALCULATE INTERACTIONS *
2658 **************************/
2660 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2663 r21 = _mm256_mul_ps(rsq21,rinv21);
2664 r21 = _mm256_andnot_ps(dummy_mask,r21);
2666 /* EWALD ELECTROSTATICS */
2668 /* Analytical PME correction */
2669 zeta2 = _mm256_mul_ps(beta2,rsq21);
2670 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2671 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2672 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2673 felec = _mm256_mul_ps(qq21,felec);
2674 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2675 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2676 velec = _mm256_sub_ps(rinv21,pmecorrV);
2677 velec = _mm256_mul_ps(qq21,velec);
2679 d = _mm256_sub_ps(r21,rswitch);
2680 d = _mm256_max_ps(d,_mm256_setzero_ps());
2681 d2 = _mm256_mul_ps(d,d);
2682 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2684 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2686 /* Evaluate switch function */
2687 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2688 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2689 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2693 fscal = _mm256_and_ps(fscal,cutoff_mask);
2695 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2697 /* Calculate temporary vectorial force */
2698 tx = _mm256_mul_ps(fscal,dx21);
2699 ty = _mm256_mul_ps(fscal,dy21);
2700 tz = _mm256_mul_ps(fscal,dz21);
2702 /* Update vectorial force */
2703 fix2 = _mm256_add_ps(fix2,tx);
2704 fiy2 = _mm256_add_ps(fiy2,ty);
2705 fiz2 = _mm256_add_ps(fiz2,tz);
2707 fjx1 = _mm256_add_ps(fjx1,tx);
2708 fjy1 = _mm256_add_ps(fjy1,ty);
2709 fjz1 = _mm256_add_ps(fjz1,tz);
2713 /**************************
2714 * CALCULATE INTERACTIONS *
2715 **************************/
2717 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2720 r22 = _mm256_mul_ps(rsq22,rinv22);
2721 r22 = _mm256_andnot_ps(dummy_mask,r22);
2723 /* EWALD ELECTROSTATICS */
2725 /* Analytical PME correction */
2726 zeta2 = _mm256_mul_ps(beta2,rsq22);
2727 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2728 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2729 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2730 felec = _mm256_mul_ps(qq22,felec);
2731 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2732 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2733 velec = _mm256_sub_ps(rinv22,pmecorrV);
2734 velec = _mm256_mul_ps(qq22,velec);
2736 d = _mm256_sub_ps(r22,rswitch);
2737 d = _mm256_max_ps(d,_mm256_setzero_ps());
2738 d2 = _mm256_mul_ps(d,d);
2739 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2741 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2743 /* Evaluate switch function */
2744 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2745 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2746 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2750 fscal = _mm256_and_ps(fscal,cutoff_mask);
2752 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2754 /* Calculate temporary vectorial force */
2755 tx = _mm256_mul_ps(fscal,dx22);
2756 ty = _mm256_mul_ps(fscal,dy22);
2757 tz = _mm256_mul_ps(fscal,dz22);
2759 /* Update vectorial force */
2760 fix2 = _mm256_add_ps(fix2,tx);
2761 fiy2 = _mm256_add_ps(fiy2,ty);
2762 fiz2 = _mm256_add_ps(fiz2,tz);
2764 fjx2 = _mm256_add_ps(fjx2,tx);
2765 fjy2 = _mm256_add_ps(fjy2,ty);
2766 fjz2 = _mm256_add_ps(fjz2,tz);
2770 /**************************
2771 * CALCULATE INTERACTIONS *
2772 **************************/
2774 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2777 r23 = _mm256_mul_ps(rsq23,rinv23);
2778 r23 = _mm256_andnot_ps(dummy_mask,r23);
2780 /* EWALD ELECTROSTATICS */
2782 /* Analytical PME correction */
2783 zeta2 = _mm256_mul_ps(beta2,rsq23);
2784 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2785 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2786 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2787 felec = _mm256_mul_ps(qq23,felec);
2788 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2789 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2790 velec = _mm256_sub_ps(rinv23,pmecorrV);
2791 velec = _mm256_mul_ps(qq23,velec);
2793 d = _mm256_sub_ps(r23,rswitch);
2794 d = _mm256_max_ps(d,_mm256_setzero_ps());
2795 d2 = _mm256_mul_ps(d,d);
2796 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2798 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2800 /* Evaluate switch function */
2801 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2802 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2803 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2807 fscal = _mm256_and_ps(fscal,cutoff_mask);
2809 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2811 /* Calculate temporary vectorial force */
2812 tx = _mm256_mul_ps(fscal,dx23);
2813 ty = _mm256_mul_ps(fscal,dy23);
2814 tz = _mm256_mul_ps(fscal,dz23);
2816 /* Update vectorial force */
2817 fix2 = _mm256_add_ps(fix2,tx);
2818 fiy2 = _mm256_add_ps(fiy2,ty);
2819 fiz2 = _mm256_add_ps(fiz2,tz);
2821 fjx3 = _mm256_add_ps(fjx3,tx);
2822 fjy3 = _mm256_add_ps(fjy3,ty);
2823 fjz3 = _mm256_add_ps(fjz3,tz);
2827 /**************************
2828 * CALCULATE INTERACTIONS *
2829 **************************/
2831 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2834 r31 = _mm256_mul_ps(rsq31,rinv31);
2835 r31 = _mm256_andnot_ps(dummy_mask,r31);
2837 /* EWALD ELECTROSTATICS */
2839 /* Analytical PME correction */
2840 zeta2 = _mm256_mul_ps(beta2,rsq31);
2841 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2842 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2843 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2844 felec = _mm256_mul_ps(qq31,felec);
2845 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2846 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2847 velec = _mm256_sub_ps(rinv31,pmecorrV);
2848 velec = _mm256_mul_ps(qq31,velec);
2850 d = _mm256_sub_ps(r31,rswitch);
2851 d = _mm256_max_ps(d,_mm256_setzero_ps());
2852 d2 = _mm256_mul_ps(d,d);
2853 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2855 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2857 /* Evaluate switch function */
2858 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2859 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2860 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2864 fscal = _mm256_and_ps(fscal,cutoff_mask);
2866 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2868 /* Calculate temporary vectorial force */
2869 tx = _mm256_mul_ps(fscal,dx31);
2870 ty = _mm256_mul_ps(fscal,dy31);
2871 tz = _mm256_mul_ps(fscal,dz31);
2873 /* Update vectorial force */
2874 fix3 = _mm256_add_ps(fix3,tx);
2875 fiy3 = _mm256_add_ps(fiy3,ty);
2876 fiz3 = _mm256_add_ps(fiz3,tz);
2878 fjx1 = _mm256_add_ps(fjx1,tx);
2879 fjy1 = _mm256_add_ps(fjy1,ty);
2880 fjz1 = _mm256_add_ps(fjz1,tz);
2884 /**************************
2885 * CALCULATE INTERACTIONS *
2886 **************************/
2888 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2891 r32 = _mm256_mul_ps(rsq32,rinv32);
2892 r32 = _mm256_andnot_ps(dummy_mask,r32);
2894 /* EWALD ELECTROSTATICS */
2896 /* Analytical PME correction */
2897 zeta2 = _mm256_mul_ps(beta2,rsq32);
2898 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2899 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2900 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2901 felec = _mm256_mul_ps(qq32,felec);
2902 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2903 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2904 velec = _mm256_sub_ps(rinv32,pmecorrV);
2905 velec = _mm256_mul_ps(qq32,velec);
2907 d = _mm256_sub_ps(r32,rswitch);
2908 d = _mm256_max_ps(d,_mm256_setzero_ps());
2909 d2 = _mm256_mul_ps(d,d);
2910 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2912 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2914 /* Evaluate switch function */
2915 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2916 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2917 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2921 fscal = _mm256_and_ps(fscal,cutoff_mask);
2923 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2925 /* Calculate temporary vectorial force */
2926 tx = _mm256_mul_ps(fscal,dx32);
2927 ty = _mm256_mul_ps(fscal,dy32);
2928 tz = _mm256_mul_ps(fscal,dz32);
2930 /* Update vectorial force */
2931 fix3 = _mm256_add_ps(fix3,tx);
2932 fiy3 = _mm256_add_ps(fiy3,ty);
2933 fiz3 = _mm256_add_ps(fiz3,tz);
2935 fjx2 = _mm256_add_ps(fjx2,tx);
2936 fjy2 = _mm256_add_ps(fjy2,ty);
2937 fjz2 = _mm256_add_ps(fjz2,tz);
2941 /**************************
2942 * CALCULATE INTERACTIONS *
2943 **************************/
2945 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2948 r33 = _mm256_mul_ps(rsq33,rinv33);
2949 r33 = _mm256_andnot_ps(dummy_mask,r33);
2951 /* EWALD ELECTROSTATICS */
2953 /* Analytical PME correction */
2954 zeta2 = _mm256_mul_ps(beta2,rsq33);
2955 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2956 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2957 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2958 felec = _mm256_mul_ps(qq33,felec);
2959 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
2960 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
2961 velec = _mm256_sub_ps(rinv33,pmecorrV);
2962 velec = _mm256_mul_ps(qq33,velec);
2964 d = _mm256_sub_ps(r33,rswitch);
2965 d = _mm256_max_ps(d,_mm256_setzero_ps());
2966 d2 = _mm256_mul_ps(d,d);
2967 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2969 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2971 /* Evaluate switch function */
2972 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2973 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2974 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2978 fscal = _mm256_and_ps(fscal,cutoff_mask);
2980 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2982 /* Calculate temporary vectorial force */
2983 tx = _mm256_mul_ps(fscal,dx33);
2984 ty = _mm256_mul_ps(fscal,dy33);
2985 tz = _mm256_mul_ps(fscal,dz33);
2987 /* Update vectorial force */
2988 fix3 = _mm256_add_ps(fix3,tx);
2989 fiy3 = _mm256_add_ps(fiy3,ty);
2990 fiz3 = _mm256_add_ps(fiz3,tz);
2992 fjx3 = _mm256_add_ps(fjx3,tx);
2993 fjy3 = _mm256_add_ps(fjy3,ty);
2994 fjz3 = _mm256_add_ps(fjz3,tz);
2998 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2999 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3000 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3001 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3002 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3003 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3004 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3005 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3007 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3008 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
3009 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3011 /* Inner loop uses 954 flops */
3014 /* End of innermost loop */
3016 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3017 f+i_coord_offset+DIM,fshift+i_shift_offset);
3019 /* Increment number of inner iterations */
3020 inneriter += j_index_end - j_index_start;
3022 /* Outer loop uses 18 flops */
3025 /* Increment number of outer iterations */
3028 /* Update outer/inner flops */
3030 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*954);