2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr0;
89 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90 real * vdwioffsetptr1;
91 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92 real * vdwioffsetptr2;
93 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94 real * vdwioffsetptr3;
95 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
96 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
97 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
98 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
99 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
100 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
101 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
102 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
103 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
104 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
105 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
106 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
107 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
108 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
109 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
110 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
111 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
112 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
113 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
114 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
117 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
120 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
121 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
123 __m128i ewitab_lo,ewitab_hi;
124 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
125 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
127 __m256 dummy_mask,cutoff_mask;
128 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
129 __m256 one = _mm256_set1_ps(1.0);
130 __m256 two = _mm256_set1_ps(2.0);
136 jindex = nlist->jindex;
138 shiftidx = nlist->shift;
140 shiftvec = fr->shift_vec[0];
141 fshift = fr->fshift[0];
142 facel = _mm256_set1_ps(fr->epsfac);
143 charge = mdatoms->chargeA;
144 nvdwtype = fr->ntype;
146 vdwtype = mdatoms->typeA;
148 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
149 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
150 beta2 = _mm256_mul_ps(beta,beta);
151 beta3 = _mm256_mul_ps(beta,beta2);
153 ewtab = fr->ic->tabq_coul_FDV0;
154 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
155 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
157 /* Setup water-specific parameters */
158 inr = nlist->iinr[0];
159 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
160 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
161 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
162 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
164 jq1 = _mm256_set1_ps(charge[inr+1]);
165 jq2 = _mm256_set1_ps(charge[inr+2]);
166 jq3 = _mm256_set1_ps(charge[inr+3]);
167 vdwjidx0A = 2*vdwtype[inr+0];
168 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
169 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
170 qq11 = _mm256_mul_ps(iq1,jq1);
171 qq12 = _mm256_mul_ps(iq1,jq2);
172 qq13 = _mm256_mul_ps(iq1,jq3);
173 qq21 = _mm256_mul_ps(iq2,jq1);
174 qq22 = _mm256_mul_ps(iq2,jq2);
175 qq23 = _mm256_mul_ps(iq2,jq3);
176 qq31 = _mm256_mul_ps(iq3,jq1);
177 qq32 = _mm256_mul_ps(iq3,jq2);
178 qq33 = _mm256_mul_ps(iq3,jq3);
180 /* Avoid stupid compiler warnings */
181 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
194 for(iidx=0;iidx<4*DIM;iidx++)
199 /* Start outer loop over neighborlists */
200 for(iidx=0; iidx<nri; iidx++)
202 /* Load shift vector for this list */
203 i_shift_offset = DIM*shiftidx[iidx];
205 /* Load limits for loop over neighbors */
206 j_index_start = jindex[iidx];
207 j_index_end = jindex[iidx+1];
209 /* Get outer coordinate index */
211 i_coord_offset = DIM*inr;
213 /* Load i particle coords and add shift vector */
214 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
215 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
217 fix0 = _mm256_setzero_ps();
218 fiy0 = _mm256_setzero_ps();
219 fiz0 = _mm256_setzero_ps();
220 fix1 = _mm256_setzero_ps();
221 fiy1 = _mm256_setzero_ps();
222 fiz1 = _mm256_setzero_ps();
223 fix2 = _mm256_setzero_ps();
224 fiy2 = _mm256_setzero_ps();
225 fiz2 = _mm256_setzero_ps();
226 fix3 = _mm256_setzero_ps();
227 fiy3 = _mm256_setzero_ps();
228 fiz3 = _mm256_setzero_ps();
230 /* Reset potential sums */
231 velecsum = _mm256_setzero_ps();
232 vvdwsum = _mm256_setzero_ps();
234 /* Start inner kernel loop */
235 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
238 /* Get j neighbor index, and coordinate index */
247 j_coord_offsetA = DIM*jnrA;
248 j_coord_offsetB = DIM*jnrB;
249 j_coord_offsetC = DIM*jnrC;
250 j_coord_offsetD = DIM*jnrD;
251 j_coord_offsetE = DIM*jnrE;
252 j_coord_offsetF = DIM*jnrF;
253 j_coord_offsetG = DIM*jnrG;
254 j_coord_offsetH = DIM*jnrH;
256 /* load j atom coordinates */
257 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
258 x+j_coord_offsetC,x+j_coord_offsetD,
259 x+j_coord_offsetE,x+j_coord_offsetF,
260 x+j_coord_offsetG,x+j_coord_offsetH,
261 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
262 &jy2,&jz2,&jx3,&jy3,&jz3);
264 /* Calculate displacement vector */
265 dx00 = _mm256_sub_ps(ix0,jx0);
266 dy00 = _mm256_sub_ps(iy0,jy0);
267 dz00 = _mm256_sub_ps(iz0,jz0);
268 dx11 = _mm256_sub_ps(ix1,jx1);
269 dy11 = _mm256_sub_ps(iy1,jy1);
270 dz11 = _mm256_sub_ps(iz1,jz1);
271 dx12 = _mm256_sub_ps(ix1,jx2);
272 dy12 = _mm256_sub_ps(iy1,jy2);
273 dz12 = _mm256_sub_ps(iz1,jz2);
274 dx13 = _mm256_sub_ps(ix1,jx3);
275 dy13 = _mm256_sub_ps(iy1,jy3);
276 dz13 = _mm256_sub_ps(iz1,jz3);
277 dx21 = _mm256_sub_ps(ix2,jx1);
278 dy21 = _mm256_sub_ps(iy2,jy1);
279 dz21 = _mm256_sub_ps(iz2,jz1);
280 dx22 = _mm256_sub_ps(ix2,jx2);
281 dy22 = _mm256_sub_ps(iy2,jy2);
282 dz22 = _mm256_sub_ps(iz2,jz2);
283 dx23 = _mm256_sub_ps(ix2,jx3);
284 dy23 = _mm256_sub_ps(iy2,jy3);
285 dz23 = _mm256_sub_ps(iz2,jz3);
286 dx31 = _mm256_sub_ps(ix3,jx1);
287 dy31 = _mm256_sub_ps(iy3,jy1);
288 dz31 = _mm256_sub_ps(iz3,jz1);
289 dx32 = _mm256_sub_ps(ix3,jx2);
290 dy32 = _mm256_sub_ps(iy3,jy2);
291 dz32 = _mm256_sub_ps(iz3,jz2);
292 dx33 = _mm256_sub_ps(ix3,jx3);
293 dy33 = _mm256_sub_ps(iy3,jy3);
294 dz33 = _mm256_sub_ps(iz3,jz3);
296 /* Calculate squared distance and things based on it */
297 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
298 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
299 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
300 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
301 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
302 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
303 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
304 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
305 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
306 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
308 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
309 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
310 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
311 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
312 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
313 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
314 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
315 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
316 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
318 rinvsq00 = gmx_mm256_inv_ps(rsq00);
319 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
320 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
321 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
322 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
323 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
324 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
325 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
326 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
327 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
329 fjx0 = _mm256_setzero_ps();
330 fjy0 = _mm256_setzero_ps();
331 fjz0 = _mm256_setzero_ps();
332 fjx1 = _mm256_setzero_ps();
333 fjy1 = _mm256_setzero_ps();
334 fjz1 = _mm256_setzero_ps();
335 fjx2 = _mm256_setzero_ps();
336 fjy2 = _mm256_setzero_ps();
337 fjz2 = _mm256_setzero_ps();
338 fjx3 = _mm256_setzero_ps();
339 fjy3 = _mm256_setzero_ps();
340 fjz3 = _mm256_setzero_ps();
342 /**************************
343 * CALCULATE INTERACTIONS *
344 **************************/
346 /* LENNARD-JONES DISPERSION/REPULSION */
348 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
349 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
350 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
351 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
352 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
354 /* Update potential sum for this i atom from the interaction with this j atom. */
355 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
359 /* Calculate temporary vectorial force */
360 tx = _mm256_mul_ps(fscal,dx00);
361 ty = _mm256_mul_ps(fscal,dy00);
362 tz = _mm256_mul_ps(fscal,dz00);
364 /* Update vectorial force */
365 fix0 = _mm256_add_ps(fix0,tx);
366 fiy0 = _mm256_add_ps(fiy0,ty);
367 fiz0 = _mm256_add_ps(fiz0,tz);
369 fjx0 = _mm256_add_ps(fjx0,tx);
370 fjy0 = _mm256_add_ps(fjy0,ty);
371 fjz0 = _mm256_add_ps(fjz0,tz);
373 /**************************
374 * CALCULATE INTERACTIONS *
375 **************************/
377 r11 = _mm256_mul_ps(rsq11,rinv11);
379 /* EWALD ELECTROSTATICS */
381 /* Analytical PME correction */
382 zeta2 = _mm256_mul_ps(beta2,rsq11);
383 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
384 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
385 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
386 felec = _mm256_mul_ps(qq11,felec);
387 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
388 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
389 velec = _mm256_sub_ps(rinv11,pmecorrV);
390 velec = _mm256_mul_ps(qq11,velec);
392 /* Update potential sum for this i atom from the interaction with this j atom. */
393 velecsum = _mm256_add_ps(velecsum,velec);
397 /* Calculate temporary vectorial force */
398 tx = _mm256_mul_ps(fscal,dx11);
399 ty = _mm256_mul_ps(fscal,dy11);
400 tz = _mm256_mul_ps(fscal,dz11);
402 /* Update vectorial force */
403 fix1 = _mm256_add_ps(fix1,tx);
404 fiy1 = _mm256_add_ps(fiy1,ty);
405 fiz1 = _mm256_add_ps(fiz1,tz);
407 fjx1 = _mm256_add_ps(fjx1,tx);
408 fjy1 = _mm256_add_ps(fjy1,ty);
409 fjz1 = _mm256_add_ps(fjz1,tz);
411 /**************************
412 * CALCULATE INTERACTIONS *
413 **************************/
415 r12 = _mm256_mul_ps(rsq12,rinv12);
417 /* EWALD ELECTROSTATICS */
419 /* Analytical PME correction */
420 zeta2 = _mm256_mul_ps(beta2,rsq12);
421 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
422 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
423 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
424 felec = _mm256_mul_ps(qq12,felec);
425 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
426 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
427 velec = _mm256_sub_ps(rinv12,pmecorrV);
428 velec = _mm256_mul_ps(qq12,velec);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum = _mm256_add_ps(velecsum,velec);
435 /* Calculate temporary vectorial force */
436 tx = _mm256_mul_ps(fscal,dx12);
437 ty = _mm256_mul_ps(fscal,dy12);
438 tz = _mm256_mul_ps(fscal,dz12);
440 /* Update vectorial force */
441 fix1 = _mm256_add_ps(fix1,tx);
442 fiy1 = _mm256_add_ps(fiy1,ty);
443 fiz1 = _mm256_add_ps(fiz1,tz);
445 fjx2 = _mm256_add_ps(fjx2,tx);
446 fjy2 = _mm256_add_ps(fjy2,ty);
447 fjz2 = _mm256_add_ps(fjz2,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 r13 = _mm256_mul_ps(rsq13,rinv13);
455 /* EWALD ELECTROSTATICS */
457 /* Analytical PME correction */
458 zeta2 = _mm256_mul_ps(beta2,rsq13);
459 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
460 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
461 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
462 felec = _mm256_mul_ps(qq13,felec);
463 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
464 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
465 velec = _mm256_sub_ps(rinv13,pmecorrV);
466 velec = _mm256_mul_ps(qq13,velec);
468 /* Update potential sum for this i atom from the interaction with this j atom. */
469 velecsum = _mm256_add_ps(velecsum,velec);
473 /* Calculate temporary vectorial force */
474 tx = _mm256_mul_ps(fscal,dx13);
475 ty = _mm256_mul_ps(fscal,dy13);
476 tz = _mm256_mul_ps(fscal,dz13);
478 /* Update vectorial force */
479 fix1 = _mm256_add_ps(fix1,tx);
480 fiy1 = _mm256_add_ps(fiy1,ty);
481 fiz1 = _mm256_add_ps(fiz1,tz);
483 fjx3 = _mm256_add_ps(fjx3,tx);
484 fjy3 = _mm256_add_ps(fjy3,ty);
485 fjz3 = _mm256_add_ps(fjz3,tz);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 r21 = _mm256_mul_ps(rsq21,rinv21);
493 /* EWALD ELECTROSTATICS */
495 /* Analytical PME correction */
496 zeta2 = _mm256_mul_ps(beta2,rsq21);
497 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
498 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
499 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
500 felec = _mm256_mul_ps(qq21,felec);
501 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
502 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
503 velec = _mm256_sub_ps(rinv21,pmecorrV);
504 velec = _mm256_mul_ps(qq21,velec);
506 /* Update potential sum for this i atom from the interaction with this j atom. */
507 velecsum = _mm256_add_ps(velecsum,velec);
511 /* Calculate temporary vectorial force */
512 tx = _mm256_mul_ps(fscal,dx21);
513 ty = _mm256_mul_ps(fscal,dy21);
514 tz = _mm256_mul_ps(fscal,dz21);
516 /* Update vectorial force */
517 fix2 = _mm256_add_ps(fix2,tx);
518 fiy2 = _mm256_add_ps(fiy2,ty);
519 fiz2 = _mm256_add_ps(fiz2,tz);
521 fjx1 = _mm256_add_ps(fjx1,tx);
522 fjy1 = _mm256_add_ps(fjy1,ty);
523 fjz1 = _mm256_add_ps(fjz1,tz);
525 /**************************
526 * CALCULATE INTERACTIONS *
527 **************************/
529 r22 = _mm256_mul_ps(rsq22,rinv22);
531 /* EWALD ELECTROSTATICS */
533 /* Analytical PME correction */
534 zeta2 = _mm256_mul_ps(beta2,rsq22);
535 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
536 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
537 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
538 felec = _mm256_mul_ps(qq22,felec);
539 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
540 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
541 velec = _mm256_sub_ps(rinv22,pmecorrV);
542 velec = _mm256_mul_ps(qq22,velec);
544 /* Update potential sum for this i atom from the interaction with this j atom. */
545 velecsum = _mm256_add_ps(velecsum,velec);
549 /* Calculate temporary vectorial force */
550 tx = _mm256_mul_ps(fscal,dx22);
551 ty = _mm256_mul_ps(fscal,dy22);
552 tz = _mm256_mul_ps(fscal,dz22);
554 /* Update vectorial force */
555 fix2 = _mm256_add_ps(fix2,tx);
556 fiy2 = _mm256_add_ps(fiy2,ty);
557 fiz2 = _mm256_add_ps(fiz2,tz);
559 fjx2 = _mm256_add_ps(fjx2,tx);
560 fjy2 = _mm256_add_ps(fjy2,ty);
561 fjz2 = _mm256_add_ps(fjz2,tz);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 r23 = _mm256_mul_ps(rsq23,rinv23);
569 /* EWALD ELECTROSTATICS */
571 /* Analytical PME correction */
572 zeta2 = _mm256_mul_ps(beta2,rsq23);
573 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
574 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
575 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
576 felec = _mm256_mul_ps(qq23,felec);
577 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
578 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
579 velec = _mm256_sub_ps(rinv23,pmecorrV);
580 velec = _mm256_mul_ps(qq23,velec);
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 velecsum = _mm256_add_ps(velecsum,velec);
587 /* Calculate temporary vectorial force */
588 tx = _mm256_mul_ps(fscal,dx23);
589 ty = _mm256_mul_ps(fscal,dy23);
590 tz = _mm256_mul_ps(fscal,dz23);
592 /* Update vectorial force */
593 fix2 = _mm256_add_ps(fix2,tx);
594 fiy2 = _mm256_add_ps(fiy2,ty);
595 fiz2 = _mm256_add_ps(fiz2,tz);
597 fjx3 = _mm256_add_ps(fjx3,tx);
598 fjy3 = _mm256_add_ps(fjy3,ty);
599 fjz3 = _mm256_add_ps(fjz3,tz);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r31 = _mm256_mul_ps(rsq31,rinv31);
607 /* EWALD ELECTROSTATICS */
609 /* Analytical PME correction */
610 zeta2 = _mm256_mul_ps(beta2,rsq31);
611 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
612 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
613 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
614 felec = _mm256_mul_ps(qq31,felec);
615 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
616 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
617 velec = _mm256_sub_ps(rinv31,pmecorrV);
618 velec = _mm256_mul_ps(qq31,velec);
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velecsum = _mm256_add_ps(velecsum,velec);
625 /* Calculate temporary vectorial force */
626 tx = _mm256_mul_ps(fscal,dx31);
627 ty = _mm256_mul_ps(fscal,dy31);
628 tz = _mm256_mul_ps(fscal,dz31);
630 /* Update vectorial force */
631 fix3 = _mm256_add_ps(fix3,tx);
632 fiy3 = _mm256_add_ps(fiy3,ty);
633 fiz3 = _mm256_add_ps(fiz3,tz);
635 fjx1 = _mm256_add_ps(fjx1,tx);
636 fjy1 = _mm256_add_ps(fjy1,ty);
637 fjz1 = _mm256_add_ps(fjz1,tz);
639 /**************************
640 * CALCULATE INTERACTIONS *
641 **************************/
643 r32 = _mm256_mul_ps(rsq32,rinv32);
645 /* EWALD ELECTROSTATICS */
647 /* Analytical PME correction */
648 zeta2 = _mm256_mul_ps(beta2,rsq32);
649 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
650 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
651 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
652 felec = _mm256_mul_ps(qq32,felec);
653 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
654 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
655 velec = _mm256_sub_ps(rinv32,pmecorrV);
656 velec = _mm256_mul_ps(qq32,velec);
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velecsum = _mm256_add_ps(velecsum,velec);
663 /* Calculate temporary vectorial force */
664 tx = _mm256_mul_ps(fscal,dx32);
665 ty = _mm256_mul_ps(fscal,dy32);
666 tz = _mm256_mul_ps(fscal,dz32);
668 /* Update vectorial force */
669 fix3 = _mm256_add_ps(fix3,tx);
670 fiy3 = _mm256_add_ps(fiy3,ty);
671 fiz3 = _mm256_add_ps(fiz3,tz);
673 fjx2 = _mm256_add_ps(fjx2,tx);
674 fjy2 = _mm256_add_ps(fjy2,ty);
675 fjz2 = _mm256_add_ps(fjz2,tz);
677 /**************************
678 * CALCULATE INTERACTIONS *
679 **************************/
681 r33 = _mm256_mul_ps(rsq33,rinv33);
683 /* EWALD ELECTROSTATICS */
685 /* Analytical PME correction */
686 zeta2 = _mm256_mul_ps(beta2,rsq33);
687 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
688 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
689 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
690 felec = _mm256_mul_ps(qq33,felec);
691 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
692 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
693 velec = _mm256_sub_ps(rinv33,pmecorrV);
694 velec = _mm256_mul_ps(qq33,velec);
696 /* Update potential sum for this i atom from the interaction with this j atom. */
697 velecsum = _mm256_add_ps(velecsum,velec);
701 /* Calculate temporary vectorial force */
702 tx = _mm256_mul_ps(fscal,dx33);
703 ty = _mm256_mul_ps(fscal,dy33);
704 tz = _mm256_mul_ps(fscal,dz33);
706 /* Update vectorial force */
707 fix3 = _mm256_add_ps(fix3,tx);
708 fiy3 = _mm256_add_ps(fiy3,ty);
709 fiz3 = _mm256_add_ps(fiz3,tz);
711 fjx3 = _mm256_add_ps(fjx3,tx);
712 fjy3 = _mm256_add_ps(fjy3,ty);
713 fjz3 = _mm256_add_ps(fjz3,tz);
715 fjptrA = f+j_coord_offsetA;
716 fjptrB = f+j_coord_offsetB;
717 fjptrC = f+j_coord_offsetC;
718 fjptrD = f+j_coord_offsetD;
719 fjptrE = f+j_coord_offsetE;
720 fjptrF = f+j_coord_offsetF;
721 fjptrG = f+j_coord_offsetG;
722 fjptrH = f+j_coord_offsetH;
724 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
725 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
726 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
728 /* Inner loop uses 791 flops */
734 /* Get j neighbor index, and coordinate index */
735 jnrlistA = jjnr[jidx];
736 jnrlistB = jjnr[jidx+1];
737 jnrlistC = jjnr[jidx+2];
738 jnrlistD = jjnr[jidx+3];
739 jnrlistE = jjnr[jidx+4];
740 jnrlistF = jjnr[jidx+5];
741 jnrlistG = jjnr[jidx+6];
742 jnrlistH = jjnr[jidx+7];
743 /* Sign of each element will be negative for non-real atoms.
744 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
745 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
747 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
748 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
750 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
751 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
752 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
753 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
754 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
755 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
756 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
757 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
758 j_coord_offsetA = DIM*jnrA;
759 j_coord_offsetB = DIM*jnrB;
760 j_coord_offsetC = DIM*jnrC;
761 j_coord_offsetD = DIM*jnrD;
762 j_coord_offsetE = DIM*jnrE;
763 j_coord_offsetF = DIM*jnrF;
764 j_coord_offsetG = DIM*jnrG;
765 j_coord_offsetH = DIM*jnrH;
767 /* load j atom coordinates */
768 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
769 x+j_coord_offsetC,x+j_coord_offsetD,
770 x+j_coord_offsetE,x+j_coord_offsetF,
771 x+j_coord_offsetG,x+j_coord_offsetH,
772 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
773 &jy2,&jz2,&jx3,&jy3,&jz3);
775 /* Calculate displacement vector */
776 dx00 = _mm256_sub_ps(ix0,jx0);
777 dy00 = _mm256_sub_ps(iy0,jy0);
778 dz00 = _mm256_sub_ps(iz0,jz0);
779 dx11 = _mm256_sub_ps(ix1,jx1);
780 dy11 = _mm256_sub_ps(iy1,jy1);
781 dz11 = _mm256_sub_ps(iz1,jz1);
782 dx12 = _mm256_sub_ps(ix1,jx2);
783 dy12 = _mm256_sub_ps(iy1,jy2);
784 dz12 = _mm256_sub_ps(iz1,jz2);
785 dx13 = _mm256_sub_ps(ix1,jx3);
786 dy13 = _mm256_sub_ps(iy1,jy3);
787 dz13 = _mm256_sub_ps(iz1,jz3);
788 dx21 = _mm256_sub_ps(ix2,jx1);
789 dy21 = _mm256_sub_ps(iy2,jy1);
790 dz21 = _mm256_sub_ps(iz2,jz1);
791 dx22 = _mm256_sub_ps(ix2,jx2);
792 dy22 = _mm256_sub_ps(iy2,jy2);
793 dz22 = _mm256_sub_ps(iz2,jz2);
794 dx23 = _mm256_sub_ps(ix2,jx3);
795 dy23 = _mm256_sub_ps(iy2,jy3);
796 dz23 = _mm256_sub_ps(iz2,jz3);
797 dx31 = _mm256_sub_ps(ix3,jx1);
798 dy31 = _mm256_sub_ps(iy3,jy1);
799 dz31 = _mm256_sub_ps(iz3,jz1);
800 dx32 = _mm256_sub_ps(ix3,jx2);
801 dy32 = _mm256_sub_ps(iy3,jy2);
802 dz32 = _mm256_sub_ps(iz3,jz2);
803 dx33 = _mm256_sub_ps(ix3,jx3);
804 dy33 = _mm256_sub_ps(iy3,jy3);
805 dz33 = _mm256_sub_ps(iz3,jz3);
807 /* Calculate squared distance and things based on it */
808 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
809 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
810 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
811 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
812 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
813 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
814 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
815 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
816 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
817 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
819 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
820 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
821 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
822 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
823 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
824 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
825 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
826 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
827 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
829 rinvsq00 = gmx_mm256_inv_ps(rsq00);
830 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
831 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
832 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
833 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
834 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
835 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
836 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
837 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
838 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
840 fjx0 = _mm256_setzero_ps();
841 fjy0 = _mm256_setzero_ps();
842 fjz0 = _mm256_setzero_ps();
843 fjx1 = _mm256_setzero_ps();
844 fjy1 = _mm256_setzero_ps();
845 fjz1 = _mm256_setzero_ps();
846 fjx2 = _mm256_setzero_ps();
847 fjy2 = _mm256_setzero_ps();
848 fjz2 = _mm256_setzero_ps();
849 fjx3 = _mm256_setzero_ps();
850 fjy3 = _mm256_setzero_ps();
851 fjz3 = _mm256_setzero_ps();
853 /**************************
854 * CALCULATE INTERACTIONS *
855 **************************/
857 /* LENNARD-JONES DISPERSION/REPULSION */
859 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
860 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
861 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
862 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
863 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
865 /* Update potential sum for this i atom from the interaction with this j atom. */
866 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
867 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
871 fscal = _mm256_andnot_ps(dummy_mask,fscal);
873 /* Calculate temporary vectorial force */
874 tx = _mm256_mul_ps(fscal,dx00);
875 ty = _mm256_mul_ps(fscal,dy00);
876 tz = _mm256_mul_ps(fscal,dz00);
878 /* Update vectorial force */
879 fix0 = _mm256_add_ps(fix0,tx);
880 fiy0 = _mm256_add_ps(fiy0,ty);
881 fiz0 = _mm256_add_ps(fiz0,tz);
883 fjx0 = _mm256_add_ps(fjx0,tx);
884 fjy0 = _mm256_add_ps(fjy0,ty);
885 fjz0 = _mm256_add_ps(fjz0,tz);
887 /**************************
888 * CALCULATE INTERACTIONS *
889 **************************/
891 r11 = _mm256_mul_ps(rsq11,rinv11);
892 r11 = _mm256_andnot_ps(dummy_mask,r11);
894 /* EWALD ELECTROSTATICS */
896 /* Analytical PME correction */
897 zeta2 = _mm256_mul_ps(beta2,rsq11);
898 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
899 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
900 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
901 felec = _mm256_mul_ps(qq11,felec);
902 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
903 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
904 velec = _mm256_sub_ps(rinv11,pmecorrV);
905 velec = _mm256_mul_ps(qq11,velec);
907 /* Update potential sum for this i atom from the interaction with this j atom. */
908 velec = _mm256_andnot_ps(dummy_mask,velec);
909 velecsum = _mm256_add_ps(velecsum,velec);
913 fscal = _mm256_andnot_ps(dummy_mask,fscal);
915 /* Calculate temporary vectorial force */
916 tx = _mm256_mul_ps(fscal,dx11);
917 ty = _mm256_mul_ps(fscal,dy11);
918 tz = _mm256_mul_ps(fscal,dz11);
920 /* Update vectorial force */
921 fix1 = _mm256_add_ps(fix1,tx);
922 fiy1 = _mm256_add_ps(fiy1,ty);
923 fiz1 = _mm256_add_ps(fiz1,tz);
925 fjx1 = _mm256_add_ps(fjx1,tx);
926 fjy1 = _mm256_add_ps(fjy1,ty);
927 fjz1 = _mm256_add_ps(fjz1,tz);
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
933 r12 = _mm256_mul_ps(rsq12,rinv12);
934 r12 = _mm256_andnot_ps(dummy_mask,r12);
936 /* EWALD ELECTROSTATICS */
938 /* Analytical PME correction */
939 zeta2 = _mm256_mul_ps(beta2,rsq12);
940 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
941 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
942 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
943 felec = _mm256_mul_ps(qq12,felec);
944 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
945 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
946 velec = _mm256_sub_ps(rinv12,pmecorrV);
947 velec = _mm256_mul_ps(qq12,velec);
949 /* Update potential sum for this i atom from the interaction with this j atom. */
950 velec = _mm256_andnot_ps(dummy_mask,velec);
951 velecsum = _mm256_add_ps(velecsum,velec);
955 fscal = _mm256_andnot_ps(dummy_mask,fscal);
957 /* Calculate temporary vectorial force */
958 tx = _mm256_mul_ps(fscal,dx12);
959 ty = _mm256_mul_ps(fscal,dy12);
960 tz = _mm256_mul_ps(fscal,dz12);
962 /* Update vectorial force */
963 fix1 = _mm256_add_ps(fix1,tx);
964 fiy1 = _mm256_add_ps(fiy1,ty);
965 fiz1 = _mm256_add_ps(fiz1,tz);
967 fjx2 = _mm256_add_ps(fjx2,tx);
968 fjy2 = _mm256_add_ps(fjy2,ty);
969 fjz2 = _mm256_add_ps(fjz2,tz);
971 /**************************
972 * CALCULATE INTERACTIONS *
973 **************************/
975 r13 = _mm256_mul_ps(rsq13,rinv13);
976 r13 = _mm256_andnot_ps(dummy_mask,r13);
978 /* EWALD ELECTROSTATICS */
980 /* Analytical PME correction */
981 zeta2 = _mm256_mul_ps(beta2,rsq13);
982 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
983 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
984 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
985 felec = _mm256_mul_ps(qq13,felec);
986 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
987 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
988 velec = _mm256_sub_ps(rinv13,pmecorrV);
989 velec = _mm256_mul_ps(qq13,velec);
991 /* Update potential sum for this i atom from the interaction with this j atom. */
992 velec = _mm256_andnot_ps(dummy_mask,velec);
993 velecsum = _mm256_add_ps(velecsum,velec);
997 fscal = _mm256_andnot_ps(dummy_mask,fscal);
999 /* Calculate temporary vectorial force */
1000 tx = _mm256_mul_ps(fscal,dx13);
1001 ty = _mm256_mul_ps(fscal,dy13);
1002 tz = _mm256_mul_ps(fscal,dz13);
1004 /* Update vectorial force */
1005 fix1 = _mm256_add_ps(fix1,tx);
1006 fiy1 = _mm256_add_ps(fiy1,ty);
1007 fiz1 = _mm256_add_ps(fiz1,tz);
1009 fjx3 = _mm256_add_ps(fjx3,tx);
1010 fjy3 = _mm256_add_ps(fjy3,ty);
1011 fjz3 = _mm256_add_ps(fjz3,tz);
1013 /**************************
1014 * CALCULATE INTERACTIONS *
1015 **************************/
1017 r21 = _mm256_mul_ps(rsq21,rinv21);
1018 r21 = _mm256_andnot_ps(dummy_mask,r21);
1020 /* EWALD ELECTROSTATICS */
1022 /* Analytical PME correction */
1023 zeta2 = _mm256_mul_ps(beta2,rsq21);
1024 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1025 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1026 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1027 felec = _mm256_mul_ps(qq21,felec);
1028 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1029 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1030 velec = _mm256_sub_ps(rinv21,pmecorrV);
1031 velec = _mm256_mul_ps(qq21,velec);
1033 /* Update potential sum for this i atom from the interaction with this j atom. */
1034 velec = _mm256_andnot_ps(dummy_mask,velec);
1035 velecsum = _mm256_add_ps(velecsum,velec);
1039 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1041 /* Calculate temporary vectorial force */
1042 tx = _mm256_mul_ps(fscal,dx21);
1043 ty = _mm256_mul_ps(fscal,dy21);
1044 tz = _mm256_mul_ps(fscal,dz21);
1046 /* Update vectorial force */
1047 fix2 = _mm256_add_ps(fix2,tx);
1048 fiy2 = _mm256_add_ps(fiy2,ty);
1049 fiz2 = _mm256_add_ps(fiz2,tz);
1051 fjx1 = _mm256_add_ps(fjx1,tx);
1052 fjy1 = _mm256_add_ps(fjy1,ty);
1053 fjz1 = _mm256_add_ps(fjz1,tz);
1055 /**************************
1056 * CALCULATE INTERACTIONS *
1057 **************************/
1059 r22 = _mm256_mul_ps(rsq22,rinv22);
1060 r22 = _mm256_andnot_ps(dummy_mask,r22);
1062 /* EWALD ELECTROSTATICS */
1064 /* Analytical PME correction */
1065 zeta2 = _mm256_mul_ps(beta2,rsq22);
1066 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1067 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1068 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1069 felec = _mm256_mul_ps(qq22,felec);
1070 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1071 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1072 velec = _mm256_sub_ps(rinv22,pmecorrV);
1073 velec = _mm256_mul_ps(qq22,velec);
1075 /* Update potential sum for this i atom from the interaction with this j atom. */
1076 velec = _mm256_andnot_ps(dummy_mask,velec);
1077 velecsum = _mm256_add_ps(velecsum,velec);
1081 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1083 /* Calculate temporary vectorial force */
1084 tx = _mm256_mul_ps(fscal,dx22);
1085 ty = _mm256_mul_ps(fscal,dy22);
1086 tz = _mm256_mul_ps(fscal,dz22);
1088 /* Update vectorial force */
1089 fix2 = _mm256_add_ps(fix2,tx);
1090 fiy2 = _mm256_add_ps(fiy2,ty);
1091 fiz2 = _mm256_add_ps(fiz2,tz);
1093 fjx2 = _mm256_add_ps(fjx2,tx);
1094 fjy2 = _mm256_add_ps(fjy2,ty);
1095 fjz2 = _mm256_add_ps(fjz2,tz);
1097 /**************************
1098 * CALCULATE INTERACTIONS *
1099 **************************/
1101 r23 = _mm256_mul_ps(rsq23,rinv23);
1102 r23 = _mm256_andnot_ps(dummy_mask,r23);
1104 /* EWALD ELECTROSTATICS */
1106 /* Analytical PME correction */
1107 zeta2 = _mm256_mul_ps(beta2,rsq23);
1108 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1109 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1110 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1111 felec = _mm256_mul_ps(qq23,felec);
1112 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1113 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1114 velec = _mm256_sub_ps(rinv23,pmecorrV);
1115 velec = _mm256_mul_ps(qq23,velec);
1117 /* Update potential sum for this i atom from the interaction with this j atom. */
1118 velec = _mm256_andnot_ps(dummy_mask,velec);
1119 velecsum = _mm256_add_ps(velecsum,velec);
1123 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1125 /* Calculate temporary vectorial force */
1126 tx = _mm256_mul_ps(fscal,dx23);
1127 ty = _mm256_mul_ps(fscal,dy23);
1128 tz = _mm256_mul_ps(fscal,dz23);
1130 /* Update vectorial force */
1131 fix2 = _mm256_add_ps(fix2,tx);
1132 fiy2 = _mm256_add_ps(fiy2,ty);
1133 fiz2 = _mm256_add_ps(fiz2,tz);
1135 fjx3 = _mm256_add_ps(fjx3,tx);
1136 fjy3 = _mm256_add_ps(fjy3,ty);
1137 fjz3 = _mm256_add_ps(fjz3,tz);
1139 /**************************
1140 * CALCULATE INTERACTIONS *
1141 **************************/
1143 r31 = _mm256_mul_ps(rsq31,rinv31);
1144 r31 = _mm256_andnot_ps(dummy_mask,r31);
1146 /* EWALD ELECTROSTATICS */
1148 /* Analytical PME correction */
1149 zeta2 = _mm256_mul_ps(beta2,rsq31);
1150 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1151 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1152 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1153 felec = _mm256_mul_ps(qq31,felec);
1154 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1155 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1156 velec = _mm256_sub_ps(rinv31,pmecorrV);
1157 velec = _mm256_mul_ps(qq31,velec);
1159 /* Update potential sum for this i atom from the interaction with this j atom. */
1160 velec = _mm256_andnot_ps(dummy_mask,velec);
1161 velecsum = _mm256_add_ps(velecsum,velec);
1165 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1167 /* Calculate temporary vectorial force */
1168 tx = _mm256_mul_ps(fscal,dx31);
1169 ty = _mm256_mul_ps(fscal,dy31);
1170 tz = _mm256_mul_ps(fscal,dz31);
1172 /* Update vectorial force */
1173 fix3 = _mm256_add_ps(fix3,tx);
1174 fiy3 = _mm256_add_ps(fiy3,ty);
1175 fiz3 = _mm256_add_ps(fiz3,tz);
1177 fjx1 = _mm256_add_ps(fjx1,tx);
1178 fjy1 = _mm256_add_ps(fjy1,ty);
1179 fjz1 = _mm256_add_ps(fjz1,tz);
1181 /**************************
1182 * CALCULATE INTERACTIONS *
1183 **************************/
1185 r32 = _mm256_mul_ps(rsq32,rinv32);
1186 r32 = _mm256_andnot_ps(dummy_mask,r32);
1188 /* EWALD ELECTROSTATICS */
1190 /* Analytical PME correction */
1191 zeta2 = _mm256_mul_ps(beta2,rsq32);
1192 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1193 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1194 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1195 felec = _mm256_mul_ps(qq32,felec);
1196 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1197 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1198 velec = _mm256_sub_ps(rinv32,pmecorrV);
1199 velec = _mm256_mul_ps(qq32,velec);
1201 /* Update potential sum for this i atom from the interaction with this j atom. */
1202 velec = _mm256_andnot_ps(dummy_mask,velec);
1203 velecsum = _mm256_add_ps(velecsum,velec);
1207 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1209 /* Calculate temporary vectorial force */
1210 tx = _mm256_mul_ps(fscal,dx32);
1211 ty = _mm256_mul_ps(fscal,dy32);
1212 tz = _mm256_mul_ps(fscal,dz32);
1214 /* Update vectorial force */
1215 fix3 = _mm256_add_ps(fix3,tx);
1216 fiy3 = _mm256_add_ps(fiy3,ty);
1217 fiz3 = _mm256_add_ps(fiz3,tz);
1219 fjx2 = _mm256_add_ps(fjx2,tx);
1220 fjy2 = _mm256_add_ps(fjy2,ty);
1221 fjz2 = _mm256_add_ps(fjz2,tz);
1223 /**************************
1224 * CALCULATE INTERACTIONS *
1225 **************************/
1227 r33 = _mm256_mul_ps(rsq33,rinv33);
1228 r33 = _mm256_andnot_ps(dummy_mask,r33);
1230 /* EWALD ELECTROSTATICS */
1232 /* Analytical PME correction */
1233 zeta2 = _mm256_mul_ps(beta2,rsq33);
1234 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1235 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1236 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1237 felec = _mm256_mul_ps(qq33,felec);
1238 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1239 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1240 velec = _mm256_sub_ps(rinv33,pmecorrV);
1241 velec = _mm256_mul_ps(qq33,velec);
1243 /* Update potential sum for this i atom from the interaction with this j atom. */
1244 velec = _mm256_andnot_ps(dummy_mask,velec);
1245 velecsum = _mm256_add_ps(velecsum,velec);
1249 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1251 /* Calculate temporary vectorial force */
1252 tx = _mm256_mul_ps(fscal,dx33);
1253 ty = _mm256_mul_ps(fscal,dy33);
1254 tz = _mm256_mul_ps(fscal,dz33);
1256 /* Update vectorial force */
1257 fix3 = _mm256_add_ps(fix3,tx);
1258 fiy3 = _mm256_add_ps(fiy3,ty);
1259 fiz3 = _mm256_add_ps(fiz3,tz);
1261 fjx3 = _mm256_add_ps(fjx3,tx);
1262 fjy3 = _mm256_add_ps(fjy3,ty);
1263 fjz3 = _mm256_add_ps(fjz3,tz);
1265 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1266 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1267 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1268 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1269 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1270 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1271 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1272 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1274 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1275 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1276 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1278 /* Inner loop uses 800 flops */
1281 /* End of innermost loop */
1283 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1284 f+i_coord_offset,fshift+i_shift_offset);
1287 /* Update potential energies */
1288 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1289 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1291 /* Increment number of inner iterations */
1292 inneriter += j_index_end - j_index_start;
1294 /* Outer loop uses 26 flops */
1297 /* Increment number of outer iterations */
1300 /* Update outer/inner flops */
1302 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*800);
1305 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_single
1306 * Electrostatics interaction: Ewald
1307 * VdW interaction: LennardJones
1308 * Geometry: Water4-Water4
1309 * Calculate force/pot: Force
1312 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_single
1313 (t_nblist * gmx_restrict nlist,
1314 rvec * gmx_restrict xx,
1315 rvec * gmx_restrict ff,
1316 t_forcerec * gmx_restrict fr,
1317 t_mdatoms * gmx_restrict mdatoms,
1318 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1319 t_nrnb * gmx_restrict nrnb)
1321 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1322 * just 0 for non-waters.
1323 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1324 * jnr indices corresponding to data put in the four positions in the SIMD register.
1326 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1327 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1328 int jnrA,jnrB,jnrC,jnrD;
1329 int jnrE,jnrF,jnrG,jnrH;
1330 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1331 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1332 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1333 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1334 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1335 real rcutoff_scalar;
1336 real *shiftvec,*fshift,*x,*f;
1337 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1338 real scratch[4*DIM];
1339 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1340 real * vdwioffsetptr0;
1341 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1342 real * vdwioffsetptr1;
1343 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1344 real * vdwioffsetptr2;
1345 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1346 real * vdwioffsetptr3;
1347 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1348 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1349 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1350 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1351 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1352 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1353 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1354 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1355 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1356 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1357 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1358 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1359 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1360 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1361 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1362 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1363 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1364 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1365 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1366 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1369 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1372 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1373 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1375 __m128i ewitab_lo,ewitab_hi;
1376 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1377 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1379 __m256 dummy_mask,cutoff_mask;
1380 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1381 __m256 one = _mm256_set1_ps(1.0);
1382 __m256 two = _mm256_set1_ps(2.0);
1388 jindex = nlist->jindex;
1390 shiftidx = nlist->shift;
1392 shiftvec = fr->shift_vec[0];
1393 fshift = fr->fshift[0];
1394 facel = _mm256_set1_ps(fr->epsfac);
1395 charge = mdatoms->chargeA;
1396 nvdwtype = fr->ntype;
1397 vdwparam = fr->nbfp;
1398 vdwtype = mdatoms->typeA;
1400 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1401 beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1402 beta2 = _mm256_mul_ps(beta,beta);
1403 beta3 = _mm256_mul_ps(beta,beta2);
1405 ewtab = fr->ic->tabq_coul_F;
1406 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1407 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1409 /* Setup water-specific parameters */
1410 inr = nlist->iinr[0];
1411 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1412 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1413 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1414 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1416 jq1 = _mm256_set1_ps(charge[inr+1]);
1417 jq2 = _mm256_set1_ps(charge[inr+2]);
1418 jq3 = _mm256_set1_ps(charge[inr+3]);
1419 vdwjidx0A = 2*vdwtype[inr+0];
1420 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1421 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1422 qq11 = _mm256_mul_ps(iq1,jq1);
1423 qq12 = _mm256_mul_ps(iq1,jq2);
1424 qq13 = _mm256_mul_ps(iq1,jq3);
1425 qq21 = _mm256_mul_ps(iq2,jq1);
1426 qq22 = _mm256_mul_ps(iq2,jq2);
1427 qq23 = _mm256_mul_ps(iq2,jq3);
1428 qq31 = _mm256_mul_ps(iq3,jq1);
1429 qq32 = _mm256_mul_ps(iq3,jq2);
1430 qq33 = _mm256_mul_ps(iq3,jq3);
1432 /* Avoid stupid compiler warnings */
1433 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1434 j_coord_offsetA = 0;
1435 j_coord_offsetB = 0;
1436 j_coord_offsetC = 0;
1437 j_coord_offsetD = 0;
1438 j_coord_offsetE = 0;
1439 j_coord_offsetF = 0;
1440 j_coord_offsetG = 0;
1441 j_coord_offsetH = 0;
1446 for(iidx=0;iidx<4*DIM;iidx++)
1448 scratch[iidx] = 0.0;
1451 /* Start outer loop over neighborlists */
1452 for(iidx=0; iidx<nri; iidx++)
1454 /* Load shift vector for this list */
1455 i_shift_offset = DIM*shiftidx[iidx];
1457 /* Load limits for loop over neighbors */
1458 j_index_start = jindex[iidx];
1459 j_index_end = jindex[iidx+1];
1461 /* Get outer coordinate index */
1463 i_coord_offset = DIM*inr;
1465 /* Load i particle coords and add shift vector */
1466 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1467 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1469 fix0 = _mm256_setzero_ps();
1470 fiy0 = _mm256_setzero_ps();
1471 fiz0 = _mm256_setzero_ps();
1472 fix1 = _mm256_setzero_ps();
1473 fiy1 = _mm256_setzero_ps();
1474 fiz1 = _mm256_setzero_ps();
1475 fix2 = _mm256_setzero_ps();
1476 fiy2 = _mm256_setzero_ps();
1477 fiz2 = _mm256_setzero_ps();
1478 fix3 = _mm256_setzero_ps();
1479 fiy3 = _mm256_setzero_ps();
1480 fiz3 = _mm256_setzero_ps();
1482 /* Start inner kernel loop */
1483 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1486 /* Get j neighbor index, and coordinate index */
1488 jnrB = jjnr[jidx+1];
1489 jnrC = jjnr[jidx+2];
1490 jnrD = jjnr[jidx+3];
1491 jnrE = jjnr[jidx+4];
1492 jnrF = jjnr[jidx+5];
1493 jnrG = jjnr[jidx+6];
1494 jnrH = jjnr[jidx+7];
1495 j_coord_offsetA = DIM*jnrA;
1496 j_coord_offsetB = DIM*jnrB;
1497 j_coord_offsetC = DIM*jnrC;
1498 j_coord_offsetD = DIM*jnrD;
1499 j_coord_offsetE = DIM*jnrE;
1500 j_coord_offsetF = DIM*jnrF;
1501 j_coord_offsetG = DIM*jnrG;
1502 j_coord_offsetH = DIM*jnrH;
1504 /* load j atom coordinates */
1505 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1506 x+j_coord_offsetC,x+j_coord_offsetD,
1507 x+j_coord_offsetE,x+j_coord_offsetF,
1508 x+j_coord_offsetG,x+j_coord_offsetH,
1509 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1510 &jy2,&jz2,&jx3,&jy3,&jz3);
1512 /* Calculate displacement vector */
1513 dx00 = _mm256_sub_ps(ix0,jx0);
1514 dy00 = _mm256_sub_ps(iy0,jy0);
1515 dz00 = _mm256_sub_ps(iz0,jz0);
1516 dx11 = _mm256_sub_ps(ix1,jx1);
1517 dy11 = _mm256_sub_ps(iy1,jy1);
1518 dz11 = _mm256_sub_ps(iz1,jz1);
1519 dx12 = _mm256_sub_ps(ix1,jx2);
1520 dy12 = _mm256_sub_ps(iy1,jy2);
1521 dz12 = _mm256_sub_ps(iz1,jz2);
1522 dx13 = _mm256_sub_ps(ix1,jx3);
1523 dy13 = _mm256_sub_ps(iy1,jy3);
1524 dz13 = _mm256_sub_ps(iz1,jz3);
1525 dx21 = _mm256_sub_ps(ix2,jx1);
1526 dy21 = _mm256_sub_ps(iy2,jy1);
1527 dz21 = _mm256_sub_ps(iz2,jz1);
1528 dx22 = _mm256_sub_ps(ix2,jx2);
1529 dy22 = _mm256_sub_ps(iy2,jy2);
1530 dz22 = _mm256_sub_ps(iz2,jz2);
1531 dx23 = _mm256_sub_ps(ix2,jx3);
1532 dy23 = _mm256_sub_ps(iy2,jy3);
1533 dz23 = _mm256_sub_ps(iz2,jz3);
1534 dx31 = _mm256_sub_ps(ix3,jx1);
1535 dy31 = _mm256_sub_ps(iy3,jy1);
1536 dz31 = _mm256_sub_ps(iz3,jz1);
1537 dx32 = _mm256_sub_ps(ix3,jx2);
1538 dy32 = _mm256_sub_ps(iy3,jy2);
1539 dz32 = _mm256_sub_ps(iz3,jz2);
1540 dx33 = _mm256_sub_ps(ix3,jx3);
1541 dy33 = _mm256_sub_ps(iy3,jy3);
1542 dz33 = _mm256_sub_ps(iz3,jz3);
1544 /* Calculate squared distance and things based on it */
1545 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1546 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1547 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1548 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1549 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1550 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1551 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1552 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1553 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1554 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1556 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1557 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1558 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1559 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1560 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1561 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1562 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1563 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1564 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1566 rinvsq00 = gmx_mm256_inv_ps(rsq00);
1567 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1568 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1569 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1570 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1571 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1572 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1573 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1574 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1575 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1577 fjx0 = _mm256_setzero_ps();
1578 fjy0 = _mm256_setzero_ps();
1579 fjz0 = _mm256_setzero_ps();
1580 fjx1 = _mm256_setzero_ps();
1581 fjy1 = _mm256_setzero_ps();
1582 fjz1 = _mm256_setzero_ps();
1583 fjx2 = _mm256_setzero_ps();
1584 fjy2 = _mm256_setzero_ps();
1585 fjz2 = _mm256_setzero_ps();
1586 fjx3 = _mm256_setzero_ps();
1587 fjy3 = _mm256_setzero_ps();
1588 fjz3 = _mm256_setzero_ps();
1590 /**************************
1591 * CALCULATE INTERACTIONS *
1592 **************************/
1594 /* LENNARD-JONES DISPERSION/REPULSION */
1596 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1597 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1601 /* Calculate temporary vectorial force */
1602 tx = _mm256_mul_ps(fscal,dx00);
1603 ty = _mm256_mul_ps(fscal,dy00);
1604 tz = _mm256_mul_ps(fscal,dz00);
1606 /* Update vectorial force */
1607 fix0 = _mm256_add_ps(fix0,tx);
1608 fiy0 = _mm256_add_ps(fiy0,ty);
1609 fiz0 = _mm256_add_ps(fiz0,tz);
1611 fjx0 = _mm256_add_ps(fjx0,tx);
1612 fjy0 = _mm256_add_ps(fjy0,ty);
1613 fjz0 = _mm256_add_ps(fjz0,tz);
1615 /**************************
1616 * CALCULATE INTERACTIONS *
1617 **************************/
1619 r11 = _mm256_mul_ps(rsq11,rinv11);
1621 /* EWALD ELECTROSTATICS */
1623 /* Analytical PME correction */
1624 zeta2 = _mm256_mul_ps(beta2,rsq11);
1625 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1626 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1627 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1628 felec = _mm256_mul_ps(qq11,felec);
1632 /* Calculate temporary vectorial force */
1633 tx = _mm256_mul_ps(fscal,dx11);
1634 ty = _mm256_mul_ps(fscal,dy11);
1635 tz = _mm256_mul_ps(fscal,dz11);
1637 /* Update vectorial force */
1638 fix1 = _mm256_add_ps(fix1,tx);
1639 fiy1 = _mm256_add_ps(fiy1,ty);
1640 fiz1 = _mm256_add_ps(fiz1,tz);
1642 fjx1 = _mm256_add_ps(fjx1,tx);
1643 fjy1 = _mm256_add_ps(fjy1,ty);
1644 fjz1 = _mm256_add_ps(fjz1,tz);
1646 /**************************
1647 * CALCULATE INTERACTIONS *
1648 **************************/
1650 r12 = _mm256_mul_ps(rsq12,rinv12);
1652 /* EWALD ELECTROSTATICS */
1654 /* Analytical PME correction */
1655 zeta2 = _mm256_mul_ps(beta2,rsq12);
1656 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1657 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1658 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1659 felec = _mm256_mul_ps(qq12,felec);
1663 /* Calculate temporary vectorial force */
1664 tx = _mm256_mul_ps(fscal,dx12);
1665 ty = _mm256_mul_ps(fscal,dy12);
1666 tz = _mm256_mul_ps(fscal,dz12);
1668 /* Update vectorial force */
1669 fix1 = _mm256_add_ps(fix1,tx);
1670 fiy1 = _mm256_add_ps(fiy1,ty);
1671 fiz1 = _mm256_add_ps(fiz1,tz);
1673 fjx2 = _mm256_add_ps(fjx2,tx);
1674 fjy2 = _mm256_add_ps(fjy2,ty);
1675 fjz2 = _mm256_add_ps(fjz2,tz);
1677 /**************************
1678 * CALCULATE INTERACTIONS *
1679 **************************/
1681 r13 = _mm256_mul_ps(rsq13,rinv13);
1683 /* EWALD ELECTROSTATICS */
1685 /* Analytical PME correction */
1686 zeta2 = _mm256_mul_ps(beta2,rsq13);
1687 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1688 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1689 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1690 felec = _mm256_mul_ps(qq13,felec);
1694 /* Calculate temporary vectorial force */
1695 tx = _mm256_mul_ps(fscal,dx13);
1696 ty = _mm256_mul_ps(fscal,dy13);
1697 tz = _mm256_mul_ps(fscal,dz13);
1699 /* Update vectorial force */
1700 fix1 = _mm256_add_ps(fix1,tx);
1701 fiy1 = _mm256_add_ps(fiy1,ty);
1702 fiz1 = _mm256_add_ps(fiz1,tz);
1704 fjx3 = _mm256_add_ps(fjx3,tx);
1705 fjy3 = _mm256_add_ps(fjy3,ty);
1706 fjz3 = _mm256_add_ps(fjz3,tz);
1708 /**************************
1709 * CALCULATE INTERACTIONS *
1710 **************************/
1712 r21 = _mm256_mul_ps(rsq21,rinv21);
1714 /* EWALD ELECTROSTATICS */
1716 /* Analytical PME correction */
1717 zeta2 = _mm256_mul_ps(beta2,rsq21);
1718 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1719 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1720 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1721 felec = _mm256_mul_ps(qq21,felec);
1725 /* Calculate temporary vectorial force */
1726 tx = _mm256_mul_ps(fscal,dx21);
1727 ty = _mm256_mul_ps(fscal,dy21);
1728 tz = _mm256_mul_ps(fscal,dz21);
1730 /* Update vectorial force */
1731 fix2 = _mm256_add_ps(fix2,tx);
1732 fiy2 = _mm256_add_ps(fiy2,ty);
1733 fiz2 = _mm256_add_ps(fiz2,tz);
1735 fjx1 = _mm256_add_ps(fjx1,tx);
1736 fjy1 = _mm256_add_ps(fjy1,ty);
1737 fjz1 = _mm256_add_ps(fjz1,tz);
1739 /**************************
1740 * CALCULATE INTERACTIONS *
1741 **************************/
1743 r22 = _mm256_mul_ps(rsq22,rinv22);
1745 /* EWALD ELECTROSTATICS */
1747 /* Analytical PME correction */
1748 zeta2 = _mm256_mul_ps(beta2,rsq22);
1749 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1750 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1751 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1752 felec = _mm256_mul_ps(qq22,felec);
1756 /* Calculate temporary vectorial force */
1757 tx = _mm256_mul_ps(fscal,dx22);
1758 ty = _mm256_mul_ps(fscal,dy22);
1759 tz = _mm256_mul_ps(fscal,dz22);
1761 /* Update vectorial force */
1762 fix2 = _mm256_add_ps(fix2,tx);
1763 fiy2 = _mm256_add_ps(fiy2,ty);
1764 fiz2 = _mm256_add_ps(fiz2,tz);
1766 fjx2 = _mm256_add_ps(fjx2,tx);
1767 fjy2 = _mm256_add_ps(fjy2,ty);
1768 fjz2 = _mm256_add_ps(fjz2,tz);
1770 /**************************
1771 * CALCULATE INTERACTIONS *
1772 **************************/
1774 r23 = _mm256_mul_ps(rsq23,rinv23);
1776 /* EWALD ELECTROSTATICS */
1778 /* Analytical PME correction */
1779 zeta2 = _mm256_mul_ps(beta2,rsq23);
1780 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1781 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1782 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1783 felec = _mm256_mul_ps(qq23,felec);
1787 /* Calculate temporary vectorial force */
1788 tx = _mm256_mul_ps(fscal,dx23);
1789 ty = _mm256_mul_ps(fscal,dy23);
1790 tz = _mm256_mul_ps(fscal,dz23);
1792 /* Update vectorial force */
1793 fix2 = _mm256_add_ps(fix2,tx);
1794 fiy2 = _mm256_add_ps(fiy2,ty);
1795 fiz2 = _mm256_add_ps(fiz2,tz);
1797 fjx3 = _mm256_add_ps(fjx3,tx);
1798 fjy3 = _mm256_add_ps(fjy3,ty);
1799 fjz3 = _mm256_add_ps(fjz3,tz);
1801 /**************************
1802 * CALCULATE INTERACTIONS *
1803 **************************/
1805 r31 = _mm256_mul_ps(rsq31,rinv31);
1807 /* EWALD ELECTROSTATICS */
1809 /* Analytical PME correction */
1810 zeta2 = _mm256_mul_ps(beta2,rsq31);
1811 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1812 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1813 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1814 felec = _mm256_mul_ps(qq31,felec);
1818 /* Calculate temporary vectorial force */
1819 tx = _mm256_mul_ps(fscal,dx31);
1820 ty = _mm256_mul_ps(fscal,dy31);
1821 tz = _mm256_mul_ps(fscal,dz31);
1823 /* Update vectorial force */
1824 fix3 = _mm256_add_ps(fix3,tx);
1825 fiy3 = _mm256_add_ps(fiy3,ty);
1826 fiz3 = _mm256_add_ps(fiz3,tz);
1828 fjx1 = _mm256_add_ps(fjx1,tx);
1829 fjy1 = _mm256_add_ps(fjy1,ty);
1830 fjz1 = _mm256_add_ps(fjz1,tz);
1832 /**************************
1833 * CALCULATE INTERACTIONS *
1834 **************************/
1836 r32 = _mm256_mul_ps(rsq32,rinv32);
1838 /* EWALD ELECTROSTATICS */
1840 /* Analytical PME correction */
1841 zeta2 = _mm256_mul_ps(beta2,rsq32);
1842 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1843 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1844 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1845 felec = _mm256_mul_ps(qq32,felec);
1849 /* Calculate temporary vectorial force */
1850 tx = _mm256_mul_ps(fscal,dx32);
1851 ty = _mm256_mul_ps(fscal,dy32);
1852 tz = _mm256_mul_ps(fscal,dz32);
1854 /* Update vectorial force */
1855 fix3 = _mm256_add_ps(fix3,tx);
1856 fiy3 = _mm256_add_ps(fiy3,ty);
1857 fiz3 = _mm256_add_ps(fiz3,tz);
1859 fjx2 = _mm256_add_ps(fjx2,tx);
1860 fjy2 = _mm256_add_ps(fjy2,ty);
1861 fjz2 = _mm256_add_ps(fjz2,tz);
1863 /**************************
1864 * CALCULATE INTERACTIONS *
1865 **************************/
1867 r33 = _mm256_mul_ps(rsq33,rinv33);
1869 /* EWALD ELECTROSTATICS */
1871 /* Analytical PME correction */
1872 zeta2 = _mm256_mul_ps(beta2,rsq33);
1873 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1874 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1875 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1876 felec = _mm256_mul_ps(qq33,felec);
1880 /* Calculate temporary vectorial force */
1881 tx = _mm256_mul_ps(fscal,dx33);
1882 ty = _mm256_mul_ps(fscal,dy33);
1883 tz = _mm256_mul_ps(fscal,dz33);
1885 /* Update vectorial force */
1886 fix3 = _mm256_add_ps(fix3,tx);
1887 fiy3 = _mm256_add_ps(fiy3,ty);
1888 fiz3 = _mm256_add_ps(fiz3,tz);
1890 fjx3 = _mm256_add_ps(fjx3,tx);
1891 fjy3 = _mm256_add_ps(fjy3,ty);
1892 fjz3 = _mm256_add_ps(fjz3,tz);
1894 fjptrA = f+j_coord_offsetA;
1895 fjptrB = f+j_coord_offsetB;
1896 fjptrC = f+j_coord_offsetC;
1897 fjptrD = f+j_coord_offsetD;
1898 fjptrE = f+j_coord_offsetE;
1899 fjptrF = f+j_coord_offsetF;
1900 fjptrG = f+j_coord_offsetG;
1901 fjptrH = f+j_coord_offsetH;
1903 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1904 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1905 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1907 /* Inner loop uses 534 flops */
1910 if(jidx<j_index_end)
1913 /* Get j neighbor index, and coordinate index */
1914 jnrlistA = jjnr[jidx];
1915 jnrlistB = jjnr[jidx+1];
1916 jnrlistC = jjnr[jidx+2];
1917 jnrlistD = jjnr[jidx+3];
1918 jnrlistE = jjnr[jidx+4];
1919 jnrlistF = jjnr[jidx+5];
1920 jnrlistG = jjnr[jidx+6];
1921 jnrlistH = jjnr[jidx+7];
1922 /* Sign of each element will be negative for non-real atoms.
1923 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1924 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1926 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1927 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1929 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1930 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1931 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1932 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1933 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1934 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1935 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1936 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1937 j_coord_offsetA = DIM*jnrA;
1938 j_coord_offsetB = DIM*jnrB;
1939 j_coord_offsetC = DIM*jnrC;
1940 j_coord_offsetD = DIM*jnrD;
1941 j_coord_offsetE = DIM*jnrE;
1942 j_coord_offsetF = DIM*jnrF;
1943 j_coord_offsetG = DIM*jnrG;
1944 j_coord_offsetH = DIM*jnrH;
1946 /* load j atom coordinates */
1947 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1948 x+j_coord_offsetC,x+j_coord_offsetD,
1949 x+j_coord_offsetE,x+j_coord_offsetF,
1950 x+j_coord_offsetG,x+j_coord_offsetH,
1951 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1952 &jy2,&jz2,&jx3,&jy3,&jz3);
1954 /* Calculate displacement vector */
1955 dx00 = _mm256_sub_ps(ix0,jx0);
1956 dy00 = _mm256_sub_ps(iy0,jy0);
1957 dz00 = _mm256_sub_ps(iz0,jz0);
1958 dx11 = _mm256_sub_ps(ix1,jx1);
1959 dy11 = _mm256_sub_ps(iy1,jy1);
1960 dz11 = _mm256_sub_ps(iz1,jz1);
1961 dx12 = _mm256_sub_ps(ix1,jx2);
1962 dy12 = _mm256_sub_ps(iy1,jy2);
1963 dz12 = _mm256_sub_ps(iz1,jz2);
1964 dx13 = _mm256_sub_ps(ix1,jx3);
1965 dy13 = _mm256_sub_ps(iy1,jy3);
1966 dz13 = _mm256_sub_ps(iz1,jz3);
1967 dx21 = _mm256_sub_ps(ix2,jx1);
1968 dy21 = _mm256_sub_ps(iy2,jy1);
1969 dz21 = _mm256_sub_ps(iz2,jz1);
1970 dx22 = _mm256_sub_ps(ix2,jx2);
1971 dy22 = _mm256_sub_ps(iy2,jy2);
1972 dz22 = _mm256_sub_ps(iz2,jz2);
1973 dx23 = _mm256_sub_ps(ix2,jx3);
1974 dy23 = _mm256_sub_ps(iy2,jy3);
1975 dz23 = _mm256_sub_ps(iz2,jz3);
1976 dx31 = _mm256_sub_ps(ix3,jx1);
1977 dy31 = _mm256_sub_ps(iy3,jy1);
1978 dz31 = _mm256_sub_ps(iz3,jz1);
1979 dx32 = _mm256_sub_ps(ix3,jx2);
1980 dy32 = _mm256_sub_ps(iy3,jy2);
1981 dz32 = _mm256_sub_ps(iz3,jz2);
1982 dx33 = _mm256_sub_ps(ix3,jx3);
1983 dy33 = _mm256_sub_ps(iy3,jy3);
1984 dz33 = _mm256_sub_ps(iz3,jz3);
1986 /* Calculate squared distance and things based on it */
1987 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1988 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1989 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1990 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1991 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1992 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1993 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1994 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1995 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1996 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1998 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1999 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2000 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2001 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2002 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2003 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2004 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2005 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2006 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2008 rinvsq00 = gmx_mm256_inv_ps(rsq00);
2009 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
2010 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
2011 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
2012 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
2013 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
2014 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
2015 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
2016 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
2017 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
2019 fjx0 = _mm256_setzero_ps();
2020 fjy0 = _mm256_setzero_ps();
2021 fjz0 = _mm256_setzero_ps();
2022 fjx1 = _mm256_setzero_ps();
2023 fjy1 = _mm256_setzero_ps();
2024 fjz1 = _mm256_setzero_ps();
2025 fjx2 = _mm256_setzero_ps();
2026 fjy2 = _mm256_setzero_ps();
2027 fjz2 = _mm256_setzero_ps();
2028 fjx3 = _mm256_setzero_ps();
2029 fjy3 = _mm256_setzero_ps();
2030 fjz3 = _mm256_setzero_ps();
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 /* LENNARD-JONES DISPERSION/REPULSION */
2038 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2039 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2043 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2045 /* Calculate temporary vectorial force */
2046 tx = _mm256_mul_ps(fscal,dx00);
2047 ty = _mm256_mul_ps(fscal,dy00);
2048 tz = _mm256_mul_ps(fscal,dz00);
2050 /* Update vectorial force */
2051 fix0 = _mm256_add_ps(fix0,tx);
2052 fiy0 = _mm256_add_ps(fiy0,ty);
2053 fiz0 = _mm256_add_ps(fiz0,tz);
2055 fjx0 = _mm256_add_ps(fjx0,tx);
2056 fjy0 = _mm256_add_ps(fjy0,ty);
2057 fjz0 = _mm256_add_ps(fjz0,tz);
2059 /**************************
2060 * CALCULATE INTERACTIONS *
2061 **************************/
2063 r11 = _mm256_mul_ps(rsq11,rinv11);
2064 r11 = _mm256_andnot_ps(dummy_mask,r11);
2066 /* EWALD ELECTROSTATICS */
2068 /* Analytical PME correction */
2069 zeta2 = _mm256_mul_ps(beta2,rsq11);
2070 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
2071 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2072 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2073 felec = _mm256_mul_ps(qq11,felec);
2077 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2079 /* Calculate temporary vectorial force */
2080 tx = _mm256_mul_ps(fscal,dx11);
2081 ty = _mm256_mul_ps(fscal,dy11);
2082 tz = _mm256_mul_ps(fscal,dz11);
2084 /* Update vectorial force */
2085 fix1 = _mm256_add_ps(fix1,tx);
2086 fiy1 = _mm256_add_ps(fiy1,ty);
2087 fiz1 = _mm256_add_ps(fiz1,tz);
2089 fjx1 = _mm256_add_ps(fjx1,tx);
2090 fjy1 = _mm256_add_ps(fjy1,ty);
2091 fjz1 = _mm256_add_ps(fjz1,tz);
2093 /**************************
2094 * CALCULATE INTERACTIONS *
2095 **************************/
2097 r12 = _mm256_mul_ps(rsq12,rinv12);
2098 r12 = _mm256_andnot_ps(dummy_mask,r12);
2100 /* EWALD ELECTROSTATICS */
2102 /* Analytical PME correction */
2103 zeta2 = _mm256_mul_ps(beta2,rsq12);
2104 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
2105 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2106 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2107 felec = _mm256_mul_ps(qq12,felec);
2111 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2113 /* Calculate temporary vectorial force */
2114 tx = _mm256_mul_ps(fscal,dx12);
2115 ty = _mm256_mul_ps(fscal,dy12);
2116 tz = _mm256_mul_ps(fscal,dz12);
2118 /* Update vectorial force */
2119 fix1 = _mm256_add_ps(fix1,tx);
2120 fiy1 = _mm256_add_ps(fiy1,ty);
2121 fiz1 = _mm256_add_ps(fiz1,tz);
2123 fjx2 = _mm256_add_ps(fjx2,tx);
2124 fjy2 = _mm256_add_ps(fjy2,ty);
2125 fjz2 = _mm256_add_ps(fjz2,tz);
2127 /**************************
2128 * CALCULATE INTERACTIONS *
2129 **************************/
2131 r13 = _mm256_mul_ps(rsq13,rinv13);
2132 r13 = _mm256_andnot_ps(dummy_mask,r13);
2134 /* EWALD ELECTROSTATICS */
2136 /* Analytical PME correction */
2137 zeta2 = _mm256_mul_ps(beta2,rsq13);
2138 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
2139 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2140 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2141 felec = _mm256_mul_ps(qq13,felec);
2145 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2147 /* Calculate temporary vectorial force */
2148 tx = _mm256_mul_ps(fscal,dx13);
2149 ty = _mm256_mul_ps(fscal,dy13);
2150 tz = _mm256_mul_ps(fscal,dz13);
2152 /* Update vectorial force */
2153 fix1 = _mm256_add_ps(fix1,tx);
2154 fiy1 = _mm256_add_ps(fiy1,ty);
2155 fiz1 = _mm256_add_ps(fiz1,tz);
2157 fjx3 = _mm256_add_ps(fjx3,tx);
2158 fjy3 = _mm256_add_ps(fjy3,ty);
2159 fjz3 = _mm256_add_ps(fjz3,tz);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 r21 = _mm256_mul_ps(rsq21,rinv21);
2166 r21 = _mm256_andnot_ps(dummy_mask,r21);
2168 /* EWALD ELECTROSTATICS */
2170 /* Analytical PME correction */
2171 zeta2 = _mm256_mul_ps(beta2,rsq21);
2172 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
2173 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2174 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2175 felec = _mm256_mul_ps(qq21,felec);
2179 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2181 /* Calculate temporary vectorial force */
2182 tx = _mm256_mul_ps(fscal,dx21);
2183 ty = _mm256_mul_ps(fscal,dy21);
2184 tz = _mm256_mul_ps(fscal,dz21);
2186 /* Update vectorial force */
2187 fix2 = _mm256_add_ps(fix2,tx);
2188 fiy2 = _mm256_add_ps(fiy2,ty);
2189 fiz2 = _mm256_add_ps(fiz2,tz);
2191 fjx1 = _mm256_add_ps(fjx1,tx);
2192 fjy1 = _mm256_add_ps(fjy1,ty);
2193 fjz1 = _mm256_add_ps(fjz1,tz);
2195 /**************************
2196 * CALCULATE INTERACTIONS *
2197 **************************/
2199 r22 = _mm256_mul_ps(rsq22,rinv22);
2200 r22 = _mm256_andnot_ps(dummy_mask,r22);
2202 /* EWALD ELECTROSTATICS */
2204 /* Analytical PME correction */
2205 zeta2 = _mm256_mul_ps(beta2,rsq22);
2206 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
2207 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2208 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2209 felec = _mm256_mul_ps(qq22,felec);
2213 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2215 /* Calculate temporary vectorial force */
2216 tx = _mm256_mul_ps(fscal,dx22);
2217 ty = _mm256_mul_ps(fscal,dy22);
2218 tz = _mm256_mul_ps(fscal,dz22);
2220 /* Update vectorial force */
2221 fix2 = _mm256_add_ps(fix2,tx);
2222 fiy2 = _mm256_add_ps(fiy2,ty);
2223 fiz2 = _mm256_add_ps(fiz2,tz);
2225 fjx2 = _mm256_add_ps(fjx2,tx);
2226 fjy2 = _mm256_add_ps(fjy2,ty);
2227 fjz2 = _mm256_add_ps(fjz2,tz);
2229 /**************************
2230 * CALCULATE INTERACTIONS *
2231 **************************/
2233 r23 = _mm256_mul_ps(rsq23,rinv23);
2234 r23 = _mm256_andnot_ps(dummy_mask,r23);
2236 /* EWALD ELECTROSTATICS */
2238 /* Analytical PME correction */
2239 zeta2 = _mm256_mul_ps(beta2,rsq23);
2240 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2241 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2242 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2243 felec = _mm256_mul_ps(qq23,felec);
2247 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2249 /* Calculate temporary vectorial force */
2250 tx = _mm256_mul_ps(fscal,dx23);
2251 ty = _mm256_mul_ps(fscal,dy23);
2252 tz = _mm256_mul_ps(fscal,dz23);
2254 /* Update vectorial force */
2255 fix2 = _mm256_add_ps(fix2,tx);
2256 fiy2 = _mm256_add_ps(fiy2,ty);
2257 fiz2 = _mm256_add_ps(fiz2,tz);
2259 fjx3 = _mm256_add_ps(fjx3,tx);
2260 fjy3 = _mm256_add_ps(fjy3,ty);
2261 fjz3 = _mm256_add_ps(fjz3,tz);
2263 /**************************
2264 * CALCULATE INTERACTIONS *
2265 **************************/
2267 r31 = _mm256_mul_ps(rsq31,rinv31);
2268 r31 = _mm256_andnot_ps(dummy_mask,r31);
2270 /* EWALD ELECTROSTATICS */
2272 /* Analytical PME correction */
2273 zeta2 = _mm256_mul_ps(beta2,rsq31);
2274 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2275 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2276 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2277 felec = _mm256_mul_ps(qq31,felec);
2281 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2283 /* Calculate temporary vectorial force */
2284 tx = _mm256_mul_ps(fscal,dx31);
2285 ty = _mm256_mul_ps(fscal,dy31);
2286 tz = _mm256_mul_ps(fscal,dz31);
2288 /* Update vectorial force */
2289 fix3 = _mm256_add_ps(fix3,tx);
2290 fiy3 = _mm256_add_ps(fiy3,ty);
2291 fiz3 = _mm256_add_ps(fiz3,tz);
2293 fjx1 = _mm256_add_ps(fjx1,tx);
2294 fjy1 = _mm256_add_ps(fjy1,ty);
2295 fjz1 = _mm256_add_ps(fjz1,tz);
2297 /**************************
2298 * CALCULATE INTERACTIONS *
2299 **************************/
2301 r32 = _mm256_mul_ps(rsq32,rinv32);
2302 r32 = _mm256_andnot_ps(dummy_mask,r32);
2304 /* EWALD ELECTROSTATICS */
2306 /* Analytical PME correction */
2307 zeta2 = _mm256_mul_ps(beta2,rsq32);
2308 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2309 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2310 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2311 felec = _mm256_mul_ps(qq32,felec);
2315 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2317 /* Calculate temporary vectorial force */
2318 tx = _mm256_mul_ps(fscal,dx32);
2319 ty = _mm256_mul_ps(fscal,dy32);
2320 tz = _mm256_mul_ps(fscal,dz32);
2322 /* Update vectorial force */
2323 fix3 = _mm256_add_ps(fix3,tx);
2324 fiy3 = _mm256_add_ps(fiy3,ty);
2325 fiz3 = _mm256_add_ps(fiz3,tz);
2327 fjx2 = _mm256_add_ps(fjx2,tx);
2328 fjy2 = _mm256_add_ps(fjy2,ty);
2329 fjz2 = _mm256_add_ps(fjz2,tz);
2331 /**************************
2332 * CALCULATE INTERACTIONS *
2333 **************************/
2335 r33 = _mm256_mul_ps(rsq33,rinv33);
2336 r33 = _mm256_andnot_ps(dummy_mask,r33);
2338 /* EWALD ELECTROSTATICS */
2340 /* Analytical PME correction */
2341 zeta2 = _mm256_mul_ps(beta2,rsq33);
2342 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2343 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2344 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2345 felec = _mm256_mul_ps(qq33,felec);
2349 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2351 /* Calculate temporary vectorial force */
2352 tx = _mm256_mul_ps(fscal,dx33);
2353 ty = _mm256_mul_ps(fscal,dy33);
2354 tz = _mm256_mul_ps(fscal,dz33);
2356 /* Update vectorial force */
2357 fix3 = _mm256_add_ps(fix3,tx);
2358 fiy3 = _mm256_add_ps(fiy3,ty);
2359 fiz3 = _mm256_add_ps(fiz3,tz);
2361 fjx3 = _mm256_add_ps(fjx3,tx);
2362 fjy3 = _mm256_add_ps(fjy3,ty);
2363 fjz3 = _mm256_add_ps(fjz3,tz);
2365 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2366 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2367 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2368 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2369 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2370 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2371 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2372 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2374 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2375 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2376 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2378 /* Inner loop uses 543 flops */
2381 /* End of innermost loop */
2383 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2384 f+i_coord_offset,fshift+i_shift_offset);
2386 /* Increment number of inner iterations */
2387 inneriter += j_index_end - j_index_start;
2389 /* Outer loop uses 24 flops */
2392 /* Increment number of outer iterations */
2395 /* Update outer/inner flops */
2397 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*543);