2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_single
52 * Electrostatics interaction: Coulomb
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrE,jnrF,jnrG,jnrH;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
85 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
93 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
114 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
115 __m256 dummy_mask,cutoff_mask;
116 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
117 __m256 one = _mm256_set1_ps(1.0);
118 __m256 two = _mm256_set1_ps(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm256_set1_ps(fr->epsfac);
131 charge = mdatoms->chargeA;
132 nvdwtype = fr->ntype;
134 vdwtype = mdatoms->typeA;
136 /* Setup water-specific parameters */
137 inr = nlist->iinr[0];
138 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
139 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
140 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
141 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
143 jq0 = _mm256_set1_ps(charge[inr+0]);
144 jq1 = _mm256_set1_ps(charge[inr+1]);
145 jq2 = _mm256_set1_ps(charge[inr+2]);
146 vdwjidx0A = 2*vdwtype[inr+0];
147 qq00 = _mm256_mul_ps(iq0,jq0);
148 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
149 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
150 qq01 = _mm256_mul_ps(iq0,jq1);
151 qq02 = _mm256_mul_ps(iq0,jq2);
152 qq10 = _mm256_mul_ps(iq1,jq0);
153 qq11 = _mm256_mul_ps(iq1,jq1);
154 qq12 = _mm256_mul_ps(iq1,jq2);
155 qq20 = _mm256_mul_ps(iq2,jq0);
156 qq21 = _mm256_mul_ps(iq2,jq1);
157 qq22 = _mm256_mul_ps(iq2,jq2);
159 /* Avoid stupid compiler warnings */
160 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
173 for(iidx=0;iidx<4*DIM;iidx++)
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196 fix0 = _mm256_setzero_ps();
197 fiy0 = _mm256_setzero_ps();
198 fiz0 = _mm256_setzero_ps();
199 fix1 = _mm256_setzero_ps();
200 fiy1 = _mm256_setzero_ps();
201 fiz1 = _mm256_setzero_ps();
202 fix2 = _mm256_setzero_ps();
203 fiy2 = _mm256_setzero_ps();
204 fiz2 = _mm256_setzero_ps();
206 /* Reset potential sums */
207 velecsum = _mm256_setzero_ps();
208 vvdwsum = _mm256_setzero_ps();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
214 /* Get j neighbor index, and coordinate index */
223 j_coord_offsetA = DIM*jnrA;
224 j_coord_offsetB = DIM*jnrB;
225 j_coord_offsetC = DIM*jnrC;
226 j_coord_offsetD = DIM*jnrD;
227 j_coord_offsetE = DIM*jnrE;
228 j_coord_offsetF = DIM*jnrF;
229 j_coord_offsetG = DIM*jnrG;
230 j_coord_offsetH = DIM*jnrH;
232 /* load j atom coordinates */
233 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
234 x+j_coord_offsetC,x+j_coord_offsetD,
235 x+j_coord_offsetE,x+j_coord_offsetF,
236 x+j_coord_offsetG,x+j_coord_offsetH,
237 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
239 /* Calculate displacement vector */
240 dx00 = _mm256_sub_ps(ix0,jx0);
241 dy00 = _mm256_sub_ps(iy0,jy0);
242 dz00 = _mm256_sub_ps(iz0,jz0);
243 dx01 = _mm256_sub_ps(ix0,jx1);
244 dy01 = _mm256_sub_ps(iy0,jy1);
245 dz01 = _mm256_sub_ps(iz0,jz1);
246 dx02 = _mm256_sub_ps(ix0,jx2);
247 dy02 = _mm256_sub_ps(iy0,jy2);
248 dz02 = _mm256_sub_ps(iz0,jz2);
249 dx10 = _mm256_sub_ps(ix1,jx0);
250 dy10 = _mm256_sub_ps(iy1,jy0);
251 dz10 = _mm256_sub_ps(iz1,jz0);
252 dx11 = _mm256_sub_ps(ix1,jx1);
253 dy11 = _mm256_sub_ps(iy1,jy1);
254 dz11 = _mm256_sub_ps(iz1,jz1);
255 dx12 = _mm256_sub_ps(ix1,jx2);
256 dy12 = _mm256_sub_ps(iy1,jy2);
257 dz12 = _mm256_sub_ps(iz1,jz2);
258 dx20 = _mm256_sub_ps(ix2,jx0);
259 dy20 = _mm256_sub_ps(iy2,jy0);
260 dz20 = _mm256_sub_ps(iz2,jz0);
261 dx21 = _mm256_sub_ps(ix2,jx1);
262 dy21 = _mm256_sub_ps(iy2,jy1);
263 dz21 = _mm256_sub_ps(iz2,jz1);
264 dx22 = _mm256_sub_ps(ix2,jx2);
265 dy22 = _mm256_sub_ps(iy2,jy2);
266 dz22 = _mm256_sub_ps(iz2,jz2);
268 /* Calculate squared distance and things based on it */
269 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
270 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
271 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
272 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
273 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
274 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
275 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
276 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
277 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
279 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
280 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
281 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
282 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
283 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
284 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
285 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
286 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
287 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
289 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
290 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
291 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
292 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
293 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
294 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
295 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
296 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
297 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
299 fjx0 = _mm256_setzero_ps();
300 fjy0 = _mm256_setzero_ps();
301 fjz0 = _mm256_setzero_ps();
302 fjx1 = _mm256_setzero_ps();
303 fjy1 = _mm256_setzero_ps();
304 fjz1 = _mm256_setzero_ps();
305 fjx2 = _mm256_setzero_ps();
306 fjy2 = _mm256_setzero_ps();
307 fjz2 = _mm256_setzero_ps();
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 /* COULOMB ELECTROSTATICS */
314 velec = _mm256_mul_ps(qq00,rinv00);
315 felec = _mm256_mul_ps(velec,rinvsq00);
317 /* LENNARD-JONES DISPERSION/REPULSION */
319 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
320 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
321 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
322 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
323 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
325 /* Update potential sum for this i atom from the interaction with this j atom. */
326 velecsum = _mm256_add_ps(velecsum,velec);
327 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
329 fscal = _mm256_add_ps(felec,fvdw);
331 /* Calculate temporary vectorial force */
332 tx = _mm256_mul_ps(fscal,dx00);
333 ty = _mm256_mul_ps(fscal,dy00);
334 tz = _mm256_mul_ps(fscal,dz00);
336 /* Update vectorial force */
337 fix0 = _mm256_add_ps(fix0,tx);
338 fiy0 = _mm256_add_ps(fiy0,ty);
339 fiz0 = _mm256_add_ps(fiz0,tz);
341 fjx0 = _mm256_add_ps(fjx0,tx);
342 fjy0 = _mm256_add_ps(fjy0,ty);
343 fjz0 = _mm256_add_ps(fjz0,tz);
345 /**************************
346 * CALCULATE INTERACTIONS *
347 **************************/
349 /* COULOMB ELECTROSTATICS */
350 velec = _mm256_mul_ps(qq01,rinv01);
351 felec = _mm256_mul_ps(velec,rinvsq01);
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velecsum = _mm256_add_ps(velecsum,velec);
358 /* Calculate temporary vectorial force */
359 tx = _mm256_mul_ps(fscal,dx01);
360 ty = _mm256_mul_ps(fscal,dy01);
361 tz = _mm256_mul_ps(fscal,dz01);
363 /* Update vectorial force */
364 fix0 = _mm256_add_ps(fix0,tx);
365 fiy0 = _mm256_add_ps(fiy0,ty);
366 fiz0 = _mm256_add_ps(fiz0,tz);
368 fjx1 = _mm256_add_ps(fjx1,tx);
369 fjy1 = _mm256_add_ps(fjy1,ty);
370 fjz1 = _mm256_add_ps(fjz1,tz);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 /* COULOMB ELECTROSTATICS */
377 velec = _mm256_mul_ps(qq02,rinv02);
378 felec = _mm256_mul_ps(velec,rinvsq02);
380 /* Update potential sum for this i atom from the interaction with this j atom. */
381 velecsum = _mm256_add_ps(velecsum,velec);
385 /* Calculate temporary vectorial force */
386 tx = _mm256_mul_ps(fscal,dx02);
387 ty = _mm256_mul_ps(fscal,dy02);
388 tz = _mm256_mul_ps(fscal,dz02);
390 /* Update vectorial force */
391 fix0 = _mm256_add_ps(fix0,tx);
392 fiy0 = _mm256_add_ps(fiy0,ty);
393 fiz0 = _mm256_add_ps(fiz0,tz);
395 fjx2 = _mm256_add_ps(fjx2,tx);
396 fjy2 = _mm256_add_ps(fjy2,ty);
397 fjz2 = _mm256_add_ps(fjz2,tz);
399 /**************************
400 * CALCULATE INTERACTIONS *
401 **************************/
403 /* COULOMB ELECTROSTATICS */
404 velec = _mm256_mul_ps(qq10,rinv10);
405 felec = _mm256_mul_ps(velec,rinvsq10);
407 /* Update potential sum for this i atom from the interaction with this j atom. */
408 velecsum = _mm256_add_ps(velecsum,velec);
412 /* Calculate temporary vectorial force */
413 tx = _mm256_mul_ps(fscal,dx10);
414 ty = _mm256_mul_ps(fscal,dy10);
415 tz = _mm256_mul_ps(fscal,dz10);
417 /* Update vectorial force */
418 fix1 = _mm256_add_ps(fix1,tx);
419 fiy1 = _mm256_add_ps(fiy1,ty);
420 fiz1 = _mm256_add_ps(fiz1,tz);
422 fjx0 = _mm256_add_ps(fjx0,tx);
423 fjy0 = _mm256_add_ps(fjy0,ty);
424 fjz0 = _mm256_add_ps(fjz0,tz);
426 /**************************
427 * CALCULATE INTERACTIONS *
428 **************************/
430 /* COULOMB ELECTROSTATICS */
431 velec = _mm256_mul_ps(qq11,rinv11);
432 felec = _mm256_mul_ps(velec,rinvsq11);
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velecsum = _mm256_add_ps(velecsum,velec);
439 /* Calculate temporary vectorial force */
440 tx = _mm256_mul_ps(fscal,dx11);
441 ty = _mm256_mul_ps(fscal,dy11);
442 tz = _mm256_mul_ps(fscal,dz11);
444 /* Update vectorial force */
445 fix1 = _mm256_add_ps(fix1,tx);
446 fiy1 = _mm256_add_ps(fiy1,ty);
447 fiz1 = _mm256_add_ps(fiz1,tz);
449 fjx1 = _mm256_add_ps(fjx1,tx);
450 fjy1 = _mm256_add_ps(fjy1,ty);
451 fjz1 = _mm256_add_ps(fjz1,tz);
453 /**************************
454 * CALCULATE INTERACTIONS *
455 **************************/
457 /* COULOMB ELECTROSTATICS */
458 velec = _mm256_mul_ps(qq12,rinv12);
459 felec = _mm256_mul_ps(velec,rinvsq12);
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velecsum = _mm256_add_ps(velecsum,velec);
466 /* Calculate temporary vectorial force */
467 tx = _mm256_mul_ps(fscal,dx12);
468 ty = _mm256_mul_ps(fscal,dy12);
469 tz = _mm256_mul_ps(fscal,dz12);
471 /* Update vectorial force */
472 fix1 = _mm256_add_ps(fix1,tx);
473 fiy1 = _mm256_add_ps(fiy1,ty);
474 fiz1 = _mm256_add_ps(fiz1,tz);
476 fjx2 = _mm256_add_ps(fjx2,tx);
477 fjy2 = _mm256_add_ps(fjy2,ty);
478 fjz2 = _mm256_add_ps(fjz2,tz);
480 /**************************
481 * CALCULATE INTERACTIONS *
482 **************************/
484 /* COULOMB ELECTROSTATICS */
485 velec = _mm256_mul_ps(qq20,rinv20);
486 felec = _mm256_mul_ps(velec,rinvsq20);
488 /* Update potential sum for this i atom from the interaction with this j atom. */
489 velecsum = _mm256_add_ps(velecsum,velec);
493 /* Calculate temporary vectorial force */
494 tx = _mm256_mul_ps(fscal,dx20);
495 ty = _mm256_mul_ps(fscal,dy20);
496 tz = _mm256_mul_ps(fscal,dz20);
498 /* Update vectorial force */
499 fix2 = _mm256_add_ps(fix2,tx);
500 fiy2 = _mm256_add_ps(fiy2,ty);
501 fiz2 = _mm256_add_ps(fiz2,tz);
503 fjx0 = _mm256_add_ps(fjx0,tx);
504 fjy0 = _mm256_add_ps(fjy0,ty);
505 fjz0 = _mm256_add_ps(fjz0,tz);
507 /**************************
508 * CALCULATE INTERACTIONS *
509 **************************/
511 /* COULOMB ELECTROSTATICS */
512 velec = _mm256_mul_ps(qq21,rinv21);
513 felec = _mm256_mul_ps(velec,rinvsq21);
515 /* Update potential sum for this i atom from the interaction with this j atom. */
516 velecsum = _mm256_add_ps(velecsum,velec);
520 /* Calculate temporary vectorial force */
521 tx = _mm256_mul_ps(fscal,dx21);
522 ty = _mm256_mul_ps(fscal,dy21);
523 tz = _mm256_mul_ps(fscal,dz21);
525 /* Update vectorial force */
526 fix2 = _mm256_add_ps(fix2,tx);
527 fiy2 = _mm256_add_ps(fiy2,ty);
528 fiz2 = _mm256_add_ps(fiz2,tz);
530 fjx1 = _mm256_add_ps(fjx1,tx);
531 fjy1 = _mm256_add_ps(fjy1,ty);
532 fjz1 = _mm256_add_ps(fjz1,tz);
534 /**************************
535 * CALCULATE INTERACTIONS *
536 **************************/
538 /* COULOMB ELECTROSTATICS */
539 velec = _mm256_mul_ps(qq22,rinv22);
540 felec = _mm256_mul_ps(velec,rinvsq22);
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velecsum = _mm256_add_ps(velecsum,velec);
547 /* Calculate temporary vectorial force */
548 tx = _mm256_mul_ps(fscal,dx22);
549 ty = _mm256_mul_ps(fscal,dy22);
550 tz = _mm256_mul_ps(fscal,dz22);
552 /* Update vectorial force */
553 fix2 = _mm256_add_ps(fix2,tx);
554 fiy2 = _mm256_add_ps(fiy2,ty);
555 fiz2 = _mm256_add_ps(fiz2,tz);
557 fjx2 = _mm256_add_ps(fjx2,tx);
558 fjy2 = _mm256_add_ps(fjy2,ty);
559 fjz2 = _mm256_add_ps(fjz2,tz);
561 fjptrA = f+j_coord_offsetA;
562 fjptrB = f+j_coord_offsetB;
563 fjptrC = f+j_coord_offsetC;
564 fjptrD = f+j_coord_offsetD;
565 fjptrE = f+j_coord_offsetE;
566 fjptrF = f+j_coord_offsetF;
567 fjptrG = f+j_coord_offsetG;
568 fjptrH = f+j_coord_offsetH;
570 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
571 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
573 /* Inner loop uses 255 flops */
579 /* Get j neighbor index, and coordinate index */
580 jnrlistA = jjnr[jidx];
581 jnrlistB = jjnr[jidx+1];
582 jnrlistC = jjnr[jidx+2];
583 jnrlistD = jjnr[jidx+3];
584 jnrlistE = jjnr[jidx+4];
585 jnrlistF = jjnr[jidx+5];
586 jnrlistG = jjnr[jidx+6];
587 jnrlistH = jjnr[jidx+7];
588 /* Sign of each element will be negative for non-real atoms.
589 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
590 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
592 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
593 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
595 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
596 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
597 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
598 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
599 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
600 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
601 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
602 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
603 j_coord_offsetA = DIM*jnrA;
604 j_coord_offsetB = DIM*jnrB;
605 j_coord_offsetC = DIM*jnrC;
606 j_coord_offsetD = DIM*jnrD;
607 j_coord_offsetE = DIM*jnrE;
608 j_coord_offsetF = DIM*jnrF;
609 j_coord_offsetG = DIM*jnrG;
610 j_coord_offsetH = DIM*jnrH;
612 /* load j atom coordinates */
613 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
614 x+j_coord_offsetC,x+j_coord_offsetD,
615 x+j_coord_offsetE,x+j_coord_offsetF,
616 x+j_coord_offsetG,x+j_coord_offsetH,
617 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
619 /* Calculate displacement vector */
620 dx00 = _mm256_sub_ps(ix0,jx0);
621 dy00 = _mm256_sub_ps(iy0,jy0);
622 dz00 = _mm256_sub_ps(iz0,jz0);
623 dx01 = _mm256_sub_ps(ix0,jx1);
624 dy01 = _mm256_sub_ps(iy0,jy1);
625 dz01 = _mm256_sub_ps(iz0,jz1);
626 dx02 = _mm256_sub_ps(ix0,jx2);
627 dy02 = _mm256_sub_ps(iy0,jy2);
628 dz02 = _mm256_sub_ps(iz0,jz2);
629 dx10 = _mm256_sub_ps(ix1,jx0);
630 dy10 = _mm256_sub_ps(iy1,jy0);
631 dz10 = _mm256_sub_ps(iz1,jz0);
632 dx11 = _mm256_sub_ps(ix1,jx1);
633 dy11 = _mm256_sub_ps(iy1,jy1);
634 dz11 = _mm256_sub_ps(iz1,jz1);
635 dx12 = _mm256_sub_ps(ix1,jx2);
636 dy12 = _mm256_sub_ps(iy1,jy2);
637 dz12 = _mm256_sub_ps(iz1,jz2);
638 dx20 = _mm256_sub_ps(ix2,jx0);
639 dy20 = _mm256_sub_ps(iy2,jy0);
640 dz20 = _mm256_sub_ps(iz2,jz0);
641 dx21 = _mm256_sub_ps(ix2,jx1);
642 dy21 = _mm256_sub_ps(iy2,jy1);
643 dz21 = _mm256_sub_ps(iz2,jz1);
644 dx22 = _mm256_sub_ps(ix2,jx2);
645 dy22 = _mm256_sub_ps(iy2,jy2);
646 dz22 = _mm256_sub_ps(iz2,jz2);
648 /* Calculate squared distance and things based on it */
649 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
650 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
651 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
652 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
653 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
654 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
655 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
656 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
657 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
659 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
660 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
661 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
662 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
663 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
664 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
665 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
666 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
667 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
669 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
670 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
671 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
672 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
673 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
674 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
675 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
676 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
677 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
679 fjx0 = _mm256_setzero_ps();
680 fjy0 = _mm256_setzero_ps();
681 fjz0 = _mm256_setzero_ps();
682 fjx1 = _mm256_setzero_ps();
683 fjy1 = _mm256_setzero_ps();
684 fjz1 = _mm256_setzero_ps();
685 fjx2 = _mm256_setzero_ps();
686 fjy2 = _mm256_setzero_ps();
687 fjz2 = _mm256_setzero_ps();
689 /**************************
690 * CALCULATE INTERACTIONS *
691 **************************/
693 /* COULOMB ELECTROSTATICS */
694 velec = _mm256_mul_ps(qq00,rinv00);
695 felec = _mm256_mul_ps(velec,rinvsq00);
697 /* LENNARD-JONES DISPERSION/REPULSION */
699 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
700 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
701 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
702 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
703 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
705 /* Update potential sum for this i atom from the interaction with this j atom. */
706 velec = _mm256_andnot_ps(dummy_mask,velec);
707 velecsum = _mm256_add_ps(velecsum,velec);
708 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
709 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
711 fscal = _mm256_add_ps(felec,fvdw);
713 fscal = _mm256_andnot_ps(dummy_mask,fscal);
715 /* Calculate temporary vectorial force */
716 tx = _mm256_mul_ps(fscal,dx00);
717 ty = _mm256_mul_ps(fscal,dy00);
718 tz = _mm256_mul_ps(fscal,dz00);
720 /* Update vectorial force */
721 fix0 = _mm256_add_ps(fix0,tx);
722 fiy0 = _mm256_add_ps(fiy0,ty);
723 fiz0 = _mm256_add_ps(fiz0,tz);
725 fjx0 = _mm256_add_ps(fjx0,tx);
726 fjy0 = _mm256_add_ps(fjy0,ty);
727 fjz0 = _mm256_add_ps(fjz0,tz);
729 /**************************
730 * CALCULATE INTERACTIONS *
731 **************************/
733 /* COULOMB ELECTROSTATICS */
734 velec = _mm256_mul_ps(qq01,rinv01);
735 felec = _mm256_mul_ps(velec,rinvsq01);
737 /* Update potential sum for this i atom from the interaction with this j atom. */
738 velec = _mm256_andnot_ps(dummy_mask,velec);
739 velecsum = _mm256_add_ps(velecsum,velec);
743 fscal = _mm256_andnot_ps(dummy_mask,fscal);
745 /* Calculate temporary vectorial force */
746 tx = _mm256_mul_ps(fscal,dx01);
747 ty = _mm256_mul_ps(fscal,dy01);
748 tz = _mm256_mul_ps(fscal,dz01);
750 /* Update vectorial force */
751 fix0 = _mm256_add_ps(fix0,tx);
752 fiy0 = _mm256_add_ps(fiy0,ty);
753 fiz0 = _mm256_add_ps(fiz0,tz);
755 fjx1 = _mm256_add_ps(fjx1,tx);
756 fjy1 = _mm256_add_ps(fjy1,ty);
757 fjz1 = _mm256_add_ps(fjz1,tz);
759 /**************************
760 * CALCULATE INTERACTIONS *
761 **************************/
763 /* COULOMB ELECTROSTATICS */
764 velec = _mm256_mul_ps(qq02,rinv02);
765 felec = _mm256_mul_ps(velec,rinvsq02);
767 /* Update potential sum for this i atom from the interaction with this j atom. */
768 velec = _mm256_andnot_ps(dummy_mask,velec);
769 velecsum = _mm256_add_ps(velecsum,velec);
773 fscal = _mm256_andnot_ps(dummy_mask,fscal);
775 /* Calculate temporary vectorial force */
776 tx = _mm256_mul_ps(fscal,dx02);
777 ty = _mm256_mul_ps(fscal,dy02);
778 tz = _mm256_mul_ps(fscal,dz02);
780 /* Update vectorial force */
781 fix0 = _mm256_add_ps(fix0,tx);
782 fiy0 = _mm256_add_ps(fiy0,ty);
783 fiz0 = _mm256_add_ps(fiz0,tz);
785 fjx2 = _mm256_add_ps(fjx2,tx);
786 fjy2 = _mm256_add_ps(fjy2,ty);
787 fjz2 = _mm256_add_ps(fjz2,tz);
789 /**************************
790 * CALCULATE INTERACTIONS *
791 **************************/
793 /* COULOMB ELECTROSTATICS */
794 velec = _mm256_mul_ps(qq10,rinv10);
795 felec = _mm256_mul_ps(velec,rinvsq10);
797 /* Update potential sum for this i atom from the interaction with this j atom. */
798 velec = _mm256_andnot_ps(dummy_mask,velec);
799 velecsum = _mm256_add_ps(velecsum,velec);
803 fscal = _mm256_andnot_ps(dummy_mask,fscal);
805 /* Calculate temporary vectorial force */
806 tx = _mm256_mul_ps(fscal,dx10);
807 ty = _mm256_mul_ps(fscal,dy10);
808 tz = _mm256_mul_ps(fscal,dz10);
810 /* Update vectorial force */
811 fix1 = _mm256_add_ps(fix1,tx);
812 fiy1 = _mm256_add_ps(fiy1,ty);
813 fiz1 = _mm256_add_ps(fiz1,tz);
815 fjx0 = _mm256_add_ps(fjx0,tx);
816 fjy0 = _mm256_add_ps(fjy0,ty);
817 fjz0 = _mm256_add_ps(fjz0,tz);
819 /**************************
820 * CALCULATE INTERACTIONS *
821 **************************/
823 /* COULOMB ELECTROSTATICS */
824 velec = _mm256_mul_ps(qq11,rinv11);
825 felec = _mm256_mul_ps(velec,rinvsq11);
827 /* Update potential sum for this i atom from the interaction with this j atom. */
828 velec = _mm256_andnot_ps(dummy_mask,velec);
829 velecsum = _mm256_add_ps(velecsum,velec);
833 fscal = _mm256_andnot_ps(dummy_mask,fscal);
835 /* Calculate temporary vectorial force */
836 tx = _mm256_mul_ps(fscal,dx11);
837 ty = _mm256_mul_ps(fscal,dy11);
838 tz = _mm256_mul_ps(fscal,dz11);
840 /* Update vectorial force */
841 fix1 = _mm256_add_ps(fix1,tx);
842 fiy1 = _mm256_add_ps(fiy1,ty);
843 fiz1 = _mm256_add_ps(fiz1,tz);
845 fjx1 = _mm256_add_ps(fjx1,tx);
846 fjy1 = _mm256_add_ps(fjy1,ty);
847 fjz1 = _mm256_add_ps(fjz1,tz);
849 /**************************
850 * CALCULATE INTERACTIONS *
851 **************************/
853 /* COULOMB ELECTROSTATICS */
854 velec = _mm256_mul_ps(qq12,rinv12);
855 felec = _mm256_mul_ps(velec,rinvsq12);
857 /* Update potential sum for this i atom from the interaction with this j atom. */
858 velec = _mm256_andnot_ps(dummy_mask,velec);
859 velecsum = _mm256_add_ps(velecsum,velec);
863 fscal = _mm256_andnot_ps(dummy_mask,fscal);
865 /* Calculate temporary vectorial force */
866 tx = _mm256_mul_ps(fscal,dx12);
867 ty = _mm256_mul_ps(fscal,dy12);
868 tz = _mm256_mul_ps(fscal,dz12);
870 /* Update vectorial force */
871 fix1 = _mm256_add_ps(fix1,tx);
872 fiy1 = _mm256_add_ps(fiy1,ty);
873 fiz1 = _mm256_add_ps(fiz1,tz);
875 fjx2 = _mm256_add_ps(fjx2,tx);
876 fjy2 = _mm256_add_ps(fjy2,ty);
877 fjz2 = _mm256_add_ps(fjz2,tz);
879 /**************************
880 * CALCULATE INTERACTIONS *
881 **************************/
883 /* COULOMB ELECTROSTATICS */
884 velec = _mm256_mul_ps(qq20,rinv20);
885 felec = _mm256_mul_ps(velec,rinvsq20);
887 /* Update potential sum for this i atom from the interaction with this j atom. */
888 velec = _mm256_andnot_ps(dummy_mask,velec);
889 velecsum = _mm256_add_ps(velecsum,velec);
893 fscal = _mm256_andnot_ps(dummy_mask,fscal);
895 /* Calculate temporary vectorial force */
896 tx = _mm256_mul_ps(fscal,dx20);
897 ty = _mm256_mul_ps(fscal,dy20);
898 tz = _mm256_mul_ps(fscal,dz20);
900 /* Update vectorial force */
901 fix2 = _mm256_add_ps(fix2,tx);
902 fiy2 = _mm256_add_ps(fiy2,ty);
903 fiz2 = _mm256_add_ps(fiz2,tz);
905 fjx0 = _mm256_add_ps(fjx0,tx);
906 fjy0 = _mm256_add_ps(fjy0,ty);
907 fjz0 = _mm256_add_ps(fjz0,tz);
909 /**************************
910 * CALCULATE INTERACTIONS *
911 **************************/
913 /* COULOMB ELECTROSTATICS */
914 velec = _mm256_mul_ps(qq21,rinv21);
915 felec = _mm256_mul_ps(velec,rinvsq21);
917 /* Update potential sum for this i atom from the interaction with this j atom. */
918 velec = _mm256_andnot_ps(dummy_mask,velec);
919 velecsum = _mm256_add_ps(velecsum,velec);
923 fscal = _mm256_andnot_ps(dummy_mask,fscal);
925 /* Calculate temporary vectorial force */
926 tx = _mm256_mul_ps(fscal,dx21);
927 ty = _mm256_mul_ps(fscal,dy21);
928 tz = _mm256_mul_ps(fscal,dz21);
930 /* Update vectorial force */
931 fix2 = _mm256_add_ps(fix2,tx);
932 fiy2 = _mm256_add_ps(fiy2,ty);
933 fiz2 = _mm256_add_ps(fiz2,tz);
935 fjx1 = _mm256_add_ps(fjx1,tx);
936 fjy1 = _mm256_add_ps(fjy1,ty);
937 fjz1 = _mm256_add_ps(fjz1,tz);
939 /**************************
940 * CALCULATE INTERACTIONS *
941 **************************/
943 /* COULOMB ELECTROSTATICS */
944 velec = _mm256_mul_ps(qq22,rinv22);
945 felec = _mm256_mul_ps(velec,rinvsq22);
947 /* Update potential sum for this i atom from the interaction with this j atom. */
948 velec = _mm256_andnot_ps(dummy_mask,velec);
949 velecsum = _mm256_add_ps(velecsum,velec);
953 fscal = _mm256_andnot_ps(dummy_mask,fscal);
955 /* Calculate temporary vectorial force */
956 tx = _mm256_mul_ps(fscal,dx22);
957 ty = _mm256_mul_ps(fscal,dy22);
958 tz = _mm256_mul_ps(fscal,dz22);
960 /* Update vectorial force */
961 fix2 = _mm256_add_ps(fix2,tx);
962 fiy2 = _mm256_add_ps(fiy2,ty);
963 fiz2 = _mm256_add_ps(fiz2,tz);
965 fjx2 = _mm256_add_ps(fjx2,tx);
966 fjy2 = _mm256_add_ps(fjy2,ty);
967 fjz2 = _mm256_add_ps(fjz2,tz);
969 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
970 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
971 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
972 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
973 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
974 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
975 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
976 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
978 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
979 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
981 /* Inner loop uses 255 flops */
984 /* End of innermost loop */
986 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
987 f+i_coord_offset,fshift+i_shift_offset);
990 /* Update potential energies */
991 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
992 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
994 /* Increment number of inner iterations */
995 inneriter += j_index_end - j_index_start;
997 /* Outer loop uses 20 flops */
1000 /* Increment number of outer iterations */
1003 /* Update outer/inner flops */
1005 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*255);
1008 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_single
1009 * Electrostatics interaction: Coulomb
1010 * VdW interaction: LennardJones
1011 * Geometry: Water3-Water3
1012 * Calculate force/pot: Force
1015 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_single
1016 (t_nblist * gmx_restrict nlist,
1017 rvec * gmx_restrict xx,
1018 rvec * gmx_restrict ff,
1019 t_forcerec * gmx_restrict fr,
1020 t_mdatoms * gmx_restrict mdatoms,
1021 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1022 t_nrnb * gmx_restrict nrnb)
1024 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1025 * just 0 for non-waters.
1026 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1027 * jnr indices corresponding to data put in the four positions in the SIMD register.
1029 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1030 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1031 int jnrA,jnrB,jnrC,jnrD;
1032 int jnrE,jnrF,jnrG,jnrH;
1033 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1034 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1035 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1036 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1037 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1038 real rcutoff_scalar;
1039 real *shiftvec,*fshift,*x,*f;
1040 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1041 real scratch[4*DIM];
1042 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1043 real * vdwioffsetptr0;
1044 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1045 real * vdwioffsetptr1;
1046 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1047 real * vdwioffsetptr2;
1048 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1049 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1050 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1051 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1052 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1053 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1054 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1055 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1056 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1057 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1058 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1059 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1060 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1061 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1062 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1063 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1064 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1067 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1070 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1071 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1072 __m256 dummy_mask,cutoff_mask;
1073 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1074 __m256 one = _mm256_set1_ps(1.0);
1075 __m256 two = _mm256_set1_ps(2.0);
1081 jindex = nlist->jindex;
1083 shiftidx = nlist->shift;
1085 shiftvec = fr->shift_vec[0];
1086 fshift = fr->fshift[0];
1087 facel = _mm256_set1_ps(fr->epsfac);
1088 charge = mdatoms->chargeA;
1089 nvdwtype = fr->ntype;
1090 vdwparam = fr->nbfp;
1091 vdwtype = mdatoms->typeA;
1093 /* Setup water-specific parameters */
1094 inr = nlist->iinr[0];
1095 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1096 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1097 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1098 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1100 jq0 = _mm256_set1_ps(charge[inr+0]);
1101 jq1 = _mm256_set1_ps(charge[inr+1]);
1102 jq2 = _mm256_set1_ps(charge[inr+2]);
1103 vdwjidx0A = 2*vdwtype[inr+0];
1104 qq00 = _mm256_mul_ps(iq0,jq0);
1105 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1106 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1107 qq01 = _mm256_mul_ps(iq0,jq1);
1108 qq02 = _mm256_mul_ps(iq0,jq2);
1109 qq10 = _mm256_mul_ps(iq1,jq0);
1110 qq11 = _mm256_mul_ps(iq1,jq1);
1111 qq12 = _mm256_mul_ps(iq1,jq2);
1112 qq20 = _mm256_mul_ps(iq2,jq0);
1113 qq21 = _mm256_mul_ps(iq2,jq1);
1114 qq22 = _mm256_mul_ps(iq2,jq2);
1116 /* Avoid stupid compiler warnings */
1117 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1118 j_coord_offsetA = 0;
1119 j_coord_offsetB = 0;
1120 j_coord_offsetC = 0;
1121 j_coord_offsetD = 0;
1122 j_coord_offsetE = 0;
1123 j_coord_offsetF = 0;
1124 j_coord_offsetG = 0;
1125 j_coord_offsetH = 0;
1130 for(iidx=0;iidx<4*DIM;iidx++)
1132 scratch[iidx] = 0.0;
1135 /* Start outer loop over neighborlists */
1136 for(iidx=0; iidx<nri; iidx++)
1138 /* Load shift vector for this list */
1139 i_shift_offset = DIM*shiftidx[iidx];
1141 /* Load limits for loop over neighbors */
1142 j_index_start = jindex[iidx];
1143 j_index_end = jindex[iidx+1];
1145 /* Get outer coordinate index */
1147 i_coord_offset = DIM*inr;
1149 /* Load i particle coords and add shift vector */
1150 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1151 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1153 fix0 = _mm256_setzero_ps();
1154 fiy0 = _mm256_setzero_ps();
1155 fiz0 = _mm256_setzero_ps();
1156 fix1 = _mm256_setzero_ps();
1157 fiy1 = _mm256_setzero_ps();
1158 fiz1 = _mm256_setzero_ps();
1159 fix2 = _mm256_setzero_ps();
1160 fiy2 = _mm256_setzero_ps();
1161 fiz2 = _mm256_setzero_ps();
1163 /* Start inner kernel loop */
1164 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1167 /* Get j neighbor index, and coordinate index */
1169 jnrB = jjnr[jidx+1];
1170 jnrC = jjnr[jidx+2];
1171 jnrD = jjnr[jidx+3];
1172 jnrE = jjnr[jidx+4];
1173 jnrF = jjnr[jidx+5];
1174 jnrG = jjnr[jidx+6];
1175 jnrH = jjnr[jidx+7];
1176 j_coord_offsetA = DIM*jnrA;
1177 j_coord_offsetB = DIM*jnrB;
1178 j_coord_offsetC = DIM*jnrC;
1179 j_coord_offsetD = DIM*jnrD;
1180 j_coord_offsetE = DIM*jnrE;
1181 j_coord_offsetF = DIM*jnrF;
1182 j_coord_offsetG = DIM*jnrG;
1183 j_coord_offsetH = DIM*jnrH;
1185 /* load j atom coordinates */
1186 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1187 x+j_coord_offsetC,x+j_coord_offsetD,
1188 x+j_coord_offsetE,x+j_coord_offsetF,
1189 x+j_coord_offsetG,x+j_coord_offsetH,
1190 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1192 /* Calculate displacement vector */
1193 dx00 = _mm256_sub_ps(ix0,jx0);
1194 dy00 = _mm256_sub_ps(iy0,jy0);
1195 dz00 = _mm256_sub_ps(iz0,jz0);
1196 dx01 = _mm256_sub_ps(ix0,jx1);
1197 dy01 = _mm256_sub_ps(iy0,jy1);
1198 dz01 = _mm256_sub_ps(iz0,jz1);
1199 dx02 = _mm256_sub_ps(ix0,jx2);
1200 dy02 = _mm256_sub_ps(iy0,jy2);
1201 dz02 = _mm256_sub_ps(iz0,jz2);
1202 dx10 = _mm256_sub_ps(ix1,jx0);
1203 dy10 = _mm256_sub_ps(iy1,jy0);
1204 dz10 = _mm256_sub_ps(iz1,jz0);
1205 dx11 = _mm256_sub_ps(ix1,jx1);
1206 dy11 = _mm256_sub_ps(iy1,jy1);
1207 dz11 = _mm256_sub_ps(iz1,jz1);
1208 dx12 = _mm256_sub_ps(ix1,jx2);
1209 dy12 = _mm256_sub_ps(iy1,jy2);
1210 dz12 = _mm256_sub_ps(iz1,jz2);
1211 dx20 = _mm256_sub_ps(ix2,jx0);
1212 dy20 = _mm256_sub_ps(iy2,jy0);
1213 dz20 = _mm256_sub_ps(iz2,jz0);
1214 dx21 = _mm256_sub_ps(ix2,jx1);
1215 dy21 = _mm256_sub_ps(iy2,jy1);
1216 dz21 = _mm256_sub_ps(iz2,jz1);
1217 dx22 = _mm256_sub_ps(ix2,jx2);
1218 dy22 = _mm256_sub_ps(iy2,jy2);
1219 dz22 = _mm256_sub_ps(iz2,jz2);
1221 /* Calculate squared distance and things based on it */
1222 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1223 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1224 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1225 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1226 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1227 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1228 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1229 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1230 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1232 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1233 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1234 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1235 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1236 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1237 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1238 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1239 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1240 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1242 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1243 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1244 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1245 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1246 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1247 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1248 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1249 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1250 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1252 fjx0 = _mm256_setzero_ps();
1253 fjy0 = _mm256_setzero_ps();
1254 fjz0 = _mm256_setzero_ps();
1255 fjx1 = _mm256_setzero_ps();
1256 fjy1 = _mm256_setzero_ps();
1257 fjz1 = _mm256_setzero_ps();
1258 fjx2 = _mm256_setzero_ps();
1259 fjy2 = _mm256_setzero_ps();
1260 fjz2 = _mm256_setzero_ps();
1262 /**************************
1263 * CALCULATE INTERACTIONS *
1264 **************************/
1266 /* COULOMB ELECTROSTATICS */
1267 velec = _mm256_mul_ps(qq00,rinv00);
1268 felec = _mm256_mul_ps(velec,rinvsq00);
1270 /* LENNARD-JONES DISPERSION/REPULSION */
1272 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1273 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1275 fscal = _mm256_add_ps(felec,fvdw);
1277 /* Calculate temporary vectorial force */
1278 tx = _mm256_mul_ps(fscal,dx00);
1279 ty = _mm256_mul_ps(fscal,dy00);
1280 tz = _mm256_mul_ps(fscal,dz00);
1282 /* Update vectorial force */
1283 fix0 = _mm256_add_ps(fix0,tx);
1284 fiy0 = _mm256_add_ps(fiy0,ty);
1285 fiz0 = _mm256_add_ps(fiz0,tz);
1287 fjx0 = _mm256_add_ps(fjx0,tx);
1288 fjy0 = _mm256_add_ps(fjy0,ty);
1289 fjz0 = _mm256_add_ps(fjz0,tz);
1291 /**************************
1292 * CALCULATE INTERACTIONS *
1293 **************************/
1295 /* COULOMB ELECTROSTATICS */
1296 velec = _mm256_mul_ps(qq01,rinv01);
1297 felec = _mm256_mul_ps(velec,rinvsq01);
1301 /* Calculate temporary vectorial force */
1302 tx = _mm256_mul_ps(fscal,dx01);
1303 ty = _mm256_mul_ps(fscal,dy01);
1304 tz = _mm256_mul_ps(fscal,dz01);
1306 /* Update vectorial force */
1307 fix0 = _mm256_add_ps(fix0,tx);
1308 fiy0 = _mm256_add_ps(fiy0,ty);
1309 fiz0 = _mm256_add_ps(fiz0,tz);
1311 fjx1 = _mm256_add_ps(fjx1,tx);
1312 fjy1 = _mm256_add_ps(fjy1,ty);
1313 fjz1 = _mm256_add_ps(fjz1,tz);
1315 /**************************
1316 * CALCULATE INTERACTIONS *
1317 **************************/
1319 /* COULOMB ELECTROSTATICS */
1320 velec = _mm256_mul_ps(qq02,rinv02);
1321 felec = _mm256_mul_ps(velec,rinvsq02);
1325 /* Calculate temporary vectorial force */
1326 tx = _mm256_mul_ps(fscal,dx02);
1327 ty = _mm256_mul_ps(fscal,dy02);
1328 tz = _mm256_mul_ps(fscal,dz02);
1330 /* Update vectorial force */
1331 fix0 = _mm256_add_ps(fix0,tx);
1332 fiy0 = _mm256_add_ps(fiy0,ty);
1333 fiz0 = _mm256_add_ps(fiz0,tz);
1335 fjx2 = _mm256_add_ps(fjx2,tx);
1336 fjy2 = _mm256_add_ps(fjy2,ty);
1337 fjz2 = _mm256_add_ps(fjz2,tz);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 /* COULOMB ELECTROSTATICS */
1344 velec = _mm256_mul_ps(qq10,rinv10);
1345 felec = _mm256_mul_ps(velec,rinvsq10);
1349 /* Calculate temporary vectorial force */
1350 tx = _mm256_mul_ps(fscal,dx10);
1351 ty = _mm256_mul_ps(fscal,dy10);
1352 tz = _mm256_mul_ps(fscal,dz10);
1354 /* Update vectorial force */
1355 fix1 = _mm256_add_ps(fix1,tx);
1356 fiy1 = _mm256_add_ps(fiy1,ty);
1357 fiz1 = _mm256_add_ps(fiz1,tz);
1359 fjx0 = _mm256_add_ps(fjx0,tx);
1360 fjy0 = _mm256_add_ps(fjy0,ty);
1361 fjz0 = _mm256_add_ps(fjz0,tz);
1363 /**************************
1364 * CALCULATE INTERACTIONS *
1365 **************************/
1367 /* COULOMB ELECTROSTATICS */
1368 velec = _mm256_mul_ps(qq11,rinv11);
1369 felec = _mm256_mul_ps(velec,rinvsq11);
1373 /* Calculate temporary vectorial force */
1374 tx = _mm256_mul_ps(fscal,dx11);
1375 ty = _mm256_mul_ps(fscal,dy11);
1376 tz = _mm256_mul_ps(fscal,dz11);
1378 /* Update vectorial force */
1379 fix1 = _mm256_add_ps(fix1,tx);
1380 fiy1 = _mm256_add_ps(fiy1,ty);
1381 fiz1 = _mm256_add_ps(fiz1,tz);
1383 fjx1 = _mm256_add_ps(fjx1,tx);
1384 fjy1 = _mm256_add_ps(fjy1,ty);
1385 fjz1 = _mm256_add_ps(fjz1,tz);
1387 /**************************
1388 * CALCULATE INTERACTIONS *
1389 **************************/
1391 /* COULOMB ELECTROSTATICS */
1392 velec = _mm256_mul_ps(qq12,rinv12);
1393 felec = _mm256_mul_ps(velec,rinvsq12);
1397 /* Calculate temporary vectorial force */
1398 tx = _mm256_mul_ps(fscal,dx12);
1399 ty = _mm256_mul_ps(fscal,dy12);
1400 tz = _mm256_mul_ps(fscal,dz12);
1402 /* Update vectorial force */
1403 fix1 = _mm256_add_ps(fix1,tx);
1404 fiy1 = _mm256_add_ps(fiy1,ty);
1405 fiz1 = _mm256_add_ps(fiz1,tz);
1407 fjx2 = _mm256_add_ps(fjx2,tx);
1408 fjy2 = _mm256_add_ps(fjy2,ty);
1409 fjz2 = _mm256_add_ps(fjz2,tz);
1411 /**************************
1412 * CALCULATE INTERACTIONS *
1413 **************************/
1415 /* COULOMB ELECTROSTATICS */
1416 velec = _mm256_mul_ps(qq20,rinv20);
1417 felec = _mm256_mul_ps(velec,rinvsq20);
1421 /* Calculate temporary vectorial force */
1422 tx = _mm256_mul_ps(fscal,dx20);
1423 ty = _mm256_mul_ps(fscal,dy20);
1424 tz = _mm256_mul_ps(fscal,dz20);
1426 /* Update vectorial force */
1427 fix2 = _mm256_add_ps(fix2,tx);
1428 fiy2 = _mm256_add_ps(fiy2,ty);
1429 fiz2 = _mm256_add_ps(fiz2,tz);
1431 fjx0 = _mm256_add_ps(fjx0,tx);
1432 fjy0 = _mm256_add_ps(fjy0,ty);
1433 fjz0 = _mm256_add_ps(fjz0,tz);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 /* COULOMB ELECTROSTATICS */
1440 velec = _mm256_mul_ps(qq21,rinv21);
1441 felec = _mm256_mul_ps(velec,rinvsq21);
1445 /* Calculate temporary vectorial force */
1446 tx = _mm256_mul_ps(fscal,dx21);
1447 ty = _mm256_mul_ps(fscal,dy21);
1448 tz = _mm256_mul_ps(fscal,dz21);
1450 /* Update vectorial force */
1451 fix2 = _mm256_add_ps(fix2,tx);
1452 fiy2 = _mm256_add_ps(fiy2,ty);
1453 fiz2 = _mm256_add_ps(fiz2,tz);
1455 fjx1 = _mm256_add_ps(fjx1,tx);
1456 fjy1 = _mm256_add_ps(fjy1,ty);
1457 fjz1 = _mm256_add_ps(fjz1,tz);
1459 /**************************
1460 * CALCULATE INTERACTIONS *
1461 **************************/
1463 /* COULOMB ELECTROSTATICS */
1464 velec = _mm256_mul_ps(qq22,rinv22);
1465 felec = _mm256_mul_ps(velec,rinvsq22);
1469 /* Calculate temporary vectorial force */
1470 tx = _mm256_mul_ps(fscal,dx22);
1471 ty = _mm256_mul_ps(fscal,dy22);
1472 tz = _mm256_mul_ps(fscal,dz22);
1474 /* Update vectorial force */
1475 fix2 = _mm256_add_ps(fix2,tx);
1476 fiy2 = _mm256_add_ps(fiy2,ty);
1477 fiz2 = _mm256_add_ps(fiz2,tz);
1479 fjx2 = _mm256_add_ps(fjx2,tx);
1480 fjy2 = _mm256_add_ps(fjy2,ty);
1481 fjz2 = _mm256_add_ps(fjz2,tz);
1483 fjptrA = f+j_coord_offsetA;
1484 fjptrB = f+j_coord_offsetB;
1485 fjptrC = f+j_coord_offsetC;
1486 fjptrD = f+j_coord_offsetD;
1487 fjptrE = f+j_coord_offsetE;
1488 fjptrF = f+j_coord_offsetF;
1489 fjptrG = f+j_coord_offsetG;
1490 fjptrH = f+j_coord_offsetH;
1492 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1493 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1495 /* Inner loop uses 241 flops */
1498 if(jidx<j_index_end)
1501 /* Get j neighbor index, and coordinate index */
1502 jnrlistA = jjnr[jidx];
1503 jnrlistB = jjnr[jidx+1];
1504 jnrlistC = jjnr[jidx+2];
1505 jnrlistD = jjnr[jidx+3];
1506 jnrlistE = jjnr[jidx+4];
1507 jnrlistF = jjnr[jidx+5];
1508 jnrlistG = jjnr[jidx+6];
1509 jnrlistH = jjnr[jidx+7];
1510 /* Sign of each element will be negative for non-real atoms.
1511 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1512 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1514 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1515 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1517 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1518 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1519 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1520 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1521 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1522 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1523 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1524 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1525 j_coord_offsetA = DIM*jnrA;
1526 j_coord_offsetB = DIM*jnrB;
1527 j_coord_offsetC = DIM*jnrC;
1528 j_coord_offsetD = DIM*jnrD;
1529 j_coord_offsetE = DIM*jnrE;
1530 j_coord_offsetF = DIM*jnrF;
1531 j_coord_offsetG = DIM*jnrG;
1532 j_coord_offsetH = DIM*jnrH;
1534 /* load j atom coordinates */
1535 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1536 x+j_coord_offsetC,x+j_coord_offsetD,
1537 x+j_coord_offsetE,x+j_coord_offsetF,
1538 x+j_coord_offsetG,x+j_coord_offsetH,
1539 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1541 /* Calculate displacement vector */
1542 dx00 = _mm256_sub_ps(ix0,jx0);
1543 dy00 = _mm256_sub_ps(iy0,jy0);
1544 dz00 = _mm256_sub_ps(iz0,jz0);
1545 dx01 = _mm256_sub_ps(ix0,jx1);
1546 dy01 = _mm256_sub_ps(iy0,jy1);
1547 dz01 = _mm256_sub_ps(iz0,jz1);
1548 dx02 = _mm256_sub_ps(ix0,jx2);
1549 dy02 = _mm256_sub_ps(iy0,jy2);
1550 dz02 = _mm256_sub_ps(iz0,jz2);
1551 dx10 = _mm256_sub_ps(ix1,jx0);
1552 dy10 = _mm256_sub_ps(iy1,jy0);
1553 dz10 = _mm256_sub_ps(iz1,jz0);
1554 dx11 = _mm256_sub_ps(ix1,jx1);
1555 dy11 = _mm256_sub_ps(iy1,jy1);
1556 dz11 = _mm256_sub_ps(iz1,jz1);
1557 dx12 = _mm256_sub_ps(ix1,jx2);
1558 dy12 = _mm256_sub_ps(iy1,jy2);
1559 dz12 = _mm256_sub_ps(iz1,jz2);
1560 dx20 = _mm256_sub_ps(ix2,jx0);
1561 dy20 = _mm256_sub_ps(iy2,jy0);
1562 dz20 = _mm256_sub_ps(iz2,jz0);
1563 dx21 = _mm256_sub_ps(ix2,jx1);
1564 dy21 = _mm256_sub_ps(iy2,jy1);
1565 dz21 = _mm256_sub_ps(iz2,jz1);
1566 dx22 = _mm256_sub_ps(ix2,jx2);
1567 dy22 = _mm256_sub_ps(iy2,jy2);
1568 dz22 = _mm256_sub_ps(iz2,jz2);
1570 /* Calculate squared distance and things based on it */
1571 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1572 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1573 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1574 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1575 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1576 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1577 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1578 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1579 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1581 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1582 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1583 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1584 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1585 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1586 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1587 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1588 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1589 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1591 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1592 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1593 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1594 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1595 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1596 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1597 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1598 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1599 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1601 fjx0 = _mm256_setzero_ps();
1602 fjy0 = _mm256_setzero_ps();
1603 fjz0 = _mm256_setzero_ps();
1604 fjx1 = _mm256_setzero_ps();
1605 fjy1 = _mm256_setzero_ps();
1606 fjz1 = _mm256_setzero_ps();
1607 fjx2 = _mm256_setzero_ps();
1608 fjy2 = _mm256_setzero_ps();
1609 fjz2 = _mm256_setzero_ps();
1611 /**************************
1612 * CALCULATE INTERACTIONS *
1613 **************************/
1615 /* COULOMB ELECTROSTATICS */
1616 velec = _mm256_mul_ps(qq00,rinv00);
1617 felec = _mm256_mul_ps(velec,rinvsq00);
1619 /* LENNARD-JONES DISPERSION/REPULSION */
1621 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1622 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1624 fscal = _mm256_add_ps(felec,fvdw);
1626 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1628 /* Calculate temporary vectorial force */
1629 tx = _mm256_mul_ps(fscal,dx00);
1630 ty = _mm256_mul_ps(fscal,dy00);
1631 tz = _mm256_mul_ps(fscal,dz00);
1633 /* Update vectorial force */
1634 fix0 = _mm256_add_ps(fix0,tx);
1635 fiy0 = _mm256_add_ps(fiy0,ty);
1636 fiz0 = _mm256_add_ps(fiz0,tz);
1638 fjx0 = _mm256_add_ps(fjx0,tx);
1639 fjy0 = _mm256_add_ps(fjy0,ty);
1640 fjz0 = _mm256_add_ps(fjz0,tz);
1642 /**************************
1643 * CALCULATE INTERACTIONS *
1644 **************************/
1646 /* COULOMB ELECTROSTATICS */
1647 velec = _mm256_mul_ps(qq01,rinv01);
1648 felec = _mm256_mul_ps(velec,rinvsq01);
1652 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1654 /* Calculate temporary vectorial force */
1655 tx = _mm256_mul_ps(fscal,dx01);
1656 ty = _mm256_mul_ps(fscal,dy01);
1657 tz = _mm256_mul_ps(fscal,dz01);
1659 /* Update vectorial force */
1660 fix0 = _mm256_add_ps(fix0,tx);
1661 fiy0 = _mm256_add_ps(fiy0,ty);
1662 fiz0 = _mm256_add_ps(fiz0,tz);
1664 fjx1 = _mm256_add_ps(fjx1,tx);
1665 fjy1 = _mm256_add_ps(fjy1,ty);
1666 fjz1 = _mm256_add_ps(fjz1,tz);
1668 /**************************
1669 * CALCULATE INTERACTIONS *
1670 **************************/
1672 /* COULOMB ELECTROSTATICS */
1673 velec = _mm256_mul_ps(qq02,rinv02);
1674 felec = _mm256_mul_ps(velec,rinvsq02);
1678 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1680 /* Calculate temporary vectorial force */
1681 tx = _mm256_mul_ps(fscal,dx02);
1682 ty = _mm256_mul_ps(fscal,dy02);
1683 tz = _mm256_mul_ps(fscal,dz02);
1685 /* Update vectorial force */
1686 fix0 = _mm256_add_ps(fix0,tx);
1687 fiy0 = _mm256_add_ps(fiy0,ty);
1688 fiz0 = _mm256_add_ps(fiz0,tz);
1690 fjx2 = _mm256_add_ps(fjx2,tx);
1691 fjy2 = _mm256_add_ps(fjy2,ty);
1692 fjz2 = _mm256_add_ps(fjz2,tz);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 /* COULOMB ELECTROSTATICS */
1699 velec = _mm256_mul_ps(qq10,rinv10);
1700 felec = _mm256_mul_ps(velec,rinvsq10);
1704 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1706 /* Calculate temporary vectorial force */
1707 tx = _mm256_mul_ps(fscal,dx10);
1708 ty = _mm256_mul_ps(fscal,dy10);
1709 tz = _mm256_mul_ps(fscal,dz10);
1711 /* Update vectorial force */
1712 fix1 = _mm256_add_ps(fix1,tx);
1713 fiy1 = _mm256_add_ps(fiy1,ty);
1714 fiz1 = _mm256_add_ps(fiz1,tz);
1716 fjx0 = _mm256_add_ps(fjx0,tx);
1717 fjy0 = _mm256_add_ps(fjy0,ty);
1718 fjz0 = _mm256_add_ps(fjz0,tz);
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 /* COULOMB ELECTROSTATICS */
1725 velec = _mm256_mul_ps(qq11,rinv11);
1726 felec = _mm256_mul_ps(velec,rinvsq11);
1730 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1732 /* Calculate temporary vectorial force */
1733 tx = _mm256_mul_ps(fscal,dx11);
1734 ty = _mm256_mul_ps(fscal,dy11);
1735 tz = _mm256_mul_ps(fscal,dz11);
1737 /* Update vectorial force */
1738 fix1 = _mm256_add_ps(fix1,tx);
1739 fiy1 = _mm256_add_ps(fiy1,ty);
1740 fiz1 = _mm256_add_ps(fiz1,tz);
1742 fjx1 = _mm256_add_ps(fjx1,tx);
1743 fjy1 = _mm256_add_ps(fjy1,ty);
1744 fjz1 = _mm256_add_ps(fjz1,tz);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 /* COULOMB ELECTROSTATICS */
1751 velec = _mm256_mul_ps(qq12,rinv12);
1752 felec = _mm256_mul_ps(velec,rinvsq12);
1756 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1758 /* Calculate temporary vectorial force */
1759 tx = _mm256_mul_ps(fscal,dx12);
1760 ty = _mm256_mul_ps(fscal,dy12);
1761 tz = _mm256_mul_ps(fscal,dz12);
1763 /* Update vectorial force */
1764 fix1 = _mm256_add_ps(fix1,tx);
1765 fiy1 = _mm256_add_ps(fiy1,ty);
1766 fiz1 = _mm256_add_ps(fiz1,tz);
1768 fjx2 = _mm256_add_ps(fjx2,tx);
1769 fjy2 = _mm256_add_ps(fjy2,ty);
1770 fjz2 = _mm256_add_ps(fjz2,tz);
1772 /**************************
1773 * CALCULATE INTERACTIONS *
1774 **************************/
1776 /* COULOMB ELECTROSTATICS */
1777 velec = _mm256_mul_ps(qq20,rinv20);
1778 felec = _mm256_mul_ps(velec,rinvsq20);
1782 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1784 /* Calculate temporary vectorial force */
1785 tx = _mm256_mul_ps(fscal,dx20);
1786 ty = _mm256_mul_ps(fscal,dy20);
1787 tz = _mm256_mul_ps(fscal,dz20);
1789 /* Update vectorial force */
1790 fix2 = _mm256_add_ps(fix2,tx);
1791 fiy2 = _mm256_add_ps(fiy2,ty);
1792 fiz2 = _mm256_add_ps(fiz2,tz);
1794 fjx0 = _mm256_add_ps(fjx0,tx);
1795 fjy0 = _mm256_add_ps(fjy0,ty);
1796 fjz0 = _mm256_add_ps(fjz0,tz);
1798 /**************************
1799 * CALCULATE INTERACTIONS *
1800 **************************/
1802 /* COULOMB ELECTROSTATICS */
1803 velec = _mm256_mul_ps(qq21,rinv21);
1804 felec = _mm256_mul_ps(velec,rinvsq21);
1808 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1810 /* Calculate temporary vectorial force */
1811 tx = _mm256_mul_ps(fscal,dx21);
1812 ty = _mm256_mul_ps(fscal,dy21);
1813 tz = _mm256_mul_ps(fscal,dz21);
1815 /* Update vectorial force */
1816 fix2 = _mm256_add_ps(fix2,tx);
1817 fiy2 = _mm256_add_ps(fiy2,ty);
1818 fiz2 = _mm256_add_ps(fiz2,tz);
1820 fjx1 = _mm256_add_ps(fjx1,tx);
1821 fjy1 = _mm256_add_ps(fjy1,ty);
1822 fjz1 = _mm256_add_ps(fjz1,tz);
1824 /**************************
1825 * CALCULATE INTERACTIONS *
1826 **************************/
1828 /* COULOMB ELECTROSTATICS */
1829 velec = _mm256_mul_ps(qq22,rinv22);
1830 felec = _mm256_mul_ps(velec,rinvsq22);
1834 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1836 /* Calculate temporary vectorial force */
1837 tx = _mm256_mul_ps(fscal,dx22);
1838 ty = _mm256_mul_ps(fscal,dy22);
1839 tz = _mm256_mul_ps(fscal,dz22);
1841 /* Update vectorial force */
1842 fix2 = _mm256_add_ps(fix2,tx);
1843 fiy2 = _mm256_add_ps(fiy2,ty);
1844 fiz2 = _mm256_add_ps(fiz2,tz);
1846 fjx2 = _mm256_add_ps(fjx2,tx);
1847 fjy2 = _mm256_add_ps(fjy2,ty);
1848 fjz2 = _mm256_add_ps(fjz2,tz);
1850 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1851 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1852 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1853 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1854 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1855 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1856 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1857 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1859 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1860 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1862 /* Inner loop uses 241 flops */
1865 /* End of innermost loop */
1867 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1868 f+i_coord_offset,fshift+i_shift_offset);
1870 /* Increment number of inner iterations */
1871 inneriter += j_index_end - j_index_start;
1873 /* Outer loop uses 18 flops */
1876 /* Increment number of outer iterations */
1879 /* Update outer/inner flops */
1881 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*241);