2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_single
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
/*
 * Computes forces and potential energies between four-site i molecules and single j particles,
 * processing eight j entries at a time in 256-bit single-precision registers.
 *
 * As the interaction blocks below show, i site 0 takes part only in the shifted Lennard-Jones
 * interaction, while i sites 1-3 take part only in reaction-field electrostatics. Every
 * contribution is zeroed in lanes whose squared distance is not below rcutoff2.
 *
 * nlist       - neighbor list; jindex, shift and iinr are read from it here.
 * xx, ff      - coordinate and force arrays. NOTE(review): the kernel body works through the
 *               local pointers x and f; presumably these alias xx and ff - confirm.
 * fr          - supplies shift_vec, fshift, ntype and the ic-> interaction constants
 *               (epsfac, k_rf, c_rf, rcoulomb, sh_invrc6, rvdw).
 * mdatoms     - supplies chargeA and typeA.
 * kernel_data - energygrp_elec and energygrp_vdw receive the summed potentials.
 * nrnb        - flop counter, updated once through inc_nrnb().
 */
57 nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_256_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69 * jnr indices corresponding to data put in the eight positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrE,jnrF,jnrG,jnrH;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 real * vdwioffsetptr0;
86 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 real * vdwioffsetptr1;
88 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 real * vdwioffsetptr2;
90 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 real * vdwioffsetptr3;
92 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
94 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
102 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
105 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
106 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
107 __m256 dummy_mask,cutoff_mask;
108 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
109 __m256 one = _mm256_set1_ps(1.0);
110 __m256 two = _mm256_set1_ps(2.0);
116 jindex = nlist->jindex;
118 shiftidx = nlist->shift;
120 shiftvec = fr->shift_vec[0];
121 fshift = fr->fshift[0];
122 facel = _mm256_set1_ps(fr->ic->epsfac);
123 charge = mdatoms->chargeA;
124 krf = _mm256_set1_ps(fr->ic->k_rf);
125 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
126 crf = _mm256_set1_ps(fr->ic->c_rf);
127 nvdwtype = fr->ntype;
129 vdwtype = mdatoms->typeA;
131 /* Setup water-specific parameters */
/* The charges of i sites 1-3 (pre-multiplied by facel) and the LJ parameter row of i site 0
 * are taken once from the first i entry of the list, ahead of the outer loop. Presumably every
 * i molecule in the list shares these parameters - TODO confirm. */
132 inr = nlist->iinr[0];
133 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
134 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
135 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
136 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
138 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
139 rcutoff_scalar = fr->ic->rcoulomb;
140 rcutoff = _mm256_set1_ps(rcutoff_scalar);
141 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
/* sh_vdw_invrcut6 enters the Lennard-Jones potential shift below. rvdw is set here but is not
 * read by any other statement visible in this function. */
143 sh_vdw_invrcut6 = _mm256_set1_ps(fr->ic->sh_invrc6);
144 rvdw = _mm256_set1_ps(fr->ic->rvdw);
146 /* Give the j indices defined values so compilers do not warn about uninitialized use */
147 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
160 for(iidx=0;iidx<4*DIM;iidx++)
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
168 /* Load shift vector for this list */
169 i_shift_offset = DIM*shiftidx[iidx];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
181 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Per-i-molecule force accumulators, one x/y/z triple for each of the four i sites */
183 fix0 = _mm256_setzero_ps();
184 fiy0 = _mm256_setzero_ps();
185 fiz0 = _mm256_setzero_ps();
186 fix1 = _mm256_setzero_ps();
187 fiy1 = _mm256_setzero_ps();
188 fiz1 = _mm256_setzero_ps();
189 fix2 = _mm256_setzero_ps();
190 fiy2 = _mm256_setzero_ps();
191 fiz2 = _mm256_setzero_ps();
192 fix3 = _mm256_setzero_ps();
193 fiy3 = _mm256_setzero_ps();
194 fiz3 = _mm256_setzero_ps();
196 /* Reset potential sums */
197 velecsum = _mm256_setzero_ps();
198 vvdwsum = _mm256_setzero_ps();
200 /* Start inner kernel loop */
/* Full batches only: advance in steps of eight for as long as the eighth entry of the batch
 * is a non-negative j index. */
201 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
204 /* Get j neighbor index, and coordinate index */
213 j_coord_offsetA = DIM*jnrA;
214 j_coord_offsetB = DIM*jnrB;
215 j_coord_offsetC = DIM*jnrC;
216 j_coord_offsetD = DIM*jnrD;
217 j_coord_offsetE = DIM*jnrE;
218 j_coord_offsetF = DIM*jnrF;
219 j_coord_offsetG = DIM*jnrG;
220 j_coord_offsetH = DIM*jnrH;
222 /* load j atom coordinates */
223 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
224 x+j_coord_offsetC,x+j_coord_offsetD,
225 x+j_coord_offsetE,x+j_coord_offsetF,
226 x+j_coord_offsetG,x+j_coord_offsetH,
229 /* Calculate displacement vector */
230 dx00 = _mm256_sub_ps(ix0,jx0);
231 dy00 = _mm256_sub_ps(iy0,jy0);
232 dz00 = _mm256_sub_ps(iz0,jz0);
233 dx10 = _mm256_sub_ps(ix1,jx0);
234 dy10 = _mm256_sub_ps(iy1,jy0);
235 dz10 = _mm256_sub_ps(iz1,jz0);
236 dx20 = _mm256_sub_ps(ix2,jx0);
237 dy20 = _mm256_sub_ps(iy2,jy0);
238 dz20 = _mm256_sub_ps(iz2,jz0);
239 dx30 = _mm256_sub_ps(ix3,jx0);
240 dy30 = _mm256_sub_ps(iy3,jy0);
241 dz30 = _mm256_sub_ps(iz3,jz0);
243 /* Calculate squared distance and things based on it */
244 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
245 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
246 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
247 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
/* Pairs 10/20/30 need rinv for the reaction-field terms; the LJ-only pair 00 needs only rinvsq.
 * (avx256_invsqrt_f and avx256_inv_f are presumably 1/sqrt(x) and 1/x - confirm.) */
249 rinv10 = avx256_invsqrt_f(rsq10);
250 rinv20 = avx256_invsqrt_f(rsq20);
251 rinv30 = avx256_invsqrt_f(rsq30);
253 rinvsq00 = avx256_inv_f(rsq00);
254 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
255 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
256 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
258 /* Load parameters for j particles */
259 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
260 charge+jnrC+0,charge+jnrD+0,
261 charge+jnrE+0,charge+jnrF+0,
262 charge+jnrG+0,charge+jnrH+0);
263 vdwjidx0A = 2*vdwtype[jnrA+0];
264 vdwjidx0B = 2*vdwtype[jnrB+0];
265 vdwjidx0C = 2*vdwtype[jnrC+0];
266 vdwjidx0D = 2*vdwtype[jnrD+0];
267 vdwjidx0E = 2*vdwtype[jnrE+0];
268 vdwjidx0F = 2*vdwtype[jnrF+0];
269 vdwjidx0G = 2*vdwtype[jnrG+0];
270 vdwjidx0H = 2*vdwtype[jnrH+0];
/* j-side force accumulator, shared by all four i-site interactions of this batch */
272 fjx0 = _mm256_setzero_ps();
273 fjy0 = _mm256_setzero_ps();
274 fjz0 = _mm256_setzero_ps();
276 /**************************
277 * CALCULATE INTERACTIONS *
278 **************************/
/* i site 0 with j: Lennard-Jones only. The block is guarded by gmx_mm256_any_lt(), presumably
 * true when at least one lane has rsq00 < rcutoff2 - confirm. */
280 if (gmx_mm256_any_lt(rsq00,rcutoff2))
283 /* Compute parameters for interactions between i and j atoms */
284 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
285 vdwioffsetptr0+vdwjidx0B,
286 vdwioffsetptr0+vdwjidx0C,
287 vdwioffsetptr0+vdwjidx0D,
288 vdwioffsetptr0+vdwjidx0E,
289 vdwioffsetptr0+vdwjidx0F,
290 vdwioffsetptr0+vdwjidx0G,
291 vdwioffsetptr0+vdwjidx0H,
294 /* LENNARD-JONES DISPERSION/REPULSION */
296 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
297 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
298 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
/* Shifted potential: (vvdw12 - c12*sh^2)/12 - (vvdw6 - c6*sh)/6, with sh = sh_vdw_invrcut6 */
299 vvdw = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
300 _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
301 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
/* All-ones in lanes with rsq00 < rcutoff2, zero elsewhere */
303 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
305 /* Update potential sum for this i atom from the interaction with this j atom. */
306 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
307 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
/* Zero the force scalar in lanes outside the cutoff */
311 fscal = _mm256_and_ps(fscal,cutoff_mask);
313 /* Calculate temporary vectorial force */
314 tx = _mm256_mul_ps(fscal,dx00);
315 ty = _mm256_mul_ps(fscal,dy00);
316 tz = _mm256_mul_ps(fscal,dz00);
318 /* Update vectorial force */
319 fix0 = _mm256_add_ps(fix0,tx);
320 fiy0 = _mm256_add_ps(fiy0,ty);
321 fiz0 = _mm256_add_ps(fiz0,tz);
/* The same vector is accumulated on the j side; the sum is handed to
 * gmx_mm256_decrement_1rvec_8ptr_swizzle_ps() once all four pairs are done. */
323 fjx0 = _mm256_add_ps(fjx0,tx);
324 fjy0 = _mm256_add_ps(fjy0,ty);
325 fjz0 = _mm256_add_ps(fjz0,tz);
329 /**************************
330 * CALCULATE INTERACTIONS *
331 **************************/
/* i site 1 with j: reaction-field electrostatics only */
333 if (gmx_mm256_any_lt(rsq10,rcutoff2))
336 /* Compute parameters for interactions between i and j atoms */
337 qq10 = _mm256_mul_ps(iq1,jq0);
339 /* REACTION-FIELD ELECTROSTATICS */
/* velec = qq*(rinv + krf*rsq - crf); felec = qq*(rinv*rinvsq - krf2), where krf2 = 2*k_rf */
340 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
341 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
343 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velec = _mm256_and_ps(velec,cutoff_mask);
347 velecsum = _mm256_add_ps(velecsum,velec);
351 fscal = _mm256_and_ps(fscal,cutoff_mask);
353 /* Calculate temporary vectorial force */
354 tx = _mm256_mul_ps(fscal,dx10);
355 ty = _mm256_mul_ps(fscal,dy10);
356 tz = _mm256_mul_ps(fscal,dz10);
358 /* Update vectorial force */
359 fix1 = _mm256_add_ps(fix1,tx);
360 fiy1 = _mm256_add_ps(fiy1,ty);
361 fiz1 = _mm256_add_ps(fiz1,tz);
363 fjx0 = _mm256_add_ps(fjx0,tx);
364 fjy0 = _mm256_add_ps(fjy0,ty);
365 fjz0 = _mm256_add_ps(fjz0,tz);
369 /**************************
370 * CALCULATE INTERACTIONS *
371 **************************/
/* i site 2 with j: reaction-field electrostatics only */
373 if (gmx_mm256_any_lt(rsq20,rcutoff2))
376 /* Compute parameters for interactions between i and j atoms */
377 qq20 = _mm256_mul_ps(iq2,jq0);
379 /* REACTION-FIELD ELECTROSTATICS */
380 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
381 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
383 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velec = _mm256_and_ps(velec,cutoff_mask);
387 velecsum = _mm256_add_ps(velecsum,velec);
391 fscal = _mm256_and_ps(fscal,cutoff_mask);
393 /* Calculate temporary vectorial force */
394 tx = _mm256_mul_ps(fscal,dx20);
395 ty = _mm256_mul_ps(fscal,dy20);
396 tz = _mm256_mul_ps(fscal,dz20);
398 /* Update vectorial force */
399 fix2 = _mm256_add_ps(fix2,tx);
400 fiy2 = _mm256_add_ps(fiy2,ty);
401 fiz2 = _mm256_add_ps(fiz2,tz);
403 fjx0 = _mm256_add_ps(fjx0,tx);
404 fjy0 = _mm256_add_ps(fjy0,ty);
405 fjz0 = _mm256_add_ps(fjz0,tz);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
/* i site 3 with j: reaction-field electrostatics only */
413 if (gmx_mm256_any_lt(rsq30,rcutoff2))
416 /* Compute parameters for interactions between i and j atoms */
417 qq30 = _mm256_mul_ps(iq3,jq0);
419 /* REACTION-FIELD ELECTROSTATICS */
420 velec = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_add_ps(rinv30,_mm256_mul_ps(krf,rsq30)),crf));
421 felec = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_mul_ps(rinv30,rinvsq30),krf2));
423 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
425 /* Update potential sum for this i atom from the interaction with this j atom. */
426 velec = _mm256_and_ps(velec,cutoff_mask);
427 velecsum = _mm256_add_ps(velecsum,velec);
431 fscal = _mm256_and_ps(fscal,cutoff_mask);
433 /* Calculate temporary vectorial force */
434 tx = _mm256_mul_ps(fscal,dx30);
435 ty = _mm256_mul_ps(fscal,dy30);
436 tz = _mm256_mul_ps(fscal,dz30);
438 /* Update vectorial force */
439 fix3 = _mm256_add_ps(fix3,tx);
440 fiy3 = _mm256_add_ps(fiy3,ty);
441 fiz3 = _mm256_add_ps(fiz3,tz);
443 fjx0 = _mm256_add_ps(fjx0,tx);
444 fjy0 = _mm256_add_ps(fjy0,ty);
445 fjz0 = _mm256_add_ps(fjz0,tz);
/* Force destinations of the eight j particles of this batch */
449 fjptrA = f+j_coord_offsetA;
450 fjptrB = f+j_coord_offsetB;
451 fjptrC = f+j_coord_offsetC;
452 fjptrD = f+j_coord_offsetD;
453 fjptrE = f+j_coord_offsetE;
454 fjptrF = f+j_coord_offsetF;
455 fjptrG = f+j_coord_offsetG;
456 fjptrH = f+j_coord_offsetH;
458 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
460 /* Inner loop uses 152 flops */
/* Remainder batch: list entries may be negative from here on. Negative entries are treated as
 * dummies: their loads are redirected to index 0, their lanes are cleared from the energies and
 * force scalars through dummy_mask, and their force writes are pointed at scratch. */
466 /* Get j neighbor index, and coordinate index */
467 jnrlistA = jjnr[jidx];
468 jnrlistB = jjnr[jidx+1];
469 jnrlistC = jjnr[jidx+2];
470 jnrlistD = jjnr[jidx+3];
471 jnrlistE = jjnr[jidx+4];
472 jnrlistF = jjnr[jidx+5];
473 jnrlistG = jjnr[jidx+6];
474 jnrlistH = jjnr[jidx+7];
475 /* Sign of each element will be negative for non-real atoms.
476 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
477 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
479 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
480 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
482 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
483 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
484 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
485 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
486 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
487 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
488 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
489 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
490 j_coord_offsetA = DIM*jnrA;
491 j_coord_offsetB = DIM*jnrB;
492 j_coord_offsetC = DIM*jnrC;
493 j_coord_offsetD = DIM*jnrD;
494 j_coord_offsetE = DIM*jnrE;
495 j_coord_offsetF = DIM*jnrF;
496 j_coord_offsetG = DIM*jnrG;
497 j_coord_offsetH = DIM*jnrH;
499 /* load j atom coordinates */
500 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
501 x+j_coord_offsetC,x+j_coord_offsetD,
502 x+j_coord_offsetE,x+j_coord_offsetF,
503 x+j_coord_offsetG,x+j_coord_offsetH,
506 /* Calculate displacement vector */
507 dx00 = _mm256_sub_ps(ix0,jx0);
508 dy00 = _mm256_sub_ps(iy0,jy0);
509 dz00 = _mm256_sub_ps(iz0,jz0);
510 dx10 = _mm256_sub_ps(ix1,jx0);
511 dy10 = _mm256_sub_ps(iy1,jy0);
512 dz10 = _mm256_sub_ps(iz1,jz0);
513 dx20 = _mm256_sub_ps(ix2,jx0);
514 dy20 = _mm256_sub_ps(iy2,jy0);
515 dz20 = _mm256_sub_ps(iz2,jz0);
516 dx30 = _mm256_sub_ps(ix3,jx0);
517 dy30 = _mm256_sub_ps(iy3,jy0);
518 dz30 = _mm256_sub_ps(iz3,jz0);
520 /* Calculate squared distance and things based on it */
521 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
522 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
523 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
524 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
526 rinv10 = avx256_invsqrt_f(rsq10);
527 rinv20 = avx256_invsqrt_f(rsq20);
528 rinv30 = avx256_invsqrt_f(rsq30);
530 rinvsq00 = avx256_inv_f(rsq00);
531 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
532 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
533 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
535 /* Load parameters for j particles */
536 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
537 charge+jnrC+0,charge+jnrD+0,
538 charge+jnrE+0,charge+jnrF+0,
539 charge+jnrG+0,charge+jnrH+0);
540 vdwjidx0A = 2*vdwtype[jnrA+0];
541 vdwjidx0B = 2*vdwtype[jnrB+0];
542 vdwjidx0C = 2*vdwtype[jnrC+0];
543 vdwjidx0D = 2*vdwtype[jnrD+0];
544 vdwjidx0E = 2*vdwtype[jnrE+0];
545 vdwjidx0F = 2*vdwtype[jnrF+0];
546 vdwjidx0G = 2*vdwtype[jnrG+0];
547 vdwjidx0H = 2*vdwtype[jnrH+0];
549 fjx0 = _mm256_setzero_ps();
550 fjy0 = _mm256_setzero_ps();
551 fjz0 = _mm256_setzero_ps();
553 /**************************
554 * CALCULATE INTERACTIONS *
555 **************************/
/* i site 0 with j: Lennard-Jones only, same arithmetic as the full-batch loop plus dummy masking */
557 if (gmx_mm256_any_lt(rsq00,rcutoff2))
560 /* Compute parameters for interactions between i and j atoms */
561 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
562 vdwioffsetptr0+vdwjidx0B,
563 vdwioffsetptr0+vdwjidx0C,
564 vdwioffsetptr0+vdwjidx0D,
565 vdwioffsetptr0+vdwjidx0E,
566 vdwioffsetptr0+vdwjidx0F,
567 vdwioffsetptr0+vdwjidx0G,
568 vdwioffsetptr0+vdwjidx0H,
571 /* LENNARD-JONES DISPERSION/REPULSION */
573 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
574 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
575 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
576 vvdw = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
577 _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
578 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
580 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
/* Clear the lanes that belong to dummy list entries */
584 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
585 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
589 fscal = _mm256_and_ps(fscal,cutoff_mask);
/* Dummy lanes contribute no force either */
591 fscal = _mm256_andnot_ps(dummy_mask,fscal);
593 /* Calculate temporary vectorial force */
594 tx = _mm256_mul_ps(fscal,dx00);
595 ty = _mm256_mul_ps(fscal,dy00);
596 tz = _mm256_mul_ps(fscal,dz00);
598 /* Update vectorial force */
599 fix0 = _mm256_add_ps(fix0,tx);
600 fiy0 = _mm256_add_ps(fiy0,ty);
601 fiz0 = _mm256_add_ps(fiz0,tz);
603 fjx0 = _mm256_add_ps(fjx0,tx);
604 fjy0 = _mm256_add_ps(fjy0,ty);
605 fjz0 = _mm256_add_ps(fjz0,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
/* i site 1 with j: reaction-field electrostatics only, with dummy masking */
613 if (gmx_mm256_any_lt(rsq10,rcutoff2))
616 /* Compute parameters for interactions between i and j atoms */
617 qq10 = _mm256_mul_ps(iq1,jq0);
619 /* REACTION-FIELD ELECTROSTATICS */
620 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
621 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
623 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
625 /* Update potential sum for this i atom from the interaction with this j atom. */
626 velec = _mm256_and_ps(velec,cutoff_mask);
627 velec = _mm256_andnot_ps(dummy_mask,velec);
628 velecsum = _mm256_add_ps(velecsum,velec);
632 fscal = _mm256_and_ps(fscal,cutoff_mask);
634 fscal = _mm256_andnot_ps(dummy_mask,fscal);
636 /* Calculate temporary vectorial force */
637 tx = _mm256_mul_ps(fscal,dx10);
638 ty = _mm256_mul_ps(fscal,dy10);
639 tz = _mm256_mul_ps(fscal,dz10);
641 /* Update vectorial force */
642 fix1 = _mm256_add_ps(fix1,tx);
643 fiy1 = _mm256_add_ps(fiy1,ty);
644 fiz1 = _mm256_add_ps(fiz1,tz);
646 fjx0 = _mm256_add_ps(fjx0,tx);
647 fjy0 = _mm256_add_ps(fjy0,ty);
648 fjz0 = _mm256_add_ps(fjz0,tz);
652 /**************************
653 * CALCULATE INTERACTIONS *
654 **************************/
/* i site 2 with j: reaction-field electrostatics only, with dummy masking */
656 if (gmx_mm256_any_lt(rsq20,rcutoff2))
659 /* Compute parameters for interactions between i and j atoms */
660 qq20 = _mm256_mul_ps(iq2,jq0);
662 /* REACTION-FIELD ELECTROSTATICS */
663 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
664 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
666 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
668 /* Update potential sum for this i atom from the interaction with this j atom. */
669 velec = _mm256_and_ps(velec,cutoff_mask);
670 velec = _mm256_andnot_ps(dummy_mask,velec);
671 velecsum = _mm256_add_ps(velecsum,velec);
675 fscal = _mm256_and_ps(fscal,cutoff_mask);
677 fscal = _mm256_andnot_ps(dummy_mask,fscal);
679 /* Calculate temporary vectorial force */
680 tx = _mm256_mul_ps(fscal,dx20);
681 ty = _mm256_mul_ps(fscal,dy20);
682 tz = _mm256_mul_ps(fscal,dz20);
684 /* Update vectorial force */
685 fix2 = _mm256_add_ps(fix2,tx);
686 fiy2 = _mm256_add_ps(fiy2,ty);
687 fiz2 = _mm256_add_ps(fiz2,tz);
689 fjx0 = _mm256_add_ps(fjx0,tx);
690 fjy0 = _mm256_add_ps(fjy0,ty);
691 fjz0 = _mm256_add_ps(fjz0,tz);
695 /**************************
696 * CALCULATE INTERACTIONS *
697 **************************/
/* i site 3 with j: reaction-field electrostatics only, with dummy masking */
699 if (gmx_mm256_any_lt(rsq30,rcutoff2))
702 /* Compute parameters for interactions between i and j atoms */
703 qq30 = _mm256_mul_ps(iq3,jq0);
705 /* REACTION-FIELD ELECTROSTATICS */
706 velec = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_add_ps(rinv30,_mm256_mul_ps(krf,rsq30)),crf));
707 felec = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_mul_ps(rinv30,rinvsq30),krf2));
709 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
711 /* Update potential sum for this i atom from the interaction with this j atom. */
712 velec = _mm256_and_ps(velec,cutoff_mask);
713 velec = _mm256_andnot_ps(dummy_mask,velec);
714 velecsum = _mm256_add_ps(velecsum,velec);
718 fscal = _mm256_and_ps(fscal,cutoff_mask);
720 fscal = _mm256_andnot_ps(dummy_mask,fscal);
722 /* Calculate temporary vectorial force */
723 tx = _mm256_mul_ps(fscal,dx30);
724 ty = _mm256_mul_ps(fscal,dy30);
725 tz = _mm256_mul_ps(fscal,dz30);
727 /* Update vectorial force */
728 fix3 = _mm256_add_ps(fix3,tx);
729 fiy3 = _mm256_add_ps(fiy3,ty);
730 fiz3 = _mm256_add_ps(fiz3,tz);
732 fjx0 = _mm256_add_ps(fjx0,tx);
733 fjy0 = _mm256_add_ps(fjy0,ty);
734 fjz0 = _mm256_add_ps(fjz0,tz);
/* Dummy entries are pointed at scratch, so the f entries of index 0 are not touched for them */
738 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
739 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
740 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
741 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
742 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
743 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
744 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
745 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
747 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
749 /* Inner loop uses 152 flops */
752 /* End of innermost loop */
/* Hand the accumulated forces of the four i sites to the helper, together with the f and
 * fshift destinations for this i molecule and shift index. */
754 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
755 f+i_coord_offset,fshift+i_shift_offset);
758 /* Update potential energies */
759 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
760 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
762 /* Increment number of inner iterations */
/* Grows by the number of j list entries in this i range */
763 inneriter += j_index_end - j_index_start;
765 /* Outer loop uses 26 flops */
768 /* Increment number of outer iterations */
771 /* Update outer/inner flops */
/* Flop estimate: 26 per outer iteration plus 152 per inner list entry */
773 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*152);
776 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_single
777 * Electrostatics interaction: ReactionField
778 * VdW interaction: LennardJones
779 * Geometry: Water4-Particle
780 * Calculate force/pot: Force
783 nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_256_single
784 (t_nblist * gmx_restrict nlist,
785 rvec * gmx_restrict xx,
786 rvec * gmx_restrict ff,
787 struct t_forcerec * gmx_restrict fr,
788 t_mdatoms * gmx_restrict mdatoms,
789 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
790 t_nrnb * gmx_restrict nrnb)
792 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
793 * just 0 for non-waters.
794 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
795 * jnr indices corresponding to data put in the eight positions in the SIMD register.
797 int i_shift_offset,i_coord_offset,outeriter,inneriter;
798 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
799 int jnrA,jnrB,jnrC,jnrD;
800 int jnrE,jnrF,jnrG,jnrH;
801 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
802 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
803 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
804 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
805 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
807 real *shiftvec,*fshift,*x,*f;
808 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
810 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
811 real * vdwioffsetptr0;
812 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
813 real * vdwioffsetptr1;
814 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
815 real * vdwioffsetptr2;
816 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
817 real * vdwioffsetptr3;
818 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
819 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
820 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
821 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
822 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
823 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
824 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
825 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
828 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
831 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
832 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
833 __m256 dummy_mask,cutoff_mask;
834 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
835 __m256 one = _mm256_set1_ps(1.0);
836 __m256 two = _mm256_set1_ps(2.0);
842 jindex = nlist->jindex;
844 shiftidx = nlist->shift;
846 shiftvec = fr->shift_vec[0];
847 fshift = fr->fshift[0];
848 facel = _mm256_set1_ps(fr->ic->epsfac);
849 charge = mdatoms->chargeA;
850 krf = _mm256_set1_ps(fr->ic->k_rf);
851 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
852 crf = _mm256_set1_ps(fr->ic->c_rf);
853 nvdwtype = fr->ntype;
855 vdwtype = mdatoms->typeA;
857 /* Setup water-specific parameters */
858 inr = nlist->iinr[0];
859 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
860 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
861 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
862 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
864 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
865 rcutoff_scalar = fr->ic->rcoulomb;
866 rcutoff = _mm256_set1_ps(rcutoff_scalar);
867 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
869 sh_vdw_invrcut6 = _mm256_set1_ps(fr->ic->sh_invrc6);
870 rvdw = _mm256_set1_ps(fr->ic->rvdw);
872 /* Avoid stupid compiler warnings */
873 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
886 for(iidx=0;iidx<4*DIM;iidx++)
891 /* Start outer loop over neighborlists */
892 for(iidx=0; iidx<nri; iidx++)
894 /* Load shift vector for this list */
895 i_shift_offset = DIM*shiftidx[iidx];
897 /* Load limits for loop over neighbors */
898 j_index_start = jindex[iidx];
899 j_index_end = jindex[iidx+1];
901 /* Get outer coordinate index */
903 i_coord_offset = DIM*inr;
905 /* Load i particle coords and add shift vector */
906 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
907 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
909 fix0 = _mm256_setzero_ps();
910 fiy0 = _mm256_setzero_ps();
911 fiz0 = _mm256_setzero_ps();
912 fix1 = _mm256_setzero_ps();
913 fiy1 = _mm256_setzero_ps();
914 fiz1 = _mm256_setzero_ps();
915 fix2 = _mm256_setzero_ps();
916 fiy2 = _mm256_setzero_ps();
917 fiz2 = _mm256_setzero_ps();
918 fix3 = _mm256_setzero_ps();
919 fiy3 = _mm256_setzero_ps();
920 fiz3 = _mm256_setzero_ps();
922 /* Start inner kernel loop */
923 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
926 /* Get j neighbor index, and coordinate index */
935 j_coord_offsetA = DIM*jnrA;
936 j_coord_offsetB = DIM*jnrB;
937 j_coord_offsetC = DIM*jnrC;
938 j_coord_offsetD = DIM*jnrD;
939 j_coord_offsetE = DIM*jnrE;
940 j_coord_offsetF = DIM*jnrF;
941 j_coord_offsetG = DIM*jnrG;
942 j_coord_offsetH = DIM*jnrH;
944 /* load j atom coordinates */
945 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
946 x+j_coord_offsetC,x+j_coord_offsetD,
947 x+j_coord_offsetE,x+j_coord_offsetF,
948 x+j_coord_offsetG,x+j_coord_offsetH,
951 /* Calculate displacement vector */
952 dx00 = _mm256_sub_ps(ix0,jx0);
953 dy00 = _mm256_sub_ps(iy0,jy0);
954 dz00 = _mm256_sub_ps(iz0,jz0);
955 dx10 = _mm256_sub_ps(ix1,jx0);
956 dy10 = _mm256_sub_ps(iy1,jy0);
957 dz10 = _mm256_sub_ps(iz1,jz0);
958 dx20 = _mm256_sub_ps(ix2,jx0);
959 dy20 = _mm256_sub_ps(iy2,jy0);
960 dz20 = _mm256_sub_ps(iz2,jz0);
961 dx30 = _mm256_sub_ps(ix3,jx0);
962 dy30 = _mm256_sub_ps(iy3,jy0);
963 dz30 = _mm256_sub_ps(iz3,jz0);
965 /* Calculate squared distance and things based on it */
966 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
967 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
968 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
969 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
971 rinv10 = avx256_invsqrt_f(rsq10);
972 rinv20 = avx256_invsqrt_f(rsq20);
973 rinv30 = avx256_invsqrt_f(rsq30);
975 rinvsq00 = avx256_inv_f(rsq00);
976 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
977 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
978 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
980 /* Load parameters for j particles */
981 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
982 charge+jnrC+0,charge+jnrD+0,
983 charge+jnrE+0,charge+jnrF+0,
984 charge+jnrG+0,charge+jnrH+0);
985 vdwjidx0A = 2*vdwtype[jnrA+0];
986 vdwjidx0B = 2*vdwtype[jnrB+0];
987 vdwjidx0C = 2*vdwtype[jnrC+0];
988 vdwjidx0D = 2*vdwtype[jnrD+0];
989 vdwjidx0E = 2*vdwtype[jnrE+0];
990 vdwjidx0F = 2*vdwtype[jnrF+0];
991 vdwjidx0G = 2*vdwtype[jnrG+0];
992 vdwjidx0H = 2*vdwtype[jnrH+0];
994 fjx0 = _mm256_setzero_ps();
995 fjy0 = _mm256_setzero_ps();
996 fjz0 = _mm256_setzero_ps();
998 /**************************
999 * CALCULATE INTERACTIONS *
1000 **************************/
1002 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1005 /* Compute parameters for interactions between i and j atoms */
1006 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1007 vdwioffsetptr0+vdwjidx0B,
1008 vdwioffsetptr0+vdwjidx0C,
1009 vdwioffsetptr0+vdwjidx0D,
1010 vdwioffsetptr0+vdwjidx0E,
1011 vdwioffsetptr0+vdwjidx0F,
1012 vdwioffsetptr0+vdwjidx0G,
1013 vdwioffsetptr0+vdwjidx0H,
1016 /* LENNARD-JONES DISPERSION/REPULSION */
1018 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1019 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1021 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1025 fscal = _mm256_and_ps(fscal,cutoff_mask);
1027 /* Calculate temporary vectorial force */
1028 tx = _mm256_mul_ps(fscal,dx00);
1029 ty = _mm256_mul_ps(fscal,dy00);
1030 tz = _mm256_mul_ps(fscal,dz00);
1032 /* Update vectorial force */
1033 fix0 = _mm256_add_ps(fix0,tx);
1034 fiy0 = _mm256_add_ps(fiy0,ty);
1035 fiz0 = _mm256_add_ps(fiz0,tz);
1037 fjx0 = _mm256_add_ps(fjx0,tx);
1038 fjy0 = _mm256_add_ps(fjy0,ty);
1039 fjz0 = _mm256_add_ps(fjz0,tz);
1043 /**************************
1044 * CALCULATE INTERACTIONS *
1045 **************************/
1047 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1050 /* Compute parameters for interactions between i and j atoms */
1051 qq10 = _mm256_mul_ps(iq1,jq0);
1053 /* REACTION-FIELD ELECTROSTATICS */
1054 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1056 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1060 fscal = _mm256_and_ps(fscal,cutoff_mask);
1062 /* Calculate temporary vectorial force */
1063 tx = _mm256_mul_ps(fscal,dx10);
1064 ty = _mm256_mul_ps(fscal,dy10);
1065 tz = _mm256_mul_ps(fscal,dz10);
1067 /* Update vectorial force */
1068 fix1 = _mm256_add_ps(fix1,tx);
1069 fiy1 = _mm256_add_ps(fiy1,ty);
1070 fiz1 = _mm256_add_ps(fiz1,tz);
1072 fjx0 = _mm256_add_ps(fjx0,tx);
1073 fjy0 = _mm256_add_ps(fjy0,ty);
1074 fjz0 = _mm256_add_ps(fjz0,tz);
1078 /**************************
1079 * CALCULATE INTERACTIONS *
1080 **************************/
1082 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1085 /* Compute parameters for interactions between i and j atoms */
1086 qq20 = _mm256_mul_ps(iq2,jq0);
1088 /* REACTION-FIELD ELECTROSTATICS */
1089 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1091 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1095 fscal = _mm256_and_ps(fscal,cutoff_mask);
1097 /* Calculate temporary vectorial force */
1098 tx = _mm256_mul_ps(fscal,dx20);
1099 ty = _mm256_mul_ps(fscal,dy20);
1100 tz = _mm256_mul_ps(fscal,dz20);
1102 /* Update vectorial force */
1103 fix2 = _mm256_add_ps(fix2,tx);
1104 fiy2 = _mm256_add_ps(fiy2,ty);
1105 fiz2 = _mm256_add_ps(fiz2,tz);
1107 fjx0 = _mm256_add_ps(fjx0,tx);
1108 fjy0 = _mm256_add_ps(fjy0,ty);
1109 fjz0 = _mm256_add_ps(fjz0,tz);
1113 /**************************
1114 * CALCULATE INTERACTIONS *
1115 **************************/
1117 if (gmx_mm256_any_lt(rsq30,rcutoff2))
1120 /* Compute parameters for interactions between i and j atoms */
1121 qq30 = _mm256_mul_ps(iq3,jq0);
1123 /* REACTION-FIELD ELECTROSTATICS */
1124 felec = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_mul_ps(rinv30,rinvsq30),krf2));
1126 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1130 fscal = _mm256_and_ps(fscal,cutoff_mask);
1132 /* Calculate temporary vectorial force */
1133 tx = _mm256_mul_ps(fscal,dx30);
1134 ty = _mm256_mul_ps(fscal,dy30);
1135 tz = _mm256_mul_ps(fscal,dz30);
1137 /* Update vectorial force */
1138 fix3 = _mm256_add_ps(fix3,tx);
1139 fiy3 = _mm256_add_ps(fiy3,ty);
1140 fiz3 = _mm256_add_ps(fiz3,tz);
1142 fjx0 = _mm256_add_ps(fjx0,tx);
1143 fjy0 = _mm256_add_ps(fjy0,ty);
1144 fjz0 = _mm256_add_ps(fjz0,tz);
1148 fjptrA = f+j_coord_offsetA;
1149 fjptrB = f+j_coord_offsetB;
1150 fjptrC = f+j_coord_offsetC;
1151 fjptrD = f+j_coord_offsetD;
1152 fjptrE = f+j_coord_offsetE;
1153 fjptrF = f+j_coord_offsetF;
1154 fjptrG = f+j_coord_offsetG;
1155 fjptrH = f+j_coord_offsetH;
1157 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1159 /* Inner loop uses 123 flops */
1162 if(jidx<j_index_end)
1165 /* Get j neighbor index, and coordinate index */
1166 jnrlistA = jjnr[jidx];
1167 jnrlistB = jjnr[jidx+1];
1168 jnrlistC = jjnr[jidx+2];
1169 jnrlistD = jjnr[jidx+3];
1170 jnrlistE = jjnr[jidx+4];
1171 jnrlistF = jjnr[jidx+5];
1172 jnrlistG = jjnr[jidx+6];
1173 jnrlistH = jjnr[jidx+7];
1174 /* Sign of each element will be negative for non-real atoms.
1175 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1176 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1178 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1179 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1181 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1182 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1183 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1184 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1185 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1186 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1187 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1188 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1189 j_coord_offsetA = DIM*jnrA;
1190 j_coord_offsetB = DIM*jnrB;
1191 j_coord_offsetC = DIM*jnrC;
1192 j_coord_offsetD = DIM*jnrD;
1193 j_coord_offsetE = DIM*jnrE;
1194 j_coord_offsetF = DIM*jnrF;
1195 j_coord_offsetG = DIM*jnrG;
1196 j_coord_offsetH = DIM*jnrH;
1198 /* load j atom coordinates */
1199 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1200 x+j_coord_offsetC,x+j_coord_offsetD,
1201 x+j_coord_offsetE,x+j_coord_offsetF,
1202 x+j_coord_offsetG,x+j_coord_offsetH,
1205 /* Calculate displacement vector */
1206 dx00 = _mm256_sub_ps(ix0,jx0);
1207 dy00 = _mm256_sub_ps(iy0,jy0);
1208 dz00 = _mm256_sub_ps(iz0,jz0);
1209 dx10 = _mm256_sub_ps(ix1,jx0);
1210 dy10 = _mm256_sub_ps(iy1,jy0);
1211 dz10 = _mm256_sub_ps(iz1,jz0);
1212 dx20 = _mm256_sub_ps(ix2,jx0);
1213 dy20 = _mm256_sub_ps(iy2,jy0);
1214 dz20 = _mm256_sub_ps(iz2,jz0);
1215 dx30 = _mm256_sub_ps(ix3,jx0);
1216 dy30 = _mm256_sub_ps(iy3,jy0);
1217 dz30 = _mm256_sub_ps(iz3,jz0);
1219 /* Calculate squared distance and things based on it */
1220 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1221 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1222 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1223 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1225 rinv10 = avx256_invsqrt_f(rsq10);
1226 rinv20 = avx256_invsqrt_f(rsq20);
1227 rinv30 = avx256_invsqrt_f(rsq30);
1229 rinvsq00 = avx256_inv_f(rsq00);
1230 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1231 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1232 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
1234 /* Load parameters for j particles */
1235 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1236 charge+jnrC+0,charge+jnrD+0,
1237 charge+jnrE+0,charge+jnrF+0,
1238 charge+jnrG+0,charge+jnrH+0);
1239 vdwjidx0A = 2*vdwtype[jnrA+0];
1240 vdwjidx0B = 2*vdwtype[jnrB+0];
1241 vdwjidx0C = 2*vdwtype[jnrC+0];
1242 vdwjidx0D = 2*vdwtype[jnrD+0];
1243 vdwjidx0E = 2*vdwtype[jnrE+0];
1244 vdwjidx0F = 2*vdwtype[jnrF+0];
1245 vdwjidx0G = 2*vdwtype[jnrG+0];
1246 vdwjidx0H = 2*vdwtype[jnrH+0];
1248 fjx0 = _mm256_setzero_ps();
1249 fjy0 = _mm256_setzero_ps();
1250 fjz0 = _mm256_setzero_ps();
1252 /**************************
1253 * CALCULATE INTERACTIONS *
1254 **************************/
1256 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1259 /* Compute parameters for interactions between i and j atoms */
1260 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1261 vdwioffsetptr0+vdwjidx0B,
1262 vdwioffsetptr0+vdwjidx0C,
1263 vdwioffsetptr0+vdwjidx0D,
1264 vdwioffsetptr0+vdwjidx0E,
1265 vdwioffsetptr0+vdwjidx0F,
1266 vdwioffsetptr0+vdwjidx0G,
1267 vdwioffsetptr0+vdwjidx0H,
1270 /* LENNARD-JONES DISPERSION/REPULSION */
1272 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1273 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1275 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1279 fscal = _mm256_and_ps(fscal,cutoff_mask);
1281 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1283 /* Calculate temporary vectorial force */
1284 tx = _mm256_mul_ps(fscal,dx00);
1285 ty = _mm256_mul_ps(fscal,dy00);
1286 tz = _mm256_mul_ps(fscal,dz00);
1288 /* Update vectorial force */
1289 fix0 = _mm256_add_ps(fix0,tx);
1290 fiy0 = _mm256_add_ps(fiy0,ty);
1291 fiz0 = _mm256_add_ps(fiz0,tz);
1293 fjx0 = _mm256_add_ps(fjx0,tx);
1294 fjy0 = _mm256_add_ps(fjy0,ty);
1295 fjz0 = _mm256_add_ps(fjz0,tz);
1299 /**************************
1300 * CALCULATE INTERACTIONS *
1301 **************************/
1303 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1306 /* Compute parameters for interactions between i and j atoms */
1307 qq10 = _mm256_mul_ps(iq1,jq0);
1309 /* REACTION-FIELD ELECTROSTATICS */
1310 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1312 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1316 fscal = _mm256_and_ps(fscal,cutoff_mask);
1318 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1320 /* Calculate temporary vectorial force */
1321 tx = _mm256_mul_ps(fscal,dx10);
1322 ty = _mm256_mul_ps(fscal,dy10);
1323 tz = _mm256_mul_ps(fscal,dz10);
1325 /* Update vectorial force */
1326 fix1 = _mm256_add_ps(fix1,tx);
1327 fiy1 = _mm256_add_ps(fiy1,ty);
1328 fiz1 = _mm256_add_ps(fiz1,tz);
1330 fjx0 = _mm256_add_ps(fjx0,tx);
1331 fjy0 = _mm256_add_ps(fjy0,ty);
1332 fjz0 = _mm256_add_ps(fjz0,tz);
1336 /**************************
1337 * CALCULATE INTERACTIONS *
1338 **************************/
1340 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1343 /* Compute parameters for interactions between i and j atoms */
1344 qq20 = _mm256_mul_ps(iq2,jq0);
1346 /* REACTION-FIELD ELECTROSTATICS */
1347 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1349 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1353 fscal = _mm256_and_ps(fscal,cutoff_mask);
1355 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1357 /* Calculate temporary vectorial force */
1358 tx = _mm256_mul_ps(fscal,dx20);
1359 ty = _mm256_mul_ps(fscal,dy20);
1360 tz = _mm256_mul_ps(fscal,dz20);
1362 /* Update vectorial force */
1363 fix2 = _mm256_add_ps(fix2,tx);
1364 fiy2 = _mm256_add_ps(fiy2,ty);
1365 fiz2 = _mm256_add_ps(fiz2,tz);
1367 fjx0 = _mm256_add_ps(fjx0,tx);
1368 fjy0 = _mm256_add_ps(fjy0,ty);
1369 fjz0 = _mm256_add_ps(fjz0,tz);
1373 /**************************
1374 * CALCULATE INTERACTIONS *
1375 **************************/
1377 if (gmx_mm256_any_lt(rsq30,rcutoff2))
1380 /* Compute parameters for interactions between i and j atoms */
1381 qq30 = _mm256_mul_ps(iq3,jq0);
1383 /* REACTION-FIELD ELECTROSTATICS */
1384 felec = _mm256_mul_ps(qq30,_mm256_sub_ps(_mm256_mul_ps(rinv30,rinvsq30),krf2));
1386 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1390 fscal = _mm256_and_ps(fscal,cutoff_mask);
1392 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1394 /* Calculate temporary vectorial force */
1395 tx = _mm256_mul_ps(fscal,dx30);
1396 ty = _mm256_mul_ps(fscal,dy30);
1397 tz = _mm256_mul_ps(fscal,dz30);
1399 /* Update vectorial force */
1400 fix3 = _mm256_add_ps(fix3,tx);
1401 fiy3 = _mm256_add_ps(fiy3,ty);
1402 fiz3 = _mm256_add_ps(fiz3,tz);
1404 fjx0 = _mm256_add_ps(fjx0,tx);
1405 fjy0 = _mm256_add_ps(fjy0,ty);
1406 fjz0 = _mm256_add_ps(fjz0,tz);
1410 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1411 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1412 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1413 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1414 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1415 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1416 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1417 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1419 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1421 /* Inner loop uses 123 flops */
1424 /* End of innermost loop */
1426 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1427 f+i_coord_offset,fshift+i_shift_offset);
1429 /* Increment number of inner iterations */
1430 inneriter += j_index_end - j_index_start;
1432 /* Outer loop uses 24 flops */
1435 /* Increment number of outer iterations */
1438 /* Update outer/inner flops */
1440 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*123);