2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: None
56 * Geometry: Water4-Particle
57 * Calculate force/pot: PotentialAndForce
/*
 * nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_single
 *
 * Nonbonded kernel using 8-wide single-precision AVX registers. For every
 * neighbor list entry it evaluates Ewald electrostatics (analytical PME
 * correction, potential shifted by sh_ewald, cut off at fr->rcoulomb) between
 * three sites of an i water and single j particles, and accumulates both the
 * potential and the forces. No VdW term is computed.
 *
 * Parameters, as used in the visible lines:
 *   nlist        supplies jindex, shift and iinr
 *   xx, ff       not referenced in the visible lines; presumably the sources
 *                of the x and f pointers -- TODO confirm
 *   fr           shift_vec, fshift, epsfac, rcoulomb and the ic-> Ewald data
 *   mdatoms      chargeA is the per-atom charge array
 *   kernel_data  energygrp_elec+ggid receives the summed potential
 *   nrnb         flop counters, updated through inc_nrnb at the end
 *
 * Structure: outer loop over nri lists; for each list a full-width inner loop
 * over groups of eight j entries, followed by a masked pass for a final group
 * that contains dummy (negative) entries; then the i forces, shift forces and
 * potential are written back.
 *
 * NOTE(review): each line of this listing starts with a number that looks
 * like the original line number, and the gaps in that numbering show that
 * lines are missing (no braces are visible, and nri, x, f, jjnr, ggid,
 * outeriter and fscal are used without a visible assignment). Treat the
 * comments here as describing only what is visible.
 */
60 nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the eight positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr1;
89 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 real * vdwioffsetptr3;
93 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
102 __m128i ewitab_lo,ewitab_hi;
103 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
104 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
106 __m256 dummy_mask,cutoff_mask;
107 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
108 __m256 one = _mm256_set1_ps(1.0);
109 __m256 two = _mm256_set1_ps(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm256_set1_ps(fr->epsfac);
122 charge = mdatoms->chargeA;
124 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
125 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
126 beta2 = _mm256_mul_ps(beta,beta);
127 beta3 = _mm256_mul_ps(beta,beta2);
129 ewtab = fr->ic->tabq_coul_FDV0;
130 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
131 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
133 /* Setup water-specific parameters */
/* Charges of i sites 1..3, pre-multiplied by facel (fr->epsfac). They are read once, from the first list entry nlist->iinr[0], and are not reassigned in the visible lines. */
134 inr = nlist->iinr[0];
135 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
136 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
137 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
139 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
140 rcutoff_scalar = fr->rcoulomb;
141 rcutoff = _mm256_set1_ps(rcutoff_scalar);
142 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
144 /* Zero the j indices up front so the compiler does not warn about possibly uninitialised use */
145 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
158 for(iidx=0;iidx<4*DIM;iidx++)
163 /* Start outer loop over neighborlists */
164 for(iidx=0; iidx<nri; iidx++)
166 /* Load shift vector for this list */
167 i_shift_offset = DIM*shiftidx[iidx];
169 /* Load limits for loop over neighbors */
170 j_index_start = jindex[iidx];
171 j_index_end = jindex[iidx+1];
173 /* Get outer coordinate index */
175 i_coord_offset = DIM*inr;
177 /* Load i particle coords and add shift vector */
/* The coordinate pointer passed is x+i_coord_offset+DIM, i.e. one rvec past the first site of the i water, so ix1..iz3 hold sites 1..3. */
178 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
179 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
181 fix1 = _mm256_setzero_ps();
182 fiy1 = _mm256_setzero_ps();
183 fiz1 = _mm256_setzero_ps();
184 fix2 = _mm256_setzero_ps();
185 fiy2 = _mm256_setzero_ps();
186 fiz2 = _mm256_setzero_ps();
187 fix3 = _mm256_setzero_ps();
188 fiy3 = _mm256_setzero_ps();
189 fiz3 = _mm256_setzero_ps();
191 /* Reset potential sums */
192 velecsum = _mm256_setzero_ps();
194 /* Start inner kernel loop */
/* Full-width pass: advances eight j entries per step and stops at the first group whose eighth entry, jjnr[jidx+7], is negative. */
195 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
198 /* Get j neighbor index, and coordinate index */
207 j_coord_offsetA = DIM*jnrA;
208 j_coord_offsetB = DIM*jnrB;
209 j_coord_offsetC = DIM*jnrC;
210 j_coord_offsetD = DIM*jnrD;
211 j_coord_offsetE = DIM*jnrE;
212 j_coord_offsetF = DIM*jnrF;
213 j_coord_offsetG = DIM*jnrG;
214 j_coord_offsetH = DIM*jnrH;
216 /* load j atom coordinates */
217 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
218 x+j_coord_offsetC,x+j_coord_offsetD,
219 x+j_coord_offsetE,x+j_coord_offsetF,
220 x+j_coord_offsetG,x+j_coord_offsetH,
223 /* Calculate displacement vector */
224 dx10 = _mm256_sub_ps(ix1,jx0);
225 dy10 = _mm256_sub_ps(iy1,jy0);
226 dz10 = _mm256_sub_ps(iz1,jz0);
227 dx20 = _mm256_sub_ps(ix2,jx0);
228 dy20 = _mm256_sub_ps(iy2,jy0);
229 dz20 = _mm256_sub_ps(iz2,jz0);
230 dx30 = _mm256_sub_ps(ix3,jx0);
231 dy30 = _mm256_sub_ps(iy3,jy0);
232 dz30 = _mm256_sub_ps(iz3,jz0);
234 /* Calculate squared distance and things based on it */
235 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
236 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
237 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
239 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
240 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
241 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
243 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
244 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
245 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
247 /* Load parameters for j particles */
248 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
249 charge+jnrC+0,charge+jnrD+0,
250 charge+jnrE+0,charge+jnrF+0,
251 charge+jnrG+0,charge+jnrH+0);
253 fjx0 = _mm256_setzero_ps();
254 fjy0 = _mm256_setzero_ps();
255 fjz0 = _mm256_setzero_ps();
257 /**************************
258 * CALCULATE INTERACTIONS *
259 **************************/
/* Interaction of i site 1 with the eight j particles; the blocks for sites 2 and 3 further down repeat the same sequence. */
/* NOTE(review): presumably true when at least one lane has rsq10 < rcutoff2 -- confirm against gmx_mm256_any_lt. */
261 if (gmx_mm256_any_lt(rsq10,rcutoff2))
264 r10 = _mm256_mul_ps(rsq10,rinv10);
266 /* Compute parameters for interactions between i and j atoms */
267 qq10 = _mm256_mul_ps(iq1,jq0);
269 /* EWALD ELECTROSTATICS */
271 /* Analytical PME correction */
272 zeta2 = _mm256_mul_ps(beta2,rsq10);
273 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
274 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
/* felec = qq10 * (pmecorrF*beta3 + rinv10^3) */
275 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
276 felec = _mm256_mul_ps(qq10,felec);
277 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
278 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
/* velec = qq10 * (rinv10 - sh_ewald - pmecorrV*beta) */
279 velec = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
280 velec = _mm256_mul_ps(qq10,velec);
/* All bits set in lanes where rsq10 < rcutoff2, zero elsewhere; AND-ing with it zeroes the lanes outside the cutoff. */
282 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
284 /* Update potential sum for this i atom from the interaction with this j atom. */
285 velec = _mm256_and_ps(velec,cutoff_mask);
286 velecsum = _mm256_add_ps(velecsum,velec);
/* NOTE(review): fscal is not assigned in the visible lines before this masking -- confirm it is set from felec. */
290 fscal = _mm256_and_ps(fscal,cutoff_mask);
292 /* Calculate temporary vectorial force */
293 tx = _mm256_mul_ps(fscal,dx10);
294 ty = _mm256_mul_ps(fscal,dy10);
295 tz = _mm256_mul_ps(fscal,dz10);
297 /* Update vectorial force */
298 fix1 = _mm256_add_ps(fix1,tx);
299 fiy1 = _mm256_add_ps(fiy1,ty);
300 fiz1 = _mm256_add_ps(fiz1,tz);
302 fjx0 = _mm256_add_ps(fjx0,tx);
303 fjy0 = _mm256_add_ps(fjy0,ty);
304 fjz0 = _mm256_add_ps(fjz0,tz);
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_mm256_any_lt(rsq20,rcutoff2))
315 r20 = _mm256_mul_ps(rsq20,rinv20);
317 /* Compute parameters for interactions between i and j atoms */
318 qq20 = _mm256_mul_ps(iq2,jq0);
320 /* EWALD ELECTROSTATICS */
322 /* Analytical PME correction */
323 zeta2 = _mm256_mul_ps(beta2,rsq20);
324 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
325 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
326 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
327 felec = _mm256_mul_ps(qq20,felec);
328 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
329 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
330 velec = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
331 velec = _mm256_mul_ps(qq20,velec);
333 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
335 /* Update potential sum for this i atom from the interaction with this j atom. */
336 velec = _mm256_and_ps(velec,cutoff_mask);
337 velecsum = _mm256_add_ps(velecsum,velec);
341 fscal = _mm256_and_ps(fscal,cutoff_mask);
343 /* Calculate temporary vectorial force */
344 tx = _mm256_mul_ps(fscal,dx20);
345 ty = _mm256_mul_ps(fscal,dy20);
346 tz = _mm256_mul_ps(fscal,dz20);
348 /* Update vectorial force */
349 fix2 = _mm256_add_ps(fix2,tx);
350 fiy2 = _mm256_add_ps(fiy2,ty);
351 fiz2 = _mm256_add_ps(fiz2,tz);
353 fjx0 = _mm256_add_ps(fjx0,tx);
354 fjy0 = _mm256_add_ps(fjy0,ty);
355 fjz0 = _mm256_add_ps(fjz0,tz);
359 /**************************
360 * CALCULATE INTERACTIONS *
361 **************************/
363 if (gmx_mm256_any_lt(rsq30,rcutoff2))
366 r30 = _mm256_mul_ps(rsq30,rinv30);
368 /* Compute parameters for interactions between i and j atoms */
369 qq30 = _mm256_mul_ps(iq3,jq0);
371 /* EWALD ELECTROSTATICS */
373 /* Analytical PME correction */
374 zeta2 = _mm256_mul_ps(beta2,rsq30);
375 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
376 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
377 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
378 felec = _mm256_mul_ps(qq30,felec);
379 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
380 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
381 velec = _mm256_sub_ps(_mm256_sub_ps(rinv30,sh_ewald),pmecorrV);
382 velec = _mm256_mul_ps(qq30,velec);
384 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
386 /* Update potential sum for this i atom from the interaction with this j atom. */
387 velec = _mm256_and_ps(velec,cutoff_mask);
388 velecsum = _mm256_add_ps(velecsum,velec);
392 fscal = _mm256_and_ps(fscal,cutoff_mask);
394 /* Calculate temporary vectorial force */
395 tx = _mm256_mul_ps(fscal,dx30);
396 ty = _mm256_mul_ps(fscal,dy30);
397 tz = _mm256_mul_ps(fscal,dz30);
399 /* Update vectorial force */
400 fix3 = _mm256_add_ps(fix3,tx);
401 fiy3 = _mm256_add_ps(fiy3,ty);
402 fiz3 = _mm256_add_ps(fiz3,tz);
404 fjx0 = _mm256_add_ps(fjx0,tx);
405 fjy0 = _mm256_add_ps(fjy0,ty);
406 fjz0 = _mm256_add_ps(fjz0,tz);
/* Write the j force summed over the three i sites back to f. NOTE(review): the helper name says "decrement", so presumably it subtracts fjx0/fjy0/fjz0 from each j entry -- confirm. */
410 fjptrA = f+j_coord_offsetA;
411 fjptrB = f+j_coord_offsetB;
412 fjptrC = f+j_coord_offsetC;
413 fjptrD = f+j_coord_offsetD;
414 fjptrE = f+j_coord_offsetE;
415 fjptrF = f+j_coord_offsetF;
416 fjptrG = f+j_coord_offsetG;
417 fjptrH = f+j_coord_offsetH;
419 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
421 /* Inner loop uses 330 flops */
/* Masked pass over a final group of eight that contains dummy entries: same arithmetic as above, with the dummy lanes cleared through dummy_mask. NOTE(review): the condition guarding this pass is not among the visible lines. */
427 /* Get j neighbor index, and coordinate index */
428 jnrlistA = jjnr[jidx];
429 jnrlistB = jjnr[jidx+1];
430 jnrlistC = jjnr[jidx+2];
431 jnrlistD = jjnr[jidx+3];
432 jnrlistE = jjnr[jidx+4];
433 jnrlistF = jjnr[jidx+5];
434 jnrlistG = jjnr[jidx+6];
435 jnrlistH = jjnr[jidx+7];
436 /* Sign of each element will be negative for non-real atoms.
437 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
438 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
440 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
441 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
443 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
444 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
445 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
446 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
447 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
448 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
449 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
450 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
451 j_coord_offsetA = DIM*jnrA;
452 j_coord_offsetB = DIM*jnrB;
453 j_coord_offsetC = DIM*jnrC;
454 j_coord_offsetD = DIM*jnrD;
455 j_coord_offsetE = DIM*jnrE;
456 j_coord_offsetF = DIM*jnrF;
457 j_coord_offsetG = DIM*jnrG;
458 j_coord_offsetH = DIM*jnrH;
460 /* load j atom coordinates */
461 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
462 x+j_coord_offsetC,x+j_coord_offsetD,
463 x+j_coord_offsetE,x+j_coord_offsetF,
464 x+j_coord_offsetG,x+j_coord_offsetH,
467 /* Calculate displacement vector */
468 dx10 = _mm256_sub_ps(ix1,jx0);
469 dy10 = _mm256_sub_ps(iy1,jy0);
470 dz10 = _mm256_sub_ps(iz1,jz0);
471 dx20 = _mm256_sub_ps(ix2,jx0);
472 dy20 = _mm256_sub_ps(iy2,jy0);
473 dz20 = _mm256_sub_ps(iz2,jz0);
474 dx30 = _mm256_sub_ps(ix3,jx0);
475 dy30 = _mm256_sub_ps(iy3,jy0);
476 dz30 = _mm256_sub_ps(iz3,jz0);
478 /* Calculate squared distance and things based on it */
479 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
480 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
481 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
483 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
484 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
485 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
487 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
488 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
489 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
491 /* Load parameters for j particles */
492 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
493 charge+jnrC+0,charge+jnrD+0,
494 charge+jnrE+0,charge+jnrF+0,
495 charge+jnrG+0,charge+jnrH+0);
497 fjx0 = _mm256_setzero_ps();
498 fjy0 = _mm256_setzero_ps();
499 fjz0 = _mm256_setzero_ps();
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 if (gmx_mm256_any_lt(rsq10,rcutoff2))
508 r10 = _mm256_mul_ps(rsq10,rinv10);
/* _mm256_andnot_ps(mask,v) computes (~mask)&v, so lanes flagged in dummy_mask become zero. */
509 r10 = _mm256_andnot_ps(dummy_mask,r10);
511 /* Compute parameters for interactions between i and j atoms */
512 qq10 = _mm256_mul_ps(iq1,jq0);
514 /* EWALD ELECTROSTATICS */
516 /* Analytical PME correction */
517 zeta2 = _mm256_mul_ps(beta2,rsq10);
518 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
519 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
520 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
521 felec = _mm256_mul_ps(qq10,felec);
522 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
523 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
524 velec = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
525 velec = _mm256_mul_ps(qq10,velec);
527 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
529 /* Update potential sum for this i atom from the interaction with this j atom. */
530 velec = _mm256_and_ps(velec,cutoff_mask);
531 velec = _mm256_andnot_ps(dummy_mask,velec);
532 velecsum = _mm256_add_ps(velecsum,velec);
536 fscal = _mm256_and_ps(fscal,cutoff_mask);
538 fscal = _mm256_andnot_ps(dummy_mask,fscal);
540 /* Calculate temporary vectorial force */
541 tx = _mm256_mul_ps(fscal,dx10);
542 ty = _mm256_mul_ps(fscal,dy10);
543 tz = _mm256_mul_ps(fscal,dz10);
545 /* Update vectorial force */
546 fix1 = _mm256_add_ps(fix1,tx);
547 fiy1 = _mm256_add_ps(fiy1,ty);
548 fiz1 = _mm256_add_ps(fiz1,tz);
550 fjx0 = _mm256_add_ps(fjx0,tx);
551 fjy0 = _mm256_add_ps(fjy0,ty);
552 fjz0 = _mm256_add_ps(fjz0,tz);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 if (gmx_mm256_any_lt(rsq20,rcutoff2))
563 r20 = _mm256_mul_ps(rsq20,rinv20);
564 r20 = _mm256_andnot_ps(dummy_mask,r20);
566 /* Compute parameters for interactions between i and j atoms */
567 qq20 = _mm256_mul_ps(iq2,jq0);
569 /* EWALD ELECTROSTATICS */
571 /* Analytical PME correction */
572 zeta2 = _mm256_mul_ps(beta2,rsq20);
573 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
574 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
575 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
576 felec = _mm256_mul_ps(qq20,felec);
577 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
578 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
579 velec = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
580 velec = _mm256_mul_ps(qq20,velec);
582 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 velec = _mm256_and_ps(velec,cutoff_mask);
586 velec = _mm256_andnot_ps(dummy_mask,velec);
587 velecsum = _mm256_add_ps(velecsum,velec);
591 fscal = _mm256_and_ps(fscal,cutoff_mask);
593 fscal = _mm256_andnot_ps(dummy_mask,fscal);
595 /* Calculate temporary vectorial force */
596 tx = _mm256_mul_ps(fscal,dx20);
597 ty = _mm256_mul_ps(fscal,dy20);
598 tz = _mm256_mul_ps(fscal,dz20);
600 /* Update vectorial force */
601 fix2 = _mm256_add_ps(fix2,tx);
602 fiy2 = _mm256_add_ps(fiy2,ty);
603 fiz2 = _mm256_add_ps(fiz2,tz);
605 fjx0 = _mm256_add_ps(fjx0,tx);
606 fjy0 = _mm256_add_ps(fjy0,ty);
607 fjz0 = _mm256_add_ps(fjz0,tz);
611 /**************************
612 * CALCULATE INTERACTIONS *
613 **************************/
615 if (gmx_mm256_any_lt(rsq30,rcutoff2))
618 r30 = _mm256_mul_ps(rsq30,rinv30);
619 r30 = _mm256_andnot_ps(dummy_mask,r30);
621 /* Compute parameters for interactions between i and j atoms */
622 qq30 = _mm256_mul_ps(iq3,jq0);
624 /* EWALD ELECTROSTATICS */
626 /* Analytical PME correction */
627 zeta2 = _mm256_mul_ps(beta2,rsq30);
628 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
629 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
630 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
631 felec = _mm256_mul_ps(qq30,felec);
632 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
633 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
634 velec = _mm256_sub_ps(_mm256_sub_ps(rinv30,sh_ewald),pmecorrV);
635 velec = _mm256_mul_ps(qq30,velec);
637 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
639 /* Update potential sum for this i atom from the interaction with this j atom. */
640 velec = _mm256_and_ps(velec,cutoff_mask);
641 velec = _mm256_andnot_ps(dummy_mask,velec);
642 velecsum = _mm256_add_ps(velecsum,velec);
646 fscal = _mm256_and_ps(fscal,cutoff_mask);
648 fscal = _mm256_andnot_ps(dummy_mask,fscal);
650 /* Calculate temporary vectorial force */
651 tx = _mm256_mul_ps(fscal,dx30);
652 ty = _mm256_mul_ps(fscal,dy30);
653 tz = _mm256_mul_ps(fscal,dz30);
655 /* Update vectorial force */
656 fix3 = _mm256_add_ps(fix3,tx);
657 fiy3 = _mm256_add_ps(fiy3,ty);
658 fiz3 = _mm256_add_ps(fiz3,tz);
660 fjx0 = _mm256_add_ps(fjx0,tx);
661 fjy0 = _mm256_add_ps(fjy0,ty);
662 fjz0 = _mm256_add_ps(fjz0,tz);
/* Dummy entries (negative list index) are pointed at scratch instead of f, so the write-back below does not touch a real atom's force for them. */
666 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
667 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
668 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
669 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
670 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
671 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
672 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
673 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
675 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
677 /* Inner loop uses 333 flops */
680 /* End of innermost loop */
/* Hand the accumulated forces of i sites 1..3 to the helper, together with the force array (again offset by one rvec) and this list's shift-force slot. */
682 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
683 f+i_coord_offset+DIM,fshift+i_shift_offset);
686 /* Update potential energies */
687 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
689 /* Increment number of inner iterations */
690 inneriter += j_index_end - j_index_start;
692 /* Outer loop uses 19 flops */
695 /* Increment number of outer iterations */
698 /* Update outer/inner flops */
/* Flop accounting: 19 per outer iteration plus 333 per inner entry, charged to eNR_NBKERNEL_ELEC_W4_VF. */
700 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*333);
703 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_single
704 * Electrostatics interaction: Ewald
705 * VdW interaction: None
706 * Geometry: Water4-Particle
707 * Calculate force/pot: Force
710 nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_avx_256_single
711 (t_nblist * gmx_restrict nlist,
712 rvec * gmx_restrict xx,
713 rvec * gmx_restrict ff,
714 t_forcerec * gmx_restrict fr,
715 t_mdatoms * gmx_restrict mdatoms,
716 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
717 t_nrnb * gmx_restrict nrnb)
719 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
720 * just 0 for non-waters.
721 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
722 * jnr indices corresponding to data put in the four positions in the SIMD register.
724 int i_shift_offset,i_coord_offset,outeriter,inneriter;
725 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
726 int jnrA,jnrB,jnrC,jnrD;
727 int jnrE,jnrF,jnrG,jnrH;
728 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
729 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
730 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
731 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
732 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
734 real *shiftvec,*fshift,*x,*f;
735 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
737 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
738 real * vdwioffsetptr1;
739 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
740 real * vdwioffsetptr2;
741 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
742 real * vdwioffsetptr3;
743 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
744 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
745 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
746 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
747 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
748 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
749 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
752 __m128i ewitab_lo,ewitab_hi;
753 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
754 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
756 __m256 dummy_mask,cutoff_mask;
757 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
758 __m256 one = _mm256_set1_ps(1.0);
759 __m256 two = _mm256_set1_ps(2.0);
765 jindex = nlist->jindex;
767 shiftidx = nlist->shift;
769 shiftvec = fr->shift_vec[0];
770 fshift = fr->fshift[0];
771 facel = _mm256_set1_ps(fr->epsfac);
772 charge = mdatoms->chargeA;
774 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
775 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
776 beta2 = _mm256_mul_ps(beta,beta);
777 beta3 = _mm256_mul_ps(beta,beta2);
779 ewtab = fr->ic->tabq_coul_F;
780 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
781 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
783 /* Setup water-specific parameters */
784 inr = nlist->iinr[0];
785 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
786 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
787 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
789 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
790 rcutoff_scalar = fr->rcoulomb;
791 rcutoff = _mm256_set1_ps(rcutoff_scalar);
792 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
794 /* Avoid stupid compiler warnings */
795 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
808 for(iidx=0;iidx<4*DIM;iidx++)
813 /* Start outer loop over neighborlists */
814 for(iidx=0; iidx<nri; iidx++)
816 /* Load shift vector for this list */
817 i_shift_offset = DIM*shiftidx[iidx];
819 /* Load limits for loop over neighbors */
820 j_index_start = jindex[iidx];
821 j_index_end = jindex[iidx+1];
823 /* Get outer coordinate index */
825 i_coord_offset = DIM*inr;
827 /* Load i particle coords and add shift vector */
828 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
829 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
831 fix1 = _mm256_setzero_ps();
832 fiy1 = _mm256_setzero_ps();
833 fiz1 = _mm256_setzero_ps();
834 fix2 = _mm256_setzero_ps();
835 fiy2 = _mm256_setzero_ps();
836 fiz2 = _mm256_setzero_ps();
837 fix3 = _mm256_setzero_ps();
838 fiy3 = _mm256_setzero_ps();
839 fiz3 = _mm256_setzero_ps();
841 /* Start inner kernel loop */
842 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
845 /* Get j neighbor index, and coordinate index */
854 j_coord_offsetA = DIM*jnrA;
855 j_coord_offsetB = DIM*jnrB;
856 j_coord_offsetC = DIM*jnrC;
857 j_coord_offsetD = DIM*jnrD;
858 j_coord_offsetE = DIM*jnrE;
859 j_coord_offsetF = DIM*jnrF;
860 j_coord_offsetG = DIM*jnrG;
861 j_coord_offsetH = DIM*jnrH;
863 /* load j atom coordinates */
864 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
865 x+j_coord_offsetC,x+j_coord_offsetD,
866 x+j_coord_offsetE,x+j_coord_offsetF,
867 x+j_coord_offsetG,x+j_coord_offsetH,
870 /* Calculate displacement vector */
871 dx10 = _mm256_sub_ps(ix1,jx0);
872 dy10 = _mm256_sub_ps(iy1,jy0);
873 dz10 = _mm256_sub_ps(iz1,jz0);
874 dx20 = _mm256_sub_ps(ix2,jx0);
875 dy20 = _mm256_sub_ps(iy2,jy0);
876 dz20 = _mm256_sub_ps(iz2,jz0);
877 dx30 = _mm256_sub_ps(ix3,jx0);
878 dy30 = _mm256_sub_ps(iy3,jy0);
879 dz30 = _mm256_sub_ps(iz3,jz0);
881 /* Calculate squared distance and things based on it */
882 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
883 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
884 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
886 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
887 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
888 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
890 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
891 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
892 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
894 /* Load parameters for j particles */
895 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
896 charge+jnrC+0,charge+jnrD+0,
897 charge+jnrE+0,charge+jnrF+0,
898 charge+jnrG+0,charge+jnrH+0);
900 fjx0 = _mm256_setzero_ps();
901 fjy0 = _mm256_setzero_ps();
902 fjz0 = _mm256_setzero_ps();
904 /**************************
905 * CALCULATE INTERACTIONS *
906 **************************/
908 if (gmx_mm256_any_lt(rsq10,rcutoff2))
911 r10 = _mm256_mul_ps(rsq10,rinv10);
913 /* Compute parameters for interactions between i and j atoms */
914 qq10 = _mm256_mul_ps(iq1,jq0);
916 /* EWALD ELECTROSTATICS */
918 /* Analytical PME correction */
919 zeta2 = _mm256_mul_ps(beta2,rsq10);
920 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
921 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
922 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
923 felec = _mm256_mul_ps(qq10,felec);
925 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
929 fscal = _mm256_and_ps(fscal,cutoff_mask);
931 /* Calculate temporary vectorial force */
932 tx = _mm256_mul_ps(fscal,dx10);
933 ty = _mm256_mul_ps(fscal,dy10);
934 tz = _mm256_mul_ps(fscal,dz10);
936 /* Update vectorial force */
937 fix1 = _mm256_add_ps(fix1,tx);
938 fiy1 = _mm256_add_ps(fiy1,ty);
939 fiz1 = _mm256_add_ps(fiz1,tz);
941 fjx0 = _mm256_add_ps(fjx0,tx);
942 fjy0 = _mm256_add_ps(fjy0,ty);
943 fjz0 = _mm256_add_ps(fjz0,tz);
947 /**************************
948 * CALCULATE INTERACTIONS *
949 **************************/
951 if (gmx_mm256_any_lt(rsq20,rcutoff2))
954 r20 = _mm256_mul_ps(rsq20,rinv20);
956 /* Compute parameters for interactions between i and j atoms */
957 qq20 = _mm256_mul_ps(iq2,jq0);
959 /* EWALD ELECTROSTATICS */
961 /* Analytical PME correction */
962 zeta2 = _mm256_mul_ps(beta2,rsq20);
963 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
964 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
965 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
966 felec = _mm256_mul_ps(qq20,felec);
968 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
972 fscal = _mm256_and_ps(fscal,cutoff_mask);
974 /* Calculate temporary vectorial force */
975 tx = _mm256_mul_ps(fscal,dx20);
976 ty = _mm256_mul_ps(fscal,dy20);
977 tz = _mm256_mul_ps(fscal,dz20);
979 /* Update vectorial force */
980 fix2 = _mm256_add_ps(fix2,tx);
981 fiy2 = _mm256_add_ps(fiy2,ty);
982 fiz2 = _mm256_add_ps(fiz2,tz);
984 fjx0 = _mm256_add_ps(fjx0,tx);
985 fjy0 = _mm256_add_ps(fjy0,ty);
986 fjz0 = _mm256_add_ps(fjz0,tz);
990 /**************************
991 * CALCULATE INTERACTIONS *
992 **************************/
994 if (gmx_mm256_any_lt(rsq30,rcutoff2))
997 r30 = _mm256_mul_ps(rsq30,rinv30);
999 /* Compute parameters for interactions between i and j atoms */
1000 qq30 = _mm256_mul_ps(iq3,jq0);
1002 /* EWALD ELECTROSTATICS */
1004 /* Analytical PME correction */
1005 zeta2 = _mm256_mul_ps(beta2,rsq30);
1006 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
1007 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1008 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1009 felec = _mm256_mul_ps(qq30,felec);
1011 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1015 fscal = _mm256_and_ps(fscal,cutoff_mask);
1017 /* Calculate temporary vectorial force */
1018 tx = _mm256_mul_ps(fscal,dx30);
1019 ty = _mm256_mul_ps(fscal,dy30);
1020 tz = _mm256_mul_ps(fscal,dz30);
1022 /* Update vectorial force */
1023 fix3 = _mm256_add_ps(fix3,tx);
1024 fiy3 = _mm256_add_ps(fiy3,ty);
1025 fiz3 = _mm256_add_ps(fiz3,tz);
1027 fjx0 = _mm256_add_ps(fjx0,tx);
1028 fjy0 = _mm256_add_ps(fjy0,ty);
1029 fjz0 = _mm256_add_ps(fjz0,tz);
1033 fjptrA = f+j_coord_offsetA;
1034 fjptrB = f+j_coord_offsetB;
1035 fjptrC = f+j_coord_offsetC;
1036 fjptrD = f+j_coord_offsetD;
1037 fjptrE = f+j_coord_offsetE;
1038 fjptrF = f+j_coord_offsetF;
1039 fjptrG = f+j_coord_offsetG;
1040 fjptrH = f+j_coord_offsetH;
1042 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1044 /* Inner loop uses 180 flops */
1047 if(jidx<j_index_end)
1050 /* Get j neighbor index, and coordinate index */
1051 jnrlistA = jjnr[jidx];
1052 jnrlistB = jjnr[jidx+1];
1053 jnrlistC = jjnr[jidx+2];
1054 jnrlistD = jjnr[jidx+3];
1055 jnrlistE = jjnr[jidx+4];
1056 jnrlistF = jjnr[jidx+5];
1057 jnrlistG = jjnr[jidx+6];
1058 jnrlistH = jjnr[jidx+7];
1059 /* Sign of each element will be negative for non-real atoms.
1060 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1061 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1063 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1064 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1066 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1067 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1068 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1069 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1070 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1071 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1072 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1073 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1074 j_coord_offsetA = DIM*jnrA;
1075 j_coord_offsetB = DIM*jnrB;
1076 j_coord_offsetC = DIM*jnrC;
1077 j_coord_offsetD = DIM*jnrD;
1078 j_coord_offsetE = DIM*jnrE;
1079 j_coord_offsetF = DIM*jnrF;
1080 j_coord_offsetG = DIM*jnrG;
1081 j_coord_offsetH = DIM*jnrH;
1083 /* load j atom coordinates */
1084 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1085 x+j_coord_offsetC,x+j_coord_offsetD,
1086 x+j_coord_offsetE,x+j_coord_offsetF,
1087 x+j_coord_offsetG,x+j_coord_offsetH,
1090 /* Calculate displacement vector */
1091 dx10 = _mm256_sub_ps(ix1,jx0);
1092 dy10 = _mm256_sub_ps(iy1,jy0);
1093 dz10 = _mm256_sub_ps(iz1,jz0);
1094 dx20 = _mm256_sub_ps(ix2,jx0);
1095 dy20 = _mm256_sub_ps(iy2,jy0);
1096 dz20 = _mm256_sub_ps(iz2,jz0);
1097 dx30 = _mm256_sub_ps(ix3,jx0);
1098 dy30 = _mm256_sub_ps(iy3,jy0);
1099 dz30 = _mm256_sub_ps(iz3,jz0);
1101 /* Calculate squared distance and things based on it */
1102 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1103 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1104 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1106 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1107 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1108 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
1110 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1111 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1112 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
1114 /* Load parameters for j particles */
1115 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1116 charge+jnrC+0,charge+jnrD+0,
1117 charge+jnrE+0,charge+jnrF+0,
1118 charge+jnrG+0,charge+jnrH+0);
1120 fjx0 = _mm256_setzero_ps();
1121 fjy0 = _mm256_setzero_ps();
1122 fjz0 = _mm256_setzero_ps();
1124 /**************************
1125 * CALCULATE INTERACTIONS *
1126 **************************/
1128 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1131 r10 = _mm256_mul_ps(rsq10,rinv10);
1132 r10 = _mm256_andnot_ps(dummy_mask,r10);
1134 /* Compute parameters for interactions between i and j atoms */
1135 qq10 = _mm256_mul_ps(iq1,jq0);
1137 /* EWALD ELECTROSTATICS */
1139 /* Analytical PME correction */
1140 zeta2 = _mm256_mul_ps(beta2,rsq10);
1141 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
1142 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1143 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1144 felec = _mm256_mul_ps(qq10,felec);
1146 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1150 fscal = _mm256_and_ps(fscal,cutoff_mask);
1152 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1154 /* Calculate temporary vectorial force */
1155 tx = _mm256_mul_ps(fscal,dx10);
1156 ty = _mm256_mul_ps(fscal,dy10);
1157 tz = _mm256_mul_ps(fscal,dz10);
1159 /* Update vectorial force */
1160 fix1 = _mm256_add_ps(fix1,tx);
1161 fiy1 = _mm256_add_ps(fiy1,ty);
1162 fiz1 = _mm256_add_ps(fiz1,tz);
1164 fjx0 = _mm256_add_ps(fjx0,tx);
1165 fjy0 = _mm256_add_ps(fjy0,ty);
1166 fjz0 = _mm256_add_ps(fjz0,tz);
1170 /**************************
1171 * CALCULATE INTERACTIONS *
1172 **************************/
1174 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1177 r20 = _mm256_mul_ps(rsq20,rinv20);
1178 r20 = _mm256_andnot_ps(dummy_mask,r20);
1180 /* Compute parameters for interactions between i and j atoms */
1181 qq20 = _mm256_mul_ps(iq2,jq0);
1183 /* EWALD ELECTROSTATICS */
1185 /* Analytical PME correction */
1186 zeta2 = _mm256_mul_ps(beta2,rsq20);
1187 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1188 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1189 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1190 felec = _mm256_mul_ps(qq20,felec);
1192 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1196 fscal = _mm256_and_ps(fscal,cutoff_mask);
1198 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1200 /* Calculate temporary vectorial force */
1201 tx = _mm256_mul_ps(fscal,dx20);
1202 ty = _mm256_mul_ps(fscal,dy20);
1203 tz = _mm256_mul_ps(fscal,dz20);
1205 /* Update vectorial force */
1206 fix2 = _mm256_add_ps(fix2,tx);
1207 fiy2 = _mm256_add_ps(fiy2,ty);
1208 fiz2 = _mm256_add_ps(fiz2,tz);
1210 fjx0 = _mm256_add_ps(fjx0,tx);
1211 fjy0 = _mm256_add_ps(fjy0,ty);
1212 fjz0 = _mm256_add_ps(fjz0,tz);
1216 /**************************
1217 * CALCULATE INTERACTIONS *
1218 **************************/
1220 if (gmx_mm256_any_lt(rsq30,rcutoff2))
1223 r30 = _mm256_mul_ps(rsq30,rinv30);
1224 r30 = _mm256_andnot_ps(dummy_mask,r30);
1226 /* Compute parameters for interactions between i and j atoms */
1227 qq30 = _mm256_mul_ps(iq3,jq0);
1229 /* EWALD ELECTROSTATICS */
1231 /* Analytical PME correction */
1232 zeta2 = _mm256_mul_ps(beta2,rsq30);
1233 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
1234 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1235 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1236 felec = _mm256_mul_ps(qq30,felec);
1238 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1242 fscal = _mm256_and_ps(fscal,cutoff_mask);
1244 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1246 /* Calculate temporary vectorial force */
1247 tx = _mm256_mul_ps(fscal,dx30);
1248 ty = _mm256_mul_ps(fscal,dy30);
1249 tz = _mm256_mul_ps(fscal,dz30);
1251 /* Update vectorial force */
1252 fix3 = _mm256_add_ps(fix3,tx);
1253 fiy3 = _mm256_add_ps(fiy3,ty);
1254 fiz3 = _mm256_add_ps(fiz3,tz);
1256 fjx0 = _mm256_add_ps(fjx0,tx);
1257 fjy0 = _mm256_add_ps(fjy0,ty);
1258 fjz0 = _mm256_add_ps(fjz0,tz);
1262 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1263 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1264 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1265 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1266 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1267 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1268 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1269 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1271 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1273 /* Inner loop uses 183 flops */
1276 /* End of innermost loop */
1278 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1279 f+i_coord_offset+DIM,fshift+i_shift_offset);
1281 /* Increment number of inner iterations */
1282 inneriter += j_index_end - j_index_start;
1284 /* Outer loop uses 18 flops */
1287 /* Increment number of outer iterations */
1290 /* Update outer/inner flops */
1292 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*183);