2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr1;
73 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
81 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
82 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
83 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
86 __m128i ewitab_lo,ewitab_hi;
87 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
88 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
90 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
91 real rswitch_scalar,d_scalar;
92 __m256 dummy_mask,cutoff_mask;
93 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
94 __m256 one = _mm256_set1_ps(1.0);
95 __m256 two = _mm256_set1_ps(2.0);
101 jindex = nlist->jindex;
103 shiftidx = nlist->shift;
105 shiftvec = fr->shift_vec[0];
106 fshift = fr->fshift[0];
107 facel = _mm256_set1_ps(fr->epsfac);
108 charge = mdatoms->chargeA;
110 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
111 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
112 beta2 = _mm256_mul_ps(beta,beta);
113 beta3 = _mm256_mul_ps(beta,beta2);
115 ewtab = fr->ic->tabq_coul_FDV0;
116 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
117 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
119 /* Setup water-specific parameters */
120 inr = nlist->iinr[0];
121 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
122 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
123 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
125 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
126 rcutoff_scalar = fr->rcoulomb;
127 rcutoff = _mm256_set1_ps(rcutoff_scalar);
128 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
130 rswitch_scalar = fr->rcoulomb_switch;
131 rswitch = _mm256_set1_ps(rswitch_scalar);
132 /* Setup switch parameters */
133 d_scalar = rcutoff_scalar-rswitch_scalar;
134 d = _mm256_set1_ps(d_scalar);
135 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
136 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
137 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
138 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
139 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
140 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
142 /* Avoid stupid compiler warnings */
143 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
156 for(iidx=0;iidx<4*DIM;iidx++)
161 /* Start outer loop over neighborlists */
162 for(iidx=0; iidx<nri; iidx++)
164 /* Load shift vector for this list */
165 i_shift_offset = DIM*shiftidx[iidx];
167 /* Load limits for loop over neighbors */
168 j_index_start = jindex[iidx];
169 j_index_end = jindex[iidx+1];
171 /* Get outer coordinate index */
173 i_coord_offset = DIM*inr;
175 /* Load i particle coords and add shift vector */
176 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
177 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
179 fix1 = _mm256_setzero_ps();
180 fiy1 = _mm256_setzero_ps();
181 fiz1 = _mm256_setzero_ps();
182 fix2 = _mm256_setzero_ps();
183 fiy2 = _mm256_setzero_ps();
184 fiz2 = _mm256_setzero_ps();
185 fix3 = _mm256_setzero_ps();
186 fiy3 = _mm256_setzero_ps();
187 fiz3 = _mm256_setzero_ps();
189 /* Reset potential sums */
190 velecsum = _mm256_setzero_ps();
192 /* Start inner kernel loop */
193 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
196 /* Get j neighbor index, and coordinate index */
205 j_coord_offsetA = DIM*jnrA;
206 j_coord_offsetB = DIM*jnrB;
207 j_coord_offsetC = DIM*jnrC;
208 j_coord_offsetD = DIM*jnrD;
209 j_coord_offsetE = DIM*jnrE;
210 j_coord_offsetF = DIM*jnrF;
211 j_coord_offsetG = DIM*jnrG;
212 j_coord_offsetH = DIM*jnrH;
214 /* load j atom coordinates */
215 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
216 x+j_coord_offsetC,x+j_coord_offsetD,
217 x+j_coord_offsetE,x+j_coord_offsetF,
218 x+j_coord_offsetG,x+j_coord_offsetH,
221 /* Calculate displacement vector */
222 dx10 = _mm256_sub_ps(ix1,jx0);
223 dy10 = _mm256_sub_ps(iy1,jy0);
224 dz10 = _mm256_sub_ps(iz1,jz0);
225 dx20 = _mm256_sub_ps(ix2,jx0);
226 dy20 = _mm256_sub_ps(iy2,jy0);
227 dz20 = _mm256_sub_ps(iz2,jz0);
228 dx30 = _mm256_sub_ps(ix3,jx0);
229 dy30 = _mm256_sub_ps(iy3,jy0);
230 dz30 = _mm256_sub_ps(iz3,jz0);
232 /* Calculate squared distance and things based on it */
233 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
234 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
235 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
237 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
238 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
239 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
241 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
242 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
243 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
245 /* Load parameters for j particles */
246 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
247 charge+jnrC+0,charge+jnrD+0,
248 charge+jnrE+0,charge+jnrF+0,
249 charge+jnrG+0,charge+jnrH+0);
251 fjx0 = _mm256_setzero_ps();
252 fjy0 = _mm256_setzero_ps();
253 fjz0 = _mm256_setzero_ps();
255 /**************************
256 * CALCULATE INTERACTIONS *
257 **************************/
259 if (gmx_mm256_any_lt(rsq10,rcutoff2))
262 r10 = _mm256_mul_ps(rsq10,rinv10);
264 /* Compute parameters for interactions between i and j atoms */
265 qq10 = _mm256_mul_ps(iq1,jq0);
267 /* EWALD ELECTROSTATICS */
269 /* Analytical PME correction */
270 zeta2 = _mm256_mul_ps(beta2,rsq10);
271 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
272 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
273 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
274 felec = _mm256_mul_ps(qq10,felec);
275 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
276 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
277 velec = _mm256_sub_ps(rinv10,pmecorrV);
278 velec = _mm256_mul_ps(qq10,velec);
280 d = _mm256_sub_ps(r10,rswitch);
281 d = _mm256_max_ps(d,_mm256_setzero_ps());
282 d2 = _mm256_mul_ps(d,d);
283 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
285 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
287 /* Evaluate switch function */
288 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
289 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
290 velec = _mm256_mul_ps(velec,sw);
291 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
293 /* Update potential sum for this i atom from the interaction with this j atom. */
294 velec = _mm256_and_ps(velec,cutoff_mask);
295 velecsum = _mm256_add_ps(velecsum,velec);
299 fscal = _mm256_and_ps(fscal,cutoff_mask);
301 /* Calculate temporary vectorial force */
302 tx = _mm256_mul_ps(fscal,dx10);
303 ty = _mm256_mul_ps(fscal,dy10);
304 tz = _mm256_mul_ps(fscal,dz10);
306 /* Update vectorial force */
307 fix1 = _mm256_add_ps(fix1,tx);
308 fiy1 = _mm256_add_ps(fiy1,ty);
309 fiz1 = _mm256_add_ps(fiz1,tz);
311 fjx0 = _mm256_add_ps(fjx0,tx);
312 fjy0 = _mm256_add_ps(fjy0,ty);
313 fjz0 = _mm256_add_ps(fjz0,tz);
317 /**************************
318 * CALCULATE INTERACTIONS *
319 **************************/
321 if (gmx_mm256_any_lt(rsq20,rcutoff2))
324 r20 = _mm256_mul_ps(rsq20,rinv20);
326 /* Compute parameters for interactions between i and j atoms */
327 qq20 = _mm256_mul_ps(iq2,jq0);
329 /* EWALD ELECTROSTATICS */
331 /* Analytical PME correction */
332 zeta2 = _mm256_mul_ps(beta2,rsq20);
333 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
334 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
335 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
336 felec = _mm256_mul_ps(qq20,felec);
337 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
338 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
339 velec = _mm256_sub_ps(rinv20,pmecorrV);
340 velec = _mm256_mul_ps(qq20,velec);
342 d = _mm256_sub_ps(r20,rswitch);
343 d = _mm256_max_ps(d,_mm256_setzero_ps());
344 d2 = _mm256_mul_ps(d,d);
345 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
347 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
349 /* Evaluate switch function */
350 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
351 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
352 velec = _mm256_mul_ps(velec,sw);
353 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
355 /* Update potential sum for this i atom from the interaction with this j atom. */
356 velec = _mm256_and_ps(velec,cutoff_mask);
357 velecsum = _mm256_add_ps(velecsum,velec);
361 fscal = _mm256_and_ps(fscal,cutoff_mask);
363 /* Calculate temporary vectorial force */
364 tx = _mm256_mul_ps(fscal,dx20);
365 ty = _mm256_mul_ps(fscal,dy20);
366 tz = _mm256_mul_ps(fscal,dz20);
368 /* Update vectorial force */
369 fix2 = _mm256_add_ps(fix2,tx);
370 fiy2 = _mm256_add_ps(fiy2,ty);
371 fiz2 = _mm256_add_ps(fiz2,tz);
373 fjx0 = _mm256_add_ps(fjx0,tx);
374 fjy0 = _mm256_add_ps(fjy0,ty);
375 fjz0 = _mm256_add_ps(fjz0,tz);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 if (gmx_mm256_any_lt(rsq30,rcutoff2))
386 r30 = _mm256_mul_ps(rsq30,rinv30);
388 /* Compute parameters for interactions between i and j atoms */
389 qq30 = _mm256_mul_ps(iq3,jq0);
391 /* EWALD ELECTROSTATICS */
393 /* Analytical PME correction */
394 zeta2 = _mm256_mul_ps(beta2,rsq30);
395 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
396 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
397 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
398 felec = _mm256_mul_ps(qq30,felec);
399 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
400 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
401 velec = _mm256_sub_ps(rinv30,pmecorrV);
402 velec = _mm256_mul_ps(qq30,velec);
404 d = _mm256_sub_ps(r30,rswitch);
405 d = _mm256_max_ps(d,_mm256_setzero_ps());
406 d2 = _mm256_mul_ps(d,d);
407 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
409 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
411 /* Evaluate switch function */
412 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
413 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
414 velec = _mm256_mul_ps(velec,sw);
415 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velec = _mm256_and_ps(velec,cutoff_mask);
419 velecsum = _mm256_add_ps(velecsum,velec);
423 fscal = _mm256_and_ps(fscal,cutoff_mask);
425 /* Calculate temporary vectorial force */
426 tx = _mm256_mul_ps(fscal,dx30);
427 ty = _mm256_mul_ps(fscal,dy30);
428 tz = _mm256_mul_ps(fscal,dz30);
430 /* Update vectorial force */
431 fix3 = _mm256_add_ps(fix3,tx);
432 fiy3 = _mm256_add_ps(fiy3,ty);
433 fiz3 = _mm256_add_ps(fiz3,tz);
435 fjx0 = _mm256_add_ps(fjx0,tx);
436 fjy0 = _mm256_add_ps(fjy0,ty);
437 fjz0 = _mm256_add_ps(fjz0,tz);
441 fjptrA = f+j_coord_offsetA;
442 fjptrB = f+j_coord_offsetB;
443 fjptrC = f+j_coord_offsetC;
444 fjptrD = f+j_coord_offsetD;
445 fjptrE = f+j_coord_offsetE;
446 fjptrF = f+j_coord_offsetF;
447 fjptrG = f+j_coord_offsetG;
448 fjptrH = f+j_coord_offsetH;
450 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
452 /* Inner loop uses 327 flops */
458 /* Get j neighbor index, and coordinate index */
459 jnrlistA = jjnr[jidx];
460 jnrlistB = jjnr[jidx+1];
461 jnrlistC = jjnr[jidx+2];
462 jnrlistD = jjnr[jidx+3];
463 jnrlistE = jjnr[jidx+4];
464 jnrlistF = jjnr[jidx+5];
465 jnrlistG = jjnr[jidx+6];
466 jnrlistH = jjnr[jidx+7];
467 /* Sign of each element will be negative for non-real atoms.
468 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
469 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
471 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
472 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
474 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
475 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
476 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
477 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
478 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
479 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
480 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
481 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
482 j_coord_offsetA = DIM*jnrA;
483 j_coord_offsetB = DIM*jnrB;
484 j_coord_offsetC = DIM*jnrC;
485 j_coord_offsetD = DIM*jnrD;
486 j_coord_offsetE = DIM*jnrE;
487 j_coord_offsetF = DIM*jnrF;
488 j_coord_offsetG = DIM*jnrG;
489 j_coord_offsetH = DIM*jnrH;
491 /* load j atom coordinates */
492 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
493 x+j_coord_offsetC,x+j_coord_offsetD,
494 x+j_coord_offsetE,x+j_coord_offsetF,
495 x+j_coord_offsetG,x+j_coord_offsetH,
498 /* Calculate displacement vector */
499 dx10 = _mm256_sub_ps(ix1,jx0);
500 dy10 = _mm256_sub_ps(iy1,jy0);
501 dz10 = _mm256_sub_ps(iz1,jz0);
502 dx20 = _mm256_sub_ps(ix2,jx0);
503 dy20 = _mm256_sub_ps(iy2,jy0);
504 dz20 = _mm256_sub_ps(iz2,jz0);
505 dx30 = _mm256_sub_ps(ix3,jx0);
506 dy30 = _mm256_sub_ps(iy3,jy0);
507 dz30 = _mm256_sub_ps(iz3,jz0);
509 /* Calculate squared distance and things based on it */
510 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
511 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
512 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
514 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
515 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
516 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
518 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
519 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
520 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
522 /* Load parameters for j particles */
523 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
524 charge+jnrC+0,charge+jnrD+0,
525 charge+jnrE+0,charge+jnrF+0,
526 charge+jnrG+0,charge+jnrH+0);
528 fjx0 = _mm256_setzero_ps();
529 fjy0 = _mm256_setzero_ps();
530 fjz0 = _mm256_setzero_ps();
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 if (gmx_mm256_any_lt(rsq10,rcutoff2))
539 r10 = _mm256_mul_ps(rsq10,rinv10);
540 r10 = _mm256_andnot_ps(dummy_mask,r10);
542 /* Compute parameters for interactions between i and j atoms */
543 qq10 = _mm256_mul_ps(iq1,jq0);
545 /* EWALD ELECTROSTATICS */
547 /* Analytical PME correction */
548 zeta2 = _mm256_mul_ps(beta2,rsq10);
549 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
550 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
551 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
552 felec = _mm256_mul_ps(qq10,felec);
553 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
554 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
555 velec = _mm256_sub_ps(rinv10,pmecorrV);
556 velec = _mm256_mul_ps(qq10,velec);
558 d = _mm256_sub_ps(r10,rswitch);
559 d = _mm256_max_ps(d,_mm256_setzero_ps());
560 d2 = _mm256_mul_ps(d,d);
561 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
563 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
565 /* Evaluate switch function */
566 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
567 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
568 velec = _mm256_mul_ps(velec,sw);
569 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
571 /* Update potential sum for this i atom from the interaction with this j atom. */
572 velec = _mm256_and_ps(velec,cutoff_mask);
573 velec = _mm256_andnot_ps(dummy_mask,velec);
574 velecsum = _mm256_add_ps(velecsum,velec);
578 fscal = _mm256_and_ps(fscal,cutoff_mask);
580 fscal = _mm256_andnot_ps(dummy_mask,fscal);
582 /* Calculate temporary vectorial force */
583 tx = _mm256_mul_ps(fscal,dx10);
584 ty = _mm256_mul_ps(fscal,dy10);
585 tz = _mm256_mul_ps(fscal,dz10);
587 /* Update vectorial force */
588 fix1 = _mm256_add_ps(fix1,tx);
589 fiy1 = _mm256_add_ps(fiy1,ty);
590 fiz1 = _mm256_add_ps(fiz1,tz);
592 fjx0 = _mm256_add_ps(fjx0,tx);
593 fjy0 = _mm256_add_ps(fjy0,ty);
594 fjz0 = _mm256_add_ps(fjz0,tz);
598 /**************************
599 * CALCULATE INTERACTIONS *
600 **************************/
602 if (gmx_mm256_any_lt(rsq20,rcutoff2))
605 r20 = _mm256_mul_ps(rsq20,rinv20);
606 r20 = _mm256_andnot_ps(dummy_mask,r20);
608 /* Compute parameters for interactions between i and j atoms */
609 qq20 = _mm256_mul_ps(iq2,jq0);
611 /* EWALD ELECTROSTATICS */
613 /* Analytical PME correction */
614 zeta2 = _mm256_mul_ps(beta2,rsq20);
615 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
616 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
617 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
618 felec = _mm256_mul_ps(qq20,felec);
619 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
620 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
621 velec = _mm256_sub_ps(rinv20,pmecorrV);
622 velec = _mm256_mul_ps(qq20,velec);
624 d = _mm256_sub_ps(r20,rswitch);
625 d = _mm256_max_ps(d,_mm256_setzero_ps());
626 d2 = _mm256_mul_ps(d,d);
627 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
629 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
631 /* Evaluate switch function */
632 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
633 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
634 velec = _mm256_mul_ps(velec,sw);
635 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
637 /* Update potential sum for this i atom from the interaction with this j atom. */
638 velec = _mm256_and_ps(velec,cutoff_mask);
639 velec = _mm256_andnot_ps(dummy_mask,velec);
640 velecsum = _mm256_add_ps(velecsum,velec);
644 fscal = _mm256_and_ps(fscal,cutoff_mask);
646 fscal = _mm256_andnot_ps(dummy_mask,fscal);
648 /* Calculate temporary vectorial force */
649 tx = _mm256_mul_ps(fscal,dx20);
650 ty = _mm256_mul_ps(fscal,dy20);
651 tz = _mm256_mul_ps(fscal,dz20);
653 /* Update vectorial force */
654 fix2 = _mm256_add_ps(fix2,tx);
655 fiy2 = _mm256_add_ps(fiy2,ty);
656 fiz2 = _mm256_add_ps(fiz2,tz);
658 fjx0 = _mm256_add_ps(fjx0,tx);
659 fjy0 = _mm256_add_ps(fjy0,ty);
660 fjz0 = _mm256_add_ps(fjz0,tz);
664 /**************************
665 * CALCULATE INTERACTIONS *
666 **************************/
668 if (gmx_mm256_any_lt(rsq30,rcutoff2))
671 r30 = _mm256_mul_ps(rsq30,rinv30);
672 r30 = _mm256_andnot_ps(dummy_mask,r30);
674 /* Compute parameters for interactions between i and j atoms */
675 qq30 = _mm256_mul_ps(iq3,jq0);
677 /* EWALD ELECTROSTATICS */
679 /* Analytical PME correction */
680 zeta2 = _mm256_mul_ps(beta2,rsq30);
681 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
682 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
683 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
684 felec = _mm256_mul_ps(qq30,felec);
685 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
686 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
687 velec = _mm256_sub_ps(rinv30,pmecorrV);
688 velec = _mm256_mul_ps(qq30,velec);
690 d = _mm256_sub_ps(r30,rswitch);
691 d = _mm256_max_ps(d,_mm256_setzero_ps());
692 d2 = _mm256_mul_ps(d,d);
693 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
695 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
697 /* Evaluate switch function */
698 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
699 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
700 velec = _mm256_mul_ps(velec,sw);
701 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
703 /* Update potential sum for this i atom from the interaction with this j atom. */
704 velec = _mm256_and_ps(velec,cutoff_mask);
705 velec = _mm256_andnot_ps(dummy_mask,velec);
706 velecsum = _mm256_add_ps(velecsum,velec);
710 fscal = _mm256_and_ps(fscal,cutoff_mask);
712 fscal = _mm256_andnot_ps(dummy_mask,fscal);
714 /* Calculate temporary vectorial force */
715 tx = _mm256_mul_ps(fscal,dx30);
716 ty = _mm256_mul_ps(fscal,dy30);
717 tz = _mm256_mul_ps(fscal,dz30);
719 /* Update vectorial force */
720 fix3 = _mm256_add_ps(fix3,tx);
721 fiy3 = _mm256_add_ps(fiy3,ty);
722 fiz3 = _mm256_add_ps(fiz3,tz);
724 fjx0 = _mm256_add_ps(fjx0,tx);
725 fjy0 = _mm256_add_ps(fjy0,ty);
726 fjz0 = _mm256_add_ps(fjz0,tz);
730 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
731 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
732 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
733 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
734 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
735 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
736 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
737 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
739 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
741 /* Inner loop uses 330 flops */
744 /* End of innermost loop */
746 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
747 f+i_coord_offset+DIM,fshift+i_shift_offset);
750 /* Update potential energies */
751 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
753 /* Increment number of inner iterations */
754 inneriter += j_index_end - j_index_start;
756 /* Outer loop uses 19 flops */
759 /* Increment number of outer iterations */
762 /* Update outer/inner flops */
764 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*330);
767 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_single
768 * Electrostatics interaction: Ewald
769 * VdW interaction: None
770 * Geometry: Water4-Particle
771 * Calculate force/pot: Force
774 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_avx_256_single
775 (t_nblist * gmx_restrict nlist,
776 rvec * gmx_restrict xx,
777 rvec * gmx_restrict ff,
778 t_forcerec * gmx_restrict fr,
779 t_mdatoms * gmx_restrict mdatoms,
780 nb_kernel_data_t * gmx_restrict kernel_data,
781 t_nrnb * gmx_restrict nrnb)
783 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
784 * just 0 for non-waters.
785 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
786 * jnr indices corresponding to data put in the four positions in the SIMD register.
788 int i_shift_offset,i_coord_offset,outeriter,inneriter;
789 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
790 int jnrA,jnrB,jnrC,jnrD;
791 int jnrE,jnrF,jnrG,jnrH;
792 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
793 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
794 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
795 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
796 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
798 real *shiftvec,*fshift,*x,*f;
799 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
801 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
802 real * vdwioffsetptr1;
803 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
804 real * vdwioffsetptr2;
805 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
806 real * vdwioffsetptr3;
807 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
808 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
809 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
810 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
811 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
812 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
813 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
816 __m128i ewitab_lo,ewitab_hi;
817 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
818 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
820 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
821 real rswitch_scalar,d_scalar;
822 __m256 dummy_mask,cutoff_mask;
823 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
824 __m256 one = _mm256_set1_ps(1.0);
825 __m256 two = _mm256_set1_ps(2.0);
831 jindex = nlist->jindex;
833 shiftidx = nlist->shift;
835 shiftvec = fr->shift_vec[0];
836 fshift = fr->fshift[0];
837 facel = _mm256_set1_ps(fr->epsfac);
838 charge = mdatoms->chargeA;
840 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
841 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
842 beta2 = _mm256_mul_ps(beta,beta);
843 beta3 = _mm256_mul_ps(beta,beta2);
845 ewtab = fr->ic->tabq_coul_FDV0;
846 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
847 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
849 /* Setup water-specific parameters */
850 inr = nlist->iinr[0];
851 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
852 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
853 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
855 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
856 rcutoff_scalar = fr->rcoulomb;
857 rcutoff = _mm256_set1_ps(rcutoff_scalar);
858 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
860 rswitch_scalar = fr->rcoulomb_switch;
861 rswitch = _mm256_set1_ps(rswitch_scalar);
862 /* Setup switch parameters */
863 d_scalar = rcutoff_scalar-rswitch_scalar;
864 d = _mm256_set1_ps(d_scalar);
865 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
866 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
867 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
868 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
869 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
870 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
872 /* Avoid stupid compiler warnings */
873 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
886 for(iidx=0;iidx<4*DIM;iidx++)
891 /* Start outer loop over neighborlists */
892 for(iidx=0; iidx<nri; iidx++)
894 /* Load shift vector for this list */
895 i_shift_offset = DIM*shiftidx[iidx];
897 /* Load limits for loop over neighbors */
898 j_index_start = jindex[iidx];
899 j_index_end = jindex[iidx+1];
901 /* Get outer coordinate index */
903 i_coord_offset = DIM*inr;
905 /* Load i particle coords and add shift vector */
906 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
907 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
909 fix1 = _mm256_setzero_ps();
910 fiy1 = _mm256_setzero_ps();
911 fiz1 = _mm256_setzero_ps();
912 fix2 = _mm256_setzero_ps();
913 fiy2 = _mm256_setzero_ps();
914 fiz2 = _mm256_setzero_ps();
915 fix3 = _mm256_setzero_ps();
916 fiy3 = _mm256_setzero_ps();
917 fiz3 = _mm256_setzero_ps();
919 /* Start inner kernel loop */
920 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
923 /* Get j neighbor index, and coordinate index */
932 j_coord_offsetA = DIM*jnrA;
933 j_coord_offsetB = DIM*jnrB;
934 j_coord_offsetC = DIM*jnrC;
935 j_coord_offsetD = DIM*jnrD;
936 j_coord_offsetE = DIM*jnrE;
937 j_coord_offsetF = DIM*jnrF;
938 j_coord_offsetG = DIM*jnrG;
939 j_coord_offsetH = DIM*jnrH;
941 /* load j atom coordinates */
942 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
943 x+j_coord_offsetC,x+j_coord_offsetD,
944 x+j_coord_offsetE,x+j_coord_offsetF,
945 x+j_coord_offsetG,x+j_coord_offsetH,
948 /* Calculate displacement vector */
949 dx10 = _mm256_sub_ps(ix1,jx0);
950 dy10 = _mm256_sub_ps(iy1,jy0);
951 dz10 = _mm256_sub_ps(iz1,jz0);
952 dx20 = _mm256_sub_ps(ix2,jx0);
953 dy20 = _mm256_sub_ps(iy2,jy0);
954 dz20 = _mm256_sub_ps(iz2,jz0);
955 dx30 = _mm256_sub_ps(ix3,jx0);
956 dy30 = _mm256_sub_ps(iy3,jy0);
957 dz30 = _mm256_sub_ps(iz3,jz0);
959 /* Calculate squared distance and things based on it */
960 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
961 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
962 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
964 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
965 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
966 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
968 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
969 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
970 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
972 /* Load parameters for j particles */
973 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
974 charge+jnrC+0,charge+jnrD+0,
975 charge+jnrE+0,charge+jnrF+0,
976 charge+jnrG+0,charge+jnrH+0);
978 fjx0 = _mm256_setzero_ps();
979 fjy0 = _mm256_setzero_ps();
980 fjz0 = _mm256_setzero_ps();
982 /**************************
983 * CALCULATE INTERACTIONS *
984 **************************/
986 if (gmx_mm256_any_lt(rsq10,rcutoff2))
989 r10 = _mm256_mul_ps(rsq10,rinv10);
991 /* Compute parameters for interactions between i and j atoms */
992 qq10 = _mm256_mul_ps(iq1,jq0);
994 /* EWALD ELECTROSTATICS */
996 /* Analytical PME correction */
997 zeta2 = _mm256_mul_ps(beta2,rsq10);
998 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
999 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1000 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1001 felec = _mm256_mul_ps(qq10,felec);
1002 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1003 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1004 velec = _mm256_sub_ps(rinv10,pmecorrV);
1005 velec = _mm256_mul_ps(qq10,velec);
1007 d = _mm256_sub_ps(r10,rswitch);
1008 d = _mm256_max_ps(d,_mm256_setzero_ps());
1009 d2 = _mm256_mul_ps(d,d);
1010 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1012 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1014 /* Evaluate switch function */
1015 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1016 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1017 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1021 fscal = _mm256_and_ps(fscal,cutoff_mask);
1023 /* Calculate temporary vectorial force */
1024 tx = _mm256_mul_ps(fscal,dx10);
1025 ty = _mm256_mul_ps(fscal,dy10);
1026 tz = _mm256_mul_ps(fscal,dz10);
1028 /* Update vectorial force */
1029 fix1 = _mm256_add_ps(fix1,tx);
1030 fiy1 = _mm256_add_ps(fiy1,ty);
1031 fiz1 = _mm256_add_ps(fiz1,tz);
1033 fjx0 = _mm256_add_ps(fjx0,tx);
1034 fjy0 = _mm256_add_ps(fjy0,ty);
1035 fjz0 = _mm256_add_ps(fjz0,tz);
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1046 r20 = _mm256_mul_ps(rsq20,rinv20);
1048 /* Compute parameters for interactions between i and j atoms */
1049 qq20 = _mm256_mul_ps(iq2,jq0);
1051 /* EWALD ELECTROSTATICS */
1053 /* Analytical PME correction */
1054 zeta2 = _mm256_mul_ps(beta2,rsq20);
1055 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1056 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1057 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1058 felec = _mm256_mul_ps(qq20,felec);
1059 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1060 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1061 velec = _mm256_sub_ps(rinv20,pmecorrV);
1062 velec = _mm256_mul_ps(qq20,velec);
1064 d = _mm256_sub_ps(r20,rswitch);
1065 d = _mm256_max_ps(d,_mm256_setzero_ps());
1066 d2 = _mm256_mul_ps(d,d);
1067 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1069 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1071 /* Evaluate switch function */
1072 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1073 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1074 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1078 fscal = _mm256_and_ps(fscal,cutoff_mask);
1080 /* Calculate temporary vectorial force */
1081 tx = _mm256_mul_ps(fscal,dx20);
1082 ty = _mm256_mul_ps(fscal,dy20);
1083 tz = _mm256_mul_ps(fscal,dz20);
1085 /* Update vectorial force */
1086 fix2 = _mm256_add_ps(fix2,tx);
1087 fiy2 = _mm256_add_ps(fiy2,ty);
1088 fiz2 = _mm256_add_ps(fiz2,tz);
1090 fjx0 = _mm256_add_ps(fjx0,tx);
1091 fjy0 = _mm256_add_ps(fjy0,ty);
1092 fjz0 = _mm256_add_ps(fjz0,tz);
1096 /**************************
1097 * CALCULATE INTERACTIONS *
1098 **************************/
1100 if (gmx_mm256_any_lt(rsq30,rcutoff2))
1103 r30 = _mm256_mul_ps(rsq30,rinv30);
1105 /* Compute parameters for interactions between i and j atoms */
1106 qq30 = _mm256_mul_ps(iq3,jq0);
1108 /* EWALD ELECTROSTATICS */
1110 /* Analytical PME correction */
1111 zeta2 = _mm256_mul_ps(beta2,rsq30);
1112 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
1113 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1114 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1115 felec = _mm256_mul_ps(qq30,felec);
1116 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1117 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1118 velec = _mm256_sub_ps(rinv30,pmecorrV);
1119 velec = _mm256_mul_ps(qq30,velec);
1121 d = _mm256_sub_ps(r30,rswitch);
1122 d = _mm256_max_ps(d,_mm256_setzero_ps());
1123 d2 = _mm256_mul_ps(d,d);
1124 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1126 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1128 /* Evaluate switch function */
1129 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1130 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
1131 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1135 fscal = _mm256_and_ps(fscal,cutoff_mask);
1137 /* Calculate temporary vectorial force */
1138 tx = _mm256_mul_ps(fscal,dx30);
1139 ty = _mm256_mul_ps(fscal,dy30);
1140 tz = _mm256_mul_ps(fscal,dz30);
1142 /* Update vectorial force */
1143 fix3 = _mm256_add_ps(fix3,tx);
1144 fiy3 = _mm256_add_ps(fiy3,ty);
1145 fiz3 = _mm256_add_ps(fiz3,tz);
1147 fjx0 = _mm256_add_ps(fjx0,tx);
1148 fjy0 = _mm256_add_ps(fjy0,ty);
1149 fjz0 = _mm256_add_ps(fjz0,tz);
1153 fjptrA = f+j_coord_offsetA;
1154 fjptrB = f+j_coord_offsetB;
1155 fjptrC = f+j_coord_offsetC;
1156 fjptrD = f+j_coord_offsetD;
1157 fjptrE = f+j_coord_offsetE;
1158 fjptrF = f+j_coord_offsetF;
1159 fjptrG = f+j_coord_offsetG;
1160 fjptrH = f+j_coord_offsetH;
1162 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1164 /* Inner loop uses 318 flops */
1167 if(jidx<j_index_end)
1170 /* Get j neighbor index, and coordinate index */
1171 jnrlistA = jjnr[jidx];
1172 jnrlistB = jjnr[jidx+1];
1173 jnrlistC = jjnr[jidx+2];
1174 jnrlistD = jjnr[jidx+3];
1175 jnrlistE = jjnr[jidx+4];
1176 jnrlistF = jjnr[jidx+5];
1177 jnrlistG = jjnr[jidx+6];
1178 jnrlistH = jjnr[jidx+7];
1179 /* Sign of each element will be negative for non-real atoms.
1180 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1181 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1183 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1184 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1186 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1187 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1188 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1189 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1190 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1191 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1192 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1193 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1194 j_coord_offsetA = DIM*jnrA;
1195 j_coord_offsetB = DIM*jnrB;
1196 j_coord_offsetC = DIM*jnrC;
1197 j_coord_offsetD = DIM*jnrD;
1198 j_coord_offsetE = DIM*jnrE;
1199 j_coord_offsetF = DIM*jnrF;
1200 j_coord_offsetG = DIM*jnrG;
1201 j_coord_offsetH = DIM*jnrH;
1203 /* load j atom coordinates */
1204 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1205 x+j_coord_offsetC,x+j_coord_offsetD,
1206 x+j_coord_offsetE,x+j_coord_offsetF,
1207 x+j_coord_offsetG,x+j_coord_offsetH,
1210 /* Calculate displacement vector */
1211 dx10 = _mm256_sub_ps(ix1,jx0);
1212 dy10 = _mm256_sub_ps(iy1,jy0);
1213 dz10 = _mm256_sub_ps(iz1,jz0);
1214 dx20 = _mm256_sub_ps(ix2,jx0);
1215 dy20 = _mm256_sub_ps(iy2,jy0);
1216 dz20 = _mm256_sub_ps(iz2,jz0);
1217 dx30 = _mm256_sub_ps(ix3,jx0);
1218 dy30 = _mm256_sub_ps(iy3,jy0);
1219 dz30 = _mm256_sub_ps(iz3,jz0);
1221 /* Calculate squared distance and things based on it */
1222 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1223 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1224 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1226 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1227 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1228 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
1230 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1231 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1232 rinvsq30 = _mm256_mul_ps(rinv30,rinv30);
1234 /* Load parameters for j particles */
1235 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1236 charge+jnrC+0,charge+jnrD+0,
1237 charge+jnrE+0,charge+jnrF+0,
1238 charge+jnrG+0,charge+jnrH+0);
1240 fjx0 = _mm256_setzero_ps();
1241 fjy0 = _mm256_setzero_ps();
1242 fjz0 = _mm256_setzero_ps();
1244 /**************************
1245 * CALCULATE INTERACTIONS *
1246 **************************/
1248 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1251 r10 = _mm256_mul_ps(rsq10,rinv10);
1252 r10 = _mm256_andnot_ps(dummy_mask,r10);
1254 /* Compute parameters for interactions between i and j atoms */
1255 qq10 = _mm256_mul_ps(iq1,jq0);
1257 /* EWALD ELECTROSTATICS */
1259 /* Analytical PME correction */
1260 zeta2 = _mm256_mul_ps(beta2,rsq10);
1261 rinv3 = _mm256_mul_ps(rinvsq10,rinv10);
1262 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1263 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1264 felec = _mm256_mul_ps(qq10,felec);
1265 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1266 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1267 velec = _mm256_sub_ps(rinv10,pmecorrV);
1268 velec = _mm256_mul_ps(qq10,velec);
1270 d = _mm256_sub_ps(r10,rswitch);
1271 d = _mm256_max_ps(d,_mm256_setzero_ps());
1272 d2 = _mm256_mul_ps(d,d);
1273 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1275 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1277 /* Evaluate switch function */
1278 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1279 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1280 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1284 fscal = _mm256_and_ps(fscal,cutoff_mask);
1286 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1288 /* Calculate temporary vectorial force */
1289 tx = _mm256_mul_ps(fscal,dx10);
1290 ty = _mm256_mul_ps(fscal,dy10);
1291 tz = _mm256_mul_ps(fscal,dz10);
1293 /* Update vectorial force */
1294 fix1 = _mm256_add_ps(fix1,tx);
1295 fiy1 = _mm256_add_ps(fiy1,ty);
1296 fiz1 = _mm256_add_ps(fiz1,tz);
1298 fjx0 = _mm256_add_ps(fjx0,tx);
1299 fjy0 = _mm256_add_ps(fjy0,ty);
1300 fjz0 = _mm256_add_ps(fjz0,tz);
1304 /**************************
1305 * CALCULATE INTERACTIONS *
1306 **************************/
1308 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1311 r20 = _mm256_mul_ps(rsq20,rinv20);
1312 r20 = _mm256_andnot_ps(dummy_mask,r20);
1314 /* Compute parameters for interactions between i and j atoms */
1315 qq20 = _mm256_mul_ps(iq2,jq0);
1317 /* EWALD ELECTROSTATICS */
1319 /* Analytical PME correction */
1320 zeta2 = _mm256_mul_ps(beta2,rsq20);
1321 rinv3 = _mm256_mul_ps(rinvsq20,rinv20);
1322 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1323 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1324 felec = _mm256_mul_ps(qq20,felec);
1325 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1326 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1327 velec = _mm256_sub_ps(rinv20,pmecorrV);
1328 velec = _mm256_mul_ps(qq20,velec);
1330 d = _mm256_sub_ps(r20,rswitch);
1331 d = _mm256_max_ps(d,_mm256_setzero_ps());
1332 d2 = _mm256_mul_ps(d,d);
1333 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1335 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1337 /* Evaluate switch function */
1338 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1339 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1340 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1344 fscal = _mm256_and_ps(fscal,cutoff_mask);
1346 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1348 /* Calculate temporary vectorial force */
1349 tx = _mm256_mul_ps(fscal,dx20);
1350 ty = _mm256_mul_ps(fscal,dy20);
1351 tz = _mm256_mul_ps(fscal,dz20);
1353 /* Update vectorial force */
1354 fix2 = _mm256_add_ps(fix2,tx);
1355 fiy2 = _mm256_add_ps(fiy2,ty);
1356 fiz2 = _mm256_add_ps(fiz2,tz);
1358 fjx0 = _mm256_add_ps(fjx0,tx);
1359 fjy0 = _mm256_add_ps(fjy0,ty);
1360 fjz0 = _mm256_add_ps(fjz0,tz);
1364 /**************************
1365 * CALCULATE INTERACTIONS *
1366 **************************/
1368 if (gmx_mm256_any_lt(rsq30,rcutoff2))
1371 r30 = _mm256_mul_ps(rsq30,rinv30);
1372 r30 = _mm256_andnot_ps(dummy_mask,r30);
1374 /* Compute parameters for interactions between i and j atoms */
1375 qq30 = _mm256_mul_ps(iq3,jq0);
1377 /* EWALD ELECTROSTATICS */
1379 /* Analytical PME correction */
1380 zeta2 = _mm256_mul_ps(beta2,rsq30);
1381 rinv3 = _mm256_mul_ps(rinvsq30,rinv30);
1382 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1383 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1384 felec = _mm256_mul_ps(qq30,felec);
1385 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1386 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1387 velec = _mm256_sub_ps(rinv30,pmecorrV);
1388 velec = _mm256_mul_ps(qq30,velec);
1390 d = _mm256_sub_ps(r30,rswitch);
1391 d = _mm256_max_ps(d,_mm256_setzero_ps());
1392 d2 = _mm256_mul_ps(d,d);
1393 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1395 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1397 /* Evaluate switch function */
1398 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1399 felec = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
1400 cutoff_mask = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1404 fscal = _mm256_and_ps(fscal,cutoff_mask);
1406 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1408 /* Calculate temporary vectorial force */
1409 tx = _mm256_mul_ps(fscal,dx30);
1410 ty = _mm256_mul_ps(fscal,dy30);
1411 tz = _mm256_mul_ps(fscal,dz30);
1413 /* Update vectorial force */
1414 fix3 = _mm256_add_ps(fix3,tx);
1415 fiy3 = _mm256_add_ps(fiy3,ty);
1416 fiz3 = _mm256_add_ps(fiz3,tz);
1418 fjx0 = _mm256_add_ps(fjx0,tx);
1419 fjy0 = _mm256_add_ps(fjy0,ty);
1420 fjz0 = _mm256_add_ps(fjz0,tz);
1424 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1425 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1426 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1427 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1428 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1429 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1430 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1431 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1433 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1435 /* Inner loop uses 321 flops */
1438 /* End of innermost loop */
1440 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1441 f+i_coord_offset+DIM,fshift+i_shift_offset);
1443 /* Increment number of inner iterations */
1444 inneriter += j_index_end - j_index_start;
1446 /* Outer loop uses 18 flops */
1449 /* Increment number of outer iterations */
1452 /* Update outer/inner flops */
1454 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*321);