2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr1;
73 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
79 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
81 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
83 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
84 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
85 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
86 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
87 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
88 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
89 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
90 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
91 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
92 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128i ewitab_lo,ewitab_hi;
97 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
100 __m256 dummy_mask,cutoff_mask;
101 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
102 __m256 one = _mm256_set1_ps(1.0);
103 __m256 two = _mm256_set1_ps(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm256_set1_ps(fr->epsfac);
116 charge = mdatoms->chargeA;
118 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
119 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
120 beta2 = _mm256_mul_ps(beta,beta);
121 beta3 = _mm256_mul_ps(beta,beta2);
123 ewtab = fr->ic->tabq_coul_FDV0;
124 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
125 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
127 /* Setup water-specific parameters */
128 inr = nlist->iinr[0];
129 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
130 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
131 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
133 jq1 = _mm256_set1_ps(charge[inr+1]);
134 jq2 = _mm256_set1_ps(charge[inr+2]);
135 jq3 = _mm256_set1_ps(charge[inr+3]);
136 qq11 = _mm256_mul_ps(iq1,jq1);
137 qq12 = _mm256_mul_ps(iq1,jq2);
138 qq13 = _mm256_mul_ps(iq1,jq3);
139 qq21 = _mm256_mul_ps(iq2,jq1);
140 qq22 = _mm256_mul_ps(iq2,jq2);
141 qq23 = _mm256_mul_ps(iq2,jq3);
142 qq31 = _mm256_mul_ps(iq3,jq1);
143 qq32 = _mm256_mul_ps(iq3,jq2);
144 qq33 = _mm256_mul_ps(iq3,jq3);
146 /* Avoid stupid compiler warnings */
147 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
160 for(iidx=0;iidx<4*DIM;iidx++)
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
168 /* Load shift vector for this list */
169 i_shift_offset = DIM*shiftidx[iidx];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
181 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
183 fix1 = _mm256_setzero_ps();
184 fiy1 = _mm256_setzero_ps();
185 fiz1 = _mm256_setzero_ps();
186 fix2 = _mm256_setzero_ps();
187 fiy2 = _mm256_setzero_ps();
188 fiz2 = _mm256_setzero_ps();
189 fix3 = _mm256_setzero_ps();
190 fiy3 = _mm256_setzero_ps();
191 fiz3 = _mm256_setzero_ps();
193 /* Reset potential sums */
194 velecsum = _mm256_setzero_ps();
196 /* Start inner kernel loop */
197 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
200 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
211 j_coord_offsetC = DIM*jnrC;
212 j_coord_offsetD = DIM*jnrD;
213 j_coord_offsetE = DIM*jnrE;
214 j_coord_offsetF = DIM*jnrF;
215 j_coord_offsetG = DIM*jnrG;
216 j_coord_offsetH = DIM*jnrH;
218 /* load j atom coordinates */
219 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
220 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
221 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
222 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
223 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
225 /* Calculate displacement vector */
226 dx11 = _mm256_sub_ps(ix1,jx1);
227 dy11 = _mm256_sub_ps(iy1,jy1);
228 dz11 = _mm256_sub_ps(iz1,jz1);
229 dx12 = _mm256_sub_ps(ix1,jx2);
230 dy12 = _mm256_sub_ps(iy1,jy2);
231 dz12 = _mm256_sub_ps(iz1,jz2);
232 dx13 = _mm256_sub_ps(ix1,jx3);
233 dy13 = _mm256_sub_ps(iy1,jy3);
234 dz13 = _mm256_sub_ps(iz1,jz3);
235 dx21 = _mm256_sub_ps(ix2,jx1);
236 dy21 = _mm256_sub_ps(iy2,jy1);
237 dz21 = _mm256_sub_ps(iz2,jz1);
238 dx22 = _mm256_sub_ps(ix2,jx2);
239 dy22 = _mm256_sub_ps(iy2,jy2);
240 dz22 = _mm256_sub_ps(iz2,jz2);
241 dx23 = _mm256_sub_ps(ix2,jx3);
242 dy23 = _mm256_sub_ps(iy2,jy3);
243 dz23 = _mm256_sub_ps(iz2,jz3);
244 dx31 = _mm256_sub_ps(ix3,jx1);
245 dy31 = _mm256_sub_ps(iy3,jy1);
246 dz31 = _mm256_sub_ps(iz3,jz1);
247 dx32 = _mm256_sub_ps(ix3,jx2);
248 dy32 = _mm256_sub_ps(iy3,jy2);
249 dz32 = _mm256_sub_ps(iz3,jz2);
250 dx33 = _mm256_sub_ps(ix3,jx3);
251 dy33 = _mm256_sub_ps(iy3,jy3);
252 dz33 = _mm256_sub_ps(iz3,jz3);
254 /* Calculate squared distance and things based on it */
255 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
256 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
257 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
258 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
259 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
260 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
261 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
262 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
263 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
265 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
266 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
267 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
268 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
269 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
270 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
271 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
272 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
273 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
275 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
276 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
277 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
278 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
279 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
280 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
281 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
282 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
283 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
285 fjx1 = _mm256_setzero_ps();
286 fjy1 = _mm256_setzero_ps();
287 fjz1 = _mm256_setzero_ps();
288 fjx2 = _mm256_setzero_ps();
289 fjy2 = _mm256_setzero_ps();
290 fjz2 = _mm256_setzero_ps();
291 fjx3 = _mm256_setzero_ps();
292 fjy3 = _mm256_setzero_ps();
293 fjz3 = _mm256_setzero_ps();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 r11 = _mm256_mul_ps(rsq11,rinv11);
301 /* EWALD ELECTROSTATICS */
303 /* Analytical PME correction */
304 zeta2 = _mm256_mul_ps(beta2,rsq11);
305 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
306 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
307 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
308 felec = _mm256_mul_ps(qq11,felec);
309 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
310 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
311 velec = _mm256_sub_ps(rinv11,pmecorrV);
312 velec = _mm256_mul_ps(qq11,velec);
314 /* Update potential sum for this i atom from the interaction with this j atom. */
315 velecsum = _mm256_add_ps(velecsum,velec);
319 /* Calculate temporary vectorial force */
320 tx = _mm256_mul_ps(fscal,dx11);
321 ty = _mm256_mul_ps(fscal,dy11);
322 tz = _mm256_mul_ps(fscal,dz11);
324 /* Update vectorial force */
325 fix1 = _mm256_add_ps(fix1,tx);
326 fiy1 = _mm256_add_ps(fiy1,ty);
327 fiz1 = _mm256_add_ps(fiz1,tz);
329 fjx1 = _mm256_add_ps(fjx1,tx);
330 fjy1 = _mm256_add_ps(fjy1,ty);
331 fjz1 = _mm256_add_ps(fjz1,tz);
333 /**************************
334 * CALCULATE INTERACTIONS *
335 **************************/
337 r12 = _mm256_mul_ps(rsq12,rinv12);
339 /* EWALD ELECTROSTATICS */
341 /* Analytical PME correction */
342 zeta2 = _mm256_mul_ps(beta2,rsq12);
343 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
344 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
345 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
346 felec = _mm256_mul_ps(qq12,felec);
347 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
348 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
349 velec = _mm256_sub_ps(rinv12,pmecorrV);
350 velec = _mm256_mul_ps(qq12,velec);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 velecsum = _mm256_add_ps(velecsum,velec);
357 /* Calculate temporary vectorial force */
358 tx = _mm256_mul_ps(fscal,dx12);
359 ty = _mm256_mul_ps(fscal,dy12);
360 tz = _mm256_mul_ps(fscal,dz12);
362 /* Update vectorial force */
363 fix1 = _mm256_add_ps(fix1,tx);
364 fiy1 = _mm256_add_ps(fiy1,ty);
365 fiz1 = _mm256_add_ps(fiz1,tz);
367 fjx2 = _mm256_add_ps(fjx2,tx);
368 fjy2 = _mm256_add_ps(fjy2,ty);
369 fjz2 = _mm256_add_ps(fjz2,tz);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 r13 = _mm256_mul_ps(rsq13,rinv13);
377 /* EWALD ELECTROSTATICS */
379 /* Analytical PME correction */
380 zeta2 = _mm256_mul_ps(beta2,rsq13);
381 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
382 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
383 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
384 felec = _mm256_mul_ps(qq13,felec);
385 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
386 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
387 velec = _mm256_sub_ps(rinv13,pmecorrV);
388 velec = _mm256_mul_ps(qq13,velec);
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 velecsum = _mm256_add_ps(velecsum,velec);
395 /* Calculate temporary vectorial force */
396 tx = _mm256_mul_ps(fscal,dx13);
397 ty = _mm256_mul_ps(fscal,dy13);
398 tz = _mm256_mul_ps(fscal,dz13);
400 /* Update vectorial force */
401 fix1 = _mm256_add_ps(fix1,tx);
402 fiy1 = _mm256_add_ps(fiy1,ty);
403 fiz1 = _mm256_add_ps(fiz1,tz);
405 fjx3 = _mm256_add_ps(fjx3,tx);
406 fjy3 = _mm256_add_ps(fjy3,ty);
407 fjz3 = _mm256_add_ps(fjz3,tz);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 r21 = _mm256_mul_ps(rsq21,rinv21);
415 /* EWALD ELECTROSTATICS */
417 /* Analytical PME correction */
418 zeta2 = _mm256_mul_ps(beta2,rsq21);
419 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
420 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
421 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
422 felec = _mm256_mul_ps(qq21,felec);
423 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
424 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
425 velec = _mm256_sub_ps(rinv21,pmecorrV);
426 velec = _mm256_mul_ps(qq21,velec);
428 /* Update potential sum for this i atom from the interaction with this j atom. */
429 velecsum = _mm256_add_ps(velecsum,velec);
433 /* Calculate temporary vectorial force */
434 tx = _mm256_mul_ps(fscal,dx21);
435 ty = _mm256_mul_ps(fscal,dy21);
436 tz = _mm256_mul_ps(fscal,dz21);
438 /* Update vectorial force */
439 fix2 = _mm256_add_ps(fix2,tx);
440 fiy2 = _mm256_add_ps(fiy2,ty);
441 fiz2 = _mm256_add_ps(fiz2,tz);
443 fjx1 = _mm256_add_ps(fjx1,tx);
444 fjy1 = _mm256_add_ps(fjy1,ty);
445 fjz1 = _mm256_add_ps(fjz1,tz);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 r22 = _mm256_mul_ps(rsq22,rinv22);
453 /* EWALD ELECTROSTATICS */
455 /* Analytical PME correction */
456 zeta2 = _mm256_mul_ps(beta2,rsq22);
457 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
458 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
459 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
460 felec = _mm256_mul_ps(qq22,felec);
461 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
462 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
463 velec = _mm256_sub_ps(rinv22,pmecorrV);
464 velec = _mm256_mul_ps(qq22,velec);
466 /* Update potential sum for this i atom from the interaction with this j atom. */
467 velecsum = _mm256_add_ps(velecsum,velec);
471 /* Calculate temporary vectorial force */
472 tx = _mm256_mul_ps(fscal,dx22);
473 ty = _mm256_mul_ps(fscal,dy22);
474 tz = _mm256_mul_ps(fscal,dz22);
476 /* Update vectorial force */
477 fix2 = _mm256_add_ps(fix2,tx);
478 fiy2 = _mm256_add_ps(fiy2,ty);
479 fiz2 = _mm256_add_ps(fiz2,tz);
481 fjx2 = _mm256_add_ps(fjx2,tx);
482 fjy2 = _mm256_add_ps(fjy2,ty);
483 fjz2 = _mm256_add_ps(fjz2,tz);
485 /**************************
486 * CALCULATE INTERACTIONS *
487 **************************/
489 r23 = _mm256_mul_ps(rsq23,rinv23);
491 /* EWALD ELECTROSTATICS */
493 /* Analytical PME correction */
494 zeta2 = _mm256_mul_ps(beta2,rsq23);
495 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
496 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
497 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
498 felec = _mm256_mul_ps(qq23,felec);
499 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
500 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
501 velec = _mm256_sub_ps(rinv23,pmecorrV);
502 velec = _mm256_mul_ps(qq23,velec);
504 /* Update potential sum for this i atom from the interaction with this j atom. */
505 velecsum = _mm256_add_ps(velecsum,velec);
509 /* Calculate temporary vectorial force */
510 tx = _mm256_mul_ps(fscal,dx23);
511 ty = _mm256_mul_ps(fscal,dy23);
512 tz = _mm256_mul_ps(fscal,dz23);
514 /* Update vectorial force */
515 fix2 = _mm256_add_ps(fix2,tx);
516 fiy2 = _mm256_add_ps(fiy2,ty);
517 fiz2 = _mm256_add_ps(fiz2,tz);
519 fjx3 = _mm256_add_ps(fjx3,tx);
520 fjy3 = _mm256_add_ps(fjy3,ty);
521 fjz3 = _mm256_add_ps(fjz3,tz);
523 /**************************
524 * CALCULATE INTERACTIONS *
525 **************************/
527 r31 = _mm256_mul_ps(rsq31,rinv31);
529 /* EWALD ELECTROSTATICS */
531 /* Analytical PME correction */
532 zeta2 = _mm256_mul_ps(beta2,rsq31);
533 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
534 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
535 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
536 felec = _mm256_mul_ps(qq31,felec);
537 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
538 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
539 velec = _mm256_sub_ps(rinv31,pmecorrV);
540 velec = _mm256_mul_ps(qq31,velec);
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velecsum = _mm256_add_ps(velecsum,velec);
547 /* Calculate temporary vectorial force */
548 tx = _mm256_mul_ps(fscal,dx31);
549 ty = _mm256_mul_ps(fscal,dy31);
550 tz = _mm256_mul_ps(fscal,dz31);
552 /* Update vectorial force */
553 fix3 = _mm256_add_ps(fix3,tx);
554 fiy3 = _mm256_add_ps(fiy3,ty);
555 fiz3 = _mm256_add_ps(fiz3,tz);
557 fjx1 = _mm256_add_ps(fjx1,tx);
558 fjy1 = _mm256_add_ps(fjy1,ty);
559 fjz1 = _mm256_add_ps(fjz1,tz);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 r32 = _mm256_mul_ps(rsq32,rinv32);
567 /* EWALD ELECTROSTATICS */
569 /* Analytical PME correction */
570 zeta2 = _mm256_mul_ps(beta2,rsq32);
571 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
572 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
573 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
574 felec = _mm256_mul_ps(qq32,felec);
575 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
576 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
577 velec = _mm256_sub_ps(rinv32,pmecorrV);
578 velec = _mm256_mul_ps(qq32,velec);
580 /* Update potential sum for this i atom from the interaction with this j atom. */
581 velecsum = _mm256_add_ps(velecsum,velec);
585 /* Calculate temporary vectorial force */
586 tx = _mm256_mul_ps(fscal,dx32);
587 ty = _mm256_mul_ps(fscal,dy32);
588 tz = _mm256_mul_ps(fscal,dz32);
590 /* Update vectorial force */
591 fix3 = _mm256_add_ps(fix3,tx);
592 fiy3 = _mm256_add_ps(fiy3,ty);
593 fiz3 = _mm256_add_ps(fiz3,tz);
595 fjx2 = _mm256_add_ps(fjx2,tx);
596 fjy2 = _mm256_add_ps(fjy2,ty);
597 fjz2 = _mm256_add_ps(fjz2,tz);
599 /**************************
600 * CALCULATE INTERACTIONS *
601 **************************/
603 r33 = _mm256_mul_ps(rsq33,rinv33);
605 /* EWALD ELECTROSTATICS */
607 /* Analytical PME correction */
608 zeta2 = _mm256_mul_ps(beta2,rsq33);
609 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
610 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
611 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
612 felec = _mm256_mul_ps(qq33,felec);
613 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
614 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
615 velec = _mm256_sub_ps(rinv33,pmecorrV);
616 velec = _mm256_mul_ps(qq33,velec);
618 /* Update potential sum for this i atom from the interaction with this j atom. */
619 velecsum = _mm256_add_ps(velecsum,velec);
623 /* Calculate temporary vectorial force */
624 tx = _mm256_mul_ps(fscal,dx33);
625 ty = _mm256_mul_ps(fscal,dy33);
626 tz = _mm256_mul_ps(fscal,dz33);
628 /* Update vectorial force */
629 fix3 = _mm256_add_ps(fix3,tx);
630 fiy3 = _mm256_add_ps(fiy3,ty);
631 fiz3 = _mm256_add_ps(fiz3,tz);
633 fjx3 = _mm256_add_ps(fjx3,tx);
634 fjy3 = _mm256_add_ps(fjy3,ty);
635 fjz3 = _mm256_add_ps(fjz3,tz);
637 fjptrA = f+j_coord_offsetA;
638 fjptrB = f+j_coord_offsetB;
639 fjptrC = f+j_coord_offsetC;
640 fjptrD = f+j_coord_offsetD;
641 fjptrE = f+j_coord_offsetE;
642 fjptrF = f+j_coord_offsetF;
643 fjptrG = f+j_coord_offsetG;
644 fjptrH = f+j_coord_offsetH;
646 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
647 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
648 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
650 /* Inner loop uses 756 flops */
656 /* Get j neighbor index, and coordinate index */
657 jnrlistA = jjnr[jidx];
658 jnrlistB = jjnr[jidx+1];
659 jnrlistC = jjnr[jidx+2];
660 jnrlistD = jjnr[jidx+3];
661 jnrlistE = jjnr[jidx+4];
662 jnrlistF = jjnr[jidx+5];
663 jnrlistG = jjnr[jidx+6];
664 jnrlistH = jjnr[jidx+7];
665 /* Sign of each element will be negative for non-real atoms.
666 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
667 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
669 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
670 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
672 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
673 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
674 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
675 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
676 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
677 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
678 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
679 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
680 j_coord_offsetA = DIM*jnrA;
681 j_coord_offsetB = DIM*jnrB;
682 j_coord_offsetC = DIM*jnrC;
683 j_coord_offsetD = DIM*jnrD;
684 j_coord_offsetE = DIM*jnrE;
685 j_coord_offsetF = DIM*jnrF;
686 j_coord_offsetG = DIM*jnrG;
687 j_coord_offsetH = DIM*jnrH;
689 /* load j atom coordinates */
690 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
691 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
692 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
693 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
694 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
696 /* Calculate displacement vector */
697 dx11 = _mm256_sub_ps(ix1,jx1);
698 dy11 = _mm256_sub_ps(iy1,jy1);
699 dz11 = _mm256_sub_ps(iz1,jz1);
700 dx12 = _mm256_sub_ps(ix1,jx2);
701 dy12 = _mm256_sub_ps(iy1,jy2);
702 dz12 = _mm256_sub_ps(iz1,jz2);
703 dx13 = _mm256_sub_ps(ix1,jx3);
704 dy13 = _mm256_sub_ps(iy1,jy3);
705 dz13 = _mm256_sub_ps(iz1,jz3);
706 dx21 = _mm256_sub_ps(ix2,jx1);
707 dy21 = _mm256_sub_ps(iy2,jy1);
708 dz21 = _mm256_sub_ps(iz2,jz1);
709 dx22 = _mm256_sub_ps(ix2,jx2);
710 dy22 = _mm256_sub_ps(iy2,jy2);
711 dz22 = _mm256_sub_ps(iz2,jz2);
712 dx23 = _mm256_sub_ps(ix2,jx3);
713 dy23 = _mm256_sub_ps(iy2,jy3);
714 dz23 = _mm256_sub_ps(iz2,jz3);
715 dx31 = _mm256_sub_ps(ix3,jx1);
716 dy31 = _mm256_sub_ps(iy3,jy1);
717 dz31 = _mm256_sub_ps(iz3,jz1);
718 dx32 = _mm256_sub_ps(ix3,jx2);
719 dy32 = _mm256_sub_ps(iy3,jy2);
720 dz32 = _mm256_sub_ps(iz3,jz2);
721 dx33 = _mm256_sub_ps(ix3,jx3);
722 dy33 = _mm256_sub_ps(iy3,jy3);
723 dz33 = _mm256_sub_ps(iz3,jz3);
725 /* Calculate squared distance and things based on it */
726 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
727 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
728 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
729 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
730 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
731 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
732 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
733 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
734 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
736 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
737 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
738 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
739 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
740 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
741 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
742 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
743 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
744 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
746 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
747 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
748 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
749 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
750 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
751 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
752 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
753 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
754 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
756 fjx1 = _mm256_setzero_ps();
757 fjy1 = _mm256_setzero_ps();
758 fjz1 = _mm256_setzero_ps();
759 fjx2 = _mm256_setzero_ps();
760 fjy2 = _mm256_setzero_ps();
761 fjz2 = _mm256_setzero_ps();
762 fjx3 = _mm256_setzero_ps();
763 fjy3 = _mm256_setzero_ps();
764 fjz3 = _mm256_setzero_ps();
766 /**************************
767 * CALCULATE INTERACTIONS *
768 **************************/
770 r11 = _mm256_mul_ps(rsq11,rinv11);
771 r11 = _mm256_andnot_ps(dummy_mask,r11);
773 /* EWALD ELECTROSTATICS */
775 /* Analytical PME correction */
776 zeta2 = _mm256_mul_ps(beta2,rsq11);
777 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
778 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
779 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
780 felec = _mm256_mul_ps(qq11,felec);
781 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
782 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
783 velec = _mm256_sub_ps(rinv11,pmecorrV);
784 velec = _mm256_mul_ps(qq11,velec);
786 /* Update potential sum for this i atom from the interaction with this j atom. */
787 velec = _mm256_andnot_ps(dummy_mask,velec);
788 velecsum = _mm256_add_ps(velecsum,velec);
792 fscal = _mm256_andnot_ps(dummy_mask,fscal);
794 /* Calculate temporary vectorial force */
795 tx = _mm256_mul_ps(fscal,dx11);
796 ty = _mm256_mul_ps(fscal,dy11);
797 tz = _mm256_mul_ps(fscal,dz11);
799 /* Update vectorial force */
800 fix1 = _mm256_add_ps(fix1,tx);
801 fiy1 = _mm256_add_ps(fiy1,ty);
802 fiz1 = _mm256_add_ps(fiz1,tz);
804 fjx1 = _mm256_add_ps(fjx1,tx);
805 fjy1 = _mm256_add_ps(fjy1,ty);
806 fjz1 = _mm256_add_ps(fjz1,tz);
808 /**************************
809 * CALCULATE INTERACTIONS *
810 **************************/
812 r12 = _mm256_mul_ps(rsq12,rinv12);
813 r12 = _mm256_andnot_ps(dummy_mask,r12);
815 /* EWALD ELECTROSTATICS */
817 /* Analytical PME correction */
818 zeta2 = _mm256_mul_ps(beta2,rsq12);
819 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
820 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
821 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
822 felec = _mm256_mul_ps(qq12,felec);
823 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
824 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
825 velec = _mm256_sub_ps(rinv12,pmecorrV);
826 velec = _mm256_mul_ps(qq12,velec);
828 /* Update potential sum for this i atom from the interaction with this j atom. */
829 velec = _mm256_andnot_ps(dummy_mask,velec);
830 velecsum = _mm256_add_ps(velecsum,velec);
834 fscal = _mm256_andnot_ps(dummy_mask,fscal);
836 /* Calculate temporary vectorial force */
837 tx = _mm256_mul_ps(fscal,dx12);
838 ty = _mm256_mul_ps(fscal,dy12);
839 tz = _mm256_mul_ps(fscal,dz12);
841 /* Update vectorial force */
842 fix1 = _mm256_add_ps(fix1,tx);
843 fiy1 = _mm256_add_ps(fiy1,ty);
844 fiz1 = _mm256_add_ps(fiz1,tz);
846 fjx2 = _mm256_add_ps(fjx2,tx);
847 fjy2 = _mm256_add_ps(fjy2,ty);
848 fjz2 = _mm256_add_ps(fjz2,tz);
850 /**************************
851 * CALCULATE INTERACTIONS *
852 **************************/
854 r13 = _mm256_mul_ps(rsq13,rinv13);
855 r13 = _mm256_andnot_ps(dummy_mask,r13);
857 /* EWALD ELECTROSTATICS */
859 /* Analytical PME correction */
860 zeta2 = _mm256_mul_ps(beta2,rsq13);
861 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
862 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
863 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
864 felec = _mm256_mul_ps(qq13,felec);
865 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
866 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
867 velec = _mm256_sub_ps(rinv13,pmecorrV);
868 velec = _mm256_mul_ps(qq13,velec);
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm256_andnot_ps(dummy_mask,velec);
872 velecsum = _mm256_add_ps(velecsum,velec);
876 fscal = _mm256_andnot_ps(dummy_mask,fscal);
878 /* Calculate temporary vectorial force */
879 tx = _mm256_mul_ps(fscal,dx13);
880 ty = _mm256_mul_ps(fscal,dy13);
881 tz = _mm256_mul_ps(fscal,dz13);
883 /* Update vectorial force */
884 fix1 = _mm256_add_ps(fix1,tx);
885 fiy1 = _mm256_add_ps(fiy1,ty);
886 fiz1 = _mm256_add_ps(fiz1,tz);
888 fjx3 = _mm256_add_ps(fjx3,tx);
889 fjy3 = _mm256_add_ps(fjy3,ty);
890 fjz3 = _mm256_add_ps(fjz3,tz);
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 r21 = _mm256_mul_ps(rsq21,rinv21);
897 r21 = _mm256_andnot_ps(dummy_mask,r21);
899 /* EWALD ELECTROSTATICS */
901 /* Analytical PME correction */
902 zeta2 = _mm256_mul_ps(beta2,rsq21);
903 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
904 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
905 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
906 felec = _mm256_mul_ps(qq21,felec);
907 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
908 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
909 velec = _mm256_sub_ps(rinv21,pmecorrV);
910 velec = _mm256_mul_ps(qq21,velec);
912 /* Update potential sum for this i atom from the interaction with this j atom. */
913 velec = _mm256_andnot_ps(dummy_mask,velec);
914 velecsum = _mm256_add_ps(velecsum,velec);
918 fscal = _mm256_andnot_ps(dummy_mask,fscal);
920 /* Calculate temporary vectorial force */
921 tx = _mm256_mul_ps(fscal,dx21);
922 ty = _mm256_mul_ps(fscal,dy21);
923 tz = _mm256_mul_ps(fscal,dz21);
925 /* Update vectorial force */
926 fix2 = _mm256_add_ps(fix2,tx);
927 fiy2 = _mm256_add_ps(fiy2,ty);
928 fiz2 = _mm256_add_ps(fiz2,tz);
930 fjx1 = _mm256_add_ps(fjx1,tx);
931 fjy1 = _mm256_add_ps(fjy1,ty);
932 fjz1 = _mm256_add_ps(fjz1,tz);
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r22 = _mm256_mul_ps(rsq22,rinv22);
939 r22 = _mm256_andnot_ps(dummy_mask,r22);
941 /* EWALD ELECTROSTATICS */
943 /* Analytical PME correction */
944 zeta2 = _mm256_mul_ps(beta2,rsq22);
945 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
946 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
947 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
948 felec = _mm256_mul_ps(qq22,felec);
949 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
950 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
951 velec = _mm256_sub_ps(rinv22,pmecorrV);
952 velec = _mm256_mul_ps(qq22,velec);
954 /* Update potential sum for this i atom from the interaction with this j atom. */
955 velec = _mm256_andnot_ps(dummy_mask,velec);
956 velecsum = _mm256_add_ps(velecsum,velec);
960 fscal = _mm256_andnot_ps(dummy_mask,fscal);
962 /* Calculate temporary vectorial force */
963 tx = _mm256_mul_ps(fscal,dx22);
964 ty = _mm256_mul_ps(fscal,dy22);
965 tz = _mm256_mul_ps(fscal,dz22);
967 /* Update vectorial force */
968 fix2 = _mm256_add_ps(fix2,tx);
969 fiy2 = _mm256_add_ps(fiy2,ty);
970 fiz2 = _mm256_add_ps(fiz2,tz);
972 fjx2 = _mm256_add_ps(fjx2,tx);
973 fjy2 = _mm256_add_ps(fjy2,ty);
974 fjz2 = _mm256_add_ps(fjz2,tz);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 r23 = _mm256_mul_ps(rsq23,rinv23);
981 r23 = _mm256_andnot_ps(dummy_mask,r23);
983 /* EWALD ELECTROSTATICS */
985 /* Analytical PME correction */
986 zeta2 = _mm256_mul_ps(beta2,rsq23);
987 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
988 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
989 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
990 felec = _mm256_mul_ps(qq23,felec);
991 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
992 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
993 velec = _mm256_sub_ps(rinv23,pmecorrV);
994 velec = _mm256_mul_ps(qq23,velec);
996 /* Update potential sum for this i atom from the interaction with this j atom. */
997 velec = _mm256_andnot_ps(dummy_mask,velec);
998 velecsum = _mm256_add_ps(velecsum,velec);
1002 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1004 /* Calculate temporary vectorial force */
1005 tx = _mm256_mul_ps(fscal,dx23);
1006 ty = _mm256_mul_ps(fscal,dy23);
1007 tz = _mm256_mul_ps(fscal,dz23);
1009 /* Update vectorial force */
1010 fix2 = _mm256_add_ps(fix2,tx);
1011 fiy2 = _mm256_add_ps(fiy2,ty);
1012 fiz2 = _mm256_add_ps(fiz2,tz);
1014 fjx3 = _mm256_add_ps(fjx3,tx);
1015 fjy3 = _mm256_add_ps(fjy3,ty);
1016 fjz3 = _mm256_add_ps(fjz3,tz);
1018 /**************************
1019 * CALCULATE INTERACTIONS *
1020 **************************/
1022 r31 = _mm256_mul_ps(rsq31,rinv31);
1023 r31 = _mm256_andnot_ps(dummy_mask,r31);
1025 /* EWALD ELECTROSTATICS */
1027 /* Analytical PME correction */
1028 zeta2 = _mm256_mul_ps(beta2,rsq31);
1029 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1030 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1031 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1032 felec = _mm256_mul_ps(qq31,felec);
1033 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1034 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1035 velec = _mm256_sub_ps(rinv31,pmecorrV);
1036 velec = _mm256_mul_ps(qq31,velec);
1038 /* Update potential sum for this i atom from the interaction with this j atom. */
1039 velec = _mm256_andnot_ps(dummy_mask,velec);
1040 velecsum = _mm256_add_ps(velecsum,velec);
1044 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1046 /* Calculate temporary vectorial force */
1047 tx = _mm256_mul_ps(fscal,dx31);
1048 ty = _mm256_mul_ps(fscal,dy31);
1049 tz = _mm256_mul_ps(fscal,dz31);
1051 /* Update vectorial force */
1052 fix3 = _mm256_add_ps(fix3,tx);
1053 fiy3 = _mm256_add_ps(fiy3,ty);
1054 fiz3 = _mm256_add_ps(fiz3,tz);
1056 fjx1 = _mm256_add_ps(fjx1,tx);
1057 fjy1 = _mm256_add_ps(fjy1,ty);
1058 fjz1 = _mm256_add_ps(fjz1,tz);
1060 /**************************
1061 * CALCULATE INTERACTIONS *
1062 **************************/
1064 r32 = _mm256_mul_ps(rsq32,rinv32);
1065 r32 = _mm256_andnot_ps(dummy_mask,r32);
1067 /* EWALD ELECTROSTATICS */
1069 /* Analytical PME correction */
1070 zeta2 = _mm256_mul_ps(beta2,rsq32);
1071 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1072 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1073 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1074 felec = _mm256_mul_ps(qq32,felec);
1075 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1076 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1077 velec = _mm256_sub_ps(rinv32,pmecorrV);
1078 velec = _mm256_mul_ps(qq32,velec);
1080 /* Update potential sum for this i atom from the interaction with this j atom. */
1081 velec = _mm256_andnot_ps(dummy_mask,velec);
1082 velecsum = _mm256_add_ps(velecsum,velec);
1086 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1088 /* Calculate temporary vectorial force */
1089 tx = _mm256_mul_ps(fscal,dx32);
1090 ty = _mm256_mul_ps(fscal,dy32);
1091 tz = _mm256_mul_ps(fscal,dz32);
1093 /* Update vectorial force */
1094 fix3 = _mm256_add_ps(fix3,tx);
1095 fiy3 = _mm256_add_ps(fiy3,ty);
1096 fiz3 = _mm256_add_ps(fiz3,tz);
1098 fjx2 = _mm256_add_ps(fjx2,tx);
1099 fjy2 = _mm256_add_ps(fjy2,ty);
1100 fjz2 = _mm256_add_ps(fjz2,tz);
1102 /**************************
1103 * CALCULATE INTERACTIONS *
1104 **************************/
1106 r33 = _mm256_mul_ps(rsq33,rinv33);
1107 r33 = _mm256_andnot_ps(dummy_mask,r33);
1109 /* EWALD ELECTROSTATICS */
1111 /* Analytical PME correction */
1112 zeta2 = _mm256_mul_ps(beta2,rsq33);
1113 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1114 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1115 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1116 felec = _mm256_mul_ps(qq33,felec);
1117 pmecorrV = gmx_mm256_pmecorrV_ps(zeta2);
1118 pmecorrV = _mm256_mul_ps(pmecorrV,beta);
1119 velec = _mm256_sub_ps(rinv33,pmecorrV);
1120 velec = _mm256_mul_ps(qq33,velec);
1122 /* Update potential sum for this i atom from the interaction with this j atom. */
1123 velec = _mm256_andnot_ps(dummy_mask,velec);
1124 velecsum = _mm256_add_ps(velecsum,velec);
1128 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1130 /* Calculate temporary vectorial force */
1131 tx = _mm256_mul_ps(fscal,dx33);
1132 ty = _mm256_mul_ps(fscal,dy33);
1133 tz = _mm256_mul_ps(fscal,dz33);
1135 /* Update vectorial force */
1136 fix3 = _mm256_add_ps(fix3,tx);
1137 fiy3 = _mm256_add_ps(fiy3,ty);
1138 fiz3 = _mm256_add_ps(fiz3,tz);
1140 fjx3 = _mm256_add_ps(fjx3,tx);
1141 fjy3 = _mm256_add_ps(fjy3,ty);
1142 fjz3 = _mm256_add_ps(fjz3,tz);
1144 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1145 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1146 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1147 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1148 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1149 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1150 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1151 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1153 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1154 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1155 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1157 /* Inner loop uses 765 flops */
1160 /* End of innermost loop */
1162 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1163 f+i_coord_offset+DIM,fshift+i_shift_offset);
1166 /* Update potential energies */
1167 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1169 /* Increment number of inner iterations */
1170 inneriter += j_index_end - j_index_start;
1172 /* Outer loop uses 19 flops */
1175 /* Increment number of outer iterations */
1178 /* Update outer/inner flops */
1180 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*765);
1183 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_single
1184 * Electrostatics interaction: Ewald
1185 * VdW interaction: None
1186 * Geometry: Water4-Water4
1187 * Calculate force/pot: Force
1190 nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_single
1191 (t_nblist * gmx_restrict nlist,
1192 rvec * gmx_restrict xx,
1193 rvec * gmx_restrict ff,
1194 t_forcerec * gmx_restrict fr,
1195 t_mdatoms * gmx_restrict mdatoms,
1196 nb_kernel_data_t * gmx_restrict kernel_data,
1197 t_nrnb * gmx_restrict nrnb)
1199 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1200 * just 0 for non-waters.
1201 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1202 * jnr indices corresponding to data put in the four positions in the SIMD register.
1204 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1205 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1206 int jnrA,jnrB,jnrC,jnrD;
1207 int jnrE,jnrF,jnrG,jnrH;
1208 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1209 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1210 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1211 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1212 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1213 real rcutoff_scalar;
1214 real *shiftvec,*fshift,*x,*f;
1215 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1216 real scratch[4*DIM];
1217 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1218 real * vdwioffsetptr1;
1219 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1220 real * vdwioffsetptr2;
1221 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1222 real * vdwioffsetptr3;
1223 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1224 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1225 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1226 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1227 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1228 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1229 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1230 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1231 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1232 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1233 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1234 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1235 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1236 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1237 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1238 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1239 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1242 __m128i ewitab_lo,ewitab_hi;
1243 __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1244 __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1246 __m256 dummy_mask,cutoff_mask;
1247 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1248 __m256 one = _mm256_set1_ps(1.0);
1249 __m256 two = _mm256_set1_ps(2.0);
1255 jindex = nlist->jindex;
1257 shiftidx = nlist->shift;
1259 shiftvec = fr->shift_vec[0];
1260 fshift = fr->fshift[0];
1261 facel = _mm256_set1_ps(fr->epsfac);
1262 charge = mdatoms->chargeA;
1264 sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald);
1265 beta = _mm256_set1_ps(fr->ic->ewaldcoeff);
1266 beta2 = _mm256_mul_ps(beta,beta);
1267 beta3 = _mm256_mul_ps(beta,beta2);
1269 ewtab = fr->ic->tabq_coul_F;
1270 ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale);
1271 ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1273 /* Setup water-specific parameters */
1274 inr = nlist->iinr[0];
1275 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1276 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1277 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1279 jq1 = _mm256_set1_ps(charge[inr+1]);
1280 jq2 = _mm256_set1_ps(charge[inr+2]);
1281 jq3 = _mm256_set1_ps(charge[inr+3]);
1282 qq11 = _mm256_mul_ps(iq1,jq1);
1283 qq12 = _mm256_mul_ps(iq1,jq2);
1284 qq13 = _mm256_mul_ps(iq1,jq3);
1285 qq21 = _mm256_mul_ps(iq2,jq1);
1286 qq22 = _mm256_mul_ps(iq2,jq2);
1287 qq23 = _mm256_mul_ps(iq2,jq3);
1288 qq31 = _mm256_mul_ps(iq3,jq1);
1289 qq32 = _mm256_mul_ps(iq3,jq2);
1290 qq33 = _mm256_mul_ps(iq3,jq3);
1292 /* Avoid stupid compiler warnings */
1293 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1294 j_coord_offsetA = 0;
1295 j_coord_offsetB = 0;
1296 j_coord_offsetC = 0;
1297 j_coord_offsetD = 0;
1298 j_coord_offsetE = 0;
1299 j_coord_offsetF = 0;
1300 j_coord_offsetG = 0;
1301 j_coord_offsetH = 0;
1306 for(iidx=0;iidx<4*DIM;iidx++)
1308 scratch[iidx] = 0.0;
1311 /* Start outer loop over neighborlists */
1312 for(iidx=0; iidx<nri; iidx++)
1314 /* Load shift vector for this list */
1315 i_shift_offset = DIM*shiftidx[iidx];
1317 /* Load limits for loop over neighbors */
1318 j_index_start = jindex[iidx];
1319 j_index_end = jindex[iidx+1];
1321 /* Get outer coordinate index */
1323 i_coord_offset = DIM*inr;
1325 /* Load i particle coords and add shift vector */
1326 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1327 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1329 fix1 = _mm256_setzero_ps();
1330 fiy1 = _mm256_setzero_ps();
1331 fiz1 = _mm256_setzero_ps();
1332 fix2 = _mm256_setzero_ps();
1333 fiy2 = _mm256_setzero_ps();
1334 fiz2 = _mm256_setzero_ps();
1335 fix3 = _mm256_setzero_ps();
1336 fiy3 = _mm256_setzero_ps();
1337 fiz3 = _mm256_setzero_ps();
1339 /* Start inner kernel loop */
1340 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1343 /* Get j neighbor index, and coordinate index */
1345 jnrB = jjnr[jidx+1];
1346 jnrC = jjnr[jidx+2];
1347 jnrD = jjnr[jidx+3];
1348 jnrE = jjnr[jidx+4];
1349 jnrF = jjnr[jidx+5];
1350 jnrG = jjnr[jidx+6];
1351 jnrH = jjnr[jidx+7];
1352 j_coord_offsetA = DIM*jnrA;
1353 j_coord_offsetB = DIM*jnrB;
1354 j_coord_offsetC = DIM*jnrC;
1355 j_coord_offsetD = DIM*jnrD;
1356 j_coord_offsetE = DIM*jnrE;
1357 j_coord_offsetF = DIM*jnrF;
1358 j_coord_offsetG = DIM*jnrG;
1359 j_coord_offsetH = DIM*jnrH;
1361 /* load j atom coordinates */
1362 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1363 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1364 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1365 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1366 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1368 /* Calculate displacement vector */
1369 dx11 = _mm256_sub_ps(ix1,jx1);
1370 dy11 = _mm256_sub_ps(iy1,jy1);
1371 dz11 = _mm256_sub_ps(iz1,jz1);
1372 dx12 = _mm256_sub_ps(ix1,jx2);
1373 dy12 = _mm256_sub_ps(iy1,jy2);
1374 dz12 = _mm256_sub_ps(iz1,jz2);
1375 dx13 = _mm256_sub_ps(ix1,jx3);
1376 dy13 = _mm256_sub_ps(iy1,jy3);
1377 dz13 = _mm256_sub_ps(iz1,jz3);
1378 dx21 = _mm256_sub_ps(ix2,jx1);
1379 dy21 = _mm256_sub_ps(iy2,jy1);
1380 dz21 = _mm256_sub_ps(iz2,jz1);
1381 dx22 = _mm256_sub_ps(ix2,jx2);
1382 dy22 = _mm256_sub_ps(iy2,jy2);
1383 dz22 = _mm256_sub_ps(iz2,jz2);
1384 dx23 = _mm256_sub_ps(ix2,jx3);
1385 dy23 = _mm256_sub_ps(iy2,jy3);
1386 dz23 = _mm256_sub_ps(iz2,jz3);
1387 dx31 = _mm256_sub_ps(ix3,jx1);
1388 dy31 = _mm256_sub_ps(iy3,jy1);
1389 dz31 = _mm256_sub_ps(iz3,jz1);
1390 dx32 = _mm256_sub_ps(ix3,jx2);
1391 dy32 = _mm256_sub_ps(iy3,jy2);
1392 dz32 = _mm256_sub_ps(iz3,jz2);
1393 dx33 = _mm256_sub_ps(ix3,jx3);
1394 dy33 = _mm256_sub_ps(iy3,jy3);
1395 dz33 = _mm256_sub_ps(iz3,jz3);
1397 /* Calculate squared distance and things based on it */
1398 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1399 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1400 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1401 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1402 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1403 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1404 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1405 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1406 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1408 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1409 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1410 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1411 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1412 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1413 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1414 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1415 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1416 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1418 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1419 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1420 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1421 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1422 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1423 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1424 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1425 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1426 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1428 fjx1 = _mm256_setzero_ps();
1429 fjy1 = _mm256_setzero_ps();
1430 fjz1 = _mm256_setzero_ps();
1431 fjx2 = _mm256_setzero_ps();
1432 fjy2 = _mm256_setzero_ps();
1433 fjz2 = _mm256_setzero_ps();
1434 fjx3 = _mm256_setzero_ps();
1435 fjy3 = _mm256_setzero_ps();
1436 fjz3 = _mm256_setzero_ps();
1438 /**************************
1439 * CALCULATE INTERACTIONS *
1440 **************************/
1442 r11 = _mm256_mul_ps(rsq11,rinv11);
1444 /* EWALD ELECTROSTATICS */
1446 /* Analytical PME correction */
1447 zeta2 = _mm256_mul_ps(beta2,rsq11);
1448 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1449 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1450 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1451 felec = _mm256_mul_ps(qq11,felec);
1455 /* Calculate temporary vectorial force */
1456 tx = _mm256_mul_ps(fscal,dx11);
1457 ty = _mm256_mul_ps(fscal,dy11);
1458 tz = _mm256_mul_ps(fscal,dz11);
1460 /* Update vectorial force */
1461 fix1 = _mm256_add_ps(fix1,tx);
1462 fiy1 = _mm256_add_ps(fiy1,ty);
1463 fiz1 = _mm256_add_ps(fiz1,tz);
1465 fjx1 = _mm256_add_ps(fjx1,tx);
1466 fjy1 = _mm256_add_ps(fjy1,ty);
1467 fjz1 = _mm256_add_ps(fjz1,tz);
1469 /**************************
1470 * CALCULATE INTERACTIONS *
1471 **************************/
1473 r12 = _mm256_mul_ps(rsq12,rinv12);
1475 /* EWALD ELECTROSTATICS */
1477 /* Analytical PME correction */
1478 zeta2 = _mm256_mul_ps(beta2,rsq12);
1479 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1480 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1481 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1482 felec = _mm256_mul_ps(qq12,felec);
1486 /* Calculate temporary vectorial force */
1487 tx = _mm256_mul_ps(fscal,dx12);
1488 ty = _mm256_mul_ps(fscal,dy12);
1489 tz = _mm256_mul_ps(fscal,dz12);
1491 /* Update vectorial force */
1492 fix1 = _mm256_add_ps(fix1,tx);
1493 fiy1 = _mm256_add_ps(fiy1,ty);
1494 fiz1 = _mm256_add_ps(fiz1,tz);
1496 fjx2 = _mm256_add_ps(fjx2,tx);
1497 fjy2 = _mm256_add_ps(fjy2,ty);
1498 fjz2 = _mm256_add_ps(fjz2,tz);
1500 /**************************
1501 * CALCULATE INTERACTIONS *
1502 **************************/
1504 r13 = _mm256_mul_ps(rsq13,rinv13);
1506 /* EWALD ELECTROSTATICS */
1508 /* Analytical PME correction */
1509 zeta2 = _mm256_mul_ps(beta2,rsq13);
1510 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1511 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1512 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1513 felec = _mm256_mul_ps(qq13,felec);
1517 /* Calculate temporary vectorial force */
1518 tx = _mm256_mul_ps(fscal,dx13);
1519 ty = _mm256_mul_ps(fscal,dy13);
1520 tz = _mm256_mul_ps(fscal,dz13);
1522 /* Update vectorial force */
1523 fix1 = _mm256_add_ps(fix1,tx);
1524 fiy1 = _mm256_add_ps(fiy1,ty);
1525 fiz1 = _mm256_add_ps(fiz1,tz);
1527 fjx3 = _mm256_add_ps(fjx3,tx);
1528 fjy3 = _mm256_add_ps(fjy3,ty);
1529 fjz3 = _mm256_add_ps(fjz3,tz);
1531 /**************************
1532 * CALCULATE INTERACTIONS *
1533 **************************/
1535 r21 = _mm256_mul_ps(rsq21,rinv21);
1537 /* EWALD ELECTROSTATICS */
1539 /* Analytical PME correction */
1540 zeta2 = _mm256_mul_ps(beta2,rsq21);
1541 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1542 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1543 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1544 felec = _mm256_mul_ps(qq21,felec);
1548 /* Calculate temporary vectorial force */
1549 tx = _mm256_mul_ps(fscal,dx21);
1550 ty = _mm256_mul_ps(fscal,dy21);
1551 tz = _mm256_mul_ps(fscal,dz21);
1553 /* Update vectorial force */
1554 fix2 = _mm256_add_ps(fix2,tx);
1555 fiy2 = _mm256_add_ps(fiy2,ty);
1556 fiz2 = _mm256_add_ps(fiz2,tz);
1558 fjx1 = _mm256_add_ps(fjx1,tx);
1559 fjy1 = _mm256_add_ps(fjy1,ty);
1560 fjz1 = _mm256_add_ps(fjz1,tz);
1562 /**************************
1563 * CALCULATE INTERACTIONS *
1564 **************************/
1566 r22 = _mm256_mul_ps(rsq22,rinv22);
1568 /* EWALD ELECTROSTATICS */
1570 /* Analytical PME correction */
1571 zeta2 = _mm256_mul_ps(beta2,rsq22);
1572 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1573 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1574 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1575 felec = _mm256_mul_ps(qq22,felec);
1579 /* Calculate temporary vectorial force */
1580 tx = _mm256_mul_ps(fscal,dx22);
1581 ty = _mm256_mul_ps(fscal,dy22);
1582 tz = _mm256_mul_ps(fscal,dz22);
1584 /* Update vectorial force */
1585 fix2 = _mm256_add_ps(fix2,tx);
1586 fiy2 = _mm256_add_ps(fiy2,ty);
1587 fiz2 = _mm256_add_ps(fiz2,tz);
1589 fjx2 = _mm256_add_ps(fjx2,tx);
1590 fjy2 = _mm256_add_ps(fjy2,ty);
1591 fjz2 = _mm256_add_ps(fjz2,tz);
1593 /**************************
1594 * CALCULATE INTERACTIONS *
1595 **************************/
1597 r23 = _mm256_mul_ps(rsq23,rinv23);
1599 /* EWALD ELECTROSTATICS */
1601 /* Analytical PME correction */
1602 zeta2 = _mm256_mul_ps(beta2,rsq23);
1603 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
1604 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1605 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1606 felec = _mm256_mul_ps(qq23,felec);
1610 /* Calculate temporary vectorial force */
1611 tx = _mm256_mul_ps(fscal,dx23);
1612 ty = _mm256_mul_ps(fscal,dy23);
1613 tz = _mm256_mul_ps(fscal,dz23);
1615 /* Update vectorial force */
1616 fix2 = _mm256_add_ps(fix2,tx);
1617 fiy2 = _mm256_add_ps(fiy2,ty);
1618 fiz2 = _mm256_add_ps(fiz2,tz);
1620 fjx3 = _mm256_add_ps(fjx3,tx);
1621 fjy3 = _mm256_add_ps(fjy3,ty);
1622 fjz3 = _mm256_add_ps(fjz3,tz);
1624 /**************************
1625 * CALCULATE INTERACTIONS *
1626 **************************/
1628 r31 = _mm256_mul_ps(rsq31,rinv31);
1630 /* EWALD ELECTROSTATICS */
1632 /* Analytical PME correction */
1633 zeta2 = _mm256_mul_ps(beta2,rsq31);
1634 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
1635 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1636 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1637 felec = _mm256_mul_ps(qq31,felec);
1641 /* Calculate temporary vectorial force */
1642 tx = _mm256_mul_ps(fscal,dx31);
1643 ty = _mm256_mul_ps(fscal,dy31);
1644 tz = _mm256_mul_ps(fscal,dz31);
1646 /* Update vectorial force */
1647 fix3 = _mm256_add_ps(fix3,tx);
1648 fiy3 = _mm256_add_ps(fiy3,ty);
1649 fiz3 = _mm256_add_ps(fiz3,tz);
1651 fjx1 = _mm256_add_ps(fjx1,tx);
1652 fjy1 = _mm256_add_ps(fjy1,ty);
1653 fjz1 = _mm256_add_ps(fjz1,tz);
1655 /**************************
1656 * CALCULATE INTERACTIONS *
1657 **************************/
1659 r32 = _mm256_mul_ps(rsq32,rinv32);
1661 /* EWALD ELECTROSTATICS */
1663 /* Analytical PME correction */
1664 zeta2 = _mm256_mul_ps(beta2,rsq32);
1665 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
1666 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1667 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1668 felec = _mm256_mul_ps(qq32,felec);
1672 /* Calculate temporary vectorial force */
1673 tx = _mm256_mul_ps(fscal,dx32);
1674 ty = _mm256_mul_ps(fscal,dy32);
1675 tz = _mm256_mul_ps(fscal,dz32);
1677 /* Update vectorial force */
1678 fix3 = _mm256_add_ps(fix3,tx);
1679 fiy3 = _mm256_add_ps(fiy3,ty);
1680 fiz3 = _mm256_add_ps(fiz3,tz);
1682 fjx2 = _mm256_add_ps(fjx2,tx);
1683 fjy2 = _mm256_add_ps(fjy2,ty);
1684 fjz2 = _mm256_add_ps(fjz2,tz);
1686 /**************************
1687 * CALCULATE INTERACTIONS *
1688 **************************/
1690 r33 = _mm256_mul_ps(rsq33,rinv33);
1692 /* EWALD ELECTROSTATICS */
1694 /* Analytical PME correction */
1695 zeta2 = _mm256_mul_ps(beta2,rsq33);
1696 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
1697 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1698 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1699 felec = _mm256_mul_ps(qq33,felec);
1703 /* Calculate temporary vectorial force */
1704 tx = _mm256_mul_ps(fscal,dx33);
1705 ty = _mm256_mul_ps(fscal,dy33);
1706 tz = _mm256_mul_ps(fscal,dz33);
1708 /* Update vectorial force */
1709 fix3 = _mm256_add_ps(fix3,tx);
1710 fiy3 = _mm256_add_ps(fiy3,ty);
1711 fiz3 = _mm256_add_ps(fiz3,tz);
1713 fjx3 = _mm256_add_ps(fjx3,tx);
1714 fjy3 = _mm256_add_ps(fjy3,ty);
1715 fjz3 = _mm256_add_ps(fjz3,tz);
1717 fjptrA = f+j_coord_offsetA;
1718 fjptrB = f+j_coord_offsetB;
1719 fjptrC = f+j_coord_offsetC;
1720 fjptrD = f+j_coord_offsetD;
1721 fjptrE = f+j_coord_offsetE;
1722 fjptrF = f+j_coord_offsetF;
1723 fjptrG = f+j_coord_offsetG;
1724 fjptrH = f+j_coord_offsetH;
1726 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1727 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1728 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1730 /* Inner loop uses 504 flops */
1733 if(jidx<j_index_end)
1736 /* Get j neighbor index, and coordinate index */
1737 jnrlistA = jjnr[jidx];
1738 jnrlistB = jjnr[jidx+1];
1739 jnrlistC = jjnr[jidx+2];
1740 jnrlistD = jjnr[jidx+3];
1741 jnrlistE = jjnr[jidx+4];
1742 jnrlistF = jjnr[jidx+5];
1743 jnrlistG = jjnr[jidx+6];
1744 jnrlistH = jjnr[jidx+7];
1745 /* Sign of each element will be negative for non-real atoms.
1746 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1747 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1749 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1750 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1752 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1753 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1754 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1755 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1756 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1757 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1758 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1759 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1760 j_coord_offsetA = DIM*jnrA;
1761 j_coord_offsetB = DIM*jnrB;
1762 j_coord_offsetC = DIM*jnrC;
1763 j_coord_offsetD = DIM*jnrD;
1764 j_coord_offsetE = DIM*jnrE;
1765 j_coord_offsetF = DIM*jnrF;
1766 j_coord_offsetG = DIM*jnrG;
1767 j_coord_offsetH = DIM*jnrH;
1769 /* load j atom coordinates */
1770 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1771 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1772 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1773 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1774 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1776 /* Calculate displacement vector */
1777 dx11 = _mm256_sub_ps(ix1,jx1);
1778 dy11 = _mm256_sub_ps(iy1,jy1);
1779 dz11 = _mm256_sub_ps(iz1,jz1);
1780 dx12 = _mm256_sub_ps(ix1,jx2);
1781 dy12 = _mm256_sub_ps(iy1,jy2);
1782 dz12 = _mm256_sub_ps(iz1,jz2);
1783 dx13 = _mm256_sub_ps(ix1,jx3);
1784 dy13 = _mm256_sub_ps(iy1,jy3);
1785 dz13 = _mm256_sub_ps(iz1,jz3);
1786 dx21 = _mm256_sub_ps(ix2,jx1);
1787 dy21 = _mm256_sub_ps(iy2,jy1);
1788 dz21 = _mm256_sub_ps(iz2,jz1);
1789 dx22 = _mm256_sub_ps(ix2,jx2);
1790 dy22 = _mm256_sub_ps(iy2,jy2);
1791 dz22 = _mm256_sub_ps(iz2,jz2);
1792 dx23 = _mm256_sub_ps(ix2,jx3);
1793 dy23 = _mm256_sub_ps(iy2,jy3);
1794 dz23 = _mm256_sub_ps(iz2,jz3);
1795 dx31 = _mm256_sub_ps(ix3,jx1);
1796 dy31 = _mm256_sub_ps(iy3,jy1);
1797 dz31 = _mm256_sub_ps(iz3,jz1);
1798 dx32 = _mm256_sub_ps(ix3,jx2);
1799 dy32 = _mm256_sub_ps(iy3,jy2);
1800 dz32 = _mm256_sub_ps(iz3,jz2);
1801 dx33 = _mm256_sub_ps(ix3,jx3);
1802 dy33 = _mm256_sub_ps(iy3,jy3);
1803 dz33 = _mm256_sub_ps(iz3,jz3);
1805 /* Calculate squared distance and things based on it */
1806 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1807 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1808 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1809 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1810 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1811 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1812 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1813 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1814 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1816 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1817 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1818 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1819 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1820 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1821 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1822 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1823 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1824 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1826 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1827 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1828 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1829 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1830 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1831 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1832 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1833 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1834 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1836 fjx1 = _mm256_setzero_ps();
1837 fjy1 = _mm256_setzero_ps();
1838 fjz1 = _mm256_setzero_ps();
1839 fjx2 = _mm256_setzero_ps();
1840 fjy2 = _mm256_setzero_ps();
1841 fjz2 = _mm256_setzero_ps();
1842 fjx3 = _mm256_setzero_ps();
1843 fjy3 = _mm256_setzero_ps();
1844 fjz3 = _mm256_setzero_ps();
1846 /**************************
1847 * CALCULATE INTERACTIONS *
1848 **************************/
1850 r11 = _mm256_mul_ps(rsq11,rinv11);
1851 r11 = _mm256_andnot_ps(dummy_mask,r11);
1853 /* EWALD ELECTROSTATICS */
1855 /* Analytical PME correction */
1856 zeta2 = _mm256_mul_ps(beta2,rsq11);
1857 rinv3 = _mm256_mul_ps(rinvsq11,rinv11);
1858 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1859 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1860 felec = _mm256_mul_ps(qq11,felec);
1864 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1866 /* Calculate temporary vectorial force */
1867 tx = _mm256_mul_ps(fscal,dx11);
1868 ty = _mm256_mul_ps(fscal,dy11);
1869 tz = _mm256_mul_ps(fscal,dz11);
1871 /* Update vectorial force */
1872 fix1 = _mm256_add_ps(fix1,tx);
1873 fiy1 = _mm256_add_ps(fiy1,ty);
1874 fiz1 = _mm256_add_ps(fiz1,tz);
1876 fjx1 = _mm256_add_ps(fjx1,tx);
1877 fjy1 = _mm256_add_ps(fjy1,ty);
1878 fjz1 = _mm256_add_ps(fjz1,tz);
1880 /**************************
1881 * CALCULATE INTERACTIONS *
1882 **************************/
1884 r12 = _mm256_mul_ps(rsq12,rinv12);
1885 r12 = _mm256_andnot_ps(dummy_mask,r12);
1887 /* EWALD ELECTROSTATICS */
1889 /* Analytical PME correction */
1890 zeta2 = _mm256_mul_ps(beta2,rsq12);
1891 rinv3 = _mm256_mul_ps(rinvsq12,rinv12);
1892 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1893 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1894 felec = _mm256_mul_ps(qq12,felec);
1898 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1900 /* Calculate temporary vectorial force */
1901 tx = _mm256_mul_ps(fscal,dx12);
1902 ty = _mm256_mul_ps(fscal,dy12);
1903 tz = _mm256_mul_ps(fscal,dz12);
1905 /* Update vectorial force */
1906 fix1 = _mm256_add_ps(fix1,tx);
1907 fiy1 = _mm256_add_ps(fiy1,ty);
1908 fiz1 = _mm256_add_ps(fiz1,tz);
1910 fjx2 = _mm256_add_ps(fjx2,tx);
1911 fjy2 = _mm256_add_ps(fjy2,ty);
1912 fjz2 = _mm256_add_ps(fjz2,tz);
1914 /**************************
1915 * CALCULATE INTERACTIONS *
1916 **************************/
1918 r13 = _mm256_mul_ps(rsq13,rinv13);
1919 r13 = _mm256_andnot_ps(dummy_mask,r13);
1921 /* EWALD ELECTROSTATICS */
1923 /* Analytical PME correction */
1924 zeta2 = _mm256_mul_ps(beta2,rsq13);
1925 rinv3 = _mm256_mul_ps(rinvsq13,rinv13);
1926 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1927 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1928 felec = _mm256_mul_ps(qq13,felec);
1932 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1934 /* Calculate temporary vectorial force */
1935 tx = _mm256_mul_ps(fscal,dx13);
1936 ty = _mm256_mul_ps(fscal,dy13);
1937 tz = _mm256_mul_ps(fscal,dz13);
1939 /* Update vectorial force */
1940 fix1 = _mm256_add_ps(fix1,tx);
1941 fiy1 = _mm256_add_ps(fiy1,ty);
1942 fiz1 = _mm256_add_ps(fiz1,tz);
1944 fjx3 = _mm256_add_ps(fjx3,tx);
1945 fjy3 = _mm256_add_ps(fjy3,ty);
1946 fjz3 = _mm256_add_ps(fjz3,tz);
1948 /**************************
1949 * CALCULATE INTERACTIONS *
1950 **************************/
1952 r21 = _mm256_mul_ps(rsq21,rinv21);
1953 r21 = _mm256_andnot_ps(dummy_mask,r21);
1955 /* EWALD ELECTROSTATICS */
1957 /* Analytical PME correction */
1958 zeta2 = _mm256_mul_ps(beta2,rsq21);
1959 rinv3 = _mm256_mul_ps(rinvsq21,rinv21);
1960 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1961 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1962 felec = _mm256_mul_ps(qq21,felec);
1966 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1968 /* Calculate temporary vectorial force */
1969 tx = _mm256_mul_ps(fscal,dx21);
1970 ty = _mm256_mul_ps(fscal,dy21);
1971 tz = _mm256_mul_ps(fscal,dz21);
1973 /* Update vectorial force */
1974 fix2 = _mm256_add_ps(fix2,tx);
1975 fiy2 = _mm256_add_ps(fiy2,ty);
1976 fiz2 = _mm256_add_ps(fiz2,tz);
1978 fjx1 = _mm256_add_ps(fjx1,tx);
1979 fjy1 = _mm256_add_ps(fjy1,ty);
1980 fjz1 = _mm256_add_ps(fjz1,tz);
1982 /**************************
1983 * CALCULATE INTERACTIONS *
1984 **************************/
1986 r22 = _mm256_mul_ps(rsq22,rinv22);
1987 r22 = _mm256_andnot_ps(dummy_mask,r22);
1989 /* EWALD ELECTROSTATICS */
1991 /* Analytical PME correction */
1992 zeta2 = _mm256_mul_ps(beta2,rsq22);
1993 rinv3 = _mm256_mul_ps(rinvsq22,rinv22);
1994 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
1995 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1996 felec = _mm256_mul_ps(qq22,felec);
2000 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2002 /* Calculate temporary vectorial force */
2003 tx = _mm256_mul_ps(fscal,dx22);
2004 ty = _mm256_mul_ps(fscal,dy22);
2005 tz = _mm256_mul_ps(fscal,dz22);
2007 /* Update vectorial force */
2008 fix2 = _mm256_add_ps(fix2,tx);
2009 fiy2 = _mm256_add_ps(fiy2,ty);
2010 fiz2 = _mm256_add_ps(fiz2,tz);
2012 fjx2 = _mm256_add_ps(fjx2,tx);
2013 fjy2 = _mm256_add_ps(fjy2,ty);
2014 fjz2 = _mm256_add_ps(fjz2,tz);
2016 /**************************
2017 * CALCULATE INTERACTIONS *
2018 **************************/
2020 r23 = _mm256_mul_ps(rsq23,rinv23);
2021 r23 = _mm256_andnot_ps(dummy_mask,r23);
2023 /* EWALD ELECTROSTATICS */
2025 /* Analytical PME correction */
2026 zeta2 = _mm256_mul_ps(beta2,rsq23);
2027 rinv3 = _mm256_mul_ps(rinvsq23,rinv23);
2028 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2029 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2030 felec = _mm256_mul_ps(qq23,felec);
2034 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2036 /* Calculate temporary vectorial force */
2037 tx = _mm256_mul_ps(fscal,dx23);
2038 ty = _mm256_mul_ps(fscal,dy23);
2039 tz = _mm256_mul_ps(fscal,dz23);
2041 /* Update vectorial force */
2042 fix2 = _mm256_add_ps(fix2,tx);
2043 fiy2 = _mm256_add_ps(fiy2,ty);
2044 fiz2 = _mm256_add_ps(fiz2,tz);
2046 fjx3 = _mm256_add_ps(fjx3,tx);
2047 fjy3 = _mm256_add_ps(fjy3,ty);
2048 fjz3 = _mm256_add_ps(fjz3,tz);
2050 /**************************
2051 * CALCULATE INTERACTIONS *
2052 **************************/
2054 r31 = _mm256_mul_ps(rsq31,rinv31);
2055 r31 = _mm256_andnot_ps(dummy_mask,r31);
2057 /* EWALD ELECTROSTATICS */
2059 /* Analytical PME correction */
2060 zeta2 = _mm256_mul_ps(beta2,rsq31);
2061 rinv3 = _mm256_mul_ps(rinvsq31,rinv31);
2062 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2063 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2064 felec = _mm256_mul_ps(qq31,felec);
2068 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2070 /* Calculate temporary vectorial force */
2071 tx = _mm256_mul_ps(fscal,dx31);
2072 ty = _mm256_mul_ps(fscal,dy31);
2073 tz = _mm256_mul_ps(fscal,dz31);
2075 /* Update vectorial force */
2076 fix3 = _mm256_add_ps(fix3,tx);
2077 fiy3 = _mm256_add_ps(fiy3,ty);
2078 fiz3 = _mm256_add_ps(fiz3,tz);
2080 fjx1 = _mm256_add_ps(fjx1,tx);
2081 fjy1 = _mm256_add_ps(fjy1,ty);
2082 fjz1 = _mm256_add_ps(fjz1,tz);
2084 /**************************
2085 * CALCULATE INTERACTIONS *
2086 **************************/
2088 r32 = _mm256_mul_ps(rsq32,rinv32);
2089 r32 = _mm256_andnot_ps(dummy_mask,r32);
2091 /* EWALD ELECTROSTATICS */
2093 /* Analytical PME correction */
2094 zeta2 = _mm256_mul_ps(beta2,rsq32);
2095 rinv3 = _mm256_mul_ps(rinvsq32,rinv32);
2096 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2097 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2098 felec = _mm256_mul_ps(qq32,felec);
2102 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2104 /* Calculate temporary vectorial force */
2105 tx = _mm256_mul_ps(fscal,dx32);
2106 ty = _mm256_mul_ps(fscal,dy32);
2107 tz = _mm256_mul_ps(fscal,dz32);
2109 /* Update vectorial force */
2110 fix3 = _mm256_add_ps(fix3,tx);
2111 fiy3 = _mm256_add_ps(fiy3,ty);
2112 fiz3 = _mm256_add_ps(fiz3,tz);
2114 fjx2 = _mm256_add_ps(fjx2,tx);
2115 fjy2 = _mm256_add_ps(fjy2,ty);
2116 fjz2 = _mm256_add_ps(fjz2,tz);
2118 /**************************
2119 * CALCULATE INTERACTIONS *
2120 **************************/
2122 r33 = _mm256_mul_ps(rsq33,rinv33);
2123 r33 = _mm256_andnot_ps(dummy_mask,r33);
2125 /* EWALD ELECTROSTATICS */
2127 /* Analytical PME correction */
2128 zeta2 = _mm256_mul_ps(beta2,rsq33);
2129 rinv3 = _mm256_mul_ps(rinvsq33,rinv33);
2130 pmecorrF = gmx_mm256_pmecorrF_ps(zeta2);
2131 felec = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2132 felec = _mm256_mul_ps(qq33,felec);
2136 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2138 /* Calculate temporary vectorial force */
2139 tx = _mm256_mul_ps(fscal,dx33);
2140 ty = _mm256_mul_ps(fscal,dy33);
2141 tz = _mm256_mul_ps(fscal,dz33);
2143 /* Update vectorial force */
2144 fix3 = _mm256_add_ps(fix3,tx);
2145 fiy3 = _mm256_add_ps(fiy3,ty);
2146 fiz3 = _mm256_add_ps(fiz3,tz);
2148 fjx3 = _mm256_add_ps(fjx3,tx);
2149 fjy3 = _mm256_add_ps(fjy3,ty);
2150 fjz3 = _mm256_add_ps(fjz3,tz);
2152 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2153 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2154 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2155 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2156 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2157 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2158 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2159 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2161 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2162 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2163 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2165 /* Inner loop uses 513 flops */
2168 /* End of innermost loop */
2170 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2171 f+i_coord_offset+DIM,fshift+i_shift_offset);
2173 /* Increment number of inner iterations */
2174 inneriter += j_index_end - j_index_start;
2176 /* Outer loop uses 18 flops */
2179 /* Increment number of outer iterations */
2182 /* Update outer/inner flops */
2184 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*513);