2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_single
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
/*
 * Water3-Water3 Coulomb kernel (no VdW) that accumulates both the potential
 * and the forces, written with AVX-256 single-precision intrinsics.
 *
 * What the visible code reads from / writes to its arguments:
 *   nlist       - jindex (start/end of the j range for each i entry), shift
 *                 (shift index for each i entry) and iinr (iinr[0] selects the
 *                 three charges used for both the i and the j sites).
 *   fr          - shift_vec, fshift and epsfac (epsfac is broadcast into facel).
 *   mdatoms     - chargeA, the charge array.
 *   kernel_data - energygrp_elec, the destination of the summed potential.
 *   nrnb        - flop accounting, updated once at the end through inc_nrnb().
 *   xx, ff      - NOTE(review): presumably the coordinate and force arrays
 *                 behind the local x and f pointers; confirm in the full file.
 */
44 nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m256 dummy_mask,cutoff_mask;
96 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
97 __m256 one = _mm256_set1_ps(1.0);
98 __m256 two = _mm256_set1_ps(2.0);
104 jindex = nlist->jindex;
106 shiftidx = nlist->shift;
108 shiftvec = fr->shift_vec[0];
109 fshift = fr->fshift[0];
110 facel = _mm256_set1_ps(fr->epsfac);
111 charge = mdatoms->chargeA;
113 /* Setup water-specific parameters */
114 inr = nlist->iinr[0];
/* The i-site charges are pre-multiplied by facel; the j-site charges reuse the same three entries (inr+0..2). */
115 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
116 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
117 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
119 jq0 = _mm256_set1_ps(charge[inr+0]);
120 jq1 = _mm256_set1_ps(charge[inr+1]);
121 jq2 = _mm256_set1_ps(charge[inr+2]);
/* All nine i-site/j-site charge products, computed once here ahead of the loops. */
122 qq00 = _mm256_mul_ps(iq0,jq0);
123 qq01 = _mm256_mul_ps(iq0,jq1);
124 qq02 = _mm256_mul_ps(iq0,jq2);
125 qq10 = _mm256_mul_ps(iq1,jq0);
126 qq11 = _mm256_mul_ps(iq1,jq1);
127 qq12 = _mm256_mul_ps(iq1,jq2);
128 qq20 = _mm256_mul_ps(iq2,jq0);
129 qq21 = _mm256_mul_ps(iq2,jq1);
130 qq22 = _mm256_mul_ps(iq2,jq2);
132 /* Avoid stupid compiler warnings */
133 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
146 for(iidx=0;iidx<4*DIM;iidx++)
151 /* Start outer loop over neighborlists */
152 for(iidx=0; iidx<nri; iidx++)
154 /* Load shift vector for this list */
155 i_shift_offset = DIM*shiftidx[iidx];
157 /* Load limits for loop over neighbors */
158 j_index_start = jindex[iidx];
159 j_index_end = jindex[iidx+1];
161 /* Get outer coordinate index */
163 i_coord_offset = DIM*inr;
165 /* Load i particle coords and add shift vector */
166 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
167 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
169 fix0 = _mm256_setzero_ps();
170 fiy0 = _mm256_setzero_ps();
171 fiz0 = _mm256_setzero_ps();
172 fix1 = _mm256_setzero_ps();
173 fiy1 = _mm256_setzero_ps();
174 fiz1 = _mm256_setzero_ps();
175 fix2 = _mm256_setzero_ps();
176 fiy2 = _mm256_setzero_ps();
177 fiz2 = _mm256_setzero_ps();
179 /* Reset potential sums */
180 velecsum = _mm256_setzero_ps();
182 /* Start inner kernel loop */
/* jidx advances in steps of eight; a trip is taken only while jjnr[jidx+7] is non-negative. */
183 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
186 /* Get j neighbor index, and coordinate index */
195 j_coord_offsetA = DIM*jnrA;
196 j_coord_offsetB = DIM*jnrB;
197 j_coord_offsetC = DIM*jnrC;
198 j_coord_offsetD = DIM*jnrD;
199 j_coord_offsetE = DIM*jnrE;
200 j_coord_offsetF = DIM*jnrF;
201 j_coord_offsetG = DIM*jnrG;
202 j_coord_offsetH = DIM*jnrH;
204 /* load j atom coordinates */
205 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
206 x+j_coord_offsetC,x+j_coord_offsetD,
207 x+j_coord_offsetE,x+j_coord_offsetF,
208 x+j_coord_offsetG,x+j_coord_offsetH,
209 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
211 /* Calculate displacement vector */
212 dx00 = _mm256_sub_ps(ix0,jx0);
213 dy00 = _mm256_sub_ps(iy0,jy0);
214 dz00 = _mm256_sub_ps(iz0,jz0);
215 dx01 = _mm256_sub_ps(ix0,jx1);
216 dy01 = _mm256_sub_ps(iy0,jy1);
217 dz01 = _mm256_sub_ps(iz0,jz1);
218 dx02 = _mm256_sub_ps(ix0,jx2);
219 dy02 = _mm256_sub_ps(iy0,jy2);
220 dz02 = _mm256_sub_ps(iz0,jz2);
221 dx10 = _mm256_sub_ps(ix1,jx0);
222 dy10 = _mm256_sub_ps(iy1,jy0);
223 dz10 = _mm256_sub_ps(iz1,jz0);
224 dx11 = _mm256_sub_ps(ix1,jx1);
225 dy11 = _mm256_sub_ps(iy1,jy1);
226 dz11 = _mm256_sub_ps(iz1,jz1);
227 dx12 = _mm256_sub_ps(ix1,jx2);
228 dy12 = _mm256_sub_ps(iy1,jy2);
229 dz12 = _mm256_sub_ps(iz1,jz2);
230 dx20 = _mm256_sub_ps(ix2,jx0);
231 dy20 = _mm256_sub_ps(iy2,jy0);
232 dz20 = _mm256_sub_ps(iz2,jz0);
233 dx21 = _mm256_sub_ps(ix2,jx1);
234 dy21 = _mm256_sub_ps(iy2,jy1);
235 dz21 = _mm256_sub_ps(iz2,jz1);
236 dx22 = _mm256_sub_ps(ix2,jx2);
237 dy22 = _mm256_sub_ps(iy2,jy2);
238 dz22 = _mm256_sub_ps(iz2,jz2);
240 /* Calculate squared distance and things based on it */
241 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
242 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
243 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
244 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
245 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
246 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
247 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
248 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
249 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
/* rinvXY comes from gmx_mm256_invsqrt_ps(rsqXY); rinvsqXY below is its square. */
251 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
252 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
253 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
254 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
255 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
256 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
257 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
258 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
259 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
261 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
262 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
263 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
264 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
265 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
266 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
267 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
268 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
269 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
271 fjx0 = _mm256_setzero_ps();
272 fjy0 = _mm256_setzero_ps();
273 fjz0 = _mm256_setzero_ps();
274 fjx1 = _mm256_setzero_ps();
275 fjy1 = _mm256_setzero_ps();
276 fjz1 = _mm256_setzero_ps();
277 fjx2 = _mm256_setzero_ps();
278 fjy2 = _mm256_setzero_ps();
279 fjz2 = _mm256_setzero_ps();
281 /**************************
282 * CALCULATE INTERACTIONS *
283 **************************/
285 /* COULOMB ELECTROSTATICS */
286 velec = _mm256_mul_ps(qq00,rinv00);
287 felec = _mm256_mul_ps(velec,rinvsq00);
289 /* Update potential sum for this i atom from the interaction with this j atom. */
290 velecsum = _mm256_add_ps(velecsum,velec);
294 /* Calculate temporary vectorial force */
295 tx = _mm256_mul_ps(fscal,dx00);
296 ty = _mm256_mul_ps(fscal,dy00);
297 tz = _mm256_mul_ps(fscal,dz00);
299 /* Update vectorial force */
300 fix0 = _mm256_add_ps(fix0,tx);
301 fiy0 = _mm256_add_ps(fiy0,ty);
302 fiz0 = _mm256_add_ps(fiz0,tz);
/* The same (tx,ty,tz) is added to the j-site accumulator; those sums are passed to gmx_mm256_decrement_3rvec_8ptr_swizzle_ps after all nine pairs are done. */
304 fjx0 = _mm256_add_ps(fjx0,tx);
305 fjy0 = _mm256_add_ps(fjy0,ty);
306 fjz0 = _mm256_add_ps(fjz0,tz);
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 /* COULOMB ELECTROSTATICS */
313 velec = _mm256_mul_ps(qq01,rinv01);
314 felec = _mm256_mul_ps(velec,rinvsq01);
316 /* Update potential sum for this i atom from the interaction with this j atom. */
317 velecsum = _mm256_add_ps(velecsum,velec);
321 /* Calculate temporary vectorial force */
322 tx = _mm256_mul_ps(fscal,dx01);
323 ty = _mm256_mul_ps(fscal,dy01);
324 tz = _mm256_mul_ps(fscal,dz01);
326 /* Update vectorial force */
327 fix0 = _mm256_add_ps(fix0,tx);
328 fiy0 = _mm256_add_ps(fiy0,ty);
329 fiz0 = _mm256_add_ps(fiz0,tz);
331 fjx1 = _mm256_add_ps(fjx1,tx);
332 fjy1 = _mm256_add_ps(fjy1,ty);
333 fjz1 = _mm256_add_ps(fjz1,tz);
335 /**************************
336 * CALCULATE INTERACTIONS *
337 **************************/
339 /* COULOMB ELECTROSTATICS */
340 velec = _mm256_mul_ps(qq02,rinv02);
341 felec = _mm256_mul_ps(velec,rinvsq02);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velecsum = _mm256_add_ps(velecsum,velec);
348 /* Calculate temporary vectorial force */
349 tx = _mm256_mul_ps(fscal,dx02);
350 ty = _mm256_mul_ps(fscal,dy02);
351 tz = _mm256_mul_ps(fscal,dz02);
353 /* Update vectorial force */
354 fix0 = _mm256_add_ps(fix0,tx);
355 fiy0 = _mm256_add_ps(fiy0,ty);
356 fiz0 = _mm256_add_ps(fiz0,tz);
358 fjx2 = _mm256_add_ps(fjx2,tx);
359 fjy2 = _mm256_add_ps(fjy2,ty);
360 fjz2 = _mm256_add_ps(fjz2,tz);
362 /**************************
363 * CALCULATE INTERACTIONS *
364 **************************/
366 /* COULOMB ELECTROSTATICS */
367 velec = _mm256_mul_ps(qq10,rinv10);
368 felec = _mm256_mul_ps(velec,rinvsq10);
370 /* Update potential sum for this i atom from the interaction with this j atom. */
371 velecsum = _mm256_add_ps(velecsum,velec);
375 /* Calculate temporary vectorial force */
376 tx = _mm256_mul_ps(fscal,dx10);
377 ty = _mm256_mul_ps(fscal,dy10);
378 tz = _mm256_mul_ps(fscal,dz10);
380 /* Update vectorial force */
381 fix1 = _mm256_add_ps(fix1,tx);
382 fiy1 = _mm256_add_ps(fiy1,ty);
383 fiz1 = _mm256_add_ps(fiz1,tz);
385 fjx0 = _mm256_add_ps(fjx0,tx);
386 fjy0 = _mm256_add_ps(fjy0,ty);
387 fjz0 = _mm256_add_ps(fjz0,tz);
389 /**************************
390 * CALCULATE INTERACTIONS *
391 **************************/
393 /* COULOMB ELECTROSTATICS */
394 velec = _mm256_mul_ps(qq11,rinv11);
395 felec = _mm256_mul_ps(velec,rinvsq11);
397 /* Update potential sum for this i atom from the interaction with this j atom. */
398 velecsum = _mm256_add_ps(velecsum,velec);
402 /* Calculate temporary vectorial force */
403 tx = _mm256_mul_ps(fscal,dx11);
404 ty = _mm256_mul_ps(fscal,dy11);
405 tz = _mm256_mul_ps(fscal,dz11);
407 /* Update vectorial force */
408 fix1 = _mm256_add_ps(fix1,tx);
409 fiy1 = _mm256_add_ps(fiy1,ty);
410 fiz1 = _mm256_add_ps(fiz1,tz);
412 fjx1 = _mm256_add_ps(fjx1,tx);
413 fjy1 = _mm256_add_ps(fjy1,ty);
414 fjz1 = _mm256_add_ps(fjz1,tz);
416 /**************************
417 * CALCULATE INTERACTIONS *
418 **************************/
420 /* COULOMB ELECTROSTATICS */
421 velec = _mm256_mul_ps(qq12,rinv12);
422 felec = _mm256_mul_ps(velec,rinvsq12);
424 /* Update potential sum for this i atom from the interaction with this j atom. */
425 velecsum = _mm256_add_ps(velecsum,velec);
429 /* Calculate temporary vectorial force */
430 tx = _mm256_mul_ps(fscal,dx12);
431 ty = _mm256_mul_ps(fscal,dy12);
432 tz = _mm256_mul_ps(fscal,dz12);
434 /* Update vectorial force */
435 fix1 = _mm256_add_ps(fix1,tx);
436 fiy1 = _mm256_add_ps(fiy1,ty);
437 fiz1 = _mm256_add_ps(fiz1,tz);
439 fjx2 = _mm256_add_ps(fjx2,tx);
440 fjy2 = _mm256_add_ps(fjy2,ty);
441 fjz2 = _mm256_add_ps(fjz2,tz);
443 /**************************
444 * CALCULATE INTERACTIONS *
445 **************************/
447 /* COULOMB ELECTROSTATICS */
448 velec = _mm256_mul_ps(qq20,rinv20);
449 felec = _mm256_mul_ps(velec,rinvsq20);
451 /* Update potential sum for this i atom from the interaction with this j atom. */
452 velecsum = _mm256_add_ps(velecsum,velec);
456 /* Calculate temporary vectorial force */
457 tx = _mm256_mul_ps(fscal,dx20);
458 ty = _mm256_mul_ps(fscal,dy20);
459 tz = _mm256_mul_ps(fscal,dz20);
461 /* Update vectorial force */
462 fix2 = _mm256_add_ps(fix2,tx);
463 fiy2 = _mm256_add_ps(fiy2,ty);
464 fiz2 = _mm256_add_ps(fiz2,tz);
466 fjx0 = _mm256_add_ps(fjx0,tx);
467 fjy0 = _mm256_add_ps(fjy0,ty);
468 fjz0 = _mm256_add_ps(fjz0,tz);
470 /**************************
471 * CALCULATE INTERACTIONS *
472 **************************/
474 /* COULOMB ELECTROSTATICS */
475 velec = _mm256_mul_ps(qq21,rinv21);
476 felec = _mm256_mul_ps(velec,rinvsq21);
478 /* Update potential sum for this i atom from the interaction with this j atom. */
479 velecsum = _mm256_add_ps(velecsum,velec);
483 /* Calculate temporary vectorial force */
484 tx = _mm256_mul_ps(fscal,dx21);
485 ty = _mm256_mul_ps(fscal,dy21);
486 tz = _mm256_mul_ps(fscal,dz21);
488 /* Update vectorial force */
489 fix2 = _mm256_add_ps(fix2,tx);
490 fiy2 = _mm256_add_ps(fiy2,ty);
491 fiz2 = _mm256_add_ps(fiz2,tz);
493 fjx1 = _mm256_add_ps(fjx1,tx);
494 fjy1 = _mm256_add_ps(fjy1,ty);
495 fjz1 = _mm256_add_ps(fjz1,tz);
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 /* COULOMB ELECTROSTATICS */
502 velec = _mm256_mul_ps(qq22,rinv22);
503 felec = _mm256_mul_ps(velec,rinvsq22);
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velecsum = _mm256_add_ps(velecsum,velec);
510 /* Calculate temporary vectorial force */
511 tx = _mm256_mul_ps(fscal,dx22);
512 ty = _mm256_mul_ps(fscal,dy22);
513 tz = _mm256_mul_ps(fscal,dz22);
515 /* Update vectorial force */
516 fix2 = _mm256_add_ps(fix2,tx);
517 fiy2 = _mm256_add_ps(fiy2,ty);
518 fiz2 = _mm256_add_ps(fiz2,tz);
520 fjx2 = _mm256_add_ps(fjx2,tx);
521 fjy2 = _mm256_add_ps(fjy2,ty);
522 fjz2 = _mm256_add_ps(fjz2,tz);
/* Destination pointers into f for the eight j entries (lanes A..H). */
524 fjptrA = f+j_coord_offsetA;
525 fjptrB = f+j_coord_offsetB;
526 fjptrC = f+j_coord_offsetC;
527 fjptrD = f+j_coord_offsetD;
528 fjptrE = f+j_coord_offsetE;
529 fjptrF = f+j_coord_offsetF;
530 fjptrG = f+j_coord_offsetG;
531 fjptrH = f+j_coord_offsetH;
533 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
534 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
536 /* Inner loop uses 243 flops */
/* Masked variant of the block above: list entries may be negative here, so indices are clamped to 0 and every contribution is filtered through dummy_mask. */
542 /* Get j neighbor index, and coordinate index */
543 jnrlistA = jjnr[jidx];
544 jnrlistB = jjnr[jidx+1];
545 jnrlistC = jjnr[jidx+2];
546 jnrlistD = jjnr[jidx+3];
547 jnrlistE = jjnr[jidx+4];
548 jnrlistF = jjnr[jidx+5];
549 jnrlistG = jjnr[jidx+6];
550 jnrlistH = jjnr[jidx+7];
551 /* Sign of each element will be negative for non-real atoms.
552 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
553 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
555 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
556 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
558 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
559 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
560 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
561 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
562 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
563 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
564 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
565 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
566 j_coord_offsetA = DIM*jnrA;
567 j_coord_offsetB = DIM*jnrB;
568 j_coord_offsetC = DIM*jnrC;
569 j_coord_offsetD = DIM*jnrD;
570 j_coord_offsetE = DIM*jnrE;
571 j_coord_offsetF = DIM*jnrF;
572 j_coord_offsetG = DIM*jnrG;
573 j_coord_offsetH = DIM*jnrH;
575 /* load j atom coordinates */
576 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
577 x+j_coord_offsetC,x+j_coord_offsetD,
578 x+j_coord_offsetE,x+j_coord_offsetF,
579 x+j_coord_offsetG,x+j_coord_offsetH,
580 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
582 /* Calculate displacement vector */
583 dx00 = _mm256_sub_ps(ix0,jx0);
584 dy00 = _mm256_sub_ps(iy0,jy0);
585 dz00 = _mm256_sub_ps(iz0,jz0);
586 dx01 = _mm256_sub_ps(ix0,jx1);
587 dy01 = _mm256_sub_ps(iy0,jy1);
588 dz01 = _mm256_sub_ps(iz0,jz1);
589 dx02 = _mm256_sub_ps(ix0,jx2);
590 dy02 = _mm256_sub_ps(iy0,jy2);
591 dz02 = _mm256_sub_ps(iz0,jz2);
592 dx10 = _mm256_sub_ps(ix1,jx0);
593 dy10 = _mm256_sub_ps(iy1,jy0);
594 dz10 = _mm256_sub_ps(iz1,jz0);
595 dx11 = _mm256_sub_ps(ix1,jx1);
596 dy11 = _mm256_sub_ps(iy1,jy1);
597 dz11 = _mm256_sub_ps(iz1,jz1);
598 dx12 = _mm256_sub_ps(ix1,jx2);
599 dy12 = _mm256_sub_ps(iy1,jy2);
600 dz12 = _mm256_sub_ps(iz1,jz2);
601 dx20 = _mm256_sub_ps(ix2,jx0);
602 dy20 = _mm256_sub_ps(iy2,jy0);
603 dz20 = _mm256_sub_ps(iz2,jz0);
604 dx21 = _mm256_sub_ps(ix2,jx1);
605 dy21 = _mm256_sub_ps(iy2,jy1);
606 dz21 = _mm256_sub_ps(iz2,jz1);
607 dx22 = _mm256_sub_ps(ix2,jx2);
608 dy22 = _mm256_sub_ps(iy2,jy2);
609 dz22 = _mm256_sub_ps(iz2,jz2);
611 /* Calculate squared distance and things based on it */
612 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
613 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
614 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
615 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
616 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
617 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
618 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
619 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
620 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
622 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
623 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
624 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
625 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
626 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
627 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
628 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
629 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
630 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
632 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
633 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
634 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
635 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
636 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
637 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
638 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
639 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
640 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
642 fjx0 = _mm256_setzero_ps();
643 fjy0 = _mm256_setzero_ps();
644 fjz0 = _mm256_setzero_ps();
645 fjx1 = _mm256_setzero_ps();
646 fjy1 = _mm256_setzero_ps();
647 fjz1 = _mm256_setzero_ps();
648 fjx2 = _mm256_setzero_ps();
649 fjy2 = _mm256_setzero_ps();
650 fjz2 = _mm256_setzero_ps();
652 /**************************
653 * CALCULATE INTERACTIONS *
654 **************************/
656 /* COULOMB ELECTROSTATICS */
657 velec = _mm256_mul_ps(qq00,rinv00);
658 felec = _mm256_mul_ps(velec,rinvsq00);
660 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Clear the lanes flagged in dummy_mask so dummy entries add nothing to the potential sum. */
661 velec = _mm256_andnot_ps(dummy_mask,velec);
662 velecsum = _mm256_add_ps(velecsum,velec);
666 fscal = _mm256_andnot_ps(dummy_mask,fscal);
668 /* Calculate temporary vectorial force */
669 tx = _mm256_mul_ps(fscal,dx00);
670 ty = _mm256_mul_ps(fscal,dy00);
671 tz = _mm256_mul_ps(fscal,dz00);
673 /* Update vectorial force */
674 fix0 = _mm256_add_ps(fix0,tx);
675 fiy0 = _mm256_add_ps(fiy0,ty);
676 fiz0 = _mm256_add_ps(fiz0,tz);
678 fjx0 = _mm256_add_ps(fjx0,tx);
679 fjy0 = _mm256_add_ps(fjy0,ty);
680 fjz0 = _mm256_add_ps(fjz0,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 /* COULOMB ELECTROSTATICS */
687 velec = _mm256_mul_ps(qq01,rinv01);
688 felec = _mm256_mul_ps(velec,rinvsq01);
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velec = _mm256_andnot_ps(dummy_mask,velec);
692 velecsum = _mm256_add_ps(velecsum,velec);
696 fscal = _mm256_andnot_ps(dummy_mask,fscal);
698 /* Calculate temporary vectorial force */
699 tx = _mm256_mul_ps(fscal,dx01);
700 ty = _mm256_mul_ps(fscal,dy01);
701 tz = _mm256_mul_ps(fscal,dz01);
703 /* Update vectorial force */
704 fix0 = _mm256_add_ps(fix0,tx);
705 fiy0 = _mm256_add_ps(fiy0,ty);
706 fiz0 = _mm256_add_ps(fiz0,tz);
708 fjx1 = _mm256_add_ps(fjx1,tx);
709 fjy1 = _mm256_add_ps(fjy1,ty);
710 fjz1 = _mm256_add_ps(fjz1,tz);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 /* COULOMB ELECTROSTATICS */
717 velec = _mm256_mul_ps(qq02,rinv02);
718 felec = _mm256_mul_ps(velec,rinvsq02);
720 /* Update potential sum for this i atom from the interaction with this j atom. */
721 velec = _mm256_andnot_ps(dummy_mask,velec);
722 velecsum = _mm256_add_ps(velecsum,velec);
726 fscal = _mm256_andnot_ps(dummy_mask,fscal);
728 /* Calculate temporary vectorial force */
729 tx = _mm256_mul_ps(fscal,dx02);
730 ty = _mm256_mul_ps(fscal,dy02);
731 tz = _mm256_mul_ps(fscal,dz02);
733 /* Update vectorial force */
734 fix0 = _mm256_add_ps(fix0,tx);
735 fiy0 = _mm256_add_ps(fiy0,ty);
736 fiz0 = _mm256_add_ps(fiz0,tz);
738 fjx2 = _mm256_add_ps(fjx2,tx);
739 fjy2 = _mm256_add_ps(fjy2,ty);
740 fjz2 = _mm256_add_ps(fjz2,tz);
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 /* COULOMB ELECTROSTATICS */
747 velec = _mm256_mul_ps(qq10,rinv10);
748 felec = _mm256_mul_ps(velec,rinvsq10);
750 /* Update potential sum for this i atom from the interaction with this j atom. */
751 velec = _mm256_andnot_ps(dummy_mask,velec);
752 velecsum = _mm256_add_ps(velecsum,velec);
756 fscal = _mm256_andnot_ps(dummy_mask,fscal);
758 /* Calculate temporary vectorial force */
759 tx = _mm256_mul_ps(fscal,dx10);
760 ty = _mm256_mul_ps(fscal,dy10);
761 tz = _mm256_mul_ps(fscal,dz10);
763 /* Update vectorial force */
764 fix1 = _mm256_add_ps(fix1,tx);
765 fiy1 = _mm256_add_ps(fiy1,ty);
766 fiz1 = _mm256_add_ps(fiz1,tz);
768 fjx0 = _mm256_add_ps(fjx0,tx);
769 fjy0 = _mm256_add_ps(fjy0,ty);
770 fjz0 = _mm256_add_ps(fjz0,tz);
772 /**************************
773 * CALCULATE INTERACTIONS *
774 **************************/
776 /* COULOMB ELECTROSTATICS */
777 velec = _mm256_mul_ps(qq11,rinv11);
778 felec = _mm256_mul_ps(velec,rinvsq11);
780 /* Update potential sum for this i atom from the interaction with this j atom. */
781 velec = _mm256_andnot_ps(dummy_mask,velec);
782 velecsum = _mm256_add_ps(velecsum,velec);
786 fscal = _mm256_andnot_ps(dummy_mask,fscal);
788 /* Calculate temporary vectorial force */
789 tx = _mm256_mul_ps(fscal,dx11);
790 ty = _mm256_mul_ps(fscal,dy11);
791 tz = _mm256_mul_ps(fscal,dz11);
793 /* Update vectorial force */
794 fix1 = _mm256_add_ps(fix1,tx);
795 fiy1 = _mm256_add_ps(fiy1,ty);
796 fiz1 = _mm256_add_ps(fiz1,tz);
798 fjx1 = _mm256_add_ps(fjx1,tx);
799 fjy1 = _mm256_add_ps(fjy1,ty);
800 fjz1 = _mm256_add_ps(fjz1,tz);
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 /* COULOMB ELECTROSTATICS */
807 velec = _mm256_mul_ps(qq12,rinv12);
808 felec = _mm256_mul_ps(velec,rinvsq12);
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec = _mm256_andnot_ps(dummy_mask,velec);
812 velecsum = _mm256_add_ps(velecsum,velec);
816 fscal = _mm256_andnot_ps(dummy_mask,fscal);
818 /* Calculate temporary vectorial force */
819 tx = _mm256_mul_ps(fscal,dx12);
820 ty = _mm256_mul_ps(fscal,dy12);
821 tz = _mm256_mul_ps(fscal,dz12);
823 /* Update vectorial force */
824 fix1 = _mm256_add_ps(fix1,tx);
825 fiy1 = _mm256_add_ps(fiy1,ty);
826 fiz1 = _mm256_add_ps(fiz1,tz);
828 fjx2 = _mm256_add_ps(fjx2,tx);
829 fjy2 = _mm256_add_ps(fjy2,ty);
830 fjz2 = _mm256_add_ps(fjz2,tz);
832 /**************************
833 * CALCULATE INTERACTIONS *
834 **************************/
836 /* COULOMB ELECTROSTATICS */
837 velec = _mm256_mul_ps(qq20,rinv20);
838 felec = _mm256_mul_ps(velec,rinvsq20);
840 /* Update potential sum for this i atom from the interaction with this j atom. */
841 velec = _mm256_andnot_ps(dummy_mask,velec);
842 velecsum = _mm256_add_ps(velecsum,velec);
846 fscal = _mm256_andnot_ps(dummy_mask,fscal);
848 /* Calculate temporary vectorial force */
849 tx = _mm256_mul_ps(fscal,dx20);
850 ty = _mm256_mul_ps(fscal,dy20);
851 tz = _mm256_mul_ps(fscal,dz20);
853 /* Update vectorial force */
854 fix2 = _mm256_add_ps(fix2,tx);
855 fiy2 = _mm256_add_ps(fiy2,ty);
856 fiz2 = _mm256_add_ps(fiz2,tz);
858 fjx0 = _mm256_add_ps(fjx0,tx);
859 fjy0 = _mm256_add_ps(fjy0,ty);
860 fjz0 = _mm256_add_ps(fjz0,tz);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 /* COULOMB ELECTROSTATICS */
867 velec = _mm256_mul_ps(qq21,rinv21);
868 felec = _mm256_mul_ps(velec,rinvsq21);
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm256_andnot_ps(dummy_mask,velec);
872 velecsum = _mm256_add_ps(velecsum,velec);
876 fscal = _mm256_andnot_ps(dummy_mask,fscal);
878 /* Calculate temporary vectorial force */
879 tx = _mm256_mul_ps(fscal,dx21);
880 ty = _mm256_mul_ps(fscal,dy21);
881 tz = _mm256_mul_ps(fscal,dz21);
883 /* Update vectorial force */
884 fix2 = _mm256_add_ps(fix2,tx);
885 fiy2 = _mm256_add_ps(fiy2,ty);
886 fiz2 = _mm256_add_ps(fiz2,tz);
888 fjx1 = _mm256_add_ps(fjx1,tx);
889 fjy1 = _mm256_add_ps(fjy1,ty);
890 fjz1 = _mm256_add_ps(fjz1,tz);
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 /* COULOMB ELECTROSTATICS */
897 velec = _mm256_mul_ps(qq22,rinv22);
898 felec = _mm256_mul_ps(velec,rinvsq22);
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec = _mm256_andnot_ps(dummy_mask,velec);
902 velecsum = _mm256_add_ps(velecsum,velec);
906 fscal = _mm256_andnot_ps(dummy_mask,fscal);
908 /* Calculate temporary vectorial force */
909 tx = _mm256_mul_ps(fscal,dx22);
910 ty = _mm256_mul_ps(fscal,dy22);
911 tz = _mm256_mul_ps(fscal,dz22);
913 /* Update vectorial force */
914 fix2 = _mm256_add_ps(fix2,tx);
915 fiy2 = _mm256_add_ps(fiy2,ty);
916 fiz2 = _mm256_add_ps(fiz2,tz);
918 fjx2 = _mm256_add_ps(fjx2,tx);
919 fjy2 = _mm256_add_ps(fjy2,ty);
920 fjz2 = _mm256_add_ps(fjz2,tz);
/* Lanes whose list entry is negative are pointed at scratch instead of f. */
922 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
923 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
924 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
925 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
926 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
927 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
928 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
929 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
931 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
932 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
934 /* Inner loop uses 243 flops */
937 /* End of innermost loop */
/* Pass the accumulated i-site force sums on, with f+i_coord_offset and fshift+i_shift_offset as destinations. */
939 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
940 f+i_coord_offset,fshift+i_shift_offset);
943 /* Update potential energies */
944 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
946 /* Increment number of inner iterations */
947 inneriter += j_index_end - j_index_start;
949 /* Outer loop uses 19 flops */
952 /* Increment number of outer iterations */
955 /* Update outer/inner flops */
957 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*243);
960 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_single
961 * Electrostatics interaction: Coulomb
962 * VdW interaction: None
963 * Geometry: Water3-Water3
964 * Calculate force/pot: Force
967 nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_avx_256_single
968 (t_nblist * gmx_restrict nlist,
969 rvec * gmx_restrict xx,
970 rvec * gmx_restrict ff,
971 t_forcerec * gmx_restrict fr,
972 t_mdatoms * gmx_restrict mdatoms,
973 nb_kernel_data_t * gmx_restrict kernel_data,
974 t_nrnb * gmx_restrict nrnb)
976 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
977 * just 0 for non-waters.
978 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
979 * jnr indices corresponding to data put in the eight positions in the SIMD register.
981 int i_shift_offset,i_coord_offset,outeriter,inneriter;
982 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
983 int jnrA,jnrB,jnrC,jnrD;
984 int jnrE,jnrF,jnrG,jnrH;
985 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
986 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
987 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
988 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
989 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
991 real *shiftvec,*fshift,*x,*f;
992 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
994 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
995 real * vdwioffsetptr0;
996 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
997 real * vdwioffsetptr1;
998 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
999 real * vdwioffsetptr2;
1000 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1001 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1002 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1003 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1004 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1005 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1006 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1007 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1008 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1009 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1010 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1011 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1012 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1013 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1014 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1015 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1016 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1018 __m256 dummy_mask,cutoff_mask;
1019 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1020 __m256 one = _mm256_set1_ps(1.0);
1021 __m256 two = _mm256_set1_ps(2.0);
1027 jindex = nlist->jindex;
1029 shiftidx = nlist->shift;
1031 shiftvec = fr->shift_vec[0];
1032 fshift = fr->fshift[0];
1033 facel = _mm256_set1_ps(fr->epsfac);
1034 charge = mdatoms->chargeA;
1036 /* Setup water-specific parameters */
1037 inr = nlist->iinr[0];
1038 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1039 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1040 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1042 jq0 = _mm256_set1_ps(charge[inr+0]);
1043 jq1 = _mm256_set1_ps(charge[inr+1]);
1044 jq2 = _mm256_set1_ps(charge[inr+2]);
1045 qq00 = _mm256_mul_ps(iq0,jq0);
1046 qq01 = _mm256_mul_ps(iq0,jq1);
1047 qq02 = _mm256_mul_ps(iq0,jq2);
1048 qq10 = _mm256_mul_ps(iq1,jq0);
1049 qq11 = _mm256_mul_ps(iq1,jq1);
1050 qq12 = _mm256_mul_ps(iq1,jq2);
1051 qq20 = _mm256_mul_ps(iq2,jq0);
1052 qq21 = _mm256_mul_ps(iq2,jq1);
1053 qq22 = _mm256_mul_ps(iq2,jq2);
1055 /* Avoid stupid compiler warnings */
1056 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1057 j_coord_offsetA = 0;
1058 j_coord_offsetB = 0;
1059 j_coord_offsetC = 0;
1060 j_coord_offsetD = 0;
1061 j_coord_offsetE = 0;
1062 j_coord_offsetF = 0;
1063 j_coord_offsetG = 0;
1064 j_coord_offsetH = 0;
1069 for(iidx=0;iidx<4*DIM;iidx++)
1071 scratch[iidx] = 0.0;
1074 /* Start outer loop over neighborlists */
1075 for(iidx=0; iidx<nri; iidx++)
1077 /* Load shift vector for this list */
1078 i_shift_offset = DIM*shiftidx[iidx];
1080 /* Load limits for loop over neighbors */
1081 j_index_start = jindex[iidx];
1082 j_index_end = jindex[iidx+1];
1084 /* Get outer coordinate index */
1086 i_coord_offset = DIM*inr;
1088 /* Load i particle coords and add shift vector */
1089 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1090 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1092 fix0 = _mm256_setzero_ps();
1093 fiy0 = _mm256_setzero_ps();
1094 fiz0 = _mm256_setzero_ps();
1095 fix1 = _mm256_setzero_ps();
1096 fiy1 = _mm256_setzero_ps();
1097 fiz1 = _mm256_setzero_ps();
1098 fix2 = _mm256_setzero_ps();
1099 fiy2 = _mm256_setzero_ps();
1100 fiz2 = _mm256_setzero_ps();
1102 /* Start inner kernel loop */
1103 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1106 /* Get j neighbor index, and coordinate index */
1108 jnrB = jjnr[jidx+1];
1109 jnrC = jjnr[jidx+2];
1110 jnrD = jjnr[jidx+3];
1111 jnrE = jjnr[jidx+4];
1112 jnrF = jjnr[jidx+5];
1113 jnrG = jjnr[jidx+6];
1114 jnrH = jjnr[jidx+7];
1115 j_coord_offsetA = DIM*jnrA;
1116 j_coord_offsetB = DIM*jnrB;
1117 j_coord_offsetC = DIM*jnrC;
1118 j_coord_offsetD = DIM*jnrD;
1119 j_coord_offsetE = DIM*jnrE;
1120 j_coord_offsetF = DIM*jnrF;
1121 j_coord_offsetG = DIM*jnrG;
1122 j_coord_offsetH = DIM*jnrH;
1124 /* load j atom coordinates */
1125 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1126 x+j_coord_offsetC,x+j_coord_offsetD,
1127 x+j_coord_offsetE,x+j_coord_offsetF,
1128 x+j_coord_offsetG,x+j_coord_offsetH,
1129 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1131 /* Calculate displacement vector */
1132 dx00 = _mm256_sub_ps(ix0,jx0);
1133 dy00 = _mm256_sub_ps(iy0,jy0);
1134 dz00 = _mm256_sub_ps(iz0,jz0);
1135 dx01 = _mm256_sub_ps(ix0,jx1);
1136 dy01 = _mm256_sub_ps(iy0,jy1);
1137 dz01 = _mm256_sub_ps(iz0,jz1);
1138 dx02 = _mm256_sub_ps(ix0,jx2);
1139 dy02 = _mm256_sub_ps(iy0,jy2);
1140 dz02 = _mm256_sub_ps(iz0,jz2);
1141 dx10 = _mm256_sub_ps(ix1,jx0);
1142 dy10 = _mm256_sub_ps(iy1,jy0);
1143 dz10 = _mm256_sub_ps(iz1,jz0);
1144 dx11 = _mm256_sub_ps(ix1,jx1);
1145 dy11 = _mm256_sub_ps(iy1,jy1);
1146 dz11 = _mm256_sub_ps(iz1,jz1);
1147 dx12 = _mm256_sub_ps(ix1,jx2);
1148 dy12 = _mm256_sub_ps(iy1,jy2);
1149 dz12 = _mm256_sub_ps(iz1,jz2);
1150 dx20 = _mm256_sub_ps(ix2,jx0);
1151 dy20 = _mm256_sub_ps(iy2,jy0);
1152 dz20 = _mm256_sub_ps(iz2,jz0);
1153 dx21 = _mm256_sub_ps(ix2,jx1);
1154 dy21 = _mm256_sub_ps(iy2,jy1);
1155 dz21 = _mm256_sub_ps(iz2,jz1);
1156 dx22 = _mm256_sub_ps(ix2,jx2);
1157 dy22 = _mm256_sub_ps(iy2,jy2);
1158 dz22 = _mm256_sub_ps(iz2,jz2);
1160 /* Calculate squared distance and things based on it */
1161 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1162 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1163 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1164 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1165 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1166 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1167 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1168 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1169 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1171 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1172 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1173 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1174 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1175 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1176 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1177 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1178 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1179 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1181 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1182 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1183 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1184 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1185 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1186 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1187 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1188 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1189 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1191 fjx0 = _mm256_setzero_ps();
1192 fjy0 = _mm256_setzero_ps();
1193 fjz0 = _mm256_setzero_ps();
1194 fjx1 = _mm256_setzero_ps();
1195 fjy1 = _mm256_setzero_ps();
1196 fjz1 = _mm256_setzero_ps();
1197 fjx2 = _mm256_setzero_ps();
1198 fjy2 = _mm256_setzero_ps();
1199 fjz2 = _mm256_setzero_ps();
1201 /**************************
1202 * CALCULATE INTERACTIONS *
1203 **************************/
1205 /* COULOMB ELECTROSTATICS */
1206 velec = _mm256_mul_ps(qq00,rinv00);
1207 felec = _mm256_mul_ps(velec,rinvsq00);
1211 /* Calculate temporary vectorial force */
1212 tx = _mm256_mul_ps(fscal,dx00);
1213 ty = _mm256_mul_ps(fscal,dy00);
1214 tz = _mm256_mul_ps(fscal,dz00);
1216 /* Update vectorial force */
1217 fix0 = _mm256_add_ps(fix0,tx);
1218 fiy0 = _mm256_add_ps(fiy0,ty);
1219 fiz0 = _mm256_add_ps(fiz0,tz);
1221 fjx0 = _mm256_add_ps(fjx0,tx);
1222 fjy0 = _mm256_add_ps(fjy0,ty);
1223 fjz0 = _mm256_add_ps(fjz0,tz);
1225 /**************************
1226 * CALCULATE INTERACTIONS *
1227 **************************/
1229 /* COULOMB ELECTROSTATICS */
1230 velec = _mm256_mul_ps(qq01,rinv01);
1231 felec = _mm256_mul_ps(velec,rinvsq01);
1235 /* Calculate temporary vectorial force */
1236 tx = _mm256_mul_ps(fscal,dx01);
1237 ty = _mm256_mul_ps(fscal,dy01);
1238 tz = _mm256_mul_ps(fscal,dz01);
1240 /* Update vectorial force */
1241 fix0 = _mm256_add_ps(fix0,tx);
1242 fiy0 = _mm256_add_ps(fiy0,ty);
1243 fiz0 = _mm256_add_ps(fiz0,tz);
1245 fjx1 = _mm256_add_ps(fjx1,tx);
1246 fjy1 = _mm256_add_ps(fjy1,ty);
1247 fjz1 = _mm256_add_ps(fjz1,tz);
1249 /**************************
1250 * CALCULATE INTERACTIONS *
1251 **************************/
1253 /* COULOMB ELECTROSTATICS */
1254 velec = _mm256_mul_ps(qq02,rinv02);
1255 felec = _mm256_mul_ps(velec,rinvsq02);
1259 /* Calculate temporary vectorial force */
1260 tx = _mm256_mul_ps(fscal,dx02);
1261 ty = _mm256_mul_ps(fscal,dy02);
1262 tz = _mm256_mul_ps(fscal,dz02);
1264 /* Update vectorial force */
1265 fix0 = _mm256_add_ps(fix0,tx);
1266 fiy0 = _mm256_add_ps(fiy0,ty);
1267 fiz0 = _mm256_add_ps(fiz0,tz);
1269 fjx2 = _mm256_add_ps(fjx2,tx);
1270 fjy2 = _mm256_add_ps(fjy2,ty);
1271 fjz2 = _mm256_add_ps(fjz2,tz);
1273 /**************************
1274 * CALCULATE INTERACTIONS *
1275 **************************/
1277 /* COULOMB ELECTROSTATICS */
1278 velec = _mm256_mul_ps(qq10,rinv10);
1279 felec = _mm256_mul_ps(velec,rinvsq10);
1283 /* Calculate temporary vectorial force */
1284 tx = _mm256_mul_ps(fscal,dx10);
1285 ty = _mm256_mul_ps(fscal,dy10);
1286 tz = _mm256_mul_ps(fscal,dz10);
1288 /* Update vectorial force */
1289 fix1 = _mm256_add_ps(fix1,tx);
1290 fiy1 = _mm256_add_ps(fiy1,ty);
1291 fiz1 = _mm256_add_ps(fiz1,tz);
1293 fjx0 = _mm256_add_ps(fjx0,tx);
1294 fjy0 = _mm256_add_ps(fjy0,ty);
1295 fjz0 = _mm256_add_ps(fjz0,tz);
1297 /**************************
1298 * CALCULATE INTERACTIONS *
1299 **************************/
1301 /* COULOMB ELECTROSTATICS */
1302 velec = _mm256_mul_ps(qq11,rinv11);
1303 felec = _mm256_mul_ps(velec,rinvsq11);
1307 /* Calculate temporary vectorial force */
1308 tx = _mm256_mul_ps(fscal,dx11);
1309 ty = _mm256_mul_ps(fscal,dy11);
1310 tz = _mm256_mul_ps(fscal,dz11);
1312 /* Update vectorial force */
1313 fix1 = _mm256_add_ps(fix1,tx);
1314 fiy1 = _mm256_add_ps(fiy1,ty);
1315 fiz1 = _mm256_add_ps(fiz1,tz);
1317 fjx1 = _mm256_add_ps(fjx1,tx);
1318 fjy1 = _mm256_add_ps(fjy1,ty);
1319 fjz1 = _mm256_add_ps(fjz1,tz);
1321 /**************************
1322 * CALCULATE INTERACTIONS *
1323 **************************/
1325 /* COULOMB ELECTROSTATICS */
1326 velec = _mm256_mul_ps(qq12,rinv12);
1327 felec = _mm256_mul_ps(velec,rinvsq12);
1331 /* Calculate temporary vectorial force */
1332 tx = _mm256_mul_ps(fscal,dx12);
1333 ty = _mm256_mul_ps(fscal,dy12);
1334 tz = _mm256_mul_ps(fscal,dz12);
1336 /* Update vectorial force */
1337 fix1 = _mm256_add_ps(fix1,tx);
1338 fiy1 = _mm256_add_ps(fiy1,ty);
1339 fiz1 = _mm256_add_ps(fiz1,tz);
1341 fjx2 = _mm256_add_ps(fjx2,tx);
1342 fjy2 = _mm256_add_ps(fjy2,ty);
1343 fjz2 = _mm256_add_ps(fjz2,tz);
1345 /**************************
1346 * CALCULATE INTERACTIONS *
1347 **************************/
1349 /* COULOMB ELECTROSTATICS */
1350 velec = _mm256_mul_ps(qq20,rinv20);
1351 felec = _mm256_mul_ps(velec,rinvsq20);
1355 /* Calculate temporary vectorial force */
1356 tx = _mm256_mul_ps(fscal,dx20);
1357 ty = _mm256_mul_ps(fscal,dy20);
1358 tz = _mm256_mul_ps(fscal,dz20);
1360 /* Update vectorial force */
1361 fix2 = _mm256_add_ps(fix2,tx);
1362 fiy2 = _mm256_add_ps(fiy2,ty);
1363 fiz2 = _mm256_add_ps(fiz2,tz);
1365 fjx0 = _mm256_add_ps(fjx0,tx);
1366 fjy0 = _mm256_add_ps(fjy0,ty);
1367 fjz0 = _mm256_add_ps(fjz0,tz);
1369 /**************************
1370 * CALCULATE INTERACTIONS *
1371 **************************/
1373 /* COULOMB ELECTROSTATICS */
1374 velec = _mm256_mul_ps(qq21,rinv21);
1375 felec = _mm256_mul_ps(velec,rinvsq21);
1379 /* Calculate temporary vectorial force */
1380 tx = _mm256_mul_ps(fscal,dx21);
1381 ty = _mm256_mul_ps(fscal,dy21);
1382 tz = _mm256_mul_ps(fscal,dz21);
1384 /* Update vectorial force */
1385 fix2 = _mm256_add_ps(fix2,tx);
1386 fiy2 = _mm256_add_ps(fiy2,ty);
1387 fiz2 = _mm256_add_ps(fiz2,tz);
1389 fjx1 = _mm256_add_ps(fjx1,tx);
1390 fjy1 = _mm256_add_ps(fjy1,ty);
1391 fjz1 = _mm256_add_ps(fjz1,tz);
1393 /**************************
1394 * CALCULATE INTERACTIONS *
1395 **************************/
1397 /* COULOMB ELECTROSTATICS */
1398 velec = _mm256_mul_ps(qq22,rinv22);
1399 felec = _mm256_mul_ps(velec,rinvsq22);
1403 /* Calculate temporary vectorial force */
1404 tx = _mm256_mul_ps(fscal,dx22);
1405 ty = _mm256_mul_ps(fscal,dy22);
1406 tz = _mm256_mul_ps(fscal,dz22);
1408 /* Update vectorial force */
1409 fix2 = _mm256_add_ps(fix2,tx);
1410 fiy2 = _mm256_add_ps(fiy2,ty);
1411 fiz2 = _mm256_add_ps(fiz2,tz);
1413 fjx2 = _mm256_add_ps(fjx2,tx);
1414 fjy2 = _mm256_add_ps(fjy2,ty);
1415 fjz2 = _mm256_add_ps(fjz2,tz);
1417 fjptrA = f+j_coord_offsetA;
1418 fjptrB = f+j_coord_offsetB;
1419 fjptrC = f+j_coord_offsetC;
1420 fjptrD = f+j_coord_offsetD;
1421 fjptrE = f+j_coord_offsetE;
1422 fjptrF = f+j_coord_offsetF;
1423 fjptrG = f+j_coord_offsetG;
1424 fjptrH = f+j_coord_offsetH;
1426 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1427 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1429 /* Inner loop uses 234 flops */
1432 if(jidx<j_index_end)
1435 /* Get j neighbor index, and coordinate index */
1436 jnrlistA = jjnr[jidx];
1437 jnrlistB = jjnr[jidx+1];
1438 jnrlistC = jjnr[jidx+2];
1439 jnrlistD = jjnr[jidx+3];
1440 jnrlistE = jjnr[jidx+4];
1441 jnrlistF = jjnr[jidx+5];
1442 jnrlistG = jjnr[jidx+6];
1443 jnrlistH = jjnr[jidx+7];
1444 /* Sign of each element will be negative for non-real atoms.
1445 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1446 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1448 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1449 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1451 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1452 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1453 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1454 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1455 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1456 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1457 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1458 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1459 j_coord_offsetA = DIM*jnrA;
1460 j_coord_offsetB = DIM*jnrB;
1461 j_coord_offsetC = DIM*jnrC;
1462 j_coord_offsetD = DIM*jnrD;
1463 j_coord_offsetE = DIM*jnrE;
1464 j_coord_offsetF = DIM*jnrF;
1465 j_coord_offsetG = DIM*jnrG;
1466 j_coord_offsetH = DIM*jnrH;
1468 /* load j atom coordinates */
1469 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1470 x+j_coord_offsetC,x+j_coord_offsetD,
1471 x+j_coord_offsetE,x+j_coord_offsetF,
1472 x+j_coord_offsetG,x+j_coord_offsetH,
1473 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1475 /* Calculate displacement vector */
1476 dx00 = _mm256_sub_ps(ix0,jx0);
1477 dy00 = _mm256_sub_ps(iy0,jy0);
1478 dz00 = _mm256_sub_ps(iz0,jz0);
1479 dx01 = _mm256_sub_ps(ix0,jx1);
1480 dy01 = _mm256_sub_ps(iy0,jy1);
1481 dz01 = _mm256_sub_ps(iz0,jz1);
1482 dx02 = _mm256_sub_ps(ix0,jx2);
1483 dy02 = _mm256_sub_ps(iy0,jy2);
1484 dz02 = _mm256_sub_ps(iz0,jz2);
1485 dx10 = _mm256_sub_ps(ix1,jx0);
1486 dy10 = _mm256_sub_ps(iy1,jy0);
1487 dz10 = _mm256_sub_ps(iz1,jz0);
1488 dx11 = _mm256_sub_ps(ix1,jx1);
1489 dy11 = _mm256_sub_ps(iy1,jy1);
1490 dz11 = _mm256_sub_ps(iz1,jz1);
1491 dx12 = _mm256_sub_ps(ix1,jx2);
1492 dy12 = _mm256_sub_ps(iy1,jy2);
1493 dz12 = _mm256_sub_ps(iz1,jz2);
1494 dx20 = _mm256_sub_ps(ix2,jx0);
1495 dy20 = _mm256_sub_ps(iy2,jy0);
1496 dz20 = _mm256_sub_ps(iz2,jz0);
1497 dx21 = _mm256_sub_ps(ix2,jx1);
1498 dy21 = _mm256_sub_ps(iy2,jy1);
1499 dz21 = _mm256_sub_ps(iz2,jz1);
1500 dx22 = _mm256_sub_ps(ix2,jx2);
1501 dy22 = _mm256_sub_ps(iy2,jy2);
1502 dz22 = _mm256_sub_ps(iz2,jz2);
1504 /* Calculate squared distance and things based on it */
1505 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1506 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1507 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1508 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1509 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1510 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1511 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1512 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1513 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1515 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1516 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1517 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1518 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1519 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1520 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1521 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1522 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1523 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1525 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1526 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1527 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1528 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1529 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1530 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1531 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1532 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1533 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1535 fjx0 = _mm256_setzero_ps();
1536 fjy0 = _mm256_setzero_ps();
1537 fjz0 = _mm256_setzero_ps();
1538 fjx1 = _mm256_setzero_ps();
1539 fjy1 = _mm256_setzero_ps();
1540 fjz1 = _mm256_setzero_ps();
1541 fjx2 = _mm256_setzero_ps();
1542 fjy2 = _mm256_setzero_ps();
1543 fjz2 = _mm256_setzero_ps();
1545 /**************************
1546 * CALCULATE INTERACTIONS *
1547 **************************/
1549 /* COULOMB ELECTROSTATICS */
1550 velec = _mm256_mul_ps(qq00,rinv00);
1551 felec = _mm256_mul_ps(velec,rinvsq00);
1555 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1557 /* Calculate temporary vectorial force */
1558 tx = _mm256_mul_ps(fscal,dx00);
1559 ty = _mm256_mul_ps(fscal,dy00);
1560 tz = _mm256_mul_ps(fscal,dz00);
1562 /* Update vectorial force */
1563 fix0 = _mm256_add_ps(fix0,tx);
1564 fiy0 = _mm256_add_ps(fiy0,ty);
1565 fiz0 = _mm256_add_ps(fiz0,tz);
1567 fjx0 = _mm256_add_ps(fjx0,tx);
1568 fjy0 = _mm256_add_ps(fjy0,ty);
1569 fjz0 = _mm256_add_ps(fjz0,tz);
1571 /**************************
1572 * CALCULATE INTERACTIONS *
1573 **************************/
1575 /* COULOMB ELECTROSTATICS */
1576 velec = _mm256_mul_ps(qq01,rinv01);
1577 felec = _mm256_mul_ps(velec,rinvsq01);
1581 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1583 /* Calculate temporary vectorial force */
1584 tx = _mm256_mul_ps(fscal,dx01);
1585 ty = _mm256_mul_ps(fscal,dy01);
1586 tz = _mm256_mul_ps(fscal,dz01);
1588 /* Update vectorial force */
1589 fix0 = _mm256_add_ps(fix0,tx);
1590 fiy0 = _mm256_add_ps(fiy0,ty);
1591 fiz0 = _mm256_add_ps(fiz0,tz);
1593 fjx1 = _mm256_add_ps(fjx1,tx);
1594 fjy1 = _mm256_add_ps(fjy1,ty);
1595 fjz1 = _mm256_add_ps(fjz1,tz);
1597 /**************************
1598 * CALCULATE INTERACTIONS *
1599 **************************/
1601 /* COULOMB ELECTROSTATICS */
1602 velec = _mm256_mul_ps(qq02,rinv02);
1603 felec = _mm256_mul_ps(velec,rinvsq02);
1607 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1609 /* Calculate temporary vectorial force */
1610 tx = _mm256_mul_ps(fscal,dx02);
1611 ty = _mm256_mul_ps(fscal,dy02);
1612 tz = _mm256_mul_ps(fscal,dz02);
1614 /* Update vectorial force */
1615 fix0 = _mm256_add_ps(fix0,tx);
1616 fiy0 = _mm256_add_ps(fiy0,ty);
1617 fiz0 = _mm256_add_ps(fiz0,tz);
1619 fjx2 = _mm256_add_ps(fjx2,tx);
1620 fjy2 = _mm256_add_ps(fjy2,ty);
1621 fjz2 = _mm256_add_ps(fjz2,tz);
1623 /**************************
1624 * CALCULATE INTERACTIONS *
1625 **************************/
1627 /* COULOMB ELECTROSTATICS */
1628 velec = _mm256_mul_ps(qq10,rinv10);
1629 felec = _mm256_mul_ps(velec,rinvsq10);
1633 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1635 /* Calculate temporary vectorial force */
1636 tx = _mm256_mul_ps(fscal,dx10);
1637 ty = _mm256_mul_ps(fscal,dy10);
1638 tz = _mm256_mul_ps(fscal,dz10);
1640 /* Update vectorial force */
1641 fix1 = _mm256_add_ps(fix1,tx);
1642 fiy1 = _mm256_add_ps(fiy1,ty);
1643 fiz1 = _mm256_add_ps(fiz1,tz);
1645 fjx0 = _mm256_add_ps(fjx0,tx);
1646 fjy0 = _mm256_add_ps(fjy0,ty);
1647 fjz0 = _mm256_add_ps(fjz0,tz);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 /* COULOMB ELECTROSTATICS */
1654 velec = _mm256_mul_ps(qq11,rinv11);
1655 felec = _mm256_mul_ps(velec,rinvsq11);
1659 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1661 /* Calculate temporary vectorial force */
1662 tx = _mm256_mul_ps(fscal,dx11);
1663 ty = _mm256_mul_ps(fscal,dy11);
1664 tz = _mm256_mul_ps(fscal,dz11);
1666 /* Update vectorial force */
1667 fix1 = _mm256_add_ps(fix1,tx);
1668 fiy1 = _mm256_add_ps(fiy1,ty);
1669 fiz1 = _mm256_add_ps(fiz1,tz);
1671 fjx1 = _mm256_add_ps(fjx1,tx);
1672 fjy1 = _mm256_add_ps(fjy1,ty);
1673 fjz1 = _mm256_add_ps(fjz1,tz);
1675 /**************************
1676 * CALCULATE INTERACTIONS *
1677 **************************/
1679 /* COULOMB ELECTROSTATICS */
1680 velec = _mm256_mul_ps(qq12,rinv12);
1681 felec = _mm256_mul_ps(velec,rinvsq12);
1685 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1687 /* Calculate temporary vectorial force */
1688 tx = _mm256_mul_ps(fscal,dx12);
1689 ty = _mm256_mul_ps(fscal,dy12);
1690 tz = _mm256_mul_ps(fscal,dz12);
1692 /* Update vectorial force */
1693 fix1 = _mm256_add_ps(fix1,tx);
1694 fiy1 = _mm256_add_ps(fiy1,ty);
1695 fiz1 = _mm256_add_ps(fiz1,tz);
1697 fjx2 = _mm256_add_ps(fjx2,tx);
1698 fjy2 = _mm256_add_ps(fjy2,ty);
1699 fjz2 = _mm256_add_ps(fjz2,tz);
1701 /**************************
1702 * CALCULATE INTERACTIONS *
1703 **************************/
1705 /* COULOMB ELECTROSTATICS */
1706 velec = _mm256_mul_ps(qq20,rinv20);
1707 felec = _mm256_mul_ps(velec,rinvsq20);
1711 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1713 /* Calculate temporary vectorial force */
1714 tx = _mm256_mul_ps(fscal,dx20);
1715 ty = _mm256_mul_ps(fscal,dy20);
1716 tz = _mm256_mul_ps(fscal,dz20);
1718 /* Update vectorial force */
1719 fix2 = _mm256_add_ps(fix2,tx);
1720 fiy2 = _mm256_add_ps(fiy2,ty);
1721 fiz2 = _mm256_add_ps(fiz2,tz);
1723 fjx0 = _mm256_add_ps(fjx0,tx);
1724 fjy0 = _mm256_add_ps(fjy0,ty);
1725 fjz0 = _mm256_add_ps(fjz0,tz);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 /* COULOMB ELECTROSTATICS */
1732 velec = _mm256_mul_ps(qq21,rinv21);
1733 felec = _mm256_mul_ps(velec,rinvsq21);
1737 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1739 /* Calculate temporary vectorial force */
1740 tx = _mm256_mul_ps(fscal,dx21);
1741 ty = _mm256_mul_ps(fscal,dy21);
1742 tz = _mm256_mul_ps(fscal,dz21);
1744 /* Update vectorial force */
1745 fix2 = _mm256_add_ps(fix2,tx);
1746 fiy2 = _mm256_add_ps(fiy2,ty);
1747 fiz2 = _mm256_add_ps(fiz2,tz);
1749 fjx1 = _mm256_add_ps(fjx1,tx);
1750 fjy1 = _mm256_add_ps(fjy1,ty);
1751 fjz1 = _mm256_add_ps(fjz1,tz);
1753 /**************************
1754 * CALCULATE INTERACTIONS *
1755 **************************/
1757 /* COULOMB ELECTROSTATICS */
1758 velec = _mm256_mul_ps(qq22,rinv22);
1759 felec = _mm256_mul_ps(velec,rinvsq22);
1763 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1765 /* Calculate temporary vectorial force */
1766 tx = _mm256_mul_ps(fscal,dx22);
1767 ty = _mm256_mul_ps(fscal,dy22);
1768 tz = _mm256_mul_ps(fscal,dz22);
1770 /* Update vectorial force */
1771 fix2 = _mm256_add_ps(fix2,tx);
1772 fiy2 = _mm256_add_ps(fiy2,ty);
1773 fiz2 = _mm256_add_ps(fiz2,tz);
1775 fjx2 = _mm256_add_ps(fjx2,tx);
1776 fjy2 = _mm256_add_ps(fjy2,ty);
1777 fjz2 = _mm256_add_ps(fjz2,tz);
1779 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1780 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1781 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1782 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1783 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1784 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1785 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1786 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1788 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1789 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1791 /* Inner loop uses 234 flops */
1794 /* End of innermost loop */
1796 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1797 f+i_coord_offset,fshift+i_shift_offset);
1799 /* Increment number of inner iterations */
1800 inneriter += j_index_end - j_index_start;
1802 /* Outer loop uses 18 flops */
1805 /* Increment number of outer iterations */
1808 /* Update outer/inner flops */
1810 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*234);