2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of GROMACS.
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
/*
 * Water-water (3 sites x 3 sites) nonbonded kernel: reaction-field
 * electrostatics, no Van der Waals, evaluating both the potential and the
 * force, with eight j waters packed into each 256-bit single-precision
 * AVX register.
 *
 * For each of the nine site pairs the code evaluates
 *     velec = qq*(rinv + krf*rsq - crf)
 *     felec = qq*(rinv*rinvsq - krf2)        with krf2 = 2*k_rf
 * where rinv = gmx_mm256_invsqrt_ps(rsq) and rinvsq = rinv*rinv.
 *
 * Parameters, as far as the visible statements use them:
 *   nlist       - neighbour list; jindex, shift and iinr are read from it.
 *   xx, ff      - coordinate and force arrays (the local pointers x and f that
 *                 the loops use are presumably derived from them - TODO confirm).
 *   fr          - supplies shift_vec, fshift, epsfac and ic->k_rf, ic->c_rf.
 *   mdatoms     - supplies the per-atom charges (chargeA).
 *   kernel_data - energygrp_elec receives the accumulated electrostatic energy.
 *   nrnb        - flop counters, updated through inc_nrnb() at the end.
 *
 * NOTE(review): this listing does not show every statement the function
 * depends on. The declarations of charge and scratch, the assignments of x, f,
 * nri, jjnr, ggid, outeriter, inneriter and fscal, the per-iteration update of
 * inr, and the jnrA..jnrH loads in the first inner loop are all absent here -
 * confirm them against the complete source before changing this code.
 */
44 nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m256 dummy_mask,cutoff_mask;
96 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
97 __m256 one = _mm256_set1_ps(1.0);
98 __m256 two = _mm256_set1_ps(2.0);
104 jindex = nlist->jindex;
106 shiftidx = nlist->shift;
108 shiftvec = fr->shift_vec[0];
109 fshift = fr->fshift[0];
110 facel = _mm256_set1_ps(fr->epsfac);
111 charge = mdatoms->chargeA;
112 krf = _mm256_set1_ps(fr->ic->k_rf);
113 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
114 crf = _mm256_set1_ps(fr->ic->c_rf);
116 /* Setup water-specific parameters */
/* The j-site charges are read at the same indices as the first i molecule
 * (nlist->iinr[0]); this presumes every water in the list carries identical
 * charges, which lets the nine products qqIJ be formed once, outside the loops.
 * The i-site charges are pre-multiplied by facel (fr->epsfac).
 */
117 inr = nlist->iinr[0];
118 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
119 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
120 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
122 jq0 = _mm256_set1_ps(charge[inr+0]);
123 jq1 = _mm256_set1_ps(charge[inr+1]);
124 jq2 = _mm256_set1_ps(charge[inr+2]);
125 qq00 = _mm256_mul_ps(iq0,jq0);
126 qq01 = _mm256_mul_ps(iq0,jq1);
127 qq02 = _mm256_mul_ps(iq0,jq2);
128 qq10 = _mm256_mul_ps(iq1,jq0);
129 qq11 = _mm256_mul_ps(iq1,jq1);
130 qq12 = _mm256_mul_ps(iq1,jq2);
131 qq20 = _mm256_mul_ps(iq2,jq0);
132 qq21 = _mm256_mul_ps(iq2,jq1);
133 qq22 = _mm256_mul_ps(iq2,jq2);
135 /* Avoid stupid compiler warnings */
136 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
/* NOTE(review): the statement executed by the following loop is not shown in this listing. */
149 for(iidx=0;iidx<4*DIM;iidx++)
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
170 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
172 fix0 = _mm256_setzero_ps();
173 fiy0 = _mm256_setzero_ps();
174 fiz0 = _mm256_setzero_ps();
175 fix1 = _mm256_setzero_ps();
176 fiy1 = _mm256_setzero_ps();
177 fiz1 = _mm256_setzero_ps();
178 fix2 = _mm256_setzero_ps();
179 fiy2 = _mm256_setzero_ps();
180 fiz2 = _mm256_setzero_ps();
182 /* Reset potential sums */
183 velecsum = _mm256_setzero_ps();
185 /* Start inner kernel loop */
/* Full groups of eight: the loop only continues while the last entry of the
 * group, jjnr[jidx+7], is non-negative; no masking is applied in this loop.
 */
186 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
189 /* Get j neighbor index, and coordinate index */
198 j_coord_offsetA = DIM*jnrA;
199 j_coord_offsetB = DIM*jnrB;
200 j_coord_offsetC = DIM*jnrC;
201 j_coord_offsetD = DIM*jnrD;
202 j_coord_offsetE = DIM*jnrE;
203 j_coord_offsetF = DIM*jnrF;
204 j_coord_offsetG = DIM*jnrG;
205 j_coord_offsetH = DIM*jnrH;
207 /* load j atom coordinates */
208 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
209 x+j_coord_offsetC,x+j_coord_offsetD,
210 x+j_coord_offsetE,x+j_coord_offsetF,
211 x+j_coord_offsetG,x+j_coord_offsetH,
212 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
214 /* Calculate displacement vector */
215 dx00 = _mm256_sub_ps(ix0,jx0);
216 dy00 = _mm256_sub_ps(iy0,jy0);
217 dz00 = _mm256_sub_ps(iz0,jz0);
218 dx01 = _mm256_sub_ps(ix0,jx1);
219 dy01 = _mm256_sub_ps(iy0,jy1);
220 dz01 = _mm256_sub_ps(iz0,jz1);
221 dx02 = _mm256_sub_ps(ix0,jx2);
222 dy02 = _mm256_sub_ps(iy0,jy2);
223 dz02 = _mm256_sub_ps(iz0,jz2);
224 dx10 = _mm256_sub_ps(ix1,jx0);
225 dy10 = _mm256_sub_ps(iy1,jy0);
226 dz10 = _mm256_sub_ps(iz1,jz0);
227 dx11 = _mm256_sub_ps(ix1,jx1);
228 dy11 = _mm256_sub_ps(iy1,jy1);
229 dz11 = _mm256_sub_ps(iz1,jz1);
230 dx12 = _mm256_sub_ps(ix1,jx2);
231 dy12 = _mm256_sub_ps(iy1,jy2);
232 dz12 = _mm256_sub_ps(iz1,jz2);
233 dx20 = _mm256_sub_ps(ix2,jx0);
234 dy20 = _mm256_sub_ps(iy2,jy0);
235 dz20 = _mm256_sub_ps(iz2,jz0);
236 dx21 = _mm256_sub_ps(ix2,jx1);
237 dy21 = _mm256_sub_ps(iy2,jy1);
238 dz21 = _mm256_sub_ps(iz2,jz1);
239 dx22 = _mm256_sub_ps(ix2,jx2);
240 dy22 = _mm256_sub_ps(iy2,jy2);
241 dz22 = _mm256_sub_ps(iz2,jz2);
243 /* Calculate squared distance and things based on it */
244 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
245 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
246 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
247 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
248 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
249 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
250 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
251 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
252 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
254 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
255 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
256 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
257 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
258 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
259 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
260 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
261 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
262 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
264 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
265 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
266 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
267 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
268 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
269 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
270 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
271 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
272 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
274 fjx0 = _mm256_setzero_ps();
275 fjy0 = _mm256_setzero_ps();
276 fjz0 = _mm256_setzero_ps();
277 fjx1 = _mm256_setzero_ps();
278 fjy1 = _mm256_setzero_ps();
279 fjz1 = _mm256_setzero_ps();
280 fjx2 = _mm256_setzero_ps();
281 fjy2 = _mm256_setzero_ps();
282 fjz2 = _mm256_setzero_ps();
284 /**************************
285 * CALCULATE INTERACTIONS *
286 **************************/
288 /* REACTION-FIELD ELECTROSTATICS */
/* velec = qq*(rinv + krf*rsq - crf), felec = qq*(rinv*rinvsq - krf2).
 * The same pair of expressions is repeated below for each of the nine
 * i-site/j-site combinations (00, 01, ... 22).
 */
289 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
290 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
292 /* Update potential sum for this i atom from the interaction with this j atom. */
293 velecsum = _mm256_add_ps(velecsum,velec);
/* NOTE(review): fscal is not assigned in the visible lines; presumably it is
 * set from felec at this point - confirm against the complete source.
 */
297 /* Calculate temporary vectorial force */
298 tx = _mm256_mul_ps(fscal,dx00);
299 ty = _mm256_mul_ps(fscal,dy00);
300 tz = _mm256_mul_ps(fscal,dz00);
302 /* Update vectorial force */
303 fix0 = _mm256_add_ps(fix0,tx);
304 fiy0 = _mm256_add_ps(fiy0,ty);
305 fiz0 = _mm256_add_ps(fiz0,tz);
/* The same vector is accumulated for the j site; the j accumulators are later
 * handed to a "decrement" helper, which presumably subtracts them from f.
 */
307 fjx0 = _mm256_add_ps(fjx0,tx);
308 fjy0 = _mm256_add_ps(fjy0,ty);
309 fjz0 = _mm256_add_ps(fjz0,tz);
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 /* REACTION-FIELD ELECTROSTATICS */
316 velec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
317 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
319 /* Update potential sum for this i atom from the interaction with this j atom. */
320 velecsum = _mm256_add_ps(velecsum,velec);
324 /* Calculate temporary vectorial force */
325 tx = _mm256_mul_ps(fscal,dx01);
326 ty = _mm256_mul_ps(fscal,dy01);
327 tz = _mm256_mul_ps(fscal,dz01);
329 /* Update vectorial force */
330 fix0 = _mm256_add_ps(fix0,tx);
331 fiy0 = _mm256_add_ps(fiy0,ty);
332 fiz0 = _mm256_add_ps(fiz0,tz);
334 fjx1 = _mm256_add_ps(fjx1,tx);
335 fjy1 = _mm256_add_ps(fjy1,ty);
336 fjz1 = _mm256_add_ps(fjz1,tz);
338 /**************************
339 * CALCULATE INTERACTIONS *
340 **************************/
342 /* REACTION-FIELD ELECTROSTATICS */
343 velec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
344 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 velecsum = _mm256_add_ps(velecsum,velec);
351 /* Calculate temporary vectorial force */
352 tx = _mm256_mul_ps(fscal,dx02);
353 ty = _mm256_mul_ps(fscal,dy02);
354 tz = _mm256_mul_ps(fscal,dz02);
356 /* Update vectorial force */
357 fix0 = _mm256_add_ps(fix0,tx);
358 fiy0 = _mm256_add_ps(fiy0,ty);
359 fiz0 = _mm256_add_ps(fiz0,tz);
361 fjx2 = _mm256_add_ps(fjx2,tx);
362 fjy2 = _mm256_add_ps(fjy2,ty);
363 fjz2 = _mm256_add_ps(fjz2,tz);
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 /* REACTION-FIELD ELECTROSTATICS */
370 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
371 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
373 /* Update potential sum for this i atom from the interaction with this j atom. */
374 velecsum = _mm256_add_ps(velecsum,velec);
378 /* Calculate temporary vectorial force */
379 tx = _mm256_mul_ps(fscal,dx10);
380 ty = _mm256_mul_ps(fscal,dy10);
381 tz = _mm256_mul_ps(fscal,dz10);
383 /* Update vectorial force */
384 fix1 = _mm256_add_ps(fix1,tx);
385 fiy1 = _mm256_add_ps(fiy1,ty);
386 fiz1 = _mm256_add_ps(fiz1,tz);
388 fjx0 = _mm256_add_ps(fjx0,tx);
389 fjy0 = _mm256_add_ps(fjy0,ty);
390 fjz0 = _mm256_add_ps(fjz0,tz);
392 /**************************
393 * CALCULATE INTERACTIONS *
394 **************************/
396 /* REACTION-FIELD ELECTROSTATICS */
397 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
398 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velecsum = _mm256_add_ps(velecsum,velec);
405 /* Calculate temporary vectorial force */
406 tx = _mm256_mul_ps(fscal,dx11);
407 ty = _mm256_mul_ps(fscal,dy11);
408 tz = _mm256_mul_ps(fscal,dz11);
410 /* Update vectorial force */
411 fix1 = _mm256_add_ps(fix1,tx);
412 fiy1 = _mm256_add_ps(fiy1,ty);
413 fiz1 = _mm256_add_ps(fiz1,tz);
415 fjx1 = _mm256_add_ps(fjx1,tx);
416 fjy1 = _mm256_add_ps(fjy1,ty);
417 fjz1 = _mm256_add_ps(fjz1,tz);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 /* REACTION-FIELD ELECTROSTATICS */
424 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
425 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velecsum = _mm256_add_ps(velecsum,velec);
432 /* Calculate temporary vectorial force */
433 tx = _mm256_mul_ps(fscal,dx12);
434 ty = _mm256_mul_ps(fscal,dy12);
435 tz = _mm256_mul_ps(fscal,dz12);
437 /* Update vectorial force */
438 fix1 = _mm256_add_ps(fix1,tx);
439 fiy1 = _mm256_add_ps(fiy1,ty);
440 fiz1 = _mm256_add_ps(fiz1,tz);
442 fjx2 = _mm256_add_ps(fjx2,tx);
443 fjy2 = _mm256_add_ps(fjy2,ty);
444 fjz2 = _mm256_add_ps(fjz2,tz);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 /* REACTION-FIELD ELECTROSTATICS */
451 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
452 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
454 /* Update potential sum for this i atom from the interaction with this j atom. */
455 velecsum = _mm256_add_ps(velecsum,velec);
459 /* Calculate temporary vectorial force */
460 tx = _mm256_mul_ps(fscal,dx20);
461 ty = _mm256_mul_ps(fscal,dy20);
462 tz = _mm256_mul_ps(fscal,dz20);
464 /* Update vectorial force */
465 fix2 = _mm256_add_ps(fix2,tx);
466 fiy2 = _mm256_add_ps(fiy2,ty);
467 fiz2 = _mm256_add_ps(fiz2,tz);
469 fjx0 = _mm256_add_ps(fjx0,tx);
470 fjy0 = _mm256_add_ps(fjy0,ty);
471 fjz0 = _mm256_add_ps(fjz0,tz);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 /* REACTION-FIELD ELECTROSTATICS */
478 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
479 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velecsum = _mm256_add_ps(velecsum,velec);
486 /* Calculate temporary vectorial force */
487 tx = _mm256_mul_ps(fscal,dx21);
488 ty = _mm256_mul_ps(fscal,dy21);
489 tz = _mm256_mul_ps(fscal,dz21);
491 /* Update vectorial force */
492 fix2 = _mm256_add_ps(fix2,tx);
493 fiy2 = _mm256_add_ps(fiy2,ty);
494 fiz2 = _mm256_add_ps(fiz2,tz);
496 fjx1 = _mm256_add_ps(fjx1,tx);
497 fjy1 = _mm256_add_ps(fjy1,ty);
498 fjz1 = _mm256_add_ps(fjz1,tz);
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 /* REACTION-FIELD ELECTROSTATICS */
505 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
506 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
508 /* Update potential sum for this i atom from the interaction with this j atom. */
509 velecsum = _mm256_add_ps(velecsum,velec);
513 /* Calculate temporary vectorial force */
514 tx = _mm256_mul_ps(fscal,dx22);
515 ty = _mm256_mul_ps(fscal,dy22);
516 tz = _mm256_mul_ps(fscal,dz22);
518 /* Update vectorial force */
519 fix2 = _mm256_add_ps(fix2,tx);
520 fiy2 = _mm256_add_ps(fiy2,ty);
521 fiz2 = _mm256_add_ps(fiz2,tz);
523 fjx2 = _mm256_add_ps(fjx2,tx);
524 fjy2 = _mm256_add_ps(fjy2,ty);
525 fjz2 = _mm256_add_ps(fjz2,tz);
/* Addresses in f of the eight j molecules, passed to the write-back helper
 * together with the nine accumulated j-force components.
 */
527 fjptrA = f+j_coord_offsetA;
528 fjptrB = f+j_coord_offsetB;
529 fjptrC = f+j_coord_offsetC;
530 fjptrD = f+j_coord_offsetD;
531 fjptrE = f+j_coord_offsetE;
532 fjptrF = f+j_coord_offsetF;
533 fjptrG = f+j_coord_offsetG;
534 fjptrH = f+j_coord_offsetH;
536 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
537 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
539 /* Inner loop uses 288 flops */
/* Remainder group: the same computation as above for a final group of eight
 * that may contain padding entries (negative indices in jjnr).
 * NOTE(review): the condition guarding this section is not shown in this
 * listing.
 */
545 /* Get j neighbor index, and coordinate index */
546 jnrlistA = jjnr[jidx];
547 jnrlistB = jjnr[jidx+1];
548 jnrlistC = jjnr[jidx+2];
549 jnrlistD = jjnr[jidx+3];
550 jnrlistE = jjnr[jidx+4];
551 jnrlistF = jjnr[jidx+5];
552 jnrlistG = jjnr[jidx+6];
553 jnrlistH = jjnr[jidx+7];
554 /* Sign of each element will be negative for non-real atoms.
555 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
556 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
558 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
559 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
561 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
562 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
563 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
564 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
565 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
566 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
567 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
568 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
569 j_coord_offsetA = DIM*jnrA;
570 j_coord_offsetB = DIM*jnrB;
571 j_coord_offsetC = DIM*jnrC;
572 j_coord_offsetD = DIM*jnrD;
573 j_coord_offsetE = DIM*jnrE;
574 j_coord_offsetF = DIM*jnrF;
575 j_coord_offsetG = DIM*jnrG;
576 j_coord_offsetH = DIM*jnrH;
578 /* load j atom coordinates */
579 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
580 x+j_coord_offsetC,x+j_coord_offsetD,
581 x+j_coord_offsetE,x+j_coord_offsetF,
582 x+j_coord_offsetG,x+j_coord_offsetH,
583 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
585 /* Calculate displacement vector */
586 dx00 = _mm256_sub_ps(ix0,jx0);
587 dy00 = _mm256_sub_ps(iy0,jy0);
588 dz00 = _mm256_sub_ps(iz0,jz0);
589 dx01 = _mm256_sub_ps(ix0,jx1);
590 dy01 = _mm256_sub_ps(iy0,jy1);
591 dz01 = _mm256_sub_ps(iz0,jz1);
592 dx02 = _mm256_sub_ps(ix0,jx2);
593 dy02 = _mm256_sub_ps(iy0,jy2);
594 dz02 = _mm256_sub_ps(iz0,jz2);
595 dx10 = _mm256_sub_ps(ix1,jx0);
596 dy10 = _mm256_sub_ps(iy1,jy0);
597 dz10 = _mm256_sub_ps(iz1,jz0);
598 dx11 = _mm256_sub_ps(ix1,jx1);
599 dy11 = _mm256_sub_ps(iy1,jy1);
600 dz11 = _mm256_sub_ps(iz1,jz1);
601 dx12 = _mm256_sub_ps(ix1,jx2);
602 dy12 = _mm256_sub_ps(iy1,jy2);
603 dz12 = _mm256_sub_ps(iz1,jz2);
604 dx20 = _mm256_sub_ps(ix2,jx0);
605 dy20 = _mm256_sub_ps(iy2,jy0);
606 dz20 = _mm256_sub_ps(iz2,jz0);
607 dx21 = _mm256_sub_ps(ix2,jx1);
608 dy21 = _mm256_sub_ps(iy2,jy1);
609 dz21 = _mm256_sub_ps(iz2,jz1);
610 dx22 = _mm256_sub_ps(ix2,jx2);
611 dy22 = _mm256_sub_ps(iy2,jy2);
612 dz22 = _mm256_sub_ps(iz2,jz2);
614 /* Calculate squared distance and things based on it */
615 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
616 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
617 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
618 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
619 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
620 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
621 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
622 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
623 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
625 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
626 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
627 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
628 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
629 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
630 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
631 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
632 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
633 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
635 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
636 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
637 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
638 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
639 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
640 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
641 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
642 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
643 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
645 fjx0 = _mm256_setzero_ps();
646 fjy0 = _mm256_setzero_ps();
647 fjz0 = _mm256_setzero_ps();
648 fjx1 = _mm256_setzero_ps();
649 fjy1 = _mm256_setzero_ps();
650 fjz1 = _mm256_setzero_ps();
651 fjx2 = _mm256_setzero_ps();
652 fjy2 = _mm256_setzero_ps();
653 fjz2 = _mm256_setzero_ps();
655 /**************************
656 * CALCULATE INTERACTIONS *
657 **************************/
659 /* REACTION-FIELD ELECTROSTATICS */
660 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
661 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
663 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Padding lanes are cleared first so that they add nothing to the energy. */
664 velec = _mm256_andnot_ps(dummy_mask,velec);
665 velecsum = _mm256_add_ps(velecsum,velec);
/* Likewise clear the force scalar in padding lanes before it is used. */
669 fscal = _mm256_andnot_ps(dummy_mask,fscal);
671 /* Calculate temporary vectorial force */
672 tx = _mm256_mul_ps(fscal,dx00);
673 ty = _mm256_mul_ps(fscal,dy00);
674 tz = _mm256_mul_ps(fscal,dz00);
676 /* Update vectorial force */
677 fix0 = _mm256_add_ps(fix0,tx);
678 fiy0 = _mm256_add_ps(fiy0,ty);
679 fiz0 = _mm256_add_ps(fiz0,tz);
681 fjx0 = _mm256_add_ps(fjx0,tx);
682 fjy0 = _mm256_add_ps(fjy0,ty);
683 fjz0 = _mm256_add_ps(fjz0,tz);
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
689 /* REACTION-FIELD ELECTROSTATICS */
690 velec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
691 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velec = _mm256_andnot_ps(dummy_mask,velec);
695 velecsum = _mm256_add_ps(velecsum,velec);
699 fscal = _mm256_andnot_ps(dummy_mask,fscal);
701 /* Calculate temporary vectorial force */
702 tx = _mm256_mul_ps(fscal,dx01);
703 ty = _mm256_mul_ps(fscal,dy01);
704 tz = _mm256_mul_ps(fscal,dz01);
706 /* Update vectorial force */
707 fix0 = _mm256_add_ps(fix0,tx);
708 fiy0 = _mm256_add_ps(fiy0,ty);
709 fiz0 = _mm256_add_ps(fiz0,tz);
711 fjx1 = _mm256_add_ps(fjx1,tx);
712 fjy1 = _mm256_add_ps(fjy1,ty);
713 fjz1 = _mm256_add_ps(fjz1,tz);
715 /**************************
716 * CALCULATE INTERACTIONS *
717 **************************/
719 /* REACTION-FIELD ELECTROSTATICS */
720 velec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
721 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
723 /* Update potential sum for this i atom from the interaction with this j atom. */
724 velec = _mm256_andnot_ps(dummy_mask,velec);
725 velecsum = _mm256_add_ps(velecsum,velec);
729 fscal = _mm256_andnot_ps(dummy_mask,fscal);
731 /* Calculate temporary vectorial force */
732 tx = _mm256_mul_ps(fscal,dx02);
733 ty = _mm256_mul_ps(fscal,dy02);
734 tz = _mm256_mul_ps(fscal,dz02);
736 /* Update vectorial force */
737 fix0 = _mm256_add_ps(fix0,tx);
738 fiy0 = _mm256_add_ps(fiy0,ty);
739 fiz0 = _mm256_add_ps(fiz0,tz);
741 fjx2 = _mm256_add_ps(fjx2,tx);
742 fjy2 = _mm256_add_ps(fjy2,ty);
743 fjz2 = _mm256_add_ps(fjz2,tz);
745 /**************************
746 * CALCULATE INTERACTIONS *
747 **************************/
749 /* REACTION-FIELD ELECTROSTATICS */
750 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
751 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
753 /* Update potential sum for this i atom from the interaction with this j atom. */
754 velec = _mm256_andnot_ps(dummy_mask,velec);
755 velecsum = _mm256_add_ps(velecsum,velec);
759 fscal = _mm256_andnot_ps(dummy_mask,fscal);
761 /* Calculate temporary vectorial force */
762 tx = _mm256_mul_ps(fscal,dx10);
763 ty = _mm256_mul_ps(fscal,dy10);
764 tz = _mm256_mul_ps(fscal,dz10);
766 /* Update vectorial force */
767 fix1 = _mm256_add_ps(fix1,tx);
768 fiy1 = _mm256_add_ps(fiy1,ty);
769 fiz1 = _mm256_add_ps(fiz1,tz);
771 fjx0 = _mm256_add_ps(fjx0,tx);
772 fjy0 = _mm256_add_ps(fjy0,ty);
773 fjz0 = _mm256_add_ps(fjz0,tz);
775 /**************************
776 * CALCULATE INTERACTIONS *
777 **************************/
779 /* REACTION-FIELD ELECTROSTATICS */
780 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
781 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
783 /* Update potential sum for this i atom from the interaction with this j atom. */
784 velec = _mm256_andnot_ps(dummy_mask,velec);
785 velecsum = _mm256_add_ps(velecsum,velec);
789 fscal = _mm256_andnot_ps(dummy_mask,fscal);
791 /* Calculate temporary vectorial force */
792 tx = _mm256_mul_ps(fscal,dx11);
793 ty = _mm256_mul_ps(fscal,dy11);
794 tz = _mm256_mul_ps(fscal,dz11);
796 /* Update vectorial force */
797 fix1 = _mm256_add_ps(fix1,tx);
798 fiy1 = _mm256_add_ps(fiy1,ty);
799 fiz1 = _mm256_add_ps(fiz1,tz);
801 fjx1 = _mm256_add_ps(fjx1,tx);
802 fjy1 = _mm256_add_ps(fjy1,ty);
803 fjz1 = _mm256_add_ps(fjz1,tz);
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 /* REACTION-FIELD ELECTROSTATICS */
810 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
811 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm256_andnot_ps(dummy_mask,velec);
815 velecsum = _mm256_add_ps(velecsum,velec);
819 fscal = _mm256_andnot_ps(dummy_mask,fscal);
821 /* Calculate temporary vectorial force */
822 tx = _mm256_mul_ps(fscal,dx12);
823 ty = _mm256_mul_ps(fscal,dy12);
824 tz = _mm256_mul_ps(fscal,dz12);
826 /* Update vectorial force */
827 fix1 = _mm256_add_ps(fix1,tx);
828 fiy1 = _mm256_add_ps(fiy1,ty);
829 fiz1 = _mm256_add_ps(fiz1,tz);
831 fjx2 = _mm256_add_ps(fjx2,tx);
832 fjy2 = _mm256_add_ps(fjy2,ty);
833 fjz2 = _mm256_add_ps(fjz2,tz);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 /* REACTION-FIELD ELECTROSTATICS */
840 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
841 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
843 /* Update potential sum for this i atom from the interaction with this j atom. */
844 velec = _mm256_andnot_ps(dummy_mask,velec);
845 velecsum = _mm256_add_ps(velecsum,velec);
849 fscal = _mm256_andnot_ps(dummy_mask,fscal);
851 /* Calculate temporary vectorial force */
852 tx = _mm256_mul_ps(fscal,dx20);
853 ty = _mm256_mul_ps(fscal,dy20);
854 tz = _mm256_mul_ps(fscal,dz20);
856 /* Update vectorial force */
857 fix2 = _mm256_add_ps(fix2,tx);
858 fiy2 = _mm256_add_ps(fiy2,ty);
859 fiz2 = _mm256_add_ps(fiz2,tz);
861 fjx0 = _mm256_add_ps(fjx0,tx);
862 fjy0 = _mm256_add_ps(fjy0,ty);
863 fjz0 = _mm256_add_ps(fjz0,tz);
865 /**************************
866 * CALCULATE INTERACTIONS *
867 **************************/
869 /* REACTION-FIELD ELECTROSTATICS */
870 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
871 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm256_andnot_ps(dummy_mask,velec);
875 velecsum = _mm256_add_ps(velecsum,velec);
879 fscal = _mm256_andnot_ps(dummy_mask,fscal);
881 /* Calculate temporary vectorial force */
882 tx = _mm256_mul_ps(fscal,dx21);
883 ty = _mm256_mul_ps(fscal,dy21);
884 tz = _mm256_mul_ps(fscal,dz21);
886 /* Update vectorial force */
887 fix2 = _mm256_add_ps(fix2,tx);
888 fiy2 = _mm256_add_ps(fiy2,ty);
889 fiz2 = _mm256_add_ps(fiz2,tz);
891 fjx1 = _mm256_add_ps(fjx1,tx);
892 fjy1 = _mm256_add_ps(fjy1,ty);
893 fjz1 = _mm256_add_ps(fjz1,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 /* REACTION-FIELD ELECTROSTATICS */
900 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
901 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
903 /* Update potential sum for this i atom from the interaction with this j atom. */
904 velec = _mm256_andnot_ps(dummy_mask,velec);
905 velecsum = _mm256_add_ps(velecsum,velec);
909 fscal = _mm256_andnot_ps(dummy_mask,fscal);
911 /* Calculate temporary vectorial force */
912 tx = _mm256_mul_ps(fscal,dx22);
913 ty = _mm256_mul_ps(fscal,dy22);
914 tz = _mm256_mul_ps(fscal,dz22);
916 /* Update vectorial force */
917 fix2 = _mm256_add_ps(fix2,tx);
918 fiy2 = _mm256_add_ps(fiy2,ty);
919 fiz2 = _mm256_add_ps(fiz2,tz);
921 fjx2 = _mm256_add_ps(fjx2,tx);
922 fjy2 = _mm256_add_ps(fjy2,ty);
923 fjz2 = _mm256_add_ps(fjz2,tz);
/* Padding entries are pointed at scratch rather than f, so the write-back
 * below does not touch a real atom's force on their behalf.
 */
925 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
926 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
927 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
928 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
929 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
930 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
931 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
932 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
934 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
935 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
937 /* Inner loop uses 288 flops */
940 /* End of innermost loop */
/* The nine i-force accumulators are handed to the helper together with this
 * molecule's entry in f and its entry in fshift (presumably reduced across
 * the SIMD lanes and added there - TODO confirm).
 */
942 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
943 f+i_coord_offset,fshift+i_shift_offset);
946 /* Update potential energies */
947 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
949 /* Increment number of inner iterations */
950 inneriter += j_index_end - j_index_start;
952 /* Outer loop uses 19 flops */
955 /* Increment number of outer iterations */
958 /* Update outer/inner flops */
/* Uses the per-iteration costs quoted above: 19 flops per outer iteration
 * and 288 flops per inner iteration.
 */
960 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*288);
963 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_single
964 * Electrostatics interaction: ReactionField
965 * VdW interaction: None
966 * Geometry: Water3-Water3
967 * Calculate force/pot: Force
970 nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_single
971 (t_nblist * gmx_restrict nlist,
972 rvec * gmx_restrict xx,
973 rvec * gmx_restrict ff,
974 t_forcerec * gmx_restrict fr,
975 t_mdatoms * gmx_restrict mdatoms,
976 nb_kernel_data_t * gmx_restrict kernel_data,
977 t_nrnb * gmx_restrict nrnb)
979 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
980 * just 0 for non-waters.
981 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
982 * jnr indices corresponding to data put in the eight positions in the SIMD register.
984 int i_shift_offset,i_coord_offset,outeriter,inneriter;
985 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
986 int jnrA,jnrB,jnrC,jnrD;
987 int jnrE,jnrF,jnrG,jnrH;
988 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
989 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
990 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
991 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
992 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
994 real *shiftvec,*fshift,*x,*f;
995 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
997 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
998 real * vdwioffsetptr0;
999 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1000 real * vdwioffsetptr1;
1001 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1002 real * vdwioffsetptr2;
1003 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1004 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1005 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1006 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1007 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1008 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1009 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1010 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1011 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1012 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1013 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1014 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1015 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1016 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1017 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1018 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1019 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1021 __m256 dummy_mask,cutoff_mask;
1022 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1023 __m256 one = _mm256_set1_ps(1.0);
1024 __m256 two = _mm256_set1_ps(2.0);
1030 jindex = nlist->jindex;
1032 shiftidx = nlist->shift;
1034 shiftvec = fr->shift_vec[0];
1035 fshift = fr->fshift[0];
1036 facel = _mm256_set1_ps(fr->epsfac);
1037 charge = mdatoms->chargeA;
1038 krf = _mm256_set1_ps(fr->ic->k_rf);
1039 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
1040 crf = _mm256_set1_ps(fr->ic->c_rf);
1042 /* Setup water-specific parameters */
1043 inr = nlist->iinr[0];
1044 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1045 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1046 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1048 jq0 = _mm256_set1_ps(charge[inr+0]);
1049 jq1 = _mm256_set1_ps(charge[inr+1]);
1050 jq2 = _mm256_set1_ps(charge[inr+2]);
1051 qq00 = _mm256_mul_ps(iq0,jq0);
1052 qq01 = _mm256_mul_ps(iq0,jq1);
1053 qq02 = _mm256_mul_ps(iq0,jq2);
1054 qq10 = _mm256_mul_ps(iq1,jq0);
1055 qq11 = _mm256_mul_ps(iq1,jq1);
1056 qq12 = _mm256_mul_ps(iq1,jq2);
1057 qq20 = _mm256_mul_ps(iq2,jq0);
1058 qq21 = _mm256_mul_ps(iq2,jq1);
1059 qq22 = _mm256_mul_ps(iq2,jq2);
1061 /* Avoid stupid compiler warnings */
1062 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1063 j_coord_offsetA = 0;
1064 j_coord_offsetB = 0;
1065 j_coord_offsetC = 0;
1066 j_coord_offsetD = 0;
1067 j_coord_offsetE = 0;
1068 j_coord_offsetF = 0;
1069 j_coord_offsetG = 0;
1070 j_coord_offsetH = 0;
1075 for(iidx=0;iidx<4*DIM;iidx++)
1077 scratch[iidx] = 0.0;
1080 /* Start outer loop over neighborlists */
1081 for(iidx=0; iidx<nri; iidx++)
1083 /* Load shift vector for this list */
1084 i_shift_offset = DIM*shiftidx[iidx];
1086 /* Load limits for loop over neighbors */
1087 j_index_start = jindex[iidx];
1088 j_index_end = jindex[iidx+1];
1090 /* Get outer coordinate index */
1092 i_coord_offset = DIM*inr;
1094 /* Load i particle coords and add shift vector */
1095 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1096 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1098 fix0 = _mm256_setzero_ps();
1099 fiy0 = _mm256_setzero_ps();
1100 fiz0 = _mm256_setzero_ps();
1101 fix1 = _mm256_setzero_ps();
1102 fiy1 = _mm256_setzero_ps();
1103 fiz1 = _mm256_setzero_ps();
1104 fix2 = _mm256_setzero_ps();
1105 fiy2 = _mm256_setzero_ps();
1106 fiz2 = _mm256_setzero_ps();
1108 /* Start inner kernel loop */
1109 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1112 /* Get j neighbor index, and coordinate index */
1114 jnrB = jjnr[jidx+1];
1115 jnrC = jjnr[jidx+2];
1116 jnrD = jjnr[jidx+3];
1117 jnrE = jjnr[jidx+4];
1118 jnrF = jjnr[jidx+5];
1119 jnrG = jjnr[jidx+6];
1120 jnrH = jjnr[jidx+7];
1121 j_coord_offsetA = DIM*jnrA;
1122 j_coord_offsetB = DIM*jnrB;
1123 j_coord_offsetC = DIM*jnrC;
1124 j_coord_offsetD = DIM*jnrD;
1125 j_coord_offsetE = DIM*jnrE;
1126 j_coord_offsetF = DIM*jnrF;
1127 j_coord_offsetG = DIM*jnrG;
1128 j_coord_offsetH = DIM*jnrH;
1130 /* load j atom coordinates */
1131 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1132 x+j_coord_offsetC,x+j_coord_offsetD,
1133 x+j_coord_offsetE,x+j_coord_offsetF,
1134 x+j_coord_offsetG,x+j_coord_offsetH,
1135 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1137 /* Calculate displacement vector */
1138 dx00 = _mm256_sub_ps(ix0,jx0);
1139 dy00 = _mm256_sub_ps(iy0,jy0);
1140 dz00 = _mm256_sub_ps(iz0,jz0);
1141 dx01 = _mm256_sub_ps(ix0,jx1);
1142 dy01 = _mm256_sub_ps(iy0,jy1);
1143 dz01 = _mm256_sub_ps(iz0,jz1);
1144 dx02 = _mm256_sub_ps(ix0,jx2);
1145 dy02 = _mm256_sub_ps(iy0,jy2);
1146 dz02 = _mm256_sub_ps(iz0,jz2);
1147 dx10 = _mm256_sub_ps(ix1,jx0);
1148 dy10 = _mm256_sub_ps(iy1,jy0);
1149 dz10 = _mm256_sub_ps(iz1,jz0);
1150 dx11 = _mm256_sub_ps(ix1,jx1);
1151 dy11 = _mm256_sub_ps(iy1,jy1);
1152 dz11 = _mm256_sub_ps(iz1,jz1);
1153 dx12 = _mm256_sub_ps(ix1,jx2);
1154 dy12 = _mm256_sub_ps(iy1,jy2);
1155 dz12 = _mm256_sub_ps(iz1,jz2);
1156 dx20 = _mm256_sub_ps(ix2,jx0);
1157 dy20 = _mm256_sub_ps(iy2,jy0);
1158 dz20 = _mm256_sub_ps(iz2,jz0);
1159 dx21 = _mm256_sub_ps(ix2,jx1);
1160 dy21 = _mm256_sub_ps(iy2,jy1);
1161 dz21 = _mm256_sub_ps(iz2,jz1);
1162 dx22 = _mm256_sub_ps(ix2,jx2);
1163 dy22 = _mm256_sub_ps(iy2,jy2);
1164 dz22 = _mm256_sub_ps(iz2,jz2);
1166 /* Calculate squared distance and things based on it */
1167 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1168 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1169 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1170 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1171 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1172 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1173 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1174 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1175 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1177 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1178 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1179 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1180 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1181 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1182 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1183 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1184 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1185 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1187 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1188 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1189 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1190 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1191 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1192 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1193 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1194 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1195 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1197 fjx0 = _mm256_setzero_ps();
1198 fjy0 = _mm256_setzero_ps();
1199 fjz0 = _mm256_setzero_ps();
1200 fjx1 = _mm256_setzero_ps();
1201 fjy1 = _mm256_setzero_ps();
1202 fjz1 = _mm256_setzero_ps();
1203 fjx2 = _mm256_setzero_ps();
1204 fjy2 = _mm256_setzero_ps();
1205 fjz2 = _mm256_setzero_ps();
1207 /**************************
1208 * CALCULATE INTERACTIONS *
1209 **************************/
1211 /* REACTION-FIELD ELECTROSTATICS */
1212 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1216 /* Calculate temporary vectorial force */
1217 tx = _mm256_mul_ps(fscal,dx00);
1218 ty = _mm256_mul_ps(fscal,dy00);
1219 tz = _mm256_mul_ps(fscal,dz00);
1221 /* Update vectorial force */
1222 fix0 = _mm256_add_ps(fix0,tx);
1223 fiy0 = _mm256_add_ps(fiy0,ty);
1224 fiz0 = _mm256_add_ps(fiz0,tz);
1226 fjx0 = _mm256_add_ps(fjx0,tx);
1227 fjy0 = _mm256_add_ps(fjy0,ty);
1228 fjz0 = _mm256_add_ps(fjz0,tz);
1230 /**************************
1231 * CALCULATE INTERACTIONS *
1232 **************************/
1234 /* REACTION-FIELD ELECTROSTATICS */
1235 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1239 /* Calculate temporary vectorial force */
1240 tx = _mm256_mul_ps(fscal,dx01);
1241 ty = _mm256_mul_ps(fscal,dy01);
1242 tz = _mm256_mul_ps(fscal,dz01);
1244 /* Update vectorial force */
1245 fix0 = _mm256_add_ps(fix0,tx);
1246 fiy0 = _mm256_add_ps(fiy0,ty);
1247 fiz0 = _mm256_add_ps(fiz0,tz);
1249 fjx1 = _mm256_add_ps(fjx1,tx);
1250 fjy1 = _mm256_add_ps(fjy1,ty);
1251 fjz1 = _mm256_add_ps(fjz1,tz);
1253 /**************************
1254 * CALCULATE INTERACTIONS *
1255 **************************/
1257 /* REACTION-FIELD ELECTROSTATICS */
1258 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1262 /* Calculate temporary vectorial force */
1263 tx = _mm256_mul_ps(fscal,dx02);
1264 ty = _mm256_mul_ps(fscal,dy02);
1265 tz = _mm256_mul_ps(fscal,dz02);
1267 /* Update vectorial force */
1268 fix0 = _mm256_add_ps(fix0,tx);
1269 fiy0 = _mm256_add_ps(fiy0,ty);
1270 fiz0 = _mm256_add_ps(fiz0,tz);
1272 fjx2 = _mm256_add_ps(fjx2,tx);
1273 fjy2 = _mm256_add_ps(fjy2,ty);
1274 fjz2 = _mm256_add_ps(fjz2,tz);
1276 /**************************
1277 * CALCULATE INTERACTIONS *
1278 **************************/
1280 /* REACTION-FIELD ELECTROSTATICS */
1281 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1285 /* Calculate temporary vectorial force */
1286 tx = _mm256_mul_ps(fscal,dx10);
1287 ty = _mm256_mul_ps(fscal,dy10);
1288 tz = _mm256_mul_ps(fscal,dz10);
1290 /* Update vectorial force */
1291 fix1 = _mm256_add_ps(fix1,tx);
1292 fiy1 = _mm256_add_ps(fiy1,ty);
1293 fiz1 = _mm256_add_ps(fiz1,tz);
1295 fjx0 = _mm256_add_ps(fjx0,tx);
1296 fjy0 = _mm256_add_ps(fjy0,ty);
1297 fjz0 = _mm256_add_ps(fjz0,tz);
1299 /**************************
1300 * CALCULATE INTERACTIONS *
1301 **************************/
1303 /* REACTION-FIELD ELECTROSTATICS */
1304 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1308 /* Calculate temporary vectorial force */
1309 tx = _mm256_mul_ps(fscal,dx11);
1310 ty = _mm256_mul_ps(fscal,dy11);
1311 tz = _mm256_mul_ps(fscal,dz11);
1313 /* Update vectorial force */
1314 fix1 = _mm256_add_ps(fix1,tx);
1315 fiy1 = _mm256_add_ps(fiy1,ty);
1316 fiz1 = _mm256_add_ps(fiz1,tz);
1318 fjx1 = _mm256_add_ps(fjx1,tx);
1319 fjy1 = _mm256_add_ps(fjy1,ty);
1320 fjz1 = _mm256_add_ps(fjz1,tz);
1322 /**************************
1323 * CALCULATE INTERACTIONS *
1324 **************************/
1326 /* REACTION-FIELD ELECTROSTATICS */
1327 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1331 /* Calculate temporary vectorial force */
1332 tx = _mm256_mul_ps(fscal,dx12);
1333 ty = _mm256_mul_ps(fscal,dy12);
1334 tz = _mm256_mul_ps(fscal,dz12);
1336 /* Update vectorial force */
1337 fix1 = _mm256_add_ps(fix1,tx);
1338 fiy1 = _mm256_add_ps(fiy1,ty);
1339 fiz1 = _mm256_add_ps(fiz1,tz);
1341 fjx2 = _mm256_add_ps(fjx2,tx);
1342 fjy2 = _mm256_add_ps(fjy2,ty);
1343 fjz2 = _mm256_add_ps(fjz2,tz);
1345 /**************************
1346 * CALCULATE INTERACTIONS *
1347 **************************/
1349 /* REACTION-FIELD ELECTROSTATICS */
1350 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1354 /* Calculate temporary vectorial force */
1355 tx = _mm256_mul_ps(fscal,dx20);
1356 ty = _mm256_mul_ps(fscal,dy20);
1357 tz = _mm256_mul_ps(fscal,dz20);
1359 /* Update vectorial force */
1360 fix2 = _mm256_add_ps(fix2,tx);
1361 fiy2 = _mm256_add_ps(fiy2,ty);
1362 fiz2 = _mm256_add_ps(fiz2,tz);
1364 fjx0 = _mm256_add_ps(fjx0,tx);
1365 fjy0 = _mm256_add_ps(fjy0,ty);
1366 fjz0 = _mm256_add_ps(fjz0,tz);
1368 /**************************
1369 * CALCULATE INTERACTIONS *
1370 **************************/
1372 /* REACTION-FIELD ELECTROSTATICS */
1373 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1377 /* Calculate temporary vectorial force */
1378 tx = _mm256_mul_ps(fscal,dx21);
1379 ty = _mm256_mul_ps(fscal,dy21);
1380 tz = _mm256_mul_ps(fscal,dz21);
1382 /* Update vectorial force */
1383 fix2 = _mm256_add_ps(fix2,tx);
1384 fiy2 = _mm256_add_ps(fiy2,ty);
1385 fiz2 = _mm256_add_ps(fiz2,tz);
1387 fjx1 = _mm256_add_ps(fjx1,tx);
1388 fjy1 = _mm256_add_ps(fjy1,ty);
1389 fjz1 = _mm256_add_ps(fjz1,tz);
1391 /**************************
1392 * CALCULATE INTERACTIONS *
1393 **************************/
1395 /* REACTION-FIELD ELECTROSTATICS */
1396 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1400 /* Calculate temporary vectorial force */
1401 tx = _mm256_mul_ps(fscal,dx22);
1402 ty = _mm256_mul_ps(fscal,dy22);
1403 tz = _mm256_mul_ps(fscal,dz22);
1405 /* Update vectorial force */
1406 fix2 = _mm256_add_ps(fix2,tx);
1407 fiy2 = _mm256_add_ps(fiy2,ty);
1408 fiz2 = _mm256_add_ps(fiz2,tz);
1410 fjx2 = _mm256_add_ps(fjx2,tx);
1411 fjy2 = _mm256_add_ps(fjy2,ty);
1412 fjz2 = _mm256_add_ps(fjz2,tz);
1414 fjptrA = f+j_coord_offsetA;
1415 fjptrB = f+j_coord_offsetB;
1416 fjptrC = f+j_coord_offsetC;
1417 fjptrD = f+j_coord_offsetD;
1418 fjptrE = f+j_coord_offsetE;
1419 fjptrF = f+j_coord_offsetF;
1420 fjptrG = f+j_coord_offsetG;
1421 fjptrH = f+j_coord_offsetH;
1423 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1424 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1426 /* Inner loop uses 243 flops */
1429 if(jidx<j_index_end)
1432 /* Get j neighbor index, and coordinate index */
1433 jnrlistA = jjnr[jidx];
1434 jnrlistB = jjnr[jidx+1];
1435 jnrlistC = jjnr[jidx+2];
1436 jnrlistD = jjnr[jidx+3];
1437 jnrlistE = jjnr[jidx+4];
1438 jnrlistF = jjnr[jidx+5];
1439 jnrlistG = jjnr[jidx+6];
1440 jnrlistH = jjnr[jidx+7];
1441 /* Sign of each element will be negative for non-real atoms.
1442 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1443 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1445 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1446 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1448 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1449 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1450 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1451 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1452 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1453 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1454 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1455 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1456 j_coord_offsetA = DIM*jnrA;
1457 j_coord_offsetB = DIM*jnrB;
1458 j_coord_offsetC = DIM*jnrC;
1459 j_coord_offsetD = DIM*jnrD;
1460 j_coord_offsetE = DIM*jnrE;
1461 j_coord_offsetF = DIM*jnrF;
1462 j_coord_offsetG = DIM*jnrG;
1463 j_coord_offsetH = DIM*jnrH;
1465 /* load j atom coordinates */
1466 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1467 x+j_coord_offsetC,x+j_coord_offsetD,
1468 x+j_coord_offsetE,x+j_coord_offsetF,
1469 x+j_coord_offsetG,x+j_coord_offsetH,
1470 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1472 /* Calculate displacement vector */
1473 dx00 = _mm256_sub_ps(ix0,jx0);
1474 dy00 = _mm256_sub_ps(iy0,jy0);
1475 dz00 = _mm256_sub_ps(iz0,jz0);
1476 dx01 = _mm256_sub_ps(ix0,jx1);
1477 dy01 = _mm256_sub_ps(iy0,jy1);
1478 dz01 = _mm256_sub_ps(iz0,jz1);
1479 dx02 = _mm256_sub_ps(ix0,jx2);
1480 dy02 = _mm256_sub_ps(iy0,jy2);
1481 dz02 = _mm256_sub_ps(iz0,jz2);
1482 dx10 = _mm256_sub_ps(ix1,jx0);
1483 dy10 = _mm256_sub_ps(iy1,jy0);
1484 dz10 = _mm256_sub_ps(iz1,jz0);
1485 dx11 = _mm256_sub_ps(ix1,jx1);
1486 dy11 = _mm256_sub_ps(iy1,jy1);
1487 dz11 = _mm256_sub_ps(iz1,jz1);
1488 dx12 = _mm256_sub_ps(ix1,jx2);
1489 dy12 = _mm256_sub_ps(iy1,jy2);
1490 dz12 = _mm256_sub_ps(iz1,jz2);
1491 dx20 = _mm256_sub_ps(ix2,jx0);
1492 dy20 = _mm256_sub_ps(iy2,jy0);
1493 dz20 = _mm256_sub_ps(iz2,jz0);
1494 dx21 = _mm256_sub_ps(ix2,jx1);
1495 dy21 = _mm256_sub_ps(iy2,jy1);
1496 dz21 = _mm256_sub_ps(iz2,jz1);
1497 dx22 = _mm256_sub_ps(ix2,jx2);
1498 dy22 = _mm256_sub_ps(iy2,jy2);
1499 dz22 = _mm256_sub_ps(iz2,jz2);
1501 /* Calculate squared distance and things based on it */
1502 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1503 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1504 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1505 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1506 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1507 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1508 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1509 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1510 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1512 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1513 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1514 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1515 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1516 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1517 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1518 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1519 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1520 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1522 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1523 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1524 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1525 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1526 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1527 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1528 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1529 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1530 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1532 fjx0 = _mm256_setzero_ps();
1533 fjy0 = _mm256_setzero_ps();
1534 fjz0 = _mm256_setzero_ps();
1535 fjx1 = _mm256_setzero_ps();
1536 fjy1 = _mm256_setzero_ps();
1537 fjz1 = _mm256_setzero_ps();
1538 fjx2 = _mm256_setzero_ps();
1539 fjy2 = _mm256_setzero_ps();
1540 fjz2 = _mm256_setzero_ps();
1542 /**************************
1543 * CALCULATE INTERACTIONS *
1544 **************************/
1546 /* REACTION-FIELD ELECTROSTATICS */
1547 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1551 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1553 /* Calculate temporary vectorial force */
1554 tx = _mm256_mul_ps(fscal,dx00);
1555 ty = _mm256_mul_ps(fscal,dy00);
1556 tz = _mm256_mul_ps(fscal,dz00);
1558 /* Update vectorial force */
1559 fix0 = _mm256_add_ps(fix0,tx);
1560 fiy0 = _mm256_add_ps(fiy0,ty);
1561 fiz0 = _mm256_add_ps(fiz0,tz);
1563 fjx0 = _mm256_add_ps(fjx0,tx);
1564 fjy0 = _mm256_add_ps(fjy0,ty);
1565 fjz0 = _mm256_add_ps(fjz0,tz);
1567 /**************************
1568 * CALCULATE INTERACTIONS *
1569 **************************/
1571 /* REACTION-FIELD ELECTROSTATICS */
1572 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1576 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1578 /* Calculate temporary vectorial force */
1579 tx = _mm256_mul_ps(fscal,dx01);
1580 ty = _mm256_mul_ps(fscal,dy01);
1581 tz = _mm256_mul_ps(fscal,dz01);
1583 /* Update vectorial force */
1584 fix0 = _mm256_add_ps(fix0,tx);
1585 fiy0 = _mm256_add_ps(fiy0,ty);
1586 fiz0 = _mm256_add_ps(fiz0,tz);
1588 fjx1 = _mm256_add_ps(fjx1,tx);
1589 fjy1 = _mm256_add_ps(fjy1,ty);
1590 fjz1 = _mm256_add_ps(fjz1,tz);
1592 /**************************
1593 * CALCULATE INTERACTIONS *
1594 **************************/
1596 /* REACTION-FIELD ELECTROSTATICS */
1597 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1601 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1603 /* Calculate temporary vectorial force */
1604 tx = _mm256_mul_ps(fscal,dx02);
1605 ty = _mm256_mul_ps(fscal,dy02);
1606 tz = _mm256_mul_ps(fscal,dz02);
1608 /* Update vectorial force */
1609 fix0 = _mm256_add_ps(fix0,tx);
1610 fiy0 = _mm256_add_ps(fiy0,ty);
1611 fiz0 = _mm256_add_ps(fiz0,tz);
1613 fjx2 = _mm256_add_ps(fjx2,tx);
1614 fjy2 = _mm256_add_ps(fjy2,ty);
1615 fjz2 = _mm256_add_ps(fjz2,tz);
1617 /**************************
1618 * CALCULATE INTERACTIONS *
1619 **************************/
1621 /* REACTION-FIELD ELECTROSTATICS */
1622 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1626 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1628 /* Calculate temporary vectorial force */
1629 tx = _mm256_mul_ps(fscal,dx10);
1630 ty = _mm256_mul_ps(fscal,dy10);
1631 tz = _mm256_mul_ps(fscal,dz10);
1633 /* Update vectorial force */
1634 fix1 = _mm256_add_ps(fix1,tx);
1635 fiy1 = _mm256_add_ps(fiy1,ty);
1636 fiz1 = _mm256_add_ps(fiz1,tz);
1638 fjx0 = _mm256_add_ps(fjx0,tx);
1639 fjy0 = _mm256_add_ps(fjy0,ty);
1640 fjz0 = _mm256_add_ps(fjz0,tz);
1642 /**************************
1643 * CALCULATE INTERACTIONS *
1644 **************************/
1646 /* REACTION-FIELD ELECTROSTATICS */
1647 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1651 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1653 /* Calculate temporary vectorial force */
1654 tx = _mm256_mul_ps(fscal,dx11);
1655 ty = _mm256_mul_ps(fscal,dy11);
1656 tz = _mm256_mul_ps(fscal,dz11);
1658 /* Update vectorial force */
1659 fix1 = _mm256_add_ps(fix1,tx);
1660 fiy1 = _mm256_add_ps(fiy1,ty);
1661 fiz1 = _mm256_add_ps(fiz1,tz);
1663 fjx1 = _mm256_add_ps(fjx1,tx);
1664 fjy1 = _mm256_add_ps(fjy1,ty);
1665 fjz1 = _mm256_add_ps(fjz1,tz);
1667 /**************************
1668 * CALCULATE INTERACTIONS *
1669 **************************/
1671 /* REACTION-FIELD ELECTROSTATICS */
1672 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1676 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1678 /* Calculate temporary vectorial force */
1679 tx = _mm256_mul_ps(fscal,dx12);
1680 ty = _mm256_mul_ps(fscal,dy12);
1681 tz = _mm256_mul_ps(fscal,dz12);
1683 /* Update vectorial force */
1684 fix1 = _mm256_add_ps(fix1,tx);
1685 fiy1 = _mm256_add_ps(fiy1,ty);
1686 fiz1 = _mm256_add_ps(fiz1,tz);
1688 fjx2 = _mm256_add_ps(fjx2,tx);
1689 fjy2 = _mm256_add_ps(fjy2,ty);
1690 fjz2 = _mm256_add_ps(fjz2,tz);
1692 /**************************
1693 * CALCULATE INTERACTIONS *
1694 **************************/
1696 /* REACTION-FIELD ELECTROSTATICS */
1697 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1701 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1703 /* Calculate temporary vectorial force */
1704 tx = _mm256_mul_ps(fscal,dx20);
1705 ty = _mm256_mul_ps(fscal,dy20);
1706 tz = _mm256_mul_ps(fscal,dz20);
1708 /* Update vectorial force */
1709 fix2 = _mm256_add_ps(fix2,tx);
1710 fiy2 = _mm256_add_ps(fiy2,ty);
1711 fiz2 = _mm256_add_ps(fiz2,tz);
1713 fjx0 = _mm256_add_ps(fjx0,tx);
1714 fjy0 = _mm256_add_ps(fjy0,ty);
1715 fjz0 = _mm256_add_ps(fjz0,tz);
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 /* REACTION-FIELD ELECTROSTATICS */
1722 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1726 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1728 /* Calculate temporary vectorial force */
1729 tx = _mm256_mul_ps(fscal,dx21);
1730 ty = _mm256_mul_ps(fscal,dy21);
1731 tz = _mm256_mul_ps(fscal,dz21);
1733 /* Update vectorial force */
1734 fix2 = _mm256_add_ps(fix2,tx);
1735 fiy2 = _mm256_add_ps(fiy2,ty);
1736 fiz2 = _mm256_add_ps(fiz2,tz);
1738 fjx1 = _mm256_add_ps(fjx1,tx);
1739 fjy1 = _mm256_add_ps(fjy1,ty);
1740 fjz1 = _mm256_add_ps(fjz1,tz);
1742 /**************************
1743 * CALCULATE INTERACTIONS *
1744 **************************/
1746 /* REACTION-FIELD ELECTROSTATICS */
1747 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1751 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1753 /* Calculate temporary vectorial force */
1754 tx = _mm256_mul_ps(fscal,dx22);
1755 ty = _mm256_mul_ps(fscal,dy22);
1756 tz = _mm256_mul_ps(fscal,dz22);
1758 /* Update vectorial force */
1759 fix2 = _mm256_add_ps(fix2,tx);
1760 fiy2 = _mm256_add_ps(fiy2,ty);
1761 fiz2 = _mm256_add_ps(fiz2,tz);
1763 fjx2 = _mm256_add_ps(fjx2,tx);
1764 fjy2 = _mm256_add_ps(fjy2,ty);
1765 fjz2 = _mm256_add_ps(fjz2,tz);
1767 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1768 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1769 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1770 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1771 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1772 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1773 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1774 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1776 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1777 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1779 /* Inner loop uses 243 flops */
1782 /* End of innermost loop */
1784 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1785 f+i_coord_offset,fshift+i_shift_offset);
1787 /* Increment number of inner iterations */
1788 inneriter += j_index_end - j_index_start;
1790 /* Outer loop uses 18 flops */
1793 /* Increment number of outer iterations */
1796 /* Update outer/inner flops */
1798 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*243);