2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
/*
 * AVX-256 single-precision kernel entry point (potential and force variant).
 * From the visible code: for each i entry of the neighbour list it evaluates
 * reaction-field electrostatics between i sites 1..3 and j sites 1..3 of the
 * listed j entries, eight j entries per SIMD pass, and accumulates forces,
 * shift forces, the electrostatic energy (kernel_data->energygrp_elec) and a
 * flop count (nrnb).
 * NOTE(review): several lines of this function (return type, braces, some
 * declarations and assignments) are not visible in this excerpt - confirm
 * details against the complete file.
 */
44 nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
 * (In the code visible here only suffixes 1,2,3 are used for coordinates, charges and forces.)
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr1;
73 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
79 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
81 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
83 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
84 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
85 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
86 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
87 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
88 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
89 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
90 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
91 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
92 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m256 dummy_mask,cutoff_mask;
96 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
97 __m256 one = _mm256_set1_ps(1.0);
98 __m256 two = _mm256_set1_ps(2.0);
104 jindex = nlist->jindex;
106 shiftidx = nlist->shift;
108 shiftvec = fr->shift_vec[0];
109 fshift = fr->fshift[0];
110 facel = _mm256_set1_ps(fr->epsfac);
111 charge = mdatoms->chargeA;
112 krf = _mm256_set1_ps(fr->ic->k_rf);
113 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
114 crf = _mm256_set1_ps(fr->ic->c_rf);
116 /* Setup water-specific parameters */
/* inr is the first i entry of the list. Its charges at offsets +1..+3 are used for
 * both the i and the j side, and the resulting products are not reassigned in the
 * visible loops, so every listed molecule is treated as carrying these charges. */
117 inr = nlist->iinr[0];
/* i-side charges are pre-multiplied by facel (broadcast of fr->epsfac) */
118 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
119 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
120 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
122 jq1 = _mm256_set1_ps(charge[inr+1]);
123 jq2 = _mm256_set1_ps(charge[inr+2]);
124 jq3 = _mm256_set1_ps(charge[inr+3]);
/* Pairwise charge products: qqIJ = (facel*q_I) * q_J for i site I and j site J */
125 qq11 = _mm256_mul_ps(iq1,jq1);
126 qq12 = _mm256_mul_ps(iq1,jq2);
127 qq13 = _mm256_mul_ps(iq1,jq3);
128 qq21 = _mm256_mul_ps(iq2,jq1);
129 qq22 = _mm256_mul_ps(iq2,jq2);
130 qq23 = _mm256_mul_ps(iq2,jq3);
131 qq31 = _mm256_mul_ps(iq3,jq1);
132 qq32 = _mm256_mul_ps(iq3,jq2);
133 qq33 = _mm256_mul_ps(iq3,jq3);
135 /* Avoid stupid compiler warnings */
136 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
/* NOTE(review): the body of the following loop is not visible in this excerpt; as
 * shown, it would govern the statement that follows it - confirm against the
 * complete file. */
149 for(iidx=0;iidx<4*DIM;iidx++)
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
/* NOTE(review): no per-iteration assignment to inr is visible in this excerpt; the
 * last visible assignment is inr = nlist->iinr[0] - confirm against the complete file. */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
/* The +DIM offset starts the load one rvec past the molecule's first atom, so the
 * three rvecs read here are sites 1..3. */
169 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
170 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Zero the i-site force accumulators for this list entry */
172 fix1 = _mm256_setzero_ps();
173 fiy1 = _mm256_setzero_ps();
174 fiz1 = _mm256_setzero_ps();
175 fix2 = _mm256_setzero_ps();
176 fiy2 = _mm256_setzero_ps();
177 fiz2 = _mm256_setzero_ps();
178 fix3 = _mm256_setzero_ps();
179 fiy3 = _mm256_setzero_ps();
180 fiz3 = _mm256_setzero_ps();
182 /* Reset potential sums */
183 velecsum = _mm256_setzero_ps();
185 /* Start inner kernel loop */
/* Unmasked passes over eight j entries at a time: the loop condition requires
 * jjnr[jidx+7] to be non-negative, so the loop stops at the first group of eight
 * whose last entry is negative. */
186 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
189 /* Get j neighbor index, and coordinate index */
/* NOTE(review): the assignments to jnrA..jnrH are not visible in this excerpt;
 * presumably they are read from jjnr[jidx]..jjnr[jidx+7] - confirm. */
198 j_coord_offsetA = DIM*jnrA;
199 j_coord_offsetB = DIM*jnrB;
200 j_coord_offsetC = DIM*jnrC;
201 j_coord_offsetD = DIM*jnrD;
202 j_coord_offsetE = DIM*jnrE;
203 j_coord_offsetF = DIM*jnrF;
204 j_coord_offsetG = DIM*jnrG;
205 j_coord_offsetH = DIM*jnrH;
207 /* load j atom coordinates */
/* As on the i side, +DIM skips the first atom of each j molecule so that
 * sites 1..3 are loaded; one pointer per SIMD lane A..H. */
208 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
209 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
210 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
211 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
212 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
214 /* Calculate displacement vector */
/* dIJ = r_i(I) - r_j(J) for all nine site pairs */
215 dx11 = _mm256_sub_ps(ix1,jx1);
216 dy11 = _mm256_sub_ps(iy1,jy1);
217 dz11 = _mm256_sub_ps(iz1,jz1);
218 dx12 = _mm256_sub_ps(ix1,jx2);
219 dy12 = _mm256_sub_ps(iy1,jy2);
220 dz12 = _mm256_sub_ps(iz1,jz2);
221 dx13 = _mm256_sub_ps(ix1,jx3);
222 dy13 = _mm256_sub_ps(iy1,jy3);
223 dz13 = _mm256_sub_ps(iz1,jz3);
224 dx21 = _mm256_sub_ps(ix2,jx1);
225 dy21 = _mm256_sub_ps(iy2,jy1);
226 dz21 = _mm256_sub_ps(iz2,jz1);
227 dx22 = _mm256_sub_ps(ix2,jx2);
228 dy22 = _mm256_sub_ps(iy2,jy2);
229 dz22 = _mm256_sub_ps(iz2,jz2);
230 dx23 = _mm256_sub_ps(ix2,jx3);
231 dy23 = _mm256_sub_ps(iy2,jy3);
232 dz23 = _mm256_sub_ps(iz2,jz3);
233 dx31 = _mm256_sub_ps(ix3,jx1);
234 dy31 = _mm256_sub_ps(iy3,jy1);
235 dz31 = _mm256_sub_ps(iz3,jz1);
236 dx32 = _mm256_sub_ps(ix3,jx2);
237 dy32 = _mm256_sub_ps(iy3,jy2);
238 dz32 = _mm256_sub_ps(iz3,jz2);
239 dx33 = _mm256_sub_ps(ix3,jx3);
240 dy33 = _mm256_sub_ps(iy3,jy3);
241 dz33 = _mm256_sub_ps(iz3,jz3);
243 /* Calculate squared distance and things based on it */
244 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
245 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
246 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
247 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
248 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
249 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
250 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
251 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
252 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
/* rinvIJ from the invsqrt helper (presumably 1/sqrt(rsqIJ) - defined outside this file) */
254 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
255 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
256 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
257 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
258 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
259 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
260 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
261 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
262 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
/* rinvsqIJ = rinvIJ * rinvIJ */
264 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
265 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
266 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
267 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
268 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
269 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
270 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
271 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
272 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
/* Zero the j-site force accumulators for this pass */
274 fjx1 = _mm256_setzero_ps();
275 fjy1 = _mm256_setzero_ps();
276 fjz1 = _mm256_setzero_ps();
277 fjx2 = _mm256_setzero_ps();
278 fjy2 = _mm256_setzero_ps();
279 fjz2 = _mm256_setzero_ps();
280 fjx3 = _mm256_setzero_ps();
281 fjy3 = _mm256_setzero_ps();
282 fjz3 = _mm256_setzero_ps();
/* Unmasked pass, i site 1 against j sites 1, 2 and 3.  For each pair IJ:
 *   velec = qqIJ*(rinvIJ + krf*rsqIJ - crf)
 *   felec = qqIJ*(rinvIJ*rinvsqIJ - krf2)
 * The vector (tx,ty,tz) = fscal*dIJ is added to both the i-site and the j-site
 * accumulators.
 * NOTE(review): fscal is used below but no assignment to it is visible between the
 * computation of felec and its use; presumably a line "fscal = felec;" is missing
 * from this excerpt - confirm against the complete file. */
284 /**************************
285 * CALCULATE INTERACTIONS *
286 **************************/
288 /* REACTION-FIELD ELECTROSTATICS */
289 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
290 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
292 /* Update potential sum for this i atom from the interaction with this j atom. */
293 velecsum = _mm256_add_ps(velecsum,velec);
297 /* Calculate temporary vectorial force */
298 tx = _mm256_mul_ps(fscal,dx11);
299 ty = _mm256_mul_ps(fscal,dy11);
300 tz = _mm256_mul_ps(fscal,dz11);
302 /* Update vectorial force */
303 fix1 = _mm256_add_ps(fix1,tx);
304 fiy1 = _mm256_add_ps(fiy1,ty);
305 fiz1 = _mm256_add_ps(fiz1,tz);
307 fjx1 = _mm256_add_ps(fjx1,tx);
308 fjy1 = _mm256_add_ps(fjy1,ty);
309 fjz1 = _mm256_add_ps(fjz1,tz);
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 /* REACTION-FIELD ELECTROSTATICS */
316 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
317 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
319 /* Update potential sum for this i atom from the interaction with this j atom. */
320 velecsum = _mm256_add_ps(velecsum,velec);
324 /* Calculate temporary vectorial force */
325 tx = _mm256_mul_ps(fscal,dx12);
326 ty = _mm256_mul_ps(fscal,dy12);
327 tz = _mm256_mul_ps(fscal,dz12);
329 /* Update vectorial force */
330 fix1 = _mm256_add_ps(fix1,tx);
331 fiy1 = _mm256_add_ps(fiy1,ty);
332 fiz1 = _mm256_add_ps(fiz1,tz);
334 fjx2 = _mm256_add_ps(fjx2,tx);
335 fjy2 = _mm256_add_ps(fjy2,ty);
336 fjz2 = _mm256_add_ps(fjz2,tz);
338 /**************************
339 * CALCULATE INTERACTIONS *
340 **************************/
342 /* REACTION-FIELD ELECTROSTATICS */
343 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
344 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 velecsum = _mm256_add_ps(velecsum,velec);
351 /* Calculate temporary vectorial force */
352 tx = _mm256_mul_ps(fscal,dx13);
353 ty = _mm256_mul_ps(fscal,dy13);
354 tz = _mm256_mul_ps(fscal,dz13);
356 /* Update vectorial force */
357 fix1 = _mm256_add_ps(fix1,tx);
358 fiy1 = _mm256_add_ps(fiy1,ty);
359 fiz1 = _mm256_add_ps(fiz1,tz);
361 fjx3 = _mm256_add_ps(fjx3,tx);
362 fjy3 = _mm256_add_ps(fjy3,ty);
363 fjz3 = _mm256_add_ps(fjz3,tz);
/* Unmasked pass, i site 2 against j sites 1, 2 and 3.  For each pair IJ:
 *   velec = qqIJ*(rinvIJ + krf*rsqIJ - crf)
 *   felec = qqIJ*(rinvIJ*rinvsqIJ - krf2)
 * The vector (tx,ty,tz) = fscal*dIJ is added to both the i-site and the j-site
 * accumulators.
 * NOTE(review): fscal is used below but no assignment to it is visible between the
 * computation of felec and its use; presumably a line "fscal = felec;" is missing
 * from this excerpt - confirm against the complete file. */
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 /* REACTION-FIELD ELECTROSTATICS */
370 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
371 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
373 /* Update potential sum for this i atom from the interaction with this j atom. */
374 velecsum = _mm256_add_ps(velecsum,velec);
378 /* Calculate temporary vectorial force */
379 tx = _mm256_mul_ps(fscal,dx21);
380 ty = _mm256_mul_ps(fscal,dy21);
381 tz = _mm256_mul_ps(fscal,dz21);
383 /* Update vectorial force */
384 fix2 = _mm256_add_ps(fix2,tx);
385 fiy2 = _mm256_add_ps(fiy2,ty);
386 fiz2 = _mm256_add_ps(fiz2,tz);
388 fjx1 = _mm256_add_ps(fjx1,tx);
389 fjy1 = _mm256_add_ps(fjy1,ty);
390 fjz1 = _mm256_add_ps(fjz1,tz);
392 /**************************
393 * CALCULATE INTERACTIONS *
394 **************************/
396 /* REACTION-FIELD ELECTROSTATICS */
397 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
398 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velecsum = _mm256_add_ps(velecsum,velec);
405 /* Calculate temporary vectorial force */
406 tx = _mm256_mul_ps(fscal,dx22);
407 ty = _mm256_mul_ps(fscal,dy22);
408 tz = _mm256_mul_ps(fscal,dz22);
410 /* Update vectorial force */
411 fix2 = _mm256_add_ps(fix2,tx);
412 fiy2 = _mm256_add_ps(fiy2,ty);
413 fiz2 = _mm256_add_ps(fiz2,tz);
415 fjx2 = _mm256_add_ps(fjx2,tx);
416 fjy2 = _mm256_add_ps(fjy2,ty);
417 fjz2 = _mm256_add_ps(fjz2,tz);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 /* REACTION-FIELD ELECTROSTATICS */
424 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
425 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velecsum = _mm256_add_ps(velecsum,velec);
432 /* Calculate temporary vectorial force */
433 tx = _mm256_mul_ps(fscal,dx23);
434 ty = _mm256_mul_ps(fscal,dy23);
435 tz = _mm256_mul_ps(fscal,dz23);
437 /* Update vectorial force */
438 fix2 = _mm256_add_ps(fix2,tx);
439 fiy2 = _mm256_add_ps(fiy2,ty);
440 fiz2 = _mm256_add_ps(fiz2,tz);
442 fjx3 = _mm256_add_ps(fjx3,tx);
443 fjy3 = _mm256_add_ps(fjy3,ty);
444 fjz3 = _mm256_add_ps(fjz3,tz);
/* Unmasked pass, i site 3 against j sites 1, 2 and 3.  For each pair IJ:
 *   velec = qqIJ*(rinvIJ + krf*rsqIJ - crf)
 *   felec = qqIJ*(rinvIJ*rinvsqIJ - krf2)
 * The vector (tx,ty,tz) = fscal*dIJ is added to both the i-site and the j-site
 * accumulators.
 * NOTE(review): fscal is used below but no assignment to it is visible between the
 * computation of felec and its use; presumably a line "fscal = felec;" is missing
 * from this excerpt - confirm against the complete file. */
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 /* REACTION-FIELD ELECTROSTATICS */
451 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
452 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
454 /* Update potential sum for this i atom from the interaction with this j atom. */
455 velecsum = _mm256_add_ps(velecsum,velec);
459 /* Calculate temporary vectorial force */
460 tx = _mm256_mul_ps(fscal,dx31);
461 ty = _mm256_mul_ps(fscal,dy31);
462 tz = _mm256_mul_ps(fscal,dz31);
464 /* Update vectorial force */
465 fix3 = _mm256_add_ps(fix3,tx);
466 fiy3 = _mm256_add_ps(fiy3,ty);
467 fiz3 = _mm256_add_ps(fiz3,tz);
469 fjx1 = _mm256_add_ps(fjx1,tx);
470 fjy1 = _mm256_add_ps(fjy1,ty);
471 fjz1 = _mm256_add_ps(fjz1,tz);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 /* REACTION-FIELD ELECTROSTATICS */
478 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
479 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velecsum = _mm256_add_ps(velecsum,velec);
486 /* Calculate temporary vectorial force */
487 tx = _mm256_mul_ps(fscal,dx32);
488 ty = _mm256_mul_ps(fscal,dy32);
489 tz = _mm256_mul_ps(fscal,dz32);
491 /* Update vectorial force */
492 fix3 = _mm256_add_ps(fix3,tx);
493 fiy3 = _mm256_add_ps(fiy3,ty);
494 fiz3 = _mm256_add_ps(fiz3,tz);
496 fjx2 = _mm256_add_ps(fjx2,tx);
497 fjy2 = _mm256_add_ps(fjy2,ty);
498 fjz2 = _mm256_add_ps(fjz2,tz);
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 /* REACTION-FIELD ELECTROSTATICS */
505 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
506 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
508 /* Update potential sum for this i atom from the interaction with this j atom. */
509 velecsum = _mm256_add_ps(velecsum,velec);
513 /* Calculate temporary vectorial force */
514 tx = _mm256_mul_ps(fscal,dx33);
515 ty = _mm256_mul_ps(fscal,dy33);
516 tz = _mm256_mul_ps(fscal,dz33);
518 /* Update vectorial force */
519 fix3 = _mm256_add_ps(fix3,tx);
520 fiy3 = _mm256_add_ps(fiy3,ty);
521 fiz3 = _mm256_add_ps(fiz3,tz);
523 fjx3 = _mm256_add_ps(fjx3,tx);
524 fjy3 = _mm256_add_ps(fjy3,ty);
525 fjz3 = _mm256_add_ps(fjz3,tz);
/* One force pointer per SIMD lane, pointing at the start of each j molecule in f */
527 fjptrA = f+j_coord_offsetA;
528 fjptrB = f+j_coord_offsetB;
529 fjptrC = f+j_coord_offsetC;
530 fjptrD = f+j_coord_offsetD;
531 fjptrE = f+j_coord_offsetE;
532 fjptrF = f+j_coord_offsetF;
533 fjptrG = f+j_coord_offsetG;
534 fjptrH = f+j_coord_offsetH;
/* Hand the accumulated j-site forces to the decrement helper; +DIM targets sites 1..3.
 * Presumably the helper subtracts them from the pointed-to force entries (its
 * definition is outside this file) - confirm in the kernelutil header. */
536 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
537 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
538 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
540 /* Inner loop uses 288 flops */
/* Masked pass over a group of eight jjnr entries that may contain negative (dummy)
 * indices.
 * NOTE(review): the statement guarding this section is not visible in this excerpt -
 * confirm against the complete file. */
546 /* Get j neighbor index, and coordinate index */
547 jnrlistA = jjnr[jidx];
548 jnrlistB = jjnr[jidx+1];
549 jnrlistC = jjnr[jidx+2];
550 jnrlistD = jjnr[jidx+3];
551 jnrlistE = jjnr[jidx+4];
552 jnrlistF = jjnr[jidx+5];
553 jnrlistG = jjnr[jidx+6];
554 jnrlistH = jjnr[jidx+7];
555 /* Sign of each element will be negative for non-real atoms.
556 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
557 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
 * The mask is assembled from two 128-bit integer compares (entries jidx+4..jidx+7 and
 * jidx..jidx+3) against zero.
 * Below, negative entries are replaced by index 0 before the coordinate offsets are
 * computed, so the coordinate loads use a non-negative offset.
559 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
560 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
562 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
563 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
564 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
565 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
566 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
567 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
568 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
569 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
570 j_coord_offsetA = DIM*jnrA;
571 j_coord_offsetB = DIM*jnrB;
572 j_coord_offsetC = DIM*jnrC;
573 j_coord_offsetD = DIM*jnrD;
574 j_coord_offsetE = DIM*jnrE;
575 j_coord_offsetF = DIM*jnrF;
576 j_coord_offsetG = DIM*jnrG;
577 j_coord_offsetH = DIM*jnrH;
579 /* load j atom coordinates */
580 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
581 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
582 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
583 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
584 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
586 /* Calculate displacement vector */
/* dIJ = r_i(I) - r_j(J) for all nine site pairs (dummy lanes are computed too and
 * masked out later) */
587 dx11 = _mm256_sub_ps(ix1,jx1);
588 dy11 = _mm256_sub_ps(iy1,jy1);
589 dz11 = _mm256_sub_ps(iz1,jz1);
590 dx12 = _mm256_sub_ps(ix1,jx2);
591 dy12 = _mm256_sub_ps(iy1,jy2);
592 dz12 = _mm256_sub_ps(iz1,jz2);
593 dx13 = _mm256_sub_ps(ix1,jx3);
594 dy13 = _mm256_sub_ps(iy1,jy3);
595 dz13 = _mm256_sub_ps(iz1,jz3);
596 dx21 = _mm256_sub_ps(ix2,jx1);
597 dy21 = _mm256_sub_ps(iy2,jy1);
598 dz21 = _mm256_sub_ps(iz2,jz1);
599 dx22 = _mm256_sub_ps(ix2,jx2);
600 dy22 = _mm256_sub_ps(iy2,jy2);
601 dz22 = _mm256_sub_ps(iz2,jz2);
602 dx23 = _mm256_sub_ps(ix2,jx3);
603 dy23 = _mm256_sub_ps(iy2,jy3);
604 dz23 = _mm256_sub_ps(iz2,jz3);
605 dx31 = _mm256_sub_ps(ix3,jx1);
606 dy31 = _mm256_sub_ps(iy3,jy1);
607 dz31 = _mm256_sub_ps(iz3,jz1);
608 dx32 = _mm256_sub_ps(ix3,jx2);
609 dy32 = _mm256_sub_ps(iy3,jy2);
610 dz32 = _mm256_sub_ps(iz3,jz2);
611 dx33 = _mm256_sub_ps(ix3,jx3);
612 dy33 = _mm256_sub_ps(iy3,jy3);
613 dz33 = _mm256_sub_ps(iz3,jz3);
615 /* Calculate squared distance and things based on it */
616 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
617 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
618 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
619 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
620 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
621 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
622 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
623 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
624 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
/* rinvIJ from the invsqrt helper (presumably 1/sqrt(rsqIJ) - defined outside this file) */
626 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
627 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
628 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
629 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
630 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
631 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
632 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
633 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
634 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
/* rinvsqIJ = rinvIJ * rinvIJ */
636 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
637 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
638 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
639 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
640 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
641 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
642 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
643 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
644 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
/* Zero the j-site force accumulators for this pass */
646 fjx1 = _mm256_setzero_ps();
647 fjy1 = _mm256_setzero_ps();
648 fjz1 = _mm256_setzero_ps();
649 fjx2 = _mm256_setzero_ps();
650 fjy2 = _mm256_setzero_ps();
651 fjz2 = _mm256_setzero_ps();
652 fjx3 = _mm256_setzero_ps();
653 fjy3 = _mm256_setzero_ps();
654 fjz3 = _mm256_setzero_ps();
/* Masked pass, i site 1 against j sites 1, 2 and 3.  Same formulas as the unmasked
 * pass:
 *   velec = qqIJ*(rinvIJ + krf*rsqIJ - crf)
 *   felec = qqIJ*(rinvIJ*rinvsqIJ - krf2)
 * but velec and fscal are cleared in dummy lanes with andnot(dummy_mask, ...) before
 * they are accumulated.
 * NOTE(review): fscal is read below but no assignment from felec is visible in this
 * excerpt; presumably a line "fscal = felec;" is missing - confirm against the
 * complete file. */
656 /**************************
657 * CALCULATE INTERACTIONS *
658 **************************/
660 /* REACTION-FIELD ELECTROSTATICS */
661 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
662 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
664 /* Update potential sum for this i atom from the interaction with this j atom. */
665 velec = _mm256_andnot_ps(dummy_mask,velec);
666 velecsum = _mm256_add_ps(velecsum,velec);
670 fscal = _mm256_andnot_ps(dummy_mask,fscal);
672 /* Calculate temporary vectorial force */
673 tx = _mm256_mul_ps(fscal,dx11);
674 ty = _mm256_mul_ps(fscal,dy11);
675 tz = _mm256_mul_ps(fscal,dz11);
677 /* Update vectorial force */
678 fix1 = _mm256_add_ps(fix1,tx);
679 fiy1 = _mm256_add_ps(fiy1,ty);
680 fiz1 = _mm256_add_ps(fiz1,tz);
682 fjx1 = _mm256_add_ps(fjx1,tx);
683 fjy1 = _mm256_add_ps(fjy1,ty);
684 fjz1 = _mm256_add_ps(fjz1,tz);
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 /* REACTION-FIELD ELECTROSTATICS */
691 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
692 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velec = _mm256_andnot_ps(dummy_mask,velec);
696 velecsum = _mm256_add_ps(velecsum,velec);
700 fscal = _mm256_andnot_ps(dummy_mask,fscal);
702 /* Calculate temporary vectorial force */
703 tx = _mm256_mul_ps(fscal,dx12);
704 ty = _mm256_mul_ps(fscal,dy12);
705 tz = _mm256_mul_ps(fscal,dz12);
707 /* Update vectorial force */
708 fix1 = _mm256_add_ps(fix1,tx);
709 fiy1 = _mm256_add_ps(fiy1,ty);
710 fiz1 = _mm256_add_ps(fiz1,tz);
712 fjx2 = _mm256_add_ps(fjx2,tx);
713 fjy2 = _mm256_add_ps(fjy2,ty);
714 fjz2 = _mm256_add_ps(fjz2,tz);
716 /**************************
717 * CALCULATE INTERACTIONS *
718 **************************/
720 /* REACTION-FIELD ELECTROSTATICS */
721 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
722 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
724 /* Update potential sum for this i atom from the interaction with this j atom. */
725 velec = _mm256_andnot_ps(dummy_mask,velec);
726 velecsum = _mm256_add_ps(velecsum,velec);
730 fscal = _mm256_andnot_ps(dummy_mask,fscal);
732 /* Calculate temporary vectorial force */
733 tx = _mm256_mul_ps(fscal,dx13);
734 ty = _mm256_mul_ps(fscal,dy13);
735 tz = _mm256_mul_ps(fscal,dz13);
737 /* Update vectorial force */
738 fix1 = _mm256_add_ps(fix1,tx);
739 fiy1 = _mm256_add_ps(fiy1,ty);
740 fiz1 = _mm256_add_ps(fiz1,tz);
742 fjx3 = _mm256_add_ps(fjx3,tx);
743 fjy3 = _mm256_add_ps(fjy3,ty);
744 fjz3 = _mm256_add_ps(fjz3,tz);
/* Masked pass, i site 2 against j sites 1, 2 and 3.  Same formulas as the unmasked
 * pass:
 *   velec = qqIJ*(rinvIJ + krf*rsqIJ - crf)
 *   felec = qqIJ*(rinvIJ*rinvsqIJ - krf2)
 * but velec and fscal are cleared in dummy lanes with andnot(dummy_mask, ...) before
 * they are accumulated.
 * NOTE(review): fscal is read below but no assignment from felec is visible in this
 * excerpt; presumably a line "fscal = felec;" is missing - confirm against the
 * complete file. */
746 /**************************
747 * CALCULATE INTERACTIONS *
748 **************************/
750 /* REACTION-FIELD ELECTROSTATICS */
751 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
752 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
754 /* Update potential sum for this i atom from the interaction with this j atom. */
755 velec = _mm256_andnot_ps(dummy_mask,velec);
756 velecsum = _mm256_add_ps(velecsum,velec);
760 fscal = _mm256_andnot_ps(dummy_mask,fscal);
762 /* Calculate temporary vectorial force */
763 tx = _mm256_mul_ps(fscal,dx21);
764 ty = _mm256_mul_ps(fscal,dy21);
765 tz = _mm256_mul_ps(fscal,dz21);
767 /* Update vectorial force */
768 fix2 = _mm256_add_ps(fix2,tx);
769 fiy2 = _mm256_add_ps(fiy2,ty);
770 fiz2 = _mm256_add_ps(fiz2,tz);
772 fjx1 = _mm256_add_ps(fjx1,tx);
773 fjy1 = _mm256_add_ps(fjy1,ty);
774 fjz1 = _mm256_add_ps(fjz1,tz);
776 /**************************
777 * CALCULATE INTERACTIONS *
778 **************************/
780 /* REACTION-FIELD ELECTROSTATICS */
781 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
782 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
784 /* Update potential sum for this i atom from the interaction with this j atom. */
785 velec = _mm256_andnot_ps(dummy_mask,velec);
786 velecsum = _mm256_add_ps(velecsum,velec);
790 fscal = _mm256_andnot_ps(dummy_mask,fscal);
792 /* Calculate temporary vectorial force */
793 tx = _mm256_mul_ps(fscal,dx22);
794 ty = _mm256_mul_ps(fscal,dy22);
795 tz = _mm256_mul_ps(fscal,dz22);
797 /* Update vectorial force */
798 fix2 = _mm256_add_ps(fix2,tx);
799 fiy2 = _mm256_add_ps(fiy2,ty);
800 fiz2 = _mm256_add_ps(fiz2,tz);
802 fjx2 = _mm256_add_ps(fjx2,tx);
803 fjy2 = _mm256_add_ps(fjy2,ty);
804 fjz2 = _mm256_add_ps(fjz2,tz);
806 /**************************
807 * CALCULATE INTERACTIONS *
808 **************************/
810 /* REACTION-FIELD ELECTROSTATICS */
811 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
812 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
814 /* Update potential sum for this i atom from the interaction with this j atom. */
815 velec = _mm256_andnot_ps(dummy_mask,velec);
816 velecsum = _mm256_add_ps(velecsum,velec);
820 fscal = _mm256_andnot_ps(dummy_mask,fscal);
822 /* Calculate temporary vectorial force */
823 tx = _mm256_mul_ps(fscal,dx23);
824 ty = _mm256_mul_ps(fscal,dy23);
825 tz = _mm256_mul_ps(fscal,dz23);
827 /* Update vectorial force */
828 fix2 = _mm256_add_ps(fix2,tx);
829 fiy2 = _mm256_add_ps(fiy2,ty);
830 fiz2 = _mm256_add_ps(fiz2,tz);
832 fjx3 = _mm256_add_ps(fjx3,tx);
833 fjy3 = _mm256_add_ps(fjy3,ty);
834 fjz3 = _mm256_add_ps(fjz3,tz);
/* Masked pass, i site 3 against j sites 1, 2 and 3.  Same formulas as the unmasked
 * pass:
 *   velec = qqIJ*(rinvIJ + krf*rsqIJ - crf)
 *   felec = qqIJ*(rinvIJ*rinvsqIJ - krf2)
 * but velec and fscal are cleared in dummy lanes with andnot(dummy_mask, ...) before
 * they are accumulated.
 * NOTE(review): fscal is read below but no assignment from felec is visible in this
 * excerpt; presumably a line "fscal = felec;" is missing - confirm against the
 * complete file. */
836 /**************************
837 * CALCULATE INTERACTIONS *
838 **************************/
840 /* REACTION-FIELD ELECTROSTATICS */
841 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
842 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
844 /* Update potential sum for this i atom from the interaction with this j atom. */
845 velec = _mm256_andnot_ps(dummy_mask,velec);
846 velecsum = _mm256_add_ps(velecsum,velec);
850 fscal = _mm256_andnot_ps(dummy_mask,fscal);
852 /* Calculate temporary vectorial force */
853 tx = _mm256_mul_ps(fscal,dx31);
854 ty = _mm256_mul_ps(fscal,dy31);
855 tz = _mm256_mul_ps(fscal,dz31);
857 /* Update vectorial force */
858 fix3 = _mm256_add_ps(fix3,tx);
859 fiy3 = _mm256_add_ps(fiy3,ty);
860 fiz3 = _mm256_add_ps(fiz3,tz);
862 fjx1 = _mm256_add_ps(fjx1,tx);
863 fjy1 = _mm256_add_ps(fjy1,ty);
864 fjz1 = _mm256_add_ps(fjz1,tz);
866 /**************************
867 * CALCULATE INTERACTIONS *
868 **************************/
870 /* REACTION-FIELD ELECTROSTATICS */
871 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
872 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec = _mm256_andnot_ps(dummy_mask,velec);
876 velecsum = _mm256_add_ps(velecsum,velec);
880 fscal = _mm256_andnot_ps(dummy_mask,fscal);
882 /* Calculate temporary vectorial force */
883 tx = _mm256_mul_ps(fscal,dx32);
884 ty = _mm256_mul_ps(fscal,dy32);
885 tz = _mm256_mul_ps(fscal,dz32);
887 /* Update vectorial force */
888 fix3 = _mm256_add_ps(fix3,tx);
889 fiy3 = _mm256_add_ps(fiy3,ty);
890 fiz3 = _mm256_add_ps(fiz3,tz);
892 fjx2 = _mm256_add_ps(fjx2,tx);
893 fjy2 = _mm256_add_ps(fjy2,ty);
894 fjz2 = _mm256_add_ps(fjz2,tz);
896 /**************************
897 * CALCULATE INTERACTIONS *
898 **************************/
900 /* REACTION-FIELD ELECTROSTATICS */
901 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
902 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
904 /* Update potential sum for this i atom from the interaction with this j atom. */
905 velec = _mm256_andnot_ps(dummy_mask,velec);
906 velecsum = _mm256_add_ps(velecsum,velec);
910 fscal = _mm256_andnot_ps(dummy_mask,fscal);
912 /* Calculate temporary vectorial force */
913 tx = _mm256_mul_ps(fscal,dx33);
914 ty = _mm256_mul_ps(fscal,dy33);
915 tz = _mm256_mul_ps(fscal,dz33);
917 /* Update vectorial force */
918 fix3 = _mm256_add_ps(fix3,tx);
919 fiy3 = _mm256_add_ps(fiy3,ty);
920 fiz3 = _mm256_add_ps(fiz3,tz);
922 fjx3 = _mm256_add_ps(fjx3,tx);
923 fjy3 = _mm256_add_ps(fjy3,ty);
924 fjz3 = _mm256_add_ps(fjz3,tz);
/* Lanes with a real j entry point into f; dummy lanes (negative list entry) are
 * redirected to scratch so their update does not touch f.
 * NOTE(review): the declaration of scratch is not visible in this excerpt; the
 * helper below is called with scratch+DIM and nine force components per lane, so the
 * buffer must be large enough for that access - confirm against the complete file. */
926 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
927 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
928 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
929 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
930 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
931 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
932 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
933 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
/* Hand the accumulated j-site forces to the decrement helper; +DIM targets sites 1..3.
 * Presumably the helper subtracts them from the pointed-to entries - confirm in the
 * kernelutil header. */
935 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
936 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
937 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
939 /* Inner loop uses 288 flops */
942 /* End of innermost loop */
/* Pass the nine i-site SIMD force accumulators to the update helper together with the
 * i force location (sites 1..3, hence +DIM) and the shift-force location for this list
 * entry. Presumably the helper reduces across lanes and adds into both - confirm in the
 * kernelutil header. */
944 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
945 f+i_coord_offset+DIM,fshift+i_shift_offset);
948 /* Update potential energies */
/* NOTE(review): no assignment to ggid is visible in this excerpt - confirm against the
 * complete file. */
949 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
951 /* Increment number of inner iterations */
/* Counts every list slot in [j_index_start, j_index_end) for this i entry */
952 inneriter += j_index_end - j_index_start;
954 /* Outer loop uses 19 flops */
957 /* Increment number of outer iterations */
/* NOTE(review): the statement that increments outeriter is not visible in this excerpt. */
960 /* Update outer/inner flops */
/* Flop accounting: 19 per outer iteration plus 288 per counted inner list slot */
962 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*288);
965 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_single
966 * Electrostatics interaction: ReactionField
967 * VdW interaction: None
968 * Geometry: Water4-Water4
969 * Calculate force/pot: Force
972 nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_single
973 (t_nblist * gmx_restrict nlist,
974 rvec * gmx_restrict xx,
975 rvec * gmx_restrict ff,
976 t_forcerec * gmx_restrict fr,
977 t_mdatoms * gmx_restrict mdatoms,
978 nb_kernel_data_t * gmx_restrict kernel_data,
979 t_nrnb * gmx_restrict nrnb)
981 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
982 * just 0 for non-waters.
983 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
984 * jnr indices corresponding to data put in the eight positions in the SIMD register.
986 int i_shift_offset,i_coord_offset,outeriter,inneriter;
987 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
988 int jnrA,jnrB,jnrC,jnrD;
989 int jnrE,jnrF,jnrG,jnrH;
990 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
991 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
992 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
993 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
994 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
996 real *shiftvec,*fshift,*x,*f;
997 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
999 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1000 real * vdwioffsetptr1;
1001 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1002 real * vdwioffsetptr2;
1003 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1004 real * vdwioffsetptr3;
1005 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1006 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1007 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1008 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1009 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1010 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1011 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1012 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1013 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1014 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1015 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1016 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1017 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1018 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1019 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1020 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1021 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1023 __m256 dummy_mask,cutoff_mask;
1024 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1025 __m256 one = _mm256_set1_ps(1.0);
1026 __m256 two = _mm256_set1_ps(2.0);
1032 jindex = nlist->jindex;
1034 shiftidx = nlist->shift;
1036 shiftvec = fr->shift_vec[0];
1037 fshift = fr->fshift[0];
1038 facel = _mm256_set1_ps(fr->epsfac);
1039 charge = mdatoms->chargeA;
1040 krf = _mm256_set1_ps(fr->ic->k_rf);
1041 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
1042 crf = _mm256_set1_ps(fr->ic->c_rf);
1044 /* Setup water-specific parameters */
1045 inr = nlist->iinr[0];
1046 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1047 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1048 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1050 jq1 = _mm256_set1_ps(charge[inr+1]);
1051 jq2 = _mm256_set1_ps(charge[inr+2]);
1052 jq3 = _mm256_set1_ps(charge[inr+3]);
1053 qq11 = _mm256_mul_ps(iq1,jq1);
1054 qq12 = _mm256_mul_ps(iq1,jq2);
1055 qq13 = _mm256_mul_ps(iq1,jq3);
1056 qq21 = _mm256_mul_ps(iq2,jq1);
1057 qq22 = _mm256_mul_ps(iq2,jq2);
1058 qq23 = _mm256_mul_ps(iq2,jq3);
1059 qq31 = _mm256_mul_ps(iq3,jq1);
1060 qq32 = _mm256_mul_ps(iq3,jq2);
1061 qq33 = _mm256_mul_ps(iq3,jq3);
1063 /* Initialize the j indices and coordinate offsets to avoid compiler warnings */
1064 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1065 j_coord_offsetA = 0;
1066 j_coord_offsetB = 0;
1067 j_coord_offsetC = 0;
1068 j_coord_offsetD = 0;
1069 j_coord_offsetE = 0;
1070 j_coord_offsetF = 0;
1071 j_coord_offsetG = 0;
1072 j_coord_offsetH = 0;
1077 for(iidx=0;iidx<4*DIM;iidx++)
1079 scratch[iidx] = 0.0;
1082 /* Start outer loop over neighborlists */
1083 for(iidx=0; iidx<nri; iidx++)
1085 /* Load shift vector for this list */
1086 i_shift_offset = DIM*shiftidx[iidx];
1088 /* Load limits for loop over neighbors */
1089 j_index_start = jindex[iidx];
1090 j_index_end = jindex[iidx+1];
1092 /* Get outer coordinate index */
1094 i_coord_offset = DIM*inr;
1096 /* Load i particle coords and add shift vector */
1097 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1098 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1100 fix1 = _mm256_setzero_ps();
1101 fiy1 = _mm256_setzero_ps();
1102 fiz1 = _mm256_setzero_ps();
1103 fix2 = _mm256_setzero_ps();
1104 fiy2 = _mm256_setzero_ps();
1105 fiz2 = _mm256_setzero_ps();
1106 fix3 = _mm256_setzero_ps();
1107 fiy3 = _mm256_setzero_ps();
1108 fiz3 = _mm256_setzero_ps();
1110 /* Start inner kernel loop */
1111 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1114 /* Get j neighbor index, and coordinate index */
1116 jnrB = jjnr[jidx+1];
1117 jnrC = jjnr[jidx+2];
1118 jnrD = jjnr[jidx+3];
1119 jnrE = jjnr[jidx+4];
1120 jnrF = jjnr[jidx+5];
1121 jnrG = jjnr[jidx+6];
1122 jnrH = jjnr[jidx+7];
1123 j_coord_offsetA = DIM*jnrA;
1124 j_coord_offsetB = DIM*jnrB;
1125 j_coord_offsetC = DIM*jnrC;
1126 j_coord_offsetD = DIM*jnrD;
1127 j_coord_offsetE = DIM*jnrE;
1128 j_coord_offsetF = DIM*jnrF;
1129 j_coord_offsetG = DIM*jnrG;
1130 j_coord_offsetH = DIM*jnrH;
1132 /* load j atom coordinates */
1133 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1134 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1135 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1136 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1137 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1139 /* Calculate displacement vector */
1140 dx11 = _mm256_sub_ps(ix1,jx1);
1141 dy11 = _mm256_sub_ps(iy1,jy1);
1142 dz11 = _mm256_sub_ps(iz1,jz1);
1143 dx12 = _mm256_sub_ps(ix1,jx2);
1144 dy12 = _mm256_sub_ps(iy1,jy2);
1145 dz12 = _mm256_sub_ps(iz1,jz2);
1146 dx13 = _mm256_sub_ps(ix1,jx3);
1147 dy13 = _mm256_sub_ps(iy1,jy3);
1148 dz13 = _mm256_sub_ps(iz1,jz3);
1149 dx21 = _mm256_sub_ps(ix2,jx1);
1150 dy21 = _mm256_sub_ps(iy2,jy1);
1151 dz21 = _mm256_sub_ps(iz2,jz1);
1152 dx22 = _mm256_sub_ps(ix2,jx2);
1153 dy22 = _mm256_sub_ps(iy2,jy2);
1154 dz22 = _mm256_sub_ps(iz2,jz2);
1155 dx23 = _mm256_sub_ps(ix2,jx3);
1156 dy23 = _mm256_sub_ps(iy2,jy3);
1157 dz23 = _mm256_sub_ps(iz2,jz3);
1158 dx31 = _mm256_sub_ps(ix3,jx1);
1159 dy31 = _mm256_sub_ps(iy3,jy1);
1160 dz31 = _mm256_sub_ps(iz3,jz1);
1161 dx32 = _mm256_sub_ps(ix3,jx2);
1162 dy32 = _mm256_sub_ps(iy3,jy2);
1163 dz32 = _mm256_sub_ps(iz3,jz2);
1164 dx33 = _mm256_sub_ps(ix3,jx3);
1165 dy33 = _mm256_sub_ps(iy3,jy3);
1166 dz33 = _mm256_sub_ps(iz3,jz3);
1168 /* Calculate squared distance and things based on it */
1169 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1170 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1171 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1172 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1173 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1174 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1175 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1176 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1177 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1179 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1180 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1181 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1182 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1183 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1184 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1185 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1186 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1187 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1189 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1190 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1191 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1192 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1193 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1194 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1195 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1196 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1197 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1199 fjx1 = _mm256_setzero_ps();
1200 fjy1 = _mm256_setzero_ps();
1201 fjz1 = _mm256_setzero_ps();
1202 fjx2 = _mm256_setzero_ps();
1203 fjy2 = _mm256_setzero_ps();
1204 fjz2 = _mm256_setzero_ps();
1205 fjx3 = _mm256_setzero_ps();
1206 fjy3 = _mm256_setzero_ps();
1207 fjz3 = _mm256_setzero_ps();
1209 /**************************
1210 * CALCULATE INTERACTIONS *
1211 **************************/
1213 /* REACTION-FIELD ELECTROSTATICS */
1214 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1218 /* Calculate temporary vectorial force */
1219 tx = _mm256_mul_ps(fscal,dx11);
1220 ty = _mm256_mul_ps(fscal,dy11);
1221 tz = _mm256_mul_ps(fscal,dz11);
1223 /* Update vectorial force */
1224 fix1 = _mm256_add_ps(fix1,tx);
1225 fiy1 = _mm256_add_ps(fiy1,ty);
1226 fiz1 = _mm256_add_ps(fiz1,tz);
1228 fjx1 = _mm256_add_ps(fjx1,tx);
1229 fjy1 = _mm256_add_ps(fjy1,ty);
1230 fjz1 = _mm256_add_ps(fjz1,tz);
1232 /**************************
1233 * CALCULATE INTERACTIONS *
1234 **************************/
1236 /* REACTION-FIELD ELECTROSTATICS */
1237 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1241 /* Calculate temporary vectorial force */
1242 tx = _mm256_mul_ps(fscal,dx12);
1243 ty = _mm256_mul_ps(fscal,dy12);
1244 tz = _mm256_mul_ps(fscal,dz12);
1246 /* Update vectorial force */
1247 fix1 = _mm256_add_ps(fix1,tx);
1248 fiy1 = _mm256_add_ps(fiy1,ty);
1249 fiz1 = _mm256_add_ps(fiz1,tz);
1251 fjx2 = _mm256_add_ps(fjx2,tx);
1252 fjy2 = _mm256_add_ps(fjy2,ty);
1253 fjz2 = _mm256_add_ps(fjz2,tz);
1255 /**************************
1256 * CALCULATE INTERACTIONS *
1257 **************************/
1259 /* REACTION-FIELD ELECTROSTATICS */
1260 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1264 /* Calculate temporary vectorial force */
1265 tx = _mm256_mul_ps(fscal,dx13);
1266 ty = _mm256_mul_ps(fscal,dy13);
1267 tz = _mm256_mul_ps(fscal,dz13);
1269 /* Update vectorial force */
1270 fix1 = _mm256_add_ps(fix1,tx);
1271 fiy1 = _mm256_add_ps(fiy1,ty);
1272 fiz1 = _mm256_add_ps(fiz1,tz);
1274 fjx3 = _mm256_add_ps(fjx3,tx);
1275 fjy3 = _mm256_add_ps(fjy3,ty);
1276 fjz3 = _mm256_add_ps(fjz3,tz);
1278 /**************************
1279 * CALCULATE INTERACTIONS *
1280 **************************/
1282 /* REACTION-FIELD ELECTROSTATICS */
1283 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1287 /* Calculate temporary vectorial force */
1288 tx = _mm256_mul_ps(fscal,dx21);
1289 ty = _mm256_mul_ps(fscal,dy21);
1290 tz = _mm256_mul_ps(fscal,dz21);
1292 /* Update vectorial force */
1293 fix2 = _mm256_add_ps(fix2,tx);
1294 fiy2 = _mm256_add_ps(fiy2,ty);
1295 fiz2 = _mm256_add_ps(fiz2,tz);
1297 fjx1 = _mm256_add_ps(fjx1,tx);
1298 fjy1 = _mm256_add_ps(fjy1,ty);
1299 fjz1 = _mm256_add_ps(fjz1,tz);
1301 /**************************
1302 * CALCULATE INTERACTIONS *
1303 **************************/
1305 /* REACTION-FIELD ELECTROSTATICS */
1306 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1310 /* Calculate temporary vectorial force */
1311 tx = _mm256_mul_ps(fscal,dx22);
1312 ty = _mm256_mul_ps(fscal,dy22);
1313 tz = _mm256_mul_ps(fscal,dz22);
1315 /* Update vectorial force */
1316 fix2 = _mm256_add_ps(fix2,tx);
1317 fiy2 = _mm256_add_ps(fiy2,ty);
1318 fiz2 = _mm256_add_ps(fiz2,tz);
1320 fjx2 = _mm256_add_ps(fjx2,tx);
1321 fjy2 = _mm256_add_ps(fjy2,ty);
1322 fjz2 = _mm256_add_ps(fjz2,tz);
1324 /**************************
1325 * CALCULATE INTERACTIONS *
1326 **************************/
1328 /* REACTION-FIELD ELECTROSTATICS */
1329 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1333 /* Calculate temporary vectorial force */
1334 tx = _mm256_mul_ps(fscal,dx23);
1335 ty = _mm256_mul_ps(fscal,dy23);
1336 tz = _mm256_mul_ps(fscal,dz23);
1338 /* Update vectorial force */
1339 fix2 = _mm256_add_ps(fix2,tx);
1340 fiy2 = _mm256_add_ps(fiy2,ty);
1341 fiz2 = _mm256_add_ps(fiz2,tz);
1343 fjx3 = _mm256_add_ps(fjx3,tx);
1344 fjy3 = _mm256_add_ps(fjy3,ty);
1345 fjz3 = _mm256_add_ps(fjz3,tz);
1347 /**************************
1348 * CALCULATE INTERACTIONS *
1349 **************************/
1351 /* REACTION-FIELD ELECTROSTATICS */
1352 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1356 /* Calculate temporary vectorial force */
1357 tx = _mm256_mul_ps(fscal,dx31);
1358 ty = _mm256_mul_ps(fscal,dy31);
1359 tz = _mm256_mul_ps(fscal,dz31);
1361 /* Update vectorial force */
1362 fix3 = _mm256_add_ps(fix3,tx);
1363 fiy3 = _mm256_add_ps(fiy3,ty);
1364 fiz3 = _mm256_add_ps(fiz3,tz);
1366 fjx1 = _mm256_add_ps(fjx1,tx);
1367 fjy1 = _mm256_add_ps(fjy1,ty);
1368 fjz1 = _mm256_add_ps(fjz1,tz);
1370 /**************************
1371 * CALCULATE INTERACTIONS *
1372 **************************/
1374 /* REACTION-FIELD ELECTROSTATICS */
1375 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1379 /* Calculate temporary vectorial force */
1380 tx = _mm256_mul_ps(fscal,dx32);
1381 ty = _mm256_mul_ps(fscal,dy32);
1382 tz = _mm256_mul_ps(fscal,dz32);
1384 /* Update vectorial force */
1385 fix3 = _mm256_add_ps(fix3,tx);
1386 fiy3 = _mm256_add_ps(fiy3,ty);
1387 fiz3 = _mm256_add_ps(fiz3,tz);
1389 fjx2 = _mm256_add_ps(fjx2,tx);
1390 fjy2 = _mm256_add_ps(fjy2,ty);
1391 fjz2 = _mm256_add_ps(fjz2,tz);
1393 /**************************
1394 * CALCULATE INTERACTIONS *
1395 **************************/
1397 /* REACTION-FIELD ELECTROSTATICS */
1398 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1402 /* Calculate temporary vectorial force */
1403 tx = _mm256_mul_ps(fscal,dx33);
1404 ty = _mm256_mul_ps(fscal,dy33);
1405 tz = _mm256_mul_ps(fscal,dz33);
1407 /* Update vectorial force */
1408 fix3 = _mm256_add_ps(fix3,tx);
1409 fiy3 = _mm256_add_ps(fiy3,ty);
1410 fiz3 = _mm256_add_ps(fiz3,tz);
1412 fjx3 = _mm256_add_ps(fjx3,tx);
1413 fjy3 = _mm256_add_ps(fjy3,ty);
1414 fjz3 = _mm256_add_ps(fjz3,tz);
1416 fjptrA = f+j_coord_offsetA;
1417 fjptrB = f+j_coord_offsetB;
1418 fjptrC = f+j_coord_offsetC;
1419 fjptrD = f+j_coord_offsetD;
1420 fjptrE = f+j_coord_offsetE;
1421 fjptrF = f+j_coord_offsetF;
1422 fjptrG = f+j_coord_offsetG;
1423 fjptrH = f+j_coord_offsetH;
1425 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1426 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1427 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1429 /* Inner loop uses 243 flops */
1432 if(jidx<j_index_end)
1435 /* Get j neighbor index, and coordinate index */
1436 jnrlistA = jjnr[jidx];
1437 jnrlistB = jjnr[jidx+1];
1438 jnrlistC = jjnr[jidx+2];
1439 jnrlistD = jjnr[jidx+3];
1440 jnrlistE = jjnr[jidx+4];
1441 jnrlistF = jjnr[jidx+5];
1442 jnrlistG = jjnr[jidx+6];
1443 jnrlistH = jjnr[jidx+7];
1444 /* Sign of each element will be negative for non-real atoms.
1445 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1446 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1448 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1449 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1451 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1452 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1453 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1454 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1455 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1456 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1457 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1458 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1459 j_coord_offsetA = DIM*jnrA;
1460 j_coord_offsetB = DIM*jnrB;
1461 j_coord_offsetC = DIM*jnrC;
1462 j_coord_offsetD = DIM*jnrD;
1463 j_coord_offsetE = DIM*jnrE;
1464 j_coord_offsetF = DIM*jnrF;
1465 j_coord_offsetG = DIM*jnrG;
1466 j_coord_offsetH = DIM*jnrH;
1468 /* load j atom coordinates */
1469 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1470 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1471 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1472 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1473 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1475 /* Calculate displacement vector */
1476 dx11 = _mm256_sub_ps(ix1,jx1);
1477 dy11 = _mm256_sub_ps(iy1,jy1);
1478 dz11 = _mm256_sub_ps(iz1,jz1);
1479 dx12 = _mm256_sub_ps(ix1,jx2);
1480 dy12 = _mm256_sub_ps(iy1,jy2);
1481 dz12 = _mm256_sub_ps(iz1,jz2);
1482 dx13 = _mm256_sub_ps(ix1,jx3);
1483 dy13 = _mm256_sub_ps(iy1,jy3);
1484 dz13 = _mm256_sub_ps(iz1,jz3);
1485 dx21 = _mm256_sub_ps(ix2,jx1);
1486 dy21 = _mm256_sub_ps(iy2,jy1);
1487 dz21 = _mm256_sub_ps(iz2,jz1);
1488 dx22 = _mm256_sub_ps(ix2,jx2);
1489 dy22 = _mm256_sub_ps(iy2,jy2);
1490 dz22 = _mm256_sub_ps(iz2,jz2);
1491 dx23 = _mm256_sub_ps(ix2,jx3);
1492 dy23 = _mm256_sub_ps(iy2,jy3);
1493 dz23 = _mm256_sub_ps(iz2,jz3);
1494 dx31 = _mm256_sub_ps(ix3,jx1);
1495 dy31 = _mm256_sub_ps(iy3,jy1);
1496 dz31 = _mm256_sub_ps(iz3,jz1);
1497 dx32 = _mm256_sub_ps(ix3,jx2);
1498 dy32 = _mm256_sub_ps(iy3,jy2);
1499 dz32 = _mm256_sub_ps(iz3,jz2);
1500 dx33 = _mm256_sub_ps(ix3,jx3);
1501 dy33 = _mm256_sub_ps(iy3,jy3);
1502 dz33 = _mm256_sub_ps(iz3,jz3);
1504 /* Calculate squared distance and things based on it */
1505 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1506 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1507 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1508 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1509 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1510 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1511 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1512 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1513 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1515 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1516 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1517 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1518 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1519 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1520 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1521 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1522 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1523 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1525 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1526 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1527 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1528 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1529 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1530 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1531 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1532 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1533 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1535 fjx1 = _mm256_setzero_ps();
1536 fjy1 = _mm256_setzero_ps();
1537 fjz1 = _mm256_setzero_ps();
1538 fjx2 = _mm256_setzero_ps();
1539 fjy2 = _mm256_setzero_ps();
1540 fjz2 = _mm256_setzero_ps();
1541 fjx3 = _mm256_setzero_ps();
1542 fjy3 = _mm256_setzero_ps();
1543 fjz3 = _mm256_setzero_ps();
1545 /**************************
1546 * CALCULATE INTERACTIONS *
1547 **************************/
1549 /* REACTION-FIELD ELECTROSTATICS */
1550 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1554 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1556 /* Calculate temporary vectorial force */
1557 tx = _mm256_mul_ps(fscal,dx11);
1558 ty = _mm256_mul_ps(fscal,dy11);
1559 tz = _mm256_mul_ps(fscal,dz11);
1561 /* Update vectorial force */
1562 fix1 = _mm256_add_ps(fix1,tx);
1563 fiy1 = _mm256_add_ps(fiy1,ty);
1564 fiz1 = _mm256_add_ps(fiz1,tz);
1566 fjx1 = _mm256_add_ps(fjx1,tx);
1567 fjy1 = _mm256_add_ps(fjy1,ty);
1568 fjz1 = _mm256_add_ps(fjz1,tz);
1570 /**************************
1571 * CALCULATE INTERACTIONS *
1572 **************************/
1574 /* REACTION-FIELD ELECTROSTATICS */
1575 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1579 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1581 /* Calculate temporary vectorial force */
1582 tx = _mm256_mul_ps(fscal,dx12);
1583 ty = _mm256_mul_ps(fscal,dy12);
1584 tz = _mm256_mul_ps(fscal,dz12);
1586 /* Update vectorial force */
1587 fix1 = _mm256_add_ps(fix1,tx);
1588 fiy1 = _mm256_add_ps(fiy1,ty);
1589 fiz1 = _mm256_add_ps(fiz1,tz);
1591 fjx2 = _mm256_add_ps(fjx2,tx);
1592 fjy2 = _mm256_add_ps(fjy2,ty);
1593 fjz2 = _mm256_add_ps(fjz2,tz);
1595 /**************************
1596 * CALCULATE INTERACTIONS *
1597 **************************/
1599 /* REACTION-FIELD ELECTROSTATICS */
1600 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1604 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1606 /* Calculate temporary vectorial force */
1607 tx = _mm256_mul_ps(fscal,dx13);
1608 ty = _mm256_mul_ps(fscal,dy13);
1609 tz = _mm256_mul_ps(fscal,dz13);
1611 /* Update vectorial force */
1612 fix1 = _mm256_add_ps(fix1,tx);
1613 fiy1 = _mm256_add_ps(fiy1,ty);
1614 fiz1 = _mm256_add_ps(fiz1,tz);
1616 fjx3 = _mm256_add_ps(fjx3,tx);
1617 fjy3 = _mm256_add_ps(fjy3,ty);
1618 fjz3 = _mm256_add_ps(fjz3,tz);
1620 /**************************
1621 * CALCULATE INTERACTIONS *
1622 **************************/
1624 /* REACTION-FIELD ELECTROSTATICS */
1625 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1629 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1631 /* Calculate temporary vectorial force */
1632 tx = _mm256_mul_ps(fscal,dx21);
1633 ty = _mm256_mul_ps(fscal,dy21);
1634 tz = _mm256_mul_ps(fscal,dz21);
1636 /* Update vectorial force */
1637 fix2 = _mm256_add_ps(fix2,tx);
1638 fiy2 = _mm256_add_ps(fiy2,ty);
1639 fiz2 = _mm256_add_ps(fiz2,tz);
1641 fjx1 = _mm256_add_ps(fjx1,tx);
1642 fjy1 = _mm256_add_ps(fjy1,ty);
1643 fjz1 = _mm256_add_ps(fjz1,tz);
1645 /**************************
1646 * CALCULATE INTERACTIONS *
1647 **************************/
1649 /* REACTION-FIELD ELECTROSTATICS */
1650 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1654 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1656 /* Calculate temporary vectorial force */
1657 tx = _mm256_mul_ps(fscal,dx22);
1658 ty = _mm256_mul_ps(fscal,dy22);
1659 tz = _mm256_mul_ps(fscal,dz22);
1661 /* Update vectorial force */
1662 fix2 = _mm256_add_ps(fix2,tx);
1663 fiy2 = _mm256_add_ps(fiy2,ty);
1664 fiz2 = _mm256_add_ps(fiz2,tz);
1666 fjx2 = _mm256_add_ps(fjx2,tx);
1667 fjy2 = _mm256_add_ps(fjy2,ty);
1668 fjz2 = _mm256_add_ps(fjz2,tz);
1670 /**************************
1671 * CALCULATE INTERACTIONS *
1672 **************************/
1674 /* REACTION-FIELD ELECTROSTATICS */
1675 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1679 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1681 /* Calculate temporary vectorial force */
1682 tx = _mm256_mul_ps(fscal,dx23);
1683 ty = _mm256_mul_ps(fscal,dy23);
1684 tz = _mm256_mul_ps(fscal,dz23);
1686 /* Update vectorial force */
1687 fix2 = _mm256_add_ps(fix2,tx);
1688 fiy2 = _mm256_add_ps(fiy2,ty);
1689 fiz2 = _mm256_add_ps(fiz2,tz);
1691 fjx3 = _mm256_add_ps(fjx3,tx);
1692 fjy3 = _mm256_add_ps(fjy3,ty);
1693 fjz3 = _mm256_add_ps(fjz3,tz);
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 /* REACTION-FIELD ELECTROSTATICS */
1700 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1704 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1706 /* Calculate temporary vectorial force */
1707 tx = _mm256_mul_ps(fscal,dx31);
1708 ty = _mm256_mul_ps(fscal,dy31);
1709 tz = _mm256_mul_ps(fscal,dz31);
1711 /* Update vectorial force */
1712 fix3 = _mm256_add_ps(fix3,tx);
1713 fiy3 = _mm256_add_ps(fiy3,ty);
1714 fiz3 = _mm256_add_ps(fiz3,tz);
1716 fjx1 = _mm256_add_ps(fjx1,tx);
1717 fjy1 = _mm256_add_ps(fjy1,ty);
1718 fjz1 = _mm256_add_ps(fjz1,tz);
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 /* REACTION-FIELD ELECTROSTATICS */
1725 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1729 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1731 /* Calculate temporary vectorial force */
1732 tx = _mm256_mul_ps(fscal,dx32);
1733 ty = _mm256_mul_ps(fscal,dy32);
1734 tz = _mm256_mul_ps(fscal,dz32);
1736 /* Update vectorial force */
1737 fix3 = _mm256_add_ps(fix3,tx);
1738 fiy3 = _mm256_add_ps(fiy3,ty);
1739 fiz3 = _mm256_add_ps(fiz3,tz);
1741 fjx2 = _mm256_add_ps(fjx2,tx);
1742 fjy2 = _mm256_add_ps(fjy2,ty);
1743 fjz2 = _mm256_add_ps(fjz2,tz);
1745 /**************************
1746 * CALCULATE INTERACTIONS *
1747 **************************/
1749 /* REACTION-FIELD ELECTROSTATICS */
1750 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1754 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1756 /* Calculate temporary vectorial force */
1757 tx = _mm256_mul_ps(fscal,dx33);
1758 ty = _mm256_mul_ps(fscal,dy33);
1759 tz = _mm256_mul_ps(fscal,dz33);
1761 /* Update vectorial force */
1762 fix3 = _mm256_add_ps(fix3,tx);
1763 fiy3 = _mm256_add_ps(fiy3,ty);
1764 fiz3 = _mm256_add_ps(fiz3,tz);
1766 fjx3 = _mm256_add_ps(fjx3,tx);
1767 fjy3 = _mm256_add_ps(fjy3,ty);
1768 fjz3 = _mm256_add_ps(fjz3,tz);
1770 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1771 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1772 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1773 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1774 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1775 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1776 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1777 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1779 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1780 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1781 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1783 /* Inner loop uses 243 flops */
1786 /* End of innermost loop */
1788 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1789 f+i_coord_offset+DIM,fshift+i_shift_offset);
1791 /* Increment number of inner iterations */
1792 inneriter += j_index_end - j_index_start;
1794 /* Outer loop uses 18 flops */
1797 /* Increment number of outer iterations */
1800 /* Update outer/inner flops */
1802 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*243);