2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_single
54 * Electrostatics interaction: ReactionField
55 * VdW interaction: None
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr1;
89 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 real * vdwioffsetptr3;
93 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
99 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
100 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
111 __m256 dummy_mask,cutoff_mask;
112 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
113 __m256 one = _mm256_set1_ps(1.0);
114 __m256 two = _mm256_set1_ps(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm256_set1_ps(fr->epsfac);
127 charge = mdatoms->chargeA;
128 krf = _mm256_set1_ps(fr->ic->k_rf);
129 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
130 crf = _mm256_set1_ps(fr->ic->c_rf);
132 /* Setup water-specific parameters */
133 inr = nlist->iinr[0];
134 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
135 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
136 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
138 jq1 = _mm256_set1_ps(charge[inr+1]);
139 jq2 = _mm256_set1_ps(charge[inr+2]);
140 jq3 = _mm256_set1_ps(charge[inr+3]);
141 qq11 = _mm256_mul_ps(iq1,jq1);
142 qq12 = _mm256_mul_ps(iq1,jq2);
143 qq13 = _mm256_mul_ps(iq1,jq3);
144 qq21 = _mm256_mul_ps(iq2,jq1);
145 qq22 = _mm256_mul_ps(iq2,jq2);
146 qq23 = _mm256_mul_ps(iq2,jq3);
147 qq31 = _mm256_mul_ps(iq3,jq1);
148 qq32 = _mm256_mul_ps(iq3,jq2);
149 qq33 = _mm256_mul_ps(iq3,jq3);
151 /* Avoid stupid compiler warnings */
152 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
165 for(iidx=0;iidx<4*DIM;iidx++)
170 /* Start outer loop over neighborlists */
171 for(iidx=0; iidx<nri; iidx++)
173 /* Load shift vector for this list */
174 i_shift_offset = DIM*shiftidx[iidx];
176 /* Load limits for loop over neighbors */
177 j_index_start = jindex[iidx];
178 j_index_end = jindex[iidx+1];
180 /* Get outer coordinate index */
182 i_coord_offset = DIM*inr;
184 /* Load i particle coords and add shift vector */
185 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
186 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
188 fix1 = _mm256_setzero_ps();
189 fiy1 = _mm256_setzero_ps();
190 fiz1 = _mm256_setzero_ps();
191 fix2 = _mm256_setzero_ps();
192 fiy2 = _mm256_setzero_ps();
193 fiz2 = _mm256_setzero_ps();
194 fix3 = _mm256_setzero_ps();
195 fiy3 = _mm256_setzero_ps();
196 fiz3 = _mm256_setzero_ps();
198 /* Reset potential sums */
199 velecsum = _mm256_setzero_ps();
201 /* Start inner kernel loop */
202 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
205 /* Get j neighbor index, and coordinate index */
214 j_coord_offsetA = DIM*jnrA;
215 j_coord_offsetB = DIM*jnrB;
216 j_coord_offsetC = DIM*jnrC;
217 j_coord_offsetD = DIM*jnrD;
218 j_coord_offsetE = DIM*jnrE;
219 j_coord_offsetF = DIM*jnrF;
220 j_coord_offsetG = DIM*jnrG;
221 j_coord_offsetH = DIM*jnrH;
223 /* load j atom coordinates */
224 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
225 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
226 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
227 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
228 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
230 /* Calculate displacement vector */
231 dx11 = _mm256_sub_ps(ix1,jx1);
232 dy11 = _mm256_sub_ps(iy1,jy1);
233 dz11 = _mm256_sub_ps(iz1,jz1);
234 dx12 = _mm256_sub_ps(ix1,jx2);
235 dy12 = _mm256_sub_ps(iy1,jy2);
236 dz12 = _mm256_sub_ps(iz1,jz2);
237 dx13 = _mm256_sub_ps(ix1,jx3);
238 dy13 = _mm256_sub_ps(iy1,jy3);
239 dz13 = _mm256_sub_ps(iz1,jz3);
240 dx21 = _mm256_sub_ps(ix2,jx1);
241 dy21 = _mm256_sub_ps(iy2,jy1);
242 dz21 = _mm256_sub_ps(iz2,jz1);
243 dx22 = _mm256_sub_ps(ix2,jx2);
244 dy22 = _mm256_sub_ps(iy2,jy2);
245 dz22 = _mm256_sub_ps(iz2,jz2);
246 dx23 = _mm256_sub_ps(ix2,jx3);
247 dy23 = _mm256_sub_ps(iy2,jy3);
248 dz23 = _mm256_sub_ps(iz2,jz3);
249 dx31 = _mm256_sub_ps(ix3,jx1);
250 dy31 = _mm256_sub_ps(iy3,jy1);
251 dz31 = _mm256_sub_ps(iz3,jz1);
252 dx32 = _mm256_sub_ps(ix3,jx2);
253 dy32 = _mm256_sub_ps(iy3,jy2);
254 dz32 = _mm256_sub_ps(iz3,jz2);
255 dx33 = _mm256_sub_ps(ix3,jx3);
256 dy33 = _mm256_sub_ps(iy3,jy3);
257 dz33 = _mm256_sub_ps(iz3,jz3);
259 /* Calculate squared distance and things based on it */
260 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
261 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
262 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
263 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
264 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
265 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
266 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
267 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
268 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
270 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
271 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
272 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
273 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
274 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
275 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
276 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
277 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
278 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
280 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
281 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
282 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
283 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
284 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
285 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
286 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
287 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
288 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
290 fjx1 = _mm256_setzero_ps();
291 fjy1 = _mm256_setzero_ps();
292 fjz1 = _mm256_setzero_ps();
293 fjx2 = _mm256_setzero_ps();
294 fjy2 = _mm256_setzero_ps();
295 fjz2 = _mm256_setzero_ps();
296 fjx3 = _mm256_setzero_ps();
297 fjy3 = _mm256_setzero_ps();
298 fjz3 = _mm256_setzero_ps();
300 /**************************
301 * CALCULATE INTERACTIONS *
302 **************************/
304 /* REACTION-FIELD ELECTROSTATICS */
305 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
306 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 velecsum = _mm256_add_ps(velecsum,velec);
313 /* Calculate temporary vectorial force */
314 tx = _mm256_mul_ps(fscal,dx11);
315 ty = _mm256_mul_ps(fscal,dy11);
316 tz = _mm256_mul_ps(fscal,dz11);
318 /* Update vectorial force */
319 fix1 = _mm256_add_ps(fix1,tx);
320 fiy1 = _mm256_add_ps(fiy1,ty);
321 fiz1 = _mm256_add_ps(fiz1,tz);
323 fjx1 = _mm256_add_ps(fjx1,tx);
324 fjy1 = _mm256_add_ps(fjy1,ty);
325 fjz1 = _mm256_add_ps(fjz1,tz);
327 /**************************
328 * CALCULATE INTERACTIONS *
329 **************************/
331 /* REACTION-FIELD ELECTROSTATICS */
332 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
333 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
335 /* Update potential sum for this i atom from the interaction with this j atom. */
336 velecsum = _mm256_add_ps(velecsum,velec);
340 /* Calculate temporary vectorial force */
341 tx = _mm256_mul_ps(fscal,dx12);
342 ty = _mm256_mul_ps(fscal,dy12);
343 tz = _mm256_mul_ps(fscal,dz12);
345 /* Update vectorial force */
346 fix1 = _mm256_add_ps(fix1,tx);
347 fiy1 = _mm256_add_ps(fiy1,ty);
348 fiz1 = _mm256_add_ps(fiz1,tz);
350 fjx2 = _mm256_add_ps(fjx2,tx);
351 fjy2 = _mm256_add_ps(fjy2,ty);
352 fjz2 = _mm256_add_ps(fjz2,tz);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 /* REACTION-FIELD ELECTROSTATICS */
359 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
360 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velecsum = _mm256_add_ps(velecsum,velec);
367 /* Calculate temporary vectorial force */
368 tx = _mm256_mul_ps(fscal,dx13);
369 ty = _mm256_mul_ps(fscal,dy13);
370 tz = _mm256_mul_ps(fscal,dz13);
372 /* Update vectorial force */
373 fix1 = _mm256_add_ps(fix1,tx);
374 fiy1 = _mm256_add_ps(fiy1,ty);
375 fiz1 = _mm256_add_ps(fiz1,tz);
377 fjx3 = _mm256_add_ps(fjx3,tx);
378 fjy3 = _mm256_add_ps(fjy3,ty);
379 fjz3 = _mm256_add_ps(fjz3,tz);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 /* REACTION-FIELD ELECTROSTATICS */
386 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
387 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velecsum = _mm256_add_ps(velecsum,velec);
394 /* Calculate temporary vectorial force */
395 tx = _mm256_mul_ps(fscal,dx21);
396 ty = _mm256_mul_ps(fscal,dy21);
397 tz = _mm256_mul_ps(fscal,dz21);
399 /* Update vectorial force */
400 fix2 = _mm256_add_ps(fix2,tx);
401 fiy2 = _mm256_add_ps(fiy2,ty);
402 fiz2 = _mm256_add_ps(fiz2,tz);
404 fjx1 = _mm256_add_ps(fjx1,tx);
405 fjy1 = _mm256_add_ps(fjy1,ty);
406 fjz1 = _mm256_add_ps(fjz1,tz);
408 /**************************
409 * CALCULATE INTERACTIONS *
410 **************************/
412 /* REACTION-FIELD ELECTROSTATICS */
413 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
414 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
416 /* Update potential sum for this i atom from the interaction with this j atom. */
417 velecsum = _mm256_add_ps(velecsum,velec);
421 /* Calculate temporary vectorial force */
422 tx = _mm256_mul_ps(fscal,dx22);
423 ty = _mm256_mul_ps(fscal,dy22);
424 tz = _mm256_mul_ps(fscal,dz22);
426 /* Update vectorial force */
427 fix2 = _mm256_add_ps(fix2,tx);
428 fiy2 = _mm256_add_ps(fiy2,ty);
429 fiz2 = _mm256_add_ps(fiz2,tz);
431 fjx2 = _mm256_add_ps(fjx2,tx);
432 fjy2 = _mm256_add_ps(fjy2,ty);
433 fjz2 = _mm256_add_ps(fjz2,tz);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 /* REACTION-FIELD ELECTROSTATICS */
440 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
441 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velecsum = _mm256_add_ps(velecsum,velec);
448 /* Calculate temporary vectorial force */
449 tx = _mm256_mul_ps(fscal,dx23);
450 ty = _mm256_mul_ps(fscal,dy23);
451 tz = _mm256_mul_ps(fscal,dz23);
453 /* Update vectorial force */
454 fix2 = _mm256_add_ps(fix2,tx);
455 fiy2 = _mm256_add_ps(fiy2,ty);
456 fiz2 = _mm256_add_ps(fiz2,tz);
458 fjx3 = _mm256_add_ps(fjx3,tx);
459 fjy3 = _mm256_add_ps(fjy3,ty);
460 fjz3 = _mm256_add_ps(fjz3,tz);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 /* REACTION-FIELD ELECTROSTATICS */
467 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
468 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velecsum = _mm256_add_ps(velecsum,velec);
475 /* Calculate temporary vectorial force */
476 tx = _mm256_mul_ps(fscal,dx31);
477 ty = _mm256_mul_ps(fscal,dy31);
478 tz = _mm256_mul_ps(fscal,dz31);
480 /* Update vectorial force */
481 fix3 = _mm256_add_ps(fix3,tx);
482 fiy3 = _mm256_add_ps(fiy3,ty);
483 fiz3 = _mm256_add_ps(fiz3,tz);
485 fjx1 = _mm256_add_ps(fjx1,tx);
486 fjy1 = _mm256_add_ps(fjy1,ty);
487 fjz1 = _mm256_add_ps(fjz1,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 /* REACTION-FIELD ELECTROSTATICS */
494 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
495 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velecsum = _mm256_add_ps(velecsum,velec);
502 /* Calculate temporary vectorial force */
503 tx = _mm256_mul_ps(fscal,dx32);
504 ty = _mm256_mul_ps(fscal,dy32);
505 tz = _mm256_mul_ps(fscal,dz32);
507 /* Update vectorial force */
508 fix3 = _mm256_add_ps(fix3,tx);
509 fiy3 = _mm256_add_ps(fiy3,ty);
510 fiz3 = _mm256_add_ps(fiz3,tz);
512 fjx2 = _mm256_add_ps(fjx2,tx);
513 fjy2 = _mm256_add_ps(fjy2,ty);
514 fjz2 = _mm256_add_ps(fjz2,tz);
516 /**************************
517 * CALCULATE INTERACTIONS *
518 **************************/
520 /* REACTION-FIELD ELECTROSTATICS */
521 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
522 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
524 /* Update potential sum for this i atom from the interaction with this j atom. */
525 velecsum = _mm256_add_ps(velecsum,velec);
529 /* Calculate temporary vectorial force */
530 tx = _mm256_mul_ps(fscal,dx33);
531 ty = _mm256_mul_ps(fscal,dy33);
532 tz = _mm256_mul_ps(fscal,dz33);
534 /* Update vectorial force */
535 fix3 = _mm256_add_ps(fix3,tx);
536 fiy3 = _mm256_add_ps(fiy3,ty);
537 fiz3 = _mm256_add_ps(fiz3,tz);
539 fjx3 = _mm256_add_ps(fjx3,tx);
540 fjy3 = _mm256_add_ps(fjy3,ty);
541 fjz3 = _mm256_add_ps(fjz3,tz);
543 fjptrA = f+j_coord_offsetA;
544 fjptrB = f+j_coord_offsetB;
545 fjptrC = f+j_coord_offsetC;
546 fjptrD = f+j_coord_offsetD;
547 fjptrE = f+j_coord_offsetE;
548 fjptrF = f+j_coord_offsetF;
549 fjptrG = f+j_coord_offsetG;
550 fjptrH = f+j_coord_offsetH;
552 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
553 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
554 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
556 /* Inner loop uses 288 flops */
562 /* Get j neighbor index, and coordinate index */
563 jnrlistA = jjnr[jidx];
564 jnrlistB = jjnr[jidx+1];
565 jnrlistC = jjnr[jidx+2];
566 jnrlistD = jjnr[jidx+3];
567 jnrlistE = jjnr[jidx+4];
568 jnrlistF = jjnr[jidx+5];
569 jnrlistG = jjnr[jidx+6];
570 jnrlistH = jjnr[jidx+7];
571 /* Sign of each element will be negative for non-real atoms.
572 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
573 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
575 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
576 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
578 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
579 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
580 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
581 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
582 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
583 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
584 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
585 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
586 j_coord_offsetA = DIM*jnrA;
587 j_coord_offsetB = DIM*jnrB;
588 j_coord_offsetC = DIM*jnrC;
589 j_coord_offsetD = DIM*jnrD;
590 j_coord_offsetE = DIM*jnrE;
591 j_coord_offsetF = DIM*jnrF;
592 j_coord_offsetG = DIM*jnrG;
593 j_coord_offsetH = DIM*jnrH;
595 /* load j atom coordinates */
596 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
597 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
598 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
599 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
600 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
602 /* Calculate displacement vector */
603 dx11 = _mm256_sub_ps(ix1,jx1);
604 dy11 = _mm256_sub_ps(iy1,jy1);
605 dz11 = _mm256_sub_ps(iz1,jz1);
606 dx12 = _mm256_sub_ps(ix1,jx2);
607 dy12 = _mm256_sub_ps(iy1,jy2);
608 dz12 = _mm256_sub_ps(iz1,jz2);
609 dx13 = _mm256_sub_ps(ix1,jx3);
610 dy13 = _mm256_sub_ps(iy1,jy3);
611 dz13 = _mm256_sub_ps(iz1,jz3);
612 dx21 = _mm256_sub_ps(ix2,jx1);
613 dy21 = _mm256_sub_ps(iy2,jy1);
614 dz21 = _mm256_sub_ps(iz2,jz1);
615 dx22 = _mm256_sub_ps(ix2,jx2);
616 dy22 = _mm256_sub_ps(iy2,jy2);
617 dz22 = _mm256_sub_ps(iz2,jz2);
618 dx23 = _mm256_sub_ps(ix2,jx3);
619 dy23 = _mm256_sub_ps(iy2,jy3);
620 dz23 = _mm256_sub_ps(iz2,jz3);
621 dx31 = _mm256_sub_ps(ix3,jx1);
622 dy31 = _mm256_sub_ps(iy3,jy1);
623 dz31 = _mm256_sub_ps(iz3,jz1);
624 dx32 = _mm256_sub_ps(ix3,jx2);
625 dy32 = _mm256_sub_ps(iy3,jy2);
626 dz32 = _mm256_sub_ps(iz3,jz2);
627 dx33 = _mm256_sub_ps(ix3,jx3);
628 dy33 = _mm256_sub_ps(iy3,jy3);
629 dz33 = _mm256_sub_ps(iz3,jz3);
631 /* Calculate squared distance and things based on it */
632 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
633 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
634 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
635 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
636 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
637 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
638 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
639 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
640 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
642 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
643 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
644 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
645 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
646 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
647 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
648 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
649 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
650 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
652 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
653 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
654 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
655 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
656 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
657 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
658 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
659 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
660 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
662 fjx1 = _mm256_setzero_ps();
663 fjy1 = _mm256_setzero_ps();
664 fjz1 = _mm256_setzero_ps();
665 fjx2 = _mm256_setzero_ps();
666 fjy2 = _mm256_setzero_ps();
667 fjz2 = _mm256_setzero_ps();
668 fjx3 = _mm256_setzero_ps();
669 fjy3 = _mm256_setzero_ps();
670 fjz3 = _mm256_setzero_ps();
672 /**************************
673 * CALCULATE INTERACTIONS *
674 **************************/
676 /* REACTION-FIELD ELECTROSTATICS */
677 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
678 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
680 /* Update potential sum for this i atom from the interaction with this j atom. */
681 velec = _mm256_andnot_ps(dummy_mask,velec);
682 velecsum = _mm256_add_ps(velecsum,velec);
686 fscal = _mm256_andnot_ps(dummy_mask,fscal);
688 /* Calculate temporary vectorial force */
689 tx = _mm256_mul_ps(fscal,dx11);
690 ty = _mm256_mul_ps(fscal,dy11);
691 tz = _mm256_mul_ps(fscal,dz11);
693 /* Update vectorial force */
694 fix1 = _mm256_add_ps(fix1,tx);
695 fiy1 = _mm256_add_ps(fiy1,ty);
696 fiz1 = _mm256_add_ps(fiz1,tz);
698 fjx1 = _mm256_add_ps(fjx1,tx);
699 fjy1 = _mm256_add_ps(fjy1,ty);
700 fjz1 = _mm256_add_ps(fjz1,tz);
702 /**************************
703 * CALCULATE INTERACTIONS *
704 **************************/
706 /* REACTION-FIELD ELECTROSTATICS */
707 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
708 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
710 /* Update potential sum for this i atom from the interaction with this j atom. */
711 velec = _mm256_andnot_ps(dummy_mask,velec);
712 velecsum = _mm256_add_ps(velecsum,velec);
716 fscal = _mm256_andnot_ps(dummy_mask,fscal);
718 /* Calculate temporary vectorial force */
719 tx = _mm256_mul_ps(fscal,dx12);
720 ty = _mm256_mul_ps(fscal,dy12);
721 tz = _mm256_mul_ps(fscal,dz12);
723 /* Update vectorial force */
724 fix1 = _mm256_add_ps(fix1,tx);
725 fiy1 = _mm256_add_ps(fiy1,ty);
726 fiz1 = _mm256_add_ps(fiz1,tz);
728 fjx2 = _mm256_add_ps(fjx2,tx);
729 fjy2 = _mm256_add_ps(fjy2,ty);
730 fjz2 = _mm256_add_ps(fjz2,tz);
732 /**************************
733 * CALCULATE INTERACTIONS *
734 **************************/
736 /* REACTION-FIELD ELECTROSTATICS */
737 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
738 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
740 /* Update potential sum for this i atom from the interaction with this j atom. */
741 velec = _mm256_andnot_ps(dummy_mask,velec);
742 velecsum = _mm256_add_ps(velecsum,velec);
746 fscal = _mm256_andnot_ps(dummy_mask,fscal);
748 /* Calculate temporary vectorial force */
749 tx = _mm256_mul_ps(fscal,dx13);
750 ty = _mm256_mul_ps(fscal,dy13);
751 tz = _mm256_mul_ps(fscal,dz13);
753 /* Update vectorial force */
754 fix1 = _mm256_add_ps(fix1,tx);
755 fiy1 = _mm256_add_ps(fiy1,ty);
756 fiz1 = _mm256_add_ps(fiz1,tz);
758 fjx3 = _mm256_add_ps(fjx3,tx);
759 fjy3 = _mm256_add_ps(fjy3,ty);
760 fjz3 = _mm256_add_ps(fjz3,tz);
762 /**************************
763 * CALCULATE INTERACTIONS *
764 **************************/
766 /* REACTION-FIELD ELECTROSTATICS */
767 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
768 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
770 /* Update potential sum for this i atom from the interaction with this j atom. */
771 velec = _mm256_andnot_ps(dummy_mask,velec);
772 velecsum = _mm256_add_ps(velecsum,velec);
776 fscal = _mm256_andnot_ps(dummy_mask,fscal);
778 /* Calculate temporary vectorial force */
779 tx = _mm256_mul_ps(fscal,dx21);
780 ty = _mm256_mul_ps(fscal,dy21);
781 tz = _mm256_mul_ps(fscal,dz21);
783 /* Update vectorial force */
784 fix2 = _mm256_add_ps(fix2,tx);
785 fiy2 = _mm256_add_ps(fiy2,ty);
786 fiz2 = _mm256_add_ps(fiz2,tz);
788 fjx1 = _mm256_add_ps(fjx1,tx);
789 fjy1 = _mm256_add_ps(fjy1,ty);
790 fjz1 = _mm256_add_ps(fjz1,tz);
792 /**************************
793 * CALCULATE INTERACTIONS *
794 **************************/
796 /* REACTION-FIELD ELECTROSTATICS */
797 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
798 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
800 /* Update potential sum for this i atom from the interaction with this j atom. */
801 velec = _mm256_andnot_ps(dummy_mask,velec);
802 velecsum = _mm256_add_ps(velecsum,velec);
806 fscal = _mm256_andnot_ps(dummy_mask,fscal);
808 /* Calculate temporary vectorial force */
809 tx = _mm256_mul_ps(fscal,dx22);
810 ty = _mm256_mul_ps(fscal,dy22);
811 tz = _mm256_mul_ps(fscal,dz22);
813 /* Update vectorial force */
814 fix2 = _mm256_add_ps(fix2,tx);
815 fiy2 = _mm256_add_ps(fiy2,ty);
816 fiz2 = _mm256_add_ps(fiz2,tz);
818 fjx2 = _mm256_add_ps(fjx2,tx);
819 fjy2 = _mm256_add_ps(fjy2,ty);
820 fjz2 = _mm256_add_ps(fjz2,tz);
822 /**************************
823 * CALCULATE INTERACTIONS *
824 **************************/
826 /* REACTION-FIELD ELECTROSTATICS */
827 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
828 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
830 /* Update potential sum for this i atom from the interaction with this j atom. */
831 velec = _mm256_andnot_ps(dummy_mask,velec);
832 velecsum = _mm256_add_ps(velecsum,velec);
836 fscal = _mm256_andnot_ps(dummy_mask,fscal);
838 /* Calculate temporary vectorial force */
839 tx = _mm256_mul_ps(fscal,dx23);
840 ty = _mm256_mul_ps(fscal,dy23);
841 tz = _mm256_mul_ps(fscal,dz23);
843 /* Update vectorial force */
844 fix2 = _mm256_add_ps(fix2,tx);
845 fiy2 = _mm256_add_ps(fiy2,ty);
846 fiz2 = _mm256_add_ps(fiz2,tz);
848 fjx3 = _mm256_add_ps(fjx3,tx);
849 fjy3 = _mm256_add_ps(fjy3,ty);
850 fjz3 = _mm256_add_ps(fjz3,tz);
852 /**************************
853 * CALCULATE INTERACTIONS *
854 **************************/
856 /* REACTION-FIELD ELECTROSTATICS */
857 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
858 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
860 /* Update potential sum for this i atom from the interaction with this j atom. */
861 velec = _mm256_andnot_ps(dummy_mask,velec);
862 velecsum = _mm256_add_ps(velecsum,velec);
866 fscal = _mm256_andnot_ps(dummy_mask,fscal);
868 /* Calculate temporary vectorial force */
869 tx = _mm256_mul_ps(fscal,dx31);
870 ty = _mm256_mul_ps(fscal,dy31);
871 tz = _mm256_mul_ps(fscal,dz31);
873 /* Update vectorial force */
874 fix3 = _mm256_add_ps(fix3,tx);
875 fiy3 = _mm256_add_ps(fiy3,ty);
876 fiz3 = _mm256_add_ps(fiz3,tz);
878 fjx1 = _mm256_add_ps(fjx1,tx);
879 fjy1 = _mm256_add_ps(fjy1,ty);
880 fjz1 = _mm256_add_ps(fjz1,tz);
882 /**************************
883 * CALCULATE INTERACTIONS *
884 **************************/
886 /* REACTION-FIELD ELECTROSTATICS */
887 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
888 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
890 /* Update potential sum for this i atom from the interaction with this j atom. */
891 velec = _mm256_andnot_ps(dummy_mask,velec);
892 velecsum = _mm256_add_ps(velecsum,velec);
896 fscal = _mm256_andnot_ps(dummy_mask,fscal);
898 /* Calculate temporary vectorial force */
899 tx = _mm256_mul_ps(fscal,dx32);
900 ty = _mm256_mul_ps(fscal,dy32);
901 tz = _mm256_mul_ps(fscal,dz32);
903 /* Update vectorial force */
904 fix3 = _mm256_add_ps(fix3,tx);
905 fiy3 = _mm256_add_ps(fiy3,ty);
906 fiz3 = _mm256_add_ps(fiz3,tz);
908 fjx2 = _mm256_add_ps(fjx2,tx);
909 fjy2 = _mm256_add_ps(fjy2,ty);
910 fjz2 = _mm256_add_ps(fjz2,tz);
912 /**************************
913 * CALCULATE INTERACTIONS *
914 **************************/
916 /* REACTION-FIELD ELECTROSTATICS */
917 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
918 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
920 /* Update potential sum for this i atom from the interaction with this j atom. */
921 velec = _mm256_andnot_ps(dummy_mask,velec);
922 velecsum = _mm256_add_ps(velecsum,velec);
926 fscal = _mm256_andnot_ps(dummy_mask,fscal);
928 /* Calculate temporary vectorial force */
929 tx = _mm256_mul_ps(fscal,dx33);
930 ty = _mm256_mul_ps(fscal,dy33);
931 tz = _mm256_mul_ps(fscal,dz33);
933 /* Update vectorial force */
934 fix3 = _mm256_add_ps(fix3,tx);
935 fiy3 = _mm256_add_ps(fiy3,ty);
936 fiz3 = _mm256_add_ps(fiz3,tz);
938 fjx3 = _mm256_add_ps(fjx3,tx);
939 fjy3 = _mm256_add_ps(fjy3,ty);
940 fjz3 = _mm256_add_ps(fjz3,tz);
942 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
943 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
944 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
945 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
946 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
947 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
948 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
949 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
951 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
952 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
953 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
955 /* Inner loop uses 288 flops */
958 /* End of innermost loop */
960 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
961 f+i_coord_offset+DIM,fshift+i_shift_offset);
964 /* Update potential energies */
965 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
967 /* Increment number of inner iterations */
968 inneriter += j_index_end - j_index_start;
970 /* Outer loop uses 19 flops */
973 /* Increment number of outer iterations */
976 /* Update outer/inner flops */
978 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*288);
981 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_single
982 * Electrostatics interaction: ReactionField
983 * VdW interaction: None
984 * Geometry: Water4-Water4
985 * Calculate force/pot: Force
988 nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_single
989 (t_nblist * gmx_restrict nlist,
990 rvec * gmx_restrict xx,
991 rvec * gmx_restrict ff,
992 t_forcerec * gmx_restrict fr,
993 t_mdatoms * gmx_restrict mdatoms,
994 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
995 t_nrnb * gmx_restrict nrnb)
997 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
998 * just 0 for non-waters.
999 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1000 * jnr indices corresponding to data put in the four positions in the SIMD register.
1002 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1003 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1004 int jnrA,jnrB,jnrC,jnrD;
1005 int jnrE,jnrF,jnrG,jnrH;
1006 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1007 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1008 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1009 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1010 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1011 real rcutoff_scalar;
1012 real *shiftvec,*fshift,*x,*f;
1013 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1014 real scratch[4*DIM];
1015 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1016 real * vdwioffsetptr1;
1017 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1018 real * vdwioffsetptr2;
1019 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1020 real * vdwioffsetptr3;
1021 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1022 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1023 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1024 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1025 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1026 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1027 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1028 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1029 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1030 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1031 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1032 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1033 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1034 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1035 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1036 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1037 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1039 __m256 dummy_mask,cutoff_mask;
1040 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1041 __m256 one = _mm256_set1_ps(1.0);
1042 __m256 two = _mm256_set1_ps(2.0);
1048 jindex = nlist->jindex;
1050 shiftidx = nlist->shift;
1052 shiftvec = fr->shift_vec[0];
1053 fshift = fr->fshift[0];
1054 facel = _mm256_set1_ps(fr->epsfac);
1055 charge = mdatoms->chargeA;
1056 krf = _mm256_set1_ps(fr->ic->k_rf);
1057 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
1058 crf = _mm256_set1_ps(fr->ic->c_rf);
1060 /* Setup water-specific parameters */
1061 inr = nlist->iinr[0];
1062 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1063 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1064 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1066 jq1 = _mm256_set1_ps(charge[inr+1]);
1067 jq2 = _mm256_set1_ps(charge[inr+2]);
1068 jq3 = _mm256_set1_ps(charge[inr+3]);
1069 qq11 = _mm256_mul_ps(iq1,jq1);
1070 qq12 = _mm256_mul_ps(iq1,jq2);
1071 qq13 = _mm256_mul_ps(iq1,jq3);
1072 qq21 = _mm256_mul_ps(iq2,jq1);
1073 qq22 = _mm256_mul_ps(iq2,jq2);
1074 qq23 = _mm256_mul_ps(iq2,jq3);
1075 qq31 = _mm256_mul_ps(iq3,jq1);
1076 qq32 = _mm256_mul_ps(iq3,jq2);
1077 qq33 = _mm256_mul_ps(iq3,jq3);
1079 /* Avoid stupid compiler warnings */
1080 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1081 j_coord_offsetA = 0;
1082 j_coord_offsetB = 0;
1083 j_coord_offsetC = 0;
1084 j_coord_offsetD = 0;
1085 j_coord_offsetE = 0;
1086 j_coord_offsetF = 0;
1087 j_coord_offsetG = 0;
1088 j_coord_offsetH = 0;
1093 for(iidx=0;iidx<4*DIM;iidx++)
1095 scratch[iidx] = 0.0;
1098 /* Start outer loop over neighborlists */
1099 for(iidx=0; iidx<nri; iidx++)
1101 /* Load shift vector for this list */
1102 i_shift_offset = DIM*shiftidx[iidx];
1104 /* Load limits for loop over neighbors */
1105 j_index_start = jindex[iidx];
1106 j_index_end = jindex[iidx+1];
1108 /* Get outer coordinate index */
1110 i_coord_offset = DIM*inr;
1112 /* Load i particle coords and add shift vector */
1113 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1114 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1116 fix1 = _mm256_setzero_ps();
1117 fiy1 = _mm256_setzero_ps();
1118 fiz1 = _mm256_setzero_ps();
1119 fix2 = _mm256_setzero_ps();
1120 fiy2 = _mm256_setzero_ps();
1121 fiz2 = _mm256_setzero_ps();
1122 fix3 = _mm256_setzero_ps();
1123 fiy3 = _mm256_setzero_ps();
1124 fiz3 = _mm256_setzero_ps();
1126 /* Start inner kernel loop */
1127 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1130 /* Get j neighbor index, and coordinate index */
1132 jnrB = jjnr[jidx+1];
1133 jnrC = jjnr[jidx+2];
1134 jnrD = jjnr[jidx+3];
1135 jnrE = jjnr[jidx+4];
1136 jnrF = jjnr[jidx+5];
1137 jnrG = jjnr[jidx+6];
1138 jnrH = jjnr[jidx+7];
1139 j_coord_offsetA = DIM*jnrA;
1140 j_coord_offsetB = DIM*jnrB;
1141 j_coord_offsetC = DIM*jnrC;
1142 j_coord_offsetD = DIM*jnrD;
1143 j_coord_offsetE = DIM*jnrE;
1144 j_coord_offsetF = DIM*jnrF;
1145 j_coord_offsetG = DIM*jnrG;
1146 j_coord_offsetH = DIM*jnrH;
1148 /* load j atom coordinates */
1149 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1150 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1151 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1152 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1153 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1155 /* Calculate displacement vector */
1156 dx11 = _mm256_sub_ps(ix1,jx1);
1157 dy11 = _mm256_sub_ps(iy1,jy1);
1158 dz11 = _mm256_sub_ps(iz1,jz1);
1159 dx12 = _mm256_sub_ps(ix1,jx2);
1160 dy12 = _mm256_sub_ps(iy1,jy2);
1161 dz12 = _mm256_sub_ps(iz1,jz2);
1162 dx13 = _mm256_sub_ps(ix1,jx3);
1163 dy13 = _mm256_sub_ps(iy1,jy3);
1164 dz13 = _mm256_sub_ps(iz1,jz3);
1165 dx21 = _mm256_sub_ps(ix2,jx1);
1166 dy21 = _mm256_sub_ps(iy2,jy1);
1167 dz21 = _mm256_sub_ps(iz2,jz1);
1168 dx22 = _mm256_sub_ps(ix2,jx2);
1169 dy22 = _mm256_sub_ps(iy2,jy2);
1170 dz22 = _mm256_sub_ps(iz2,jz2);
1171 dx23 = _mm256_sub_ps(ix2,jx3);
1172 dy23 = _mm256_sub_ps(iy2,jy3);
1173 dz23 = _mm256_sub_ps(iz2,jz3);
1174 dx31 = _mm256_sub_ps(ix3,jx1);
1175 dy31 = _mm256_sub_ps(iy3,jy1);
1176 dz31 = _mm256_sub_ps(iz3,jz1);
1177 dx32 = _mm256_sub_ps(ix3,jx2);
1178 dy32 = _mm256_sub_ps(iy3,jy2);
1179 dz32 = _mm256_sub_ps(iz3,jz2);
1180 dx33 = _mm256_sub_ps(ix3,jx3);
1181 dy33 = _mm256_sub_ps(iy3,jy3);
1182 dz33 = _mm256_sub_ps(iz3,jz3);
1184 /* Calculate squared distance and things based on it */
1185 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1186 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1187 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1188 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1189 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1190 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1191 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1192 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1193 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1195 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1196 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1197 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1198 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1199 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1200 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1201 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1202 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1203 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1205 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1206 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1207 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1208 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1209 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1210 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1211 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1212 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1213 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1215 fjx1 = _mm256_setzero_ps();
1216 fjy1 = _mm256_setzero_ps();
1217 fjz1 = _mm256_setzero_ps();
1218 fjx2 = _mm256_setzero_ps();
1219 fjy2 = _mm256_setzero_ps();
1220 fjz2 = _mm256_setzero_ps();
1221 fjx3 = _mm256_setzero_ps();
1222 fjy3 = _mm256_setzero_ps();
1223 fjz3 = _mm256_setzero_ps();
1225 /**************************
1226 * CALCULATE INTERACTIONS *
1227 **************************/
1229 /* REACTION-FIELD ELECTROSTATICS */
1230 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1234 /* Calculate temporary vectorial force */
1235 tx = _mm256_mul_ps(fscal,dx11);
1236 ty = _mm256_mul_ps(fscal,dy11);
1237 tz = _mm256_mul_ps(fscal,dz11);
1239 /* Update vectorial force */
1240 fix1 = _mm256_add_ps(fix1,tx);
1241 fiy1 = _mm256_add_ps(fiy1,ty);
1242 fiz1 = _mm256_add_ps(fiz1,tz);
1244 fjx1 = _mm256_add_ps(fjx1,tx);
1245 fjy1 = _mm256_add_ps(fjy1,ty);
1246 fjz1 = _mm256_add_ps(fjz1,tz);
1248 /**************************
1249 * CALCULATE INTERACTIONS *
1250 **************************/
1252 /* REACTION-FIELD ELECTROSTATICS */
1253 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1257 /* Calculate temporary vectorial force */
1258 tx = _mm256_mul_ps(fscal,dx12);
1259 ty = _mm256_mul_ps(fscal,dy12);
1260 tz = _mm256_mul_ps(fscal,dz12);
1262 /* Update vectorial force */
1263 fix1 = _mm256_add_ps(fix1,tx);
1264 fiy1 = _mm256_add_ps(fiy1,ty);
1265 fiz1 = _mm256_add_ps(fiz1,tz);
1267 fjx2 = _mm256_add_ps(fjx2,tx);
1268 fjy2 = _mm256_add_ps(fjy2,ty);
1269 fjz2 = _mm256_add_ps(fjz2,tz);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 /* REACTION-FIELD ELECTROSTATICS */
1276 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1280 /* Calculate temporary vectorial force */
1281 tx = _mm256_mul_ps(fscal,dx13);
1282 ty = _mm256_mul_ps(fscal,dy13);
1283 tz = _mm256_mul_ps(fscal,dz13);
1285 /* Update vectorial force */
1286 fix1 = _mm256_add_ps(fix1,tx);
1287 fiy1 = _mm256_add_ps(fiy1,ty);
1288 fiz1 = _mm256_add_ps(fiz1,tz);
1290 fjx3 = _mm256_add_ps(fjx3,tx);
1291 fjy3 = _mm256_add_ps(fjy3,ty);
1292 fjz3 = _mm256_add_ps(fjz3,tz);
1294 /**************************
1295 * CALCULATE INTERACTIONS *
1296 **************************/
1298 /* REACTION-FIELD ELECTROSTATICS */
1299 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1303 /* Calculate temporary vectorial force */
1304 tx = _mm256_mul_ps(fscal,dx21);
1305 ty = _mm256_mul_ps(fscal,dy21);
1306 tz = _mm256_mul_ps(fscal,dz21);
1308 /* Update vectorial force */
1309 fix2 = _mm256_add_ps(fix2,tx);
1310 fiy2 = _mm256_add_ps(fiy2,ty);
1311 fiz2 = _mm256_add_ps(fiz2,tz);
1313 fjx1 = _mm256_add_ps(fjx1,tx);
1314 fjy1 = _mm256_add_ps(fjy1,ty);
1315 fjz1 = _mm256_add_ps(fjz1,tz);
1317 /**************************
1318 * CALCULATE INTERACTIONS *
1319 **************************/
1321 /* REACTION-FIELD ELECTROSTATICS */
1322 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1326 /* Calculate temporary vectorial force */
1327 tx = _mm256_mul_ps(fscal,dx22);
1328 ty = _mm256_mul_ps(fscal,dy22);
1329 tz = _mm256_mul_ps(fscal,dz22);
1331 /* Update vectorial force */
1332 fix2 = _mm256_add_ps(fix2,tx);
1333 fiy2 = _mm256_add_ps(fiy2,ty);
1334 fiz2 = _mm256_add_ps(fiz2,tz);
1336 fjx2 = _mm256_add_ps(fjx2,tx);
1337 fjy2 = _mm256_add_ps(fjy2,ty);
1338 fjz2 = _mm256_add_ps(fjz2,tz);
1340 /**************************
1341 * CALCULATE INTERACTIONS *
1342 **************************/
1344 /* REACTION-FIELD ELECTROSTATICS */
1345 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1349 /* Calculate temporary vectorial force */
1350 tx = _mm256_mul_ps(fscal,dx23);
1351 ty = _mm256_mul_ps(fscal,dy23);
1352 tz = _mm256_mul_ps(fscal,dz23);
1354 /* Update vectorial force */
1355 fix2 = _mm256_add_ps(fix2,tx);
1356 fiy2 = _mm256_add_ps(fiy2,ty);
1357 fiz2 = _mm256_add_ps(fiz2,tz);
1359 fjx3 = _mm256_add_ps(fjx3,tx);
1360 fjy3 = _mm256_add_ps(fjy3,ty);
1361 fjz3 = _mm256_add_ps(fjz3,tz);
1363 /**************************
1364 * CALCULATE INTERACTIONS *
1365 **************************/
1367 /* REACTION-FIELD ELECTROSTATICS */
1368 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1372 /* Calculate temporary vectorial force */
1373 tx = _mm256_mul_ps(fscal,dx31);
1374 ty = _mm256_mul_ps(fscal,dy31);
1375 tz = _mm256_mul_ps(fscal,dz31);
1377 /* Update vectorial force */
1378 fix3 = _mm256_add_ps(fix3,tx);
1379 fiy3 = _mm256_add_ps(fiy3,ty);
1380 fiz3 = _mm256_add_ps(fiz3,tz);
1382 fjx1 = _mm256_add_ps(fjx1,tx);
1383 fjy1 = _mm256_add_ps(fjy1,ty);
1384 fjz1 = _mm256_add_ps(fjz1,tz);
1386 /**************************
1387 * CALCULATE INTERACTIONS *
1388 **************************/
1390 /* REACTION-FIELD ELECTROSTATICS */
1391 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1395 /* Calculate temporary vectorial force */
1396 tx = _mm256_mul_ps(fscal,dx32);
1397 ty = _mm256_mul_ps(fscal,dy32);
1398 tz = _mm256_mul_ps(fscal,dz32);
1400 /* Update vectorial force */
1401 fix3 = _mm256_add_ps(fix3,tx);
1402 fiy3 = _mm256_add_ps(fiy3,ty);
1403 fiz3 = _mm256_add_ps(fiz3,tz);
1405 fjx2 = _mm256_add_ps(fjx2,tx);
1406 fjy2 = _mm256_add_ps(fjy2,ty);
1407 fjz2 = _mm256_add_ps(fjz2,tz);
1409 /**************************
1410 * CALCULATE INTERACTIONS *
1411 **************************/
1413 /* REACTION-FIELD ELECTROSTATICS */
1414 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1418 /* Calculate temporary vectorial force */
1419 tx = _mm256_mul_ps(fscal,dx33);
1420 ty = _mm256_mul_ps(fscal,dy33);
1421 tz = _mm256_mul_ps(fscal,dz33);
1423 /* Update vectorial force */
1424 fix3 = _mm256_add_ps(fix3,tx);
1425 fiy3 = _mm256_add_ps(fiy3,ty);
1426 fiz3 = _mm256_add_ps(fiz3,tz);
1428 fjx3 = _mm256_add_ps(fjx3,tx);
1429 fjy3 = _mm256_add_ps(fjy3,ty);
1430 fjz3 = _mm256_add_ps(fjz3,tz);
1432 fjptrA = f+j_coord_offsetA;
1433 fjptrB = f+j_coord_offsetB;
1434 fjptrC = f+j_coord_offsetC;
1435 fjptrD = f+j_coord_offsetD;
1436 fjptrE = f+j_coord_offsetE;
1437 fjptrF = f+j_coord_offsetF;
1438 fjptrG = f+j_coord_offsetG;
1439 fjptrH = f+j_coord_offsetH;
1441 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1442 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1443 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1445 /* Inner loop uses 243 flops */
1448 if(jidx<j_index_end)
1451 /* Get j neighbor index, and coordinate index */
1452 jnrlistA = jjnr[jidx];
1453 jnrlistB = jjnr[jidx+1];
1454 jnrlistC = jjnr[jidx+2];
1455 jnrlistD = jjnr[jidx+3];
1456 jnrlistE = jjnr[jidx+4];
1457 jnrlistF = jjnr[jidx+5];
1458 jnrlistG = jjnr[jidx+6];
1459 jnrlistH = jjnr[jidx+7];
1460 /* Sign of each element will be negative for non-real atoms.
1461 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1462 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1464 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1465 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1467 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1468 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1469 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1470 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1471 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1472 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1473 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1474 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1475 j_coord_offsetA = DIM*jnrA;
1476 j_coord_offsetB = DIM*jnrB;
1477 j_coord_offsetC = DIM*jnrC;
1478 j_coord_offsetD = DIM*jnrD;
1479 j_coord_offsetE = DIM*jnrE;
1480 j_coord_offsetF = DIM*jnrF;
1481 j_coord_offsetG = DIM*jnrG;
1482 j_coord_offsetH = DIM*jnrH;
1484 /* load j atom coordinates */
1485 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1486 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1487 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1488 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1489 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1491 /* Calculate displacement vector */
1492 dx11 = _mm256_sub_ps(ix1,jx1);
1493 dy11 = _mm256_sub_ps(iy1,jy1);
1494 dz11 = _mm256_sub_ps(iz1,jz1);
1495 dx12 = _mm256_sub_ps(ix1,jx2);
1496 dy12 = _mm256_sub_ps(iy1,jy2);
1497 dz12 = _mm256_sub_ps(iz1,jz2);
1498 dx13 = _mm256_sub_ps(ix1,jx3);
1499 dy13 = _mm256_sub_ps(iy1,jy3);
1500 dz13 = _mm256_sub_ps(iz1,jz3);
1501 dx21 = _mm256_sub_ps(ix2,jx1);
1502 dy21 = _mm256_sub_ps(iy2,jy1);
1503 dz21 = _mm256_sub_ps(iz2,jz1);
1504 dx22 = _mm256_sub_ps(ix2,jx2);
1505 dy22 = _mm256_sub_ps(iy2,jy2);
1506 dz22 = _mm256_sub_ps(iz2,jz2);
1507 dx23 = _mm256_sub_ps(ix2,jx3);
1508 dy23 = _mm256_sub_ps(iy2,jy3);
1509 dz23 = _mm256_sub_ps(iz2,jz3);
1510 dx31 = _mm256_sub_ps(ix3,jx1);
1511 dy31 = _mm256_sub_ps(iy3,jy1);
1512 dz31 = _mm256_sub_ps(iz3,jz1);
1513 dx32 = _mm256_sub_ps(ix3,jx2);
1514 dy32 = _mm256_sub_ps(iy3,jy2);
1515 dz32 = _mm256_sub_ps(iz3,jz2);
1516 dx33 = _mm256_sub_ps(ix3,jx3);
1517 dy33 = _mm256_sub_ps(iy3,jy3);
1518 dz33 = _mm256_sub_ps(iz3,jz3);
1520 /* Calculate squared distance and things based on it */
1521 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1522 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1523 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1524 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1525 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1526 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1527 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1528 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1529 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1531 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1532 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1533 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1534 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1535 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1536 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1537 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1538 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1539 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1541 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1542 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1543 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1544 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1545 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1546 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1547 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1548 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1549 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1551 fjx1 = _mm256_setzero_ps();
1552 fjy1 = _mm256_setzero_ps();
1553 fjz1 = _mm256_setzero_ps();
1554 fjx2 = _mm256_setzero_ps();
1555 fjy2 = _mm256_setzero_ps();
1556 fjz2 = _mm256_setzero_ps();
1557 fjx3 = _mm256_setzero_ps();
1558 fjy3 = _mm256_setzero_ps();
1559 fjz3 = _mm256_setzero_ps();
1561 /**************************
1562 * CALCULATE INTERACTIONS *
1563 **************************/
1565 /* REACTION-FIELD ELECTROSTATICS */
1566 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1570 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1572 /* Calculate temporary vectorial force */
1573 tx = _mm256_mul_ps(fscal,dx11);
1574 ty = _mm256_mul_ps(fscal,dy11);
1575 tz = _mm256_mul_ps(fscal,dz11);
1577 /* Update vectorial force */
1578 fix1 = _mm256_add_ps(fix1,tx);
1579 fiy1 = _mm256_add_ps(fiy1,ty);
1580 fiz1 = _mm256_add_ps(fiz1,tz);
1582 fjx1 = _mm256_add_ps(fjx1,tx);
1583 fjy1 = _mm256_add_ps(fjy1,ty);
1584 fjz1 = _mm256_add_ps(fjz1,tz);
1586 /**************************
1587 * CALCULATE INTERACTIONS *
1588 **************************/
1590 /* REACTION-FIELD ELECTROSTATICS */
1591 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1595 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1597 /* Calculate temporary vectorial force */
1598 tx = _mm256_mul_ps(fscal,dx12);
1599 ty = _mm256_mul_ps(fscal,dy12);
1600 tz = _mm256_mul_ps(fscal,dz12);
1602 /* Update vectorial force */
1603 fix1 = _mm256_add_ps(fix1,tx);
1604 fiy1 = _mm256_add_ps(fiy1,ty);
1605 fiz1 = _mm256_add_ps(fiz1,tz);
1607 fjx2 = _mm256_add_ps(fjx2,tx);
1608 fjy2 = _mm256_add_ps(fjy2,ty);
1609 fjz2 = _mm256_add_ps(fjz2,tz);
1611 /**************************
1612 * CALCULATE INTERACTIONS *
1613 **************************/
1615 /* REACTION-FIELD ELECTROSTATICS */
1616 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1620 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1622 /* Calculate temporary vectorial force */
1623 tx = _mm256_mul_ps(fscal,dx13);
1624 ty = _mm256_mul_ps(fscal,dy13);
1625 tz = _mm256_mul_ps(fscal,dz13);
1627 /* Update vectorial force */
1628 fix1 = _mm256_add_ps(fix1,tx);
1629 fiy1 = _mm256_add_ps(fiy1,ty);
1630 fiz1 = _mm256_add_ps(fiz1,tz);
1632 fjx3 = _mm256_add_ps(fjx3,tx);
1633 fjy3 = _mm256_add_ps(fjy3,ty);
1634 fjz3 = _mm256_add_ps(fjz3,tz);
1636 /**************************
1637 * CALCULATE INTERACTIONS *
1638 **************************/
1640 /* REACTION-FIELD ELECTROSTATICS */
1641 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1645 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1647 /* Calculate temporary vectorial force */
1648 tx = _mm256_mul_ps(fscal,dx21);
1649 ty = _mm256_mul_ps(fscal,dy21);
1650 tz = _mm256_mul_ps(fscal,dz21);
1652 /* Update vectorial force */
1653 fix2 = _mm256_add_ps(fix2,tx);
1654 fiy2 = _mm256_add_ps(fiy2,ty);
1655 fiz2 = _mm256_add_ps(fiz2,tz);
1657 fjx1 = _mm256_add_ps(fjx1,tx);
1658 fjy1 = _mm256_add_ps(fjy1,ty);
1659 fjz1 = _mm256_add_ps(fjz1,tz);
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 /* REACTION-FIELD ELECTROSTATICS */
1666 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1670 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1672 /* Calculate temporary vectorial force */
1673 tx = _mm256_mul_ps(fscal,dx22);
1674 ty = _mm256_mul_ps(fscal,dy22);
1675 tz = _mm256_mul_ps(fscal,dz22);
1677 /* Update vectorial force */
1678 fix2 = _mm256_add_ps(fix2,tx);
1679 fiy2 = _mm256_add_ps(fiy2,ty);
1680 fiz2 = _mm256_add_ps(fiz2,tz);
1682 fjx2 = _mm256_add_ps(fjx2,tx);
1683 fjy2 = _mm256_add_ps(fjy2,ty);
1684 fjz2 = _mm256_add_ps(fjz2,tz);
1686 /**************************
1687 * CALCULATE INTERACTIONS *
1688 **************************/
1690 /* REACTION-FIELD ELECTROSTATICS */
1691 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1695 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1697 /* Calculate temporary vectorial force */
1698 tx = _mm256_mul_ps(fscal,dx23);
1699 ty = _mm256_mul_ps(fscal,dy23);
1700 tz = _mm256_mul_ps(fscal,dz23);
1702 /* Update vectorial force */
1703 fix2 = _mm256_add_ps(fix2,tx);
1704 fiy2 = _mm256_add_ps(fiy2,ty);
1705 fiz2 = _mm256_add_ps(fiz2,tz);
1707 fjx3 = _mm256_add_ps(fjx3,tx);
1708 fjy3 = _mm256_add_ps(fjy3,ty);
1709 fjz3 = _mm256_add_ps(fjz3,tz);
1711 /**************************
1712 * CALCULATE INTERACTIONS *
1713 **************************/
1715 /* REACTION-FIELD ELECTROSTATICS */
1716 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1720 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1722 /* Calculate temporary vectorial force */
1723 tx = _mm256_mul_ps(fscal,dx31);
1724 ty = _mm256_mul_ps(fscal,dy31);
1725 tz = _mm256_mul_ps(fscal,dz31);
1727 /* Update vectorial force */
1728 fix3 = _mm256_add_ps(fix3,tx);
1729 fiy3 = _mm256_add_ps(fiy3,ty);
1730 fiz3 = _mm256_add_ps(fiz3,tz);
1732 fjx1 = _mm256_add_ps(fjx1,tx);
1733 fjy1 = _mm256_add_ps(fjy1,ty);
1734 fjz1 = _mm256_add_ps(fjz1,tz);
1736 /**************************
1737 * CALCULATE INTERACTIONS *
1738 **************************/
1740 /* REACTION-FIELD ELECTROSTATICS */
1741 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1745 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1747 /* Calculate temporary vectorial force */
1748 tx = _mm256_mul_ps(fscal,dx32);
1749 ty = _mm256_mul_ps(fscal,dy32);
1750 tz = _mm256_mul_ps(fscal,dz32);
1752 /* Update vectorial force */
1753 fix3 = _mm256_add_ps(fix3,tx);
1754 fiy3 = _mm256_add_ps(fiy3,ty);
1755 fiz3 = _mm256_add_ps(fiz3,tz);
1757 fjx2 = _mm256_add_ps(fjx2,tx);
1758 fjy2 = _mm256_add_ps(fjy2,ty);
1759 fjz2 = _mm256_add_ps(fjz2,tz);
1761 /**************************
1762 * CALCULATE INTERACTIONS *
1763 **************************/
1765 /* REACTION-FIELD ELECTROSTATICS */
1766 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1770 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1772 /* Calculate temporary vectorial force */
1773 tx = _mm256_mul_ps(fscal,dx33);
1774 ty = _mm256_mul_ps(fscal,dy33);
1775 tz = _mm256_mul_ps(fscal,dz33);
1777 /* Update vectorial force */
1778 fix3 = _mm256_add_ps(fix3,tx);
1779 fiy3 = _mm256_add_ps(fiy3,ty);
1780 fiz3 = _mm256_add_ps(fiz3,tz);
1782 fjx3 = _mm256_add_ps(fjx3,tx);
1783 fjy3 = _mm256_add_ps(fjy3,ty);
1784 fjz3 = _mm256_add_ps(fjz3,tz);
1786 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1787 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1788 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1789 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1790 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1791 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1792 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1793 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1795 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1796 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1797 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1799 /* Inner loop uses 243 flops */
1802 /* End of innermost loop */
1804 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1805 f+i_coord_offset+DIM,fshift+i_shift_offset);
1807 /* Increment number of inner iterations */
1808 inneriter += j_index_end - j_index_start;
1810 /* Outer loop uses 18 flops */
1813 /* Increment number of outer iterations */
1816 /* Update outer/inner flops */
1818 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*243);