2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_single
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: None
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_avx_256_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrE,jnrF,jnrG,jnrH;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
85 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr1;
87 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 real * vdwioffsetptr3;
91 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
93 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
95 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
97 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
101 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
104 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
105 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
106 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
107 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
109 __m256 dummy_mask,cutoff_mask;
110 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
111 __m256 one = _mm256_set1_ps(1.0);
112 __m256 two = _mm256_set1_ps(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm256_set1_ps(fr->epsfac);
125 charge = mdatoms->chargeA;
126 krf = _mm256_set1_ps(fr->ic->k_rf);
127 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
128 crf = _mm256_set1_ps(fr->ic->c_rf);
130 /* Setup water-specific parameters */
131 inr = nlist->iinr[0];
132 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
133 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
134 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
136 jq1 = _mm256_set1_ps(charge[inr+1]);
137 jq2 = _mm256_set1_ps(charge[inr+2]);
138 jq3 = _mm256_set1_ps(charge[inr+3]);
139 qq11 = _mm256_mul_ps(iq1,jq1);
140 qq12 = _mm256_mul_ps(iq1,jq2);
141 qq13 = _mm256_mul_ps(iq1,jq3);
142 qq21 = _mm256_mul_ps(iq2,jq1);
143 qq22 = _mm256_mul_ps(iq2,jq2);
144 qq23 = _mm256_mul_ps(iq2,jq3);
145 qq31 = _mm256_mul_ps(iq3,jq1);
146 qq32 = _mm256_mul_ps(iq3,jq2);
147 qq33 = _mm256_mul_ps(iq3,jq3);
149 /* Avoid stupid compiler warnings */
150 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
163 for(iidx=0;iidx<4*DIM;iidx++)
168 /* Start outer loop over neighborlists */
169 for(iidx=0; iidx<nri; iidx++)
171 /* Load shift vector for this list */
172 i_shift_offset = DIM*shiftidx[iidx];
174 /* Load limits for loop over neighbors */
175 j_index_start = jindex[iidx];
176 j_index_end = jindex[iidx+1];
178 /* Get outer coordinate index */
180 i_coord_offset = DIM*inr;
182 /* Load i particle coords and add shift vector */
183 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
184 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
186 fix1 = _mm256_setzero_ps();
187 fiy1 = _mm256_setzero_ps();
188 fiz1 = _mm256_setzero_ps();
189 fix2 = _mm256_setzero_ps();
190 fiy2 = _mm256_setzero_ps();
191 fiz2 = _mm256_setzero_ps();
192 fix3 = _mm256_setzero_ps();
193 fiy3 = _mm256_setzero_ps();
194 fiz3 = _mm256_setzero_ps();
196 /* Reset potential sums */
197 velecsum = _mm256_setzero_ps();
199 /* Start inner kernel loop */
200 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
203 /* Get j neighbor index, and coordinate index */
212 j_coord_offsetA = DIM*jnrA;
213 j_coord_offsetB = DIM*jnrB;
214 j_coord_offsetC = DIM*jnrC;
215 j_coord_offsetD = DIM*jnrD;
216 j_coord_offsetE = DIM*jnrE;
217 j_coord_offsetF = DIM*jnrF;
218 j_coord_offsetG = DIM*jnrG;
219 j_coord_offsetH = DIM*jnrH;
221 /* load j atom coordinates */
222 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
223 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
224 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
225 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
226 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
228 /* Calculate displacement vector */
229 dx11 = _mm256_sub_ps(ix1,jx1);
230 dy11 = _mm256_sub_ps(iy1,jy1);
231 dz11 = _mm256_sub_ps(iz1,jz1);
232 dx12 = _mm256_sub_ps(ix1,jx2);
233 dy12 = _mm256_sub_ps(iy1,jy2);
234 dz12 = _mm256_sub_ps(iz1,jz2);
235 dx13 = _mm256_sub_ps(ix1,jx3);
236 dy13 = _mm256_sub_ps(iy1,jy3);
237 dz13 = _mm256_sub_ps(iz1,jz3);
238 dx21 = _mm256_sub_ps(ix2,jx1);
239 dy21 = _mm256_sub_ps(iy2,jy1);
240 dz21 = _mm256_sub_ps(iz2,jz1);
241 dx22 = _mm256_sub_ps(ix2,jx2);
242 dy22 = _mm256_sub_ps(iy2,jy2);
243 dz22 = _mm256_sub_ps(iz2,jz2);
244 dx23 = _mm256_sub_ps(ix2,jx3);
245 dy23 = _mm256_sub_ps(iy2,jy3);
246 dz23 = _mm256_sub_ps(iz2,jz3);
247 dx31 = _mm256_sub_ps(ix3,jx1);
248 dy31 = _mm256_sub_ps(iy3,jy1);
249 dz31 = _mm256_sub_ps(iz3,jz1);
250 dx32 = _mm256_sub_ps(ix3,jx2);
251 dy32 = _mm256_sub_ps(iy3,jy2);
252 dz32 = _mm256_sub_ps(iz3,jz2);
253 dx33 = _mm256_sub_ps(ix3,jx3);
254 dy33 = _mm256_sub_ps(iy3,jy3);
255 dz33 = _mm256_sub_ps(iz3,jz3);
257 /* Calculate squared distance and things based on it */
258 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
259 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
260 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
261 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
262 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
263 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
264 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
265 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
266 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
268 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
269 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
270 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
271 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
272 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
273 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
274 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
275 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
276 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
278 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
279 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
280 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
281 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
282 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
283 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
284 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
285 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
286 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
288 fjx1 = _mm256_setzero_ps();
289 fjy1 = _mm256_setzero_ps();
290 fjz1 = _mm256_setzero_ps();
291 fjx2 = _mm256_setzero_ps();
292 fjy2 = _mm256_setzero_ps();
293 fjz2 = _mm256_setzero_ps();
294 fjx3 = _mm256_setzero_ps();
295 fjy3 = _mm256_setzero_ps();
296 fjz3 = _mm256_setzero_ps();
298 /**************************
299 * CALCULATE INTERACTIONS *
300 **************************/
302 /* REACTION-FIELD ELECTROSTATICS */
303 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
304 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
306 /* Update potential sum for this i atom from the interaction with this j atom. */
307 velecsum = _mm256_add_ps(velecsum,velec);
311 /* Calculate temporary vectorial force */
312 tx = _mm256_mul_ps(fscal,dx11);
313 ty = _mm256_mul_ps(fscal,dy11);
314 tz = _mm256_mul_ps(fscal,dz11);
316 /* Update vectorial force */
317 fix1 = _mm256_add_ps(fix1,tx);
318 fiy1 = _mm256_add_ps(fiy1,ty);
319 fiz1 = _mm256_add_ps(fiz1,tz);
321 fjx1 = _mm256_add_ps(fjx1,tx);
322 fjy1 = _mm256_add_ps(fjy1,ty);
323 fjz1 = _mm256_add_ps(fjz1,tz);
325 /**************************
326 * CALCULATE INTERACTIONS *
327 **************************/
329 /* REACTION-FIELD ELECTROSTATICS */
330 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
331 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velecsum = _mm256_add_ps(velecsum,velec);
338 /* Calculate temporary vectorial force */
339 tx = _mm256_mul_ps(fscal,dx12);
340 ty = _mm256_mul_ps(fscal,dy12);
341 tz = _mm256_mul_ps(fscal,dz12);
343 /* Update vectorial force */
344 fix1 = _mm256_add_ps(fix1,tx);
345 fiy1 = _mm256_add_ps(fiy1,ty);
346 fiz1 = _mm256_add_ps(fiz1,tz);
348 fjx2 = _mm256_add_ps(fjx2,tx);
349 fjy2 = _mm256_add_ps(fjy2,ty);
350 fjz2 = _mm256_add_ps(fjz2,tz);
352 /**************************
353 * CALCULATE INTERACTIONS *
354 **************************/
356 /* REACTION-FIELD ELECTROSTATICS */
357 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
358 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
360 /* Update potential sum for this i atom from the interaction with this j atom. */
361 velecsum = _mm256_add_ps(velecsum,velec);
365 /* Calculate temporary vectorial force */
366 tx = _mm256_mul_ps(fscal,dx13);
367 ty = _mm256_mul_ps(fscal,dy13);
368 tz = _mm256_mul_ps(fscal,dz13);
370 /* Update vectorial force */
371 fix1 = _mm256_add_ps(fix1,tx);
372 fiy1 = _mm256_add_ps(fiy1,ty);
373 fiz1 = _mm256_add_ps(fiz1,tz);
375 fjx3 = _mm256_add_ps(fjx3,tx);
376 fjy3 = _mm256_add_ps(fjy3,ty);
377 fjz3 = _mm256_add_ps(fjz3,tz);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 /* REACTION-FIELD ELECTROSTATICS */
384 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
385 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
387 /* Update potential sum for this i atom from the interaction with this j atom. */
388 velecsum = _mm256_add_ps(velecsum,velec);
392 /* Calculate temporary vectorial force */
393 tx = _mm256_mul_ps(fscal,dx21);
394 ty = _mm256_mul_ps(fscal,dy21);
395 tz = _mm256_mul_ps(fscal,dz21);
397 /* Update vectorial force */
398 fix2 = _mm256_add_ps(fix2,tx);
399 fiy2 = _mm256_add_ps(fiy2,ty);
400 fiz2 = _mm256_add_ps(fiz2,tz);
402 fjx1 = _mm256_add_ps(fjx1,tx);
403 fjy1 = _mm256_add_ps(fjy1,ty);
404 fjz1 = _mm256_add_ps(fjz1,tz);
406 /**************************
407 * CALCULATE INTERACTIONS *
408 **************************/
410 /* REACTION-FIELD ELECTROSTATICS */
411 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
412 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
414 /* Update potential sum for this i atom from the interaction with this j atom. */
415 velecsum = _mm256_add_ps(velecsum,velec);
419 /* Calculate temporary vectorial force */
420 tx = _mm256_mul_ps(fscal,dx22);
421 ty = _mm256_mul_ps(fscal,dy22);
422 tz = _mm256_mul_ps(fscal,dz22);
424 /* Update vectorial force */
425 fix2 = _mm256_add_ps(fix2,tx);
426 fiy2 = _mm256_add_ps(fiy2,ty);
427 fiz2 = _mm256_add_ps(fiz2,tz);
429 fjx2 = _mm256_add_ps(fjx2,tx);
430 fjy2 = _mm256_add_ps(fjy2,ty);
431 fjz2 = _mm256_add_ps(fjz2,tz);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 /* REACTION-FIELD ELECTROSTATICS */
438 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
439 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum = _mm256_add_ps(velecsum,velec);
446 /* Calculate temporary vectorial force */
447 tx = _mm256_mul_ps(fscal,dx23);
448 ty = _mm256_mul_ps(fscal,dy23);
449 tz = _mm256_mul_ps(fscal,dz23);
451 /* Update vectorial force */
452 fix2 = _mm256_add_ps(fix2,tx);
453 fiy2 = _mm256_add_ps(fiy2,ty);
454 fiz2 = _mm256_add_ps(fiz2,tz);
456 fjx3 = _mm256_add_ps(fjx3,tx);
457 fjy3 = _mm256_add_ps(fjy3,ty);
458 fjz3 = _mm256_add_ps(fjz3,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 /* REACTION-FIELD ELECTROSTATICS */
465 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
466 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
468 /* Update potential sum for this i atom from the interaction with this j atom. */
469 velecsum = _mm256_add_ps(velecsum,velec);
473 /* Calculate temporary vectorial force */
474 tx = _mm256_mul_ps(fscal,dx31);
475 ty = _mm256_mul_ps(fscal,dy31);
476 tz = _mm256_mul_ps(fscal,dz31);
478 /* Update vectorial force */
479 fix3 = _mm256_add_ps(fix3,tx);
480 fiy3 = _mm256_add_ps(fiy3,ty);
481 fiz3 = _mm256_add_ps(fiz3,tz);
483 fjx1 = _mm256_add_ps(fjx1,tx);
484 fjy1 = _mm256_add_ps(fjy1,ty);
485 fjz1 = _mm256_add_ps(fjz1,tz);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 /* REACTION-FIELD ELECTROSTATICS */
492 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
493 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
495 /* Update potential sum for this i atom from the interaction with this j atom. */
496 velecsum = _mm256_add_ps(velecsum,velec);
500 /* Calculate temporary vectorial force */
501 tx = _mm256_mul_ps(fscal,dx32);
502 ty = _mm256_mul_ps(fscal,dy32);
503 tz = _mm256_mul_ps(fscal,dz32);
505 /* Update vectorial force */
506 fix3 = _mm256_add_ps(fix3,tx);
507 fiy3 = _mm256_add_ps(fiy3,ty);
508 fiz3 = _mm256_add_ps(fiz3,tz);
510 fjx2 = _mm256_add_ps(fjx2,tx);
511 fjy2 = _mm256_add_ps(fjy2,ty);
512 fjz2 = _mm256_add_ps(fjz2,tz);
514 /**************************
515 * CALCULATE INTERACTIONS *
516 **************************/
518 /* REACTION-FIELD ELECTROSTATICS */
519 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
520 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velecsum = _mm256_add_ps(velecsum,velec);
527 /* Calculate temporary vectorial force */
528 tx = _mm256_mul_ps(fscal,dx33);
529 ty = _mm256_mul_ps(fscal,dy33);
530 tz = _mm256_mul_ps(fscal,dz33);
532 /* Update vectorial force */
533 fix3 = _mm256_add_ps(fix3,tx);
534 fiy3 = _mm256_add_ps(fiy3,ty);
535 fiz3 = _mm256_add_ps(fiz3,tz);
537 fjx3 = _mm256_add_ps(fjx3,tx);
538 fjy3 = _mm256_add_ps(fjy3,ty);
539 fjz3 = _mm256_add_ps(fjz3,tz);
541 fjptrA = f+j_coord_offsetA;
542 fjptrB = f+j_coord_offsetB;
543 fjptrC = f+j_coord_offsetC;
544 fjptrD = f+j_coord_offsetD;
545 fjptrE = f+j_coord_offsetE;
546 fjptrF = f+j_coord_offsetF;
547 fjptrG = f+j_coord_offsetG;
548 fjptrH = f+j_coord_offsetH;
550 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
551 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
552 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
554 /* Inner loop uses 288 flops */
560 /* Get j neighbor index, and coordinate index */
561 jnrlistA = jjnr[jidx];
562 jnrlistB = jjnr[jidx+1];
563 jnrlistC = jjnr[jidx+2];
564 jnrlistD = jjnr[jidx+3];
565 jnrlistE = jjnr[jidx+4];
566 jnrlistF = jjnr[jidx+5];
567 jnrlistG = jjnr[jidx+6];
568 jnrlistH = jjnr[jidx+7];
569 /* Sign of each element will be negative for non-real atoms.
570 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
571 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
573 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
574 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
576 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
577 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
578 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
579 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
580 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
581 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
582 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
583 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
584 j_coord_offsetA = DIM*jnrA;
585 j_coord_offsetB = DIM*jnrB;
586 j_coord_offsetC = DIM*jnrC;
587 j_coord_offsetD = DIM*jnrD;
588 j_coord_offsetE = DIM*jnrE;
589 j_coord_offsetF = DIM*jnrF;
590 j_coord_offsetG = DIM*jnrG;
591 j_coord_offsetH = DIM*jnrH;
593 /* load j atom coordinates */
594 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
595 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
596 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
597 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
598 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
600 /* Calculate displacement vector */
601 dx11 = _mm256_sub_ps(ix1,jx1);
602 dy11 = _mm256_sub_ps(iy1,jy1);
603 dz11 = _mm256_sub_ps(iz1,jz1);
604 dx12 = _mm256_sub_ps(ix1,jx2);
605 dy12 = _mm256_sub_ps(iy1,jy2);
606 dz12 = _mm256_sub_ps(iz1,jz2);
607 dx13 = _mm256_sub_ps(ix1,jx3);
608 dy13 = _mm256_sub_ps(iy1,jy3);
609 dz13 = _mm256_sub_ps(iz1,jz3);
610 dx21 = _mm256_sub_ps(ix2,jx1);
611 dy21 = _mm256_sub_ps(iy2,jy1);
612 dz21 = _mm256_sub_ps(iz2,jz1);
613 dx22 = _mm256_sub_ps(ix2,jx2);
614 dy22 = _mm256_sub_ps(iy2,jy2);
615 dz22 = _mm256_sub_ps(iz2,jz2);
616 dx23 = _mm256_sub_ps(ix2,jx3);
617 dy23 = _mm256_sub_ps(iy2,jy3);
618 dz23 = _mm256_sub_ps(iz2,jz3);
619 dx31 = _mm256_sub_ps(ix3,jx1);
620 dy31 = _mm256_sub_ps(iy3,jy1);
621 dz31 = _mm256_sub_ps(iz3,jz1);
622 dx32 = _mm256_sub_ps(ix3,jx2);
623 dy32 = _mm256_sub_ps(iy3,jy2);
624 dz32 = _mm256_sub_ps(iz3,jz2);
625 dx33 = _mm256_sub_ps(ix3,jx3);
626 dy33 = _mm256_sub_ps(iy3,jy3);
627 dz33 = _mm256_sub_ps(iz3,jz3);
629 /* Calculate squared distance and things based on it */
630 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
631 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
632 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
633 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
634 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
635 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
636 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
637 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
638 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
640 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
641 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
642 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
643 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
644 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
645 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
646 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
647 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
648 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
650 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
651 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
652 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
653 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
654 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
655 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
656 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
657 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
658 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
660 fjx1 = _mm256_setzero_ps();
661 fjy1 = _mm256_setzero_ps();
662 fjz1 = _mm256_setzero_ps();
663 fjx2 = _mm256_setzero_ps();
664 fjy2 = _mm256_setzero_ps();
665 fjz2 = _mm256_setzero_ps();
666 fjx3 = _mm256_setzero_ps();
667 fjy3 = _mm256_setzero_ps();
668 fjz3 = _mm256_setzero_ps();
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 /* REACTION-FIELD ELECTROSTATICS */
675 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
676 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
678 /* Update potential sum for this i atom from the interaction with this j atom. */
679 velec = _mm256_andnot_ps(dummy_mask,velec);
680 velecsum = _mm256_add_ps(velecsum,velec);
684 fscal = _mm256_andnot_ps(dummy_mask,fscal);
686 /* Calculate temporary vectorial force */
687 tx = _mm256_mul_ps(fscal,dx11);
688 ty = _mm256_mul_ps(fscal,dy11);
689 tz = _mm256_mul_ps(fscal,dz11);
691 /* Update vectorial force */
692 fix1 = _mm256_add_ps(fix1,tx);
693 fiy1 = _mm256_add_ps(fiy1,ty);
694 fiz1 = _mm256_add_ps(fiz1,tz);
696 fjx1 = _mm256_add_ps(fjx1,tx);
697 fjy1 = _mm256_add_ps(fjy1,ty);
698 fjz1 = _mm256_add_ps(fjz1,tz);
700 /**************************
701 * CALCULATE INTERACTIONS *
702 **************************/
704 /* REACTION-FIELD ELECTROSTATICS */
705 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
706 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
708 /* Update potential sum for this i atom from the interaction with this j atom. */
709 velec = _mm256_andnot_ps(dummy_mask,velec);
710 velecsum = _mm256_add_ps(velecsum,velec);
714 fscal = _mm256_andnot_ps(dummy_mask,fscal);
716 /* Calculate temporary vectorial force */
717 tx = _mm256_mul_ps(fscal,dx12);
718 ty = _mm256_mul_ps(fscal,dy12);
719 tz = _mm256_mul_ps(fscal,dz12);
721 /* Update vectorial force */
722 fix1 = _mm256_add_ps(fix1,tx);
723 fiy1 = _mm256_add_ps(fiy1,ty);
724 fiz1 = _mm256_add_ps(fiz1,tz);
726 fjx2 = _mm256_add_ps(fjx2,tx);
727 fjy2 = _mm256_add_ps(fjy2,ty);
728 fjz2 = _mm256_add_ps(fjz2,tz);
730 /**************************
731 * CALCULATE INTERACTIONS *
732 **************************/
734 /* REACTION-FIELD ELECTROSTATICS */
735 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
736 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
738 /* Update potential sum for this i atom from the interaction with this j atom. */
739 velec = _mm256_andnot_ps(dummy_mask,velec);
740 velecsum = _mm256_add_ps(velecsum,velec);
744 fscal = _mm256_andnot_ps(dummy_mask,fscal);
746 /* Calculate temporary vectorial force */
747 tx = _mm256_mul_ps(fscal,dx13);
748 ty = _mm256_mul_ps(fscal,dy13);
749 tz = _mm256_mul_ps(fscal,dz13);
751 /* Update vectorial force */
752 fix1 = _mm256_add_ps(fix1,tx);
753 fiy1 = _mm256_add_ps(fiy1,ty);
754 fiz1 = _mm256_add_ps(fiz1,tz);
756 fjx3 = _mm256_add_ps(fjx3,tx);
757 fjy3 = _mm256_add_ps(fjy3,ty);
758 fjz3 = _mm256_add_ps(fjz3,tz);
760 /**************************
761 * CALCULATE INTERACTIONS *
762 **************************/
764 /* REACTION-FIELD ELECTROSTATICS */
765 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
766 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
768 /* Update potential sum for this i atom from the interaction with this j atom. */
769 velec = _mm256_andnot_ps(dummy_mask,velec);
770 velecsum = _mm256_add_ps(velecsum,velec);
774 fscal = _mm256_andnot_ps(dummy_mask,fscal);
776 /* Calculate temporary vectorial force */
777 tx = _mm256_mul_ps(fscal,dx21);
778 ty = _mm256_mul_ps(fscal,dy21);
779 tz = _mm256_mul_ps(fscal,dz21);
781 /* Update vectorial force */
782 fix2 = _mm256_add_ps(fix2,tx);
783 fiy2 = _mm256_add_ps(fiy2,ty);
784 fiz2 = _mm256_add_ps(fiz2,tz);
786 fjx1 = _mm256_add_ps(fjx1,tx);
787 fjy1 = _mm256_add_ps(fjy1,ty);
788 fjz1 = _mm256_add_ps(fjz1,tz);
790 /**************************
791 * CALCULATE INTERACTIONS *
792 **************************/
794 /* REACTION-FIELD ELECTROSTATICS */
795 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
796 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
798 /* Update potential sum for this i atom from the interaction with this j atom. */
799 velec = _mm256_andnot_ps(dummy_mask,velec);
800 velecsum = _mm256_add_ps(velecsum,velec);
804 fscal = _mm256_andnot_ps(dummy_mask,fscal);
806 /* Calculate temporary vectorial force */
807 tx = _mm256_mul_ps(fscal,dx22);
808 ty = _mm256_mul_ps(fscal,dy22);
809 tz = _mm256_mul_ps(fscal,dz22);
811 /* Update vectorial force */
812 fix2 = _mm256_add_ps(fix2,tx);
813 fiy2 = _mm256_add_ps(fiy2,ty);
814 fiz2 = _mm256_add_ps(fiz2,tz);
816 fjx2 = _mm256_add_ps(fjx2,tx);
817 fjy2 = _mm256_add_ps(fjy2,ty);
818 fjz2 = _mm256_add_ps(fjz2,tz);
820 /**************************
821 * CALCULATE INTERACTIONS *
822 **************************/
824 /* REACTION-FIELD ELECTROSTATICS */
825 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
826 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
828 /* Update potential sum for this i atom from the interaction with this j atom. */
829 velec = _mm256_andnot_ps(dummy_mask,velec);
830 velecsum = _mm256_add_ps(velecsum,velec);
834 fscal = _mm256_andnot_ps(dummy_mask,fscal);
836 /* Calculate temporary vectorial force */
837 tx = _mm256_mul_ps(fscal,dx23);
838 ty = _mm256_mul_ps(fscal,dy23);
839 tz = _mm256_mul_ps(fscal,dz23);
841 /* Update vectorial force */
842 fix2 = _mm256_add_ps(fix2,tx);
843 fiy2 = _mm256_add_ps(fiy2,ty);
844 fiz2 = _mm256_add_ps(fiz2,tz);
846 fjx3 = _mm256_add_ps(fjx3,tx);
847 fjy3 = _mm256_add_ps(fjy3,ty);
848 fjz3 = _mm256_add_ps(fjz3,tz);
850 /**************************
851 * CALCULATE INTERACTIONS *
852 **************************/
854 /* REACTION-FIELD ELECTROSTATICS */
855 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
856 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
858 /* Update potential sum for this i atom from the interaction with this j atom. */
859 velec = _mm256_andnot_ps(dummy_mask,velec);
860 velecsum = _mm256_add_ps(velecsum,velec);
864 fscal = _mm256_andnot_ps(dummy_mask,fscal);
866 /* Calculate temporary vectorial force */
867 tx = _mm256_mul_ps(fscal,dx31);
868 ty = _mm256_mul_ps(fscal,dy31);
869 tz = _mm256_mul_ps(fscal,dz31);
871 /* Update vectorial force */
872 fix3 = _mm256_add_ps(fix3,tx);
873 fiy3 = _mm256_add_ps(fiy3,ty);
874 fiz3 = _mm256_add_ps(fiz3,tz);
876 fjx1 = _mm256_add_ps(fjx1,tx);
877 fjy1 = _mm256_add_ps(fjy1,ty);
878 fjz1 = _mm256_add_ps(fjz1,tz);
880 /**************************
881 * CALCULATE INTERACTIONS *
882 **************************/
884 /* REACTION-FIELD ELECTROSTATICS */
885 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
886 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
888 /* Update potential sum for this i atom from the interaction with this j atom. */
889 velec = _mm256_andnot_ps(dummy_mask,velec);
890 velecsum = _mm256_add_ps(velecsum,velec);
894 fscal = _mm256_andnot_ps(dummy_mask,fscal);
896 /* Calculate temporary vectorial force */
897 tx = _mm256_mul_ps(fscal,dx32);
898 ty = _mm256_mul_ps(fscal,dy32);
899 tz = _mm256_mul_ps(fscal,dz32);
901 /* Update vectorial force */
902 fix3 = _mm256_add_ps(fix3,tx);
903 fiy3 = _mm256_add_ps(fiy3,ty);
904 fiz3 = _mm256_add_ps(fiz3,tz);
906 fjx2 = _mm256_add_ps(fjx2,tx);
907 fjy2 = _mm256_add_ps(fjy2,ty);
908 fjz2 = _mm256_add_ps(fjz2,tz);
910 /**************************
911 * CALCULATE INTERACTIONS *
912 **************************/
914 /* REACTION-FIELD ELECTROSTATICS */
915 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
916 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
918 /* Update potential sum for this i atom from the interaction with this j atom. */
919 velec = _mm256_andnot_ps(dummy_mask,velec);
920 velecsum = _mm256_add_ps(velecsum,velec);
924 fscal = _mm256_andnot_ps(dummy_mask,fscal);
926 /* Calculate temporary vectorial force */
927 tx = _mm256_mul_ps(fscal,dx33);
928 ty = _mm256_mul_ps(fscal,dy33);
929 tz = _mm256_mul_ps(fscal,dz33);
931 /* Update vectorial force */
932 fix3 = _mm256_add_ps(fix3,tx);
933 fiy3 = _mm256_add_ps(fiy3,ty);
934 fiz3 = _mm256_add_ps(fiz3,tz);
936 fjx3 = _mm256_add_ps(fjx3,tx);
937 fjy3 = _mm256_add_ps(fjy3,ty);
938 fjz3 = _mm256_add_ps(fjz3,tz);
940 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
941 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
942 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
943 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
944 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
945 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
946 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
947 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
949 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
950 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
951 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
953 /* Inner loop uses 288 flops */
956 /* End of innermost loop */
958 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
959 f+i_coord_offset+DIM,fshift+i_shift_offset);
962 /* Update potential energies */
963 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
965 /* Increment number of inner iterations */
966 inneriter += j_index_end - j_index_start;
968 /* Outer loop uses 19 flops */
971 /* Increment number of outer iterations */
974 /* Update outer/inner flops */
976 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*288);
979 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_single
980 * Electrostatics interaction: ReactionField
981 * VdW interaction: None
982 * Geometry: Water4-Water4
983 * Calculate force/pot: Force
986 nb_kernel_ElecRF_VdwNone_GeomW4W4_F_avx_256_single
987 (t_nblist * gmx_restrict nlist,
988 rvec * gmx_restrict xx,
989 rvec * gmx_restrict ff,
990 t_forcerec * gmx_restrict fr,
991 t_mdatoms * gmx_restrict mdatoms,
992 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
993 t_nrnb * gmx_restrict nrnb)
995 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
996 * just 0 for non-waters.
997 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
998 * jnr indices corresponding to data put in the four positions in the SIMD register.
1000 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1001 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1002 int jnrA,jnrB,jnrC,jnrD;
1003 int jnrE,jnrF,jnrG,jnrH;
1004 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1005 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1006 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1007 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1008 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1009 real rcutoff_scalar;
1010 real *shiftvec,*fshift,*x,*f;
1011 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1012 real scratch[4*DIM];
1013 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1014 real * vdwioffsetptr1;
1015 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1016 real * vdwioffsetptr2;
1017 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1018 real * vdwioffsetptr3;
1019 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1020 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1021 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1022 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1023 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1024 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1025 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1026 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1027 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1028 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1029 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1030 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1031 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1032 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1033 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1034 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1035 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1037 __m256 dummy_mask,cutoff_mask;
1038 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1039 __m256 one = _mm256_set1_ps(1.0);
1040 __m256 two = _mm256_set1_ps(2.0);
1046 jindex = nlist->jindex;
1048 shiftidx = nlist->shift;
1050 shiftvec = fr->shift_vec[0];
1051 fshift = fr->fshift[0];
1052 facel = _mm256_set1_ps(fr->epsfac);
1053 charge = mdatoms->chargeA;
1054 krf = _mm256_set1_ps(fr->ic->k_rf);
1055 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
1056 crf = _mm256_set1_ps(fr->ic->c_rf);
1058 /* Setup water-specific parameters */
1059 inr = nlist->iinr[0];
1060 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1061 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1062 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1064 jq1 = _mm256_set1_ps(charge[inr+1]);
1065 jq2 = _mm256_set1_ps(charge[inr+2]);
1066 jq3 = _mm256_set1_ps(charge[inr+3]);
1067 qq11 = _mm256_mul_ps(iq1,jq1);
1068 qq12 = _mm256_mul_ps(iq1,jq2);
1069 qq13 = _mm256_mul_ps(iq1,jq3);
1070 qq21 = _mm256_mul_ps(iq2,jq1);
1071 qq22 = _mm256_mul_ps(iq2,jq2);
1072 qq23 = _mm256_mul_ps(iq2,jq3);
1073 qq31 = _mm256_mul_ps(iq3,jq1);
1074 qq32 = _mm256_mul_ps(iq3,jq2);
1075 qq33 = _mm256_mul_ps(iq3,jq3);
1077 /* Avoid stupid compiler warnings */
1078 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1079 j_coord_offsetA = 0;
1080 j_coord_offsetB = 0;
1081 j_coord_offsetC = 0;
1082 j_coord_offsetD = 0;
1083 j_coord_offsetE = 0;
1084 j_coord_offsetF = 0;
1085 j_coord_offsetG = 0;
1086 j_coord_offsetH = 0;
1091 for(iidx=0;iidx<4*DIM;iidx++)
1093 scratch[iidx] = 0.0;
1096 /* Start outer loop over neighborlists */
1097 for(iidx=0; iidx<nri; iidx++)
1099 /* Load shift vector for this list */
1100 i_shift_offset = DIM*shiftidx[iidx];
1102 /* Load limits for loop over neighbors */
1103 j_index_start = jindex[iidx];
1104 j_index_end = jindex[iidx+1];
1106 /* Get outer coordinate index */
1108 i_coord_offset = DIM*inr;
1110 /* Load i particle coords and add shift vector */
1111 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1112 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1114 fix1 = _mm256_setzero_ps();
1115 fiy1 = _mm256_setzero_ps();
1116 fiz1 = _mm256_setzero_ps();
1117 fix2 = _mm256_setzero_ps();
1118 fiy2 = _mm256_setzero_ps();
1119 fiz2 = _mm256_setzero_ps();
1120 fix3 = _mm256_setzero_ps();
1121 fiy3 = _mm256_setzero_ps();
1122 fiz3 = _mm256_setzero_ps();
1124 /* Start inner kernel loop */
1125 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1128 /* Get j neighbor index, and coordinate index */
1130 jnrB = jjnr[jidx+1];
1131 jnrC = jjnr[jidx+2];
1132 jnrD = jjnr[jidx+3];
1133 jnrE = jjnr[jidx+4];
1134 jnrF = jjnr[jidx+5];
1135 jnrG = jjnr[jidx+6];
1136 jnrH = jjnr[jidx+7];
1137 j_coord_offsetA = DIM*jnrA;
1138 j_coord_offsetB = DIM*jnrB;
1139 j_coord_offsetC = DIM*jnrC;
1140 j_coord_offsetD = DIM*jnrD;
1141 j_coord_offsetE = DIM*jnrE;
1142 j_coord_offsetF = DIM*jnrF;
1143 j_coord_offsetG = DIM*jnrG;
1144 j_coord_offsetH = DIM*jnrH;
1146 /* load j atom coordinates */
1147 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1148 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1149 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1150 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1151 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1153 /* Calculate displacement vector */
1154 dx11 = _mm256_sub_ps(ix1,jx1);
1155 dy11 = _mm256_sub_ps(iy1,jy1);
1156 dz11 = _mm256_sub_ps(iz1,jz1);
1157 dx12 = _mm256_sub_ps(ix1,jx2);
1158 dy12 = _mm256_sub_ps(iy1,jy2);
1159 dz12 = _mm256_sub_ps(iz1,jz2);
1160 dx13 = _mm256_sub_ps(ix1,jx3);
1161 dy13 = _mm256_sub_ps(iy1,jy3);
1162 dz13 = _mm256_sub_ps(iz1,jz3);
1163 dx21 = _mm256_sub_ps(ix2,jx1);
1164 dy21 = _mm256_sub_ps(iy2,jy1);
1165 dz21 = _mm256_sub_ps(iz2,jz1);
1166 dx22 = _mm256_sub_ps(ix2,jx2);
1167 dy22 = _mm256_sub_ps(iy2,jy2);
1168 dz22 = _mm256_sub_ps(iz2,jz2);
1169 dx23 = _mm256_sub_ps(ix2,jx3);
1170 dy23 = _mm256_sub_ps(iy2,jy3);
1171 dz23 = _mm256_sub_ps(iz2,jz3);
1172 dx31 = _mm256_sub_ps(ix3,jx1);
1173 dy31 = _mm256_sub_ps(iy3,jy1);
1174 dz31 = _mm256_sub_ps(iz3,jz1);
1175 dx32 = _mm256_sub_ps(ix3,jx2);
1176 dy32 = _mm256_sub_ps(iy3,jy2);
1177 dz32 = _mm256_sub_ps(iz3,jz2);
1178 dx33 = _mm256_sub_ps(ix3,jx3);
1179 dy33 = _mm256_sub_ps(iy3,jy3);
1180 dz33 = _mm256_sub_ps(iz3,jz3);
1182 /* Calculate squared distance and things based on it */
1183 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1184 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1185 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1186 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1187 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1188 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1189 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1190 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1191 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1193 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1194 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1195 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1196 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1197 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1198 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1199 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1200 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1201 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1203 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1204 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1205 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1206 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1207 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1208 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1209 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1210 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1211 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1213 fjx1 = _mm256_setzero_ps();
1214 fjy1 = _mm256_setzero_ps();
1215 fjz1 = _mm256_setzero_ps();
1216 fjx2 = _mm256_setzero_ps();
1217 fjy2 = _mm256_setzero_ps();
1218 fjz2 = _mm256_setzero_ps();
1219 fjx3 = _mm256_setzero_ps();
1220 fjy3 = _mm256_setzero_ps();
1221 fjz3 = _mm256_setzero_ps();
1223 /**************************
1224 * CALCULATE INTERACTIONS *
1225 **************************/
1227 /* REACTION-FIELD ELECTROSTATICS */
1228 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1232 /* Calculate temporary vectorial force */
1233 tx = _mm256_mul_ps(fscal,dx11);
1234 ty = _mm256_mul_ps(fscal,dy11);
1235 tz = _mm256_mul_ps(fscal,dz11);
1237 /* Update vectorial force */
1238 fix1 = _mm256_add_ps(fix1,tx);
1239 fiy1 = _mm256_add_ps(fiy1,ty);
1240 fiz1 = _mm256_add_ps(fiz1,tz);
1242 fjx1 = _mm256_add_ps(fjx1,tx);
1243 fjy1 = _mm256_add_ps(fjy1,ty);
1244 fjz1 = _mm256_add_ps(fjz1,tz);
1246 /**************************
1247 * CALCULATE INTERACTIONS *
1248 **************************/
1250 /* REACTION-FIELD ELECTROSTATICS */
1251 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1255 /* Calculate temporary vectorial force */
1256 tx = _mm256_mul_ps(fscal,dx12);
1257 ty = _mm256_mul_ps(fscal,dy12);
1258 tz = _mm256_mul_ps(fscal,dz12);
1260 /* Update vectorial force */
1261 fix1 = _mm256_add_ps(fix1,tx);
1262 fiy1 = _mm256_add_ps(fiy1,ty);
1263 fiz1 = _mm256_add_ps(fiz1,tz);
1265 fjx2 = _mm256_add_ps(fjx2,tx);
1266 fjy2 = _mm256_add_ps(fjy2,ty);
1267 fjz2 = _mm256_add_ps(fjz2,tz);
1269 /**************************
1270 * CALCULATE INTERACTIONS *
1271 **************************/
1273 /* REACTION-FIELD ELECTROSTATICS */
1274 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1278 /* Calculate temporary vectorial force */
1279 tx = _mm256_mul_ps(fscal,dx13);
1280 ty = _mm256_mul_ps(fscal,dy13);
1281 tz = _mm256_mul_ps(fscal,dz13);
1283 /* Update vectorial force */
1284 fix1 = _mm256_add_ps(fix1,tx);
1285 fiy1 = _mm256_add_ps(fiy1,ty);
1286 fiz1 = _mm256_add_ps(fiz1,tz);
1288 fjx3 = _mm256_add_ps(fjx3,tx);
1289 fjy3 = _mm256_add_ps(fjy3,ty);
1290 fjz3 = _mm256_add_ps(fjz3,tz);
1292 /**************************
1293 * CALCULATE INTERACTIONS *
1294 **************************/
1296 /* REACTION-FIELD ELECTROSTATICS */
1297 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1301 /* Calculate temporary vectorial force */
1302 tx = _mm256_mul_ps(fscal,dx21);
1303 ty = _mm256_mul_ps(fscal,dy21);
1304 tz = _mm256_mul_ps(fscal,dz21);
1306 /* Update vectorial force */
1307 fix2 = _mm256_add_ps(fix2,tx);
1308 fiy2 = _mm256_add_ps(fiy2,ty);
1309 fiz2 = _mm256_add_ps(fiz2,tz);
1311 fjx1 = _mm256_add_ps(fjx1,tx);
1312 fjy1 = _mm256_add_ps(fjy1,ty);
1313 fjz1 = _mm256_add_ps(fjz1,tz);
1315 /**************************
1316 * CALCULATE INTERACTIONS *
1317 **************************/
1319 /* REACTION-FIELD ELECTROSTATICS */
1320 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1324 /* Calculate temporary vectorial force */
1325 tx = _mm256_mul_ps(fscal,dx22);
1326 ty = _mm256_mul_ps(fscal,dy22);
1327 tz = _mm256_mul_ps(fscal,dz22);
1329 /* Update vectorial force */
1330 fix2 = _mm256_add_ps(fix2,tx);
1331 fiy2 = _mm256_add_ps(fiy2,ty);
1332 fiz2 = _mm256_add_ps(fiz2,tz);
1334 fjx2 = _mm256_add_ps(fjx2,tx);
1335 fjy2 = _mm256_add_ps(fjy2,ty);
1336 fjz2 = _mm256_add_ps(fjz2,tz);
1338 /**************************
1339 * CALCULATE INTERACTIONS *
1340 **************************/
1342 /* REACTION-FIELD ELECTROSTATICS */
1343 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1347 /* Calculate temporary vectorial force */
1348 tx = _mm256_mul_ps(fscal,dx23);
1349 ty = _mm256_mul_ps(fscal,dy23);
1350 tz = _mm256_mul_ps(fscal,dz23);
1352 /* Update vectorial force */
1353 fix2 = _mm256_add_ps(fix2,tx);
1354 fiy2 = _mm256_add_ps(fiy2,ty);
1355 fiz2 = _mm256_add_ps(fiz2,tz);
1357 fjx3 = _mm256_add_ps(fjx3,tx);
1358 fjy3 = _mm256_add_ps(fjy3,ty);
1359 fjz3 = _mm256_add_ps(fjz3,tz);
1361 /**************************
1362 * CALCULATE INTERACTIONS *
1363 **************************/
1365 /* REACTION-FIELD ELECTROSTATICS */
1366 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1370 /* Calculate temporary vectorial force */
1371 tx = _mm256_mul_ps(fscal,dx31);
1372 ty = _mm256_mul_ps(fscal,dy31);
1373 tz = _mm256_mul_ps(fscal,dz31);
1375 /* Update vectorial force */
1376 fix3 = _mm256_add_ps(fix3,tx);
1377 fiy3 = _mm256_add_ps(fiy3,ty);
1378 fiz3 = _mm256_add_ps(fiz3,tz);
1380 fjx1 = _mm256_add_ps(fjx1,tx);
1381 fjy1 = _mm256_add_ps(fjy1,ty);
1382 fjz1 = _mm256_add_ps(fjz1,tz);
1384 /**************************
1385 * CALCULATE INTERACTIONS *
1386 **************************/
1388 /* REACTION-FIELD ELECTROSTATICS */
1389 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1393 /* Calculate temporary vectorial force */
1394 tx = _mm256_mul_ps(fscal,dx32);
1395 ty = _mm256_mul_ps(fscal,dy32);
1396 tz = _mm256_mul_ps(fscal,dz32);
1398 /* Update vectorial force */
1399 fix3 = _mm256_add_ps(fix3,tx);
1400 fiy3 = _mm256_add_ps(fiy3,ty);
1401 fiz3 = _mm256_add_ps(fiz3,tz);
1403 fjx2 = _mm256_add_ps(fjx2,tx);
1404 fjy2 = _mm256_add_ps(fjy2,ty);
1405 fjz2 = _mm256_add_ps(fjz2,tz);
1407 /**************************
1408 * CALCULATE INTERACTIONS *
1409 **************************/
1411 /* REACTION-FIELD ELECTROSTATICS */
1412 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1416 /* Calculate temporary vectorial force */
1417 tx = _mm256_mul_ps(fscal,dx33);
1418 ty = _mm256_mul_ps(fscal,dy33);
1419 tz = _mm256_mul_ps(fscal,dz33);
1421 /* Update vectorial force */
1422 fix3 = _mm256_add_ps(fix3,tx);
1423 fiy3 = _mm256_add_ps(fiy3,ty);
1424 fiz3 = _mm256_add_ps(fiz3,tz);
1426 fjx3 = _mm256_add_ps(fjx3,tx);
1427 fjy3 = _mm256_add_ps(fjy3,ty);
1428 fjz3 = _mm256_add_ps(fjz3,tz);
1430 fjptrA = f+j_coord_offsetA;
1431 fjptrB = f+j_coord_offsetB;
1432 fjptrC = f+j_coord_offsetC;
1433 fjptrD = f+j_coord_offsetD;
1434 fjptrE = f+j_coord_offsetE;
1435 fjptrF = f+j_coord_offsetF;
1436 fjptrG = f+j_coord_offsetG;
1437 fjptrH = f+j_coord_offsetH;
1439 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1440 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1441 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1443 /* Inner loop uses 243 flops */
1446 if(jidx<j_index_end)
1449 /* Get j neighbor index, and coordinate index */
1450 jnrlistA = jjnr[jidx];
1451 jnrlistB = jjnr[jidx+1];
1452 jnrlistC = jjnr[jidx+2];
1453 jnrlistD = jjnr[jidx+3];
1454 jnrlistE = jjnr[jidx+4];
1455 jnrlistF = jjnr[jidx+5];
1456 jnrlistG = jjnr[jidx+6];
1457 jnrlistH = jjnr[jidx+7];
1458 /* Sign of each element will be negative for non-real atoms.
1459 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1460 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1462 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1463 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1465 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1466 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1467 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1468 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1469 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1470 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1471 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1472 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1473 j_coord_offsetA = DIM*jnrA;
1474 j_coord_offsetB = DIM*jnrB;
1475 j_coord_offsetC = DIM*jnrC;
1476 j_coord_offsetD = DIM*jnrD;
1477 j_coord_offsetE = DIM*jnrE;
1478 j_coord_offsetF = DIM*jnrF;
1479 j_coord_offsetG = DIM*jnrG;
1480 j_coord_offsetH = DIM*jnrH;
1482 /* load j atom coordinates */
1483 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1484 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1485 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1486 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1487 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1489 /* Calculate displacement vector */
1490 dx11 = _mm256_sub_ps(ix1,jx1);
1491 dy11 = _mm256_sub_ps(iy1,jy1);
1492 dz11 = _mm256_sub_ps(iz1,jz1);
1493 dx12 = _mm256_sub_ps(ix1,jx2);
1494 dy12 = _mm256_sub_ps(iy1,jy2);
1495 dz12 = _mm256_sub_ps(iz1,jz2);
1496 dx13 = _mm256_sub_ps(ix1,jx3);
1497 dy13 = _mm256_sub_ps(iy1,jy3);
1498 dz13 = _mm256_sub_ps(iz1,jz3);
1499 dx21 = _mm256_sub_ps(ix2,jx1);
1500 dy21 = _mm256_sub_ps(iy2,jy1);
1501 dz21 = _mm256_sub_ps(iz2,jz1);
1502 dx22 = _mm256_sub_ps(ix2,jx2);
1503 dy22 = _mm256_sub_ps(iy2,jy2);
1504 dz22 = _mm256_sub_ps(iz2,jz2);
1505 dx23 = _mm256_sub_ps(ix2,jx3);
1506 dy23 = _mm256_sub_ps(iy2,jy3);
1507 dz23 = _mm256_sub_ps(iz2,jz3);
1508 dx31 = _mm256_sub_ps(ix3,jx1);
1509 dy31 = _mm256_sub_ps(iy3,jy1);
1510 dz31 = _mm256_sub_ps(iz3,jz1);
1511 dx32 = _mm256_sub_ps(ix3,jx2);
1512 dy32 = _mm256_sub_ps(iy3,jy2);
1513 dz32 = _mm256_sub_ps(iz3,jz2);
1514 dx33 = _mm256_sub_ps(ix3,jx3);
1515 dy33 = _mm256_sub_ps(iy3,jy3);
1516 dz33 = _mm256_sub_ps(iz3,jz3);
1518 /* Calculate squared distance and things based on it */
1519 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1520 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1521 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1522 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1523 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1524 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1525 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1526 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1527 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1529 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1530 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1531 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1532 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1533 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1534 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1535 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1536 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1537 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1539 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1540 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1541 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1542 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1543 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1544 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1545 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1546 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1547 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1549 fjx1 = _mm256_setzero_ps();
1550 fjy1 = _mm256_setzero_ps();
1551 fjz1 = _mm256_setzero_ps();
1552 fjx2 = _mm256_setzero_ps();
1553 fjy2 = _mm256_setzero_ps();
1554 fjz2 = _mm256_setzero_ps();
1555 fjx3 = _mm256_setzero_ps();
1556 fjy3 = _mm256_setzero_ps();
1557 fjz3 = _mm256_setzero_ps();
1559 /**************************
1560 * CALCULATE INTERACTIONS *
1561 **************************/
1563 /* REACTION-FIELD ELECTROSTATICS */
1564 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1568 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1570 /* Calculate temporary vectorial force */
1571 tx = _mm256_mul_ps(fscal,dx11);
1572 ty = _mm256_mul_ps(fscal,dy11);
1573 tz = _mm256_mul_ps(fscal,dz11);
1575 /* Update vectorial force */
1576 fix1 = _mm256_add_ps(fix1,tx);
1577 fiy1 = _mm256_add_ps(fiy1,ty);
1578 fiz1 = _mm256_add_ps(fiz1,tz);
1580 fjx1 = _mm256_add_ps(fjx1,tx);
1581 fjy1 = _mm256_add_ps(fjy1,ty);
1582 fjz1 = _mm256_add_ps(fjz1,tz);
1584 /**************************
1585 * CALCULATE INTERACTIONS *
1586 **************************/
1588 /* REACTION-FIELD ELECTROSTATICS */
1589 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1593 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1595 /* Calculate temporary vectorial force */
1596 tx = _mm256_mul_ps(fscal,dx12);
1597 ty = _mm256_mul_ps(fscal,dy12);
1598 tz = _mm256_mul_ps(fscal,dz12);
1600 /* Update vectorial force */
1601 fix1 = _mm256_add_ps(fix1,tx);
1602 fiy1 = _mm256_add_ps(fiy1,ty);
1603 fiz1 = _mm256_add_ps(fiz1,tz);
1605 fjx2 = _mm256_add_ps(fjx2,tx);
1606 fjy2 = _mm256_add_ps(fjy2,ty);
1607 fjz2 = _mm256_add_ps(fjz2,tz);
1609 /**************************
1610 * CALCULATE INTERACTIONS *
1611 **************************/
1613 /* REACTION-FIELD ELECTROSTATICS */
1614 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1618 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1620 /* Calculate temporary vectorial force */
1621 tx = _mm256_mul_ps(fscal,dx13);
1622 ty = _mm256_mul_ps(fscal,dy13);
1623 tz = _mm256_mul_ps(fscal,dz13);
1625 /* Update vectorial force */
1626 fix1 = _mm256_add_ps(fix1,tx);
1627 fiy1 = _mm256_add_ps(fiy1,ty);
1628 fiz1 = _mm256_add_ps(fiz1,tz);
1630 fjx3 = _mm256_add_ps(fjx3,tx);
1631 fjy3 = _mm256_add_ps(fjy3,ty);
1632 fjz3 = _mm256_add_ps(fjz3,tz);
1634 /**************************
1635 * CALCULATE INTERACTIONS *
1636 **************************/
1638 /* REACTION-FIELD ELECTROSTATICS */
1639 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1643 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1645 /* Calculate temporary vectorial force */
1646 tx = _mm256_mul_ps(fscal,dx21);
1647 ty = _mm256_mul_ps(fscal,dy21);
1648 tz = _mm256_mul_ps(fscal,dz21);
1650 /* Update vectorial force */
1651 fix2 = _mm256_add_ps(fix2,tx);
1652 fiy2 = _mm256_add_ps(fiy2,ty);
1653 fiz2 = _mm256_add_ps(fiz2,tz);
1655 fjx1 = _mm256_add_ps(fjx1,tx);
1656 fjy1 = _mm256_add_ps(fjy1,ty);
1657 fjz1 = _mm256_add_ps(fjz1,tz);
1659 /**************************
1660 * CALCULATE INTERACTIONS *
1661 **************************/
1663 /* REACTION-FIELD ELECTROSTATICS */
1664 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1668 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1670 /* Calculate temporary vectorial force */
1671 tx = _mm256_mul_ps(fscal,dx22);
1672 ty = _mm256_mul_ps(fscal,dy22);
1673 tz = _mm256_mul_ps(fscal,dz22);
1675 /* Update vectorial force */
1676 fix2 = _mm256_add_ps(fix2,tx);
1677 fiy2 = _mm256_add_ps(fiy2,ty);
1678 fiz2 = _mm256_add_ps(fiz2,tz);
1680 fjx2 = _mm256_add_ps(fjx2,tx);
1681 fjy2 = _mm256_add_ps(fjy2,ty);
1682 fjz2 = _mm256_add_ps(fjz2,tz);
1684 /**************************
1685 * CALCULATE INTERACTIONS *
1686 **************************/
1688 /* REACTION-FIELD ELECTROSTATICS */
1689 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1693 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1695 /* Calculate temporary vectorial force */
1696 tx = _mm256_mul_ps(fscal,dx23);
1697 ty = _mm256_mul_ps(fscal,dy23);
1698 tz = _mm256_mul_ps(fscal,dz23);
1700 /* Update vectorial force */
1701 fix2 = _mm256_add_ps(fix2,tx);
1702 fiy2 = _mm256_add_ps(fiy2,ty);
1703 fiz2 = _mm256_add_ps(fiz2,tz);
1705 fjx3 = _mm256_add_ps(fjx3,tx);
1706 fjy3 = _mm256_add_ps(fjy3,ty);
1707 fjz3 = _mm256_add_ps(fjz3,tz);
1709 /**************************
1710 * CALCULATE INTERACTIONS *
1711 **************************/
1713 /* REACTION-FIELD ELECTROSTATICS */
1714 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1718 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1720 /* Calculate temporary vectorial force */
1721 tx = _mm256_mul_ps(fscal,dx31);
1722 ty = _mm256_mul_ps(fscal,dy31);
1723 tz = _mm256_mul_ps(fscal,dz31);
1725 /* Update vectorial force */
1726 fix3 = _mm256_add_ps(fix3,tx);
1727 fiy3 = _mm256_add_ps(fiy3,ty);
1728 fiz3 = _mm256_add_ps(fiz3,tz);
1730 fjx1 = _mm256_add_ps(fjx1,tx);
1731 fjy1 = _mm256_add_ps(fjy1,ty);
1732 fjz1 = _mm256_add_ps(fjz1,tz);
1734 /**************************
1735 * CALCULATE INTERACTIONS *
1736 **************************/
1738 /* REACTION-FIELD ELECTROSTATICS */
1739 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1743 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1745 /* Calculate temporary vectorial force */
1746 tx = _mm256_mul_ps(fscal,dx32);
1747 ty = _mm256_mul_ps(fscal,dy32);
1748 tz = _mm256_mul_ps(fscal,dz32);
1750 /* Update vectorial force */
1751 fix3 = _mm256_add_ps(fix3,tx);
1752 fiy3 = _mm256_add_ps(fiy3,ty);
1753 fiz3 = _mm256_add_ps(fiz3,tz);
1755 fjx2 = _mm256_add_ps(fjx2,tx);
1756 fjy2 = _mm256_add_ps(fjy2,ty);
1757 fjz2 = _mm256_add_ps(fjz2,tz);
1759 /**************************
1760 * CALCULATE INTERACTIONS *
1761 **************************/
1763 /* REACTION-FIELD ELECTROSTATICS */
1764 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1768 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1770 /* Calculate temporary vectorial force */
1771 tx = _mm256_mul_ps(fscal,dx33);
1772 ty = _mm256_mul_ps(fscal,dy33);
1773 tz = _mm256_mul_ps(fscal,dz33);
1775 /* Update vectorial force */
1776 fix3 = _mm256_add_ps(fix3,tx);
1777 fiy3 = _mm256_add_ps(fiy3,ty);
1778 fiz3 = _mm256_add_ps(fiz3,tz);
1780 fjx3 = _mm256_add_ps(fjx3,tx);
1781 fjy3 = _mm256_add_ps(fjy3,ty);
1782 fjz3 = _mm256_add_ps(fjz3,tz);
1784 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1785 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1786 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1787 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1788 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1789 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1790 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1791 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1793 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1794 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1795 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1797 /* Inner loop uses 243 flops */
1800 /* End of innermost loop */
1802 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1803 f+i_coord_offset+DIM,fshift+i_shift_offset);
1805 /* Increment number of inner iterations */
1806 inneriter += j_index_end - j_index_start;
1808 /* Outer loop uses 18 flops */
1811 /* Increment number of outer iterations */
1814 /* Update outer/inner flops */
1816 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*243);