2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_single
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: None
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_avx_256_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrE,jnrF,jnrG,jnrH;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
85 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr1;
87 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 real * vdwioffsetptr3;
91 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
93 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
95 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
97 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
101 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
104 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
105 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
106 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
107 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
109 __m256 dummy_mask,cutoff_mask;
110 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
111 __m256 one = _mm256_set1_ps(1.0);
112 __m256 two = _mm256_set1_ps(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm256_set1_ps(fr->epsfac);
125 charge = mdatoms->chargeA;
126 krf = _mm256_set1_ps(fr->ic->k_rf);
127 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
128 crf = _mm256_set1_ps(fr->ic->c_rf);
130 /* Setup water-specific parameters */
131 inr = nlist->iinr[0];
132 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
133 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
134 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
136 jq1 = _mm256_set1_ps(charge[inr+1]);
137 jq2 = _mm256_set1_ps(charge[inr+2]);
138 jq3 = _mm256_set1_ps(charge[inr+3]);
139 qq11 = _mm256_mul_ps(iq1,jq1);
140 qq12 = _mm256_mul_ps(iq1,jq2);
141 qq13 = _mm256_mul_ps(iq1,jq3);
142 qq21 = _mm256_mul_ps(iq2,jq1);
143 qq22 = _mm256_mul_ps(iq2,jq2);
144 qq23 = _mm256_mul_ps(iq2,jq3);
145 qq31 = _mm256_mul_ps(iq3,jq1);
146 qq32 = _mm256_mul_ps(iq3,jq2);
147 qq33 = _mm256_mul_ps(iq3,jq3);
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar = fr->rcoulomb;
151 rcutoff = _mm256_set1_ps(rcutoff_scalar);
152 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
154 /* Avoid stupid compiler warnings */
155 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
168 for(iidx=0;iidx<4*DIM;iidx++)
173 /* Start outer loop over neighborlists */
174 for(iidx=0; iidx<nri; iidx++)
176 /* Load shift vector for this list */
177 i_shift_offset = DIM*shiftidx[iidx];
179 /* Load limits for loop over neighbors */
180 j_index_start = jindex[iidx];
181 j_index_end = jindex[iidx+1];
183 /* Get outer coordinate index */
185 i_coord_offset = DIM*inr;
187 /* Load i particle coords and add shift vector */
188 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
189 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
191 fix1 = _mm256_setzero_ps();
192 fiy1 = _mm256_setzero_ps();
193 fiz1 = _mm256_setzero_ps();
194 fix2 = _mm256_setzero_ps();
195 fiy2 = _mm256_setzero_ps();
196 fiz2 = _mm256_setzero_ps();
197 fix3 = _mm256_setzero_ps();
198 fiy3 = _mm256_setzero_ps();
199 fiz3 = _mm256_setzero_ps();
201 /* Reset potential sums */
202 velecsum = _mm256_setzero_ps();
204 /* Start inner kernel loop */
205 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
208 /* Get j neighbor index, and coordinate index */
217 j_coord_offsetA = DIM*jnrA;
218 j_coord_offsetB = DIM*jnrB;
219 j_coord_offsetC = DIM*jnrC;
220 j_coord_offsetD = DIM*jnrD;
221 j_coord_offsetE = DIM*jnrE;
222 j_coord_offsetF = DIM*jnrF;
223 j_coord_offsetG = DIM*jnrG;
224 j_coord_offsetH = DIM*jnrH;
226 /* load j atom coordinates */
227 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
228 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
229 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
230 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
231 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
233 /* Calculate displacement vector */
234 dx11 = _mm256_sub_ps(ix1,jx1);
235 dy11 = _mm256_sub_ps(iy1,jy1);
236 dz11 = _mm256_sub_ps(iz1,jz1);
237 dx12 = _mm256_sub_ps(ix1,jx2);
238 dy12 = _mm256_sub_ps(iy1,jy2);
239 dz12 = _mm256_sub_ps(iz1,jz2);
240 dx13 = _mm256_sub_ps(ix1,jx3);
241 dy13 = _mm256_sub_ps(iy1,jy3);
242 dz13 = _mm256_sub_ps(iz1,jz3);
243 dx21 = _mm256_sub_ps(ix2,jx1);
244 dy21 = _mm256_sub_ps(iy2,jy1);
245 dz21 = _mm256_sub_ps(iz2,jz1);
246 dx22 = _mm256_sub_ps(ix2,jx2);
247 dy22 = _mm256_sub_ps(iy2,jy2);
248 dz22 = _mm256_sub_ps(iz2,jz2);
249 dx23 = _mm256_sub_ps(ix2,jx3);
250 dy23 = _mm256_sub_ps(iy2,jy3);
251 dz23 = _mm256_sub_ps(iz2,jz3);
252 dx31 = _mm256_sub_ps(ix3,jx1);
253 dy31 = _mm256_sub_ps(iy3,jy1);
254 dz31 = _mm256_sub_ps(iz3,jz1);
255 dx32 = _mm256_sub_ps(ix3,jx2);
256 dy32 = _mm256_sub_ps(iy3,jy2);
257 dz32 = _mm256_sub_ps(iz3,jz2);
258 dx33 = _mm256_sub_ps(ix3,jx3);
259 dy33 = _mm256_sub_ps(iy3,jy3);
260 dz33 = _mm256_sub_ps(iz3,jz3);
262 /* Calculate squared distance and things based on it */
263 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
264 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
265 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
266 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
267 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
268 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
269 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
270 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
271 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
273 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
274 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
275 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
276 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
277 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
278 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
279 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
280 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
281 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
283 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
284 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
285 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
286 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
287 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
288 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
289 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
290 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
291 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
293 fjx1 = _mm256_setzero_ps();
294 fjy1 = _mm256_setzero_ps();
295 fjz1 = _mm256_setzero_ps();
296 fjx2 = _mm256_setzero_ps();
297 fjy2 = _mm256_setzero_ps();
298 fjz2 = _mm256_setzero_ps();
299 fjx3 = _mm256_setzero_ps();
300 fjy3 = _mm256_setzero_ps();
301 fjz3 = _mm256_setzero_ps();
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 if (gmx_mm256_any_lt(rsq11,rcutoff2))
310 /* REACTION-FIELD ELECTROSTATICS */
311 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
312 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
314 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
316 /* Update potential sum for this i atom from the interaction with this j atom. */
317 velec = _mm256_and_ps(velec,cutoff_mask);
318 velecsum = _mm256_add_ps(velecsum,velec);
322 fscal = _mm256_and_ps(fscal,cutoff_mask);
324 /* Calculate temporary vectorial force */
325 tx = _mm256_mul_ps(fscal,dx11);
326 ty = _mm256_mul_ps(fscal,dy11);
327 tz = _mm256_mul_ps(fscal,dz11);
329 /* Update vectorial force */
330 fix1 = _mm256_add_ps(fix1,tx);
331 fiy1 = _mm256_add_ps(fiy1,ty);
332 fiz1 = _mm256_add_ps(fiz1,tz);
334 fjx1 = _mm256_add_ps(fjx1,tx);
335 fjy1 = _mm256_add_ps(fjy1,ty);
336 fjz1 = _mm256_add_ps(fjz1,tz);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 if (gmx_mm256_any_lt(rsq12,rcutoff2))
347 /* REACTION-FIELD ELECTROSTATICS */
348 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
349 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
351 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velec = _mm256_and_ps(velec,cutoff_mask);
355 velecsum = _mm256_add_ps(velecsum,velec);
359 fscal = _mm256_and_ps(fscal,cutoff_mask);
361 /* Calculate temporary vectorial force */
362 tx = _mm256_mul_ps(fscal,dx12);
363 ty = _mm256_mul_ps(fscal,dy12);
364 tz = _mm256_mul_ps(fscal,dz12);
366 /* Update vectorial force */
367 fix1 = _mm256_add_ps(fix1,tx);
368 fiy1 = _mm256_add_ps(fiy1,ty);
369 fiz1 = _mm256_add_ps(fiz1,tz);
371 fjx2 = _mm256_add_ps(fjx2,tx);
372 fjy2 = _mm256_add_ps(fjy2,ty);
373 fjz2 = _mm256_add_ps(fjz2,tz);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 if (gmx_mm256_any_lt(rsq13,rcutoff2))
384 /* REACTION-FIELD ELECTROSTATICS */
385 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
386 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
388 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 velec = _mm256_and_ps(velec,cutoff_mask);
392 velecsum = _mm256_add_ps(velecsum,velec);
396 fscal = _mm256_and_ps(fscal,cutoff_mask);
398 /* Calculate temporary vectorial force */
399 tx = _mm256_mul_ps(fscal,dx13);
400 ty = _mm256_mul_ps(fscal,dy13);
401 tz = _mm256_mul_ps(fscal,dz13);
403 /* Update vectorial force */
404 fix1 = _mm256_add_ps(fix1,tx);
405 fiy1 = _mm256_add_ps(fiy1,ty);
406 fiz1 = _mm256_add_ps(fiz1,tz);
408 fjx3 = _mm256_add_ps(fjx3,tx);
409 fjy3 = _mm256_add_ps(fjy3,ty);
410 fjz3 = _mm256_add_ps(fjz3,tz);
414 /**************************
415 * CALCULATE INTERACTIONS *
416 **************************/
418 if (gmx_mm256_any_lt(rsq21,rcutoff2))
421 /* REACTION-FIELD ELECTROSTATICS */
422 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
423 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
425 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velec = _mm256_and_ps(velec,cutoff_mask);
429 velecsum = _mm256_add_ps(velecsum,velec);
433 fscal = _mm256_and_ps(fscal,cutoff_mask);
435 /* Calculate temporary vectorial force */
436 tx = _mm256_mul_ps(fscal,dx21);
437 ty = _mm256_mul_ps(fscal,dy21);
438 tz = _mm256_mul_ps(fscal,dz21);
440 /* Update vectorial force */
441 fix2 = _mm256_add_ps(fix2,tx);
442 fiy2 = _mm256_add_ps(fiy2,ty);
443 fiz2 = _mm256_add_ps(fiz2,tz);
445 fjx1 = _mm256_add_ps(fjx1,tx);
446 fjy1 = _mm256_add_ps(fjy1,ty);
447 fjz1 = _mm256_add_ps(fjz1,tz);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 if (gmx_mm256_any_lt(rsq22,rcutoff2))
458 /* REACTION-FIELD ELECTROSTATICS */
459 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
460 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
462 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
464 /* Update potential sum for this i atom from the interaction with this j atom. */
465 velec = _mm256_and_ps(velec,cutoff_mask);
466 velecsum = _mm256_add_ps(velecsum,velec);
470 fscal = _mm256_and_ps(fscal,cutoff_mask);
472 /* Calculate temporary vectorial force */
473 tx = _mm256_mul_ps(fscal,dx22);
474 ty = _mm256_mul_ps(fscal,dy22);
475 tz = _mm256_mul_ps(fscal,dz22);
477 /* Update vectorial force */
478 fix2 = _mm256_add_ps(fix2,tx);
479 fiy2 = _mm256_add_ps(fiy2,ty);
480 fiz2 = _mm256_add_ps(fiz2,tz);
482 fjx2 = _mm256_add_ps(fjx2,tx);
483 fjy2 = _mm256_add_ps(fjy2,ty);
484 fjz2 = _mm256_add_ps(fjz2,tz);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 if (gmx_mm256_any_lt(rsq23,rcutoff2))
495 /* REACTION-FIELD ELECTROSTATICS */
496 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
497 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
499 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
501 /* Update potential sum for this i atom from the interaction with this j atom. */
502 velec = _mm256_and_ps(velec,cutoff_mask);
503 velecsum = _mm256_add_ps(velecsum,velec);
507 fscal = _mm256_and_ps(fscal,cutoff_mask);
509 /* Calculate temporary vectorial force */
510 tx = _mm256_mul_ps(fscal,dx23);
511 ty = _mm256_mul_ps(fscal,dy23);
512 tz = _mm256_mul_ps(fscal,dz23);
514 /* Update vectorial force */
515 fix2 = _mm256_add_ps(fix2,tx);
516 fiy2 = _mm256_add_ps(fiy2,ty);
517 fiz2 = _mm256_add_ps(fiz2,tz);
519 fjx3 = _mm256_add_ps(fjx3,tx);
520 fjy3 = _mm256_add_ps(fjy3,ty);
521 fjz3 = _mm256_add_ps(fjz3,tz);
525 /**************************
526 * CALCULATE INTERACTIONS *
527 **************************/
529 if (gmx_mm256_any_lt(rsq31,rcutoff2))
532 /* REACTION-FIELD ELECTROSTATICS */
533 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
534 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
536 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velec = _mm256_and_ps(velec,cutoff_mask);
540 velecsum = _mm256_add_ps(velecsum,velec);
544 fscal = _mm256_and_ps(fscal,cutoff_mask);
546 /* Calculate temporary vectorial force */
547 tx = _mm256_mul_ps(fscal,dx31);
548 ty = _mm256_mul_ps(fscal,dy31);
549 tz = _mm256_mul_ps(fscal,dz31);
551 /* Update vectorial force */
552 fix3 = _mm256_add_ps(fix3,tx);
553 fiy3 = _mm256_add_ps(fiy3,ty);
554 fiz3 = _mm256_add_ps(fiz3,tz);
556 fjx1 = _mm256_add_ps(fjx1,tx);
557 fjy1 = _mm256_add_ps(fjy1,ty);
558 fjz1 = _mm256_add_ps(fjz1,tz);
562 /**************************
563 * CALCULATE INTERACTIONS *
564 **************************/
566 if (gmx_mm256_any_lt(rsq32,rcutoff2))
569 /* REACTION-FIELD ELECTROSTATICS */
570 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
571 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
573 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velec = _mm256_and_ps(velec,cutoff_mask);
577 velecsum = _mm256_add_ps(velecsum,velec);
581 fscal = _mm256_and_ps(fscal,cutoff_mask);
583 /* Calculate temporary vectorial force */
584 tx = _mm256_mul_ps(fscal,dx32);
585 ty = _mm256_mul_ps(fscal,dy32);
586 tz = _mm256_mul_ps(fscal,dz32);
588 /* Update vectorial force */
589 fix3 = _mm256_add_ps(fix3,tx);
590 fiy3 = _mm256_add_ps(fiy3,ty);
591 fiz3 = _mm256_add_ps(fiz3,tz);
593 fjx2 = _mm256_add_ps(fjx2,tx);
594 fjy2 = _mm256_add_ps(fjy2,ty);
595 fjz2 = _mm256_add_ps(fjz2,tz);
599 /**************************
600 * CALCULATE INTERACTIONS *
601 **************************/
603 if (gmx_mm256_any_lt(rsq33,rcutoff2))
606 /* REACTION-FIELD ELECTROSTATICS */
607 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
608 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
610 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
612 /* Update potential sum for this i atom from the interaction with this j atom. */
613 velec = _mm256_and_ps(velec,cutoff_mask);
614 velecsum = _mm256_add_ps(velecsum,velec);
618 fscal = _mm256_and_ps(fscal,cutoff_mask);
620 /* Calculate temporary vectorial force */
621 tx = _mm256_mul_ps(fscal,dx33);
622 ty = _mm256_mul_ps(fscal,dy33);
623 tz = _mm256_mul_ps(fscal,dz33);
625 /* Update vectorial force */
626 fix3 = _mm256_add_ps(fix3,tx);
627 fiy3 = _mm256_add_ps(fiy3,ty);
628 fiz3 = _mm256_add_ps(fiz3,tz);
630 fjx3 = _mm256_add_ps(fjx3,tx);
631 fjy3 = _mm256_add_ps(fjy3,ty);
632 fjz3 = _mm256_add_ps(fjz3,tz);
636 fjptrA = f+j_coord_offsetA;
637 fjptrB = f+j_coord_offsetB;
638 fjptrC = f+j_coord_offsetC;
639 fjptrD = f+j_coord_offsetD;
640 fjptrE = f+j_coord_offsetE;
641 fjptrF = f+j_coord_offsetF;
642 fjptrG = f+j_coord_offsetG;
643 fjptrH = f+j_coord_offsetH;
645 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
646 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
647 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
649 /* Inner loop uses 324 flops */
655 /* Get j neighbor index, and coordinate index */
656 jnrlistA = jjnr[jidx];
657 jnrlistB = jjnr[jidx+1];
658 jnrlistC = jjnr[jidx+2];
659 jnrlistD = jjnr[jidx+3];
660 jnrlistE = jjnr[jidx+4];
661 jnrlistF = jjnr[jidx+5];
662 jnrlistG = jjnr[jidx+6];
663 jnrlistH = jjnr[jidx+7];
664 /* Sign of each element will be negative for non-real atoms.
665 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
666 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
668 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
669 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
671 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
672 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
673 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
674 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
675 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
676 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
677 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
678 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
679 j_coord_offsetA = DIM*jnrA;
680 j_coord_offsetB = DIM*jnrB;
681 j_coord_offsetC = DIM*jnrC;
682 j_coord_offsetD = DIM*jnrD;
683 j_coord_offsetE = DIM*jnrE;
684 j_coord_offsetF = DIM*jnrF;
685 j_coord_offsetG = DIM*jnrG;
686 j_coord_offsetH = DIM*jnrH;
688 /* load j atom coordinates */
689 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
690 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
691 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
692 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
693 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
695 /* Calculate displacement vector */
696 dx11 = _mm256_sub_ps(ix1,jx1);
697 dy11 = _mm256_sub_ps(iy1,jy1);
698 dz11 = _mm256_sub_ps(iz1,jz1);
699 dx12 = _mm256_sub_ps(ix1,jx2);
700 dy12 = _mm256_sub_ps(iy1,jy2);
701 dz12 = _mm256_sub_ps(iz1,jz2);
702 dx13 = _mm256_sub_ps(ix1,jx3);
703 dy13 = _mm256_sub_ps(iy1,jy3);
704 dz13 = _mm256_sub_ps(iz1,jz3);
705 dx21 = _mm256_sub_ps(ix2,jx1);
706 dy21 = _mm256_sub_ps(iy2,jy1);
707 dz21 = _mm256_sub_ps(iz2,jz1);
708 dx22 = _mm256_sub_ps(ix2,jx2);
709 dy22 = _mm256_sub_ps(iy2,jy2);
710 dz22 = _mm256_sub_ps(iz2,jz2);
711 dx23 = _mm256_sub_ps(ix2,jx3);
712 dy23 = _mm256_sub_ps(iy2,jy3);
713 dz23 = _mm256_sub_ps(iz2,jz3);
714 dx31 = _mm256_sub_ps(ix3,jx1);
715 dy31 = _mm256_sub_ps(iy3,jy1);
716 dz31 = _mm256_sub_ps(iz3,jz1);
717 dx32 = _mm256_sub_ps(ix3,jx2);
718 dy32 = _mm256_sub_ps(iy3,jy2);
719 dz32 = _mm256_sub_ps(iz3,jz2);
720 dx33 = _mm256_sub_ps(ix3,jx3);
721 dy33 = _mm256_sub_ps(iy3,jy3);
722 dz33 = _mm256_sub_ps(iz3,jz3);
724 /* Calculate squared distance and things based on it */
725 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
726 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
727 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
728 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
729 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
730 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
731 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
732 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
733 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
735 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
736 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
737 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
738 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
739 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
740 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
741 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
742 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
743 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
745 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
746 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
747 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
748 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
749 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
750 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
751 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
752 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
753 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
755 fjx1 = _mm256_setzero_ps();
756 fjy1 = _mm256_setzero_ps();
757 fjz1 = _mm256_setzero_ps();
758 fjx2 = _mm256_setzero_ps();
759 fjy2 = _mm256_setzero_ps();
760 fjz2 = _mm256_setzero_ps();
761 fjx3 = _mm256_setzero_ps();
762 fjy3 = _mm256_setzero_ps();
763 fjz3 = _mm256_setzero_ps();
765 /**************************
766 * CALCULATE INTERACTIONS *
767 **************************/
769 if (gmx_mm256_any_lt(rsq11,rcutoff2))
772 /* REACTION-FIELD ELECTROSTATICS */
773 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
774 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
776 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
778 /* Update potential sum for this i atom from the interaction with this j atom. */
779 velec = _mm256_and_ps(velec,cutoff_mask);
780 velec = _mm256_andnot_ps(dummy_mask,velec);
781 velecsum = _mm256_add_ps(velecsum,velec);
785 fscal = _mm256_and_ps(fscal,cutoff_mask);
787 fscal = _mm256_andnot_ps(dummy_mask,fscal);
789 /* Calculate temporary vectorial force */
790 tx = _mm256_mul_ps(fscal,dx11);
791 ty = _mm256_mul_ps(fscal,dy11);
792 tz = _mm256_mul_ps(fscal,dz11);
794 /* Update vectorial force */
795 fix1 = _mm256_add_ps(fix1,tx);
796 fiy1 = _mm256_add_ps(fiy1,ty);
797 fiz1 = _mm256_add_ps(fiz1,tz);
799 fjx1 = _mm256_add_ps(fjx1,tx);
800 fjy1 = _mm256_add_ps(fjy1,ty);
801 fjz1 = _mm256_add_ps(fjz1,tz);
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 if (gmx_mm256_any_lt(rsq12,rcutoff2))
812 /* REACTION-FIELD ELECTROSTATICS */
813 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
814 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
816 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
818 /* Update potential sum for this i atom from the interaction with this j atom. */
819 velec = _mm256_and_ps(velec,cutoff_mask);
820 velec = _mm256_andnot_ps(dummy_mask,velec);
821 velecsum = _mm256_add_ps(velecsum,velec);
825 fscal = _mm256_and_ps(fscal,cutoff_mask);
827 fscal = _mm256_andnot_ps(dummy_mask,fscal);
829 /* Calculate temporary vectorial force */
830 tx = _mm256_mul_ps(fscal,dx12);
831 ty = _mm256_mul_ps(fscal,dy12);
832 tz = _mm256_mul_ps(fscal,dz12);
834 /* Update vectorial force */
835 fix1 = _mm256_add_ps(fix1,tx);
836 fiy1 = _mm256_add_ps(fiy1,ty);
837 fiz1 = _mm256_add_ps(fiz1,tz);
839 fjx2 = _mm256_add_ps(fjx2,tx);
840 fjy2 = _mm256_add_ps(fjy2,ty);
841 fjz2 = _mm256_add_ps(fjz2,tz);
845 /**************************
846 * CALCULATE INTERACTIONS *
847 **************************/
849 if (gmx_mm256_any_lt(rsq13,rcutoff2))
852 /* REACTION-FIELD ELECTROSTATICS */
853 velec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
854 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
856 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
858 /* Update potential sum for this i atom from the interaction with this j atom. */
859 velec = _mm256_and_ps(velec,cutoff_mask);
860 velec = _mm256_andnot_ps(dummy_mask,velec);
861 velecsum = _mm256_add_ps(velecsum,velec);
865 fscal = _mm256_and_ps(fscal,cutoff_mask);
867 fscal = _mm256_andnot_ps(dummy_mask,fscal);
869 /* Calculate temporary vectorial force */
870 tx = _mm256_mul_ps(fscal,dx13);
871 ty = _mm256_mul_ps(fscal,dy13);
872 tz = _mm256_mul_ps(fscal,dz13);
874 /* Update vectorial force */
875 fix1 = _mm256_add_ps(fix1,tx);
876 fiy1 = _mm256_add_ps(fiy1,ty);
877 fiz1 = _mm256_add_ps(fiz1,tz);
879 fjx3 = _mm256_add_ps(fjx3,tx);
880 fjy3 = _mm256_add_ps(fjy3,ty);
881 fjz3 = _mm256_add_ps(fjz3,tz);
885 /**************************
886 * CALCULATE INTERACTIONS *
887 **************************/
889 if (gmx_mm256_any_lt(rsq21,rcutoff2))
892 /* REACTION-FIELD ELECTROSTATICS */
893 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
894 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
896 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
898 /* Update potential sum for this i atom from the interaction with this j atom. */
899 velec = _mm256_and_ps(velec,cutoff_mask);
900 velec = _mm256_andnot_ps(dummy_mask,velec);
901 velecsum = _mm256_add_ps(velecsum,velec);
905 fscal = _mm256_and_ps(fscal,cutoff_mask);
907 fscal = _mm256_andnot_ps(dummy_mask,fscal);
909 /* Calculate temporary vectorial force */
910 tx = _mm256_mul_ps(fscal,dx21);
911 ty = _mm256_mul_ps(fscal,dy21);
912 tz = _mm256_mul_ps(fscal,dz21);
914 /* Update vectorial force */
915 fix2 = _mm256_add_ps(fix2,tx);
916 fiy2 = _mm256_add_ps(fiy2,ty);
917 fiz2 = _mm256_add_ps(fiz2,tz);
919 fjx1 = _mm256_add_ps(fjx1,tx);
920 fjy1 = _mm256_add_ps(fjy1,ty);
921 fjz1 = _mm256_add_ps(fjz1,tz);
925 /**************************
926 * CALCULATE INTERACTIONS *
927 **************************/
929 if (gmx_mm256_any_lt(rsq22,rcutoff2))
932 /* REACTION-FIELD ELECTROSTATICS */
933 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
934 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
936 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
938 /* Update potential sum for this i atom from the interaction with this j atom. */
939 velec = _mm256_and_ps(velec,cutoff_mask);
940 velec = _mm256_andnot_ps(dummy_mask,velec);
941 velecsum = _mm256_add_ps(velecsum,velec);
945 fscal = _mm256_and_ps(fscal,cutoff_mask);
947 fscal = _mm256_andnot_ps(dummy_mask,fscal);
949 /* Calculate temporary vectorial force */
950 tx = _mm256_mul_ps(fscal,dx22);
951 ty = _mm256_mul_ps(fscal,dy22);
952 tz = _mm256_mul_ps(fscal,dz22);
954 /* Update vectorial force */
955 fix2 = _mm256_add_ps(fix2,tx);
956 fiy2 = _mm256_add_ps(fiy2,ty);
957 fiz2 = _mm256_add_ps(fiz2,tz);
959 fjx2 = _mm256_add_ps(fjx2,tx);
960 fjy2 = _mm256_add_ps(fjy2,ty);
961 fjz2 = _mm256_add_ps(fjz2,tz);
965 /**************************
966 * CALCULATE INTERACTIONS *
967 **************************/
969 if (gmx_mm256_any_lt(rsq23,rcutoff2))
972 /* REACTION-FIELD ELECTROSTATICS */
973 velec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
974 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
976 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
978 /* Update potential sum for this i atom from the interaction with this j atom. */
979 velec = _mm256_and_ps(velec,cutoff_mask);
980 velec = _mm256_andnot_ps(dummy_mask,velec);
981 velecsum = _mm256_add_ps(velecsum,velec);
985 fscal = _mm256_and_ps(fscal,cutoff_mask);
987 fscal = _mm256_andnot_ps(dummy_mask,fscal);
989 /* Calculate temporary vectorial force */
990 tx = _mm256_mul_ps(fscal,dx23);
991 ty = _mm256_mul_ps(fscal,dy23);
992 tz = _mm256_mul_ps(fscal,dz23);
994 /* Update vectorial force */
995 fix2 = _mm256_add_ps(fix2,tx);
996 fiy2 = _mm256_add_ps(fiy2,ty);
997 fiz2 = _mm256_add_ps(fiz2,tz);
999 fjx3 = _mm256_add_ps(fjx3,tx);
1000 fjy3 = _mm256_add_ps(fjy3,ty);
1001 fjz3 = _mm256_add_ps(fjz3,tz);
1005 /**************************
1006 * CALCULATE INTERACTIONS *
1007 **************************/
1009 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1012 /* REACTION-FIELD ELECTROSTATICS */
1013 velec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
1014 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1016 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1018 /* Update potential sum for this i atom from the interaction with this j atom. */
1019 velec = _mm256_and_ps(velec,cutoff_mask);
1020 velec = _mm256_andnot_ps(dummy_mask,velec);
1021 velecsum = _mm256_add_ps(velecsum,velec);
1025 fscal = _mm256_and_ps(fscal,cutoff_mask);
1027 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1029 /* Calculate temporary vectorial force */
1030 tx = _mm256_mul_ps(fscal,dx31);
1031 ty = _mm256_mul_ps(fscal,dy31);
1032 tz = _mm256_mul_ps(fscal,dz31);
1034 /* Update vectorial force */
1035 fix3 = _mm256_add_ps(fix3,tx);
1036 fiy3 = _mm256_add_ps(fiy3,ty);
1037 fiz3 = _mm256_add_ps(fiz3,tz);
1039 fjx1 = _mm256_add_ps(fjx1,tx);
1040 fjy1 = _mm256_add_ps(fjy1,ty);
1041 fjz1 = _mm256_add_ps(fjz1,tz);
1045 /**************************
1046 * CALCULATE INTERACTIONS *
1047 **************************/
1049 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1052 /* REACTION-FIELD ELECTROSTATICS */
1053 velec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
1054 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1056 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _mm256_and_ps(velec,cutoff_mask);
1060 velec = _mm256_andnot_ps(dummy_mask,velec);
1061 velecsum = _mm256_add_ps(velecsum,velec);
1065 fscal = _mm256_and_ps(fscal,cutoff_mask);
1067 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1069 /* Calculate temporary vectorial force */
1070 tx = _mm256_mul_ps(fscal,dx32);
1071 ty = _mm256_mul_ps(fscal,dy32);
1072 tz = _mm256_mul_ps(fscal,dz32);
1074 /* Update vectorial force */
1075 fix3 = _mm256_add_ps(fix3,tx);
1076 fiy3 = _mm256_add_ps(fiy3,ty);
1077 fiz3 = _mm256_add_ps(fiz3,tz);
1079 fjx2 = _mm256_add_ps(fjx2,tx);
1080 fjy2 = _mm256_add_ps(fjy2,ty);
1081 fjz2 = _mm256_add_ps(fjz2,tz);
1085 /**************************
1086 * CALCULATE INTERACTIONS *
1087 **************************/
1089 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1092 /* REACTION-FIELD ELECTROSTATICS */
1093 velec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
1094 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1096 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1098 /* Update potential sum for this i atom from the interaction with this j atom. */
1099 velec = _mm256_and_ps(velec,cutoff_mask);
1100 velec = _mm256_andnot_ps(dummy_mask,velec);
1101 velecsum = _mm256_add_ps(velecsum,velec);
1105 fscal = _mm256_and_ps(fscal,cutoff_mask);
1107 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1109 /* Calculate temporary vectorial force */
1110 tx = _mm256_mul_ps(fscal,dx33);
1111 ty = _mm256_mul_ps(fscal,dy33);
1112 tz = _mm256_mul_ps(fscal,dz33);
1114 /* Update vectorial force */
1115 fix3 = _mm256_add_ps(fix3,tx);
1116 fiy3 = _mm256_add_ps(fiy3,ty);
1117 fiz3 = _mm256_add_ps(fiz3,tz);
1119 fjx3 = _mm256_add_ps(fjx3,tx);
1120 fjy3 = _mm256_add_ps(fjy3,ty);
1121 fjz3 = _mm256_add_ps(fjz3,tz);
1125 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1126 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1127 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1128 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1129 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1130 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1131 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1132 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1134 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1135 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1136 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1138 /* Inner loop uses 324 flops */
1141 /* End of innermost loop */
1143 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1144 f+i_coord_offset+DIM,fshift+i_shift_offset);
1147 /* Update potential energies */
1148 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1150 /* Increment number of inner iterations */
1151 inneriter += j_index_end - j_index_start;
1153 /* Outer loop uses 19 flops */
1156 /* Increment number of outer iterations */
1159 /* Update outer/inner flops */
1161 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*324);
1164 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_single
1165 * Electrostatics interaction: ReactionField
1166 * VdW interaction: None
1167 * Geometry: Water4-Water4
1168 * Calculate force/pot: Force
1171 nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_avx_256_single
1172 (t_nblist * gmx_restrict nlist,
1173 rvec * gmx_restrict xx,
1174 rvec * gmx_restrict ff,
1175 t_forcerec * gmx_restrict fr,
1176 t_mdatoms * gmx_restrict mdatoms,
1177 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1178 t_nrnb * gmx_restrict nrnb)
1180 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1181 * just 0 for non-waters.
1182 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1183 * jnr indices corresponding to data put in the four positions in the SIMD register.
1185 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1186 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1187 int jnrA,jnrB,jnrC,jnrD;
1188 int jnrE,jnrF,jnrG,jnrH;
1189 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1190 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1191 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1192 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1193 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1194 real rcutoff_scalar;
1195 real *shiftvec,*fshift,*x,*f;
1196 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1197 real scratch[4*DIM];
1198 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1199 real * vdwioffsetptr1;
1200 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1201 real * vdwioffsetptr2;
1202 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1203 real * vdwioffsetptr3;
1204 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1205 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1206 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1207 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1208 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1209 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1210 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1211 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1212 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1213 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1214 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1215 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1216 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1217 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1218 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1219 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1220 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1222 __m256 dummy_mask,cutoff_mask;
1223 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1224 __m256 one = _mm256_set1_ps(1.0);
1225 __m256 two = _mm256_set1_ps(2.0);
1231 jindex = nlist->jindex;
1233 shiftidx = nlist->shift;
1235 shiftvec = fr->shift_vec[0];
1236 fshift = fr->fshift[0];
1237 facel = _mm256_set1_ps(fr->epsfac);
1238 charge = mdatoms->chargeA;
1239 krf = _mm256_set1_ps(fr->ic->k_rf);
1240 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
1241 crf = _mm256_set1_ps(fr->ic->c_rf);
1243 /* Setup water-specific parameters */
1244 inr = nlist->iinr[0];
1245 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1246 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1247 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1249 jq1 = _mm256_set1_ps(charge[inr+1]);
1250 jq2 = _mm256_set1_ps(charge[inr+2]);
1251 jq3 = _mm256_set1_ps(charge[inr+3]);
1252 qq11 = _mm256_mul_ps(iq1,jq1);
1253 qq12 = _mm256_mul_ps(iq1,jq2);
1254 qq13 = _mm256_mul_ps(iq1,jq3);
1255 qq21 = _mm256_mul_ps(iq2,jq1);
1256 qq22 = _mm256_mul_ps(iq2,jq2);
1257 qq23 = _mm256_mul_ps(iq2,jq3);
1258 qq31 = _mm256_mul_ps(iq3,jq1);
1259 qq32 = _mm256_mul_ps(iq3,jq2);
1260 qq33 = _mm256_mul_ps(iq3,jq3);
1262 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1263 rcutoff_scalar = fr->rcoulomb;
1264 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1265 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1267 /* Avoid stupid compiler warnings */
1268 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1269 j_coord_offsetA = 0;
1270 j_coord_offsetB = 0;
1271 j_coord_offsetC = 0;
1272 j_coord_offsetD = 0;
1273 j_coord_offsetE = 0;
1274 j_coord_offsetF = 0;
1275 j_coord_offsetG = 0;
1276 j_coord_offsetH = 0;
1281 for(iidx=0;iidx<4*DIM;iidx++)
1283 scratch[iidx] = 0.0;
1286 /* Start outer loop over neighborlists */
1287 for(iidx=0; iidx<nri; iidx++)
1289 /* Load shift vector for this list */
1290 i_shift_offset = DIM*shiftidx[iidx];
1292 /* Load limits for loop over neighbors */
1293 j_index_start = jindex[iidx];
1294 j_index_end = jindex[iidx+1];
1296 /* Get outer coordinate index */
1298 i_coord_offset = DIM*inr;
1300 /* Load i particle coords and add shift vector */
1301 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1302 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1304 fix1 = _mm256_setzero_ps();
1305 fiy1 = _mm256_setzero_ps();
1306 fiz1 = _mm256_setzero_ps();
1307 fix2 = _mm256_setzero_ps();
1308 fiy2 = _mm256_setzero_ps();
1309 fiz2 = _mm256_setzero_ps();
1310 fix3 = _mm256_setzero_ps();
1311 fiy3 = _mm256_setzero_ps();
1312 fiz3 = _mm256_setzero_ps();
1314 /* Start inner kernel loop */
1315 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1318 /* Get j neighbor index, and coordinate index */
1320 jnrB = jjnr[jidx+1];
1321 jnrC = jjnr[jidx+2];
1322 jnrD = jjnr[jidx+3];
1323 jnrE = jjnr[jidx+4];
1324 jnrF = jjnr[jidx+5];
1325 jnrG = jjnr[jidx+6];
1326 jnrH = jjnr[jidx+7];
1327 j_coord_offsetA = DIM*jnrA;
1328 j_coord_offsetB = DIM*jnrB;
1329 j_coord_offsetC = DIM*jnrC;
1330 j_coord_offsetD = DIM*jnrD;
1331 j_coord_offsetE = DIM*jnrE;
1332 j_coord_offsetF = DIM*jnrF;
1333 j_coord_offsetG = DIM*jnrG;
1334 j_coord_offsetH = DIM*jnrH;
1336 /* load j atom coordinates */
1337 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1338 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1339 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1340 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1341 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1343 /* Calculate displacement vector */
1344 dx11 = _mm256_sub_ps(ix1,jx1);
1345 dy11 = _mm256_sub_ps(iy1,jy1);
1346 dz11 = _mm256_sub_ps(iz1,jz1);
1347 dx12 = _mm256_sub_ps(ix1,jx2);
1348 dy12 = _mm256_sub_ps(iy1,jy2);
1349 dz12 = _mm256_sub_ps(iz1,jz2);
1350 dx13 = _mm256_sub_ps(ix1,jx3);
1351 dy13 = _mm256_sub_ps(iy1,jy3);
1352 dz13 = _mm256_sub_ps(iz1,jz3);
1353 dx21 = _mm256_sub_ps(ix2,jx1);
1354 dy21 = _mm256_sub_ps(iy2,jy1);
1355 dz21 = _mm256_sub_ps(iz2,jz1);
1356 dx22 = _mm256_sub_ps(ix2,jx2);
1357 dy22 = _mm256_sub_ps(iy2,jy2);
1358 dz22 = _mm256_sub_ps(iz2,jz2);
1359 dx23 = _mm256_sub_ps(ix2,jx3);
1360 dy23 = _mm256_sub_ps(iy2,jy3);
1361 dz23 = _mm256_sub_ps(iz2,jz3);
1362 dx31 = _mm256_sub_ps(ix3,jx1);
1363 dy31 = _mm256_sub_ps(iy3,jy1);
1364 dz31 = _mm256_sub_ps(iz3,jz1);
1365 dx32 = _mm256_sub_ps(ix3,jx2);
1366 dy32 = _mm256_sub_ps(iy3,jy2);
1367 dz32 = _mm256_sub_ps(iz3,jz2);
1368 dx33 = _mm256_sub_ps(ix3,jx3);
1369 dy33 = _mm256_sub_ps(iy3,jy3);
1370 dz33 = _mm256_sub_ps(iz3,jz3);
1372 /* Calculate squared distance and things based on it */
1373 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1374 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1375 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1376 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1377 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1378 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1379 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1380 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1381 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1383 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1384 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1385 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1386 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1387 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1388 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1389 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1390 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1391 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1393 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1394 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1395 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1396 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1397 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1398 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1399 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1400 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1401 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1403 fjx1 = _mm256_setzero_ps();
1404 fjy1 = _mm256_setzero_ps();
1405 fjz1 = _mm256_setzero_ps();
1406 fjx2 = _mm256_setzero_ps();
1407 fjy2 = _mm256_setzero_ps();
1408 fjz2 = _mm256_setzero_ps();
1409 fjx3 = _mm256_setzero_ps();
1410 fjy3 = _mm256_setzero_ps();
1411 fjz3 = _mm256_setzero_ps();
1413 /**************************
1414 * CALCULATE INTERACTIONS *
1415 **************************/
1417 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1420 /* REACTION-FIELD ELECTROSTATICS */
1421 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1423 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1427 fscal = _mm256_and_ps(fscal,cutoff_mask);
1429 /* Calculate temporary vectorial force */
1430 tx = _mm256_mul_ps(fscal,dx11);
1431 ty = _mm256_mul_ps(fscal,dy11);
1432 tz = _mm256_mul_ps(fscal,dz11);
1434 /* Update vectorial force */
1435 fix1 = _mm256_add_ps(fix1,tx);
1436 fiy1 = _mm256_add_ps(fiy1,ty);
1437 fiz1 = _mm256_add_ps(fiz1,tz);
1439 fjx1 = _mm256_add_ps(fjx1,tx);
1440 fjy1 = _mm256_add_ps(fjy1,ty);
1441 fjz1 = _mm256_add_ps(fjz1,tz);
1445 /**************************
1446 * CALCULATE INTERACTIONS *
1447 **************************/
1449 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1452 /* REACTION-FIELD ELECTROSTATICS */
1453 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1455 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1459 fscal = _mm256_and_ps(fscal,cutoff_mask);
1461 /* Calculate temporary vectorial force */
1462 tx = _mm256_mul_ps(fscal,dx12);
1463 ty = _mm256_mul_ps(fscal,dy12);
1464 tz = _mm256_mul_ps(fscal,dz12);
1466 /* Update vectorial force */
1467 fix1 = _mm256_add_ps(fix1,tx);
1468 fiy1 = _mm256_add_ps(fiy1,ty);
1469 fiz1 = _mm256_add_ps(fiz1,tz);
1471 fjx2 = _mm256_add_ps(fjx2,tx);
1472 fjy2 = _mm256_add_ps(fjy2,ty);
1473 fjz2 = _mm256_add_ps(fjz2,tz);
1477 /**************************
1478 * CALCULATE INTERACTIONS *
1479 **************************/
1481 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1484 /* REACTION-FIELD ELECTROSTATICS */
1485 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1487 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1491 fscal = _mm256_and_ps(fscal,cutoff_mask);
1493 /* Calculate temporary vectorial force */
1494 tx = _mm256_mul_ps(fscal,dx13);
1495 ty = _mm256_mul_ps(fscal,dy13);
1496 tz = _mm256_mul_ps(fscal,dz13);
1498 /* Update vectorial force */
1499 fix1 = _mm256_add_ps(fix1,tx);
1500 fiy1 = _mm256_add_ps(fiy1,ty);
1501 fiz1 = _mm256_add_ps(fiz1,tz);
1503 fjx3 = _mm256_add_ps(fjx3,tx);
1504 fjy3 = _mm256_add_ps(fjy3,ty);
1505 fjz3 = _mm256_add_ps(fjz3,tz);
1509 /**************************
1510 * CALCULATE INTERACTIONS *
1511 **************************/
1513 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1516 /* REACTION-FIELD ELECTROSTATICS */
1517 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1519 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1523 fscal = _mm256_and_ps(fscal,cutoff_mask);
1525 /* Calculate temporary vectorial force */
1526 tx = _mm256_mul_ps(fscal,dx21);
1527 ty = _mm256_mul_ps(fscal,dy21);
1528 tz = _mm256_mul_ps(fscal,dz21);
1530 /* Update vectorial force */
1531 fix2 = _mm256_add_ps(fix2,tx);
1532 fiy2 = _mm256_add_ps(fiy2,ty);
1533 fiz2 = _mm256_add_ps(fiz2,tz);
1535 fjx1 = _mm256_add_ps(fjx1,tx);
1536 fjy1 = _mm256_add_ps(fjy1,ty);
1537 fjz1 = _mm256_add_ps(fjz1,tz);
1541 /**************************
1542 * CALCULATE INTERACTIONS *
1543 **************************/
1545 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1548 /* REACTION-FIELD ELECTROSTATICS */
1549 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1551 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1555 fscal = _mm256_and_ps(fscal,cutoff_mask);
1557 /* Calculate temporary vectorial force */
1558 tx = _mm256_mul_ps(fscal,dx22);
1559 ty = _mm256_mul_ps(fscal,dy22);
1560 tz = _mm256_mul_ps(fscal,dz22);
1562 /* Update vectorial force */
1563 fix2 = _mm256_add_ps(fix2,tx);
1564 fiy2 = _mm256_add_ps(fiy2,ty);
1565 fiz2 = _mm256_add_ps(fiz2,tz);
1567 fjx2 = _mm256_add_ps(fjx2,tx);
1568 fjy2 = _mm256_add_ps(fjy2,ty);
1569 fjz2 = _mm256_add_ps(fjz2,tz);
1573 /**************************
1574 * CALCULATE INTERACTIONS *
1575 **************************/
1577 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1580 /* REACTION-FIELD ELECTROSTATICS */
1581 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1583 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1587 fscal = _mm256_and_ps(fscal,cutoff_mask);
1589 /* Calculate temporary vectorial force */
1590 tx = _mm256_mul_ps(fscal,dx23);
1591 ty = _mm256_mul_ps(fscal,dy23);
1592 tz = _mm256_mul_ps(fscal,dz23);
1594 /* Update vectorial force */
1595 fix2 = _mm256_add_ps(fix2,tx);
1596 fiy2 = _mm256_add_ps(fiy2,ty);
1597 fiz2 = _mm256_add_ps(fiz2,tz);
1599 fjx3 = _mm256_add_ps(fjx3,tx);
1600 fjy3 = _mm256_add_ps(fjy3,ty);
1601 fjz3 = _mm256_add_ps(fjz3,tz);
1605 /**************************
1606 * CALCULATE INTERACTIONS *
1607 **************************/
1609 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1612 /* REACTION-FIELD ELECTROSTATICS */
1613 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1615 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1619 fscal = _mm256_and_ps(fscal,cutoff_mask);
1621 /* Calculate temporary vectorial force */
1622 tx = _mm256_mul_ps(fscal,dx31);
1623 ty = _mm256_mul_ps(fscal,dy31);
1624 tz = _mm256_mul_ps(fscal,dz31);
1626 /* Update vectorial force */
1627 fix3 = _mm256_add_ps(fix3,tx);
1628 fiy3 = _mm256_add_ps(fiy3,ty);
1629 fiz3 = _mm256_add_ps(fiz3,tz);
1631 fjx1 = _mm256_add_ps(fjx1,tx);
1632 fjy1 = _mm256_add_ps(fjy1,ty);
1633 fjz1 = _mm256_add_ps(fjz1,tz);
1637 /**************************
1638 * CALCULATE INTERACTIONS *
1639 **************************/
1641 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1644 /* REACTION-FIELD ELECTROSTATICS */
1645 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1647 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1651 fscal = _mm256_and_ps(fscal,cutoff_mask);
1653 /* Calculate temporary vectorial force */
1654 tx = _mm256_mul_ps(fscal,dx32);
1655 ty = _mm256_mul_ps(fscal,dy32);
1656 tz = _mm256_mul_ps(fscal,dz32);
1658 /* Update vectorial force */
1659 fix3 = _mm256_add_ps(fix3,tx);
1660 fiy3 = _mm256_add_ps(fiy3,ty);
1661 fiz3 = _mm256_add_ps(fiz3,tz);
1663 fjx2 = _mm256_add_ps(fjx2,tx);
1664 fjy2 = _mm256_add_ps(fjy2,ty);
1665 fjz2 = _mm256_add_ps(fjz2,tz);
1669 /**************************
1670 * CALCULATE INTERACTIONS *
1671 **************************/
1673 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1676 /* REACTION-FIELD ELECTROSTATICS */
1677 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1679 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1683 fscal = _mm256_and_ps(fscal,cutoff_mask);
1685 /* Calculate temporary vectorial force */
1686 tx = _mm256_mul_ps(fscal,dx33);
1687 ty = _mm256_mul_ps(fscal,dy33);
1688 tz = _mm256_mul_ps(fscal,dz33);
1690 /* Update vectorial force */
1691 fix3 = _mm256_add_ps(fix3,tx);
1692 fiy3 = _mm256_add_ps(fiy3,ty);
1693 fiz3 = _mm256_add_ps(fiz3,tz);
1695 fjx3 = _mm256_add_ps(fjx3,tx);
1696 fjy3 = _mm256_add_ps(fjy3,ty);
1697 fjz3 = _mm256_add_ps(fjz3,tz);
1701 fjptrA = f+j_coord_offsetA;
1702 fjptrB = f+j_coord_offsetB;
1703 fjptrC = f+j_coord_offsetC;
1704 fjptrD = f+j_coord_offsetD;
1705 fjptrE = f+j_coord_offsetE;
1706 fjptrF = f+j_coord_offsetF;
1707 fjptrG = f+j_coord_offsetG;
1708 fjptrH = f+j_coord_offsetH;
1710 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1711 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1712 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1714 /* Inner loop uses 270 flops */
1717 if(jidx<j_index_end)
1720 /* Get j neighbor index, and coordinate index */
1721 jnrlistA = jjnr[jidx];
1722 jnrlistB = jjnr[jidx+1];
1723 jnrlistC = jjnr[jidx+2];
1724 jnrlistD = jjnr[jidx+3];
1725 jnrlistE = jjnr[jidx+4];
1726 jnrlistF = jjnr[jidx+5];
1727 jnrlistG = jjnr[jidx+6];
1728 jnrlistH = jjnr[jidx+7];
1729 /* Sign of each element will be negative for non-real atoms.
1730 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1731 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1733 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1734 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1736 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1737 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1738 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1739 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1740 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1741 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1742 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1743 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1744 j_coord_offsetA = DIM*jnrA;
1745 j_coord_offsetB = DIM*jnrB;
1746 j_coord_offsetC = DIM*jnrC;
1747 j_coord_offsetD = DIM*jnrD;
1748 j_coord_offsetE = DIM*jnrE;
1749 j_coord_offsetF = DIM*jnrF;
1750 j_coord_offsetG = DIM*jnrG;
1751 j_coord_offsetH = DIM*jnrH;
1753 /* load j atom coordinates */
1754 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1755 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1756 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1757 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1758 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1760 /* Calculate displacement vector */
1761 dx11 = _mm256_sub_ps(ix1,jx1);
1762 dy11 = _mm256_sub_ps(iy1,jy1);
1763 dz11 = _mm256_sub_ps(iz1,jz1);
1764 dx12 = _mm256_sub_ps(ix1,jx2);
1765 dy12 = _mm256_sub_ps(iy1,jy2);
1766 dz12 = _mm256_sub_ps(iz1,jz2);
1767 dx13 = _mm256_sub_ps(ix1,jx3);
1768 dy13 = _mm256_sub_ps(iy1,jy3);
1769 dz13 = _mm256_sub_ps(iz1,jz3);
1770 dx21 = _mm256_sub_ps(ix2,jx1);
1771 dy21 = _mm256_sub_ps(iy2,jy1);
1772 dz21 = _mm256_sub_ps(iz2,jz1);
1773 dx22 = _mm256_sub_ps(ix2,jx2);
1774 dy22 = _mm256_sub_ps(iy2,jy2);
1775 dz22 = _mm256_sub_ps(iz2,jz2);
1776 dx23 = _mm256_sub_ps(ix2,jx3);
1777 dy23 = _mm256_sub_ps(iy2,jy3);
1778 dz23 = _mm256_sub_ps(iz2,jz3);
1779 dx31 = _mm256_sub_ps(ix3,jx1);
1780 dy31 = _mm256_sub_ps(iy3,jy1);
1781 dz31 = _mm256_sub_ps(iz3,jz1);
1782 dx32 = _mm256_sub_ps(ix3,jx2);
1783 dy32 = _mm256_sub_ps(iy3,jy2);
1784 dz32 = _mm256_sub_ps(iz3,jz2);
1785 dx33 = _mm256_sub_ps(ix3,jx3);
1786 dy33 = _mm256_sub_ps(iy3,jy3);
1787 dz33 = _mm256_sub_ps(iz3,jz3);
1789 /* Calculate squared distance and things based on it */
1790 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1791 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1792 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1793 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1794 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1795 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1796 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1797 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1798 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1800 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1801 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1802 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1803 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1804 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1805 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1806 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1807 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1808 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1810 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1811 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1812 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1813 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1814 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1815 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1816 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1817 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1818 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1820 fjx1 = _mm256_setzero_ps();
1821 fjy1 = _mm256_setzero_ps();
1822 fjz1 = _mm256_setzero_ps();
1823 fjx2 = _mm256_setzero_ps();
1824 fjy2 = _mm256_setzero_ps();
1825 fjz2 = _mm256_setzero_ps();
1826 fjx3 = _mm256_setzero_ps();
1827 fjy3 = _mm256_setzero_ps();
1828 fjz3 = _mm256_setzero_ps();
1830 /**************************
1831 * CALCULATE INTERACTIONS *
1832 **************************/
1834 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1837 /* REACTION-FIELD ELECTROSTATICS */
1838 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1840 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1844 fscal = _mm256_and_ps(fscal,cutoff_mask);
1846 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1848 /* Calculate temporary vectorial force */
1849 tx = _mm256_mul_ps(fscal,dx11);
1850 ty = _mm256_mul_ps(fscal,dy11);
1851 tz = _mm256_mul_ps(fscal,dz11);
1853 /* Update vectorial force */
1854 fix1 = _mm256_add_ps(fix1,tx);
1855 fiy1 = _mm256_add_ps(fiy1,ty);
1856 fiz1 = _mm256_add_ps(fiz1,tz);
1858 fjx1 = _mm256_add_ps(fjx1,tx);
1859 fjy1 = _mm256_add_ps(fjy1,ty);
1860 fjz1 = _mm256_add_ps(fjz1,tz);
1864 /**************************
1865 * CALCULATE INTERACTIONS *
1866 **************************/
1868 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1871 /* REACTION-FIELD ELECTROSTATICS */
1872 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1874 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1878 fscal = _mm256_and_ps(fscal,cutoff_mask);
1880 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1882 /* Calculate temporary vectorial force */
1883 tx = _mm256_mul_ps(fscal,dx12);
1884 ty = _mm256_mul_ps(fscal,dy12);
1885 tz = _mm256_mul_ps(fscal,dz12);
1887 /* Update vectorial force */
1888 fix1 = _mm256_add_ps(fix1,tx);
1889 fiy1 = _mm256_add_ps(fiy1,ty);
1890 fiz1 = _mm256_add_ps(fiz1,tz);
1892 fjx2 = _mm256_add_ps(fjx2,tx);
1893 fjy2 = _mm256_add_ps(fjy2,ty);
1894 fjz2 = _mm256_add_ps(fjz2,tz);
1898 /**************************
1899 * CALCULATE INTERACTIONS *
1900 **************************/
1902 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1905 /* REACTION-FIELD ELECTROSTATICS */
1906 felec = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1908 cutoff_mask = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1912 fscal = _mm256_and_ps(fscal,cutoff_mask);
1914 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1916 /* Calculate temporary vectorial force */
1917 tx = _mm256_mul_ps(fscal,dx13);
1918 ty = _mm256_mul_ps(fscal,dy13);
1919 tz = _mm256_mul_ps(fscal,dz13);
1921 /* Update vectorial force */
1922 fix1 = _mm256_add_ps(fix1,tx);
1923 fiy1 = _mm256_add_ps(fiy1,ty);
1924 fiz1 = _mm256_add_ps(fiz1,tz);
1926 fjx3 = _mm256_add_ps(fjx3,tx);
1927 fjy3 = _mm256_add_ps(fjy3,ty);
1928 fjz3 = _mm256_add_ps(fjz3,tz);
1932 /**************************
1933 * CALCULATE INTERACTIONS *
1934 **************************/
1936 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1939 /* REACTION-FIELD ELECTROSTATICS */
1940 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1942 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1946 fscal = _mm256_and_ps(fscal,cutoff_mask);
1948 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1950 /* Calculate temporary vectorial force */
1951 tx = _mm256_mul_ps(fscal,dx21);
1952 ty = _mm256_mul_ps(fscal,dy21);
1953 tz = _mm256_mul_ps(fscal,dz21);
1955 /* Update vectorial force */
1956 fix2 = _mm256_add_ps(fix2,tx);
1957 fiy2 = _mm256_add_ps(fiy2,ty);
1958 fiz2 = _mm256_add_ps(fiz2,tz);
1960 fjx1 = _mm256_add_ps(fjx1,tx);
1961 fjy1 = _mm256_add_ps(fjy1,ty);
1962 fjz1 = _mm256_add_ps(fjz1,tz);
1966 /**************************
1967 * CALCULATE INTERACTIONS *
1968 **************************/
1970 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1973 /* REACTION-FIELD ELECTROSTATICS */
1974 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1976 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1980 fscal = _mm256_and_ps(fscal,cutoff_mask);
1982 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1984 /* Calculate temporary vectorial force */
1985 tx = _mm256_mul_ps(fscal,dx22);
1986 ty = _mm256_mul_ps(fscal,dy22);
1987 tz = _mm256_mul_ps(fscal,dz22);
1989 /* Update vectorial force */
1990 fix2 = _mm256_add_ps(fix2,tx);
1991 fiy2 = _mm256_add_ps(fiy2,ty);
1992 fiz2 = _mm256_add_ps(fiz2,tz);
1994 fjx2 = _mm256_add_ps(fjx2,tx);
1995 fjy2 = _mm256_add_ps(fjy2,ty);
1996 fjz2 = _mm256_add_ps(fjz2,tz);
2000 /**************************
2001 * CALCULATE INTERACTIONS *
2002 **************************/
2004 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2007 /* REACTION-FIELD ELECTROSTATICS */
2008 felec = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
2010 cutoff_mask = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2014 fscal = _mm256_and_ps(fscal,cutoff_mask);
2016 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2018 /* Calculate temporary vectorial force */
2019 tx = _mm256_mul_ps(fscal,dx23);
2020 ty = _mm256_mul_ps(fscal,dy23);
2021 tz = _mm256_mul_ps(fscal,dz23);
2023 /* Update vectorial force */
2024 fix2 = _mm256_add_ps(fix2,tx);
2025 fiy2 = _mm256_add_ps(fiy2,ty);
2026 fiz2 = _mm256_add_ps(fiz2,tz);
2028 fjx3 = _mm256_add_ps(fjx3,tx);
2029 fjy3 = _mm256_add_ps(fjy3,ty);
2030 fjz3 = _mm256_add_ps(fjz3,tz);
2034 /**************************
2035 * CALCULATE INTERACTIONS *
2036 **************************/
2038 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2041 /* REACTION-FIELD ELECTROSTATICS */
2042 felec = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
2044 cutoff_mask = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2048 fscal = _mm256_and_ps(fscal,cutoff_mask);
2050 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2052 /* Calculate temporary vectorial force */
2053 tx = _mm256_mul_ps(fscal,dx31);
2054 ty = _mm256_mul_ps(fscal,dy31);
2055 tz = _mm256_mul_ps(fscal,dz31);
2057 /* Update vectorial force */
2058 fix3 = _mm256_add_ps(fix3,tx);
2059 fiy3 = _mm256_add_ps(fiy3,ty);
2060 fiz3 = _mm256_add_ps(fiz3,tz);
2062 fjx1 = _mm256_add_ps(fjx1,tx);
2063 fjy1 = _mm256_add_ps(fjy1,ty);
2064 fjz1 = _mm256_add_ps(fjz1,tz);
2068 /**************************
2069 * CALCULATE INTERACTIONS *
2070 **************************/
2072 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2075 /* REACTION-FIELD ELECTROSTATICS */
2076 felec = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
2078 cutoff_mask = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2082 fscal = _mm256_and_ps(fscal,cutoff_mask);
2084 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2086 /* Calculate temporary vectorial force */
2087 tx = _mm256_mul_ps(fscal,dx32);
2088 ty = _mm256_mul_ps(fscal,dy32);
2089 tz = _mm256_mul_ps(fscal,dz32);
2091 /* Update vectorial force */
2092 fix3 = _mm256_add_ps(fix3,tx);
2093 fiy3 = _mm256_add_ps(fiy3,ty);
2094 fiz3 = _mm256_add_ps(fiz3,tz);
2096 fjx2 = _mm256_add_ps(fjx2,tx);
2097 fjy2 = _mm256_add_ps(fjy2,ty);
2098 fjz2 = _mm256_add_ps(fjz2,tz);
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2106 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2109 /* REACTION-FIELD ELECTROSTATICS */
2110 felec = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
2112 cutoff_mask = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2116 fscal = _mm256_and_ps(fscal,cutoff_mask);
2118 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2120 /* Calculate temporary vectorial force */
2121 tx = _mm256_mul_ps(fscal,dx33);
2122 ty = _mm256_mul_ps(fscal,dy33);
2123 tz = _mm256_mul_ps(fscal,dz33);
2125 /* Update vectorial force */
2126 fix3 = _mm256_add_ps(fix3,tx);
2127 fiy3 = _mm256_add_ps(fiy3,ty);
2128 fiz3 = _mm256_add_ps(fiz3,tz);
2130 fjx3 = _mm256_add_ps(fjx3,tx);
2131 fjy3 = _mm256_add_ps(fjy3,ty);
2132 fjz3 = _mm256_add_ps(fjz3,tz);
2136 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2137 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2138 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2139 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2140 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2141 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2142 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2143 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2145 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2146 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2147 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2149 /* Inner loop uses 270 flops */
2152 /* End of innermost loop */
2154 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2155 f+i_coord_offset+DIM,fshift+i_shift_offset);
2157 /* Increment number of inner iterations */
2158 inneriter += j_index_end - j_index_start;
2160 /* Outer loop uses 18 flops */
2163 /* Increment number of outer iterations */
2166 /* Update outer/inner flops */
2168 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);