2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 real * vdwioffsetptr1;
86 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
107 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
111 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
112 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
113 real rswitch_scalar,d_scalar;
114 __m256d dummy_mask,cutoff_mask;
115 __m128 tmpmask0,tmpmask1;
116 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
117 __m256d one = _mm256_set1_pd(1.0);
118 __m256d two = _mm256_set1_pd(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm256_set1_pd(fr->ic->epsfac);
131 charge = mdatoms->chargeA;
132 krf = _mm256_set1_pd(fr->ic->k_rf);
133 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
134 crf = _mm256_set1_pd(fr->ic->c_rf);
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
142 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
143 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
144 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
146 jq0 = _mm256_set1_pd(charge[inr+0]);
147 jq1 = _mm256_set1_pd(charge[inr+1]);
148 jq2 = _mm256_set1_pd(charge[inr+2]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 qq00 = _mm256_mul_pd(iq0,jq0);
151 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
152 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
153 qq01 = _mm256_mul_pd(iq0,jq1);
154 qq02 = _mm256_mul_pd(iq0,jq2);
155 qq10 = _mm256_mul_pd(iq1,jq0);
156 qq11 = _mm256_mul_pd(iq1,jq1);
157 qq12 = _mm256_mul_pd(iq1,jq2);
158 qq20 = _mm256_mul_pd(iq2,jq0);
159 qq21 = _mm256_mul_pd(iq2,jq1);
160 qq22 = _mm256_mul_pd(iq2,jq2);
162 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163 rcutoff_scalar = fr->ic->rcoulomb;
164 rcutoff = _mm256_set1_pd(rcutoff_scalar);
165 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
167 rswitch_scalar = fr->ic->rvdw_switch;
168 rswitch = _mm256_set1_pd(rswitch_scalar);
169 /* Setup switch parameters */
170 d_scalar = rcutoff_scalar-rswitch_scalar;
171 d = _mm256_set1_pd(d_scalar);
172 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
173 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
175 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
176 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179 /* Avoid stupid compiler warnings */
180 jnrA = jnrB = jnrC = jnrD = 0;
189 for(iidx=0;iidx<4*DIM;iidx++)
194 /* Start outer loop over neighborlists */
195 for(iidx=0; iidx<nri; iidx++)
197 /* Load shift vector for this list */
198 i_shift_offset = DIM*shiftidx[iidx];
200 /* Load limits for loop over neighbors */
201 j_index_start = jindex[iidx];
202 j_index_end = jindex[iidx+1];
204 /* Get outer coordinate index */
206 i_coord_offset = DIM*inr;
208 /* Load i particle coords and add shift vector */
209 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
210 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
212 fix0 = _mm256_setzero_pd();
213 fiy0 = _mm256_setzero_pd();
214 fiz0 = _mm256_setzero_pd();
215 fix1 = _mm256_setzero_pd();
216 fiy1 = _mm256_setzero_pd();
217 fiz1 = _mm256_setzero_pd();
218 fix2 = _mm256_setzero_pd();
219 fiy2 = _mm256_setzero_pd();
220 fiz2 = _mm256_setzero_pd();
222 /* Reset potential sums */
223 velecsum = _mm256_setzero_pd();
224 vvdwsum = _mm256_setzero_pd();
226 /* Start inner kernel loop */
227 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
230 /* Get j neighbor index, and coordinate index */
235 j_coord_offsetA = DIM*jnrA;
236 j_coord_offsetB = DIM*jnrB;
237 j_coord_offsetC = DIM*jnrC;
238 j_coord_offsetD = DIM*jnrD;
240 /* load j atom coordinates */
241 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
242 x+j_coord_offsetC,x+j_coord_offsetD,
243 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
245 /* Calculate displacement vector */
246 dx00 = _mm256_sub_pd(ix0,jx0);
247 dy00 = _mm256_sub_pd(iy0,jy0);
248 dz00 = _mm256_sub_pd(iz0,jz0);
249 dx01 = _mm256_sub_pd(ix0,jx1);
250 dy01 = _mm256_sub_pd(iy0,jy1);
251 dz01 = _mm256_sub_pd(iz0,jz1);
252 dx02 = _mm256_sub_pd(ix0,jx2);
253 dy02 = _mm256_sub_pd(iy0,jy2);
254 dz02 = _mm256_sub_pd(iz0,jz2);
255 dx10 = _mm256_sub_pd(ix1,jx0);
256 dy10 = _mm256_sub_pd(iy1,jy0);
257 dz10 = _mm256_sub_pd(iz1,jz0);
258 dx11 = _mm256_sub_pd(ix1,jx1);
259 dy11 = _mm256_sub_pd(iy1,jy1);
260 dz11 = _mm256_sub_pd(iz1,jz1);
261 dx12 = _mm256_sub_pd(ix1,jx2);
262 dy12 = _mm256_sub_pd(iy1,jy2);
263 dz12 = _mm256_sub_pd(iz1,jz2);
264 dx20 = _mm256_sub_pd(ix2,jx0);
265 dy20 = _mm256_sub_pd(iy2,jy0);
266 dz20 = _mm256_sub_pd(iz2,jz0);
267 dx21 = _mm256_sub_pd(ix2,jx1);
268 dy21 = _mm256_sub_pd(iy2,jy1);
269 dz21 = _mm256_sub_pd(iz2,jz1);
270 dx22 = _mm256_sub_pd(ix2,jx2);
271 dy22 = _mm256_sub_pd(iy2,jy2);
272 dz22 = _mm256_sub_pd(iz2,jz2);
274 /* Calculate squared distance and things based on it */
275 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
276 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
277 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
278 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
279 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
280 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
281 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
282 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
283 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
285 rinv00 = avx256_invsqrt_d(rsq00);
286 rinv01 = avx256_invsqrt_d(rsq01);
287 rinv02 = avx256_invsqrt_d(rsq02);
288 rinv10 = avx256_invsqrt_d(rsq10);
289 rinv11 = avx256_invsqrt_d(rsq11);
290 rinv12 = avx256_invsqrt_d(rsq12);
291 rinv20 = avx256_invsqrt_d(rsq20);
292 rinv21 = avx256_invsqrt_d(rsq21);
293 rinv22 = avx256_invsqrt_d(rsq22);
295 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
296 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
297 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
298 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
299 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
300 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
301 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
302 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
303 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
305 fjx0 = _mm256_setzero_pd();
306 fjy0 = _mm256_setzero_pd();
307 fjz0 = _mm256_setzero_pd();
308 fjx1 = _mm256_setzero_pd();
309 fjy1 = _mm256_setzero_pd();
310 fjz1 = _mm256_setzero_pd();
311 fjx2 = _mm256_setzero_pd();
312 fjy2 = _mm256_setzero_pd();
313 fjz2 = _mm256_setzero_pd();
315 /**************************
316 * CALCULATE INTERACTIONS *
317 **************************/
319 if (gmx_mm256_any_lt(rsq00,rcutoff2))
322 r00 = _mm256_mul_pd(rsq00,rinv00);
324 /* REACTION-FIELD ELECTROSTATICS */
325 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
326 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
328 /* LENNARD-JONES DISPERSION/REPULSION */
330 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
331 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
332 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
333 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
334 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
336 d = _mm256_sub_pd(r00,rswitch);
337 d = _mm256_max_pd(d,_mm256_setzero_pd());
338 d2 = _mm256_mul_pd(d,d);
339 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
341 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
343 /* Evaluate switch function */
344 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
345 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
346 vvdw = _mm256_mul_pd(vvdw,sw);
347 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velec = _mm256_and_pd(velec,cutoff_mask);
351 velecsum = _mm256_add_pd(velecsum,velec);
352 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
353 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
355 fscal = _mm256_add_pd(felec,fvdw);
357 fscal = _mm256_and_pd(fscal,cutoff_mask);
359 /* Calculate temporary vectorial force */
360 tx = _mm256_mul_pd(fscal,dx00);
361 ty = _mm256_mul_pd(fscal,dy00);
362 tz = _mm256_mul_pd(fscal,dz00);
364 /* Update vectorial force */
365 fix0 = _mm256_add_pd(fix0,tx);
366 fiy0 = _mm256_add_pd(fiy0,ty);
367 fiz0 = _mm256_add_pd(fiz0,tz);
369 fjx0 = _mm256_add_pd(fjx0,tx);
370 fjy0 = _mm256_add_pd(fjy0,ty);
371 fjz0 = _mm256_add_pd(fjz0,tz);
375 /**************************
376 * CALCULATE INTERACTIONS *
377 **************************/
379 if (gmx_mm256_any_lt(rsq01,rcutoff2))
382 /* REACTION-FIELD ELECTROSTATICS */
383 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
384 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
386 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
388 /* Update potential sum for this i atom from the interaction with this j atom. */
389 velec = _mm256_and_pd(velec,cutoff_mask);
390 velecsum = _mm256_add_pd(velecsum,velec);
394 fscal = _mm256_and_pd(fscal,cutoff_mask);
396 /* Calculate temporary vectorial force */
397 tx = _mm256_mul_pd(fscal,dx01);
398 ty = _mm256_mul_pd(fscal,dy01);
399 tz = _mm256_mul_pd(fscal,dz01);
401 /* Update vectorial force */
402 fix0 = _mm256_add_pd(fix0,tx);
403 fiy0 = _mm256_add_pd(fiy0,ty);
404 fiz0 = _mm256_add_pd(fiz0,tz);
406 fjx1 = _mm256_add_pd(fjx1,tx);
407 fjy1 = _mm256_add_pd(fjy1,ty);
408 fjz1 = _mm256_add_pd(fjz1,tz);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 if (gmx_mm256_any_lt(rsq02,rcutoff2))
419 /* REACTION-FIELD ELECTROSTATICS */
420 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
421 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
423 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
425 /* Update potential sum for this i atom from the interaction with this j atom. */
426 velec = _mm256_and_pd(velec,cutoff_mask);
427 velecsum = _mm256_add_pd(velecsum,velec);
431 fscal = _mm256_and_pd(fscal,cutoff_mask);
433 /* Calculate temporary vectorial force */
434 tx = _mm256_mul_pd(fscal,dx02);
435 ty = _mm256_mul_pd(fscal,dy02);
436 tz = _mm256_mul_pd(fscal,dz02);
438 /* Update vectorial force */
439 fix0 = _mm256_add_pd(fix0,tx);
440 fiy0 = _mm256_add_pd(fiy0,ty);
441 fiz0 = _mm256_add_pd(fiz0,tz);
443 fjx2 = _mm256_add_pd(fjx2,tx);
444 fjy2 = _mm256_add_pd(fjy2,ty);
445 fjz2 = _mm256_add_pd(fjz2,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 if (gmx_mm256_any_lt(rsq10,rcutoff2))
456 /* REACTION-FIELD ELECTROSTATICS */
457 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
458 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
460 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
462 /* Update potential sum for this i atom from the interaction with this j atom. */
463 velec = _mm256_and_pd(velec,cutoff_mask);
464 velecsum = _mm256_add_pd(velecsum,velec);
468 fscal = _mm256_and_pd(fscal,cutoff_mask);
470 /* Calculate temporary vectorial force */
471 tx = _mm256_mul_pd(fscal,dx10);
472 ty = _mm256_mul_pd(fscal,dy10);
473 tz = _mm256_mul_pd(fscal,dz10);
475 /* Update vectorial force */
476 fix1 = _mm256_add_pd(fix1,tx);
477 fiy1 = _mm256_add_pd(fiy1,ty);
478 fiz1 = _mm256_add_pd(fiz1,tz);
480 fjx0 = _mm256_add_pd(fjx0,tx);
481 fjy0 = _mm256_add_pd(fjy0,ty);
482 fjz0 = _mm256_add_pd(fjz0,tz);
486 /**************************
487 * CALCULATE INTERACTIONS *
488 **************************/
490 if (gmx_mm256_any_lt(rsq11,rcutoff2))
493 /* REACTION-FIELD ELECTROSTATICS */
494 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
495 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
497 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velec = _mm256_and_pd(velec,cutoff_mask);
501 velecsum = _mm256_add_pd(velecsum,velec);
505 fscal = _mm256_and_pd(fscal,cutoff_mask);
507 /* Calculate temporary vectorial force */
508 tx = _mm256_mul_pd(fscal,dx11);
509 ty = _mm256_mul_pd(fscal,dy11);
510 tz = _mm256_mul_pd(fscal,dz11);
512 /* Update vectorial force */
513 fix1 = _mm256_add_pd(fix1,tx);
514 fiy1 = _mm256_add_pd(fiy1,ty);
515 fiz1 = _mm256_add_pd(fiz1,tz);
517 fjx1 = _mm256_add_pd(fjx1,tx);
518 fjy1 = _mm256_add_pd(fjy1,ty);
519 fjz1 = _mm256_add_pd(fjz1,tz);
523 /**************************
524 * CALCULATE INTERACTIONS *
525 **************************/
527 if (gmx_mm256_any_lt(rsq12,rcutoff2))
530 /* REACTION-FIELD ELECTROSTATICS */
531 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
532 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
534 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velec = _mm256_and_pd(velec,cutoff_mask);
538 velecsum = _mm256_add_pd(velecsum,velec);
542 fscal = _mm256_and_pd(fscal,cutoff_mask);
544 /* Calculate temporary vectorial force */
545 tx = _mm256_mul_pd(fscal,dx12);
546 ty = _mm256_mul_pd(fscal,dy12);
547 tz = _mm256_mul_pd(fscal,dz12);
549 /* Update vectorial force */
550 fix1 = _mm256_add_pd(fix1,tx);
551 fiy1 = _mm256_add_pd(fiy1,ty);
552 fiz1 = _mm256_add_pd(fiz1,tz);
554 fjx2 = _mm256_add_pd(fjx2,tx);
555 fjy2 = _mm256_add_pd(fjy2,ty);
556 fjz2 = _mm256_add_pd(fjz2,tz);
560 /**************************
561 * CALCULATE INTERACTIONS *
562 **************************/
564 if (gmx_mm256_any_lt(rsq20,rcutoff2))
567 /* REACTION-FIELD ELECTROSTATICS */
568 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
569 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
571 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
573 /* Update potential sum for this i atom from the interaction with this j atom. */
574 velec = _mm256_and_pd(velec,cutoff_mask);
575 velecsum = _mm256_add_pd(velecsum,velec);
579 fscal = _mm256_and_pd(fscal,cutoff_mask);
581 /* Calculate temporary vectorial force */
582 tx = _mm256_mul_pd(fscal,dx20);
583 ty = _mm256_mul_pd(fscal,dy20);
584 tz = _mm256_mul_pd(fscal,dz20);
586 /* Update vectorial force */
587 fix2 = _mm256_add_pd(fix2,tx);
588 fiy2 = _mm256_add_pd(fiy2,ty);
589 fiz2 = _mm256_add_pd(fiz2,tz);
591 fjx0 = _mm256_add_pd(fjx0,tx);
592 fjy0 = _mm256_add_pd(fjy0,ty);
593 fjz0 = _mm256_add_pd(fjz0,tz);
597 /**************************
598 * CALCULATE INTERACTIONS *
599 **************************/
601 if (gmx_mm256_any_lt(rsq21,rcutoff2))
604 /* REACTION-FIELD ELECTROSTATICS */
605 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
606 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
608 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
610 /* Update potential sum for this i atom from the interaction with this j atom. */
611 velec = _mm256_and_pd(velec,cutoff_mask);
612 velecsum = _mm256_add_pd(velecsum,velec);
616 fscal = _mm256_and_pd(fscal,cutoff_mask);
618 /* Calculate temporary vectorial force */
619 tx = _mm256_mul_pd(fscal,dx21);
620 ty = _mm256_mul_pd(fscal,dy21);
621 tz = _mm256_mul_pd(fscal,dz21);
623 /* Update vectorial force */
624 fix2 = _mm256_add_pd(fix2,tx);
625 fiy2 = _mm256_add_pd(fiy2,ty);
626 fiz2 = _mm256_add_pd(fiz2,tz);
628 fjx1 = _mm256_add_pd(fjx1,tx);
629 fjy1 = _mm256_add_pd(fjy1,ty);
630 fjz1 = _mm256_add_pd(fjz1,tz);
634 /**************************
635 * CALCULATE INTERACTIONS *
636 **************************/
638 if (gmx_mm256_any_lt(rsq22,rcutoff2))
641 /* REACTION-FIELD ELECTROSTATICS */
642 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
643 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
645 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
647 /* Update potential sum for this i atom from the interaction with this j atom. */
648 velec = _mm256_and_pd(velec,cutoff_mask);
649 velecsum = _mm256_add_pd(velecsum,velec);
653 fscal = _mm256_and_pd(fscal,cutoff_mask);
655 /* Calculate temporary vectorial force */
656 tx = _mm256_mul_pd(fscal,dx22);
657 ty = _mm256_mul_pd(fscal,dy22);
658 tz = _mm256_mul_pd(fscal,dz22);
660 /* Update vectorial force */
661 fix2 = _mm256_add_pd(fix2,tx);
662 fiy2 = _mm256_add_pd(fiy2,ty);
663 fiz2 = _mm256_add_pd(fiz2,tz);
665 fjx2 = _mm256_add_pd(fjx2,tx);
666 fjy2 = _mm256_add_pd(fjy2,ty);
667 fjz2 = _mm256_add_pd(fjz2,tz);
671 fjptrA = f+j_coord_offsetA;
672 fjptrB = f+j_coord_offsetB;
673 fjptrC = f+j_coord_offsetC;
674 fjptrD = f+j_coord_offsetD;
676 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
677 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
679 /* Inner loop uses 358 flops */
685 /* Get j neighbor index, and coordinate index */
686 jnrlistA = jjnr[jidx];
687 jnrlistB = jjnr[jidx+1];
688 jnrlistC = jjnr[jidx+2];
689 jnrlistD = jjnr[jidx+3];
690 /* Sign of each element will be negative for non-real atoms.
691 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
692 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
694 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
696 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
697 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
698 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
700 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
701 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
702 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
703 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
704 j_coord_offsetA = DIM*jnrA;
705 j_coord_offsetB = DIM*jnrB;
706 j_coord_offsetC = DIM*jnrC;
707 j_coord_offsetD = DIM*jnrD;
709 /* load j atom coordinates */
710 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
711 x+j_coord_offsetC,x+j_coord_offsetD,
712 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
714 /* Calculate displacement vector */
715 dx00 = _mm256_sub_pd(ix0,jx0);
716 dy00 = _mm256_sub_pd(iy0,jy0);
717 dz00 = _mm256_sub_pd(iz0,jz0);
718 dx01 = _mm256_sub_pd(ix0,jx1);
719 dy01 = _mm256_sub_pd(iy0,jy1);
720 dz01 = _mm256_sub_pd(iz0,jz1);
721 dx02 = _mm256_sub_pd(ix0,jx2);
722 dy02 = _mm256_sub_pd(iy0,jy2);
723 dz02 = _mm256_sub_pd(iz0,jz2);
724 dx10 = _mm256_sub_pd(ix1,jx0);
725 dy10 = _mm256_sub_pd(iy1,jy0);
726 dz10 = _mm256_sub_pd(iz1,jz0);
727 dx11 = _mm256_sub_pd(ix1,jx1);
728 dy11 = _mm256_sub_pd(iy1,jy1);
729 dz11 = _mm256_sub_pd(iz1,jz1);
730 dx12 = _mm256_sub_pd(ix1,jx2);
731 dy12 = _mm256_sub_pd(iy1,jy2);
732 dz12 = _mm256_sub_pd(iz1,jz2);
733 dx20 = _mm256_sub_pd(ix2,jx0);
734 dy20 = _mm256_sub_pd(iy2,jy0);
735 dz20 = _mm256_sub_pd(iz2,jz0);
736 dx21 = _mm256_sub_pd(ix2,jx1);
737 dy21 = _mm256_sub_pd(iy2,jy1);
738 dz21 = _mm256_sub_pd(iz2,jz1);
739 dx22 = _mm256_sub_pd(ix2,jx2);
740 dy22 = _mm256_sub_pd(iy2,jy2);
741 dz22 = _mm256_sub_pd(iz2,jz2);
743 /* Calculate squared distance and things based on it */
744 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
745 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
746 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
747 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
748 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
749 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
750 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
751 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
752 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
754 rinv00 = avx256_invsqrt_d(rsq00);
755 rinv01 = avx256_invsqrt_d(rsq01);
756 rinv02 = avx256_invsqrt_d(rsq02);
757 rinv10 = avx256_invsqrt_d(rsq10);
758 rinv11 = avx256_invsqrt_d(rsq11);
759 rinv12 = avx256_invsqrt_d(rsq12);
760 rinv20 = avx256_invsqrt_d(rsq20);
761 rinv21 = avx256_invsqrt_d(rsq21);
762 rinv22 = avx256_invsqrt_d(rsq22);
764 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
765 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
766 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
767 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
768 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
769 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
770 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
771 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
772 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
774 fjx0 = _mm256_setzero_pd();
775 fjy0 = _mm256_setzero_pd();
776 fjz0 = _mm256_setzero_pd();
777 fjx1 = _mm256_setzero_pd();
778 fjy1 = _mm256_setzero_pd();
779 fjz1 = _mm256_setzero_pd();
780 fjx2 = _mm256_setzero_pd();
781 fjy2 = _mm256_setzero_pd();
782 fjz2 = _mm256_setzero_pd();
784 /**************************
785 * CALCULATE INTERACTIONS *
786 **************************/
788 if (gmx_mm256_any_lt(rsq00,rcutoff2))
791 r00 = _mm256_mul_pd(rsq00,rinv00);
792 r00 = _mm256_andnot_pd(dummy_mask,r00);
794 /* REACTION-FIELD ELECTROSTATICS */
795 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
796 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
798 /* LENNARD-JONES DISPERSION/REPULSION */
800 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
801 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
802 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
803 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
804 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
806 d = _mm256_sub_pd(r00,rswitch);
807 d = _mm256_max_pd(d,_mm256_setzero_pd());
808 d2 = _mm256_mul_pd(d,d);
809 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
811 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
813 /* Evaluate switch function */
814 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
815 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
816 vvdw = _mm256_mul_pd(vvdw,sw);
817 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
819 /* Update potential sum for this i atom from the interaction with this j atom. */
820 velec = _mm256_and_pd(velec,cutoff_mask);
821 velec = _mm256_andnot_pd(dummy_mask,velec);
822 velecsum = _mm256_add_pd(velecsum,velec);
823 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
824 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
825 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
827 fscal = _mm256_add_pd(felec,fvdw);
829 fscal = _mm256_and_pd(fscal,cutoff_mask);
831 fscal = _mm256_andnot_pd(dummy_mask,fscal);
833 /* Calculate temporary vectorial force */
834 tx = _mm256_mul_pd(fscal,dx00);
835 ty = _mm256_mul_pd(fscal,dy00);
836 tz = _mm256_mul_pd(fscal,dz00);
838 /* Update vectorial force */
839 fix0 = _mm256_add_pd(fix0,tx);
840 fiy0 = _mm256_add_pd(fiy0,ty);
841 fiz0 = _mm256_add_pd(fiz0,tz);
843 fjx0 = _mm256_add_pd(fjx0,tx);
844 fjy0 = _mm256_add_pd(fjy0,ty);
845 fjz0 = _mm256_add_pd(fjz0,tz);
849 /**************************
850 * CALCULATE INTERACTIONS *
851 **************************/
853 if (gmx_mm256_any_lt(rsq01,rcutoff2))
856 /* REACTION-FIELD ELECTROSTATICS */
857 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
858 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
860 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
862 /* Update potential sum for this i atom from the interaction with this j atom. */
863 velec = _mm256_and_pd(velec,cutoff_mask);
864 velec = _mm256_andnot_pd(dummy_mask,velec);
865 velecsum = _mm256_add_pd(velecsum,velec);
869 fscal = _mm256_and_pd(fscal,cutoff_mask);
871 fscal = _mm256_andnot_pd(dummy_mask,fscal);
873 /* Calculate temporary vectorial force */
874 tx = _mm256_mul_pd(fscal,dx01);
875 ty = _mm256_mul_pd(fscal,dy01);
876 tz = _mm256_mul_pd(fscal,dz01);
878 /* Update vectorial force */
879 fix0 = _mm256_add_pd(fix0,tx);
880 fiy0 = _mm256_add_pd(fiy0,ty);
881 fiz0 = _mm256_add_pd(fiz0,tz);
883 fjx1 = _mm256_add_pd(fjx1,tx);
884 fjy1 = _mm256_add_pd(fjy1,ty);
885 fjz1 = _mm256_add_pd(fjz1,tz);
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 if (gmx_mm256_any_lt(rsq02,rcutoff2))
896 /* REACTION-FIELD ELECTROSTATICS */
897 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
898 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
900 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
902 /* Update potential sum for this i atom from the interaction with this j atom. */
903 velec = _mm256_and_pd(velec,cutoff_mask);
904 velec = _mm256_andnot_pd(dummy_mask,velec);
905 velecsum = _mm256_add_pd(velecsum,velec);
909 fscal = _mm256_and_pd(fscal,cutoff_mask);
911 fscal = _mm256_andnot_pd(dummy_mask,fscal);
913 /* Calculate temporary vectorial force */
914 tx = _mm256_mul_pd(fscal,dx02);
915 ty = _mm256_mul_pd(fscal,dy02);
916 tz = _mm256_mul_pd(fscal,dz02);
918 /* Update vectorial force */
919 fix0 = _mm256_add_pd(fix0,tx);
920 fiy0 = _mm256_add_pd(fiy0,ty);
921 fiz0 = _mm256_add_pd(fiz0,tz);
923 fjx2 = _mm256_add_pd(fjx2,tx);
924 fjy2 = _mm256_add_pd(fjy2,ty);
925 fjz2 = _mm256_add_pd(fjz2,tz);
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
933 if (gmx_mm256_any_lt(rsq10,rcutoff2))
936 /* REACTION-FIELD ELECTROSTATICS */
937 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
938 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
940 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
942 /* Update potential sum for this i atom from the interaction with this j atom. */
943 velec = _mm256_and_pd(velec,cutoff_mask);
944 velec = _mm256_andnot_pd(dummy_mask,velec);
945 velecsum = _mm256_add_pd(velecsum,velec);
949 fscal = _mm256_and_pd(fscal,cutoff_mask);
951 fscal = _mm256_andnot_pd(dummy_mask,fscal);
953 /* Calculate temporary vectorial force */
954 tx = _mm256_mul_pd(fscal,dx10);
955 ty = _mm256_mul_pd(fscal,dy10);
956 tz = _mm256_mul_pd(fscal,dz10);
958 /* Update vectorial force */
959 fix1 = _mm256_add_pd(fix1,tx);
960 fiy1 = _mm256_add_pd(fiy1,ty);
961 fiz1 = _mm256_add_pd(fiz1,tz);
963 fjx0 = _mm256_add_pd(fjx0,tx);
964 fjy0 = _mm256_add_pd(fjy0,ty);
965 fjz0 = _mm256_add_pd(fjz0,tz);
969 /**************************
970 * CALCULATE INTERACTIONS *
971 **************************/
973 if (gmx_mm256_any_lt(rsq11,rcutoff2))
976 /* REACTION-FIELD ELECTROSTATICS */
977 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
978 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
980 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
982 /* Update potential sum for this i atom from the interaction with this j atom. */
983 velec = _mm256_and_pd(velec,cutoff_mask);
984 velec = _mm256_andnot_pd(dummy_mask,velec);
985 velecsum = _mm256_add_pd(velecsum,velec);
989 fscal = _mm256_and_pd(fscal,cutoff_mask);
991 fscal = _mm256_andnot_pd(dummy_mask,fscal);
993 /* Calculate temporary vectorial force */
994 tx = _mm256_mul_pd(fscal,dx11);
995 ty = _mm256_mul_pd(fscal,dy11);
996 tz = _mm256_mul_pd(fscal,dz11);
998 /* Update vectorial force */
999 fix1 = _mm256_add_pd(fix1,tx);
1000 fiy1 = _mm256_add_pd(fiy1,ty);
1001 fiz1 = _mm256_add_pd(fiz1,tz);
1003 fjx1 = _mm256_add_pd(fjx1,tx);
1004 fjy1 = _mm256_add_pd(fjy1,ty);
1005 fjz1 = _mm256_add_pd(fjz1,tz);
1009 /**************************
1010 * CALCULATE INTERACTIONS *
1011 **************************/
1013 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1016 /* REACTION-FIELD ELECTROSTATICS */
1017 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
1018 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1020 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1022 /* Update potential sum for this i atom from the interaction with this j atom. */
1023 velec = _mm256_and_pd(velec,cutoff_mask);
1024 velec = _mm256_andnot_pd(dummy_mask,velec);
1025 velecsum = _mm256_add_pd(velecsum,velec);
1029 fscal = _mm256_and_pd(fscal,cutoff_mask);
1031 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1033 /* Calculate temporary vectorial force */
1034 tx = _mm256_mul_pd(fscal,dx12);
1035 ty = _mm256_mul_pd(fscal,dy12);
1036 tz = _mm256_mul_pd(fscal,dz12);
1038 /* Update vectorial force */
1039 fix1 = _mm256_add_pd(fix1,tx);
1040 fiy1 = _mm256_add_pd(fiy1,ty);
1041 fiz1 = _mm256_add_pd(fiz1,tz);
1043 fjx2 = _mm256_add_pd(fjx2,tx);
1044 fjy2 = _mm256_add_pd(fjy2,ty);
1045 fjz2 = _mm256_add_pd(fjz2,tz);
1049 /**************************
1050 * CALCULATE INTERACTIONS *
1051 **************************/
1053 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1056 /* REACTION-FIELD ELECTROSTATICS */
1057 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
1058 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1060 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1062 /* Update potential sum for this i atom from the interaction with this j atom. */
1063 velec = _mm256_and_pd(velec,cutoff_mask);
1064 velec = _mm256_andnot_pd(dummy_mask,velec);
1065 velecsum = _mm256_add_pd(velecsum,velec);
1069 fscal = _mm256_and_pd(fscal,cutoff_mask);
1071 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1073 /* Calculate temporary vectorial force */
1074 tx = _mm256_mul_pd(fscal,dx20);
1075 ty = _mm256_mul_pd(fscal,dy20);
1076 tz = _mm256_mul_pd(fscal,dz20);
1078 /* Update vectorial force */
1079 fix2 = _mm256_add_pd(fix2,tx);
1080 fiy2 = _mm256_add_pd(fiy2,ty);
1081 fiz2 = _mm256_add_pd(fiz2,tz);
1083 fjx0 = _mm256_add_pd(fjx0,tx);
1084 fjy0 = _mm256_add_pd(fjy0,ty);
1085 fjz0 = _mm256_add_pd(fjz0,tz);
1089 /**************************
1090 * CALCULATE INTERACTIONS *
1091 **************************/
1093 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1096 /* REACTION-FIELD ELECTROSTATICS */
1097 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1098 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1100 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1102 /* Update potential sum for this i atom from the interaction with this j atom. */
1103 velec = _mm256_and_pd(velec,cutoff_mask);
1104 velec = _mm256_andnot_pd(dummy_mask,velec);
1105 velecsum = _mm256_add_pd(velecsum,velec);
1109 fscal = _mm256_and_pd(fscal,cutoff_mask);
1111 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1113 /* Calculate temporary vectorial force */
1114 tx = _mm256_mul_pd(fscal,dx21);
1115 ty = _mm256_mul_pd(fscal,dy21);
1116 tz = _mm256_mul_pd(fscal,dz21);
1118 /* Update vectorial force */
1119 fix2 = _mm256_add_pd(fix2,tx);
1120 fiy2 = _mm256_add_pd(fiy2,ty);
1121 fiz2 = _mm256_add_pd(fiz2,tz);
1123 fjx1 = _mm256_add_pd(fjx1,tx);
1124 fjy1 = _mm256_add_pd(fjy1,ty);
1125 fjz1 = _mm256_add_pd(fjz1,tz);
1129 /**************************
1130 * CALCULATE INTERACTIONS *
1131 **************************/
1133 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1136 /* REACTION-FIELD ELECTROSTATICS */
1137 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1138 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1140 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1142 /* Update potential sum for this i atom from the interaction with this j atom. */
1143 velec = _mm256_and_pd(velec,cutoff_mask);
1144 velec = _mm256_andnot_pd(dummy_mask,velec);
1145 velecsum = _mm256_add_pd(velecsum,velec);
1149 fscal = _mm256_and_pd(fscal,cutoff_mask);
1151 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1153 /* Calculate temporary vectorial force */
1154 tx = _mm256_mul_pd(fscal,dx22);
1155 ty = _mm256_mul_pd(fscal,dy22);
1156 tz = _mm256_mul_pd(fscal,dz22);
1158 /* Update vectorial force */
1159 fix2 = _mm256_add_pd(fix2,tx);
1160 fiy2 = _mm256_add_pd(fiy2,ty);
1161 fiz2 = _mm256_add_pd(fiz2,tz);
1163 fjx2 = _mm256_add_pd(fjx2,tx);
1164 fjy2 = _mm256_add_pd(fjy2,ty);
1165 fjz2 = _mm256_add_pd(fjz2,tz);
1169 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1170 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1171 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1172 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1174 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1175 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1177 /* Inner loop uses 359 flops */
1180 /* End of innermost loop */
1182 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1183 f+i_coord_offset,fshift+i_shift_offset);
1186 /* Update potential energies */
1187 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1188 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1190 /* Increment number of inner iterations */
1191 inneriter += j_index_end - j_index_start;
1193 /* Outer loop uses 20 flops */
1196 /* Increment number of outer iterations */
1199 /* Update outer/inner flops */
1201 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
1204 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double
1205 * Electrostatics interaction: ReactionField
1206 * VdW interaction: LennardJones
1207 * Geometry: Water3-Water3
1208 * Calculate force/pot: Force
1211 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double
1212 (t_nblist * gmx_restrict nlist,
1213 rvec * gmx_restrict xx,
1214 rvec * gmx_restrict ff,
1215 struct t_forcerec * gmx_restrict fr,
1216 t_mdatoms * gmx_restrict mdatoms,
1217 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1218 t_nrnb * gmx_restrict nrnb)
1220 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1221 * just 0 for non-waters.
1222 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1223 * jnr indices corresponding to data put in the four positions in the SIMD register.
1225 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1226 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1227 int jnrA,jnrB,jnrC,jnrD;
1228 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1229 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1230 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1231 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1232 real rcutoff_scalar;
1233 real *shiftvec,*fshift,*x,*f;
1234 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1235 real scratch[4*DIM];
1236 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1237 real * vdwioffsetptr0;
1238 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1239 real * vdwioffsetptr1;
1240 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1241 real * vdwioffsetptr2;
1242 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1243 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1244 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1245 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1246 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1247 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1248 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1249 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1250 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1251 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1252 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1253 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1254 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1255 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1256 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1257 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1258 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1261 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1264 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1265 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1266 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1267 real rswitch_scalar,d_scalar;
1268 __m256d dummy_mask,cutoff_mask;
1269 __m128 tmpmask0,tmpmask1;
1270 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1271 __m256d one = _mm256_set1_pd(1.0);
1272 __m256d two = _mm256_set1_pd(2.0);
1278 jindex = nlist->jindex;
1280 shiftidx = nlist->shift;
1282 shiftvec = fr->shift_vec[0];
1283 fshift = fr->fshift[0];
1284 facel = _mm256_set1_pd(fr->ic->epsfac);
1285 charge = mdatoms->chargeA;
1286 krf = _mm256_set1_pd(fr->ic->k_rf);
1287 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1288 crf = _mm256_set1_pd(fr->ic->c_rf);
1289 nvdwtype = fr->ntype;
1290 vdwparam = fr->nbfp;
1291 vdwtype = mdatoms->typeA;
1293 /* Setup water-specific parameters */
1294 inr = nlist->iinr[0];
1295 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1296 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1297 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1298 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1300 jq0 = _mm256_set1_pd(charge[inr+0]);
1301 jq1 = _mm256_set1_pd(charge[inr+1]);
1302 jq2 = _mm256_set1_pd(charge[inr+2]);
1303 vdwjidx0A = 2*vdwtype[inr+0];
1304 qq00 = _mm256_mul_pd(iq0,jq0);
1305 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1306 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1307 qq01 = _mm256_mul_pd(iq0,jq1);
1308 qq02 = _mm256_mul_pd(iq0,jq2);
1309 qq10 = _mm256_mul_pd(iq1,jq0);
1310 qq11 = _mm256_mul_pd(iq1,jq1);
1311 qq12 = _mm256_mul_pd(iq1,jq2);
1312 qq20 = _mm256_mul_pd(iq2,jq0);
1313 qq21 = _mm256_mul_pd(iq2,jq1);
1314 qq22 = _mm256_mul_pd(iq2,jq2);
1316 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1317 rcutoff_scalar = fr->ic->rcoulomb;
1318 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1319 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1321 rswitch_scalar = fr->ic->rvdw_switch;
1322 rswitch = _mm256_set1_pd(rswitch_scalar);
1323 /* Setup switch parameters */
1324 d_scalar = rcutoff_scalar-rswitch_scalar;
1325 d = _mm256_set1_pd(d_scalar);
1326 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1327 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1328 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1329 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1330 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1331 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1333 /* Avoid stupid compiler warnings */
1334 jnrA = jnrB = jnrC = jnrD = 0;
1335 j_coord_offsetA = 0;
1336 j_coord_offsetB = 0;
1337 j_coord_offsetC = 0;
1338 j_coord_offsetD = 0;
1343 for(iidx=0;iidx<4*DIM;iidx++)
1345 scratch[iidx] = 0.0;
1348 /* Start outer loop over neighborlists */
1349 for(iidx=0; iidx<nri; iidx++)
1351 /* Load shift vector for this list */
1352 i_shift_offset = DIM*shiftidx[iidx];
1354 /* Load limits for loop over neighbors */
1355 j_index_start = jindex[iidx];
1356 j_index_end = jindex[iidx+1];
1358 /* Get outer coordinate index */
1360 i_coord_offset = DIM*inr;
1362 /* Load i particle coords and add shift vector */
1363 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1364 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1366 fix0 = _mm256_setzero_pd();
1367 fiy0 = _mm256_setzero_pd();
1368 fiz0 = _mm256_setzero_pd();
1369 fix1 = _mm256_setzero_pd();
1370 fiy1 = _mm256_setzero_pd();
1371 fiz1 = _mm256_setzero_pd();
1372 fix2 = _mm256_setzero_pd();
1373 fiy2 = _mm256_setzero_pd();
1374 fiz2 = _mm256_setzero_pd();
1376 /* Start inner kernel loop */
1377 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1380 /* Get j neighbor index, and coordinate index */
1382 jnrB = jjnr[jidx+1];
1383 jnrC = jjnr[jidx+2];
1384 jnrD = jjnr[jidx+3];
1385 j_coord_offsetA = DIM*jnrA;
1386 j_coord_offsetB = DIM*jnrB;
1387 j_coord_offsetC = DIM*jnrC;
1388 j_coord_offsetD = DIM*jnrD;
1390 /* load j atom coordinates */
1391 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1392 x+j_coord_offsetC,x+j_coord_offsetD,
1393 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1395 /* Calculate displacement vector */
1396 dx00 = _mm256_sub_pd(ix0,jx0);
1397 dy00 = _mm256_sub_pd(iy0,jy0);
1398 dz00 = _mm256_sub_pd(iz0,jz0);
1399 dx01 = _mm256_sub_pd(ix0,jx1);
1400 dy01 = _mm256_sub_pd(iy0,jy1);
1401 dz01 = _mm256_sub_pd(iz0,jz1);
1402 dx02 = _mm256_sub_pd(ix0,jx2);
1403 dy02 = _mm256_sub_pd(iy0,jy2);
1404 dz02 = _mm256_sub_pd(iz0,jz2);
1405 dx10 = _mm256_sub_pd(ix1,jx0);
1406 dy10 = _mm256_sub_pd(iy1,jy0);
1407 dz10 = _mm256_sub_pd(iz1,jz0);
1408 dx11 = _mm256_sub_pd(ix1,jx1);
1409 dy11 = _mm256_sub_pd(iy1,jy1);
1410 dz11 = _mm256_sub_pd(iz1,jz1);
1411 dx12 = _mm256_sub_pd(ix1,jx2);
1412 dy12 = _mm256_sub_pd(iy1,jy2);
1413 dz12 = _mm256_sub_pd(iz1,jz2);
1414 dx20 = _mm256_sub_pd(ix2,jx0);
1415 dy20 = _mm256_sub_pd(iy2,jy0);
1416 dz20 = _mm256_sub_pd(iz2,jz0);
1417 dx21 = _mm256_sub_pd(ix2,jx1);
1418 dy21 = _mm256_sub_pd(iy2,jy1);
1419 dz21 = _mm256_sub_pd(iz2,jz1);
1420 dx22 = _mm256_sub_pd(ix2,jx2);
1421 dy22 = _mm256_sub_pd(iy2,jy2);
1422 dz22 = _mm256_sub_pd(iz2,jz2);
1424 /* Calculate squared distance and things based on it */
1425 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1426 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1427 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1428 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1429 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1430 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1431 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1432 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1433 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1435 rinv00 = avx256_invsqrt_d(rsq00);
1436 rinv01 = avx256_invsqrt_d(rsq01);
1437 rinv02 = avx256_invsqrt_d(rsq02);
1438 rinv10 = avx256_invsqrt_d(rsq10);
1439 rinv11 = avx256_invsqrt_d(rsq11);
1440 rinv12 = avx256_invsqrt_d(rsq12);
1441 rinv20 = avx256_invsqrt_d(rsq20);
1442 rinv21 = avx256_invsqrt_d(rsq21);
1443 rinv22 = avx256_invsqrt_d(rsq22);
1445 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1446 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1447 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1448 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1449 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1450 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1451 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1452 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1453 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1455 fjx0 = _mm256_setzero_pd();
1456 fjy0 = _mm256_setzero_pd();
1457 fjz0 = _mm256_setzero_pd();
1458 fjx1 = _mm256_setzero_pd();
1459 fjy1 = _mm256_setzero_pd();
1460 fjz1 = _mm256_setzero_pd();
1461 fjx2 = _mm256_setzero_pd();
1462 fjy2 = _mm256_setzero_pd();
1463 fjz2 = _mm256_setzero_pd();
1465 /**************************
1466 * CALCULATE INTERACTIONS *
1467 **************************/
1469 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1472 r00 = _mm256_mul_pd(rsq00,rinv00);
1474 /* REACTION-FIELD ELECTROSTATICS */
1475 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1477 /* LENNARD-JONES DISPERSION/REPULSION */
1479 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1480 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1481 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1482 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1483 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1485 d = _mm256_sub_pd(r00,rswitch);
1486 d = _mm256_max_pd(d,_mm256_setzero_pd());
1487 d2 = _mm256_mul_pd(d,d);
1488 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1490 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1492 /* Evaluate switch function */
1493 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1494 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1495 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1497 fscal = _mm256_add_pd(felec,fvdw);
1499 fscal = _mm256_and_pd(fscal,cutoff_mask);
1501 /* Calculate temporary vectorial force */
1502 tx = _mm256_mul_pd(fscal,dx00);
1503 ty = _mm256_mul_pd(fscal,dy00);
1504 tz = _mm256_mul_pd(fscal,dz00);
1506 /* Update vectorial force */
1507 fix0 = _mm256_add_pd(fix0,tx);
1508 fiy0 = _mm256_add_pd(fiy0,ty);
1509 fiz0 = _mm256_add_pd(fiz0,tz);
1511 fjx0 = _mm256_add_pd(fjx0,tx);
1512 fjy0 = _mm256_add_pd(fjy0,ty);
1513 fjz0 = _mm256_add_pd(fjz0,tz);
1517 /**************************
1518 * CALCULATE INTERACTIONS *
1519 **************************/
1521 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1524 /* REACTION-FIELD ELECTROSTATICS */
1525 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1527 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1531 fscal = _mm256_and_pd(fscal,cutoff_mask);
1533 /* Calculate temporary vectorial force */
1534 tx = _mm256_mul_pd(fscal,dx01);
1535 ty = _mm256_mul_pd(fscal,dy01);
1536 tz = _mm256_mul_pd(fscal,dz01);
1538 /* Update vectorial force */
1539 fix0 = _mm256_add_pd(fix0,tx);
1540 fiy0 = _mm256_add_pd(fiy0,ty);
1541 fiz0 = _mm256_add_pd(fiz0,tz);
1543 fjx1 = _mm256_add_pd(fjx1,tx);
1544 fjy1 = _mm256_add_pd(fjy1,ty);
1545 fjz1 = _mm256_add_pd(fjz1,tz);
1549 /**************************
1550 * CALCULATE INTERACTIONS *
1551 **************************/
1553 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1556 /* REACTION-FIELD ELECTROSTATICS */
1557 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1559 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1563 fscal = _mm256_and_pd(fscal,cutoff_mask);
1565 /* Calculate temporary vectorial force */
1566 tx = _mm256_mul_pd(fscal,dx02);
1567 ty = _mm256_mul_pd(fscal,dy02);
1568 tz = _mm256_mul_pd(fscal,dz02);
1570 /* Update vectorial force */
1571 fix0 = _mm256_add_pd(fix0,tx);
1572 fiy0 = _mm256_add_pd(fiy0,ty);
1573 fiz0 = _mm256_add_pd(fiz0,tz);
1575 fjx2 = _mm256_add_pd(fjx2,tx);
1576 fjy2 = _mm256_add_pd(fjy2,ty);
1577 fjz2 = _mm256_add_pd(fjz2,tz);
1581 /**************************
1582 * CALCULATE INTERACTIONS *
1583 **************************/
1585 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1588 /* REACTION-FIELD ELECTROSTATICS */
1589 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1591 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1595 fscal = _mm256_and_pd(fscal,cutoff_mask);
1597 /* Calculate temporary vectorial force */
1598 tx = _mm256_mul_pd(fscal,dx10);
1599 ty = _mm256_mul_pd(fscal,dy10);
1600 tz = _mm256_mul_pd(fscal,dz10);
1602 /* Update vectorial force */
1603 fix1 = _mm256_add_pd(fix1,tx);
1604 fiy1 = _mm256_add_pd(fiy1,ty);
1605 fiz1 = _mm256_add_pd(fiz1,tz);
1607 fjx0 = _mm256_add_pd(fjx0,tx);
1608 fjy0 = _mm256_add_pd(fjy0,ty);
1609 fjz0 = _mm256_add_pd(fjz0,tz);
1613 /**************************
1614 * CALCULATE INTERACTIONS *
1615 **************************/
1617 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1620 /* REACTION-FIELD ELECTROSTATICS */
1621 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1623 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1627 fscal = _mm256_and_pd(fscal,cutoff_mask);
1629 /* Calculate temporary vectorial force */
1630 tx = _mm256_mul_pd(fscal,dx11);
1631 ty = _mm256_mul_pd(fscal,dy11);
1632 tz = _mm256_mul_pd(fscal,dz11);
1634 /* Update vectorial force */
1635 fix1 = _mm256_add_pd(fix1,tx);
1636 fiy1 = _mm256_add_pd(fiy1,ty);
1637 fiz1 = _mm256_add_pd(fiz1,tz);
1639 fjx1 = _mm256_add_pd(fjx1,tx);
1640 fjy1 = _mm256_add_pd(fjy1,ty);
1641 fjz1 = _mm256_add_pd(fjz1,tz);
1645 /**************************
1646 * CALCULATE INTERACTIONS *
1647 **************************/
1649 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1652 /* REACTION-FIELD ELECTROSTATICS */
1653 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1655 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1659 fscal = _mm256_and_pd(fscal,cutoff_mask);
1661 /* Calculate temporary vectorial force */
1662 tx = _mm256_mul_pd(fscal,dx12);
1663 ty = _mm256_mul_pd(fscal,dy12);
1664 tz = _mm256_mul_pd(fscal,dz12);
1666 /* Update vectorial force */
1667 fix1 = _mm256_add_pd(fix1,tx);
1668 fiy1 = _mm256_add_pd(fiy1,ty);
1669 fiz1 = _mm256_add_pd(fiz1,tz);
1671 fjx2 = _mm256_add_pd(fjx2,tx);
1672 fjy2 = _mm256_add_pd(fjy2,ty);
1673 fjz2 = _mm256_add_pd(fjz2,tz);
1677 /**************************
1678 * CALCULATE INTERACTIONS *
1679 **************************/
1681 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1684 /* REACTION-FIELD ELECTROSTATICS */
1685 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1687 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1691 fscal = _mm256_and_pd(fscal,cutoff_mask);
1693 /* Calculate temporary vectorial force */
1694 tx = _mm256_mul_pd(fscal,dx20);
1695 ty = _mm256_mul_pd(fscal,dy20);
1696 tz = _mm256_mul_pd(fscal,dz20);
1698 /* Update vectorial force */
1699 fix2 = _mm256_add_pd(fix2,tx);
1700 fiy2 = _mm256_add_pd(fiy2,ty);
1701 fiz2 = _mm256_add_pd(fiz2,tz);
1703 fjx0 = _mm256_add_pd(fjx0,tx);
1704 fjy0 = _mm256_add_pd(fjy0,ty);
1705 fjz0 = _mm256_add_pd(fjz0,tz);
1709 /**************************
1710 * CALCULATE INTERACTIONS *
1711 **************************/
1713 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1716 /* REACTION-FIELD ELECTROSTATICS */
1717 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1719 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1723 fscal = _mm256_and_pd(fscal,cutoff_mask);
1725 /* Calculate temporary vectorial force */
1726 tx = _mm256_mul_pd(fscal,dx21);
1727 ty = _mm256_mul_pd(fscal,dy21);
1728 tz = _mm256_mul_pd(fscal,dz21);
1730 /* Update vectorial force */
1731 fix2 = _mm256_add_pd(fix2,tx);
1732 fiy2 = _mm256_add_pd(fiy2,ty);
1733 fiz2 = _mm256_add_pd(fiz2,tz);
1735 fjx1 = _mm256_add_pd(fjx1,tx);
1736 fjy1 = _mm256_add_pd(fjy1,ty);
1737 fjz1 = _mm256_add_pd(fjz1,tz);
1741 /**************************
1742 * CALCULATE INTERACTIONS *
1743 **************************/
1745 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1748 /* REACTION-FIELD ELECTROSTATICS */
1749 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1751 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1755 fscal = _mm256_and_pd(fscal,cutoff_mask);
1757 /* Calculate temporary vectorial force */
1758 tx = _mm256_mul_pd(fscal,dx22);
1759 ty = _mm256_mul_pd(fscal,dy22);
1760 tz = _mm256_mul_pd(fscal,dz22);
1762 /* Update vectorial force */
1763 fix2 = _mm256_add_pd(fix2,tx);
1764 fiy2 = _mm256_add_pd(fiy2,ty);
1765 fiz2 = _mm256_add_pd(fiz2,tz);
1767 fjx2 = _mm256_add_pd(fjx2,tx);
1768 fjy2 = _mm256_add_pd(fjy2,ty);
1769 fjz2 = _mm256_add_pd(fjz2,tz);
1773 fjptrA = f+j_coord_offsetA;
1774 fjptrB = f+j_coord_offsetB;
1775 fjptrC = f+j_coord_offsetC;
1776 fjptrD = f+j_coord_offsetD;
1778 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1779 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1781 /* Inner loop uses 301 flops */
1784 if(jidx<j_index_end)
1787 /* Get j neighbor index, and coordinate index */
1788 jnrlistA = jjnr[jidx];
1789 jnrlistB = jjnr[jidx+1];
1790 jnrlistC = jjnr[jidx+2];
1791 jnrlistD = jjnr[jidx+3];
1792 /* Sign of each element will be negative for non-real atoms.
1793 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1794 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1796 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1798 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1799 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1800 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1802 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1803 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1804 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1805 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1806 j_coord_offsetA = DIM*jnrA;
1807 j_coord_offsetB = DIM*jnrB;
1808 j_coord_offsetC = DIM*jnrC;
1809 j_coord_offsetD = DIM*jnrD;
1811 /* load j atom coordinates */
1812 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1813 x+j_coord_offsetC,x+j_coord_offsetD,
1814 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1816 /* Calculate displacement vector */
1817 dx00 = _mm256_sub_pd(ix0,jx0);
1818 dy00 = _mm256_sub_pd(iy0,jy0);
1819 dz00 = _mm256_sub_pd(iz0,jz0);
1820 dx01 = _mm256_sub_pd(ix0,jx1);
1821 dy01 = _mm256_sub_pd(iy0,jy1);
1822 dz01 = _mm256_sub_pd(iz0,jz1);
1823 dx02 = _mm256_sub_pd(ix0,jx2);
1824 dy02 = _mm256_sub_pd(iy0,jy2);
1825 dz02 = _mm256_sub_pd(iz0,jz2);
1826 dx10 = _mm256_sub_pd(ix1,jx0);
1827 dy10 = _mm256_sub_pd(iy1,jy0);
1828 dz10 = _mm256_sub_pd(iz1,jz0);
1829 dx11 = _mm256_sub_pd(ix1,jx1);
1830 dy11 = _mm256_sub_pd(iy1,jy1);
1831 dz11 = _mm256_sub_pd(iz1,jz1);
1832 dx12 = _mm256_sub_pd(ix1,jx2);
1833 dy12 = _mm256_sub_pd(iy1,jy2);
1834 dz12 = _mm256_sub_pd(iz1,jz2);
1835 dx20 = _mm256_sub_pd(ix2,jx0);
1836 dy20 = _mm256_sub_pd(iy2,jy0);
1837 dz20 = _mm256_sub_pd(iz2,jz0);
1838 dx21 = _mm256_sub_pd(ix2,jx1);
1839 dy21 = _mm256_sub_pd(iy2,jy1);
1840 dz21 = _mm256_sub_pd(iz2,jz1);
1841 dx22 = _mm256_sub_pd(ix2,jx2);
1842 dy22 = _mm256_sub_pd(iy2,jy2);
1843 dz22 = _mm256_sub_pd(iz2,jz2);
1845 /* Calculate squared distance and things based on it */
1846 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1847 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1848 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1849 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1850 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1851 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1852 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1853 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1854 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1856 rinv00 = avx256_invsqrt_d(rsq00);
1857 rinv01 = avx256_invsqrt_d(rsq01);
1858 rinv02 = avx256_invsqrt_d(rsq02);
1859 rinv10 = avx256_invsqrt_d(rsq10);
1860 rinv11 = avx256_invsqrt_d(rsq11);
1861 rinv12 = avx256_invsqrt_d(rsq12);
1862 rinv20 = avx256_invsqrt_d(rsq20);
1863 rinv21 = avx256_invsqrt_d(rsq21);
1864 rinv22 = avx256_invsqrt_d(rsq22);
1866 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1867 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1868 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1869 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1870 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1871 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1872 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1873 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1874 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1876 fjx0 = _mm256_setzero_pd();
1877 fjy0 = _mm256_setzero_pd();
1878 fjz0 = _mm256_setzero_pd();
1879 fjx1 = _mm256_setzero_pd();
1880 fjy1 = _mm256_setzero_pd();
1881 fjz1 = _mm256_setzero_pd();
1882 fjx2 = _mm256_setzero_pd();
1883 fjy2 = _mm256_setzero_pd();
1884 fjz2 = _mm256_setzero_pd();
1886 /**************************
1887 * CALCULATE INTERACTIONS *
1888 **************************/
1890 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1893 r00 = _mm256_mul_pd(rsq00,rinv00);
1894 r00 = _mm256_andnot_pd(dummy_mask,r00);
1896 /* REACTION-FIELD ELECTROSTATICS */
1897 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1899 /* LENNARD-JONES DISPERSION/REPULSION */
1901 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1902 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1903 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1904 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1905 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1907 d = _mm256_sub_pd(r00,rswitch);
1908 d = _mm256_max_pd(d,_mm256_setzero_pd());
1909 d2 = _mm256_mul_pd(d,d);
1910 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1912 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1914 /* Evaluate switch function */
1915 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1916 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1917 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1919 fscal = _mm256_add_pd(felec,fvdw);
1921 fscal = _mm256_and_pd(fscal,cutoff_mask);
1923 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1925 /* Calculate temporary vectorial force */
1926 tx = _mm256_mul_pd(fscal,dx00);
1927 ty = _mm256_mul_pd(fscal,dy00);
1928 tz = _mm256_mul_pd(fscal,dz00);
1930 /* Update vectorial force */
1931 fix0 = _mm256_add_pd(fix0,tx);
1932 fiy0 = _mm256_add_pd(fiy0,ty);
1933 fiz0 = _mm256_add_pd(fiz0,tz);
1935 fjx0 = _mm256_add_pd(fjx0,tx);
1936 fjy0 = _mm256_add_pd(fjy0,ty);
1937 fjz0 = _mm256_add_pd(fjz0,tz);
1941 /**************************
1942 * CALCULATE INTERACTIONS *
1943 **************************/
1945 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1948 /* REACTION-FIELD ELECTROSTATICS */
1949 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1951 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1955 fscal = _mm256_and_pd(fscal,cutoff_mask);
1957 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1959 /* Calculate temporary vectorial force */
1960 tx = _mm256_mul_pd(fscal,dx01);
1961 ty = _mm256_mul_pd(fscal,dy01);
1962 tz = _mm256_mul_pd(fscal,dz01);
1964 /* Update vectorial force */
1965 fix0 = _mm256_add_pd(fix0,tx);
1966 fiy0 = _mm256_add_pd(fiy0,ty);
1967 fiz0 = _mm256_add_pd(fiz0,tz);
1969 fjx1 = _mm256_add_pd(fjx1,tx);
1970 fjy1 = _mm256_add_pd(fjy1,ty);
1971 fjz1 = _mm256_add_pd(fjz1,tz);
1975 /**************************
1976 * CALCULATE INTERACTIONS *
1977 **************************/
1979 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1982 /* REACTION-FIELD ELECTROSTATICS */
1983 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1985 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1989 fscal = _mm256_and_pd(fscal,cutoff_mask);
1991 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1993 /* Calculate temporary vectorial force */
1994 tx = _mm256_mul_pd(fscal,dx02);
1995 ty = _mm256_mul_pd(fscal,dy02);
1996 tz = _mm256_mul_pd(fscal,dz02);
1998 /* Update vectorial force */
1999 fix0 = _mm256_add_pd(fix0,tx);
2000 fiy0 = _mm256_add_pd(fiy0,ty);
2001 fiz0 = _mm256_add_pd(fiz0,tz);
2003 fjx2 = _mm256_add_pd(fjx2,tx);
2004 fjy2 = _mm256_add_pd(fjy2,ty);
2005 fjz2 = _mm256_add_pd(fjz2,tz);
2009 /**************************
2010 * CALCULATE INTERACTIONS *
2011 **************************/
2013 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2016 /* REACTION-FIELD ELECTROSTATICS */
2017 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
2019 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2023 fscal = _mm256_and_pd(fscal,cutoff_mask);
2025 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2027 /* Calculate temporary vectorial force */
2028 tx = _mm256_mul_pd(fscal,dx10);
2029 ty = _mm256_mul_pd(fscal,dy10);
2030 tz = _mm256_mul_pd(fscal,dz10);
2032 /* Update vectorial force */
2033 fix1 = _mm256_add_pd(fix1,tx);
2034 fiy1 = _mm256_add_pd(fiy1,ty);
2035 fiz1 = _mm256_add_pd(fiz1,tz);
2037 fjx0 = _mm256_add_pd(fjx0,tx);
2038 fjy0 = _mm256_add_pd(fjy0,ty);
2039 fjz0 = _mm256_add_pd(fjz0,tz);
2043 /**************************
2044 * CALCULATE INTERACTIONS *
2045 **************************/
2047 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2050 /* REACTION-FIELD ELECTROSTATICS */
2051 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2053 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2057 fscal = _mm256_and_pd(fscal,cutoff_mask);
2059 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2061 /* Calculate temporary vectorial force */
2062 tx = _mm256_mul_pd(fscal,dx11);
2063 ty = _mm256_mul_pd(fscal,dy11);
2064 tz = _mm256_mul_pd(fscal,dz11);
2066 /* Update vectorial force */
2067 fix1 = _mm256_add_pd(fix1,tx);
2068 fiy1 = _mm256_add_pd(fiy1,ty);
2069 fiz1 = _mm256_add_pd(fiz1,tz);
2071 fjx1 = _mm256_add_pd(fjx1,tx);
2072 fjy1 = _mm256_add_pd(fjy1,ty);
2073 fjz1 = _mm256_add_pd(fjz1,tz);
2077 /**************************
2078 * CALCULATE INTERACTIONS *
2079 **************************/
2081 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2084 /* REACTION-FIELD ELECTROSTATICS */
2085 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2087 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2091 fscal = _mm256_and_pd(fscal,cutoff_mask);
2093 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2095 /* Calculate temporary vectorial force */
2096 tx = _mm256_mul_pd(fscal,dx12);
2097 ty = _mm256_mul_pd(fscal,dy12);
2098 tz = _mm256_mul_pd(fscal,dz12);
2100 /* Update vectorial force */
2101 fix1 = _mm256_add_pd(fix1,tx);
2102 fiy1 = _mm256_add_pd(fiy1,ty);
2103 fiz1 = _mm256_add_pd(fiz1,tz);
2105 fjx2 = _mm256_add_pd(fjx2,tx);
2106 fjy2 = _mm256_add_pd(fjy2,ty);
2107 fjz2 = _mm256_add_pd(fjz2,tz);
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2115 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2118 /* REACTION-FIELD ELECTROSTATICS */
2119 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
2121 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2125 fscal = _mm256_and_pd(fscal,cutoff_mask);
2127 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2129 /* Calculate temporary vectorial force */
2130 tx = _mm256_mul_pd(fscal,dx20);
2131 ty = _mm256_mul_pd(fscal,dy20);
2132 tz = _mm256_mul_pd(fscal,dz20);
2134 /* Update vectorial force */
2135 fix2 = _mm256_add_pd(fix2,tx);
2136 fiy2 = _mm256_add_pd(fiy2,ty);
2137 fiz2 = _mm256_add_pd(fiz2,tz);
2139 fjx0 = _mm256_add_pd(fjx0,tx);
2140 fjy0 = _mm256_add_pd(fjy0,ty);
2141 fjz0 = _mm256_add_pd(fjz0,tz);
2145 /**************************
2146 * CALCULATE INTERACTIONS *
2147 **************************/
2149 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2152 /* REACTION-FIELD ELECTROSTATICS */
2153 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2155 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2159 fscal = _mm256_and_pd(fscal,cutoff_mask);
2161 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2163 /* Calculate temporary vectorial force */
2164 tx = _mm256_mul_pd(fscal,dx21);
2165 ty = _mm256_mul_pd(fscal,dy21);
2166 tz = _mm256_mul_pd(fscal,dz21);
2168 /* Update vectorial force */
2169 fix2 = _mm256_add_pd(fix2,tx);
2170 fiy2 = _mm256_add_pd(fiy2,ty);
2171 fiz2 = _mm256_add_pd(fiz2,tz);
2173 fjx1 = _mm256_add_pd(fjx1,tx);
2174 fjy1 = _mm256_add_pd(fjy1,ty);
2175 fjz1 = _mm256_add_pd(fjz1,tz);
2179 /**************************
2180 * CALCULATE INTERACTIONS *
2181 **************************/
2183 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2186 /* REACTION-FIELD ELECTROSTATICS */
2187 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2189 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2193 fscal = _mm256_and_pd(fscal,cutoff_mask);
2195 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2197 /* Calculate temporary vectorial force */
2198 tx = _mm256_mul_pd(fscal,dx22);
2199 ty = _mm256_mul_pd(fscal,dy22);
2200 tz = _mm256_mul_pd(fscal,dz22);
2202 /* Update vectorial force */
2203 fix2 = _mm256_add_pd(fix2,tx);
2204 fiy2 = _mm256_add_pd(fiy2,ty);
2205 fiz2 = _mm256_add_pd(fiz2,tz);
2207 fjx2 = _mm256_add_pd(fjx2,tx);
2208 fjy2 = _mm256_add_pd(fjy2,ty);
2209 fjz2 = _mm256_add_pd(fjz2,tz);
2213 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2214 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2215 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2216 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2218 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2219 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2221 /* Inner loop uses 302 flops */
2224 /* End of innermost loop */
2226 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2227 f+i_coord_offset,fshift+i_shift_offset);
2229 /* Increment number of inner iterations */
2230 inneriter += j_index_end - j_index_start;
2232 /* Outer loop uses 18 flops */
2235 /* Increment number of outer iterations */
2238 /* Update outer/inner flops */
2240 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);