2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double
54 * Electrostatics interaction: ReactionField
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
114 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
115 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
116 real rswitch_scalar,d_scalar;
117 __m256d dummy_mask,cutoff_mask;
118 __m128 tmpmask0,tmpmask1;
119 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
120 __m256d one = _mm256_set1_pd(1.0);
121 __m256d two = _mm256_set1_pd(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm256_set1_pd(fr->epsfac);
134 charge = mdatoms->chargeA;
135 krf = _mm256_set1_pd(fr->ic->k_rf);
136 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
137 crf = _mm256_set1_pd(fr->ic->c_rf);
138 nvdwtype = fr->ntype;
140 vdwtype = mdatoms->typeA;
142 /* Setup water-specific parameters */
143 inr = nlist->iinr[0];
144 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
145 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
146 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
147 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
149 jq0 = _mm256_set1_pd(charge[inr+0]);
150 jq1 = _mm256_set1_pd(charge[inr+1]);
151 jq2 = _mm256_set1_pd(charge[inr+2]);
152 vdwjidx0A = 2*vdwtype[inr+0];
153 qq00 = _mm256_mul_pd(iq0,jq0);
154 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
155 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
156 qq01 = _mm256_mul_pd(iq0,jq1);
157 qq02 = _mm256_mul_pd(iq0,jq2);
158 qq10 = _mm256_mul_pd(iq1,jq0);
159 qq11 = _mm256_mul_pd(iq1,jq1);
160 qq12 = _mm256_mul_pd(iq1,jq2);
161 qq20 = _mm256_mul_pd(iq2,jq0);
162 qq21 = _mm256_mul_pd(iq2,jq1);
163 qq22 = _mm256_mul_pd(iq2,jq2);
165 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
166 rcutoff_scalar = fr->rcoulomb;
167 rcutoff = _mm256_set1_pd(rcutoff_scalar);
168 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
170 rswitch_scalar = fr->rvdw_switch;
171 rswitch = _mm256_set1_pd(rswitch_scalar);
172 /* Setup switch parameters */
173 d_scalar = rcutoff_scalar-rswitch_scalar;
174 d = _mm256_set1_pd(d_scalar);
175 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
176 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
178 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
179 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
182 /* Avoid stupid compiler warnings */
183 jnrA = jnrB = jnrC = jnrD = 0;
192 for(iidx=0;iidx<4*DIM;iidx++)
197 /* Start outer loop over neighborlists */
198 for(iidx=0; iidx<nri; iidx++)
200 /* Load shift vector for this list */
201 i_shift_offset = DIM*shiftidx[iidx];
203 /* Load limits for loop over neighbors */
204 j_index_start = jindex[iidx];
205 j_index_end = jindex[iidx+1];
207 /* Get outer coordinate index */
209 i_coord_offset = DIM*inr;
211 /* Load i particle coords and add shift vector */
212 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
213 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
215 fix0 = _mm256_setzero_pd();
216 fiy0 = _mm256_setzero_pd();
217 fiz0 = _mm256_setzero_pd();
218 fix1 = _mm256_setzero_pd();
219 fiy1 = _mm256_setzero_pd();
220 fiz1 = _mm256_setzero_pd();
221 fix2 = _mm256_setzero_pd();
222 fiy2 = _mm256_setzero_pd();
223 fiz2 = _mm256_setzero_pd();
225 /* Reset potential sums */
226 velecsum = _mm256_setzero_pd();
227 vvdwsum = _mm256_setzero_pd();
229 /* Start inner kernel loop */
230 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
233 /* Get j neighbor index, and coordinate index */
238 j_coord_offsetA = DIM*jnrA;
239 j_coord_offsetB = DIM*jnrB;
240 j_coord_offsetC = DIM*jnrC;
241 j_coord_offsetD = DIM*jnrD;
243 /* load j atom coordinates */
244 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
245 x+j_coord_offsetC,x+j_coord_offsetD,
246 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
248 /* Calculate displacement vector */
249 dx00 = _mm256_sub_pd(ix0,jx0);
250 dy00 = _mm256_sub_pd(iy0,jy0);
251 dz00 = _mm256_sub_pd(iz0,jz0);
252 dx01 = _mm256_sub_pd(ix0,jx1);
253 dy01 = _mm256_sub_pd(iy0,jy1);
254 dz01 = _mm256_sub_pd(iz0,jz1);
255 dx02 = _mm256_sub_pd(ix0,jx2);
256 dy02 = _mm256_sub_pd(iy0,jy2);
257 dz02 = _mm256_sub_pd(iz0,jz2);
258 dx10 = _mm256_sub_pd(ix1,jx0);
259 dy10 = _mm256_sub_pd(iy1,jy0);
260 dz10 = _mm256_sub_pd(iz1,jz0);
261 dx11 = _mm256_sub_pd(ix1,jx1);
262 dy11 = _mm256_sub_pd(iy1,jy1);
263 dz11 = _mm256_sub_pd(iz1,jz1);
264 dx12 = _mm256_sub_pd(ix1,jx2);
265 dy12 = _mm256_sub_pd(iy1,jy2);
266 dz12 = _mm256_sub_pd(iz1,jz2);
267 dx20 = _mm256_sub_pd(ix2,jx0);
268 dy20 = _mm256_sub_pd(iy2,jy0);
269 dz20 = _mm256_sub_pd(iz2,jz0);
270 dx21 = _mm256_sub_pd(ix2,jx1);
271 dy21 = _mm256_sub_pd(iy2,jy1);
272 dz21 = _mm256_sub_pd(iz2,jz1);
273 dx22 = _mm256_sub_pd(ix2,jx2);
274 dy22 = _mm256_sub_pd(iy2,jy2);
275 dz22 = _mm256_sub_pd(iz2,jz2);
277 /* Calculate squared distance and things based on it */
278 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
279 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
280 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
281 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
282 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
283 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
284 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
285 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
286 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
288 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
289 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
290 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
291 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
292 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
293 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
294 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
295 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
296 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
298 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
299 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
300 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
301 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
302 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
303 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
304 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
305 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
306 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
308 fjx0 = _mm256_setzero_pd();
309 fjy0 = _mm256_setzero_pd();
310 fjz0 = _mm256_setzero_pd();
311 fjx1 = _mm256_setzero_pd();
312 fjy1 = _mm256_setzero_pd();
313 fjz1 = _mm256_setzero_pd();
314 fjx2 = _mm256_setzero_pd();
315 fjy2 = _mm256_setzero_pd();
316 fjz2 = _mm256_setzero_pd();
318 /**************************
319 * CALCULATE INTERACTIONS *
320 **************************/
322 if (gmx_mm256_any_lt(rsq00,rcutoff2))
325 r00 = _mm256_mul_pd(rsq00,rinv00);
327 /* REACTION-FIELD ELECTROSTATICS */
328 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
329 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
331 /* LENNARD-JONES DISPERSION/REPULSION */
333 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
334 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
335 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
336 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
337 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
339 d = _mm256_sub_pd(r00,rswitch);
340 d = _mm256_max_pd(d,_mm256_setzero_pd());
341 d2 = _mm256_mul_pd(d,d);
342 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
344 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
346 /* Evaluate switch function */
347 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
349 vvdw = _mm256_mul_pd(vvdw,sw);
350 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 velec = _mm256_and_pd(velec,cutoff_mask);
354 velecsum = _mm256_add_pd(velecsum,velec);
355 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
356 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
358 fscal = _mm256_add_pd(felec,fvdw);
360 fscal = _mm256_and_pd(fscal,cutoff_mask);
362 /* Calculate temporary vectorial force */
363 tx = _mm256_mul_pd(fscal,dx00);
364 ty = _mm256_mul_pd(fscal,dy00);
365 tz = _mm256_mul_pd(fscal,dz00);
367 /* Update vectorial force */
368 fix0 = _mm256_add_pd(fix0,tx);
369 fiy0 = _mm256_add_pd(fiy0,ty);
370 fiz0 = _mm256_add_pd(fiz0,tz);
372 fjx0 = _mm256_add_pd(fjx0,tx);
373 fjy0 = _mm256_add_pd(fjy0,ty);
374 fjz0 = _mm256_add_pd(fjz0,tz);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 if (gmx_mm256_any_lt(rsq01,rcutoff2))
385 /* REACTION-FIELD ELECTROSTATICS */
386 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
387 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
389 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
391 /* Update potential sum for this i atom from the interaction with this j atom. */
392 velec = _mm256_and_pd(velec,cutoff_mask);
393 velecsum = _mm256_add_pd(velecsum,velec);
397 fscal = _mm256_and_pd(fscal,cutoff_mask);
399 /* Calculate temporary vectorial force */
400 tx = _mm256_mul_pd(fscal,dx01);
401 ty = _mm256_mul_pd(fscal,dy01);
402 tz = _mm256_mul_pd(fscal,dz01);
404 /* Update vectorial force */
405 fix0 = _mm256_add_pd(fix0,tx);
406 fiy0 = _mm256_add_pd(fiy0,ty);
407 fiz0 = _mm256_add_pd(fiz0,tz);
409 fjx1 = _mm256_add_pd(fjx1,tx);
410 fjy1 = _mm256_add_pd(fjy1,ty);
411 fjz1 = _mm256_add_pd(fjz1,tz);
415 /**************************
416 * CALCULATE INTERACTIONS *
417 **************************/
419 if (gmx_mm256_any_lt(rsq02,rcutoff2))
422 /* REACTION-FIELD ELECTROSTATICS */
423 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
424 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
426 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
428 /* Update potential sum for this i atom from the interaction with this j atom. */
429 velec = _mm256_and_pd(velec,cutoff_mask);
430 velecsum = _mm256_add_pd(velecsum,velec);
434 fscal = _mm256_and_pd(fscal,cutoff_mask);
436 /* Calculate temporary vectorial force */
437 tx = _mm256_mul_pd(fscal,dx02);
438 ty = _mm256_mul_pd(fscal,dy02);
439 tz = _mm256_mul_pd(fscal,dz02);
441 /* Update vectorial force */
442 fix0 = _mm256_add_pd(fix0,tx);
443 fiy0 = _mm256_add_pd(fiy0,ty);
444 fiz0 = _mm256_add_pd(fiz0,tz);
446 fjx2 = _mm256_add_pd(fjx2,tx);
447 fjy2 = _mm256_add_pd(fjy2,ty);
448 fjz2 = _mm256_add_pd(fjz2,tz);
452 /**************************
453 * CALCULATE INTERACTIONS *
454 **************************/
456 if (gmx_mm256_any_lt(rsq10,rcutoff2))
459 /* REACTION-FIELD ELECTROSTATICS */
460 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
461 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
463 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
465 /* Update potential sum for this i atom from the interaction with this j atom. */
466 velec = _mm256_and_pd(velec,cutoff_mask);
467 velecsum = _mm256_add_pd(velecsum,velec);
471 fscal = _mm256_and_pd(fscal,cutoff_mask);
473 /* Calculate temporary vectorial force */
474 tx = _mm256_mul_pd(fscal,dx10);
475 ty = _mm256_mul_pd(fscal,dy10);
476 tz = _mm256_mul_pd(fscal,dz10);
478 /* Update vectorial force */
479 fix1 = _mm256_add_pd(fix1,tx);
480 fiy1 = _mm256_add_pd(fiy1,ty);
481 fiz1 = _mm256_add_pd(fiz1,tz);
483 fjx0 = _mm256_add_pd(fjx0,tx);
484 fjy0 = _mm256_add_pd(fjy0,ty);
485 fjz0 = _mm256_add_pd(fjz0,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 if (gmx_mm256_any_lt(rsq11,rcutoff2))
496 /* REACTION-FIELD ELECTROSTATICS */
497 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
498 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
500 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
502 /* Update potential sum for this i atom from the interaction with this j atom. */
503 velec = _mm256_and_pd(velec,cutoff_mask);
504 velecsum = _mm256_add_pd(velecsum,velec);
508 fscal = _mm256_and_pd(fscal,cutoff_mask);
510 /* Calculate temporary vectorial force */
511 tx = _mm256_mul_pd(fscal,dx11);
512 ty = _mm256_mul_pd(fscal,dy11);
513 tz = _mm256_mul_pd(fscal,dz11);
515 /* Update vectorial force */
516 fix1 = _mm256_add_pd(fix1,tx);
517 fiy1 = _mm256_add_pd(fiy1,ty);
518 fiz1 = _mm256_add_pd(fiz1,tz);
520 fjx1 = _mm256_add_pd(fjx1,tx);
521 fjy1 = _mm256_add_pd(fjy1,ty);
522 fjz1 = _mm256_add_pd(fjz1,tz);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 if (gmx_mm256_any_lt(rsq12,rcutoff2))
533 /* REACTION-FIELD ELECTROSTATICS */
534 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
535 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
537 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velec = _mm256_and_pd(velec,cutoff_mask);
541 velecsum = _mm256_add_pd(velecsum,velec);
545 fscal = _mm256_and_pd(fscal,cutoff_mask);
547 /* Calculate temporary vectorial force */
548 tx = _mm256_mul_pd(fscal,dx12);
549 ty = _mm256_mul_pd(fscal,dy12);
550 tz = _mm256_mul_pd(fscal,dz12);
552 /* Update vectorial force */
553 fix1 = _mm256_add_pd(fix1,tx);
554 fiy1 = _mm256_add_pd(fiy1,ty);
555 fiz1 = _mm256_add_pd(fiz1,tz);
557 fjx2 = _mm256_add_pd(fjx2,tx);
558 fjy2 = _mm256_add_pd(fjy2,ty);
559 fjz2 = _mm256_add_pd(fjz2,tz);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 if (gmx_mm256_any_lt(rsq20,rcutoff2))
570 /* REACTION-FIELD ELECTROSTATICS */
571 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
572 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
574 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
576 /* Update potential sum for this i atom from the interaction with this j atom. */
577 velec = _mm256_and_pd(velec,cutoff_mask);
578 velecsum = _mm256_add_pd(velecsum,velec);
582 fscal = _mm256_and_pd(fscal,cutoff_mask);
584 /* Calculate temporary vectorial force */
585 tx = _mm256_mul_pd(fscal,dx20);
586 ty = _mm256_mul_pd(fscal,dy20);
587 tz = _mm256_mul_pd(fscal,dz20);
589 /* Update vectorial force */
590 fix2 = _mm256_add_pd(fix2,tx);
591 fiy2 = _mm256_add_pd(fiy2,ty);
592 fiz2 = _mm256_add_pd(fiz2,tz);
594 fjx0 = _mm256_add_pd(fjx0,tx);
595 fjy0 = _mm256_add_pd(fjy0,ty);
596 fjz0 = _mm256_add_pd(fjz0,tz);
600 /**************************
601 * CALCULATE INTERACTIONS *
602 **************************/
604 if (gmx_mm256_any_lt(rsq21,rcutoff2))
607 /* REACTION-FIELD ELECTROSTATICS */
608 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
609 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
611 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
613 /* Update potential sum for this i atom from the interaction with this j atom. */
614 velec = _mm256_and_pd(velec,cutoff_mask);
615 velecsum = _mm256_add_pd(velecsum,velec);
619 fscal = _mm256_and_pd(fscal,cutoff_mask);
621 /* Calculate temporary vectorial force */
622 tx = _mm256_mul_pd(fscal,dx21);
623 ty = _mm256_mul_pd(fscal,dy21);
624 tz = _mm256_mul_pd(fscal,dz21);
626 /* Update vectorial force */
627 fix2 = _mm256_add_pd(fix2,tx);
628 fiy2 = _mm256_add_pd(fiy2,ty);
629 fiz2 = _mm256_add_pd(fiz2,tz);
631 fjx1 = _mm256_add_pd(fjx1,tx);
632 fjy1 = _mm256_add_pd(fjy1,ty);
633 fjz1 = _mm256_add_pd(fjz1,tz);
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 if (gmx_mm256_any_lt(rsq22,rcutoff2))
644 /* REACTION-FIELD ELECTROSTATICS */
645 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
646 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
648 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
650 /* Update potential sum for this i atom from the interaction with this j atom. */
651 velec = _mm256_and_pd(velec,cutoff_mask);
652 velecsum = _mm256_add_pd(velecsum,velec);
656 fscal = _mm256_and_pd(fscal,cutoff_mask);
658 /* Calculate temporary vectorial force */
659 tx = _mm256_mul_pd(fscal,dx22);
660 ty = _mm256_mul_pd(fscal,dy22);
661 tz = _mm256_mul_pd(fscal,dz22);
663 /* Update vectorial force */
664 fix2 = _mm256_add_pd(fix2,tx);
665 fiy2 = _mm256_add_pd(fiy2,ty);
666 fiz2 = _mm256_add_pd(fiz2,tz);
668 fjx2 = _mm256_add_pd(fjx2,tx);
669 fjy2 = _mm256_add_pd(fjy2,ty);
670 fjz2 = _mm256_add_pd(fjz2,tz);
674 fjptrA = f+j_coord_offsetA;
675 fjptrB = f+j_coord_offsetB;
676 fjptrC = f+j_coord_offsetC;
677 fjptrD = f+j_coord_offsetD;
679 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
680 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
682 /* Inner loop uses 358 flops */
688 /* Get j neighbor index, and coordinate index */
689 jnrlistA = jjnr[jidx];
690 jnrlistB = jjnr[jidx+1];
691 jnrlistC = jjnr[jidx+2];
692 jnrlistD = jjnr[jidx+3];
693 /* Sign of each element will be negative for non-real atoms.
694 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
695 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
697 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
699 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
700 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
701 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
703 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
704 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
705 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
706 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
707 j_coord_offsetA = DIM*jnrA;
708 j_coord_offsetB = DIM*jnrB;
709 j_coord_offsetC = DIM*jnrC;
710 j_coord_offsetD = DIM*jnrD;
712 /* load j atom coordinates */
713 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
714 x+j_coord_offsetC,x+j_coord_offsetD,
715 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
717 /* Calculate displacement vector */
718 dx00 = _mm256_sub_pd(ix0,jx0);
719 dy00 = _mm256_sub_pd(iy0,jy0);
720 dz00 = _mm256_sub_pd(iz0,jz0);
721 dx01 = _mm256_sub_pd(ix0,jx1);
722 dy01 = _mm256_sub_pd(iy0,jy1);
723 dz01 = _mm256_sub_pd(iz0,jz1);
724 dx02 = _mm256_sub_pd(ix0,jx2);
725 dy02 = _mm256_sub_pd(iy0,jy2);
726 dz02 = _mm256_sub_pd(iz0,jz2);
727 dx10 = _mm256_sub_pd(ix1,jx0);
728 dy10 = _mm256_sub_pd(iy1,jy0);
729 dz10 = _mm256_sub_pd(iz1,jz0);
730 dx11 = _mm256_sub_pd(ix1,jx1);
731 dy11 = _mm256_sub_pd(iy1,jy1);
732 dz11 = _mm256_sub_pd(iz1,jz1);
733 dx12 = _mm256_sub_pd(ix1,jx2);
734 dy12 = _mm256_sub_pd(iy1,jy2);
735 dz12 = _mm256_sub_pd(iz1,jz2);
736 dx20 = _mm256_sub_pd(ix2,jx0);
737 dy20 = _mm256_sub_pd(iy2,jy0);
738 dz20 = _mm256_sub_pd(iz2,jz0);
739 dx21 = _mm256_sub_pd(ix2,jx1);
740 dy21 = _mm256_sub_pd(iy2,jy1);
741 dz21 = _mm256_sub_pd(iz2,jz1);
742 dx22 = _mm256_sub_pd(ix2,jx2);
743 dy22 = _mm256_sub_pd(iy2,jy2);
744 dz22 = _mm256_sub_pd(iz2,jz2);
746 /* Calculate squared distance and things based on it */
747 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
748 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
749 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
750 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
751 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
752 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
753 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
754 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
755 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
757 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
758 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
759 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
760 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
761 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
762 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
763 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
764 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
765 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
767 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
768 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
769 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
770 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
771 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
772 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
773 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
774 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
775 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
777 fjx0 = _mm256_setzero_pd();
778 fjy0 = _mm256_setzero_pd();
779 fjz0 = _mm256_setzero_pd();
780 fjx1 = _mm256_setzero_pd();
781 fjy1 = _mm256_setzero_pd();
782 fjz1 = _mm256_setzero_pd();
783 fjx2 = _mm256_setzero_pd();
784 fjy2 = _mm256_setzero_pd();
785 fjz2 = _mm256_setzero_pd();
787 /**************************
788 * CALCULATE INTERACTIONS *
789 **************************/
791 if (gmx_mm256_any_lt(rsq00,rcutoff2))
794 r00 = _mm256_mul_pd(rsq00,rinv00);
795 r00 = _mm256_andnot_pd(dummy_mask,r00);
797 /* REACTION-FIELD ELECTROSTATICS */
798 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
799 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
801 /* LENNARD-JONES DISPERSION/REPULSION */
803 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
804 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
805 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
806 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
807 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
809 d = _mm256_sub_pd(r00,rswitch);
810 d = _mm256_max_pd(d,_mm256_setzero_pd());
811 d2 = _mm256_mul_pd(d,d);
812 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
814 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
816 /* Evaluate switch function */
817 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
818 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
819 vvdw = _mm256_mul_pd(vvdw,sw);
820 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
822 /* Update potential sum for this i atom from the interaction with this j atom. */
823 velec = _mm256_and_pd(velec,cutoff_mask);
824 velec = _mm256_andnot_pd(dummy_mask,velec);
825 velecsum = _mm256_add_pd(velecsum,velec);
826 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
827 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
828 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
830 fscal = _mm256_add_pd(felec,fvdw);
832 fscal = _mm256_and_pd(fscal,cutoff_mask);
834 fscal = _mm256_andnot_pd(dummy_mask,fscal);
836 /* Calculate temporary vectorial force */
837 tx = _mm256_mul_pd(fscal,dx00);
838 ty = _mm256_mul_pd(fscal,dy00);
839 tz = _mm256_mul_pd(fscal,dz00);
841 /* Update vectorial force */
842 fix0 = _mm256_add_pd(fix0,tx);
843 fiy0 = _mm256_add_pd(fiy0,ty);
844 fiz0 = _mm256_add_pd(fiz0,tz);
846 fjx0 = _mm256_add_pd(fjx0,tx);
847 fjy0 = _mm256_add_pd(fjy0,ty);
848 fjz0 = _mm256_add_pd(fjz0,tz);
852 /**************************
853 * CALCULATE INTERACTIONS *
854 **************************/
856 if (gmx_mm256_any_lt(rsq01,rcutoff2))
859 /* REACTION-FIELD ELECTROSTATICS */
860 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
861 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
863 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
865 /* Update potential sum for this i atom from the interaction with this j atom. */
866 velec = _mm256_and_pd(velec,cutoff_mask);
867 velec = _mm256_andnot_pd(dummy_mask,velec);
868 velecsum = _mm256_add_pd(velecsum,velec);
872 fscal = _mm256_and_pd(fscal,cutoff_mask);
874 fscal = _mm256_andnot_pd(dummy_mask,fscal);
876 /* Calculate temporary vectorial force */
877 tx = _mm256_mul_pd(fscal,dx01);
878 ty = _mm256_mul_pd(fscal,dy01);
879 tz = _mm256_mul_pd(fscal,dz01);
881 /* Update vectorial force */
882 fix0 = _mm256_add_pd(fix0,tx);
883 fiy0 = _mm256_add_pd(fiy0,ty);
884 fiz0 = _mm256_add_pd(fiz0,tz);
886 fjx1 = _mm256_add_pd(fjx1,tx);
887 fjy1 = _mm256_add_pd(fjy1,ty);
888 fjz1 = _mm256_add_pd(fjz1,tz);
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 if (gmx_mm256_any_lt(rsq02,rcutoff2))
899 /* REACTION-FIELD ELECTROSTATICS */
900 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
901 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
903 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
905 /* Update potential sum for this i atom from the interaction with this j atom. */
906 velec = _mm256_and_pd(velec,cutoff_mask);
907 velec = _mm256_andnot_pd(dummy_mask,velec);
908 velecsum = _mm256_add_pd(velecsum,velec);
912 fscal = _mm256_and_pd(fscal,cutoff_mask);
914 fscal = _mm256_andnot_pd(dummy_mask,fscal);
916 /* Calculate temporary vectorial force */
917 tx = _mm256_mul_pd(fscal,dx02);
918 ty = _mm256_mul_pd(fscal,dy02);
919 tz = _mm256_mul_pd(fscal,dz02);
921 /* Update vectorial force */
922 fix0 = _mm256_add_pd(fix0,tx);
923 fiy0 = _mm256_add_pd(fiy0,ty);
924 fiz0 = _mm256_add_pd(fiz0,tz);
926 fjx2 = _mm256_add_pd(fjx2,tx);
927 fjy2 = _mm256_add_pd(fjy2,ty);
928 fjz2 = _mm256_add_pd(fjz2,tz);
932 /**************************
933 * CALCULATE INTERACTIONS *
934 **************************/
936 if (gmx_mm256_any_lt(rsq10,rcutoff2))
939 /* REACTION-FIELD ELECTROSTATICS */
940 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
941 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
943 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
945 /* Update potential sum for this i atom from the interaction with this j atom. */
946 velec = _mm256_and_pd(velec,cutoff_mask);
947 velec = _mm256_andnot_pd(dummy_mask,velec);
948 velecsum = _mm256_add_pd(velecsum,velec);
952 fscal = _mm256_and_pd(fscal,cutoff_mask);
954 fscal = _mm256_andnot_pd(dummy_mask,fscal);
956 /* Calculate temporary vectorial force */
957 tx = _mm256_mul_pd(fscal,dx10);
958 ty = _mm256_mul_pd(fscal,dy10);
959 tz = _mm256_mul_pd(fscal,dz10);
961 /* Update vectorial force */
962 fix1 = _mm256_add_pd(fix1,tx);
963 fiy1 = _mm256_add_pd(fiy1,ty);
964 fiz1 = _mm256_add_pd(fiz1,tz);
966 fjx0 = _mm256_add_pd(fjx0,tx);
967 fjy0 = _mm256_add_pd(fjy0,ty);
968 fjz0 = _mm256_add_pd(fjz0,tz);
972 /**************************
973 * CALCULATE INTERACTIONS *
974 **************************/
976 if (gmx_mm256_any_lt(rsq11,rcutoff2))
979 /* REACTION-FIELD ELECTROSTATICS */
980 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
981 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
983 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
985 /* Update potential sum for this i atom from the interaction with this j atom. */
986 velec = _mm256_and_pd(velec,cutoff_mask);
987 velec = _mm256_andnot_pd(dummy_mask,velec);
988 velecsum = _mm256_add_pd(velecsum,velec);
992 fscal = _mm256_and_pd(fscal,cutoff_mask);
994 fscal = _mm256_andnot_pd(dummy_mask,fscal);
996 /* Calculate temporary vectorial force */
997 tx = _mm256_mul_pd(fscal,dx11);
998 ty = _mm256_mul_pd(fscal,dy11);
999 tz = _mm256_mul_pd(fscal,dz11);
1001 /* Update vectorial force */
1002 fix1 = _mm256_add_pd(fix1,tx);
1003 fiy1 = _mm256_add_pd(fiy1,ty);
1004 fiz1 = _mm256_add_pd(fiz1,tz);
1006 fjx1 = _mm256_add_pd(fjx1,tx);
1007 fjy1 = _mm256_add_pd(fjy1,ty);
1008 fjz1 = _mm256_add_pd(fjz1,tz);
1012 /**************************
1013 * CALCULATE INTERACTIONS *
1014 **************************/
1016 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1019 /* REACTION-FIELD ELECTROSTATICS */
1020 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
1021 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1023 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec = _mm256_and_pd(velec,cutoff_mask);
1027 velec = _mm256_andnot_pd(dummy_mask,velec);
1028 velecsum = _mm256_add_pd(velecsum,velec);
1032 fscal = _mm256_and_pd(fscal,cutoff_mask);
1034 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1036 /* Calculate temporary vectorial force */
1037 tx = _mm256_mul_pd(fscal,dx12);
1038 ty = _mm256_mul_pd(fscal,dy12);
1039 tz = _mm256_mul_pd(fscal,dz12);
1041 /* Update vectorial force */
1042 fix1 = _mm256_add_pd(fix1,tx);
1043 fiy1 = _mm256_add_pd(fiy1,ty);
1044 fiz1 = _mm256_add_pd(fiz1,tz);
1046 fjx2 = _mm256_add_pd(fjx2,tx);
1047 fjy2 = _mm256_add_pd(fjy2,ty);
1048 fjz2 = _mm256_add_pd(fjz2,tz);
1052 /**************************
1053 * CALCULATE INTERACTIONS *
1054 **************************/
1056 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1059 /* REACTION-FIELD ELECTROSTATICS */
1060 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
1061 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1063 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1065 /* Update potential sum for this i atom from the interaction with this j atom. */
1066 velec = _mm256_and_pd(velec,cutoff_mask);
1067 velec = _mm256_andnot_pd(dummy_mask,velec);
1068 velecsum = _mm256_add_pd(velecsum,velec);
1072 fscal = _mm256_and_pd(fscal,cutoff_mask);
1074 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1076 /* Calculate temporary vectorial force */
1077 tx = _mm256_mul_pd(fscal,dx20);
1078 ty = _mm256_mul_pd(fscal,dy20);
1079 tz = _mm256_mul_pd(fscal,dz20);
1081 /* Update vectorial force */
1082 fix2 = _mm256_add_pd(fix2,tx);
1083 fiy2 = _mm256_add_pd(fiy2,ty);
1084 fiz2 = _mm256_add_pd(fiz2,tz);
1086 fjx0 = _mm256_add_pd(fjx0,tx);
1087 fjy0 = _mm256_add_pd(fjy0,ty);
1088 fjz0 = _mm256_add_pd(fjz0,tz);
1092 /**************************
1093 * CALCULATE INTERACTIONS *
1094 **************************/
1096 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1099 /* REACTION-FIELD ELECTROSTATICS */
1100 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1101 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1103 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1105 /* Update potential sum for this i atom from the interaction with this j atom. */
1106 velec = _mm256_and_pd(velec,cutoff_mask);
1107 velec = _mm256_andnot_pd(dummy_mask,velec);
1108 velecsum = _mm256_add_pd(velecsum,velec);
1112 fscal = _mm256_and_pd(fscal,cutoff_mask);
1114 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1116 /* Calculate temporary vectorial force */
1117 tx = _mm256_mul_pd(fscal,dx21);
1118 ty = _mm256_mul_pd(fscal,dy21);
1119 tz = _mm256_mul_pd(fscal,dz21);
1121 /* Update vectorial force */
1122 fix2 = _mm256_add_pd(fix2,tx);
1123 fiy2 = _mm256_add_pd(fiy2,ty);
1124 fiz2 = _mm256_add_pd(fiz2,tz);
1126 fjx1 = _mm256_add_pd(fjx1,tx);
1127 fjy1 = _mm256_add_pd(fjy1,ty);
1128 fjz1 = _mm256_add_pd(fjz1,tz);
1132 /**************************
1133 * CALCULATE INTERACTIONS *
1134 **************************/
1136 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1139 /* REACTION-FIELD ELECTROSTATICS */
1140 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1141 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1143 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1145 /* Update potential sum for this i atom from the interaction with this j atom. */
1146 velec = _mm256_and_pd(velec,cutoff_mask);
1147 velec = _mm256_andnot_pd(dummy_mask,velec);
1148 velecsum = _mm256_add_pd(velecsum,velec);
1152 fscal = _mm256_and_pd(fscal,cutoff_mask);
1154 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1156 /* Calculate temporary vectorial force */
1157 tx = _mm256_mul_pd(fscal,dx22);
1158 ty = _mm256_mul_pd(fscal,dy22);
1159 tz = _mm256_mul_pd(fscal,dz22);
1161 /* Update vectorial force */
1162 fix2 = _mm256_add_pd(fix2,tx);
1163 fiy2 = _mm256_add_pd(fiy2,ty);
1164 fiz2 = _mm256_add_pd(fiz2,tz);
1166 fjx2 = _mm256_add_pd(fjx2,tx);
1167 fjy2 = _mm256_add_pd(fjy2,ty);
1168 fjz2 = _mm256_add_pd(fjz2,tz);
1172 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1173 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1174 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1175 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1177 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1178 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1180 /* Inner loop uses 359 flops */
1183 /* End of innermost loop */
1185 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1186 f+i_coord_offset,fshift+i_shift_offset);
1189 /* Update potential energies */
1190 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1191 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1193 /* Increment number of inner iterations */
1194 inneriter += j_index_end - j_index_start;
1196 /* Outer loop uses 20 flops */
1199 /* Increment number of outer iterations */
1202 /* Update outer/inner flops */
1204 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
1207 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double
1208 * Electrostatics interaction: ReactionField
1209 * VdW interaction: LennardJones
1210 * Geometry: Water3-Water3
1211 * Calculate force/pot: Force
1214 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double
1215 (t_nblist * gmx_restrict nlist,
1216 rvec * gmx_restrict xx,
1217 rvec * gmx_restrict ff,
1218 t_forcerec * gmx_restrict fr,
1219 t_mdatoms * gmx_restrict mdatoms,
1220 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1221 t_nrnb * gmx_restrict nrnb)
1223 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1224 * just 0 for non-waters.
1225 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1226 * jnr indices corresponding to data put in the four positions in the SIMD register.
1228 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1229 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1230 int jnrA,jnrB,jnrC,jnrD;
1231 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1232 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1233 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1234 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1235 real rcutoff_scalar;
1236 real *shiftvec,*fshift,*x,*f;
1237 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1238 real scratch[4*DIM];
1239 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1240 real * vdwioffsetptr0;
1241 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1242 real * vdwioffsetptr1;
1243 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1244 real * vdwioffsetptr2;
1245 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1246 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1247 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1248 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1249 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1250 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1251 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1252 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1253 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1254 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1255 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1256 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1257 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1258 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1259 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1260 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1261 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1264 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1267 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1268 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1269 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1270 real rswitch_scalar,d_scalar;
1271 __m256d dummy_mask,cutoff_mask;
1272 __m128 tmpmask0,tmpmask1;
1273 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1274 __m256d one = _mm256_set1_pd(1.0);
1275 __m256d two = _mm256_set1_pd(2.0);
1281 jindex = nlist->jindex;
1283 shiftidx = nlist->shift;
1285 shiftvec = fr->shift_vec[0];
1286 fshift = fr->fshift[0];
1287 facel = _mm256_set1_pd(fr->epsfac);
1288 charge = mdatoms->chargeA;
1289 krf = _mm256_set1_pd(fr->ic->k_rf);
1290 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1291 crf = _mm256_set1_pd(fr->ic->c_rf);
1292 nvdwtype = fr->ntype;
1293 vdwparam = fr->nbfp;
1294 vdwtype = mdatoms->typeA;
1296 /* Setup water-specific parameters */
1297 inr = nlist->iinr[0];
1298 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1299 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1300 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1301 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1303 jq0 = _mm256_set1_pd(charge[inr+0]);
1304 jq1 = _mm256_set1_pd(charge[inr+1]);
1305 jq2 = _mm256_set1_pd(charge[inr+2]);
1306 vdwjidx0A = 2*vdwtype[inr+0];
1307 qq00 = _mm256_mul_pd(iq0,jq0);
1308 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1309 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1310 qq01 = _mm256_mul_pd(iq0,jq1);
1311 qq02 = _mm256_mul_pd(iq0,jq2);
1312 qq10 = _mm256_mul_pd(iq1,jq0);
1313 qq11 = _mm256_mul_pd(iq1,jq1);
1314 qq12 = _mm256_mul_pd(iq1,jq2);
1315 qq20 = _mm256_mul_pd(iq2,jq0);
1316 qq21 = _mm256_mul_pd(iq2,jq1);
1317 qq22 = _mm256_mul_pd(iq2,jq2);
1319 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1320 rcutoff_scalar = fr->rcoulomb;
1321 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1322 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1324 rswitch_scalar = fr->rvdw_switch;
1325 rswitch = _mm256_set1_pd(rswitch_scalar);
1326 /* Setup switch parameters */
1327 d_scalar = rcutoff_scalar-rswitch_scalar;
1328 d = _mm256_set1_pd(d_scalar);
1329 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1330 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1331 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1332 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1333 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1334 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1336 /* Avoid stupid compiler warnings */
1337 jnrA = jnrB = jnrC = jnrD = 0;
1338 j_coord_offsetA = 0;
1339 j_coord_offsetB = 0;
1340 j_coord_offsetC = 0;
1341 j_coord_offsetD = 0;
1346 for(iidx=0;iidx<4*DIM;iidx++)
1348 scratch[iidx] = 0.0;
1351 /* Start outer loop over neighborlists */
1352 for(iidx=0; iidx<nri; iidx++)
1354 /* Load shift vector for this list */
1355 i_shift_offset = DIM*shiftidx[iidx];
1357 /* Load limits for loop over neighbors */
1358 j_index_start = jindex[iidx];
1359 j_index_end = jindex[iidx+1];
1361 /* Get outer coordinate index */
1363 i_coord_offset = DIM*inr;
1365 /* Load i particle coords and add shift vector */
1366 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1367 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1369 fix0 = _mm256_setzero_pd();
1370 fiy0 = _mm256_setzero_pd();
1371 fiz0 = _mm256_setzero_pd();
1372 fix1 = _mm256_setzero_pd();
1373 fiy1 = _mm256_setzero_pd();
1374 fiz1 = _mm256_setzero_pd();
1375 fix2 = _mm256_setzero_pd();
1376 fiy2 = _mm256_setzero_pd();
1377 fiz2 = _mm256_setzero_pd();
1379 /* Start inner kernel loop */
1380 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1383 /* Get j neighbor index, and coordinate index */
1385 jnrB = jjnr[jidx+1];
1386 jnrC = jjnr[jidx+2];
1387 jnrD = jjnr[jidx+3];
1388 j_coord_offsetA = DIM*jnrA;
1389 j_coord_offsetB = DIM*jnrB;
1390 j_coord_offsetC = DIM*jnrC;
1391 j_coord_offsetD = DIM*jnrD;
1393 /* load j atom coordinates */
1394 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1395 x+j_coord_offsetC,x+j_coord_offsetD,
1396 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1398 /* Calculate displacement vector */
1399 dx00 = _mm256_sub_pd(ix0,jx0);
1400 dy00 = _mm256_sub_pd(iy0,jy0);
1401 dz00 = _mm256_sub_pd(iz0,jz0);
1402 dx01 = _mm256_sub_pd(ix0,jx1);
1403 dy01 = _mm256_sub_pd(iy0,jy1);
1404 dz01 = _mm256_sub_pd(iz0,jz1);
1405 dx02 = _mm256_sub_pd(ix0,jx2);
1406 dy02 = _mm256_sub_pd(iy0,jy2);
1407 dz02 = _mm256_sub_pd(iz0,jz2);
1408 dx10 = _mm256_sub_pd(ix1,jx0);
1409 dy10 = _mm256_sub_pd(iy1,jy0);
1410 dz10 = _mm256_sub_pd(iz1,jz0);
1411 dx11 = _mm256_sub_pd(ix1,jx1);
1412 dy11 = _mm256_sub_pd(iy1,jy1);
1413 dz11 = _mm256_sub_pd(iz1,jz1);
1414 dx12 = _mm256_sub_pd(ix1,jx2);
1415 dy12 = _mm256_sub_pd(iy1,jy2);
1416 dz12 = _mm256_sub_pd(iz1,jz2);
1417 dx20 = _mm256_sub_pd(ix2,jx0);
1418 dy20 = _mm256_sub_pd(iy2,jy0);
1419 dz20 = _mm256_sub_pd(iz2,jz0);
1420 dx21 = _mm256_sub_pd(ix2,jx1);
1421 dy21 = _mm256_sub_pd(iy2,jy1);
1422 dz21 = _mm256_sub_pd(iz2,jz1);
1423 dx22 = _mm256_sub_pd(ix2,jx2);
1424 dy22 = _mm256_sub_pd(iy2,jy2);
1425 dz22 = _mm256_sub_pd(iz2,jz2);
1427 /* Calculate squared distance and things based on it */
1428 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1429 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1430 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1431 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1432 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1433 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1434 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1435 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1436 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1438 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1439 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1440 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1441 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1442 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1443 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1444 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1445 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1446 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1448 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1449 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1450 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1451 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1452 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1453 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1454 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1455 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1456 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1458 fjx0 = _mm256_setzero_pd();
1459 fjy0 = _mm256_setzero_pd();
1460 fjz0 = _mm256_setzero_pd();
1461 fjx1 = _mm256_setzero_pd();
1462 fjy1 = _mm256_setzero_pd();
1463 fjz1 = _mm256_setzero_pd();
1464 fjx2 = _mm256_setzero_pd();
1465 fjy2 = _mm256_setzero_pd();
1466 fjz2 = _mm256_setzero_pd();
1468 /**************************
1469 * CALCULATE INTERACTIONS *
1470 **************************/
1472 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1475 r00 = _mm256_mul_pd(rsq00,rinv00);
1477 /* REACTION-FIELD ELECTROSTATICS */
1478 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1480 /* LENNARD-JONES DISPERSION/REPULSION */
1482 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1483 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1484 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1485 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1486 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1488 d = _mm256_sub_pd(r00,rswitch);
1489 d = _mm256_max_pd(d,_mm256_setzero_pd());
1490 d2 = _mm256_mul_pd(d,d);
1491 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1493 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1495 /* Evaluate switch function */
1496 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1497 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1498 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1500 fscal = _mm256_add_pd(felec,fvdw);
1502 fscal = _mm256_and_pd(fscal,cutoff_mask);
1504 /* Calculate temporary vectorial force */
1505 tx = _mm256_mul_pd(fscal,dx00);
1506 ty = _mm256_mul_pd(fscal,dy00);
1507 tz = _mm256_mul_pd(fscal,dz00);
1509 /* Update vectorial force */
1510 fix0 = _mm256_add_pd(fix0,tx);
1511 fiy0 = _mm256_add_pd(fiy0,ty);
1512 fiz0 = _mm256_add_pd(fiz0,tz);
1514 fjx0 = _mm256_add_pd(fjx0,tx);
1515 fjy0 = _mm256_add_pd(fjy0,ty);
1516 fjz0 = _mm256_add_pd(fjz0,tz);
1520 /**************************
1521 * CALCULATE INTERACTIONS *
1522 **************************/
1524 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1527 /* REACTION-FIELD ELECTROSTATICS */
1528 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1530 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1534 fscal = _mm256_and_pd(fscal,cutoff_mask);
1536 /* Calculate temporary vectorial force */
1537 tx = _mm256_mul_pd(fscal,dx01);
1538 ty = _mm256_mul_pd(fscal,dy01);
1539 tz = _mm256_mul_pd(fscal,dz01);
1541 /* Update vectorial force */
1542 fix0 = _mm256_add_pd(fix0,tx);
1543 fiy0 = _mm256_add_pd(fiy0,ty);
1544 fiz0 = _mm256_add_pd(fiz0,tz);
1546 fjx1 = _mm256_add_pd(fjx1,tx);
1547 fjy1 = _mm256_add_pd(fjy1,ty);
1548 fjz1 = _mm256_add_pd(fjz1,tz);
1552 /**************************
1553 * CALCULATE INTERACTIONS *
1554 **************************/
1556 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1559 /* REACTION-FIELD ELECTROSTATICS */
1560 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1562 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1566 fscal = _mm256_and_pd(fscal,cutoff_mask);
1568 /* Calculate temporary vectorial force */
1569 tx = _mm256_mul_pd(fscal,dx02);
1570 ty = _mm256_mul_pd(fscal,dy02);
1571 tz = _mm256_mul_pd(fscal,dz02);
1573 /* Update vectorial force */
1574 fix0 = _mm256_add_pd(fix0,tx);
1575 fiy0 = _mm256_add_pd(fiy0,ty);
1576 fiz0 = _mm256_add_pd(fiz0,tz);
1578 fjx2 = _mm256_add_pd(fjx2,tx);
1579 fjy2 = _mm256_add_pd(fjy2,ty);
1580 fjz2 = _mm256_add_pd(fjz2,tz);
1584 /**************************
1585 * CALCULATE INTERACTIONS *
1586 **************************/
1588 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1591 /* REACTION-FIELD ELECTROSTATICS */
1592 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1594 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1598 fscal = _mm256_and_pd(fscal,cutoff_mask);
1600 /* Calculate temporary vectorial force */
1601 tx = _mm256_mul_pd(fscal,dx10);
1602 ty = _mm256_mul_pd(fscal,dy10);
1603 tz = _mm256_mul_pd(fscal,dz10);
1605 /* Update vectorial force */
1606 fix1 = _mm256_add_pd(fix1,tx);
1607 fiy1 = _mm256_add_pd(fiy1,ty);
1608 fiz1 = _mm256_add_pd(fiz1,tz);
1610 fjx0 = _mm256_add_pd(fjx0,tx);
1611 fjy0 = _mm256_add_pd(fjy0,ty);
1612 fjz0 = _mm256_add_pd(fjz0,tz);
1616 /**************************
1617 * CALCULATE INTERACTIONS *
1618 **************************/
1620 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1623 /* REACTION-FIELD ELECTROSTATICS */
1624 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1626 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1630 fscal = _mm256_and_pd(fscal,cutoff_mask);
1632 /* Calculate temporary vectorial force */
1633 tx = _mm256_mul_pd(fscal,dx11);
1634 ty = _mm256_mul_pd(fscal,dy11);
1635 tz = _mm256_mul_pd(fscal,dz11);
1637 /* Update vectorial force */
1638 fix1 = _mm256_add_pd(fix1,tx);
1639 fiy1 = _mm256_add_pd(fiy1,ty);
1640 fiz1 = _mm256_add_pd(fiz1,tz);
1642 fjx1 = _mm256_add_pd(fjx1,tx);
1643 fjy1 = _mm256_add_pd(fjy1,ty);
1644 fjz1 = _mm256_add_pd(fjz1,tz);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1655 /* REACTION-FIELD ELECTROSTATICS */
1656 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1658 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1662 fscal = _mm256_and_pd(fscal,cutoff_mask);
1664 /* Calculate temporary vectorial force */
1665 tx = _mm256_mul_pd(fscal,dx12);
1666 ty = _mm256_mul_pd(fscal,dy12);
1667 tz = _mm256_mul_pd(fscal,dz12);
1669 /* Update vectorial force */
1670 fix1 = _mm256_add_pd(fix1,tx);
1671 fiy1 = _mm256_add_pd(fiy1,ty);
1672 fiz1 = _mm256_add_pd(fiz1,tz);
1674 fjx2 = _mm256_add_pd(fjx2,tx);
1675 fjy2 = _mm256_add_pd(fjy2,ty);
1676 fjz2 = _mm256_add_pd(fjz2,tz);
1680 /**************************
1681 * CALCULATE INTERACTIONS *
1682 **************************/
1684 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1687 /* REACTION-FIELD ELECTROSTATICS */
1688 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1690 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1694 fscal = _mm256_and_pd(fscal,cutoff_mask);
1696 /* Calculate temporary vectorial force */
1697 tx = _mm256_mul_pd(fscal,dx20);
1698 ty = _mm256_mul_pd(fscal,dy20);
1699 tz = _mm256_mul_pd(fscal,dz20);
1701 /* Update vectorial force */
1702 fix2 = _mm256_add_pd(fix2,tx);
1703 fiy2 = _mm256_add_pd(fiy2,ty);
1704 fiz2 = _mm256_add_pd(fiz2,tz);
1706 fjx0 = _mm256_add_pd(fjx0,tx);
1707 fjy0 = _mm256_add_pd(fjy0,ty);
1708 fjz0 = _mm256_add_pd(fjz0,tz);
1712 /**************************
1713 * CALCULATE INTERACTIONS *
1714 **************************/
1716 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1719 /* REACTION-FIELD ELECTROSTATICS */
1720 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1722 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1726 fscal = _mm256_and_pd(fscal,cutoff_mask);
1728 /* Calculate temporary vectorial force */
1729 tx = _mm256_mul_pd(fscal,dx21);
1730 ty = _mm256_mul_pd(fscal,dy21);
1731 tz = _mm256_mul_pd(fscal,dz21);
1733 /* Update vectorial force */
1734 fix2 = _mm256_add_pd(fix2,tx);
1735 fiy2 = _mm256_add_pd(fiy2,ty);
1736 fiz2 = _mm256_add_pd(fiz2,tz);
1738 fjx1 = _mm256_add_pd(fjx1,tx);
1739 fjy1 = _mm256_add_pd(fjy1,ty);
1740 fjz1 = _mm256_add_pd(fjz1,tz);
1744 /**************************
1745 * CALCULATE INTERACTIONS *
1746 **************************/
1748 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1751 /* REACTION-FIELD ELECTROSTATICS */
1752 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1754 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1758 fscal = _mm256_and_pd(fscal,cutoff_mask);
1760 /* Calculate temporary vectorial force */
1761 tx = _mm256_mul_pd(fscal,dx22);
1762 ty = _mm256_mul_pd(fscal,dy22);
1763 tz = _mm256_mul_pd(fscal,dz22);
1765 /* Update vectorial force */
1766 fix2 = _mm256_add_pd(fix2,tx);
1767 fiy2 = _mm256_add_pd(fiy2,ty);
1768 fiz2 = _mm256_add_pd(fiz2,tz);
1770 fjx2 = _mm256_add_pd(fjx2,tx);
1771 fjy2 = _mm256_add_pd(fjy2,ty);
1772 fjz2 = _mm256_add_pd(fjz2,tz);
1776 fjptrA = f+j_coord_offsetA;
1777 fjptrB = f+j_coord_offsetB;
1778 fjptrC = f+j_coord_offsetC;
1779 fjptrD = f+j_coord_offsetD;
1781 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1782 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1784 /* Inner loop uses 301 flops */
1787 if(jidx<j_index_end)
1790 /* Get j neighbor index, and coordinate index */
1791 jnrlistA = jjnr[jidx];
1792 jnrlistB = jjnr[jidx+1];
1793 jnrlistC = jjnr[jidx+2];
1794 jnrlistD = jjnr[jidx+3];
1795 /* Sign of each element will be negative for non-real atoms.
1796 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1797              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1799 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1801 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1802 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1803 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1805 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1806 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1807 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1808 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1809 j_coord_offsetA = DIM*jnrA;
1810 j_coord_offsetB = DIM*jnrB;
1811 j_coord_offsetC = DIM*jnrC;
1812 j_coord_offsetD = DIM*jnrD;
1814 /* load j atom coordinates */
1815 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1816 x+j_coord_offsetC,x+j_coord_offsetD,
1817 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1819 /* Calculate displacement vector */
1820 dx00 = _mm256_sub_pd(ix0,jx0);
1821 dy00 = _mm256_sub_pd(iy0,jy0);
1822 dz00 = _mm256_sub_pd(iz0,jz0);
1823 dx01 = _mm256_sub_pd(ix0,jx1);
1824 dy01 = _mm256_sub_pd(iy0,jy1);
1825 dz01 = _mm256_sub_pd(iz0,jz1);
1826 dx02 = _mm256_sub_pd(ix0,jx2);
1827 dy02 = _mm256_sub_pd(iy0,jy2);
1828 dz02 = _mm256_sub_pd(iz0,jz2);
1829 dx10 = _mm256_sub_pd(ix1,jx0);
1830 dy10 = _mm256_sub_pd(iy1,jy0);
1831 dz10 = _mm256_sub_pd(iz1,jz0);
1832 dx11 = _mm256_sub_pd(ix1,jx1);
1833 dy11 = _mm256_sub_pd(iy1,jy1);
1834 dz11 = _mm256_sub_pd(iz1,jz1);
1835 dx12 = _mm256_sub_pd(ix1,jx2);
1836 dy12 = _mm256_sub_pd(iy1,jy2);
1837 dz12 = _mm256_sub_pd(iz1,jz2);
1838 dx20 = _mm256_sub_pd(ix2,jx0);
1839 dy20 = _mm256_sub_pd(iy2,jy0);
1840 dz20 = _mm256_sub_pd(iz2,jz0);
1841 dx21 = _mm256_sub_pd(ix2,jx1);
1842 dy21 = _mm256_sub_pd(iy2,jy1);
1843 dz21 = _mm256_sub_pd(iz2,jz1);
1844 dx22 = _mm256_sub_pd(ix2,jx2);
1845 dy22 = _mm256_sub_pd(iy2,jy2);
1846 dz22 = _mm256_sub_pd(iz2,jz2);
1848 /* Calculate squared distance and things based on it */
1849 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1850 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1851 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1852 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1853 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1854 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1855 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1856 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1857 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1859 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1860 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1861 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1862 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1863 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1864 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1865 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1866 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1867 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1869 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1870 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1871 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1872 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1873 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1874 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1875 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1876 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1877 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1879 fjx0 = _mm256_setzero_pd();
1880 fjy0 = _mm256_setzero_pd();
1881 fjz0 = _mm256_setzero_pd();
1882 fjx1 = _mm256_setzero_pd();
1883 fjy1 = _mm256_setzero_pd();
1884 fjz1 = _mm256_setzero_pd();
1885 fjx2 = _mm256_setzero_pd();
1886 fjy2 = _mm256_setzero_pd();
1887 fjz2 = _mm256_setzero_pd();
1889 /**************************
1890 * CALCULATE INTERACTIONS *
1891 **************************/
1893 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1896 r00 = _mm256_mul_pd(rsq00,rinv00);
1897 r00 = _mm256_andnot_pd(dummy_mask,r00);
1899 /* REACTION-FIELD ELECTROSTATICS */
1900 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1902 /* LENNARD-JONES DISPERSION/REPULSION */
1904 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1905 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1906 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1907 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1908 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1910 d = _mm256_sub_pd(r00,rswitch);
1911 d = _mm256_max_pd(d,_mm256_setzero_pd());
1912 d2 = _mm256_mul_pd(d,d);
1913 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1915 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1917 /* Evaluate switch function */
1918 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1919 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1920 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1922 fscal = _mm256_add_pd(felec,fvdw);
/*
 * NOTE(review): this span is the tail of the innermost j-loop (plus the loop
 * epilogue) of a generated GROMACS avx_256_double non-bonded kernel:
 * water-water, all 3x3 site pairs, reaction-field electrostatics, force-only
 * (see the eNR_NBKERNEL_ELEC_VDW_W3W3_F counter at the bottom).  The code is
 * machine-generated and statement order is significant, so this review adds
 * comments only; every code token is left byte-identical, including the
 * leading numeric prefixes carried in the file content.  The embedded
 * numbering also shows gaps (e.g. 1948 -> 1951), i.e. some original lines
 * (braces, `fscal = felec;`) are not present in this view.
 */
/* Tail of the i-site 0 / j-site 0 pair: restrict the scalar force to lanes
 * inside the cutoff, zero lanes belonging to padded (dummy) j entries, then
 * scatter it onto the displacement-vector components dx00/dy00/dz00. */
1924 fscal = _mm256_and_pd(fscal,cutoff_mask);
/* dummy_mask presumably flags SIMD lanes whose j index was padding
 * (jnrlist<0) -- the scratch-pointer redirection at the j-force store below
 * supports this; TODO confirm against the loop header outside this view. */
1926 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1928 /* Calculate temporary vectorial force */
1929 tx = _mm256_mul_pd(fscal,dx00);
1930 ty = _mm256_mul_pd(fscal,dy00);
1931 tz = _mm256_mul_pd(fscal,dz00);
1933 /* Update vectorial force */
/* Add to the i-atom-0 accumulators; the equal-and-opposite j contribution is
 * collected in fj*0 and applied with a *decrement* after the loop body
 * (Newton's third law). */
1934 fix0 = _mm256_add_pd(fix0,tx);
1935 fiy0 = _mm256_add_pd(fiy0,ty);
1936 fiz0 = _mm256_add_pd(fiz0,tz);
1938 fjx0 = _mm256_add_pd(fjx0,tx);
1939 fjy0 = _mm256_add_pd(fjy0,ty);
1940 fjz0 = _mm256_add_pd(fjz0,tz);
1944 /**************************
1945 * CALCULATE INTERACTIONS *
1946 **************************/
/* Pair: i site 0 -- j site 1.  Skip the whole block when no SIMD lane has
 * rsq01 below the squared cutoff. */
1948 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1951 /* REACTION-FIELD ELECTROSTATICS */
/* RF scalar force factor: qq * (1/r^3 - krf2).  krf2 is presumably the
 * reaction-field constant 2*k_rf set up before the loop -- confirm in the
 * kernel prologue (not visible here). */
1952 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1954 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1958 fscal = _mm256_and_pd(fscal,cutoff_mask);
1960 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1962 /* Calculate temporary vectorial force */
1963 tx = _mm256_mul_pd(fscal,dx01);
1964 ty = _mm256_mul_pd(fscal,dy01);
1965 tz = _mm256_mul_pd(fscal,dz01);
1967 /* Update vectorial force */
1968 fix0 = _mm256_add_pd(fix0,tx);
1969 fiy0 = _mm256_add_pd(fiy0,ty);
1970 fiz0 = _mm256_add_pd(fiz0,tz);
1972 fjx1 = _mm256_add_pd(fjx1,tx);
1973 fjy1 = _mm256_add_pd(fjy1,ty);
1974 fjz1 = _mm256_add_pd(fjz1,tz);
1978 /**************************
1979 * CALCULATE INTERACTIONS *
1980 **************************/
/* Pair: i site 0 -- j site 2 (same RF force pattern as above). */
1982 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1985 /* REACTION-FIELD ELECTROSTATICS */
1986 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1988 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1992 fscal = _mm256_and_pd(fscal,cutoff_mask);
1994 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1996 /* Calculate temporary vectorial force */
1997 tx = _mm256_mul_pd(fscal,dx02);
1998 ty = _mm256_mul_pd(fscal,dy02);
1999 tz = _mm256_mul_pd(fscal,dz02);
2001 /* Update vectorial force */
2002 fix0 = _mm256_add_pd(fix0,tx);
2003 fiy0 = _mm256_add_pd(fiy0,ty);
2004 fiz0 = _mm256_add_pd(fiz0,tz);
2006 fjx2 = _mm256_add_pd(fjx2,tx);
2007 fjy2 = _mm256_add_pd(fjy2,ty);
2008 fjz2 = _mm256_add_pd(fjz2,tz);
2012 /**************************
2013 * CALCULATE INTERACTIONS *
2014 **************************/
/* Pair: i site 1 -- j site 0. */
2016 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2019 /* REACTION-FIELD ELECTROSTATICS */
2020 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
2022 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2026 fscal = _mm256_and_pd(fscal,cutoff_mask);
2028 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2030 /* Calculate temporary vectorial force */
2031 tx = _mm256_mul_pd(fscal,dx10);
2032 ty = _mm256_mul_pd(fscal,dy10);
2033 tz = _mm256_mul_pd(fscal,dz10);
2035 /* Update vectorial force */
2036 fix1 = _mm256_add_pd(fix1,tx);
2037 fiy1 = _mm256_add_pd(fiy1,ty);
2038 fiz1 = _mm256_add_pd(fiz1,tz);
2040 fjx0 = _mm256_add_pd(fjx0,tx);
2041 fjy0 = _mm256_add_pd(fjy0,ty);
2042 fjz0 = _mm256_add_pd(fjz0,tz);
2046 /**************************
2047 * CALCULATE INTERACTIONS *
2048 **************************/
/* Pair: i site 1 -- j site 1. */
2050 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2053 /* REACTION-FIELD ELECTROSTATICS */
2054 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2056 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2060 fscal = _mm256_and_pd(fscal,cutoff_mask);
2062 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2064 /* Calculate temporary vectorial force */
2065 tx = _mm256_mul_pd(fscal,dx11);
2066 ty = _mm256_mul_pd(fscal,dy11);
2067 tz = _mm256_mul_pd(fscal,dz11);
2069 /* Update vectorial force */
2070 fix1 = _mm256_add_pd(fix1,tx);
2071 fiy1 = _mm256_add_pd(fiy1,ty);
2072 fiz1 = _mm256_add_pd(fiz1,tz);
2074 fjx1 = _mm256_add_pd(fjx1,tx);
2075 fjy1 = _mm256_add_pd(fjy1,ty);
2076 fjz1 = _mm256_add_pd(fjz1,tz);
2080 /**************************
2081 * CALCULATE INTERACTIONS *
2082 **************************/
/* Pair: i site 1 -- j site 2. */
2084 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2087 /* REACTION-FIELD ELECTROSTATICS */
2088 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2090 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2094 fscal = _mm256_and_pd(fscal,cutoff_mask);
2096 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2098 /* Calculate temporary vectorial force */
2099 tx = _mm256_mul_pd(fscal,dx12);
2100 ty = _mm256_mul_pd(fscal,dy12);
2101 tz = _mm256_mul_pd(fscal,dz12);
2103 /* Update vectorial force */
2104 fix1 = _mm256_add_pd(fix1,tx);
2105 fiy1 = _mm256_add_pd(fiy1,ty);
2106 fiz1 = _mm256_add_pd(fiz1,tz);
2108 fjx2 = _mm256_add_pd(fjx2,tx);
2109 fjy2 = _mm256_add_pd(fjy2,ty);
2110 fjz2 = _mm256_add_pd(fjz2,tz);
2114 /**************************
2115 * CALCULATE INTERACTIONS *
2116 **************************/
/* Pair: i site 2 -- j site 0. */
2118 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2121 /* REACTION-FIELD ELECTROSTATICS */
2122 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
2124 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2128 fscal = _mm256_and_pd(fscal,cutoff_mask);
2130 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2132 /* Calculate temporary vectorial force */
2133 tx = _mm256_mul_pd(fscal,dx20);
2134 ty = _mm256_mul_pd(fscal,dy20);
2135 tz = _mm256_mul_pd(fscal,dz20);
2137 /* Update vectorial force */
2138 fix2 = _mm256_add_pd(fix2,tx);
2139 fiy2 = _mm256_add_pd(fiy2,ty);
2140 fiz2 = _mm256_add_pd(fiz2,tz);
2142 fjx0 = _mm256_add_pd(fjx0,tx);
2143 fjy0 = _mm256_add_pd(fjy0,ty);
2144 fjz0 = _mm256_add_pd(fjz0,tz);
2148 /**************************
2149 * CALCULATE INTERACTIONS *
2150 **************************/
/* Pair: i site 2 -- j site 1. */
2152 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2155 /* REACTION-FIELD ELECTROSTATICS */
2156 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2158 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2162 fscal = _mm256_and_pd(fscal,cutoff_mask);
2164 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2166 /* Calculate temporary vectorial force */
2167 tx = _mm256_mul_pd(fscal,dx21);
2168 ty = _mm256_mul_pd(fscal,dy21);
2169 tz = _mm256_mul_pd(fscal,dz21);
2171 /* Update vectorial force */
2172 fix2 = _mm256_add_pd(fix2,tx);
2173 fiy2 = _mm256_add_pd(fiy2,ty);
2174 fiz2 = _mm256_add_pd(fiz2,tz);
2176 fjx1 = _mm256_add_pd(fjx1,tx);
2177 fjy1 = _mm256_add_pd(fjy1,ty);
2178 fjz1 = _mm256_add_pd(fjz1,tz);
2182 /**************************
2183 * CALCULATE INTERACTIONS *
2184 **************************/
/* Pair: i site 2 -- j site 2 (last of the 9 site pairs). */
2186 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2189 /* REACTION-FIELD ELECTROSTATICS */
2190 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2192 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2196 fscal = _mm256_and_pd(fscal,cutoff_mask);
2198 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2200 /* Calculate temporary vectorial force */
2201 tx = _mm256_mul_pd(fscal,dx22);
2202 ty = _mm256_mul_pd(fscal,dy22);
2203 tz = _mm256_mul_pd(fscal,dz22);
2205 /* Update vectorial force */
2206 fix2 = _mm256_add_pd(fix2,tx);
2207 fiy2 = _mm256_add_pd(fiy2,ty);
2208 fiz2 = _mm256_add_pd(fiz2,tz);
2210 fjx2 = _mm256_add_pd(fjx2,tx);
2211 fjy2 = _mm256_add_pd(fjy2,ty);
2212 fjz2 = _mm256_add_pd(fjz2,tz);
/* Route j-force writes of padded lanes (jnrlist<0) into a scratch buffer so
 * the swizzled decrement below can store unconditionally for all 4 lanes. */
2216 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2217 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2218 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2219 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
/* Subtract the accumulated fj* sums from the three j water sites of each of
 * the four j molecules in one transposed (swizzled) store -- the "action"
 * half of Newton's third law for this loop iteration. */
2221 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2222 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2224 /* Inner loop uses 302 flops */
2227 /* End of innermost loop */
/* Reduce the i-force accumulators across SIMD lanes and add them to the
 * force array and the shift-force array for virial computation. */
2229 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2230 f+i_coord_offset,fshift+i_shift_offset);
2232 /* Increment number of inner iterations */
2233 inneriter += j_index_end - j_index_start;
2235 /* Outer loop uses 18 flops */
2238 /* Increment number of outer iterations */
2241 /* Update outer/inner flops */
/* Flop bookkeeping: the generator's per-iteration counts (18 outer, 302
 * inner) feed GROMACS's performance accounting via inc_nrnb(). */
2243 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);