2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
100 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
101 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
102 real rswitch_scalar,d_scalar;
103 __m256 dummy_mask,cutoff_mask;
104 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
105 __m256 one = _mm256_set1_ps(1.0);
106 __m256 two = _mm256_set1_ps(2.0);
112 jindex = nlist->jindex;
114 shiftidx = nlist->shift;
116 shiftvec = fr->shift_vec[0];
117 fshift = fr->fshift[0];
118 facel = _mm256_set1_ps(fr->epsfac);
119 charge = mdatoms->chargeA;
120 krf = _mm256_set1_ps(fr->ic->k_rf);
121 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
122 crf = _mm256_set1_ps(fr->ic->c_rf);
123 nvdwtype = fr->ntype;
125 vdwtype = mdatoms->typeA;
127 /* Setup water-specific parameters */
128 inr = nlist->iinr[0];
129 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
130 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
131 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
132 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
134 jq0 = _mm256_set1_ps(charge[inr+0]);
135 jq1 = _mm256_set1_ps(charge[inr+1]);
136 jq2 = _mm256_set1_ps(charge[inr+2]);
137 vdwjidx0A = 2*vdwtype[inr+0];
138 qq00 = _mm256_mul_ps(iq0,jq0);
139 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
140 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
141 qq01 = _mm256_mul_ps(iq0,jq1);
142 qq02 = _mm256_mul_ps(iq0,jq2);
143 qq10 = _mm256_mul_ps(iq1,jq0);
144 qq11 = _mm256_mul_ps(iq1,jq1);
145 qq12 = _mm256_mul_ps(iq1,jq2);
146 qq20 = _mm256_mul_ps(iq2,jq0);
147 qq21 = _mm256_mul_ps(iq2,jq1);
148 qq22 = _mm256_mul_ps(iq2,jq2);
150 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
151 rcutoff_scalar = fr->rcoulomb;
152 rcutoff = _mm256_set1_ps(rcutoff_scalar);
153 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
155 rswitch_scalar = fr->rvdw_switch;
156 rswitch = _mm256_set1_ps(rswitch_scalar);
157 /* Setup switch parameters */
158 d_scalar = rcutoff_scalar-rswitch_scalar;
159 d = _mm256_set1_ps(d_scalar);
160 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
161 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
162 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
163 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
164 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
165 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
167 /* Avoid stupid compiler warnings */
168 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
181 for(iidx=0;iidx<4*DIM;iidx++)
186 /* Start outer loop over neighborlists */
187 for(iidx=0; iidx<nri; iidx++)
189 /* Load shift vector for this list */
190 i_shift_offset = DIM*shiftidx[iidx];
192 /* Load limits for loop over neighbors */
193 j_index_start = jindex[iidx];
194 j_index_end = jindex[iidx+1];
196 /* Get outer coordinate index */
198 i_coord_offset = DIM*inr;
200 /* Load i particle coords and add shift vector */
201 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
202 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
204 fix0 = _mm256_setzero_ps();
205 fiy0 = _mm256_setzero_ps();
206 fiz0 = _mm256_setzero_ps();
207 fix1 = _mm256_setzero_ps();
208 fiy1 = _mm256_setzero_ps();
209 fiz1 = _mm256_setzero_ps();
210 fix2 = _mm256_setzero_ps();
211 fiy2 = _mm256_setzero_ps();
212 fiz2 = _mm256_setzero_ps();
214 /* Reset potential sums */
215 velecsum = _mm256_setzero_ps();
216 vvdwsum = _mm256_setzero_ps();
218 /* Start inner kernel loop */
219 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
222 /* Get j neighbor index, and coordinate index */
231 j_coord_offsetA = DIM*jnrA;
232 j_coord_offsetB = DIM*jnrB;
233 j_coord_offsetC = DIM*jnrC;
234 j_coord_offsetD = DIM*jnrD;
235 j_coord_offsetE = DIM*jnrE;
236 j_coord_offsetF = DIM*jnrF;
237 j_coord_offsetG = DIM*jnrG;
238 j_coord_offsetH = DIM*jnrH;
240 /* load j atom coordinates */
241 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
242 x+j_coord_offsetC,x+j_coord_offsetD,
243 x+j_coord_offsetE,x+j_coord_offsetF,
244 x+j_coord_offsetG,x+j_coord_offsetH,
245 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
247 /* Calculate displacement vector */
248 dx00 = _mm256_sub_ps(ix0,jx0);
249 dy00 = _mm256_sub_ps(iy0,jy0);
250 dz00 = _mm256_sub_ps(iz0,jz0);
251 dx01 = _mm256_sub_ps(ix0,jx1);
252 dy01 = _mm256_sub_ps(iy0,jy1);
253 dz01 = _mm256_sub_ps(iz0,jz1);
254 dx02 = _mm256_sub_ps(ix0,jx2);
255 dy02 = _mm256_sub_ps(iy0,jy2);
256 dz02 = _mm256_sub_ps(iz0,jz2);
257 dx10 = _mm256_sub_ps(ix1,jx0);
258 dy10 = _mm256_sub_ps(iy1,jy0);
259 dz10 = _mm256_sub_ps(iz1,jz0);
260 dx11 = _mm256_sub_ps(ix1,jx1);
261 dy11 = _mm256_sub_ps(iy1,jy1);
262 dz11 = _mm256_sub_ps(iz1,jz1);
263 dx12 = _mm256_sub_ps(ix1,jx2);
264 dy12 = _mm256_sub_ps(iy1,jy2);
265 dz12 = _mm256_sub_ps(iz1,jz2);
266 dx20 = _mm256_sub_ps(ix2,jx0);
267 dy20 = _mm256_sub_ps(iy2,jy0);
268 dz20 = _mm256_sub_ps(iz2,jz0);
269 dx21 = _mm256_sub_ps(ix2,jx1);
270 dy21 = _mm256_sub_ps(iy2,jy1);
271 dz21 = _mm256_sub_ps(iz2,jz1);
272 dx22 = _mm256_sub_ps(ix2,jx2);
273 dy22 = _mm256_sub_ps(iy2,jy2);
274 dz22 = _mm256_sub_ps(iz2,jz2);
276 /* Calculate squared distance and things based on it */
277 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
278 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
279 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
280 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
281 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
282 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
283 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
284 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
285 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
287 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
288 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
289 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
290 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
291 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
292 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
293 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
294 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
295 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
297 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
298 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
299 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
300 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
301 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
302 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
303 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
304 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
305 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
307 fjx0 = _mm256_setzero_ps();
308 fjy0 = _mm256_setzero_ps();
309 fjz0 = _mm256_setzero_ps();
310 fjx1 = _mm256_setzero_ps();
311 fjy1 = _mm256_setzero_ps();
312 fjz1 = _mm256_setzero_ps();
313 fjx2 = _mm256_setzero_ps();
314 fjy2 = _mm256_setzero_ps();
315 fjz2 = _mm256_setzero_ps();
317 /**************************
318 * CALCULATE INTERACTIONS *
319 **************************/
321 if (gmx_mm256_any_lt(rsq00,rcutoff2))
324 r00 = _mm256_mul_ps(rsq00,rinv00);
326 /* REACTION-FIELD ELECTROSTATICS */
327 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
328 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
330 /* LENNARD-JONES DISPERSION/REPULSION */
332 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
333 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
334 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
335 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
336 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
338 d = _mm256_sub_ps(r00,rswitch);
339 d = _mm256_max_ps(d,_mm256_setzero_ps());
340 d2 = _mm256_mul_ps(d,d);
341 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
343 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
345 /* Evaluate switch function */
346 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
347 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
348 vvdw = _mm256_mul_ps(vvdw,sw);
349 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velec = _mm256_and_ps(velec,cutoff_mask);
353 velecsum = _mm256_add_ps(velecsum,velec);
354 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
355 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
357 fscal = _mm256_add_ps(felec,fvdw);
359 fscal = _mm256_and_ps(fscal,cutoff_mask);
361 /* Calculate temporary vectorial force */
362 tx = _mm256_mul_ps(fscal,dx00);
363 ty = _mm256_mul_ps(fscal,dy00);
364 tz = _mm256_mul_ps(fscal,dz00);
366 /* Update vectorial force */
367 fix0 = _mm256_add_ps(fix0,tx);
368 fiy0 = _mm256_add_ps(fiy0,ty);
369 fiz0 = _mm256_add_ps(fiz0,tz);
371 fjx0 = _mm256_add_ps(fjx0,tx);
372 fjy0 = _mm256_add_ps(fjy0,ty);
373 fjz0 = _mm256_add_ps(fjz0,tz);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 if (gmx_mm256_any_lt(rsq01,rcutoff2))
384 /* REACTION-FIELD ELECTROSTATICS */
385 velec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
386 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
388 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 velec = _mm256_and_ps(velec,cutoff_mask);
392 velecsum = _mm256_add_ps(velecsum,velec);
396 fscal = _mm256_and_ps(fscal,cutoff_mask);
398 /* Calculate temporary vectorial force */
399 tx = _mm256_mul_ps(fscal,dx01);
400 ty = _mm256_mul_ps(fscal,dy01);
401 tz = _mm256_mul_ps(fscal,dz01);
403 /* Update vectorial force */
404 fix0 = _mm256_add_ps(fix0,tx);
405 fiy0 = _mm256_add_ps(fiy0,ty);
406 fiz0 = _mm256_add_ps(fiz0,tz);
408 fjx1 = _mm256_add_ps(fjx1,tx);
409 fjy1 = _mm256_add_ps(fjy1,ty);
410 fjz1 = _mm256_add_ps(fjz1,tz);
414 /**************************
415 * CALCULATE INTERACTIONS *
416 **************************/
418 if (gmx_mm256_any_lt(rsq02,rcutoff2))
421 /* REACTION-FIELD ELECTROSTATICS */
422 velec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
423 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
425 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velec = _mm256_and_ps(velec,cutoff_mask);
429 velecsum = _mm256_add_ps(velecsum,velec);
433 fscal = _mm256_and_ps(fscal,cutoff_mask);
435 /* Calculate temporary vectorial force */
436 tx = _mm256_mul_ps(fscal,dx02);
437 ty = _mm256_mul_ps(fscal,dy02);
438 tz = _mm256_mul_ps(fscal,dz02);
440 /* Update vectorial force */
441 fix0 = _mm256_add_ps(fix0,tx);
442 fiy0 = _mm256_add_ps(fiy0,ty);
443 fiz0 = _mm256_add_ps(fiz0,tz);
445 fjx2 = _mm256_add_ps(fjx2,tx);
446 fjy2 = _mm256_add_ps(fjy2,ty);
447 fjz2 = _mm256_add_ps(fjz2,tz);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 if (gmx_mm256_any_lt(rsq10,rcutoff2))
458 /* REACTION-FIELD ELECTROSTATICS */
459 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
460 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
462 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
464 /* Update potential sum for this i atom from the interaction with this j atom. */
465 velec = _mm256_and_ps(velec,cutoff_mask);
466 velecsum = _mm256_add_ps(velecsum,velec);
470 fscal = _mm256_and_ps(fscal,cutoff_mask);
472 /* Calculate temporary vectorial force */
473 tx = _mm256_mul_ps(fscal,dx10);
474 ty = _mm256_mul_ps(fscal,dy10);
475 tz = _mm256_mul_ps(fscal,dz10);
477 /* Update vectorial force */
478 fix1 = _mm256_add_ps(fix1,tx);
479 fiy1 = _mm256_add_ps(fiy1,ty);
480 fiz1 = _mm256_add_ps(fiz1,tz);
482 fjx0 = _mm256_add_ps(fjx0,tx);
483 fjy0 = _mm256_add_ps(fjy0,ty);
484 fjz0 = _mm256_add_ps(fjz0,tz);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 if (gmx_mm256_any_lt(rsq11,rcutoff2))
495 /* REACTION-FIELD ELECTROSTATICS */
496 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
497 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
499 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
501 /* Update potential sum for this i atom from the interaction with this j atom. */
502 velec = _mm256_and_ps(velec,cutoff_mask);
503 velecsum = _mm256_add_ps(velecsum,velec);
507 fscal = _mm256_and_ps(fscal,cutoff_mask);
509 /* Calculate temporary vectorial force */
510 tx = _mm256_mul_ps(fscal,dx11);
511 ty = _mm256_mul_ps(fscal,dy11);
512 tz = _mm256_mul_ps(fscal,dz11);
514 /* Update vectorial force */
515 fix1 = _mm256_add_ps(fix1,tx);
516 fiy1 = _mm256_add_ps(fiy1,ty);
517 fiz1 = _mm256_add_ps(fiz1,tz);
519 fjx1 = _mm256_add_ps(fjx1,tx);
520 fjy1 = _mm256_add_ps(fjy1,ty);
521 fjz1 = _mm256_add_ps(fjz1,tz);
525 /**************************
526 * CALCULATE INTERACTIONS *
527 **************************/
529 if (gmx_mm256_any_lt(rsq12,rcutoff2))
532 /* REACTION-FIELD ELECTROSTATICS */
533 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
534 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
536 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velec = _mm256_and_ps(velec,cutoff_mask);
540 velecsum = _mm256_add_ps(velecsum,velec);
544 fscal = _mm256_and_ps(fscal,cutoff_mask);
546 /* Calculate temporary vectorial force */
547 tx = _mm256_mul_ps(fscal,dx12);
548 ty = _mm256_mul_ps(fscal,dy12);
549 tz = _mm256_mul_ps(fscal,dz12);
551 /* Update vectorial force */
552 fix1 = _mm256_add_ps(fix1,tx);
553 fiy1 = _mm256_add_ps(fiy1,ty);
554 fiz1 = _mm256_add_ps(fiz1,tz);
556 fjx2 = _mm256_add_ps(fjx2,tx);
557 fjy2 = _mm256_add_ps(fjy2,ty);
558 fjz2 = _mm256_add_ps(fjz2,tz);
562 /**************************
563 * CALCULATE INTERACTIONS *
564 **************************/
566 if (gmx_mm256_any_lt(rsq20,rcutoff2))
569 /* REACTION-FIELD ELECTROSTATICS */
570 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
571 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
573 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velec = _mm256_and_ps(velec,cutoff_mask);
577 velecsum = _mm256_add_ps(velecsum,velec);
581 fscal = _mm256_and_ps(fscal,cutoff_mask);
583 /* Calculate temporary vectorial force */
584 tx = _mm256_mul_ps(fscal,dx20);
585 ty = _mm256_mul_ps(fscal,dy20);
586 tz = _mm256_mul_ps(fscal,dz20);
588 /* Update vectorial force */
589 fix2 = _mm256_add_ps(fix2,tx);
590 fiy2 = _mm256_add_ps(fiy2,ty);
591 fiz2 = _mm256_add_ps(fiz2,tz);
593 fjx0 = _mm256_add_ps(fjx0,tx);
594 fjy0 = _mm256_add_ps(fjy0,ty);
595 fjz0 = _mm256_add_ps(fjz0,tz);
599 /**************************
600 * CALCULATE INTERACTIONS *
601 **************************/
603 if (gmx_mm256_any_lt(rsq21,rcutoff2))
606 /* REACTION-FIELD ELECTROSTATICS */
607 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
608 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
610 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
612 /* Update potential sum for this i atom from the interaction with this j atom. */
613 velec = _mm256_and_ps(velec,cutoff_mask);
614 velecsum = _mm256_add_ps(velecsum,velec);
618 fscal = _mm256_and_ps(fscal,cutoff_mask);
620 /* Calculate temporary vectorial force */
621 tx = _mm256_mul_ps(fscal,dx21);
622 ty = _mm256_mul_ps(fscal,dy21);
623 tz = _mm256_mul_ps(fscal,dz21);
625 /* Update vectorial force */
626 fix2 = _mm256_add_ps(fix2,tx);
627 fiy2 = _mm256_add_ps(fiy2,ty);
628 fiz2 = _mm256_add_ps(fiz2,tz);
630 fjx1 = _mm256_add_ps(fjx1,tx);
631 fjy1 = _mm256_add_ps(fjy1,ty);
632 fjz1 = _mm256_add_ps(fjz1,tz);
636 /**************************
637 * CALCULATE INTERACTIONS *
638 **************************/
640 if (gmx_mm256_any_lt(rsq22,rcutoff2))
643 /* REACTION-FIELD ELECTROSTATICS */
644 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
645 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
647 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
649 /* Update potential sum for this i atom from the interaction with this j atom. */
650 velec = _mm256_and_ps(velec,cutoff_mask);
651 velecsum = _mm256_add_ps(velecsum,velec);
655 fscal = _mm256_and_ps(fscal,cutoff_mask);
657 /* Calculate temporary vectorial force */
658 tx = _mm256_mul_ps(fscal,dx22);
659 ty = _mm256_mul_ps(fscal,dy22);
660 tz = _mm256_mul_ps(fscal,dz22);
662 /* Update vectorial force */
663 fix2 = _mm256_add_ps(fix2,tx);
664 fiy2 = _mm256_add_ps(fiy2,ty);
665 fiz2 = _mm256_add_ps(fiz2,tz);
667 fjx2 = _mm256_add_ps(fjx2,tx);
668 fjy2 = _mm256_add_ps(fjy2,ty);
669 fjz2 = _mm256_add_ps(fjz2,tz);
673 fjptrA = f+j_coord_offsetA;
674 fjptrB = f+j_coord_offsetB;
675 fjptrC = f+j_coord_offsetC;
676 fjptrD = f+j_coord_offsetD;
677 fjptrE = f+j_coord_offsetE;
678 fjptrF = f+j_coord_offsetF;
679 fjptrG = f+j_coord_offsetG;
680 fjptrH = f+j_coord_offsetH;
682 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
683 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
685 /* Inner loop uses 358 flops */
691 /* Get j neighbor index, and coordinate index */
692 jnrlistA = jjnr[jidx];
693 jnrlistB = jjnr[jidx+1];
694 jnrlistC = jjnr[jidx+2];
695 jnrlistD = jjnr[jidx+3];
696 jnrlistE = jjnr[jidx+4];
697 jnrlistF = jjnr[jidx+5];
698 jnrlistG = jjnr[jidx+6];
699 jnrlistH = jjnr[jidx+7];
700 /* Sign of each element will be negative for non-real atoms.
701 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
702 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
704 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
705 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
707 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
708 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
709 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
710 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
711 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
712 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
713 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
714 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
715 j_coord_offsetA = DIM*jnrA;
716 j_coord_offsetB = DIM*jnrB;
717 j_coord_offsetC = DIM*jnrC;
718 j_coord_offsetD = DIM*jnrD;
719 j_coord_offsetE = DIM*jnrE;
720 j_coord_offsetF = DIM*jnrF;
721 j_coord_offsetG = DIM*jnrG;
722 j_coord_offsetH = DIM*jnrH;
724 /* load j atom coordinates */
725 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
726 x+j_coord_offsetC,x+j_coord_offsetD,
727 x+j_coord_offsetE,x+j_coord_offsetF,
728 x+j_coord_offsetG,x+j_coord_offsetH,
729 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
731 /* Calculate displacement vector */
732 dx00 = _mm256_sub_ps(ix0,jx0);
733 dy00 = _mm256_sub_ps(iy0,jy0);
734 dz00 = _mm256_sub_ps(iz0,jz0);
735 dx01 = _mm256_sub_ps(ix0,jx1);
736 dy01 = _mm256_sub_ps(iy0,jy1);
737 dz01 = _mm256_sub_ps(iz0,jz1);
738 dx02 = _mm256_sub_ps(ix0,jx2);
739 dy02 = _mm256_sub_ps(iy0,jy2);
740 dz02 = _mm256_sub_ps(iz0,jz2);
741 dx10 = _mm256_sub_ps(ix1,jx0);
742 dy10 = _mm256_sub_ps(iy1,jy0);
743 dz10 = _mm256_sub_ps(iz1,jz0);
744 dx11 = _mm256_sub_ps(ix1,jx1);
745 dy11 = _mm256_sub_ps(iy1,jy1);
746 dz11 = _mm256_sub_ps(iz1,jz1);
747 dx12 = _mm256_sub_ps(ix1,jx2);
748 dy12 = _mm256_sub_ps(iy1,jy2);
749 dz12 = _mm256_sub_ps(iz1,jz2);
750 dx20 = _mm256_sub_ps(ix2,jx0);
751 dy20 = _mm256_sub_ps(iy2,jy0);
752 dz20 = _mm256_sub_ps(iz2,jz0);
753 dx21 = _mm256_sub_ps(ix2,jx1);
754 dy21 = _mm256_sub_ps(iy2,jy1);
755 dz21 = _mm256_sub_ps(iz2,jz1);
756 dx22 = _mm256_sub_ps(ix2,jx2);
757 dy22 = _mm256_sub_ps(iy2,jy2);
758 dz22 = _mm256_sub_ps(iz2,jz2);
760 /* Calculate squared distance and things based on it */
761 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
762 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
763 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
764 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
765 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
766 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
767 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
768 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
769 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
771 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
772 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
773 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
774 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
775 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
776 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
777 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
778 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
779 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
781 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
782 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
783 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
784 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
785 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
786 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
787 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
788 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
789 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
791 fjx0 = _mm256_setzero_ps();
792 fjy0 = _mm256_setzero_ps();
793 fjz0 = _mm256_setzero_ps();
794 fjx1 = _mm256_setzero_ps();
795 fjy1 = _mm256_setzero_ps();
796 fjz1 = _mm256_setzero_ps();
797 fjx2 = _mm256_setzero_ps();
798 fjy2 = _mm256_setzero_ps();
799 fjz2 = _mm256_setzero_ps();
801 /**************************
802 * CALCULATE INTERACTIONS *
803 **************************/
805 if (gmx_mm256_any_lt(rsq00,rcutoff2))
808 r00 = _mm256_mul_ps(rsq00,rinv00);
809 r00 = _mm256_andnot_ps(dummy_mask,r00);
811 /* REACTION-FIELD ELECTROSTATICS */
812 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
813 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
815 /* LENNARD-JONES DISPERSION/REPULSION */
817 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
818 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
819 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
820 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
821 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
823 d = _mm256_sub_ps(r00,rswitch);
824 d = _mm256_max_ps(d,_mm256_setzero_ps());
825 d2 = _mm256_mul_ps(d,d);
826 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
828 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
830 /* Evaluate switch function */
831 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
832 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
833 vvdw = _mm256_mul_ps(vvdw,sw);
834 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
836 /* Update potential sum for this i atom from the interaction with this j atom. */
837 velec = _mm256_and_ps(velec,cutoff_mask);
838 velec = _mm256_andnot_ps(dummy_mask,velec);
839 velecsum = _mm256_add_ps(velecsum,velec);
840 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
841 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
842 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
844 fscal = _mm256_add_ps(felec,fvdw);
846 fscal = _mm256_and_ps(fscal,cutoff_mask);
848 fscal = _mm256_andnot_ps(dummy_mask,fscal);
850 /* Calculate temporary vectorial force */
851 tx = _mm256_mul_ps(fscal,dx00);
852 ty = _mm256_mul_ps(fscal,dy00);
853 tz = _mm256_mul_ps(fscal,dz00);
855 /* Update vectorial force */
856 fix0 = _mm256_add_ps(fix0,tx);
857 fiy0 = _mm256_add_ps(fiy0,ty);
858 fiz0 = _mm256_add_ps(fiz0,tz);
860 fjx0 = _mm256_add_ps(fjx0,tx);
861 fjy0 = _mm256_add_ps(fjy0,ty);
862 fjz0 = _mm256_add_ps(fjz0,tz);
866 /**************************
867 * CALCULATE INTERACTIONS *
868 **************************/
870 if (gmx_mm256_any_lt(rsq01,rcutoff2))
873 /* REACTION-FIELD ELECTROSTATICS */
874 velec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
875 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
877 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
879 /* Update potential sum for this i atom from the interaction with this j atom. */
880 velec = _mm256_and_ps(velec,cutoff_mask);
881 velec = _mm256_andnot_ps(dummy_mask,velec);
882 velecsum = _mm256_add_ps(velecsum,velec);
886 fscal = _mm256_and_ps(fscal,cutoff_mask);
888 fscal = _mm256_andnot_ps(dummy_mask,fscal);
890 /* Calculate temporary vectorial force */
891 tx = _mm256_mul_ps(fscal,dx01);
892 ty = _mm256_mul_ps(fscal,dy01);
893 tz = _mm256_mul_ps(fscal,dz01);
895 /* Update vectorial force */
896 fix0 = _mm256_add_ps(fix0,tx);
897 fiy0 = _mm256_add_ps(fiy0,ty);
898 fiz0 = _mm256_add_ps(fiz0,tz);
900 fjx1 = _mm256_add_ps(fjx1,tx);
901 fjy1 = _mm256_add_ps(fjy1,ty);
902 fjz1 = _mm256_add_ps(fjz1,tz);
906 /**************************
907 * CALCULATE INTERACTIONS *
908 **************************/
910 if (gmx_mm256_any_lt(rsq02,rcutoff2))
913 /* REACTION-FIELD ELECTROSTATICS */
914 velec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
915 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
917 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
919 /* Update potential sum for this i atom from the interaction with this j atom. */
920 velec = _mm256_and_ps(velec,cutoff_mask);
921 velec = _mm256_andnot_ps(dummy_mask,velec);
922 velecsum = _mm256_add_ps(velecsum,velec);
926 fscal = _mm256_and_ps(fscal,cutoff_mask);
928 fscal = _mm256_andnot_ps(dummy_mask,fscal);
930 /* Calculate temporary vectorial force */
931 tx = _mm256_mul_ps(fscal,dx02);
932 ty = _mm256_mul_ps(fscal,dy02);
933 tz = _mm256_mul_ps(fscal,dz02);
935 /* Update vectorial force */
936 fix0 = _mm256_add_ps(fix0,tx);
937 fiy0 = _mm256_add_ps(fiy0,ty);
938 fiz0 = _mm256_add_ps(fiz0,tz);
940 fjx2 = _mm256_add_ps(fjx2,tx);
941 fjy2 = _mm256_add_ps(fjy2,ty);
942 fjz2 = _mm256_add_ps(fjz2,tz);
946 /**************************
947 * CALCULATE INTERACTIONS *
948 **************************/
950 if (gmx_mm256_any_lt(rsq10,rcutoff2))
953 /* REACTION-FIELD ELECTROSTATICS */
954 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
955 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
957 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
959 /* Update potential sum for this i atom from the interaction with this j atom. */
960 velec = _mm256_and_ps(velec,cutoff_mask);
961 velec = _mm256_andnot_ps(dummy_mask,velec);
962 velecsum = _mm256_add_ps(velecsum,velec);
966 fscal = _mm256_and_ps(fscal,cutoff_mask);
968 fscal = _mm256_andnot_ps(dummy_mask,fscal);
970 /* Calculate temporary vectorial force */
971 tx = _mm256_mul_ps(fscal,dx10);
972 ty = _mm256_mul_ps(fscal,dy10);
973 tz = _mm256_mul_ps(fscal,dz10);
975 /* Update vectorial force */
976 fix1 = _mm256_add_ps(fix1,tx);
977 fiy1 = _mm256_add_ps(fiy1,ty);
978 fiz1 = _mm256_add_ps(fiz1,tz);
980 fjx0 = _mm256_add_ps(fjx0,tx);
981 fjy0 = _mm256_add_ps(fjy0,ty);
982 fjz0 = _mm256_add_ps(fjz0,tz);
986 /**************************
987 * CALCULATE INTERACTIONS *
988 **************************/
990 if (gmx_mm256_any_lt(rsq11,rcutoff2))
993 /* REACTION-FIELD ELECTROSTATICS */
994 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
995 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
997 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
999 /* Update potential sum for this i atom from the interaction with this j atom. */
1000 velec = _mm256_and_ps(velec,cutoff_mask);
1001 velec = _mm256_andnot_ps(dummy_mask,velec);
1002 velecsum = _mm256_add_ps(velecsum,velec);
1006 fscal = _mm256_and_ps(fscal,cutoff_mask);
1008 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1010 /* Calculate temporary vectorial force */
1011 tx = _mm256_mul_ps(fscal,dx11);
1012 ty = _mm256_mul_ps(fscal,dy11);
1013 tz = _mm256_mul_ps(fscal,dz11);
1015 /* Update vectorial force */
1016 fix1 = _mm256_add_ps(fix1,tx);
1017 fiy1 = _mm256_add_ps(fiy1,ty);
1018 fiz1 = _mm256_add_ps(fiz1,tz);
1020 fjx1 = _mm256_add_ps(fjx1,tx);
1021 fjy1 = _mm256_add_ps(fjy1,ty);
1022 fjz1 = _mm256_add_ps(fjz1,tz);
1026 /**************************
1027 * CALCULATE INTERACTIONS *
1028 **************************/
1030 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1033 /* REACTION-FIELD ELECTROSTATICS */
1034 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
1035 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1037 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1039 /* Update potential sum for this i atom from the interaction with this j atom. */
1040 velec = _mm256_and_ps(velec,cutoff_mask);
1041 velec = _mm256_andnot_ps(dummy_mask,velec);
1042 velecsum = _mm256_add_ps(velecsum,velec);
1046 fscal = _mm256_and_ps(fscal,cutoff_mask);
1048 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1050 /* Calculate temporary vectorial force */
1051 tx = _mm256_mul_ps(fscal,dx12);
1052 ty = _mm256_mul_ps(fscal,dy12);
1053 tz = _mm256_mul_ps(fscal,dz12);
1055 /* Update vectorial force */
1056 fix1 = _mm256_add_ps(fix1,tx);
1057 fiy1 = _mm256_add_ps(fiy1,ty);
1058 fiz1 = _mm256_add_ps(fiz1,tz);
1060 fjx2 = _mm256_add_ps(fjx2,tx);
1061 fjy2 = _mm256_add_ps(fjy2,ty);
1062 fjz2 = _mm256_add_ps(fjz2,tz);
1066 /**************************
1067 * CALCULATE INTERACTIONS *
1068 **************************/
1070 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1073 /* REACTION-FIELD ELECTROSTATICS */
1074 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
1075 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1077 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1079 /* Update potential sum for this i atom from the interaction with this j atom. */
1080 velec = _mm256_and_ps(velec,cutoff_mask);
1081 velec = _mm256_andnot_ps(dummy_mask,velec);
1082 velecsum = _mm256_add_ps(velecsum,velec);
1086 fscal = _mm256_and_ps(fscal,cutoff_mask);
1088 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1090 /* Calculate temporary vectorial force */
1091 tx = _mm256_mul_ps(fscal,dx20);
1092 ty = _mm256_mul_ps(fscal,dy20);
1093 tz = _mm256_mul_ps(fscal,dz20);
1095 /* Update vectorial force */
1096 fix2 = _mm256_add_ps(fix2,tx);
1097 fiy2 = _mm256_add_ps(fiy2,ty);
1098 fiz2 = _mm256_add_ps(fiz2,tz);
1100 fjx0 = _mm256_add_ps(fjx0,tx);
1101 fjy0 = _mm256_add_ps(fjy0,ty);
1102 fjz0 = _mm256_add_ps(fjz0,tz);
1106 /**************************
1107 * CALCULATE INTERACTIONS *
1108 **************************/
1110 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1113 /* REACTION-FIELD ELECTROSTATICS */
1114 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1115 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1117 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1119 /* Update potential sum for this i atom from the interaction with this j atom. */
1120 velec = _mm256_and_ps(velec,cutoff_mask);
1121 velec = _mm256_andnot_ps(dummy_mask,velec);
1122 velecsum = _mm256_add_ps(velecsum,velec);
1126 fscal = _mm256_and_ps(fscal,cutoff_mask);
1128 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1130 /* Calculate temporary vectorial force */
1131 tx = _mm256_mul_ps(fscal,dx21);
1132 ty = _mm256_mul_ps(fscal,dy21);
1133 tz = _mm256_mul_ps(fscal,dz21);
1135 /* Update vectorial force */
1136 fix2 = _mm256_add_ps(fix2,tx);
1137 fiy2 = _mm256_add_ps(fiy2,ty);
1138 fiz2 = _mm256_add_ps(fiz2,tz);
1140 fjx1 = _mm256_add_ps(fjx1,tx);
1141 fjy1 = _mm256_add_ps(fjy1,ty);
1142 fjz1 = _mm256_add_ps(fjz1,tz);
1146 /**************************
1147 * CALCULATE INTERACTIONS *
1148 **************************/
1150 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1153 /* REACTION-FIELD ELECTROSTATICS */
1154 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1155 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1157 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1159 /* Update potential sum for this i atom from the interaction with this j atom. */
1160 velec = _mm256_and_ps(velec,cutoff_mask);
1161 velec = _mm256_andnot_ps(dummy_mask,velec);
1162 velecsum = _mm256_add_ps(velecsum,velec);
1166 fscal = _mm256_and_ps(fscal,cutoff_mask);
1168 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1170 /* Calculate temporary vectorial force */
1171 tx = _mm256_mul_ps(fscal,dx22);
1172 ty = _mm256_mul_ps(fscal,dy22);
1173 tz = _mm256_mul_ps(fscal,dz22);
1175 /* Update vectorial force */
1176 fix2 = _mm256_add_ps(fix2,tx);
1177 fiy2 = _mm256_add_ps(fiy2,ty);
1178 fiz2 = _mm256_add_ps(fiz2,tz);
1180 fjx2 = _mm256_add_ps(fjx2,tx);
1181 fjy2 = _mm256_add_ps(fjy2,ty);
1182 fjz2 = _mm256_add_ps(fjz2,tz);
1186 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1187 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1188 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1189 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1190 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1191 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1192 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1193 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1195 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1196 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1198 /* Inner loop uses 359 flops */
1201 /* End of innermost loop */
1203 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1204 f+i_coord_offset,fshift+i_shift_offset);
1207 /* Update potential energies */
1208 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1209 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1211 /* Increment number of inner iterations */
1212 inneriter += j_index_end - j_index_start;
1214 /* Outer loop uses 20 flops */
1217 /* Increment number of outer iterations */
1220 /* Update outer/inner flops */
1222 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
1225 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_single
1226 * Electrostatics interaction: ReactionField
1227 * VdW interaction: LennardJones
1228 * Geometry: Water3-Water3
1229 * Calculate force/pot: Force
1232 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_single
1233 (t_nblist * gmx_restrict nlist,
1234 rvec * gmx_restrict xx,
1235 rvec * gmx_restrict ff,
1236 t_forcerec * gmx_restrict fr,
1237 t_mdatoms * gmx_restrict mdatoms,
1238 nb_kernel_data_t * gmx_restrict kernel_data,
1239 t_nrnb * gmx_restrict nrnb)
1241 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1242 * just 0 for non-waters.
1243 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1244 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1246 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1247 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1248 int jnrA,jnrB,jnrC,jnrD;
1249 int jnrE,jnrF,jnrG,jnrH;
1250 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1251 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1252 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1253 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1254 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1255 real rcutoff_scalar;
1256 real *shiftvec,*fshift,*x,*f;
1257 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1258 real scratch[4*DIM];
1259 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1260 real * vdwioffsetptr0;
1261 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1262 real * vdwioffsetptr1;
1263 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1264 real * vdwioffsetptr2;
1265 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1266 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1267 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1268 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1269 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1270 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1271 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1272 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1273 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1274 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1275 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1276 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1277 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1278 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1279 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1280 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1281 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1284 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1287 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1288 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1289 __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1290 real rswitch_scalar,d_scalar;
1291 __m256 dummy_mask,cutoff_mask;
1292 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1293 __m256 one = _mm256_set1_ps(1.0);
1294 __m256 two = _mm256_set1_ps(2.0);
1300 jindex = nlist->jindex;
1302 shiftidx = nlist->shift;
1304 shiftvec = fr->shift_vec[0];
1305 fshift = fr->fshift[0];
1306 facel = _mm256_set1_ps(fr->epsfac);
1307 charge = mdatoms->chargeA;
1308 krf = _mm256_set1_ps(fr->ic->k_rf);
1309 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
1310 crf = _mm256_set1_ps(fr->ic->c_rf);
1311 nvdwtype = fr->ntype;
1312 vdwparam = fr->nbfp;
1313 vdwtype = mdatoms->typeA;
1315 /* Setup water-specific parameters */
1316 inr = nlist->iinr[0];
1317 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1318 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1319 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1320 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1322 jq0 = _mm256_set1_ps(charge[inr+0]);
1323 jq1 = _mm256_set1_ps(charge[inr+1]);
1324 jq2 = _mm256_set1_ps(charge[inr+2]);
1325 vdwjidx0A = 2*vdwtype[inr+0];
1326 qq00 = _mm256_mul_ps(iq0,jq0);
1327 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1328 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1329 qq01 = _mm256_mul_ps(iq0,jq1);
1330 qq02 = _mm256_mul_ps(iq0,jq2);
1331 qq10 = _mm256_mul_ps(iq1,jq0);
1332 qq11 = _mm256_mul_ps(iq1,jq1);
1333 qq12 = _mm256_mul_ps(iq1,jq2);
1334 qq20 = _mm256_mul_ps(iq2,jq0);
1335 qq21 = _mm256_mul_ps(iq2,jq1);
1336 qq22 = _mm256_mul_ps(iq2,jq2);
1338 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1339 rcutoff_scalar = fr->rcoulomb;
1340 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1341 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1343 rswitch_scalar = fr->rvdw_switch;
1344 rswitch = _mm256_set1_ps(rswitch_scalar);
1345 /* Setup switch parameters */
1346 d_scalar = rcutoff_scalar-rswitch_scalar;
1347 d = _mm256_set1_ps(d_scalar);
1348 swV3 = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1349 swV4 = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1350 swV5 = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1351 swF2 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1352 swF3 = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1353 swF4 = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1355 /* Avoid stupid compiler warnings */
1356 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1357 j_coord_offsetA = 0;
1358 j_coord_offsetB = 0;
1359 j_coord_offsetC = 0;
1360 j_coord_offsetD = 0;
1361 j_coord_offsetE = 0;
1362 j_coord_offsetF = 0;
1363 j_coord_offsetG = 0;
1364 j_coord_offsetH = 0;
1369 for(iidx=0;iidx<4*DIM;iidx++)
1371 scratch[iidx] = 0.0;
1374 /* Start outer loop over neighborlists */
1375 for(iidx=0; iidx<nri; iidx++)
1377 /* Load shift vector for this list */
1378 i_shift_offset = DIM*shiftidx[iidx];
1380 /* Load limits for loop over neighbors */
1381 j_index_start = jindex[iidx];
1382 j_index_end = jindex[iidx+1];
1384 /* Get outer coordinate index */
1386 i_coord_offset = DIM*inr;
1388 /* Load i particle coords and add shift vector */
1389 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1390 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1392 fix0 = _mm256_setzero_ps();
1393 fiy0 = _mm256_setzero_ps();
1394 fiz0 = _mm256_setzero_ps();
1395 fix1 = _mm256_setzero_ps();
1396 fiy1 = _mm256_setzero_ps();
1397 fiz1 = _mm256_setzero_ps();
1398 fix2 = _mm256_setzero_ps();
1399 fiy2 = _mm256_setzero_ps();
1400 fiz2 = _mm256_setzero_ps();
1402 /* Start inner kernel loop */
1403 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1406 /* Get j neighbor index, and coordinate index */
1408 jnrB = jjnr[jidx+1];
1409 jnrC = jjnr[jidx+2];
1410 jnrD = jjnr[jidx+3];
1411 jnrE = jjnr[jidx+4];
1412 jnrF = jjnr[jidx+5];
1413 jnrG = jjnr[jidx+6];
1414 jnrH = jjnr[jidx+7];
1415 j_coord_offsetA = DIM*jnrA;
1416 j_coord_offsetB = DIM*jnrB;
1417 j_coord_offsetC = DIM*jnrC;
1418 j_coord_offsetD = DIM*jnrD;
1419 j_coord_offsetE = DIM*jnrE;
1420 j_coord_offsetF = DIM*jnrF;
1421 j_coord_offsetG = DIM*jnrG;
1422 j_coord_offsetH = DIM*jnrH;
1424 /* load j atom coordinates */
1425 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1426 x+j_coord_offsetC,x+j_coord_offsetD,
1427 x+j_coord_offsetE,x+j_coord_offsetF,
1428 x+j_coord_offsetG,x+j_coord_offsetH,
1429 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1431 /* Calculate displacement vector */
1432 dx00 = _mm256_sub_ps(ix0,jx0);
1433 dy00 = _mm256_sub_ps(iy0,jy0);
1434 dz00 = _mm256_sub_ps(iz0,jz0);
1435 dx01 = _mm256_sub_ps(ix0,jx1);
1436 dy01 = _mm256_sub_ps(iy0,jy1);
1437 dz01 = _mm256_sub_ps(iz0,jz1);
1438 dx02 = _mm256_sub_ps(ix0,jx2);
1439 dy02 = _mm256_sub_ps(iy0,jy2);
1440 dz02 = _mm256_sub_ps(iz0,jz2);
1441 dx10 = _mm256_sub_ps(ix1,jx0);
1442 dy10 = _mm256_sub_ps(iy1,jy0);
1443 dz10 = _mm256_sub_ps(iz1,jz0);
1444 dx11 = _mm256_sub_ps(ix1,jx1);
1445 dy11 = _mm256_sub_ps(iy1,jy1);
1446 dz11 = _mm256_sub_ps(iz1,jz1);
1447 dx12 = _mm256_sub_ps(ix1,jx2);
1448 dy12 = _mm256_sub_ps(iy1,jy2);
1449 dz12 = _mm256_sub_ps(iz1,jz2);
1450 dx20 = _mm256_sub_ps(ix2,jx0);
1451 dy20 = _mm256_sub_ps(iy2,jy0);
1452 dz20 = _mm256_sub_ps(iz2,jz0);
1453 dx21 = _mm256_sub_ps(ix2,jx1);
1454 dy21 = _mm256_sub_ps(iy2,jy1);
1455 dz21 = _mm256_sub_ps(iz2,jz1);
1456 dx22 = _mm256_sub_ps(ix2,jx2);
1457 dy22 = _mm256_sub_ps(iy2,jy2);
1458 dz22 = _mm256_sub_ps(iz2,jz2);
1460 /* Calculate squared distance and things based on it */
1461 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1462 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1463 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1464 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1465 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1466 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1467 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1468 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1469 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1471 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1472 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1473 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1474 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1475 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1476 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1477 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1478 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1479 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1481 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1482 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1483 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1484 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1485 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1486 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1487 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1488 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1489 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1491 fjx0 = _mm256_setzero_ps();
1492 fjy0 = _mm256_setzero_ps();
1493 fjz0 = _mm256_setzero_ps();
1494 fjx1 = _mm256_setzero_ps();
1495 fjy1 = _mm256_setzero_ps();
1496 fjz1 = _mm256_setzero_ps();
1497 fjx2 = _mm256_setzero_ps();
1498 fjy2 = _mm256_setzero_ps();
1499 fjz2 = _mm256_setzero_ps();
1501 /**************************
1502 * CALCULATE INTERACTIONS *
1503 **************************/
1505 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1508 r00 = _mm256_mul_ps(rsq00,rinv00);
1510 /* REACTION-FIELD ELECTROSTATICS */
1511 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1513 /* LENNARD-JONES DISPERSION/REPULSION */
1515 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1516 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
1517 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1518 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1519 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1521 d = _mm256_sub_ps(r00,rswitch);
1522 d = _mm256_max_ps(d,_mm256_setzero_ps());
1523 d2 = _mm256_mul_ps(d,d);
1524 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1526 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1528 /* Evaluate switch function */
1529 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1530 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1531 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1533 fscal = _mm256_add_ps(felec,fvdw);
1535 fscal = _mm256_and_ps(fscal,cutoff_mask);
1537 /* Calculate temporary vectorial force */
1538 tx = _mm256_mul_ps(fscal,dx00);
1539 ty = _mm256_mul_ps(fscal,dy00);
1540 tz = _mm256_mul_ps(fscal,dz00);
1542 /* Update vectorial force */
1543 fix0 = _mm256_add_ps(fix0,tx);
1544 fiy0 = _mm256_add_ps(fiy0,ty);
1545 fiz0 = _mm256_add_ps(fiz0,tz);
1547 fjx0 = _mm256_add_ps(fjx0,tx);
1548 fjy0 = _mm256_add_ps(fjy0,ty);
1549 fjz0 = _mm256_add_ps(fjz0,tz);
1553 /**************************
1554 * CALCULATE INTERACTIONS *
1555 **************************/
1557 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1560 /* REACTION-FIELD ELECTROSTATICS */
1561 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1563 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1567 fscal = _mm256_and_ps(fscal,cutoff_mask);
1569 /* Calculate temporary vectorial force */
1570 tx = _mm256_mul_ps(fscal,dx01);
1571 ty = _mm256_mul_ps(fscal,dy01);
1572 tz = _mm256_mul_ps(fscal,dz01);
1574 /* Update vectorial force */
1575 fix0 = _mm256_add_ps(fix0,tx);
1576 fiy0 = _mm256_add_ps(fiy0,ty);
1577 fiz0 = _mm256_add_ps(fiz0,tz);
1579 fjx1 = _mm256_add_ps(fjx1,tx);
1580 fjy1 = _mm256_add_ps(fjy1,ty);
1581 fjz1 = _mm256_add_ps(fjz1,tz);
1585 /**************************
1586 * CALCULATE INTERACTIONS *
1587 **************************/
1589 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1592 /* REACTION-FIELD ELECTROSTATICS */
1593 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1595 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1599 fscal = _mm256_and_ps(fscal,cutoff_mask);
1601 /* Calculate temporary vectorial force */
1602 tx = _mm256_mul_ps(fscal,dx02);
1603 ty = _mm256_mul_ps(fscal,dy02);
1604 tz = _mm256_mul_ps(fscal,dz02);
1606 /* Update vectorial force */
1607 fix0 = _mm256_add_ps(fix0,tx);
1608 fiy0 = _mm256_add_ps(fiy0,ty);
1609 fiz0 = _mm256_add_ps(fiz0,tz);
1611 fjx2 = _mm256_add_ps(fjx2,tx);
1612 fjy2 = _mm256_add_ps(fjy2,ty);
1613 fjz2 = _mm256_add_ps(fjz2,tz);
1617 /**************************
1618 * CALCULATE INTERACTIONS *
1619 **************************/
1621 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1624 /* REACTION-FIELD ELECTROSTATICS */
1625 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1627 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1631 fscal = _mm256_and_ps(fscal,cutoff_mask);
1633 /* Calculate temporary vectorial force */
1634 tx = _mm256_mul_ps(fscal,dx10);
1635 ty = _mm256_mul_ps(fscal,dy10);
1636 tz = _mm256_mul_ps(fscal,dz10);
1638 /* Update vectorial force */
1639 fix1 = _mm256_add_ps(fix1,tx);
1640 fiy1 = _mm256_add_ps(fiy1,ty);
1641 fiz1 = _mm256_add_ps(fiz1,tz);
1643 fjx0 = _mm256_add_ps(fjx0,tx);
1644 fjy0 = _mm256_add_ps(fjy0,ty);
1645 fjz0 = _mm256_add_ps(fjz0,tz);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1656 /* REACTION-FIELD ELECTROSTATICS */
1657 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1659 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1663 fscal = _mm256_and_ps(fscal,cutoff_mask);
1665 /* Calculate temporary vectorial force */
1666 tx = _mm256_mul_ps(fscal,dx11);
1667 ty = _mm256_mul_ps(fscal,dy11);
1668 tz = _mm256_mul_ps(fscal,dz11);
1670 /* Update vectorial force */
1671 fix1 = _mm256_add_ps(fix1,tx);
1672 fiy1 = _mm256_add_ps(fiy1,ty);
1673 fiz1 = _mm256_add_ps(fiz1,tz);
1675 fjx1 = _mm256_add_ps(fjx1,tx);
1676 fjy1 = _mm256_add_ps(fjy1,ty);
1677 fjz1 = _mm256_add_ps(fjz1,tz);
1681 /**************************
1682 * CALCULATE INTERACTIONS *
1683 **************************/
1685 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1688 /* REACTION-FIELD ELECTROSTATICS */
1689 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1691 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1695 fscal = _mm256_and_ps(fscal,cutoff_mask);
1697 /* Calculate temporary vectorial force */
1698 tx = _mm256_mul_ps(fscal,dx12);
1699 ty = _mm256_mul_ps(fscal,dy12);
1700 tz = _mm256_mul_ps(fscal,dz12);
1702 /* Update vectorial force */
1703 fix1 = _mm256_add_ps(fix1,tx);
1704 fiy1 = _mm256_add_ps(fiy1,ty);
1705 fiz1 = _mm256_add_ps(fiz1,tz);
1707 fjx2 = _mm256_add_ps(fjx2,tx);
1708 fjy2 = _mm256_add_ps(fjy2,ty);
1709 fjz2 = _mm256_add_ps(fjz2,tz);
1713 /**************************
1714 * CALCULATE INTERACTIONS *
1715 **************************/
1717 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1720 /* REACTION-FIELD ELECTROSTATICS */
1721 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1723 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1727 fscal = _mm256_and_ps(fscal,cutoff_mask);
1729 /* Calculate temporary vectorial force */
1730 tx = _mm256_mul_ps(fscal,dx20);
1731 ty = _mm256_mul_ps(fscal,dy20);
1732 tz = _mm256_mul_ps(fscal,dz20);
1734 /* Update vectorial force */
1735 fix2 = _mm256_add_ps(fix2,tx);
1736 fiy2 = _mm256_add_ps(fiy2,ty);
1737 fiz2 = _mm256_add_ps(fiz2,tz);
1739 fjx0 = _mm256_add_ps(fjx0,tx);
1740 fjy0 = _mm256_add_ps(fjy0,ty);
1741 fjz0 = _mm256_add_ps(fjz0,tz);
1745 /**************************
1746 * CALCULATE INTERACTIONS *
1747 **************************/
1749 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1752 /* REACTION-FIELD ELECTROSTATICS */
1753 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1755 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1759 fscal = _mm256_and_ps(fscal,cutoff_mask);
1761 /* Calculate temporary vectorial force */
1762 tx = _mm256_mul_ps(fscal,dx21);
1763 ty = _mm256_mul_ps(fscal,dy21);
1764 tz = _mm256_mul_ps(fscal,dz21);
1766 /* Update vectorial force */
1767 fix2 = _mm256_add_ps(fix2,tx);
1768 fiy2 = _mm256_add_ps(fiy2,ty);
1769 fiz2 = _mm256_add_ps(fiz2,tz);
1771 fjx1 = _mm256_add_ps(fjx1,tx);
1772 fjy1 = _mm256_add_ps(fjy1,ty);
1773 fjz1 = _mm256_add_ps(fjz1,tz);
1777 /**************************
1778 * CALCULATE INTERACTIONS *
1779 **************************/
1781 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1784 /* REACTION-FIELD ELECTROSTATICS */
1785 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1787 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1791 fscal = _mm256_and_ps(fscal,cutoff_mask);
1793 /* Calculate temporary vectorial force */
1794 tx = _mm256_mul_ps(fscal,dx22);
1795 ty = _mm256_mul_ps(fscal,dy22);
1796 tz = _mm256_mul_ps(fscal,dz22);
1798 /* Update vectorial force */
1799 fix2 = _mm256_add_ps(fix2,tx);
1800 fiy2 = _mm256_add_ps(fiy2,ty);
1801 fiz2 = _mm256_add_ps(fiz2,tz);
1803 fjx2 = _mm256_add_ps(fjx2,tx);
1804 fjy2 = _mm256_add_ps(fjy2,ty);
1805 fjz2 = _mm256_add_ps(fjz2,tz);
1809 fjptrA = f+j_coord_offsetA;
1810 fjptrB = f+j_coord_offsetB;
1811 fjptrC = f+j_coord_offsetC;
1812 fjptrD = f+j_coord_offsetD;
1813 fjptrE = f+j_coord_offsetE;
1814 fjptrF = f+j_coord_offsetF;
1815 fjptrG = f+j_coord_offsetG;
1816 fjptrH = f+j_coord_offsetH;
1818 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1819 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1821 /* Inner loop uses 301 flops */
1824 if(jidx<j_index_end)
1827 /* Get j neighbor index, and coordinate index */
1828 jnrlistA = jjnr[jidx];
1829 jnrlistB = jjnr[jidx+1];
1830 jnrlistC = jjnr[jidx+2];
1831 jnrlistD = jjnr[jidx+3];
1832 jnrlistE = jjnr[jidx+4];
1833 jnrlistF = jjnr[jidx+5];
1834 jnrlistG = jjnr[jidx+6];
1835 jnrlistH = jjnr[jidx+7];
1836 /* Sign of each element will be negative for non-real atoms.
1837 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1838 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1840 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1841 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1843 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1844 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1845 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1846 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1847 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1848 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1849 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1850 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1851 j_coord_offsetA = DIM*jnrA;
1852 j_coord_offsetB = DIM*jnrB;
1853 j_coord_offsetC = DIM*jnrC;
1854 j_coord_offsetD = DIM*jnrD;
1855 j_coord_offsetE = DIM*jnrE;
1856 j_coord_offsetF = DIM*jnrF;
1857 j_coord_offsetG = DIM*jnrG;
1858 j_coord_offsetH = DIM*jnrH;
1860 /* load j atom coordinates */
1861 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1862 x+j_coord_offsetC,x+j_coord_offsetD,
1863 x+j_coord_offsetE,x+j_coord_offsetF,
1864 x+j_coord_offsetG,x+j_coord_offsetH,
1865 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1867 /* Calculate displacement vector */
1868 dx00 = _mm256_sub_ps(ix0,jx0);
1869 dy00 = _mm256_sub_ps(iy0,jy0);
1870 dz00 = _mm256_sub_ps(iz0,jz0);
1871 dx01 = _mm256_sub_ps(ix0,jx1);
1872 dy01 = _mm256_sub_ps(iy0,jy1);
1873 dz01 = _mm256_sub_ps(iz0,jz1);
1874 dx02 = _mm256_sub_ps(ix0,jx2);
1875 dy02 = _mm256_sub_ps(iy0,jy2);
1876 dz02 = _mm256_sub_ps(iz0,jz2);
1877 dx10 = _mm256_sub_ps(ix1,jx0);
1878 dy10 = _mm256_sub_ps(iy1,jy0);
1879 dz10 = _mm256_sub_ps(iz1,jz0);
1880 dx11 = _mm256_sub_ps(ix1,jx1);
1881 dy11 = _mm256_sub_ps(iy1,jy1);
1882 dz11 = _mm256_sub_ps(iz1,jz1);
1883 dx12 = _mm256_sub_ps(ix1,jx2);
1884 dy12 = _mm256_sub_ps(iy1,jy2);
1885 dz12 = _mm256_sub_ps(iz1,jz2);
1886 dx20 = _mm256_sub_ps(ix2,jx0);
1887 dy20 = _mm256_sub_ps(iy2,jy0);
1888 dz20 = _mm256_sub_ps(iz2,jz0);
1889 dx21 = _mm256_sub_ps(ix2,jx1);
1890 dy21 = _mm256_sub_ps(iy2,jy1);
1891 dz21 = _mm256_sub_ps(iz2,jz1);
1892 dx22 = _mm256_sub_ps(ix2,jx2);
1893 dy22 = _mm256_sub_ps(iy2,jy2);
1894 dz22 = _mm256_sub_ps(iz2,jz2);
1896 /* Calculate squared distance and things based on it */
1897 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1898 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1899 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1900 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1901 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1902 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1903 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1904 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1905 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1907 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1908 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1909 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1910 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1911 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1912 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1913 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1914 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1915 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1917 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1918 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1919 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1920 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1921 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1922 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1923 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1924 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1925 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1927 fjx0 = _mm256_setzero_ps();
1928 fjy0 = _mm256_setzero_ps();
1929 fjz0 = _mm256_setzero_ps();
1930 fjx1 = _mm256_setzero_ps();
1931 fjy1 = _mm256_setzero_ps();
1932 fjz1 = _mm256_setzero_ps();
1933 fjx2 = _mm256_setzero_ps();
1934 fjy2 = _mm256_setzero_ps();
1935 fjz2 = _mm256_setzero_ps();
1937 /**************************
1938 * CALCULATE INTERACTIONS *
1939 **************************/
1941 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1944 r00 = _mm256_mul_ps(rsq00,rinv00);
1945 r00 = _mm256_andnot_ps(dummy_mask,r00);
1947 /* REACTION-FIELD ELECTROSTATICS */
1948 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1950 /* LENNARD-JONES DISPERSION/REPULSION */
1952 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1953 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
1954 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1955 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1956 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1958 d = _mm256_sub_ps(r00,rswitch);
1959 d = _mm256_max_ps(d,_mm256_setzero_ps());
1960 d2 = _mm256_mul_ps(d,d);
1961 sw = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1963 dsw = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1965 /* Evaluate switch function */
1966 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1967 fvdw = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1968 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1970 fscal = _mm256_add_ps(felec,fvdw);
1972 fscal = _mm256_and_ps(fscal,cutoff_mask);
1974 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1976 /* Calculate temporary vectorial force */
1977 tx = _mm256_mul_ps(fscal,dx00);
1978 ty = _mm256_mul_ps(fscal,dy00);
1979 tz = _mm256_mul_ps(fscal,dz00);
1981 /* Update vectorial force */
1982 fix0 = _mm256_add_ps(fix0,tx);
1983 fiy0 = _mm256_add_ps(fiy0,ty);
1984 fiz0 = _mm256_add_ps(fiz0,tz);
1986 fjx0 = _mm256_add_ps(fjx0,tx);
1987 fjy0 = _mm256_add_ps(fjy0,ty);
1988 fjz0 = _mm256_add_ps(fjz0,tz);
1992 /**************************
1993 * CALCULATE INTERACTIONS *
1994 **************************/
1996 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1999 /* REACTION-FIELD ELECTROSTATICS */
2000 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
2002 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2006 fscal = _mm256_and_ps(fscal,cutoff_mask);
2008 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2010 /* Calculate temporary vectorial force */
2011 tx = _mm256_mul_ps(fscal,dx01);
2012 ty = _mm256_mul_ps(fscal,dy01);
2013 tz = _mm256_mul_ps(fscal,dz01);
2015 /* Update vectorial force */
2016 fix0 = _mm256_add_ps(fix0,tx);
2017 fiy0 = _mm256_add_ps(fiy0,ty);
2018 fiz0 = _mm256_add_ps(fiz0,tz);
2020 fjx1 = _mm256_add_ps(fjx1,tx);
2021 fjy1 = _mm256_add_ps(fjy1,ty);
2022 fjz1 = _mm256_add_ps(fjz1,tz);
2026 /**************************
2027 * CALCULATE INTERACTIONS *
2028 **************************/
2030 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2033 /* REACTION-FIELD ELECTROSTATICS */
2034 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
2036 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2040 fscal = _mm256_and_ps(fscal,cutoff_mask);
2042 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2044 /* Calculate temporary vectorial force */
2045 tx = _mm256_mul_ps(fscal,dx02);
2046 ty = _mm256_mul_ps(fscal,dy02);
2047 tz = _mm256_mul_ps(fscal,dz02);
2049 /* Update vectorial force */
2050 fix0 = _mm256_add_ps(fix0,tx);
2051 fiy0 = _mm256_add_ps(fiy0,ty);
2052 fiz0 = _mm256_add_ps(fiz0,tz);
2054 fjx2 = _mm256_add_ps(fjx2,tx);
2055 fjy2 = _mm256_add_ps(fjy2,ty);
2056 fjz2 = _mm256_add_ps(fjz2,tz);
2060 /**************************
2061 * CALCULATE INTERACTIONS *
2062 **************************/
2064 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2067 /* REACTION-FIELD ELECTROSTATICS */
2068 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
2070 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2074 fscal = _mm256_and_ps(fscal,cutoff_mask);
2076 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2078 /* Calculate temporary vectorial force */
2079 tx = _mm256_mul_ps(fscal,dx10);
2080 ty = _mm256_mul_ps(fscal,dy10);
2081 tz = _mm256_mul_ps(fscal,dz10);
2083 /* Update vectorial force */
2084 fix1 = _mm256_add_ps(fix1,tx);
2085 fiy1 = _mm256_add_ps(fiy1,ty);
2086 fiz1 = _mm256_add_ps(fiz1,tz);
2088 fjx0 = _mm256_add_ps(fjx0,tx);
2089 fjy0 = _mm256_add_ps(fjy0,ty);
2090 fjz0 = _mm256_add_ps(fjz0,tz);
2094 /**************************
2095 * CALCULATE INTERACTIONS *
2096 **************************/
2098 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2101 /* REACTION-FIELD ELECTROSTATICS */
2102 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2104 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2108 fscal = _mm256_and_ps(fscal,cutoff_mask);
2110 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2112 /* Calculate temporary vectorial force */
2113 tx = _mm256_mul_ps(fscal,dx11);
2114 ty = _mm256_mul_ps(fscal,dy11);
2115 tz = _mm256_mul_ps(fscal,dz11);
2117 /* Update vectorial force */
2118 fix1 = _mm256_add_ps(fix1,tx);
2119 fiy1 = _mm256_add_ps(fiy1,ty);
2120 fiz1 = _mm256_add_ps(fiz1,tz);
2122 fjx1 = _mm256_add_ps(fjx1,tx);
2123 fjy1 = _mm256_add_ps(fjy1,ty);
2124 fjz1 = _mm256_add_ps(fjz1,tz);
2128 /**************************
2129 * CALCULATE INTERACTIONS *
2130 **************************/
2132 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2135 /* REACTION-FIELD ELECTROSTATICS */
2136 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2138 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2142 fscal = _mm256_and_ps(fscal,cutoff_mask);
2144 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2146 /* Calculate temporary vectorial force */
2147 tx = _mm256_mul_ps(fscal,dx12);
2148 ty = _mm256_mul_ps(fscal,dy12);
2149 tz = _mm256_mul_ps(fscal,dz12);
2151 /* Update vectorial force */
2152 fix1 = _mm256_add_ps(fix1,tx);
2153 fiy1 = _mm256_add_ps(fiy1,ty);
2154 fiz1 = _mm256_add_ps(fiz1,tz);
2156 fjx2 = _mm256_add_ps(fjx2,tx);
2157 fjy2 = _mm256_add_ps(fjy2,ty);
2158 fjz2 = _mm256_add_ps(fjz2,tz);
2162 /**************************
2163 * CALCULATE INTERACTIONS *
2164 **************************/
2166 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2169 /* REACTION-FIELD ELECTROSTATICS */
2170 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
2172 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2176 fscal = _mm256_and_ps(fscal,cutoff_mask);
2178 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2180 /* Calculate temporary vectorial force */
2181 tx = _mm256_mul_ps(fscal,dx20);
2182 ty = _mm256_mul_ps(fscal,dy20);
2183 tz = _mm256_mul_ps(fscal,dz20);
2185 /* Update vectorial force */
2186 fix2 = _mm256_add_ps(fix2,tx);
2187 fiy2 = _mm256_add_ps(fiy2,ty);
2188 fiz2 = _mm256_add_ps(fiz2,tz);
2190 fjx0 = _mm256_add_ps(fjx0,tx);
2191 fjy0 = _mm256_add_ps(fjy0,ty);
2192 fjz0 = _mm256_add_ps(fjz0,tz);
2196 /**************************
2197 * CALCULATE INTERACTIONS *
2198 **************************/
2200 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2203 /* REACTION-FIELD ELECTROSTATICS */
2204 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2206 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2210 fscal = _mm256_and_ps(fscal,cutoff_mask);
2212 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2214 /* Calculate temporary vectorial force */
2215 tx = _mm256_mul_ps(fscal,dx21);
2216 ty = _mm256_mul_ps(fscal,dy21);
2217 tz = _mm256_mul_ps(fscal,dz21);
2219 /* Update vectorial force */
2220 fix2 = _mm256_add_ps(fix2,tx);
2221 fiy2 = _mm256_add_ps(fiy2,ty);
2222 fiz2 = _mm256_add_ps(fiz2,tz);
2224 fjx1 = _mm256_add_ps(fjx1,tx);
2225 fjy1 = _mm256_add_ps(fjy1,ty);
2226 fjz1 = _mm256_add_ps(fjz1,tz);
2230 /**************************
2231 * CALCULATE INTERACTIONS *
2232 **************************/
2234 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2237 /* REACTION-FIELD ELECTROSTATICS */
2238 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2240 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2244 fscal = _mm256_and_ps(fscal,cutoff_mask);
2246 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2248 /* Calculate temporary vectorial force */
2249 tx = _mm256_mul_ps(fscal,dx22);
2250 ty = _mm256_mul_ps(fscal,dy22);
2251 tz = _mm256_mul_ps(fscal,dz22);
2253 /* Update vectorial force */
2254 fix2 = _mm256_add_ps(fix2,tx);
2255 fiy2 = _mm256_add_ps(fiy2,ty);
2256 fiz2 = _mm256_add_ps(fiz2,tz);
2258 fjx2 = _mm256_add_ps(fjx2,tx);
2259 fjy2 = _mm256_add_ps(fjy2,ty);
2260 fjz2 = _mm256_add_ps(fjz2,tz);
2264 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2265 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2266 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2267 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2268 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2269 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2270 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2271 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2273 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2274 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2276 /* Inner loop uses 302 flops */
2279 /* End of innermost loop */
2281 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2282 f+i_coord_offset,fshift+i_shift_offset);
2284 /* Increment number of inner iterations */
2285 inneriter += j_index_end - j_index_start;
2287 /* Outer loop uses 18 flops */
2290 /* Increment number of outer iterations */
2293 /* Update outer/inner flops */
2295 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);