2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
98 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
100 __m128i ifour = _mm_set1_epi32(4);
101 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
103 __m256d dummy_mask,cutoff_mask;
104 __m128 tmpmask0,tmpmask1;
105 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
106 __m256d one = _mm256_set1_pd(1.0);
107 __m256d two = _mm256_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm256_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 krf = _mm256_set1_pd(fr->ic->k_rf);
122 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
123 crf = _mm256_set1_pd(fr->ic->c_rf);
124 nvdwtype = fr->ntype;
126 vdwtype = mdatoms->typeA;
128 vftab = kernel_data->table_vdw->data;
129 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
134 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
135 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
136 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
138 jq0 = _mm256_set1_pd(charge[inr+0]);
139 jq1 = _mm256_set1_pd(charge[inr+1]);
140 jq2 = _mm256_set1_pd(charge[inr+2]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 qq00 = _mm256_mul_pd(iq0,jq0);
143 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
144 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
145 qq01 = _mm256_mul_pd(iq0,jq1);
146 qq02 = _mm256_mul_pd(iq0,jq2);
147 qq10 = _mm256_mul_pd(iq1,jq0);
148 qq11 = _mm256_mul_pd(iq1,jq1);
149 qq12 = _mm256_mul_pd(iq1,jq2);
150 qq20 = _mm256_mul_pd(iq2,jq0);
151 qq21 = _mm256_mul_pd(iq2,jq1);
152 qq22 = _mm256_mul_pd(iq2,jq2);
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->rcoulomb;
156 rcutoff = _mm256_set1_pd(rcutoff_scalar);
157 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
159 /* Avoid stupid compiler warnings */
160 jnrA = jnrB = jnrC = jnrD = 0;
169 for(iidx=0;iidx<4*DIM;iidx++)
174 /* Start outer loop over neighborlists */
175 for(iidx=0; iidx<nri; iidx++)
177 /* Load shift vector for this list */
178 i_shift_offset = DIM*shiftidx[iidx];
180 /* Load limits for loop over neighbors */
181 j_index_start = jindex[iidx];
182 j_index_end = jindex[iidx+1];
184 /* Get outer coordinate index */
186 i_coord_offset = DIM*inr;
188 /* Load i particle coords and add shift vector */
189 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
190 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
192 fix0 = _mm256_setzero_pd();
193 fiy0 = _mm256_setzero_pd();
194 fiz0 = _mm256_setzero_pd();
195 fix1 = _mm256_setzero_pd();
196 fiy1 = _mm256_setzero_pd();
197 fiz1 = _mm256_setzero_pd();
198 fix2 = _mm256_setzero_pd();
199 fiy2 = _mm256_setzero_pd();
200 fiz2 = _mm256_setzero_pd();
202 /* Reset potential sums */
203 velecsum = _mm256_setzero_pd();
204 vvdwsum = _mm256_setzero_pd();
206 /* Start inner kernel loop */
207 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
210 /* Get j neighbor index, and coordinate index */
215 j_coord_offsetA = DIM*jnrA;
216 j_coord_offsetB = DIM*jnrB;
217 j_coord_offsetC = DIM*jnrC;
218 j_coord_offsetD = DIM*jnrD;
220 /* load j atom coordinates */
221 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
222 x+j_coord_offsetC,x+j_coord_offsetD,
223 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
225 /* Calculate displacement vector */
226 dx00 = _mm256_sub_pd(ix0,jx0);
227 dy00 = _mm256_sub_pd(iy0,jy0);
228 dz00 = _mm256_sub_pd(iz0,jz0);
229 dx01 = _mm256_sub_pd(ix0,jx1);
230 dy01 = _mm256_sub_pd(iy0,jy1);
231 dz01 = _mm256_sub_pd(iz0,jz1);
232 dx02 = _mm256_sub_pd(ix0,jx2);
233 dy02 = _mm256_sub_pd(iy0,jy2);
234 dz02 = _mm256_sub_pd(iz0,jz2);
235 dx10 = _mm256_sub_pd(ix1,jx0);
236 dy10 = _mm256_sub_pd(iy1,jy0);
237 dz10 = _mm256_sub_pd(iz1,jz0);
238 dx11 = _mm256_sub_pd(ix1,jx1);
239 dy11 = _mm256_sub_pd(iy1,jy1);
240 dz11 = _mm256_sub_pd(iz1,jz1);
241 dx12 = _mm256_sub_pd(ix1,jx2);
242 dy12 = _mm256_sub_pd(iy1,jy2);
243 dz12 = _mm256_sub_pd(iz1,jz2);
244 dx20 = _mm256_sub_pd(ix2,jx0);
245 dy20 = _mm256_sub_pd(iy2,jy0);
246 dz20 = _mm256_sub_pd(iz2,jz0);
247 dx21 = _mm256_sub_pd(ix2,jx1);
248 dy21 = _mm256_sub_pd(iy2,jy1);
249 dz21 = _mm256_sub_pd(iz2,jz1);
250 dx22 = _mm256_sub_pd(ix2,jx2);
251 dy22 = _mm256_sub_pd(iy2,jy2);
252 dz22 = _mm256_sub_pd(iz2,jz2);
254 /* Calculate squared distance and things based on it */
255 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
256 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
257 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
258 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
259 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
260 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
261 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
262 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
263 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
265 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
266 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
267 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
268 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
269 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
270 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
271 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
272 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
273 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
275 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
276 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
277 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
278 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
279 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
280 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
281 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
282 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
283 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
285 fjx0 = _mm256_setzero_pd();
286 fjy0 = _mm256_setzero_pd();
287 fjz0 = _mm256_setzero_pd();
288 fjx1 = _mm256_setzero_pd();
289 fjy1 = _mm256_setzero_pd();
290 fjz1 = _mm256_setzero_pd();
291 fjx2 = _mm256_setzero_pd();
292 fjy2 = _mm256_setzero_pd();
293 fjz2 = _mm256_setzero_pd();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 if (gmx_mm256_any_lt(rsq00,rcutoff2))
302 r00 = _mm256_mul_pd(rsq00,rinv00);
304 /* Calculate table index by multiplying r with table scale and truncate to integer */
305 rt = _mm256_mul_pd(r00,vftabscale);
306 vfitab = _mm256_cvttpd_epi32(rt);
307 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
308 vfitab = _mm_slli_epi32(vfitab,3);
310 /* REACTION-FIELD ELECTROSTATICS */
311 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
312 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
314 /* CUBIC SPLINE TABLE DISPERSION */
315 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
316 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
317 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
318 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
319 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
320 Heps = _mm256_mul_pd(vfeps,H);
321 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
322 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
323 vvdw6 = _mm256_mul_pd(c6_00,VV);
324 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
325 fvdw6 = _mm256_mul_pd(c6_00,FF);
327 /* CUBIC SPLINE TABLE REPULSION */
328 vfitab = _mm_add_epi32(vfitab,ifour);
329 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
330 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
331 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
332 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
333 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
334 Heps = _mm256_mul_pd(vfeps,H);
335 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
336 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
337 vvdw12 = _mm256_mul_pd(c12_00,VV);
338 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
339 fvdw12 = _mm256_mul_pd(c12_00,FF);
340 vvdw = _mm256_add_pd(vvdw12,vvdw6);
341 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
343 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velec = _mm256_and_pd(velec,cutoff_mask);
347 velecsum = _mm256_add_pd(velecsum,velec);
348 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
349 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
351 fscal = _mm256_add_pd(felec,fvdw);
353 fscal = _mm256_and_pd(fscal,cutoff_mask);
355 /* Calculate temporary vectorial force */
356 tx = _mm256_mul_pd(fscal,dx00);
357 ty = _mm256_mul_pd(fscal,dy00);
358 tz = _mm256_mul_pd(fscal,dz00);
360 /* Update vectorial force */
361 fix0 = _mm256_add_pd(fix0,tx);
362 fiy0 = _mm256_add_pd(fiy0,ty);
363 fiz0 = _mm256_add_pd(fiz0,tz);
365 fjx0 = _mm256_add_pd(fjx0,tx);
366 fjy0 = _mm256_add_pd(fjy0,ty);
367 fjz0 = _mm256_add_pd(fjz0,tz);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 if (gmx_mm256_any_lt(rsq01,rcutoff2))
378 /* REACTION-FIELD ELECTROSTATICS */
379 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
380 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
382 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
384 /* Update potential sum for this i atom from the interaction with this j atom. */
385 velec = _mm256_and_pd(velec,cutoff_mask);
386 velecsum = _mm256_add_pd(velecsum,velec);
390 fscal = _mm256_and_pd(fscal,cutoff_mask);
392 /* Calculate temporary vectorial force */
393 tx = _mm256_mul_pd(fscal,dx01);
394 ty = _mm256_mul_pd(fscal,dy01);
395 tz = _mm256_mul_pd(fscal,dz01);
397 /* Update vectorial force */
398 fix0 = _mm256_add_pd(fix0,tx);
399 fiy0 = _mm256_add_pd(fiy0,ty);
400 fiz0 = _mm256_add_pd(fiz0,tz);
402 fjx1 = _mm256_add_pd(fjx1,tx);
403 fjy1 = _mm256_add_pd(fjy1,ty);
404 fjz1 = _mm256_add_pd(fjz1,tz);
408 /**************************
409 * CALCULATE INTERACTIONS *
410 **************************/
412 if (gmx_mm256_any_lt(rsq02,rcutoff2))
415 /* REACTION-FIELD ELECTROSTATICS */
416 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
417 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
419 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
421 /* Update potential sum for this i atom from the interaction with this j atom. */
422 velec = _mm256_and_pd(velec,cutoff_mask);
423 velecsum = _mm256_add_pd(velecsum,velec);
427 fscal = _mm256_and_pd(fscal,cutoff_mask);
429 /* Calculate temporary vectorial force */
430 tx = _mm256_mul_pd(fscal,dx02);
431 ty = _mm256_mul_pd(fscal,dy02);
432 tz = _mm256_mul_pd(fscal,dz02);
434 /* Update vectorial force */
435 fix0 = _mm256_add_pd(fix0,tx);
436 fiy0 = _mm256_add_pd(fiy0,ty);
437 fiz0 = _mm256_add_pd(fiz0,tz);
439 fjx2 = _mm256_add_pd(fjx2,tx);
440 fjy2 = _mm256_add_pd(fjy2,ty);
441 fjz2 = _mm256_add_pd(fjz2,tz);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
449 if (gmx_mm256_any_lt(rsq10,rcutoff2))
452 /* REACTION-FIELD ELECTROSTATICS */
453 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
454 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
456 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velec = _mm256_and_pd(velec,cutoff_mask);
460 velecsum = _mm256_add_pd(velecsum,velec);
464 fscal = _mm256_and_pd(fscal,cutoff_mask);
466 /* Calculate temporary vectorial force */
467 tx = _mm256_mul_pd(fscal,dx10);
468 ty = _mm256_mul_pd(fscal,dy10);
469 tz = _mm256_mul_pd(fscal,dz10);
471 /* Update vectorial force */
472 fix1 = _mm256_add_pd(fix1,tx);
473 fiy1 = _mm256_add_pd(fiy1,ty);
474 fiz1 = _mm256_add_pd(fiz1,tz);
476 fjx0 = _mm256_add_pd(fjx0,tx);
477 fjy0 = _mm256_add_pd(fjy0,ty);
478 fjz0 = _mm256_add_pd(fjz0,tz);
482 /**************************
483 * CALCULATE INTERACTIONS *
484 **************************/
486 if (gmx_mm256_any_lt(rsq11,rcutoff2))
489 /* REACTION-FIELD ELECTROSTATICS */
490 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
491 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
493 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
495 /* Update potential sum for this i atom from the interaction with this j atom. */
496 velec = _mm256_and_pd(velec,cutoff_mask);
497 velecsum = _mm256_add_pd(velecsum,velec);
501 fscal = _mm256_and_pd(fscal,cutoff_mask);
503 /* Calculate temporary vectorial force */
504 tx = _mm256_mul_pd(fscal,dx11);
505 ty = _mm256_mul_pd(fscal,dy11);
506 tz = _mm256_mul_pd(fscal,dz11);
508 /* Update vectorial force */
509 fix1 = _mm256_add_pd(fix1,tx);
510 fiy1 = _mm256_add_pd(fiy1,ty);
511 fiz1 = _mm256_add_pd(fiz1,tz);
513 fjx1 = _mm256_add_pd(fjx1,tx);
514 fjy1 = _mm256_add_pd(fjy1,ty);
515 fjz1 = _mm256_add_pd(fjz1,tz);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 if (gmx_mm256_any_lt(rsq12,rcutoff2))
526 /* REACTION-FIELD ELECTROSTATICS */
527 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
528 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
530 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
532 /* Update potential sum for this i atom from the interaction with this j atom. */
533 velec = _mm256_and_pd(velec,cutoff_mask);
534 velecsum = _mm256_add_pd(velecsum,velec);
538 fscal = _mm256_and_pd(fscal,cutoff_mask);
540 /* Calculate temporary vectorial force */
541 tx = _mm256_mul_pd(fscal,dx12);
542 ty = _mm256_mul_pd(fscal,dy12);
543 tz = _mm256_mul_pd(fscal,dz12);
545 /* Update vectorial force */
546 fix1 = _mm256_add_pd(fix1,tx);
547 fiy1 = _mm256_add_pd(fiy1,ty);
548 fiz1 = _mm256_add_pd(fiz1,tz);
550 fjx2 = _mm256_add_pd(fjx2,tx);
551 fjy2 = _mm256_add_pd(fjy2,ty);
552 fjz2 = _mm256_add_pd(fjz2,tz);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 if (gmx_mm256_any_lt(rsq20,rcutoff2))
563 /* REACTION-FIELD ELECTROSTATICS */
564 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
565 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
567 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velec = _mm256_and_pd(velec,cutoff_mask);
571 velecsum = _mm256_add_pd(velecsum,velec);
575 fscal = _mm256_and_pd(fscal,cutoff_mask);
577 /* Calculate temporary vectorial force */
578 tx = _mm256_mul_pd(fscal,dx20);
579 ty = _mm256_mul_pd(fscal,dy20);
580 tz = _mm256_mul_pd(fscal,dz20);
582 /* Update vectorial force */
583 fix2 = _mm256_add_pd(fix2,tx);
584 fiy2 = _mm256_add_pd(fiy2,ty);
585 fiz2 = _mm256_add_pd(fiz2,tz);
587 fjx0 = _mm256_add_pd(fjx0,tx);
588 fjy0 = _mm256_add_pd(fjy0,ty);
589 fjz0 = _mm256_add_pd(fjz0,tz);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 if (gmx_mm256_any_lt(rsq21,rcutoff2))
600 /* REACTION-FIELD ELECTROSTATICS */
601 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
602 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
604 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
606 /* Update potential sum for this i atom from the interaction with this j atom. */
607 velec = _mm256_and_pd(velec,cutoff_mask);
608 velecsum = _mm256_add_pd(velecsum,velec);
612 fscal = _mm256_and_pd(fscal,cutoff_mask);
614 /* Calculate temporary vectorial force */
615 tx = _mm256_mul_pd(fscal,dx21);
616 ty = _mm256_mul_pd(fscal,dy21);
617 tz = _mm256_mul_pd(fscal,dz21);
619 /* Update vectorial force */
620 fix2 = _mm256_add_pd(fix2,tx);
621 fiy2 = _mm256_add_pd(fiy2,ty);
622 fiz2 = _mm256_add_pd(fiz2,tz);
624 fjx1 = _mm256_add_pd(fjx1,tx);
625 fjy1 = _mm256_add_pd(fjy1,ty);
626 fjz1 = _mm256_add_pd(fjz1,tz);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 if (gmx_mm256_any_lt(rsq22,rcutoff2))
637 /* REACTION-FIELD ELECTROSTATICS */
638 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
639 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
641 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
643 /* Update potential sum for this i atom from the interaction with this j atom. */
644 velec = _mm256_and_pd(velec,cutoff_mask);
645 velecsum = _mm256_add_pd(velecsum,velec);
649 fscal = _mm256_and_pd(fscal,cutoff_mask);
651 /* Calculate temporary vectorial force */
652 tx = _mm256_mul_pd(fscal,dx22);
653 ty = _mm256_mul_pd(fscal,dy22);
654 tz = _mm256_mul_pd(fscal,dz22);
656 /* Update vectorial force */
657 fix2 = _mm256_add_pd(fix2,tx);
658 fiy2 = _mm256_add_pd(fiy2,ty);
659 fiz2 = _mm256_add_pd(fiz2,tz);
661 fjx2 = _mm256_add_pd(fjx2,tx);
662 fjy2 = _mm256_add_pd(fjy2,ty);
663 fjz2 = _mm256_add_pd(fjz2,tz);
667 fjptrA = f+j_coord_offsetA;
668 fjptrB = f+j_coord_offsetB;
669 fjptrC = f+j_coord_offsetC;
670 fjptrD = f+j_coord_offsetD;
672 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
673 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
675 /* Inner loop uses 360 flops */
681 /* Get j neighbor index, and coordinate index */
682 jnrlistA = jjnr[jidx];
683 jnrlistB = jjnr[jidx+1];
684 jnrlistC = jjnr[jidx+2];
685 jnrlistD = jjnr[jidx+3];
686 /* Sign of each element will be negative for non-real atoms.
687 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
688 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
690 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
692 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
693 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
694 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
696 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
697 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
698 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
699 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
700 j_coord_offsetA = DIM*jnrA;
701 j_coord_offsetB = DIM*jnrB;
702 j_coord_offsetC = DIM*jnrC;
703 j_coord_offsetD = DIM*jnrD;
705 /* load j atom coordinates */
706 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
707 x+j_coord_offsetC,x+j_coord_offsetD,
708 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
710 /* Calculate displacement vector */
711 dx00 = _mm256_sub_pd(ix0,jx0);
712 dy00 = _mm256_sub_pd(iy0,jy0);
713 dz00 = _mm256_sub_pd(iz0,jz0);
714 dx01 = _mm256_sub_pd(ix0,jx1);
715 dy01 = _mm256_sub_pd(iy0,jy1);
716 dz01 = _mm256_sub_pd(iz0,jz1);
717 dx02 = _mm256_sub_pd(ix0,jx2);
718 dy02 = _mm256_sub_pd(iy0,jy2);
719 dz02 = _mm256_sub_pd(iz0,jz2);
720 dx10 = _mm256_sub_pd(ix1,jx0);
721 dy10 = _mm256_sub_pd(iy1,jy0);
722 dz10 = _mm256_sub_pd(iz1,jz0);
723 dx11 = _mm256_sub_pd(ix1,jx1);
724 dy11 = _mm256_sub_pd(iy1,jy1);
725 dz11 = _mm256_sub_pd(iz1,jz1);
726 dx12 = _mm256_sub_pd(ix1,jx2);
727 dy12 = _mm256_sub_pd(iy1,jy2);
728 dz12 = _mm256_sub_pd(iz1,jz2);
729 dx20 = _mm256_sub_pd(ix2,jx0);
730 dy20 = _mm256_sub_pd(iy2,jy0);
731 dz20 = _mm256_sub_pd(iz2,jz0);
732 dx21 = _mm256_sub_pd(ix2,jx1);
733 dy21 = _mm256_sub_pd(iy2,jy1);
734 dz21 = _mm256_sub_pd(iz2,jz1);
735 dx22 = _mm256_sub_pd(ix2,jx2);
736 dy22 = _mm256_sub_pd(iy2,jy2);
737 dz22 = _mm256_sub_pd(iz2,jz2);
739 /* Calculate squared distance and things based on it */
740 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
741 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
742 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
743 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
744 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
745 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
746 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
747 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
748 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
750 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
751 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
752 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
753 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
754 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
755 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
756 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
757 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
758 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
760 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
761 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
762 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
763 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
764 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
765 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
766 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
767 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
768 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
770 fjx0 = _mm256_setzero_pd();
771 fjy0 = _mm256_setzero_pd();
772 fjz0 = _mm256_setzero_pd();
773 fjx1 = _mm256_setzero_pd();
774 fjy1 = _mm256_setzero_pd();
775 fjz1 = _mm256_setzero_pd();
776 fjx2 = _mm256_setzero_pd();
777 fjy2 = _mm256_setzero_pd();
778 fjz2 = _mm256_setzero_pd();
780 /**************************
781 * CALCULATE INTERACTIONS *
782 **************************/
784 if (gmx_mm256_any_lt(rsq00,rcutoff2))
787 r00 = _mm256_mul_pd(rsq00,rinv00);
788 r00 = _mm256_andnot_pd(dummy_mask,r00);
790 /* Calculate table index by multiplying r with table scale and truncate to integer */
791 rt = _mm256_mul_pd(r00,vftabscale);
792 vfitab = _mm256_cvttpd_epi32(rt);
793 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
794 vfitab = _mm_slli_epi32(vfitab,3);
796 /* REACTION-FIELD ELECTROSTATICS */
797 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
798 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
800 /* CUBIC SPLINE TABLE DISPERSION */
801 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
802 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
803 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
804 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
805 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
806 Heps = _mm256_mul_pd(vfeps,H);
807 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
808 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
809 vvdw6 = _mm256_mul_pd(c6_00,VV);
810 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
811 fvdw6 = _mm256_mul_pd(c6_00,FF);
813 /* CUBIC SPLINE TABLE REPULSION */
814 vfitab = _mm_add_epi32(vfitab,ifour);
815 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
816 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
817 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
818 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
819 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
820 Heps = _mm256_mul_pd(vfeps,H);
821 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
822 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
823 vvdw12 = _mm256_mul_pd(c12_00,VV);
824 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
825 fvdw12 = _mm256_mul_pd(c12_00,FF);
826 vvdw = _mm256_add_pd(vvdw12,vvdw6);
827 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
829 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
831 /* Update potential sum for this i atom from the interaction with this j atom. */
832 velec = _mm256_and_pd(velec,cutoff_mask);
833 velec = _mm256_andnot_pd(dummy_mask,velec);
834 velecsum = _mm256_add_pd(velecsum,velec);
835 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
836 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
837 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
839 fscal = _mm256_add_pd(felec,fvdw);
841 fscal = _mm256_and_pd(fscal,cutoff_mask);
843 fscal = _mm256_andnot_pd(dummy_mask,fscal);
845 /* Calculate temporary vectorial force */
846 tx = _mm256_mul_pd(fscal,dx00);
847 ty = _mm256_mul_pd(fscal,dy00);
848 tz = _mm256_mul_pd(fscal,dz00);
850 /* Update vectorial force */
851 fix0 = _mm256_add_pd(fix0,tx);
852 fiy0 = _mm256_add_pd(fiy0,ty);
853 fiz0 = _mm256_add_pd(fiz0,tz);
855 fjx0 = _mm256_add_pd(fjx0,tx);
856 fjy0 = _mm256_add_pd(fjy0,ty);
857 fjz0 = _mm256_add_pd(fjz0,tz);
861 /**************************
862 * CALCULATE INTERACTIONS *
863 **************************/
865 if (gmx_mm256_any_lt(rsq01,rcutoff2))
868 /* REACTION-FIELD ELECTROSTATICS */
869 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
870 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
872 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec = _mm256_and_pd(velec,cutoff_mask);
876 velec = _mm256_andnot_pd(dummy_mask,velec);
877 velecsum = _mm256_add_pd(velecsum,velec);
881 fscal = _mm256_and_pd(fscal,cutoff_mask);
883 fscal = _mm256_andnot_pd(dummy_mask,fscal);
885 /* Calculate temporary vectorial force */
886 tx = _mm256_mul_pd(fscal,dx01);
887 ty = _mm256_mul_pd(fscal,dy01);
888 tz = _mm256_mul_pd(fscal,dz01);
890 /* Update vectorial force */
891 fix0 = _mm256_add_pd(fix0,tx);
892 fiy0 = _mm256_add_pd(fiy0,ty);
893 fiz0 = _mm256_add_pd(fiz0,tz);
895 fjx1 = _mm256_add_pd(fjx1,tx);
896 fjy1 = _mm256_add_pd(fjy1,ty);
897 fjz1 = _mm256_add_pd(fjz1,tz);
901 /**************************
902 * CALCULATE INTERACTIONS *
903 **************************/
905 if (gmx_mm256_any_lt(rsq02,rcutoff2))
908 /* REACTION-FIELD ELECTROSTATICS */
909 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
910 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
912 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
914 /* Update potential sum for this i atom from the interaction with this j atom. */
915 velec = _mm256_and_pd(velec,cutoff_mask);
916 velec = _mm256_andnot_pd(dummy_mask,velec);
917 velecsum = _mm256_add_pd(velecsum,velec);
921 fscal = _mm256_and_pd(fscal,cutoff_mask);
923 fscal = _mm256_andnot_pd(dummy_mask,fscal);
925 /* Calculate temporary vectorial force */
926 tx = _mm256_mul_pd(fscal,dx02);
927 ty = _mm256_mul_pd(fscal,dy02);
928 tz = _mm256_mul_pd(fscal,dz02);
930 /* Update vectorial force */
931 fix0 = _mm256_add_pd(fix0,tx);
932 fiy0 = _mm256_add_pd(fiy0,ty);
933 fiz0 = _mm256_add_pd(fiz0,tz);
935 fjx2 = _mm256_add_pd(fjx2,tx);
936 fjy2 = _mm256_add_pd(fjy2,ty);
937 fjz2 = _mm256_add_pd(fjz2,tz);
941 /**************************
942 * CALCULATE INTERACTIONS *
943 **************************/
945 if (gmx_mm256_any_lt(rsq10,rcutoff2))
948 /* REACTION-FIELD ELECTROSTATICS */
949 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
950 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
952 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
954 /* Update potential sum for this i atom from the interaction with this j atom. */
955 velec = _mm256_and_pd(velec,cutoff_mask);
956 velec = _mm256_andnot_pd(dummy_mask,velec);
957 velecsum = _mm256_add_pd(velecsum,velec);
961 fscal = _mm256_and_pd(fscal,cutoff_mask);
963 fscal = _mm256_andnot_pd(dummy_mask,fscal);
965 /* Calculate temporary vectorial force */
966 tx = _mm256_mul_pd(fscal,dx10);
967 ty = _mm256_mul_pd(fscal,dy10);
968 tz = _mm256_mul_pd(fscal,dz10);
970 /* Update vectorial force */
971 fix1 = _mm256_add_pd(fix1,tx);
972 fiy1 = _mm256_add_pd(fiy1,ty);
973 fiz1 = _mm256_add_pd(fiz1,tz);
975 fjx0 = _mm256_add_pd(fjx0,tx);
976 fjy0 = _mm256_add_pd(fjy0,ty);
977 fjz0 = _mm256_add_pd(fjz0,tz);
981 /**************************
982 * CALCULATE INTERACTIONS *
983 **************************/
985 if (gmx_mm256_any_lt(rsq11,rcutoff2))
988 /* REACTION-FIELD ELECTROSTATICS */
989 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
990 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
992 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
994 /* Update potential sum for this i atom from the interaction with this j atom. */
995 velec = _mm256_and_pd(velec,cutoff_mask);
996 velec = _mm256_andnot_pd(dummy_mask,velec);
997 velecsum = _mm256_add_pd(velecsum,velec);
1001 fscal = _mm256_and_pd(fscal,cutoff_mask);
1003 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1005 /* Calculate temporary vectorial force */
1006 tx = _mm256_mul_pd(fscal,dx11);
1007 ty = _mm256_mul_pd(fscal,dy11);
1008 tz = _mm256_mul_pd(fscal,dz11);
1010 /* Update vectorial force */
1011 fix1 = _mm256_add_pd(fix1,tx);
1012 fiy1 = _mm256_add_pd(fiy1,ty);
1013 fiz1 = _mm256_add_pd(fiz1,tz);
1015 fjx1 = _mm256_add_pd(fjx1,tx);
1016 fjy1 = _mm256_add_pd(fjy1,ty);
1017 fjz1 = _mm256_add_pd(fjz1,tz);
1021 /**************************
1022 * CALCULATE INTERACTIONS *
1023 **************************/
1025 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1028 /* REACTION-FIELD ELECTROSTATICS */
1029 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
1030 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1032 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1034 /* Update potential sum for this i atom from the interaction with this j atom. */
1035 velec = _mm256_and_pd(velec,cutoff_mask);
1036 velec = _mm256_andnot_pd(dummy_mask,velec);
1037 velecsum = _mm256_add_pd(velecsum,velec);
1041 fscal = _mm256_and_pd(fscal,cutoff_mask);
1043 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1045 /* Calculate temporary vectorial force */
1046 tx = _mm256_mul_pd(fscal,dx12);
1047 ty = _mm256_mul_pd(fscal,dy12);
1048 tz = _mm256_mul_pd(fscal,dz12);
1050 /* Update vectorial force */
1051 fix1 = _mm256_add_pd(fix1,tx);
1052 fiy1 = _mm256_add_pd(fiy1,ty);
1053 fiz1 = _mm256_add_pd(fiz1,tz);
1055 fjx2 = _mm256_add_pd(fjx2,tx);
1056 fjy2 = _mm256_add_pd(fjy2,ty);
1057 fjz2 = _mm256_add_pd(fjz2,tz);
1061 /**************************
1062 * CALCULATE INTERACTIONS *
1063 **************************/
1065 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1068 /* REACTION-FIELD ELECTROSTATICS */
1069 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
1070 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1072 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1074 /* Update potential sum for this i atom from the interaction with this j atom. */
1075 velec = _mm256_and_pd(velec,cutoff_mask);
1076 velec = _mm256_andnot_pd(dummy_mask,velec);
1077 velecsum = _mm256_add_pd(velecsum,velec);
1081 fscal = _mm256_and_pd(fscal,cutoff_mask);
1083 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1085 /* Calculate temporary vectorial force */
1086 tx = _mm256_mul_pd(fscal,dx20);
1087 ty = _mm256_mul_pd(fscal,dy20);
1088 tz = _mm256_mul_pd(fscal,dz20);
1090 /* Update vectorial force */
1091 fix2 = _mm256_add_pd(fix2,tx);
1092 fiy2 = _mm256_add_pd(fiy2,ty);
1093 fiz2 = _mm256_add_pd(fiz2,tz);
1095 fjx0 = _mm256_add_pd(fjx0,tx);
1096 fjy0 = _mm256_add_pd(fjy0,ty);
1097 fjz0 = _mm256_add_pd(fjz0,tz);
1101 /**************************
1102 * CALCULATE INTERACTIONS *
1103 **************************/
1105 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1108 /* REACTION-FIELD ELECTROSTATICS */
1109 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1110 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1112 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1114 /* Update potential sum for this i atom from the interaction with this j atom. */
1115 velec = _mm256_and_pd(velec,cutoff_mask);
1116 velec = _mm256_andnot_pd(dummy_mask,velec);
1117 velecsum = _mm256_add_pd(velecsum,velec);
1121 fscal = _mm256_and_pd(fscal,cutoff_mask);
1123 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1125 /* Calculate temporary vectorial force */
1126 tx = _mm256_mul_pd(fscal,dx21);
1127 ty = _mm256_mul_pd(fscal,dy21);
1128 tz = _mm256_mul_pd(fscal,dz21);
1130 /* Update vectorial force */
1131 fix2 = _mm256_add_pd(fix2,tx);
1132 fiy2 = _mm256_add_pd(fiy2,ty);
1133 fiz2 = _mm256_add_pd(fiz2,tz);
1135 fjx1 = _mm256_add_pd(fjx1,tx);
1136 fjy1 = _mm256_add_pd(fjy1,ty);
1137 fjz1 = _mm256_add_pd(fjz1,tz);
1141 /**************************
1142 * CALCULATE INTERACTIONS *
1143 **************************/
1145 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1148 /* REACTION-FIELD ELECTROSTATICS */
1149 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1150 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1152 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1154 /* Update potential sum for this i atom from the interaction with this j atom. */
1155 velec = _mm256_and_pd(velec,cutoff_mask);
1156 velec = _mm256_andnot_pd(dummy_mask,velec);
1157 velecsum = _mm256_add_pd(velecsum,velec);
1161 fscal = _mm256_and_pd(fscal,cutoff_mask);
1163 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1165 /* Calculate temporary vectorial force */
1166 tx = _mm256_mul_pd(fscal,dx22);
1167 ty = _mm256_mul_pd(fscal,dy22);
1168 tz = _mm256_mul_pd(fscal,dz22);
1170 /* Update vectorial force */
1171 fix2 = _mm256_add_pd(fix2,tx);
1172 fiy2 = _mm256_add_pd(fiy2,ty);
1173 fiz2 = _mm256_add_pd(fiz2,tz);
1175 fjx2 = _mm256_add_pd(fjx2,tx);
1176 fjy2 = _mm256_add_pd(fjy2,ty);
1177 fjz2 = _mm256_add_pd(fjz2,tz);
1181 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1182 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1183 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1184 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1186 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1187 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1189 /* Inner loop uses 361 flops */
1192 /* End of innermost loop */
1194 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1195 f+i_coord_offset,fshift+i_shift_offset);
1198 /* Update potential energies */
1199 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1200 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1202 /* Increment number of inner iterations */
1203 inneriter += j_index_end - j_index_start;
1205 /* Outer loop uses 20 flops */
1208 /* Increment number of outer iterations */
1211 /* Update outer/inner flops */
1213 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*361);
1216 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_double
1217 * Electrostatics interaction: ReactionField
1218 * VdW interaction: CubicSplineTable
1219 * Geometry: Water3-Water3
1220 * Calculate force/pot: Force
1223 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_double
1224 (t_nblist * gmx_restrict nlist,
1225 rvec * gmx_restrict xx,
1226 rvec * gmx_restrict ff,
1227 t_forcerec * gmx_restrict fr,
1228 t_mdatoms * gmx_restrict mdatoms,
1229 nb_kernel_data_t * gmx_restrict kernel_data,
1230 t_nrnb * gmx_restrict nrnb)
1232 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1233 * just 0 for non-waters.
1234 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1235 * jnr indices corresponding to data put in the four positions in the SIMD register.
1237 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1238 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1239 int jnrA,jnrB,jnrC,jnrD;
1240 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1241 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1242 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1243 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1244 real rcutoff_scalar;
1245 real *shiftvec,*fshift,*x,*f;
1246 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1247 real scratch[4*DIM];
1248 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1249 real * vdwioffsetptr0;
1250 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1251 real * vdwioffsetptr1;
1252 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1253 real * vdwioffsetptr2;
1254 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1255 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1256 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1257 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1258 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1259 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1260 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1261 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1262 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1263 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1264 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1265 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1266 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1267 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1268 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1269 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1270 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1273 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1276 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1277 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1279 __m128i ifour = _mm_set1_epi32(4);
1280 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1282 __m256d dummy_mask,cutoff_mask;
1283 __m128 tmpmask0,tmpmask1;
1284 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1285 __m256d one = _mm256_set1_pd(1.0);
1286 __m256d two = _mm256_set1_pd(2.0);
1292 jindex = nlist->jindex;
1294 shiftidx = nlist->shift;
1296 shiftvec = fr->shift_vec[0];
1297 fshift = fr->fshift[0];
1298 facel = _mm256_set1_pd(fr->epsfac);
1299 charge = mdatoms->chargeA;
1300 krf = _mm256_set1_pd(fr->ic->k_rf);
1301 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1302 crf = _mm256_set1_pd(fr->ic->c_rf);
1303 nvdwtype = fr->ntype;
1304 vdwparam = fr->nbfp;
1305 vdwtype = mdatoms->typeA;
1307 vftab = kernel_data->table_vdw->data;
1308 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
1310 /* Setup water-specific parameters */
1311 inr = nlist->iinr[0];
1312 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1313 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1314 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1315 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1317 jq0 = _mm256_set1_pd(charge[inr+0]);
1318 jq1 = _mm256_set1_pd(charge[inr+1]);
1319 jq2 = _mm256_set1_pd(charge[inr+2]);
1320 vdwjidx0A = 2*vdwtype[inr+0];
1321 qq00 = _mm256_mul_pd(iq0,jq0);
1322 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1323 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1324 qq01 = _mm256_mul_pd(iq0,jq1);
1325 qq02 = _mm256_mul_pd(iq0,jq2);
1326 qq10 = _mm256_mul_pd(iq1,jq0);
1327 qq11 = _mm256_mul_pd(iq1,jq1);
1328 qq12 = _mm256_mul_pd(iq1,jq2);
1329 qq20 = _mm256_mul_pd(iq2,jq0);
1330 qq21 = _mm256_mul_pd(iq2,jq1);
1331 qq22 = _mm256_mul_pd(iq2,jq2);
1333 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1334 rcutoff_scalar = fr->rcoulomb;
1335 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1336 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1338 /* Avoid stupid compiler warnings */
1339 jnrA = jnrB = jnrC = jnrD = 0;
1340 j_coord_offsetA = 0;
1341 j_coord_offsetB = 0;
1342 j_coord_offsetC = 0;
1343 j_coord_offsetD = 0;
1348 for(iidx=0;iidx<4*DIM;iidx++)
1350 scratch[iidx] = 0.0;
1353 /* Start outer loop over neighborlists */
1354 for(iidx=0; iidx<nri; iidx++)
1356 /* Load shift vector for this list */
1357 i_shift_offset = DIM*shiftidx[iidx];
1359 /* Load limits for loop over neighbors */
1360 j_index_start = jindex[iidx];
1361 j_index_end = jindex[iidx+1];
1363 /* Get outer coordinate index */
1365 i_coord_offset = DIM*inr;
1367 /* Load i particle coords and add shift vector */
1368 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1369 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1371 fix0 = _mm256_setzero_pd();
1372 fiy0 = _mm256_setzero_pd();
1373 fiz0 = _mm256_setzero_pd();
1374 fix1 = _mm256_setzero_pd();
1375 fiy1 = _mm256_setzero_pd();
1376 fiz1 = _mm256_setzero_pd();
1377 fix2 = _mm256_setzero_pd();
1378 fiy2 = _mm256_setzero_pd();
1379 fiz2 = _mm256_setzero_pd();
1381 /* Start inner kernel loop */
1382 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1385 /* Get j neighbor index, and coordinate index */
1387 jnrB = jjnr[jidx+1];
1388 jnrC = jjnr[jidx+2];
1389 jnrD = jjnr[jidx+3];
1390 j_coord_offsetA = DIM*jnrA;
1391 j_coord_offsetB = DIM*jnrB;
1392 j_coord_offsetC = DIM*jnrC;
1393 j_coord_offsetD = DIM*jnrD;
1395 /* load j atom coordinates */
1396 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1397 x+j_coord_offsetC,x+j_coord_offsetD,
1398 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1400 /* Calculate displacement vector */
1401 dx00 = _mm256_sub_pd(ix0,jx0);
1402 dy00 = _mm256_sub_pd(iy0,jy0);
1403 dz00 = _mm256_sub_pd(iz0,jz0);
1404 dx01 = _mm256_sub_pd(ix0,jx1);
1405 dy01 = _mm256_sub_pd(iy0,jy1);
1406 dz01 = _mm256_sub_pd(iz0,jz1);
1407 dx02 = _mm256_sub_pd(ix0,jx2);
1408 dy02 = _mm256_sub_pd(iy0,jy2);
1409 dz02 = _mm256_sub_pd(iz0,jz2);
1410 dx10 = _mm256_sub_pd(ix1,jx0);
1411 dy10 = _mm256_sub_pd(iy1,jy0);
1412 dz10 = _mm256_sub_pd(iz1,jz0);
1413 dx11 = _mm256_sub_pd(ix1,jx1);
1414 dy11 = _mm256_sub_pd(iy1,jy1);
1415 dz11 = _mm256_sub_pd(iz1,jz1);
1416 dx12 = _mm256_sub_pd(ix1,jx2);
1417 dy12 = _mm256_sub_pd(iy1,jy2);
1418 dz12 = _mm256_sub_pd(iz1,jz2);
1419 dx20 = _mm256_sub_pd(ix2,jx0);
1420 dy20 = _mm256_sub_pd(iy2,jy0);
1421 dz20 = _mm256_sub_pd(iz2,jz0);
1422 dx21 = _mm256_sub_pd(ix2,jx1);
1423 dy21 = _mm256_sub_pd(iy2,jy1);
1424 dz21 = _mm256_sub_pd(iz2,jz1);
1425 dx22 = _mm256_sub_pd(ix2,jx2);
1426 dy22 = _mm256_sub_pd(iy2,jy2);
1427 dz22 = _mm256_sub_pd(iz2,jz2);
1429 /* Calculate squared distance and things based on it */
1430 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1431 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1432 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1433 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1434 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1435 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1436 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1437 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1438 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1440 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1441 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1442 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1443 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1444 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1445 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1446 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1447 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1448 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1450 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1451 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1452 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1453 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1454 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1455 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1456 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1457 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1458 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1460 fjx0 = _mm256_setzero_pd();
1461 fjy0 = _mm256_setzero_pd();
1462 fjz0 = _mm256_setzero_pd();
1463 fjx1 = _mm256_setzero_pd();
1464 fjy1 = _mm256_setzero_pd();
1465 fjz1 = _mm256_setzero_pd();
1466 fjx2 = _mm256_setzero_pd();
1467 fjy2 = _mm256_setzero_pd();
1468 fjz2 = _mm256_setzero_pd();
1470 /**************************
1471 * CALCULATE INTERACTIONS *
1472 **************************/
1474 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1477 r00 = _mm256_mul_pd(rsq00,rinv00);
1479 /* Calculate table index by multiplying r with table scale and truncate to integer */
1480 rt = _mm256_mul_pd(r00,vftabscale);
1481 vfitab = _mm256_cvttpd_epi32(rt);
1482 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1483 vfitab = _mm_slli_epi32(vfitab,3);
1485 /* REACTION-FIELD ELECTROSTATICS */
1486 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1488 /* CUBIC SPLINE TABLE DISPERSION */
1489 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1490 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1491 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1492 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1493 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1494 Heps = _mm256_mul_pd(vfeps,H);
1495 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1496 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1497 fvdw6 = _mm256_mul_pd(c6_00,FF);
1499 /* CUBIC SPLINE TABLE REPULSION */
1500 vfitab = _mm_add_epi32(vfitab,ifour);
1501 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1502 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1503 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1504 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1505 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1506 Heps = _mm256_mul_pd(vfeps,H);
1507 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1508 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1509 fvdw12 = _mm256_mul_pd(c12_00,FF);
1510 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1512 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1514 fscal = _mm256_add_pd(felec,fvdw);
1516 fscal = _mm256_and_pd(fscal,cutoff_mask);
1518 /* Calculate temporary vectorial force */
1519 tx = _mm256_mul_pd(fscal,dx00);
1520 ty = _mm256_mul_pd(fscal,dy00);
1521 tz = _mm256_mul_pd(fscal,dz00);
1523 /* Update vectorial force */
1524 fix0 = _mm256_add_pd(fix0,tx);
1525 fiy0 = _mm256_add_pd(fiy0,ty);
1526 fiz0 = _mm256_add_pd(fiz0,tz);
1528 fjx0 = _mm256_add_pd(fjx0,tx);
1529 fjy0 = _mm256_add_pd(fjy0,ty);
1530 fjz0 = _mm256_add_pd(fjz0,tz);
1534 /**************************
1535 * CALCULATE INTERACTIONS *
1536 **************************/
1538 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1541 /* REACTION-FIELD ELECTROSTATICS */
1542 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1544 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1548 fscal = _mm256_and_pd(fscal,cutoff_mask);
1550 /* Calculate temporary vectorial force */
1551 tx = _mm256_mul_pd(fscal,dx01);
1552 ty = _mm256_mul_pd(fscal,dy01);
1553 tz = _mm256_mul_pd(fscal,dz01);
1555 /* Update vectorial force */
1556 fix0 = _mm256_add_pd(fix0,tx);
1557 fiy0 = _mm256_add_pd(fiy0,ty);
1558 fiz0 = _mm256_add_pd(fiz0,tz);
1560 fjx1 = _mm256_add_pd(fjx1,tx);
1561 fjy1 = _mm256_add_pd(fjy1,ty);
1562 fjz1 = _mm256_add_pd(fjz1,tz);
1566 /**************************
1567 * CALCULATE INTERACTIONS *
1568 **************************/
1570 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1573 /* REACTION-FIELD ELECTROSTATICS */
1574 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1576 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1580 fscal = _mm256_and_pd(fscal,cutoff_mask);
1582 /* Calculate temporary vectorial force */
1583 tx = _mm256_mul_pd(fscal,dx02);
1584 ty = _mm256_mul_pd(fscal,dy02);
1585 tz = _mm256_mul_pd(fscal,dz02);
1587 /* Update vectorial force */
1588 fix0 = _mm256_add_pd(fix0,tx);
1589 fiy0 = _mm256_add_pd(fiy0,ty);
1590 fiz0 = _mm256_add_pd(fiz0,tz);
1592 fjx2 = _mm256_add_pd(fjx2,tx);
1593 fjy2 = _mm256_add_pd(fjy2,ty);
1594 fjz2 = _mm256_add_pd(fjz2,tz);
1598 /**************************
1599 * CALCULATE INTERACTIONS *
1600 **************************/
1602 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1605 /* REACTION-FIELD ELECTROSTATICS */
1606 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1608 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1612 fscal = _mm256_and_pd(fscal,cutoff_mask);
1614 /* Calculate temporary vectorial force */
1615 tx = _mm256_mul_pd(fscal,dx10);
1616 ty = _mm256_mul_pd(fscal,dy10);
1617 tz = _mm256_mul_pd(fscal,dz10);
1619 /* Update vectorial force */
1620 fix1 = _mm256_add_pd(fix1,tx);
1621 fiy1 = _mm256_add_pd(fiy1,ty);
1622 fiz1 = _mm256_add_pd(fiz1,tz);
1624 fjx0 = _mm256_add_pd(fjx0,tx);
1625 fjy0 = _mm256_add_pd(fjy0,ty);
1626 fjz0 = _mm256_add_pd(fjz0,tz);
1630 /**************************
1631 * CALCULATE INTERACTIONS *
1632 **************************/
1634 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1637 /* REACTION-FIELD ELECTROSTATICS */
1638 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1640 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1644 fscal = _mm256_and_pd(fscal,cutoff_mask);
1646 /* Calculate temporary vectorial force */
1647 tx = _mm256_mul_pd(fscal,dx11);
1648 ty = _mm256_mul_pd(fscal,dy11);
1649 tz = _mm256_mul_pd(fscal,dz11);
1651 /* Update vectorial force */
1652 fix1 = _mm256_add_pd(fix1,tx);
1653 fiy1 = _mm256_add_pd(fiy1,ty);
1654 fiz1 = _mm256_add_pd(fiz1,tz);
1656 fjx1 = _mm256_add_pd(fjx1,tx);
1657 fjy1 = _mm256_add_pd(fjy1,ty);
1658 fjz1 = _mm256_add_pd(fjz1,tz);
1662 /**************************
1663 * CALCULATE INTERACTIONS *
1664 **************************/
1666 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1669 /* REACTION-FIELD ELECTROSTATICS */
1670 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1672 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1676 fscal = _mm256_and_pd(fscal,cutoff_mask);
1678 /* Calculate temporary vectorial force */
1679 tx = _mm256_mul_pd(fscal,dx12);
1680 ty = _mm256_mul_pd(fscal,dy12);
1681 tz = _mm256_mul_pd(fscal,dz12);
1683 /* Update vectorial force */
1684 fix1 = _mm256_add_pd(fix1,tx);
1685 fiy1 = _mm256_add_pd(fiy1,ty);
1686 fiz1 = _mm256_add_pd(fiz1,tz);
1688 fjx2 = _mm256_add_pd(fjx2,tx);
1689 fjy2 = _mm256_add_pd(fjy2,ty);
1690 fjz2 = _mm256_add_pd(fjz2,tz);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1701 /* REACTION-FIELD ELECTROSTATICS */
1702 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1704 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1708 fscal = _mm256_and_pd(fscal,cutoff_mask);
1710 /* Calculate temporary vectorial force */
1711 tx = _mm256_mul_pd(fscal,dx20);
1712 ty = _mm256_mul_pd(fscal,dy20);
1713 tz = _mm256_mul_pd(fscal,dz20);
1715 /* Update vectorial force */
1716 fix2 = _mm256_add_pd(fix2,tx);
1717 fiy2 = _mm256_add_pd(fiy2,ty);
1718 fiz2 = _mm256_add_pd(fiz2,tz);
1720 fjx0 = _mm256_add_pd(fjx0,tx);
1721 fjy0 = _mm256_add_pd(fjy0,ty);
1722 fjz0 = _mm256_add_pd(fjz0,tz);
1726 /**************************
1727 * CALCULATE INTERACTIONS *
1728 **************************/
1730 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1733 /* REACTION-FIELD ELECTROSTATICS */
1734 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1736 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1740 fscal = _mm256_and_pd(fscal,cutoff_mask);
1742 /* Calculate temporary vectorial force */
1743 tx = _mm256_mul_pd(fscal,dx21);
1744 ty = _mm256_mul_pd(fscal,dy21);
1745 tz = _mm256_mul_pd(fscal,dz21);
1747 /* Update vectorial force */
1748 fix2 = _mm256_add_pd(fix2,tx);
1749 fiy2 = _mm256_add_pd(fiy2,ty);
1750 fiz2 = _mm256_add_pd(fiz2,tz);
1752 fjx1 = _mm256_add_pd(fjx1,tx);
1753 fjy1 = _mm256_add_pd(fjy1,ty);
1754 fjz1 = _mm256_add_pd(fjz1,tz);
1758 /**************************
1759 * CALCULATE INTERACTIONS *
1760 **************************/
1762 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1765 /* REACTION-FIELD ELECTROSTATICS */
1766 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1768 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1772 fscal = _mm256_and_pd(fscal,cutoff_mask);
1774 /* Calculate temporary vectorial force */
1775 tx = _mm256_mul_pd(fscal,dx22);
1776 ty = _mm256_mul_pd(fscal,dy22);
1777 tz = _mm256_mul_pd(fscal,dz22);
1779 /* Update vectorial force */
1780 fix2 = _mm256_add_pd(fix2,tx);
1781 fiy2 = _mm256_add_pd(fiy2,ty);
1782 fiz2 = _mm256_add_pd(fiz2,tz);
1784 fjx2 = _mm256_add_pd(fjx2,tx);
1785 fjy2 = _mm256_add_pd(fjy2,ty);
1786 fjz2 = _mm256_add_pd(fjz2,tz);
1790 fjptrA = f+j_coord_offsetA;
1791 fjptrB = f+j_coord_offsetB;
1792 fjptrC = f+j_coord_offsetC;
1793 fjptrD = f+j_coord_offsetD;
1795 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1796 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1798 /* Inner loop uses 297 flops */
1801 if(jidx<j_index_end)
1804 /* Get j neighbor index, and coordinate index */
1805 jnrlistA = jjnr[jidx];
1806 jnrlistB = jjnr[jidx+1];
1807 jnrlistC = jjnr[jidx+2];
1808 jnrlistD = jjnr[jidx+3];
1809 /* Sign of each element will be negative for non-real atoms.
1810 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1811 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1813 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1815 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1816 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1817 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1819 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1820 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1821 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1822 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1823 j_coord_offsetA = DIM*jnrA;
1824 j_coord_offsetB = DIM*jnrB;
1825 j_coord_offsetC = DIM*jnrC;
1826 j_coord_offsetD = DIM*jnrD;
1828 /* load j atom coordinates */
1829 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1830 x+j_coord_offsetC,x+j_coord_offsetD,
1831 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1833 /* Calculate displacement vector */
1834 dx00 = _mm256_sub_pd(ix0,jx0);
1835 dy00 = _mm256_sub_pd(iy0,jy0);
1836 dz00 = _mm256_sub_pd(iz0,jz0);
1837 dx01 = _mm256_sub_pd(ix0,jx1);
1838 dy01 = _mm256_sub_pd(iy0,jy1);
1839 dz01 = _mm256_sub_pd(iz0,jz1);
1840 dx02 = _mm256_sub_pd(ix0,jx2);
1841 dy02 = _mm256_sub_pd(iy0,jy2);
1842 dz02 = _mm256_sub_pd(iz0,jz2);
1843 dx10 = _mm256_sub_pd(ix1,jx0);
1844 dy10 = _mm256_sub_pd(iy1,jy0);
1845 dz10 = _mm256_sub_pd(iz1,jz0);
1846 dx11 = _mm256_sub_pd(ix1,jx1);
1847 dy11 = _mm256_sub_pd(iy1,jy1);
1848 dz11 = _mm256_sub_pd(iz1,jz1);
1849 dx12 = _mm256_sub_pd(ix1,jx2);
1850 dy12 = _mm256_sub_pd(iy1,jy2);
1851 dz12 = _mm256_sub_pd(iz1,jz2);
1852 dx20 = _mm256_sub_pd(ix2,jx0);
1853 dy20 = _mm256_sub_pd(iy2,jy0);
1854 dz20 = _mm256_sub_pd(iz2,jz0);
1855 dx21 = _mm256_sub_pd(ix2,jx1);
1856 dy21 = _mm256_sub_pd(iy2,jy1);
1857 dz21 = _mm256_sub_pd(iz2,jz1);
1858 dx22 = _mm256_sub_pd(ix2,jx2);
1859 dy22 = _mm256_sub_pd(iy2,jy2);
1860 dz22 = _mm256_sub_pd(iz2,jz2);
1862 /* Calculate squared distance and things based on it */
1863 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1864 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1865 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1866 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1867 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1868 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1869 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1870 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1871 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1873 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1874 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1875 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1876 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1877 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1878 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1879 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1880 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1881 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1883 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1884 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1885 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1886 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1887 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1888 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1889 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1890 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1891 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1893 fjx0 = _mm256_setzero_pd();
1894 fjy0 = _mm256_setzero_pd();
1895 fjz0 = _mm256_setzero_pd();
1896 fjx1 = _mm256_setzero_pd();
1897 fjy1 = _mm256_setzero_pd();
1898 fjz1 = _mm256_setzero_pd();
1899 fjx2 = _mm256_setzero_pd();
1900 fjy2 = _mm256_setzero_pd();
1901 fjz2 = _mm256_setzero_pd();
1903 /**************************
1904 * CALCULATE INTERACTIONS *
1905 **************************/
1907 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1910 r00 = _mm256_mul_pd(rsq00,rinv00);
1911 r00 = _mm256_andnot_pd(dummy_mask,r00);
1913 /* Calculate table index by multiplying r with table scale and truncate to integer */
1914 rt = _mm256_mul_pd(r00,vftabscale);
1915 vfitab = _mm256_cvttpd_epi32(rt);
1916 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1917 vfitab = _mm_slli_epi32(vfitab,3);
1919 /* REACTION-FIELD ELECTROSTATICS */
1920 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1922 /* CUBIC SPLINE TABLE DISPERSION */
1923 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1924 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1925 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1926 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1927 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1928 Heps = _mm256_mul_pd(vfeps,H);
1929 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1930 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1931 fvdw6 = _mm256_mul_pd(c6_00,FF);
1933 /* CUBIC SPLINE TABLE REPULSION */
1934 vfitab = _mm_add_epi32(vfitab,ifour);
1935 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1936 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1937 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1938 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1939 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1940 Heps = _mm256_mul_pd(vfeps,H);
1941 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1942 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1943 fvdw12 = _mm256_mul_pd(c12_00,FF);
1944 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1946 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1948 fscal = _mm256_add_pd(felec,fvdw);
1950 fscal = _mm256_and_pd(fscal,cutoff_mask);
1952 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1954 /* Calculate temporary vectorial force */
1955 tx = _mm256_mul_pd(fscal,dx00);
1956 ty = _mm256_mul_pd(fscal,dy00);
1957 tz = _mm256_mul_pd(fscal,dz00);
1959 /* Update vectorial force */
1960 fix0 = _mm256_add_pd(fix0,tx);
1961 fiy0 = _mm256_add_pd(fiy0,ty);
1962 fiz0 = _mm256_add_pd(fiz0,tz);
1964 fjx0 = _mm256_add_pd(fjx0,tx);
1965 fjy0 = _mm256_add_pd(fjy0,ty);
1966 fjz0 = _mm256_add_pd(fjz0,tz);
1970 /**************************
1971 * CALCULATE INTERACTIONS *
1972 **************************/
1974 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1977 /* REACTION-FIELD ELECTROSTATICS */
1978 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1980 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1984 fscal = _mm256_and_pd(fscal,cutoff_mask);
1986 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1988 /* Calculate temporary vectorial force */
1989 tx = _mm256_mul_pd(fscal,dx01);
1990 ty = _mm256_mul_pd(fscal,dy01);
1991 tz = _mm256_mul_pd(fscal,dz01);
1993 /* Update vectorial force */
1994 fix0 = _mm256_add_pd(fix0,tx);
1995 fiy0 = _mm256_add_pd(fiy0,ty);
1996 fiz0 = _mm256_add_pd(fiz0,tz);
1998 fjx1 = _mm256_add_pd(fjx1,tx);
1999 fjy1 = _mm256_add_pd(fjy1,ty);
2000 fjz1 = _mm256_add_pd(fjz1,tz);
2004 /**************************
2005 * CALCULATE INTERACTIONS *
2006 **************************/
2008 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2011 /* REACTION-FIELD ELECTROSTATICS */
2012 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
2014 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2018 fscal = _mm256_and_pd(fscal,cutoff_mask);
2020 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2022 /* Calculate temporary vectorial force */
2023 tx = _mm256_mul_pd(fscal,dx02);
2024 ty = _mm256_mul_pd(fscal,dy02);
2025 tz = _mm256_mul_pd(fscal,dz02);
2027 /* Update vectorial force */
2028 fix0 = _mm256_add_pd(fix0,tx);
2029 fiy0 = _mm256_add_pd(fiy0,ty);
2030 fiz0 = _mm256_add_pd(fiz0,tz);
2032 fjx2 = _mm256_add_pd(fjx2,tx);
2033 fjy2 = _mm256_add_pd(fjy2,ty);
2034 fjz2 = _mm256_add_pd(fjz2,tz);
2038 /**************************
2039 * CALCULATE INTERACTIONS *
2040 **************************/
2042 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2045 /* REACTION-FIELD ELECTROSTATICS */
2046 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
2048 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2052 fscal = _mm256_and_pd(fscal,cutoff_mask);
2054 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2056 /* Calculate temporary vectorial force */
2057 tx = _mm256_mul_pd(fscal,dx10);
2058 ty = _mm256_mul_pd(fscal,dy10);
2059 tz = _mm256_mul_pd(fscal,dz10);
2061 /* Update vectorial force */
2062 fix1 = _mm256_add_pd(fix1,tx);
2063 fiy1 = _mm256_add_pd(fiy1,ty);
2064 fiz1 = _mm256_add_pd(fiz1,tz);
2066 fjx0 = _mm256_add_pd(fjx0,tx);
2067 fjy0 = _mm256_add_pd(fjy0,ty);
2068 fjz0 = _mm256_add_pd(fjz0,tz);
2072 /**************************
2073 * CALCULATE INTERACTIONS *
2074 **************************/
2076 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2079 /* REACTION-FIELD ELECTROSTATICS */
2080 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2082 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2086 fscal = _mm256_and_pd(fscal,cutoff_mask);
2088 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2090 /* Calculate temporary vectorial force */
2091 tx = _mm256_mul_pd(fscal,dx11);
2092 ty = _mm256_mul_pd(fscal,dy11);
2093 tz = _mm256_mul_pd(fscal,dz11);
2095 /* Update vectorial force */
2096 fix1 = _mm256_add_pd(fix1,tx);
2097 fiy1 = _mm256_add_pd(fiy1,ty);
2098 fiz1 = _mm256_add_pd(fiz1,tz);
2100 fjx1 = _mm256_add_pd(fjx1,tx);
2101 fjy1 = _mm256_add_pd(fjy1,ty);
2102 fjz1 = _mm256_add_pd(fjz1,tz);
2106 /**************************
2107 * CALCULATE INTERACTIONS *
2108 **************************/
2110 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2113 /* REACTION-FIELD ELECTROSTATICS */
2114 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2116 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2120 fscal = _mm256_and_pd(fscal,cutoff_mask);
2122 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2124 /* Calculate temporary vectorial force */
2125 tx = _mm256_mul_pd(fscal,dx12);
2126 ty = _mm256_mul_pd(fscal,dy12);
2127 tz = _mm256_mul_pd(fscal,dz12);
2129 /* Update vectorial force */
2130 fix1 = _mm256_add_pd(fix1,tx);
2131 fiy1 = _mm256_add_pd(fiy1,ty);
2132 fiz1 = _mm256_add_pd(fiz1,tz);
2134 fjx2 = _mm256_add_pd(fjx2,tx);
2135 fjy2 = _mm256_add_pd(fjy2,ty);
2136 fjz2 = _mm256_add_pd(fjz2,tz);
2140 /**************************
2141 * CALCULATE INTERACTIONS *
2142 **************************/
2144 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2147 /* REACTION-FIELD ELECTROSTATICS */
2148 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
2150 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2154 fscal = _mm256_and_pd(fscal,cutoff_mask);
2156 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2158 /* Calculate temporary vectorial force */
2159 tx = _mm256_mul_pd(fscal,dx20);
2160 ty = _mm256_mul_pd(fscal,dy20);
2161 tz = _mm256_mul_pd(fscal,dz20);
2163 /* Update vectorial force */
2164 fix2 = _mm256_add_pd(fix2,tx);
2165 fiy2 = _mm256_add_pd(fiy2,ty);
2166 fiz2 = _mm256_add_pd(fiz2,tz);
2168 fjx0 = _mm256_add_pd(fjx0,tx);
2169 fjy0 = _mm256_add_pd(fjy0,ty);
2170 fjz0 = _mm256_add_pd(fjz0,tz);
2174 /**************************
2175 * CALCULATE INTERACTIONS *
2176 **************************/
2178 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2181 /* REACTION-FIELD ELECTROSTATICS */
2182 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2184 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2188 fscal = _mm256_and_pd(fscal,cutoff_mask);
2190 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2192 /* Calculate temporary vectorial force */
2193 tx = _mm256_mul_pd(fscal,dx21);
2194 ty = _mm256_mul_pd(fscal,dy21);
2195 tz = _mm256_mul_pd(fscal,dz21);
2197 /* Update vectorial force */
2198 fix2 = _mm256_add_pd(fix2,tx);
2199 fiy2 = _mm256_add_pd(fiy2,ty);
2200 fiz2 = _mm256_add_pd(fiz2,tz);
2202 fjx1 = _mm256_add_pd(fjx1,tx);
2203 fjy1 = _mm256_add_pd(fjy1,ty);
2204 fjz1 = _mm256_add_pd(fjz1,tz);
2208 /**************************
2209 * CALCULATE INTERACTIONS *
2210 **************************/
2212 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2215 /* REACTION-FIELD ELECTROSTATICS */
2216 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2218 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2222 fscal = _mm256_and_pd(fscal,cutoff_mask);
2224 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2226 /* Calculate temporary vectorial force */
2227 tx = _mm256_mul_pd(fscal,dx22);
2228 ty = _mm256_mul_pd(fscal,dy22);
2229 tz = _mm256_mul_pd(fscal,dz22);
2231 /* Update vectorial force */
2232 fix2 = _mm256_add_pd(fix2,tx);
2233 fiy2 = _mm256_add_pd(fiy2,ty);
2234 fiz2 = _mm256_add_pd(fiz2,tz);
2236 fjx2 = _mm256_add_pd(fjx2,tx);
2237 fjy2 = _mm256_add_pd(fjy2,ty);
2238 fjz2 = _mm256_add_pd(fjz2,tz);
2242 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2243 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2244 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2245 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2247 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2248 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2250 /* Inner loop uses 298 flops */
2253 /* End of innermost loop */
2255 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2256 f+i_coord_offset,fshift+i_shift_offset);
2258 /* Increment number of inner iterations */
2259 inneriter += j_index_end - j_index_start;
2261 /* Outer loop uses 18 flops */
2264 /* Increment number of outer iterations */
2267 /* Update outer/inner flops */
2269 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*298);