2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
105 __m128i ifour = _mm_set1_epi32(4);
106 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
108 __m256d dummy_mask,cutoff_mask;
109 __m128 tmpmask0,tmpmask1;
110 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
111 __m256d one = _mm256_set1_pd(1.0);
112 __m256d two = _mm256_set1_pd(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm256_set1_pd(fr->epsfac);
125 charge = mdatoms->chargeA;
126 krf = _mm256_set1_pd(fr->ic->k_rf);
127 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
128 crf = _mm256_set1_pd(fr->ic->c_rf);
129 nvdwtype = fr->ntype;
131 vdwtype = mdatoms->typeA;
133 vftab = kernel_data->table_vdw->data;
134 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
136 /* Setup water-specific parameters */
137 inr = nlist->iinr[0];
138 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
139 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
140 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
141 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
143 jq1 = _mm256_set1_pd(charge[inr+1]);
144 jq2 = _mm256_set1_pd(charge[inr+2]);
145 jq3 = _mm256_set1_pd(charge[inr+3]);
146 vdwjidx0A = 2*vdwtype[inr+0];
147 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
148 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
149 qq11 = _mm256_mul_pd(iq1,jq1);
150 qq12 = _mm256_mul_pd(iq1,jq2);
151 qq13 = _mm256_mul_pd(iq1,jq3);
152 qq21 = _mm256_mul_pd(iq2,jq1);
153 qq22 = _mm256_mul_pd(iq2,jq2);
154 qq23 = _mm256_mul_pd(iq2,jq3);
155 qq31 = _mm256_mul_pd(iq3,jq1);
156 qq32 = _mm256_mul_pd(iq3,jq2);
157 qq33 = _mm256_mul_pd(iq3,jq3);
159 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
160 rcutoff_scalar = fr->rcoulomb;
161 rcutoff = _mm256_set1_pd(rcutoff_scalar);
162 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
164 /* Avoid stupid compiler warnings */
165 jnrA = jnrB = jnrC = jnrD = 0;
174 for(iidx=0;iidx<4*DIM;iidx++)
179 /* Start outer loop over neighborlists */
180 for(iidx=0; iidx<nri; iidx++)
182 /* Load shift vector for this list */
183 i_shift_offset = DIM*shiftidx[iidx];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
195 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
197 fix0 = _mm256_setzero_pd();
198 fiy0 = _mm256_setzero_pd();
199 fiz0 = _mm256_setzero_pd();
200 fix1 = _mm256_setzero_pd();
201 fiy1 = _mm256_setzero_pd();
202 fiz1 = _mm256_setzero_pd();
203 fix2 = _mm256_setzero_pd();
204 fiy2 = _mm256_setzero_pd();
205 fiz2 = _mm256_setzero_pd();
206 fix3 = _mm256_setzero_pd();
207 fiy3 = _mm256_setzero_pd();
208 fiz3 = _mm256_setzero_pd();
210 /* Reset potential sums */
211 velecsum = _mm256_setzero_pd();
212 vvdwsum = _mm256_setzero_pd();
214 /* Start inner kernel loop */
215 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
218 /* Get j neighbor index, and coordinate index */
223 j_coord_offsetA = DIM*jnrA;
224 j_coord_offsetB = DIM*jnrB;
225 j_coord_offsetC = DIM*jnrC;
226 j_coord_offsetD = DIM*jnrD;
228 /* load j atom coordinates */
229 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
230 x+j_coord_offsetC,x+j_coord_offsetD,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
232 &jy2,&jz2,&jx3,&jy3,&jz3);
234 /* Calculate displacement vector */
235 dx00 = _mm256_sub_pd(ix0,jx0);
236 dy00 = _mm256_sub_pd(iy0,jy0);
237 dz00 = _mm256_sub_pd(iz0,jz0);
238 dx11 = _mm256_sub_pd(ix1,jx1);
239 dy11 = _mm256_sub_pd(iy1,jy1);
240 dz11 = _mm256_sub_pd(iz1,jz1);
241 dx12 = _mm256_sub_pd(ix1,jx2);
242 dy12 = _mm256_sub_pd(iy1,jy2);
243 dz12 = _mm256_sub_pd(iz1,jz2);
244 dx13 = _mm256_sub_pd(ix1,jx3);
245 dy13 = _mm256_sub_pd(iy1,jy3);
246 dz13 = _mm256_sub_pd(iz1,jz3);
247 dx21 = _mm256_sub_pd(ix2,jx1);
248 dy21 = _mm256_sub_pd(iy2,jy1);
249 dz21 = _mm256_sub_pd(iz2,jz1);
250 dx22 = _mm256_sub_pd(ix2,jx2);
251 dy22 = _mm256_sub_pd(iy2,jy2);
252 dz22 = _mm256_sub_pd(iz2,jz2);
253 dx23 = _mm256_sub_pd(ix2,jx3);
254 dy23 = _mm256_sub_pd(iy2,jy3);
255 dz23 = _mm256_sub_pd(iz2,jz3);
256 dx31 = _mm256_sub_pd(ix3,jx1);
257 dy31 = _mm256_sub_pd(iy3,jy1);
258 dz31 = _mm256_sub_pd(iz3,jz1);
259 dx32 = _mm256_sub_pd(ix3,jx2);
260 dy32 = _mm256_sub_pd(iy3,jy2);
261 dz32 = _mm256_sub_pd(iz3,jz2);
262 dx33 = _mm256_sub_pd(ix3,jx3);
263 dy33 = _mm256_sub_pd(iy3,jy3);
264 dz33 = _mm256_sub_pd(iz3,jz3);
266 /* Calculate squared distance and things based on it */
267 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
268 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
269 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
270 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
271 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
272 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
273 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
274 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
275 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
276 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
278 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
279 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
280 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
281 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
282 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
283 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
284 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
285 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
286 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
287 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
289 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
290 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
291 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
292 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
293 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
294 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
295 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
296 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
297 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
299 fjx0 = _mm256_setzero_pd();
300 fjy0 = _mm256_setzero_pd();
301 fjz0 = _mm256_setzero_pd();
302 fjx1 = _mm256_setzero_pd();
303 fjy1 = _mm256_setzero_pd();
304 fjz1 = _mm256_setzero_pd();
305 fjx2 = _mm256_setzero_pd();
306 fjy2 = _mm256_setzero_pd();
307 fjz2 = _mm256_setzero_pd();
308 fjx3 = _mm256_setzero_pd();
309 fjy3 = _mm256_setzero_pd();
310 fjz3 = _mm256_setzero_pd();
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm256_any_lt(rsq00,rcutoff2))
319 r00 = _mm256_mul_pd(rsq00,rinv00);
321 /* Calculate table index by multiplying r with table scale and truncate to integer */
322 rt = _mm256_mul_pd(r00,vftabscale);
323 vfitab = _mm256_cvttpd_epi32(rt);
324 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
325 vfitab = _mm_slli_epi32(vfitab,3);
327 /* CUBIC SPLINE TABLE DISPERSION */
328 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
329 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
330 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
331 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
332 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
333 Heps = _mm256_mul_pd(vfeps,H);
334 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
335 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
336 vvdw6 = _mm256_mul_pd(c6_00,VV);
337 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
338 fvdw6 = _mm256_mul_pd(c6_00,FF);
340 /* CUBIC SPLINE TABLE REPULSION */
341 vfitab = _mm_add_epi32(vfitab,ifour);
342 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
343 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
344 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
345 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
346 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
347 Heps = _mm256_mul_pd(vfeps,H);
348 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
349 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
350 vvdw12 = _mm256_mul_pd(c12_00,VV);
351 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
352 fvdw12 = _mm256_mul_pd(c12_00,FF);
353 vvdw = _mm256_add_pd(vvdw12,vvdw6);
354 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
356 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
358 /* Update potential sum for this i atom from the interaction with this j atom. */
359 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
360 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
364 fscal = _mm256_and_pd(fscal,cutoff_mask);
366 /* Calculate temporary vectorial force */
367 tx = _mm256_mul_pd(fscal,dx00);
368 ty = _mm256_mul_pd(fscal,dy00);
369 tz = _mm256_mul_pd(fscal,dz00);
371 /* Update vectorial force */
372 fix0 = _mm256_add_pd(fix0,tx);
373 fiy0 = _mm256_add_pd(fiy0,ty);
374 fiz0 = _mm256_add_pd(fiz0,tz);
376 fjx0 = _mm256_add_pd(fjx0,tx);
377 fjy0 = _mm256_add_pd(fjy0,ty);
378 fjz0 = _mm256_add_pd(fjz0,tz);
382 /**************************
383 * CALCULATE INTERACTIONS *
384 **************************/
386 if (gmx_mm256_any_lt(rsq11,rcutoff2))
389 /* REACTION-FIELD ELECTROSTATICS */
390 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
391 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
393 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
395 /* Update potential sum for this i atom from the interaction with this j atom. */
396 velec = _mm256_and_pd(velec,cutoff_mask);
397 velecsum = _mm256_add_pd(velecsum,velec);
401 fscal = _mm256_and_pd(fscal,cutoff_mask);
403 /* Calculate temporary vectorial force */
404 tx = _mm256_mul_pd(fscal,dx11);
405 ty = _mm256_mul_pd(fscal,dy11);
406 tz = _mm256_mul_pd(fscal,dz11);
408 /* Update vectorial force */
409 fix1 = _mm256_add_pd(fix1,tx);
410 fiy1 = _mm256_add_pd(fiy1,ty);
411 fiz1 = _mm256_add_pd(fiz1,tz);
413 fjx1 = _mm256_add_pd(fjx1,tx);
414 fjy1 = _mm256_add_pd(fjy1,ty);
415 fjz1 = _mm256_add_pd(fjz1,tz);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 if (gmx_mm256_any_lt(rsq12,rcutoff2))
426 /* REACTION-FIELD ELECTROSTATICS */
427 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
428 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
430 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velec = _mm256_and_pd(velec,cutoff_mask);
434 velecsum = _mm256_add_pd(velecsum,velec);
438 fscal = _mm256_and_pd(fscal,cutoff_mask);
440 /* Calculate temporary vectorial force */
441 tx = _mm256_mul_pd(fscal,dx12);
442 ty = _mm256_mul_pd(fscal,dy12);
443 tz = _mm256_mul_pd(fscal,dz12);
445 /* Update vectorial force */
446 fix1 = _mm256_add_pd(fix1,tx);
447 fiy1 = _mm256_add_pd(fiy1,ty);
448 fiz1 = _mm256_add_pd(fiz1,tz);
450 fjx2 = _mm256_add_pd(fjx2,tx);
451 fjy2 = _mm256_add_pd(fjy2,ty);
452 fjz2 = _mm256_add_pd(fjz2,tz);
456 /**************************
457 * CALCULATE INTERACTIONS *
458 **************************/
460 if (gmx_mm256_any_lt(rsq13,rcutoff2))
463 /* REACTION-FIELD ELECTROSTATICS */
464 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
465 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
467 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
469 /* Update potential sum for this i atom from the interaction with this j atom. */
470 velec = _mm256_and_pd(velec,cutoff_mask);
471 velecsum = _mm256_add_pd(velecsum,velec);
475 fscal = _mm256_and_pd(fscal,cutoff_mask);
477 /* Calculate temporary vectorial force */
478 tx = _mm256_mul_pd(fscal,dx13);
479 ty = _mm256_mul_pd(fscal,dy13);
480 tz = _mm256_mul_pd(fscal,dz13);
482 /* Update vectorial force */
483 fix1 = _mm256_add_pd(fix1,tx);
484 fiy1 = _mm256_add_pd(fiy1,ty);
485 fiz1 = _mm256_add_pd(fiz1,tz);
487 fjx3 = _mm256_add_pd(fjx3,tx);
488 fjy3 = _mm256_add_pd(fjy3,ty);
489 fjz3 = _mm256_add_pd(fjz3,tz);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 if (gmx_mm256_any_lt(rsq21,rcutoff2))
500 /* REACTION-FIELD ELECTROSTATICS */
501 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
502 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
504 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
506 /* Update potential sum for this i atom from the interaction with this j atom. */
507 velec = _mm256_and_pd(velec,cutoff_mask);
508 velecsum = _mm256_add_pd(velecsum,velec);
512 fscal = _mm256_and_pd(fscal,cutoff_mask);
514 /* Calculate temporary vectorial force */
515 tx = _mm256_mul_pd(fscal,dx21);
516 ty = _mm256_mul_pd(fscal,dy21);
517 tz = _mm256_mul_pd(fscal,dz21);
519 /* Update vectorial force */
520 fix2 = _mm256_add_pd(fix2,tx);
521 fiy2 = _mm256_add_pd(fiy2,ty);
522 fiz2 = _mm256_add_pd(fiz2,tz);
524 fjx1 = _mm256_add_pd(fjx1,tx);
525 fjy1 = _mm256_add_pd(fjy1,ty);
526 fjz1 = _mm256_add_pd(fjz1,tz);
530 /**************************
531 * CALCULATE INTERACTIONS *
532 **************************/
534 if (gmx_mm256_any_lt(rsq22,rcutoff2))
537 /* REACTION-FIELD ELECTROSTATICS */
538 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
539 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
541 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
543 /* Update potential sum for this i atom from the interaction with this j atom. */
544 velec = _mm256_and_pd(velec,cutoff_mask);
545 velecsum = _mm256_add_pd(velecsum,velec);
549 fscal = _mm256_and_pd(fscal,cutoff_mask);
551 /* Calculate temporary vectorial force */
552 tx = _mm256_mul_pd(fscal,dx22);
553 ty = _mm256_mul_pd(fscal,dy22);
554 tz = _mm256_mul_pd(fscal,dz22);
556 /* Update vectorial force */
557 fix2 = _mm256_add_pd(fix2,tx);
558 fiy2 = _mm256_add_pd(fiy2,ty);
559 fiz2 = _mm256_add_pd(fiz2,tz);
561 fjx2 = _mm256_add_pd(fjx2,tx);
562 fjy2 = _mm256_add_pd(fjy2,ty);
563 fjz2 = _mm256_add_pd(fjz2,tz);
567 /**************************
568 * CALCULATE INTERACTIONS *
569 **************************/
571 if (gmx_mm256_any_lt(rsq23,rcutoff2))
574 /* REACTION-FIELD ELECTROSTATICS */
575 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
576 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
578 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
580 /* Update potential sum for this i atom from the interaction with this j atom. */
581 velec = _mm256_and_pd(velec,cutoff_mask);
582 velecsum = _mm256_add_pd(velecsum,velec);
586 fscal = _mm256_and_pd(fscal,cutoff_mask);
588 /* Calculate temporary vectorial force */
589 tx = _mm256_mul_pd(fscal,dx23);
590 ty = _mm256_mul_pd(fscal,dy23);
591 tz = _mm256_mul_pd(fscal,dz23);
593 /* Update vectorial force */
594 fix2 = _mm256_add_pd(fix2,tx);
595 fiy2 = _mm256_add_pd(fiy2,ty);
596 fiz2 = _mm256_add_pd(fiz2,tz);
598 fjx3 = _mm256_add_pd(fjx3,tx);
599 fjy3 = _mm256_add_pd(fjy3,ty);
600 fjz3 = _mm256_add_pd(fjz3,tz);
604 /**************************
605 * CALCULATE INTERACTIONS *
606 **************************/
608 if (gmx_mm256_any_lt(rsq31,rcutoff2))
611 /* REACTION-FIELD ELECTROSTATICS */
612 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
613 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
615 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
617 /* Update potential sum for this i atom from the interaction with this j atom. */
618 velec = _mm256_and_pd(velec,cutoff_mask);
619 velecsum = _mm256_add_pd(velecsum,velec);
623 fscal = _mm256_and_pd(fscal,cutoff_mask);
625 /* Calculate temporary vectorial force */
626 tx = _mm256_mul_pd(fscal,dx31);
627 ty = _mm256_mul_pd(fscal,dy31);
628 tz = _mm256_mul_pd(fscal,dz31);
630 /* Update vectorial force */
631 fix3 = _mm256_add_pd(fix3,tx);
632 fiy3 = _mm256_add_pd(fiy3,ty);
633 fiz3 = _mm256_add_pd(fiz3,tz);
635 fjx1 = _mm256_add_pd(fjx1,tx);
636 fjy1 = _mm256_add_pd(fjy1,ty);
637 fjz1 = _mm256_add_pd(fjz1,tz);
641 /**************************
642 * CALCULATE INTERACTIONS *
643 **************************/
645 if (gmx_mm256_any_lt(rsq32,rcutoff2))
648 /* REACTION-FIELD ELECTROSTATICS */
649 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
650 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
652 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
654 /* Update potential sum for this i atom from the interaction with this j atom. */
655 velec = _mm256_and_pd(velec,cutoff_mask);
656 velecsum = _mm256_add_pd(velecsum,velec);
660 fscal = _mm256_and_pd(fscal,cutoff_mask);
662 /* Calculate temporary vectorial force */
663 tx = _mm256_mul_pd(fscal,dx32);
664 ty = _mm256_mul_pd(fscal,dy32);
665 tz = _mm256_mul_pd(fscal,dz32);
667 /* Update vectorial force */
668 fix3 = _mm256_add_pd(fix3,tx);
669 fiy3 = _mm256_add_pd(fiy3,ty);
670 fiz3 = _mm256_add_pd(fiz3,tz);
672 fjx2 = _mm256_add_pd(fjx2,tx);
673 fjy2 = _mm256_add_pd(fjy2,ty);
674 fjz2 = _mm256_add_pd(fjz2,tz);
678 /**************************
679 * CALCULATE INTERACTIONS *
680 **************************/
682 if (gmx_mm256_any_lt(rsq33,rcutoff2))
685 /* REACTION-FIELD ELECTROSTATICS */
686 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
687 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
689 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
691 /* Update potential sum for this i atom from the interaction with this j atom. */
692 velec = _mm256_and_pd(velec,cutoff_mask);
693 velecsum = _mm256_add_pd(velecsum,velec);
697 fscal = _mm256_and_pd(fscal,cutoff_mask);
699 /* Calculate temporary vectorial force */
700 tx = _mm256_mul_pd(fscal,dx33);
701 ty = _mm256_mul_pd(fscal,dy33);
702 tz = _mm256_mul_pd(fscal,dz33);
704 /* Update vectorial force */
705 fix3 = _mm256_add_pd(fix3,tx);
706 fiy3 = _mm256_add_pd(fiy3,ty);
707 fiz3 = _mm256_add_pd(fiz3,tz);
709 fjx3 = _mm256_add_pd(fjx3,tx);
710 fjy3 = _mm256_add_pd(fjy3,ty);
711 fjz3 = _mm256_add_pd(fjz3,tz);
715 fjptrA = f+j_coord_offsetA;
716 fjptrB = f+j_coord_offsetB;
717 fjptrC = f+j_coord_offsetC;
718 fjptrD = f+j_coord_offsetD;
720 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
721 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
722 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
724 /* Inner loop uses 387 flops */
730 /* Get j neighbor index, and coordinate index */
731 jnrlistA = jjnr[jidx];
732 jnrlistB = jjnr[jidx+1];
733 jnrlistC = jjnr[jidx+2];
734 jnrlistD = jjnr[jidx+3];
735 /* Sign of each element will be negative for non-real atoms.
736 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
737 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
739 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
741 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
742 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
743 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
745 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
746 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
747 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
748 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
749 j_coord_offsetA = DIM*jnrA;
750 j_coord_offsetB = DIM*jnrB;
751 j_coord_offsetC = DIM*jnrC;
752 j_coord_offsetD = DIM*jnrD;
754 /* load j atom coordinates */
755 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
756 x+j_coord_offsetC,x+j_coord_offsetD,
757 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
758 &jy2,&jz2,&jx3,&jy3,&jz3);
760 /* Calculate displacement vector */
761 dx00 = _mm256_sub_pd(ix0,jx0);
762 dy00 = _mm256_sub_pd(iy0,jy0);
763 dz00 = _mm256_sub_pd(iz0,jz0);
764 dx11 = _mm256_sub_pd(ix1,jx1);
765 dy11 = _mm256_sub_pd(iy1,jy1);
766 dz11 = _mm256_sub_pd(iz1,jz1);
767 dx12 = _mm256_sub_pd(ix1,jx2);
768 dy12 = _mm256_sub_pd(iy1,jy2);
769 dz12 = _mm256_sub_pd(iz1,jz2);
770 dx13 = _mm256_sub_pd(ix1,jx3);
771 dy13 = _mm256_sub_pd(iy1,jy3);
772 dz13 = _mm256_sub_pd(iz1,jz3);
773 dx21 = _mm256_sub_pd(ix2,jx1);
774 dy21 = _mm256_sub_pd(iy2,jy1);
775 dz21 = _mm256_sub_pd(iz2,jz1);
776 dx22 = _mm256_sub_pd(ix2,jx2);
777 dy22 = _mm256_sub_pd(iy2,jy2);
778 dz22 = _mm256_sub_pd(iz2,jz2);
779 dx23 = _mm256_sub_pd(ix2,jx3);
780 dy23 = _mm256_sub_pd(iy2,jy3);
781 dz23 = _mm256_sub_pd(iz2,jz3);
782 dx31 = _mm256_sub_pd(ix3,jx1);
783 dy31 = _mm256_sub_pd(iy3,jy1);
784 dz31 = _mm256_sub_pd(iz3,jz1);
785 dx32 = _mm256_sub_pd(ix3,jx2);
786 dy32 = _mm256_sub_pd(iy3,jy2);
787 dz32 = _mm256_sub_pd(iz3,jz2);
788 dx33 = _mm256_sub_pd(ix3,jx3);
789 dy33 = _mm256_sub_pd(iy3,jy3);
790 dz33 = _mm256_sub_pd(iz3,jz3);
792 /* Calculate squared distance and things based on it */
793 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
794 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
795 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
796 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
797 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
798 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
799 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
800 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
801 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
802 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
804 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
805 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
806 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
807 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
808 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
809 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
810 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
811 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
812 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
813 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
815 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
816 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
817 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
818 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
819 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
820 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
821 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
822 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
823 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
825 fjx0 = _mm256_setzero_pd();
826 fjy0 = _mm256_setzero_pd();
827 fjz0 = _mm256_setzero_pd();
828 fjx1 = _mm256_setzero_pd();
829 fjy1 = _mm256_setzero_pd();
830 fjz1 = _mm256_setzero_pd();
831 fjx2 = _mm256_setzero_pd();
832 fjy2 = _mm256_setzero_pd();
833 fjz2 = _mm256_setzero_pd();
834 fjx3 = _mm256_setzero_pd();
835 fjy3 = _mm256_setzero_pd();
836 fjz3 = _mm256_setzero_pd();
838 /**************************
839 * CALCULATE INTERACTIONS *
840 **************************/
842 if (gmx_mm256_any_lt(rsq00,rcutoff2))
845 r00 = _mm256_mul_pd(rsq00,rinv00);
846 r00 = _mm256_andnot_pd(dummy_mask,r00);
848 /* Calculate table index by multiplying r with table scale and truncate to integer */
849 rt = _mm256_mul_pd(r00,vftabscale);
850 vfitab = _mm256_cvttpd_epi32(rt);
851 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
852 vfitab = _mm_slli_epi32(vfitab,3);
854 /* CUBIC SPLINE TABLE DISPERSION */
855 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
856 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
857 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
858 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
859 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
860 Heps = _mm256_mul_pd(vfeps,H);
861 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
862 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
863 vvdw6 = _mm256_mul_pd(c6_00,VV);
864 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
865 fvdw6 = _mm256_mul_pd(c6_00,FF);
867 /* CUBIC SPLINE TABLE REPULSION */
868 vfitab = _mm_add_epi32(vfitab,ifour);
869 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
870 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
871 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
872 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
873 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
874 Heps = _mm256_mul_pd(vfeps,H);
875 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
876 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
877 vvdw12 = _mm256_mul_pd(c12_00,VV);
878 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
879 fvdw12 = _mm256_mul_pd(c12_00,FF);
880 vvdw = _mm256_add_pd(vvdw12,vvdw6);
881 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
883 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
885 /* Update potential sum for this i atom from the interaction with this j atom. */
886 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
887 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
888 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
892 fscal = _mm256_and_pd(fscal,cutoff_mask);
894 fscal = _mm256_andnot_pd(dummy_mask,fscal);
896 /* Calculate temporary vectorial force */
897 tx = _mm256_mul_pd(fscal,dx00);
898 ty = _mm256_mul_pd(fscal,dy00);
899 tz = _mm256_mul_pd(fscal,dz00);
901 /* Update vectorial force */
902 fix0 = _mm256_add_pd(fix0,tx);
903 fiy0 = _mm256_add_pd(fiy0,ty);
904 fiz0 = _mm256_add_pd(fiz0,tz);
906 fjx0 = _mm256_add_pd(fjx0,tx);
907 fjy0 = _mm256_add_pd(fjy0,ty);
908 fjz0 = _mm256_add_pd(fjz0,tz);
912 /**************************
913 * CALCULATE INTERACTIONS *
914 **************************/
916 if (gmx_mm256_any_lt(rsq11,rcutoff2))
919 /* REACTION-FIELD ELECTROSTATICS */
920 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
921 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
923 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
925 /* Update potential sum for this i atom from the interaction with this j atom. */
926 velec = _mm256_and_pd(velec,cutoff_mask);
927 velec = _mm256_andnot_pd(dummy_mask,velec);
928 velecsum = _mm256_add_pd(velecsum,velec);
932 fscal = _mm256_and_pd(fscal,cutoff_mask);
934 fscal = _mm256_andnot_pd(dummy_mask,fscal);
936 /* Calculate temporary vectorial force */
937 tx = _mm256_mul_pd(fscal,dx11);
938 ty = _mm256_mul_pd(fscal,dy11);
939 tz = _mm256_mul_pd(fscal,dz11);
941 /* Update vectorial force */
942 fix1 = _mm256_add_pd(fix1,tx);
943 fiy1 = _mm256_add_pd(fiy1,ty);
944 fiz1 = _mm256_add_pd(fiz1,tz);
946 fjx1 = _mm256_add_pd(fjx1,tx);
947 fjy1 = _mm256_add_pd(fjy1,ty);
948 fjz1 = _mm256_add_pd(fjz1,tz);
952 /**************************
953 * CALCULATE INTERACTIONS *
954 **************************/
956 if (gmx_mm256_any_lt(rsq12,rcutoff2))
959 /* REACTION-FIELD ELECTROSTATICS */
960 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
961 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
963 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
965 /* Update potential sum for this i atom from the interaction with this j atom. */
966 velec = _mm256_and_pd(velec,cutoff_mask);
967 velec = _mm256_andnot_pd(dummy_mask,velec);
968 velecsum = _mm256_add_pd(velecsum,velec);
972 fscal = _mm256_and_pd(fscal,cutoff_mask);
974 fscal = _mm256_andnot_pd(dummy_mask,fscal);
976 /* Calculate temporary vectorial force */
977 tx = _mm256_mul_pd(fscal,dx12);
978 ty = _mm256_mul_pd(fscal,dy12);
979 tz = _mm256_mul_pd(fscal,dz12);
981 /* Update vectorial force */
982 fix1 = _mm256_add_pd(fix1,tx);
983 fiy1 = _mm256_add_pd(fiy1,ty);
984 fiz1 = _mm256_add_pd(fiz1,tz);
986 fjx2 = _mm256_add_pd(fjx2,tx);
987 fjy2 = _mm256_add_pd(fjy2,ty);
988 fjz2 = _mm256_add_pd(fjz2,tz);
992 /**************************
993 * CALCULATE INTERACTIONS *
994 **************************/
996 if (gmx_mm256_any_lt(rsq13,rcutoff2))
999 /* REACTION-FIELD ELECTROSTATICS */
1000 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
1001 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1003 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1005 /* Update potential sum for this i atom from the interaction with this j atom. */
1006 velec = _mm256_and_pd(velec,cutoff_mask);
1007 velec = _mm256_andnot_pd(dummy_mask,velec);
1008 velecsum = _mm256_add_pd(velecsum,velec);
1012 fscal = _mm256_and_pd(fscal,cutoff_mask);
1014 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1016 /* Calculate temporary vectorial force */
1017 tx = _mm256_mul_pd(fscal,dx13);
1018 ty = _mm256_mul_pd(fscal,dy13);
1019 tz = _mm256_mul_pd(fscal,dz13);
1021 /* Update vectorial force */
1022 fix1 = _mm256_add_pd(fix1,tx);
1023 fiy1 = _mm256_add_pd(fiy1,ty);
1024 fiz1 = _mm256_add_pd(fiz1,tz);
1026 fjx3 = _mm256_add_pd(fjx3,tx);
1027 fjy3 = _mm256_add_pd(fjy3,ty);
1028 fjz3 = _mm256_add_pd(fjz3,tz);
1032 /**************************
1033 * CALCULATE INTERACTIONS *
1034 **************************/
1036 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1039 /* REACTION-FIELD ELECTROSTATICS */
1040 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1041 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1043 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1045 /* Update potential sum for this i atom from the interaction with this j atom. */
1046 velec = _mm256_and_pd(velec,cutoff_mask);
1047 velec = _mm256_andnot_pd(dummy_mask,velec);
1048 velecsum = _mm256_add_pd(velecsum,velec);
1052 fscal = _mm256_and_pd(fscal,cutoff_mask);
1054 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1056 /* Calculate temporary vectorial force */
1057 tx = _mm256_mul_pd(fscal,dx21);
1058 ty = _mm256_mul_pd(fscal,dy21);
1059 tz = _mm256_mul_pd(fscal,dz21);
1061 /* Update vectorial force */
1062 fix2 = _mm256_add_pd(fix2,tx);
1063 fiy2 = _mm256_add_pd(fiy2,ty);
1064 fiz2 = _mm256_add_pd(fiz2,tz);
1066 fjx1 = _mm256_add_pd(fjx1,tx);
1067 fjy1 = _mm256_add_pd(fjy1,ty);
1068 fjz1 = _mm256_add_pd(fjz1,tz);
1072 /**************************
1073 * CALCULATE INTERACTIONS *
1074 **************************/
1076 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1079 /* REACTION-FIELD ELECTROSTATICS */
1080 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1081 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1083 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1085 /* Update potential sum for this i atom from the interaction with this j atom. */
1086 velec = _mm256_and_pd(velec,cutoff_mask);
1087 velec = _mm256_andnot_pd(dummy_mask,velec);
1088 velecsum = _mm256_add_pd(velecsum,velec);
1092 fscal = _mm256_and_pd(fscal,cutoff_mask);
1094 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1096 /* Calculate temporary vectorial force */
1097 tx = _mm256_mul_pd(fscal,dx22);
1098 ty = _mm256_mul_pd(fscal,dy22);
1099 tz = _mm256_mul_pd(fscal,dz22);
1101 /* Update vectorial force */
1102 fix2 = _mm256_add_pd(fix2,tx);
1103 fiy2 = _mm256_add_pd(fiy2,ty);
1104 fiz2 = _mm256_add_pd(fiz2,tz);
1106 fjx2 = _mm256_add_pd(fjx2,tx);
1107 fjy2 = _mm256_add_pd(fjy2,ty);
1108 fjz2 = _mm256_add_pd(fjz2,tz);
1112 /**************************
1113 * CALCULATE INTERACTIONS *
1114 **************************/
1116 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1119 /* REACTION-FIELD ELECTROSTATICS */
1120 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
1121 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1123 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1125 /* Update potential sum for this i atom from the interaction with this j atom. */
1126 velec = _mm256_and_pd(velec,cutoff_mask);
1127 velec = _mm256_andnot_pd(dummy_mask,velec);
1128 velecsum = _mm256_add_pd(velecsum,velec);
1132 fscal = _mm256_and_pd(fscal,cutoff_mask);
1134 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1136 /* Calculate temporary vectorial force */
1137 tx = _mm256_mul_pd(fscal,dx23);
1138 ty = _mm256_mul_pd(fscal,dy23);
1139 tz = _mm256_mul_pd(fscal,dz23);
1141 /* Update vectorial force */
1142 fix2 = _mm256_add_pd(fix2,tx);
1143 fiy2 = _mm256_add_pd(fiy2,ty);
1144 fiz2 = _mm256_add_pd(fiz2,tz);
1146 fjx3 = _mm256_add_pd(fjx3,tx);
1147 fjy3 = _mm256_add_pd(fjy3,ty);
1148 fjz3 = _mm256_add_pd(fjz3,tz);
1152 /**************************
1153 * CALCULATE INTERACTIONS *
1154 **************************/
1156 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1159 /* REACTION-FIELD ELECTROSTATICS */
1160 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
1161 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1163 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1165 /* Update potential sum for this i atom from the interaction with this j atom. */
1166 velec = _mm256_and_pd(velec,cutoff_mask);
1167 velec = _mm256_andnot_pd(dummy_mask,velec);
1168 velecsum = _mm256_add_pd(velecsum,velec);
1172 fscal = _mm256_and_pd(fscal,cutoff_mask);
1174 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1176 /* Calculate temporary vectorial force */
1177 tx = _mm256_mul_pd(fscal,dx31);
1178 ty = _mm256_mul_pd(fscal,dy31);
1179 tz = _mm256_mul_pd(fscal,dz31);
1181 /* Update vectorial force */
1182 fix3 = _mm256_add_pd(fix3,tx);
1183 fiy3 = _mm256_add_pd(fiy3,ty);
1184 fiz3 = _mm256_add_pd(fiz3,tz);
1186 fjx1 = _mm256_add_pd(fjx1,tx);
1187 fjy1 = _mm256_add_pd(fjy1,ty);
1188 fjz1 = _mm256_add_pd(fjz1,tz);
1192 /**************************
1193 * CALCULATE INTERACTIONS *
1194 **************************/
1196 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1199 /* REACTION-FIELD ELECTROSTATICS */
1200 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1201 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1203 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1205 /* Update potential sum for this i atom from the interaction with this j atom. */
1206 velec = _mm256_and_pd(velec,cutoff_mask);
1207 velec = _mm256_andnot_pd(dummy_mask,velec);
1208 velecsum = _mm256_add_pd(velecsum,velec);
1212 fscal = _mm256_and_pd(fscal,cutoff_mask);
1214 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1216 /* Calculate temporary vectorial force */
1217 tx = _mm256_mul_pd(fscal,dx32);
1218 ty = _mm256_mul_pd(fscal,dy32);
1219 tz = _mm256_mul_pd(fscal,dz32);
1221 /* Update vectorial force */
1222 fix3 = _mm256_add_pd(fix3,tx);
1223 fiy3 = _mm256_add_pd(fiy3,ty);
1224 fiz3 = _mm256_add_pd(fiz3,tz);
1226 fjx2 = _mm256_add_pd(fjx2,tx);
1227 fjy2 = _mm256_add_pd(fjy2,ty);
1228 fjz2 = _mm256_add_pd(fjz2,tz);
1232 /**************************
1233 * CALCULATE INTERACTIONS *
1234 **************************/
1236 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1239 /* REACTION-FIELD ELECTROSTATICS */
1240 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1241 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1243 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1245 /* Update potential sum for this i atom from the interaction with this j atom. */
1246 velec = _mm256_and_pd(velec,cutoff_mask);
1247 velec = _mm256_andnot_pd(dummy_mask,velec);
1248 velecsum = _mm256_add_pd(velecsum,velec);
1252 fscal = _mm256_and_pd(fscal,cutoff_mask);
1254 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1256 /* Calculate temporary vectorial force */
1257 tx = _mm256_mul_pd(fscal,dx33);
1258 ty = _mm256_mul_pd(fscal,dy33);
1259 tz = _mm256_mul_pd(fscal,dz33);
1261 /* Update vectorial force */
1262 fix3 = _mm256_add_pd(fix3,tx);
1263 fiy3 = _mm256_add_pd(fiy3,ty);
1264 fiz3 = _mm256_add_pd(fiz3,tz);
1266 fjx3 = _mm256_add_pd(fjx3,tx);
1267 fjy3 = _mm256_add_pd(fjy3,ty);
1268 fjz3 = _mm256_add_pd(fjz3,tz);
1272 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1273 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1274 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1275 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1277 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1278 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1279 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1281 /* Inner loop uses 388 flops */
1284 /* End of innermost loop */
1286 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1287 f+i_coord_offset,fshift+i_shift_offset);
1290 /* Update potential energies */
1291 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1292 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1294 /* Increment number of inner iterations */
1295 inneriter += j_index_end - j_index_start;
1297 /* Outer loop uses 26 flops */
1300 /* Increment number of outer iterations */
1303 /* Update outer/inner flops */
1305 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*388);
1308 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_double
1309 * Electrostatics interaction: ReactionField
1310 * VdW interaction: CubicSplineTable
1311 * Geometry: Water4-Water4
1312 * Calculate force/pot: Force
1315 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_256_double
1316 (t_nblist * gmx_restrict nlist,
1317 rvec * gmx_restrict xx,
1318 rvec * gmx_restrict ff,
1319 t_forcerec * gmx_restrict fr,
1320 t_mdatoms * gmx_restrict mdatoms,
1321 nb_kernel_data_t * gmx_restrict kernel_data,
1322 t_nrnb * gmx_restrict nrnb)
1324 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1325 * just 0 for non-waters.
1326 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1327 * jnr indices corresponding to data put in the four positions in the SIMD register.
1329 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1330 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1331 int jnrA,jnrB,jnrC,jnrD;
1332 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1333 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1334 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1335 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1336 real rcutoff_scalar;
1337 real *shiftvec,*fshift,*x,*f;
1338 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1339 real scratch[4*DIM];
1340 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1341 real * vdwioffsetptr0;
1342 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1343 real * vdwioffsetptr1;
1344 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1345 real * vdwioffsetptr2;
1346 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1347 real * vdwioffsetptr3;
1348 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1349 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1350 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1351 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1352 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1353 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1354 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1355 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1356 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1357 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1358 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1359 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1360 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1361 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1362 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1363 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1364 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1365 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1366 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1367 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1370 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1373 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1374 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1376 __m128i ifour = _mm_set1_epi32(4);
1377 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1379 __m256d dummy_mask,cutoff_mask;
1380 __m128 tmpmask0,tmpmask1;
1381 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1382 __m256d one = _mm256_set1_pd(1.0);
1383 __m256d two = _mm256_set1_pd(2.0);
1389 jindex = nlist->jindex;
1391 shiftidx = nlist->shift;
1393 shiftvec = fr->shift_vec[0];
1394 fshift = fr->fshift[0];
1395 facel = _mm256_set1_pd(fr->epsfac);
1396 charge = mdatoms->chargeA;
1397 krf = _mm256_set1_pd(fr->ic->k_rf);
1398 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1399 crf = _mm256_set1_pd(fr->ic->c_rf);
1400 nvdwtype = fr->ntype;
1401 vdwparam = fr->nbfp;
1402 vdwtype = mdatoms->typeA;
1404 vftab = kernel_data->table_vdw->data;
1405 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
1407 /* Setup water-specific parameters */
1408 inr = nlist->iinr[0];
1409 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1410 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1411 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1412 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1414 jq1 = _mm256_set1_pd(charge[inr+1]);
1415 jq2 = _mm256_set1_pd(charge[inr+2]);
1416 jq3 = _mm256_set1_pd(charge[inr+3]);
1417 vdwjidx0A = 2*vdwtype[inr+0];
1418 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1419 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1420 qq11 = _mm256_mul_pd(iq1,jq1);
1421 qq12 = _mm256_mul_pd(iq1,jq2);
1422 qq13 = _mm256_mul_pd(iq1,jq3);
1423 qq21 = _mm256_mul_pd(iq2,jq1);
1424 qq22 = _mm256_mul_pd(iq2,jq2);
1425 qq23 = _mm256_mul_pd(iq2,jq3);
1426 qq31 = _mm256_mul_pd(iq3,jq1);
1427 qq32 = _mm256_mul_pd(iq3,jq2);
1428 qq33 = _mm256_mul_pd(iq3,jq3);
1430 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1431 rcutoff_scalar = fr->rcoulomb;
1432 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1433 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1435 /* Avoid stupid compiler warnings */
1436 jnrA = jnrB = jnrC = jnrD = 0;
1437 j_coord_offsetA = 0;
1438 j_coord_offsetB = 0;
1439 j_coord_offsetC = 0;
1440 j_coord_offsetD = 0;
1445 for(iidx=0;iidx<4*DIM;iidx++)
1447 scratch[iidx] = 0.0;
1450 /* Start outer loop over neighborlists */
1451 for(iidx=0; iidx<nri; iidx++)
1453 /* Load shift vector for this list */
1454 i_shift_offset = DIM*shiftidx[iidx];
1456 /* Load limits for loop over neighbors */
1457 j_index_start = jindex[iidx];
1458 j_index_end = jindex[iidx+1];
1460 /* Get outer coordinate index */
1462 i_coord_offset = DIM*inr;
1464 /* Load i particle coords and add shift vector */
1465 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1466 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1468 fix0 = _mm256_setzero_pd();
1469 fiy0 = _mm256_setzero_pd();
1470 fiz0 = _mm256_setzero_pd();
1471 fix1 = _mm256_setzero_pd();
1472 fiy1 = _mm256_setzero_pd();
1473 fiz1 = _mm256_setzero_pd();
1474 fix2 = _mm256_setzero_pd();
1475 fiy2 = _mm256_setzero_pd();
1476 fiz2 = _mm256_setzero_pd();
1477 fix3 = _mm256_setzero_pd();
1478 fiy3 = _mm256_setzero_pd();
1479 fiz3 = _mm256_setzero_pd();
1481 /* Start inner kernel loop */
1482 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1485 /* Get j neighbor index, and coordinate index */
1487 jnrB = jjnr[jidx+1];
1488 jnrC = jjnr[jidx+2];
1489 jnrD = jjnr[jidx+3];
1490 j_coord_offsetA = DIM*jnrA;
1491 j_coord_offsetB = DIM*jnrB;
1492 j_coord_offsetC = DIM*jnrC;
1493 j_coord_offsetD = DIM*jnrD;
1495 /* load j atom coordinates */
1496 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1497 x+j_coord_offsetC,x+j_coord_offsetD,
1498 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1499 &jy2,&jz2,&jx3,&jy3,&jz3);
1501 /* Calculate displacement vector */
1502 dx00 = _mm256_sub_pd(ix0,jx0);
1503 dy00 = _mm256_sub_pd(iy0,jy0);
1504 dz00 = _mm256_sub_pd(iz0,jz0);
1505 dx11 = _mm256_sub_pd(ix1,jx1);
1506 dy11 = _mm256_sub_pd(iy1,jy1);
1507 dz11 = _mm256_sub_pd(iz1,jz1);
1508 dx12 = _mm256_sub_pd(ix1,jx2);
1509 dy12 = _mm256_sub_pd(iy1,jy2);
1510 dz12 = _mm256_sub_pd(iz1,jz2);
1511 dx13 = _mm256_sub_pd(ix1,jx3);
1512 dy13 = _mm256_sub_pd(iy1,jy3);
1513 dz13 = _mm256_sub_pd(iz1,jz3);
1514 dx21 = _mm256_sub_pd(ix2,jx1);
1515 dy21 = _mm256_sub_pd(iy2,jy1);
1516 dz21 = _mm256_sub_pd(iz2,jz1);
1517 dx22 = _mm256_sub_pd(ix2,jx2);
1518 dy22 = _mm256_sub_pd(iy2,jy2);
1519 dz22 = _mm256_sub_pd(iz2,jz2);
1520 dx23 = _mm256_sub_pd(ix2,jx3);
1521 dy23 = _mm256_sub_pd(iy2,jy3);
1522 dz23 = _mm256_sub_pd(iz2,jz3);
1523 dx31 = _mm256_sub_pd(ix3,jx1);
1524 dy31 = _mm256_sub_pd(iy3,jy1);
1525 dz31 = _mm256_sub_pd(iz3,jz1);
1526 dx32 = _mm256_sub_pd(ix3,jx2);
1527 dy32 = _mm256_sub_pd(iy3,jy2);
1528 dz32 = _mm256_sub_pd(iz3,jz2);
1529 dx33 = _mm256_sub_pd(ix3,jx3);
1530 dy33 = _mm256_sub_pd(iy3,jy3);
1531 dz33 = _mm256_sub_pd(iz3,jz3);
1533 /* Calculate squared distance and things based on it */
1534 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1535 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1536 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1537 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1538 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1539 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1540 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1541 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1542 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1543 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1545 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1546 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1547 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1548 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1549 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1550 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1551 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1552 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1553 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1554 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1556 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1557 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1558 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1559 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1560 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1561 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1562 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1563 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1564 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1566 fjx0 = _mm256_setzero_pd();
1567 fjy0 = _mm256_setzero_pd();
1568 fjz0 = _mm256_setzero_pd();
1569 fjx1 = _mm256_setzero_pd();
1570 fjy1 = _mm256_setzero_pd();
1571 fjz1 = _mm256_setzero_pd();
1572 fjx2 = _mm256_setzero_pd();
1573 fjy2 = _mm256_setzero_pd();
1574 fjz2 = _mm256_setzero_pd();
1575 fjx3 = _mm256_setzero_pd();
1576 fjy3 = _mm256_setzero_pd();
1577 fjz3 = _mm256_setzero_pd();
1579 /**************************
1580 * CALCULATE INTERACTIONS *
1581 **************************/
1583 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1586 r00 = _mm256_mul_pd(rsq00,rinv00);
1588 /* Calculate table index by multiplying r with table scale and truncate to integer */
1589 rt = _mm256_mul_pd(r00,vftabscale);
1590 vfitab = _mm256_cvttpd_epi32(rt);
1591 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1592 vfitab = _mm_slli_epi32(vfitab,3);
1594 /* CUBIC SPLINE TABLE DISPERSION */
1595 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1596 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1597 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1598 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1599 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1600 Heps = _mm256_mul_pd(vfeps,H);
1601 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1602 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1603 fvdw6 = _mm256_mul_pd(c6_00,FF);
1605 /* CUBIC SPLINE TABLE REPULSION */
1606 vfitab = _mm_add_epi32(vfitab,ifour);
1607 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1608 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1609 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1610 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1611 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1612 Heps = _mm256_mul_pd(vfeps,H);
1613 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1614 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1615 fvdw12 = _mm256_mul_pd(c12_00,FF);
1616 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1618 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1622 fscal = _mm256_and_pd(fscal,cutoff_mask);
1624 /* Calculate temporary vectorial force */
1625 tx = _mm256_mul_pd(fscal,dx00);
1626 ty = _mm256_mul_pd(fscal,dy00);
1627 tz = _mm256_mul_pd(fscal,dz00);
1629 /* Update vectorial force */
1630 fix0 = _mm256_add_pd(fix0,tx);
1631 fiy0 = _mm256_add_pd(fiy0,ty);
1632 fiz0 = _mm256_add_pd(fiz0,tz);
1634 fjx0 = _mm256_add_pd(fjx0,tx);
1635 fjy0 = _mm256_add_pd(fjy0,ty);
1636 fjz0 = _mm256_add_pd(fjz0,tz);
1640 /**************************
1641 * CALCULATE INTERACTIONS *
1642 **************************/
1644 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1647 /* REACTION-FIELD ELECTROSTATICS */
1648 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1650 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1654 fscal = _mm256_and_pd(fscal,cutoff_mask);
1656 /* Calculate temporary vectorial force */
1657 tx = _mm256_mul_pd(fscal,dx11);
1658 ty = _mm256_mul_pd(fscal,dy11);
1659 tz = _mm256_mul_pd(fscal,dz11);
1661 /* Update vectorial force */
1662 fix1 = _mm256_add_pd(fix1,tx);
1663 fiy1 = _mm256_add_pd(fiy1,ty);
1664 fiz1 = _mm256_add_pd(fiz1,tz);
1666 fjx1 = _mm256_add_pd(fjx1,tx);
1667 fjy1 = _mm256_add_pd(fjy1,ty);
1668 fjz1 = _mm256_add_pd(fjz1,tz);
1672 /**************************
1673 * CALCULATE INTERACTIONS *
1674 **************************/
1676 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1679 /* REACTION-FIELD ELECTROSTATICS */
1680 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1682 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1686 fscal = _mm256_and_pd(fscal,cutoff_mask);
1688 /* Calculate temporary vectorial force */
1689 tx = _mm256_mul_pd(fscal,dx12);
1690 ty = _mm256_mul_pd(fscal,dy12);
1691 tz = _mm256_mul_pd(fscal,dz12);
1693 /* Update vectorial force */
1694 fix1 = _mm256_add_pd(fix1,tx);
1695 fiy1 = _mm256_add_pd(fiy1,ty);
1696 fiz1 = _mm256_add_pd(fiz1,tz);
1698 fjx2 = _mm256_add_pd(fjx2,tx);
1699 fjy2 = _mm256_add_pd(fjy2,ty);
1700 fjz2 = _mm256_add_pd(fjz2,tz);
1704 /**************************
1705 * CALCULATE INTERACTIONS *
1706 **************************/
1708 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1711 /* REACTION-FIELD ELECTROSTATICS */
1712 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1714 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1718 fscal = _mm256_and_pd(fscal,cutoff_mask);
1720 /* Calculate temporary vectorial force */
1721 tx = _mm256_mul_pd(fscal,dx13);
1722 ty = _mm256_mul_pd(fscal,dy13);
1723 tz = _mm256_mul_pd(fscal,dz13);
1725 /* Update vectorial force */
1726 fix1 = _mm256_add_pd(fix1,tx);
1727 fiy1 = _mm256_add_pd(fiy1,ty);
1728 fiz1 = _mm256_add_pd(fiz1,tz);
1730 fjx3 = _mm256_add_pd(fjx3,tx);
1731 fjy3 = _mm256_add_pd(fjy3,ty);
1732 fjz3 = _mm256_add_pd(fjz3,tz);
1736 /**************************
1737 * CALCULATE INTERACTIONS *
1738 **************************/
1740 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1743 /* REACTION-FIELD ELECTROSTATICS */
1744 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1746 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1750 fscal = _mm256_and_pd(fscal,cutoff_mask);
1752 /* Calculate temporary vectorial force */
1753 tx = _mm256_mul_pd(fscal,dx21);
1754 ty = _mm256_mul_pd(fscal,dy21);
1755 tz = _mm256_mul_pd(fscal,dz21);
1757 /* Update vectorial force */
1758 fix2 = _mm256_add_pd(fix2,tx);
1759 fiy2 = _mm256_add_pd(fiy2,ty);
1760 fiz2 = _mm256_add_pd(fiz2,tz);
1762 fjx1 = _mm256_add_pd(fjx1,tx);
1763 fjy1 = _mm256_add_pd(fjy1,ty);
1764 fjz1 = _mm256_add_pd(fjz1,tz);
1768 /**************************
1769 * CALCULATE INTERACTIONS *
1770 **************************/
1772 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1775 /* REACTION-FIELD ELECTROSTATICS */
1776 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1778 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1782 fscal = _mm256_and_pd(fscal,cutoff_mask);
1784 /* Calculate temporary vectorial force */
1785 tx = _mm256_mul_pd(fscal,dx22);
1786 ty = _mm256_mul_pd(fscal,dy22);
1787 tz = _mm256_mul_pd(fscal,dz22);
1789 /* Update vectorial force */
1790 fix2 = _mm256_add_pd(fix2,tx);
1791 fiy2 = _mm256_add_pd(fiy2,ty);
1792 fiz2 = _mm256_add_pd(fiz2,tz);
1794 fjx2 = _mm256_add_pd(fjx2,tx);
1795 fjy2 = _mm256_add_pd(fjy2,ty);
1796 fjz2 = _mm256_add_pd(fjz2,tz);
1800 /**************************
1801 * CALCULATE INTERACTIONS *
1802 **************************/
1804 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1807 /* REACTION-FIELD ELECTROSTATICS */
1808 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1810 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1814 fscal = _mm256_and_pd(fscal,cutoff_mask);
1816 /* Calculate temporary vectorial force */
1817 tx = _mm256_mul_pd(fscal,dx23);
1818 ty = _mm256_mul_pd(fscal,dy23);
1819 tz = _mm256_mul_pd(fscal,dz23);
1821 /* Update vectorial force */
1822 fix2 = _mm256_add_pd(fix2,tx);
1823 fiy2 = _mm256_add_pd(fiy2,ty);
1824 fiz2 = _mm256_add_pd(fiz2,tz);
1826 fjx3 = _mm256_add_pd(fjx3,tx);
1827 fjy3 = _mm256_add_pd(fjy3,ty);
1828 fjz3 = _mm256_add_pd(fjz3,tz);
1832 /**************************
1833 * CALCULATE INTERACTIONS *
1834 **************************/
1836 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1839 /* REACTION-FIELD ELECTROSTATICS */
1840 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1842 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1846 fscal = _mm256_and_pd(fscal,cutoff_mask);
1848 /* Calculate temporary vectorial force */
1849 tx = _mm256_mul_pd(fscal,dx31);
1850 ty = _mm256_mul_pd(fscal,dy31);
1851 tz = _mm256_mul_pd(fscal,dz31);
1853 /* Update vectorial force */
1854 fix3 = _mm256_add_pd(fix3,tx);
1855 fiy3 = _mm256_add_pd(fiy3,ty);
1856 fiz3 = _mm256_add_pd(fiz3,tz);
1858 fjx1 = _mm256_add_pd(fjx1,tx);
1859 fjy1 = _mm256_add_pd(fjy1,ty);
1860 fjz1 = _mm256_add_pd(fjz1,tz);
1864 /**************************
1865 * CALCULATE INTERACTIONS *
1866 **************************/
1868 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1871 /* REACTION-FIELD ELECTROSTATICS */
1872 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1874 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1878 fscal = _mm256_and_pd(fscal,cutoff_mask);
1880 /* Calculate temporary vectorial force */
1881 tx = _mm256_mul_pd(fscal,dx32);
1882 ty = _mm256_mul_pd(fscal,dy32);
1883 tz = _mm256_mul_pd(fscal,dz32);
1885 /* Update vectorial force */
1886 fix3 = _mm256_add_pd(fix3,tx);
1887 fiy3 = _mm256_add_pd(fiy3,ty);
1888 fiz3 = _mm256_add_pd(fiz3,tz);
1890 fjx2 = _mm256_add_pd(fjx2,tx);
1891 fjy2 = _mm256_add_pd(fjy2,ty);
1892 fjz2 = _mm256_add_pd(fjz2,tz);
1896 /**************************
1897 * CALCULATE INTERACTIONS *
1898 **************************/
1900 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1903 /* REACTION-FIELD ELECTROSTATICS */
1904 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1906 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1910 fscal = _mm256_and_pd(fscal,cutoff_mask);
1912 /* Calculate temporary vectorial force */
1913 tx = _mm256_mul_pd(fscal,dx33);
1914 ty = _mm256_mul_pd(fscal,dy33);
1915 tz = _mm256_mul_pd(fscal,dz33);
1917 /* Update vectorial force */
1918 fix3 = _mm256_add_pd(fix3,tx);
1919 fiy3 = _mm256_add_pd(fiy3,ty);
1920 fiz3 = _mm256_add_pd(fiz3,tz);
1922 fjx3 = _mm256_add_pd(fjx3,tx);
1923 fjy3 = _mm256_add_pd(fjy3,ty);
1924 fjz3 = _mm256_add_pd(fjz3,tz);
1928 fjptrA = f+j_coord_offsetA;
1929 fjptrB = f+j_coord_offsetB;
1930 fjptrC = f+j_coord_offsetC;
1931 fjptrD = f+j_coord_offsetD;
1933 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1934 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1935 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1937 /* Inner loop uses 324 flops */
1940 if(jidx<j_index_end)
1943 /* Get j neighbor index, and coordinate index */
1944 jnrlistA = jjnr[jidx];
1945 jnrlistB = jjnr[jidx+1];
1946 jnrlistC = jjnr[jidx+2];
1947 jnrlistD = jjnr[jidx+3];
1948 /* Sign of each element will be negative for non-real atoms.
1949 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1950 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1952 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1954 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1955 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1956 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1958 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1959 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1960 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1961 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1962 j_coord_offsetA = DIM*jnrA;
1963 j_coord_offsetB = DIM*jnrB;
1964 j_coord_offsetC = DIM*jnrC;
1965 j_coord_offsetD = DIM*jnrD;
1967 /* load j atom coordinates */
1968 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1969 x+j_coord_offsetC,x+j_coord_offsetD,
1970 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1971 &jy2,&jz2,&jx3,&jy3,&jz3);
1973 /* Calculate displacement vector */
1974 dx00 = _mm256_sub_pd(ix0,jx0);
1975 dy00 = _mm256_sub_pd(iy0,jy0);
1976 dz00 = _mm256_sub_pd(iz0,jz0);
1977 dx11 = _mm256_sub_pd(ix1,jx1);
1978 dy11 = _mm256_sub_pd(iy1,jy1);
1979 dz11 = _mm256_sub_pd(iz1,jz1);
1980 dx12 = _mm256_sub_pd(ix1,jx2);
1981 dy12 = _mm256_sub_pd(iy1,jy2);
1982 dz12 = _mm256_sub_pd(iz1,jz2);
1983 dx13 = _mm256_sub_pd(ix1,jx3);
1984 dy13 = _mm256_sub_pd(iy1,jy3);
1985 dz13 = _mm256_sub_pd(iz1,jz3);
1986 dx21 = _mm256_sub_pd(ix2,jx1);
1987 dy21 = _mm256_sub_pd(iy2,jy1);
1988 dz21 = _mm256_sub_pd(iz2,jz1);
1989 dx22 = _mm256_sub_pd(ix2,jx2);
1990 dy22 = _mm256_sub_pd(iy2,jy2);
1991 dz22 = _mm256_sub_pd(iz2,jz2);
1992 dx23 = _mm256_sub_pd(ix2,jx3);
1993 dy23 = _mm256_sub_pd(iy2,jy3);
1994 dz23 = _mm256_sub_pd(iz2,jz3);
1995 dx31 = _mm256_sub_pd(ix3,jx1);
1996 dy31 = _mm256_sub_pd(iy3,jy1);
1997 dz31 = _mm256_sub_pd(iz3,jz1);
1998 dx32 = _mm256_sub_pd(ix3,jx2);
1999 dy32 = _mm256_sub_pd(iy3,jy2);
2000 dz32 = _mm256_sub_pd(iz3,jz2);
2001 dx33 = _mm256_sub_pd(ix3,jx3);
2002 dy33 = _mm256_sub_pd(iy3,jy3);
2003 dz33 = _mm256_sub_pd(iz3,jz3);
2005 /* Calculate squared distance and things based on it */
2006 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2007 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2008 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2009 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2010 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2011 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2012 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2013 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2014 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2015 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2017 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2018 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2019 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2020 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2021 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2022 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2023 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2024 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2025 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2026 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2028 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2029 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2030 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2031 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2032 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2033 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2034 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2035 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2036 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2038 fjx0 = _mm256_setzero_pd();
2039 fjy0 = _mm256_setzero_pd();
2040 fjz0 = _mm256_setzero_pd();
2041 fjx1 = _mm256_setzero_pd();
2042 fjy1 = _mm256_setzero_pd();
2043 fjz1 = _mm256_setzero_pd();
2044 fjx2 = _mm256_setzero_pd();
2045 fjy2 = _mm256_setzero_pd();
2046 fjz2 = _mm256_setzero_pd();
2047 fjx3 = _mm256_setzero_pd();
2048 fjy3 = _mm256_setzero_pd();
2049 fjz3 = _mm256_setzero_pd();
2051 /**************************
2052 * CALCULATE INTERACTIONS *
2053 **************************/
2055 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2058 r00 = _mm256_mul_pd(rsq00,rinv00);
2059 r00 = _mm256_andnot_pd(dummy_mask,r00);
2061 /* Calculate table index by multiplying r with table scale and truncate to integer */
2062 rt = _mm256_mul_pd(r00,vftabscale);
2063 vfitab = _mm256_cvttpd_epi32(rt);
2064 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2065 vfitab = _mm_slli_epi32(vfitab,3);
2067 /* CUBIC SPLINE TABLE DISPERSION */
2068 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2069 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2070 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2071 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2072 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2073 Heps = _mm256_mul_pd(vfeps,H);
2074 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2075 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2076 fvdw6 = _mm256_mul_pd(c6_00,FF);
2078 /* CUBIC SPLINE TABLE REPULSION */
2079 vfitab = _mm_add_epi32(vfitab,ifour);
2080 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2081 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2082 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2083 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2084 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2085 Heps = _mm256_mul_pd(vfeps,H);
2086 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2087 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2088 fvdw12 = _mm256_mul_pd(c12_00,FF);
2089 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2091 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2095 fscal = _mm256_and_pd(fscal,cutoff_mask);
2097 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2099 /* Calculate temporary vectorial force */
2100 tx = _mm256_mul_pd(fscal,dx00);
2101 ty = _mm256_mul_pd(fscal,dy00);
2102 tz = _mm256_mul_pd(fscal,dz00);
2104 /* Update vectorial force */
2105 fix0 = _mm256_add_pd(fix0,tx);
2106 fiy0 = _mm256_add_pd(fiy0,ty);
2107 fiz0 = _mm256_add_pd(fiz0,tz);
2109 fjx0 = _mm256_add_pd(fjx0,tx);
2110 fjy0 = _mm256_add_pd(fjy0,ty);
2111 fjz0 = _mm256_add_pd(fjz0,tz);
2115 /**************************
2116 * CALCULATE INTERACTIONS *
2117 **************************/
2119 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2122 /* REACTION-FIELD ELECTROSTATICS */
2123 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2125 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2129 fscal = _mm256_and_pd(fscal,cutoff_mask);
2131 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2133 /* Calculate temporary vectorial force */
2134 tx = _mm256_mul_pd(fscal,dx11);
2135 ty = _mm256_mul_pd(fscal,dy11);
2136 tz = _mm256_mul_pd(fscal,dz11);
2138 /* Update vectorial force */
2139 fix1 = _mm256_add_pd(fix1,tx);
2140 fiy1 = _mm256_add_pd(fiy1,ty);
2141 fiz1 = _mm256_add_pd(fiz1,tz);
2143 fjx1 = _mm256_add_pd(fjx1,tx);
2144 fjy1 = _mm256_add_pd(fjy1,ty);
2145 fjz1 = _mm256_add_pd(fjz1,tz);
2149 /**************************
2150 * CALCULATE INTERACTIONS *
2151 **************************/
2153 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2156 /* REACTION-FIELD ELECTROSTATICS */
2157 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2159 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2163 fscal = _mm256_and_pd(fscal,cutoff_mask);
2165 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2167 /* Calculate temporary vectorial force */
2168 tx = _mm256_mul_pd(fscal,dx12);
2169 ty = _mm256_mul_pd(fscal,dy12);
2170 tz = _mm256_mul_pd(fscal,dz12);
2172 /* Update vectorial force */
2173 fix1 = _mm256_add_pd(fix1,tx);
2174 fiy1 = _mm256_add_pd(fiy1,ty);
2175 fiz1 = _mm256_add_pd(fiz1,tz);
2177 fjx2 = _mm256_add_pd(fjx2,tx);
2178 fjy2 = _mm256_add_pd(fjy2,ty);
2179 fjz2 = _mm256_add_pd(fjz2,tz);
2183 /**************************
2184 * CALCULATE INTERACTIONS *
2185 **************************/
2187 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2190 /* REACTION-FIELD ELECTROSTATICS */
2191 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
2193 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2197 fscal = _mm256_and_pd(fscal,cutoff_mask);
2199 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2201 /* Calculate temporary vectorial force */
2202 tx = _mm256_mul_pd(fscal,dx13);
2203 ty = _mm256_mul_pd(fscal,dy13);
2204 tz = _mm256_mul_pd(fscal,dz13);
2206 /* Update vectorial force */
2207 fix1 = _mm256_add_pd(fix1,tx);
2208 fiy1 = _mm256_add_pd(fiy1,ty);
2209 fiz1 = _mm256_add_pd(fiz1,tz);
2211 fjx3 = _mm256_add_pd(fjx3,tx);
2212 fjy3 = _mm256_add_pd(fjy3,ty);
2213 fjz3 = _mm256_add_pd(fjz3,tz);
2217 /**************************
2218 * CALCULATE INTERACTIONS *
2219 **************************/
2221 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2224 /* REACTION-FIELD ELECTROSTATICS */
2225 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2227 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2231 fscal = _mm256_and_pd(fscal,cutoff_mask);
2233 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2235 /* Calculate temporary vectorial force */
2236 tx = _mm256_mul_pd(fscal,dx21);
2237 ty = _mm256_mul_pd(fscal,dy21);
2238 tz = _mm256_mul_pd(fscal,dz21);
2240 /* Update vectorial force */
2241 fix2 = _mm256_add_pd(fix2,tx);
2242 fiy2 = _mm256_add_pd(fiy2,ty);
2243 fiz2 = _mm256_add_pd(fiz2,tz);
2245 fjx1 = _mm256_add_pd(fjx1,tx);
2246 fjy1 = _mm256_add_pd(fjy1,ty);
2247 fjz1 = _mm256_add_pd(fjz1,tz);
2251 /**************************
2252 * CALCULATE INTERACTIONS *
2253 **************************/
2255 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2258 /* REACTION-FIELD ELECTROSTATICS */
2259 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2261 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2265 fscal = _mm256_and_pd(fscal,cutoff_mask);
2267 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2269 /* Calculate temporary vectorial force */
2270 tx = _mm256_mul_pd(fscal,dx22);
2271 ty = _mm256_mul_pd(fscal,dy22);
2272 tz = _mm256_mul_pd(fscal,dz22);
2274 /* Update vectorial force */
2275 fix2 = _mm256_add_pd(fix2,tx);
2276 fiy2 = _mm256_add_pd(fiy2,ty);
2277 fiz2 = _mm256_add_pd(fiz2,tz);
2279 fjx2 = _mm256_add_pd(fjx2,tx);
2280 fjy2 = _mm256_add_pd(fjy2,ty);
2281 fjz2 = _mm256_add_pd(fjz2,tz);
2285 /**************************
2286 * CALCULATE INTERACTIONS *
2287 **************************/
2289 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2292 /* REACTION-FIELD ELECTROSTATICS */
2293 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
2295 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2299 fscal = _mm256_and_pd(fscal,cutoff_mask);
2301 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2303 /* Calculate temporary vectorial force */
2304 tx = _mm256_mul_pd(fscal,dx23);
2305 ty = _mm256_mul_pd(fscal,dy23);
2306 tz = _mm256_mul_pd(fscal,dz23);
2308 /* Update vectorial force */
2309 fix2 = _mm256_add_pd(fix2,tx);
2310 fiy2 = _mm256_add_pd(fiy2,ty);
2311 fiz2 = _mm256_add_pd(fiz2,tz);
2313 fjx3 = _mm256_add_pd(fjx3,tx);
2314 fjy3 = _mm256_add_pd(fjy3,ty);
2315 fjz3 = _mm256_add_pd(fjz3,tz);
2319 /**************************
2320 * CALCULATE INTERACTIONS *
2321 **************************/
2323 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2326 /* REACTION-FIELD ELECTROSTATICS */
2327 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
2329 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2333 fscal = _mm256_and_pd(fscal,cutoff_mask);
2335 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2337 /* Calculate temporary vectorial force */
2338 tx = _mm256_mul_pd(fscal,dx31);
2339 ty = _mm256_mul_pd(fscal,dy31);
2340 tz = _mm256_mul_pd(fscal,dz31);
2342 /* Update vectorial force */
2343 fix3 = _mm256_add_pd(fix3,tx);
2344 fiy3 = _mm256_add_pd(fiy3,ty);
2345 fiz3 = _mm256_add_pd(fiz3,tz);
2347 fjx1 = _mm256_add_pd(fjx1,tx);
2348 fjy1 = _mm256_add_pd(fjy1,ty);
2349 fjz1 = _mm256_add_pd(fjz1,tz);
2353 /**************************
2354 * CALCULATE INTERACTIONS *
2355 **************************/
2357 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2360 /* REACTION-FIELD ELECTROSTATICS */
2361 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2363 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2367 fscal = _mm256_and_pd(fscal,cutoff_mask);
2369 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2371 /* Calculate temporary vectorial force */
2372 tx = _mm256_mul_pd(fscal,dx32);
2373 ty = _mm256_mul_pd(fscal,dy32);
2374 tz = _mm256_mul_pd(fscal,dz32);
2376 /* Update vectorial force */
2377 fix3 = _mm256_add_pd(fix3,tx);
2378 fiy3 = _mm256_add_pd(fiy3,ty);
2379 fiz3 = _mm256_add_pd(fiz3,tz);
2381 fjx2 = _mm256_add_pd(fjx2,tx);
2382 fjy2 = _mm256_add_pd(fjy2,ty);
2383 fjz2 = _mm256_add_pd(fjz2,tz);
2387 /**************************
2388 * CALCULATE INTERACTIONS *
2389 **************************/
2391 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2394 /* REACTION-FIELD ELECTROSTATICS */
2395 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2397 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2401 fscal = _mm256_and_pd(fscal,cutoff_mask);
2403 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2405 /* Calculate temporary vectorial force */
2406 tx = _mm256_mul_pd(fscal,dx33);
2407 ty = _mm256_mul_pd(fscal,dy33);
2408 tz = _mm256_mul_pd(fscal,dz33);
2410 /* Update vectorial force */
2411 fix3 = _mm256_add_pd(fix3,tx);
2412 fiy3 = _mm256_add_pd(fiy3,ty);
2413 fiz3 = _mm256_add_pd(fiz3,tz);
2415 fjx3 = _mm256_add_pd(fjx3,tx);
2416 fjy3 = _mm256_add_pd(fjy3,ty);
2417 fjz3 = _mm256_add_pd(fjz3,tz);
2421 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2422 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2423 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2424 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2426 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2427 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2428 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2430 /* Inner loop uses 325 flops */
2433 /* End of innermost loop */
2435 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2436 f+i_coord_offset,fshift+i_shift_offset);
2438 /* Increment number of inner iterations */
2439 inneriter += j_index_end - j_index_start;
2441 /* Outer loop uses 24 flops */
2444 /* Increment number of outer iterations */
2447 /* Update outer/inner flops */
2449 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*325);