2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
98 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
100 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
101 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
103 __m256d dummy_mask,cutoff_mask;
104 __m128 tmpmask0,tmpmask1;
105 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
106 __m256d one = _mm256_set1_pd(1.0);
107 __m256d two = _mm256_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm256_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
126 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
127 beta2 = _mm256_mul_pd(beta,beta);
128 beta3 = _mm256_mul_pd(beta,beta2);
130 ewtab = fr->ic->tabq_coul_FDV0;
131 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
132 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
134 /* Setup water-specific parameters */
135 inr = nlist->iinr[0];
136 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
137 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
138 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
139 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
141 jq0 = _mm256_set1_pd(charge[inr+0]);
142 jq1 = _mm256_set1_pd(charge[inr+1]);
143 jq2 = _mm256_set1_pd(charge[inr+2]);
144 vdwjidx0A = 2*vdwtype[inr+0];
145 qq00 = _mm256_mul_pd(iq0,jq0);
146 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
147 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
148 qq01 = _mm256_mul_pd(iq0,jq1);
149 qq02 = _mm256_mul_pd(iq0,jq2);
150 qq10 = _mm256_mul_pd(iq1,jq0);
151 qq11 = _mm256_mul_pd(iq1,jq1);
152 qq12 = _mm256_mul_pd(iq1,jq2);
153 qq20 = _mm256_mul_pd(iq2,jq0);
154 qq21 = _mm256_mul_pd(iq2,jq1);
155 qq22 = _mm256_mul_pd(iq2,jq2);
157 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
158 rcutoff_scalar = fr->rcoulomb;
159 rcutoff = _mm256_set1_pd(rcutoff_scalar);
160 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
162 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
163 rvdw = _mm256_set1_pd(fr->rvdw);
165 /* Avoid stupid compiler warnings */
166 jnrA = jnrB = jnrC = jnrD = 0;
175 for(iidx=0;iidx<4*DIM;iidx++)
180 /* Start outer loop over neighborlists */
181 for(iidx=0; iidx<nri; iidx++)
183 /* Load shift vector for this list */
184 i_shift_offset = DIM*shiftidx[iidx];
186 /* Load limits for loop over neighbors */
187 j_index_start = jindex[iidx];
188 j_index_end = jindex[iidx+1];
190 /* Get outer coordinate index */
192 i_coord_offset = DIM*inr;
194 /* Load i particle coords and add shift vector */
195 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
196 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
198 fix0 = _mm256_setzero_pd();
199 fiy0 = _mm256_setzero_pd();
200 fiz0 = _mm256_setzero_pd();
201 fix1 = _mm256_setzero_pd();
202 fiy1 = _mm256_setzero_pd();
203 fiz1 = _mm256_setzero_pd();
204 fix2 = _mm256_setzero_pd();
205 fiy2 = _mm256_setzero_pd();
206 fiz2 = _mm256_setzero_pd();
208 /* Reset potential sums */
209 velecsum = _mm256_setzero_pd();
210 vvdwsum = _mm256_setzero_pd();
212 /* Start inner kernel loop */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
216 /* Get j neighbor index, and coordinate index */
221 j_coord_offsetA = DIM*jnrA;
222 j_coord_offsetB = DIM*jnrB;
223 j_coord_offsetC = DIM*jnrC;
224 j_coord_offsetD = DIM*jnrD;
226 /* load j atom coordinates */
227 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
228 x+j_coord_offsetC,x+j_coord_offsetD,
229 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
231 /* Calculate displacement vector */
232 dx00 = _mm256_sub_pd(ix0,jx0);
233 dy00 = _mm256_sub_pd(iy0,jy0);
234 dz00 = _mm256_sub_pd(iz0,jz0);
235 dx01 = _mm256_sub_pd(ix0,jx1);
236 dy01 = _mm256_sub_pd(iy0,jy1);
237 dz01 = _mm256_sub_pd(iz0,jz1);
238 dx02 = _mm256_sub_pd(ix0,jx2);
239 dy02 = _mm256_sub_pd(iy0,jy2);
240 dz02 = _mm256_sub_pd(iz0,jz2);
241 dx10 = _mm256_sub_pd(ix1,jx0);
242 dy10 = _mm256_sub_pd(iy1,jy0);
243 dz10 = _mm256_sub_pd(iz1,jz0);
244 dx11 = _mm256_sub_pd(ix1,jx1);
245 dy11 = _mm256_sub_pd(iy1,jy1);
246 dz11 = _mm256_sub_pd(iz1,jz1);
247 dx12 = _mm256_sub_pd(ix1,jx2);
248 dy12 = _mm256_sub_pd(iy1,jy2);
249 dz12 = _mm256_sub_pd(iz1,jz2);
250 dx20 = _mm256_sub_pd(ix2,jx0);
251 dy20 = _mm256_sub_pd(iy2,jy0);
252 dz20 = _mm256_sub_pd(iz2,jz0);
253 dx21 = _mm256_sub_pd(ix2,jx1);
254 dy21 = _mm256_sub_pd(iy2,jy1);
255 dz21 = _mm256_sub_pd(iz2,jz1);
256 dx22 = _mm256_sub_pd(ix2,jx2);
257 dy22 = _mm256_sub_pd(iy2,jy2);
258 dz22 = _mm256_sub_pd(iz2,jz2);
260 /* Calculate squared distance and things based on it */
261 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
262 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
263 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
264 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
265 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
266 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
267 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
268 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
269 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
271 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
272 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
273 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
274 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
275 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
276 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
277 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
278 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
279 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
281 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
282 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
283 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
284 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
285 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
286 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
287 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
288 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
289 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
291 fjx0 = _mm256_setzero_pd();
292 fjy0 = _mm256_setzero_pd();
293 fjz0 = _mm256_setzero_pd();
294 fjx1 = _mm256_setzero_pd();
295 fjy1 = _mm256_setzero_pd();
296 fjz1 = _mm256_setzero_pd();
297 fjx2 = _mm256_setzero_pd();
298 fjy2 = _mm256_setzero_pd();
299 fjz2 = _mm256_setzero_pd();
301 /**************************
302 * CALCULATE INTERACTIONS *
303 **************************/
305 if (gmx_mm256_any_lt(rsq00,rcutoff2))
308 r00 = _mm256_mul_pd(rsq00,rinv00);
310 /* EWALD ELECTROSTATICS */
312 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
313 ewrt = _mm256_mul_pd(r00,ewtabscale);
314 ewitab = _mm256_cvttpd_epi32(ewrt);
315 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
316 ewitab = _mm_slli_epi32(ewitab,2);
317 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
318 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
319 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
320 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
321 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
322 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
323 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
324 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
325 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
327 /* LENNARD-JONES DISPERSION/REPULSION */
329 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
330 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
331 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
332 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
333 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
334 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
336 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
338 /* Update potential sum for this i atom from the interaction with this j atom. */
339 velec = _mm256_and_pd(velec,cutoff_mask);
340 velecsum = _mm256_add_pd(velecsum,velec);
341 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
342 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
344 fscal = _mm256_add_pd(felec,fvdw);
346 fscal = _mm256_and_pd(fscal,cutoff_mask);
348 /* Calculate temporary vectorial force */
349 tx = _mm256_mul_pd(fscal,dx00);
350 ty = _mm256_mul_pd(fscal,dy00);
351 tz = _mm256_mul_pd(fscal,dz00);
353 /* Update vectorial force */
354 fix0 = _mm256_add_pd(fix0,tx);
355 fiy0 = _mm256_add_pd(fiy0,ty);
356 fiz0 = _mm256_add_pd(fiz0,tz);
358 fjx0 = _mm256_add_pd(fjx0,tx);
359 fjy0 = _mm256_add_pd(fjy0,ty);
360 fjz0 = _mm256_add_pd(fjz0,tz);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
368 if (gmx_mm256_any_lt(rsq01,rcutoff2))
371 r01 = _mm256_mul_pd(rsq01,rinv01);
373 /* EWALD ELECTROSTATICS */
375 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
376 ewrt = _mm256_mul_pd(r01,ewtabscale);
377 ewitab = _mm256_cvttpd_epi32(ewrt);
378 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
379 ewitab = _mm_slli_epi32(ewitab,2);
380 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
381 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
382 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
383 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
384 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
385 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
386 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
387 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
388 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
390 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
392 /* Update potential sum for this i atom from the interaction with this j atom. */
393 velec = _mm256_and_pd(velec,cutoff_mask);
394 velecsum = _mm256_add_pd(velecsum,velec);
398 fscal = _mm256_and_pd(fscal,cutoff_mask);
400 /* Calculate temporary vectorial force */
401 tx = _mm256_mul_pd(fscal,dx01);
402 ty = _mm256_mul_pd(fscal,dy01);
403 tz = _mm256_mul_pd(fscal,dz01);
405 /* Update vectorial force */
406 fix0 = _mm256_add_pd(fix0,tx);
407 fiy0 = _mm256_add_pd(fiy0,ty);
408 fiz0 = _mm256_add_pd(fiz0,tz);
410 fjx1 = _mm256_add_pd(fjx1,tx);
411 fjy1 = _mm256_add_pd(fjy1,ty);
412 fjz1 = _mm256_add_pd(fjz1,tz);
416 /**************************
417 * CALCULATE INTERACTIONS *
418 **************************/
420 if (gmx_mm256_any_lt(rsq02,rcutoff2))
423 r02 = _mm256_mul_pd(rsq02,rinv02);
425 /* EWALD ELECTROSTATICS */
427 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
428 ewrt = _mm256_mul_pd(r02,ewtabscale);
429 ewitab = _mm256_cvttpd_epi32(ewrt);
430 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
431 ewitab = _mm_slli_epi32(ewitab,2);
432 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
433 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
434 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
435 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
436 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
437 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
438 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
439 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
440 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
442 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
444 /* Update potential sum for this i atom from the interaction with this j atom. */
445 velec = _mm256_and_pd(velec,cutoff_mask);
446 velecsum = _mm256_add_pd(velecsum,velec);
450 fscal = _mm256_and_pd(fscal,cutoff_mask);
452 /* Calculate temporary vectorial force */
453 tx = _mm256_mul_pd(fscal,dx02);
454 ty = _mm256_mul_pd(fscal,dy02);
455 tz = _mm256_mul_pd(fscal,dz02);
457 /* Update vectorial force */
458 fix0 = _mm256_add_pd(fix0,tx);
459 fiy0 = _mm256_add_pd(fiy0,ty);
460 fiz0 = _mm256_add_pd(fiz0,tz);
462 fjx2 = _mm256_add_pd(fjx2,tx);
463 fjy2 = _mm256_add_pd(fjy2,ty);
464 fjz2 = _mm256_add_pd(fjz2,tz);
468 /**************************
469 * CALCULATE INTERACTIONS *
470 **************************/
472 if (gmx_mm256_any_lt(rsq10,rcutoff2))
475 r10 = _mm256_mul_pd(rsq10,rinv10);
477 /* EWALD ELECTROSTATICS */
479 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
480 ewrt = _mm256_mul_pd(r10,ewtabscale);
481 ewitab = _mm256_cvttpd_epi32(ewrt);
482 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
483 ewitab = _mm_slli_epi32(ewitab,2);
484 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
485 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
486 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
487 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
488 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
489 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
490 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
491 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
492 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
494 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velec = _mm256_and_pd(velec,cutoff_mask);
498 velecsum = _mm256_add_pd(velecsum,velec);
502 fscal = _mm256_and_pd(fscal,cutoff_mask);
504 /* Calculate temporary vectorial force */
505 tx = _mm256_mul_pd(fscal,dx10);
506 ty = _mm256_mul_pd(fscal,dy10);
507 tz = _mm256_mul_pd(fscal,dz10);
509 /* Update vectorial force */
510 fix1 = _mm256_add_pd(fix1,tx);
511 fiy1 = _mm256_add_pd(fiy1,ty);
512 fiz1 = _mm256_add_pd(fiz1,tz);
514 fjx0 = _mm256_add_pd(fjx0,tx);
515 fjy0 = _mm256_add_pd(fjy0,ty);
516 fjz0 = _mm256_add_pd(fjz0,tz);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 if (gmx_mm256_any_lt(rsq11,rcutoff2))
527 r11 = _mm256_mul_pd(rsq11,rinv11);
529 /* EWALD ELECTROSTATICS */
531 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
532 ewrt = _mm256_mul_pd(r11,ewtabscale);
533 ewitab = _mm256_cvttpd_epi32(ewrt);
534 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
535 ewitab = _mm_slli_epi32(ewitab,2);
536 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
537 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
538 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
539 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
540 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
541 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
542 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
543 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
544 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
546 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velec = _mm256_and_pd(velec,cutoff_mask);
550 velecsum = _mm256_add_pd(velecsum,velec);
554 fscal = _mm256_and_pd(fscal,cutoff_mask);
556 /* Calculate temporary vectorial force */
557 tx = _mm256_mul_pd(fscal,dx11);
558 ty = _mm256_mul_pd(fscal,dy11);
559 tz = _mm256_mul_pd(fscal,dz11);
561 /* Update vectorial force */
562 fix1 = _mm256_add_pd(fix1,tx);
563 fiy1 = _mm256_add_pd(fiy1,ty);
564 fiz1 = _mm256_add_pd(fiz1,tz);
566 fjx1 = _mm256_add_pd(fjx1,tx);
567 fjy1 = _mm256_add_pd(fjy1,ty);
568 fjz1 = _mm256_add_pd(fjz1,tz);
572 /**************************
573 * CALCULATE INTERACTIONS *
574 **************************/
576 if (gmx_mm256_any_lt(rsq12,rcutoff2))
579 r12 = _mm256_mul_pd(rsq12,rinv12);
581 /* EWALD ELECTROSTATICS */
583 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
584 ewrt = _mm256_mul_pd(r12,ewtabscale);
585 ewitab = _mm256_cvttpd_epi32(ewrt);
586 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
587 ewitab = _mm_slli_epi32(ewitab,2);
588 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
589 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
590 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
591 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
592 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
593 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
594 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
595 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
596 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
598 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
600 /* Update potential sum for this i atom from the interaction with this j atom. */
601 velec = _mm256_and_pd(velec,cutoff_mask);
602 velecsum = _mm256_add_pd(velecsum,velec);
606 fscal = _mm256_and_pd(fscal,cutoff_mask);
608 /* Calculate temporary vectorial force */
609 tx = _mm256_mul_pd(fscal,dx12);
610 ty = _mm256_mul_pd(fscal,dy12);
611 tz = _mm256_mul_pd(fscal,dz12);
613 /* Update vectorial force */
614 fix1 = _mm256_add_pd(fix1,tx);
615 fiy1 = _mm256_add_pd(fiy1,ty);
616 fiz1 = _mm256_add_pd(fiz1,tz);
618 fjx2 = _mm256_add_pd(fjx2,tx);
619 fjy2 = _mm256_add_pd(fjy2,ty);
620 fjz2 = _mm256_add_pd(fjz2,tz);
624 /**************************
625 * CALCULATE INTERACTIONS *
626 **************************/
628 if (gmx_mm256_any_lt(rsq20,rcutoff2))
631 r20 = _mm256_mul_pd(rsq20,rinv20);
633 /* EWALD ELECTROSTATICS */
635 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
636 ewrt = _mm256_mul_pd(r20,ewtabscale);
637 ewitab = _mm256_cvttpd_epi32(ewrt);
638 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
639 ewitab = _mm_slli_epi32(ewitab,2);
640 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
641 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
642 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
643 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
644 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
645 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
646 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
647 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
648 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
650 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velec = _mm256_and_pd(velec,cutoff_mask);
654 velecsum = _mm256_add_pd(velecsum,velec);
658 fscal = _mm256_and_pd(fscal,cutoff_mask);
660 /* Calculate temporary vectorial force */
661 tx = _mm256_mul_pd(fscal,dx20);
662 ty = _mm256_mul_pd(fscal,dy20);
663 tz = _mm256_mul_pd(fscal,dz20);
665 /* Update vectorial force */
666 fix2 = _mm256_add_pd(fix2,tx);
667 fiy2 = _mm256_add_pd(fiy2,ty);
668 fiz2 = _mm256_add_pd(fiz2,tz);
670 fjx0 = _mm256_add_pd(fjx0,tx);
671 fjy0 = _mm256_add_pd(fjy0,ty);
672 fjz0 = _mm256_add_pd(fjz0,tz);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 if (gmx_mm256_any_lt(rsq21,rcutoff2))
683 r21 = _mm256_mul_pd(rsq21,rinv21);
685 /* EWALD ELECTROSTATICS */
687 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
688 ewrt = _mm256_mul_pd(r21,ewtabscale);
689 ewitab = _mm256_cvttpd_epi32(ewrt);
690 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
691 ewitab = _mm_slli_epi32(ewitab,2);
692 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
693 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
694 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
695 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
696 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
697 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
698 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
699 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
700 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
702 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
704 /* Update potential sum for this i atom from the interaction with this j atom. */
705 velec = _mm256_and_pd(velec,cutoff_mask);
706 velecsum = _mm256_add_pd(velecsum,velec);
710 fscal = _mm256_and_pd(fscal,cutoff_mask);
712 /* Calculate temporary vectorial force */
713 tx = _mm256_mul_pd(fscal,dx21);
714 ty = _mm256_mul_pd(fscal,dy21);
715 tz = _mm256_mul_pd(fscal,dz21);
717 /* Update vectorial force */
718 fix2 = _mm256_add_pd(fix2,tx);
719 fiy2 = _mm256_add_pd(fiy2,ty);
720 fiz2 = _mm256_add_pd(fiz2,tz);
722 fjx1 = _mm256_add_pd(fjx1,tx);
723 fjy1 = _mm256_add_pd(fjy1,ty);
724 fjz1 = _mm256_add_pd(fjz1,tz);
728 /**************************
729 * CALCULATE INTERACTIONS *
730 **************************/
732 if (gmx_mm256_any_lt(rsq22,rcutoff2))
735 r22 = _mm256_mul_pd(rsq22,rinv22);
737 /* EWALD ELECTROSTATICS */
739 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
740 ewrt = _mm256_mul_pd(r22,ewtabscale);
741 ewitab = _mm256_cvttpd_epi32(ewrt);
742 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
743 ewitab = _mm_slli_epi32(ewitab,2);
744 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
745 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
746 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
747 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
748 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
749 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
750 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
751 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
752 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
754 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
756 /* Update potential sum for this i atom from the interaction with this j atom. */
757 velec = _mm256_and_pd(velec,cutoff_mask);
758 velecsum = _mm256_add_pd(velecsum,velec);
762 fscal = _mm256_and_pd(fscal,cutoff_mask);
764 /* Calculate temporary vectorial force */
765 tx = _mm256_mul_pd(fscal,dx22);
766 ty = _mm256_mul_pd(fscal,dy22);
767 tz = _mm256_mul_pd(fscal,dz22);
769 /* Update vectorial force */
770 fix2 = _mm256_add_pd(fix2,tx);
771 fiy2 = _mm256_add_pd(fiy2,ty);
772 fiz2 = _mm256_add_pd(fiz2,tz);
774 fjx2 = _mm256_add_pd(fjx2,tx);
775 fjy2 = _mm256_add_pd(fjy2,ty);
776 fjz2 = _mm256_add_pd(fjz2,tz);
780 fjptrA = f+j_coord_offsetA;
781 fjptrB = f+j_coord_offsetB;
782 fjptrC = f+j_coord_offsetC;
783 fjptrD = f+j_coord_offsetD;
785 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
786 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
788 /* Inner loop uses 432 flops */
794 /* Get j neighbor index, and coordinate index */
795 jnrlistA = jjnr[jidx];
796 jnrlistB = jjnr[jidx+1];
797 jnrlistC = jjnr[jidx+2];
798 jnrlistD = jjnr[jidx+3];
799 /* Sign of each element will be negative for non-real atoms.
800 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
801 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
803 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
805 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
806 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
807 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
809 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
810 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
811 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
812 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
813 j_coord_offsetA = DIM*jnrA;
814 j_coord_offsetB = DIM*jnrB;
815 j_coord_offsetC = DIM*jnrC;
816 j_coord_offsetD = DIM*jnrD;
818 /* load j atom coordinates */
819 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
820 x+j_coord_offsetC,x+j_coord_offsetD,
821 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
823 /* Calculate displacement vector */
824 dx00 = _mm256_sub_pd(ix0,jx0);
825 dy00 = _mm256_sub_pd(iy0,jy0);
826 dz00 = _mm256_sub_pd(iz0,jz0);
827 dx01 = _mm256_sub_pd(ix0,jx1);
828 dy01 = _mm256_sub_pd(iy0,jy1);
829 dz01 = _mm256_sub_pd(iz0,jz1);
830 dx02 = _mm256_sub_pd(ix0,jx2);
831 dy02 = _mm256_sub_pd(iy0,jy2);
832 dz02 = _mm256_sub_pd(iz0,jz2);
833 dx10 = _mm256_sub_pd(ix1,jx0);
834 dy10 = _mm256_sub_pd(iy1,jy0);
835 dz10 = _mm256_sub_pd(iz1,jz0);
836 dx11 = _mm256_sub_pd(ix1,jx1);
837 dy11 = _mm256_sub_pd(iy1,jy1);
838 dz11 = _mm256_sub_pd(iz1,jz1);
839 dx12 = _mm256_sub_pd(ix1,jx2);
840 dy12 = _mm256_sub_pd(iy1,jy2);
841 dz12 = _mm256_sub_pd(iz1,jz2);
842 dx20 = _mm256_sub_pd(ix2,jx0);
843 dy20 = _mm256_sub_pd(iy2,jy0);
844 dz20 = _mm256_sub_pd(iz2,jz0);
845 dx21 = _mm256_sub_pd(ix2,jx1);
846 dy21 = _mm256_sub_pd(iy2,jy1);
847 dz21 = _mm256_sub_pd(iz2,jz1);
848 dx22 = _mm256_sub_pd(ix2,jx2);
849 dy22 = _mm256_sub_pd(iy2,jy2);
850 dz22 = _mm256_sub_pd(iz2,jz2);
852 /* Calculate squared distance and things based on it */
853 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
854 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
855 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
856 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
857 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
858 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
859 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
860 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
861 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
863 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
864 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
865 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
866 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
867 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
868 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
869 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
870 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
871 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
873 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
874 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
875 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
876 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
877 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
878 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
879 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
880 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
881 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
883 fjx0 = _mm256_setzero_pd();
884 fjy0 = _mm256_setzero_pd();
885 fjz0 = _mm256_setzero_pd();
886 fjx1 = _mm256_setzero_pd();
887 fjy1 = _mm256_setzero_pd();
888 fjz1 = _mm256_setzero_pd();
889 fjx2 = _mm256_setzero_pd();
890 fjy2 = _mm256_setzero_pd();
891 fjz2 = _mm256_setzero_pd();
893 /**************************
894 * CALCULATE INTERACTIONS *
895 **************************/
897 if (gmx_mm256_any_lt(rsq00,rcutoff2))
900 r00 = _mm256_mul_pd(rsq00,rinv00);
901 r00 = _mm256_andnot_pd(dummy_mask,r00);
903 /* EWALD ELECTROSTATICS */
905 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
906 ewrt = _mm256_mul_pd(r00,ewtabscale);
907 ewitab = _mm256_cvttpd_epi32(ewrt);
908 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
909 ewitab = _mm_slli_epi32(ewitab,2);
910 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
911 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
912 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
913 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
914 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
915 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
916 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
917 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
918 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
920 /* LENNARD-JONES DISPERSION/REPULSION */
922 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
923 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
924 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
925 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
926 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
927 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
929 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
931 /* Update potential sum for this i atom from the interaction with this j atom. */
932 velec = _mm256_and_pd(velec,cutoff_mask);
933 velec = _mm256_andnot_pd(dummy_mask,velec);
934 velecsum = _mm256_add_pd(velecsum,velec);
935 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
936 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
937 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
939 fscal = _mm256_add_pd(felec,fvdw);
941 fscal = _mm256_and_pd(fscal,cutoff_mask);
943 fscal = _mm256_andnot_pd(dummy_mask,fscal);
945 /* Calculate temporary vectorial force */
946 tx = _mm256_mul_pd(fscal,dx00);
947 ty = _mm256_mul_pd(fscal,dy00);
948 tz = _mm256_mul_pd(fscal,dz00);
950 /* Update vectorial force */
951 fix0 = _mm256_add_pd(fix0,tx);
952 fiy0 = _mm256_add_pd(fiy0,ty);
953 fiz0 = _mm256_add_pd(fiz0,tz);
955 fjx0 = _mm256_add_pd(fjx0,tx);
956 fjy0 = _mm256_add_pd(fjy0,ty);
957 fjz0 = _mm256_add_pd(fjz0,tz);
961 /**************************
962 * CALCULATE INTERACTIONS *
963 **************************/
965 if (gmx_mm256_any_lt(rsq01,rcutoff2))
968 r01 = _mm256_mul_pd(rsq01,rinv01);
969 r01 = _mm256_andnot_pd(dummy_mask,r01);
971 /* EWALD ELECTROSTATICS */
973 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
974 ewrt = _mm256_mul_pd(r01,ewtabscale);
975 ewitab = _mm256_cvttpd_epi32(ewrt);
976 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
977 ewitab = _mm_slli_epi32(ewitab,2);
978 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
979 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
980 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
981 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
982 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
983 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
984 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
985 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
986 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
988 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
990 /* Update potential sum for this i atom from the interaction with this j atom. */
991 velec = _mm256_and_pd(velec,cutoff_mask);
992 velec = _mm256_andnot_pd(dummy_mask,velec);
993 velecsum = _mm256_add_pd(velecsum,velec);
997 fscal = _mm256_and_pd(fscal,cutoff_mask);
999 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1001 /* Calculate temporary vectorial force */
1002 tx = _mm256_mul_pd(fscal,dx01);
1003 ty = _mm256_mul_pd(fscal,dy01);
1004 tz = _mm256_mul_pd(fscal,dz01);
1006 /* Update vectorial force */
1007 fix0 = _mm256_add_pd(fix0,tx);
1008 fiy0 = _mm256_add_pd(fiy0,ty);
1009 fiz0 = _mm256_add_pd(fiz0,tz);
1011 fjx1 = _mm256_add_pd(fjx1,tx);
1012 fjy1 = _mm256_add_pd(fjy1,ty);
1013 fjz1 = _mm256_add_pd(fjz1,tz);
1017 /**************************
1018 * CALCULATE INTERACTIONS *
1019 **************************/
1021 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1024 r02 = _mm256_mul_pd(rsq02,rinv02);
1025 r02 = _mm256_andnot_pd(dummy_mask,r02);
1027 /* EWALD ELECTROSTATICS */
1029 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1030 ewrt = _mm256_mul_pd(r02,ewtabscale);
1031 ewitab = _mm256_cvttpd_epi32(ewrt);
1032 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1033 ewitab = _mm_slli_epi32(ewitab,2);
1034 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1035 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1036 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1037 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1038 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1039 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1040 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1041 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
1042 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1044 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1046 /* Update potential sum for this i atom from the interaction with this j atom. */
1047 velec = _mm256_and_pd(velec,cutoff_mask);
1048 velec = _mm256_andnot_pd(dummy_mask,velec);
1049 velecsum = _mm256_add_pd(velecsum,velec);
1053 fscal = _mm256_and_pd(fscal,cutoff_mask);
1055 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1057 /* Calculate temporary vectorial force */
1058 tx = _mm256_mul_pd(fscal,dx02);
1059 ty = _mm256_mul_pd(fscal,dy02);
1060 tz = _mm256_mul_pd(fscal,dz02);
1062 /* Update vectorial force */
1063 fix0 = _mm256_add_pd(fix0,tx);
1064 fiy0 = _mm256_add_pd(fiy0,ty);
1065 fiz0 = _mm256_add_pd(fiz0,tz);
1067 fjx2 = _mm256_add_pd(fjx2,tx);
1068 fjy2 = _mm256_add_pd(fjy2,ty);
1069 fjz2 = _mm256_add_pd(fjz2,tz);
1073 /**************************
1074 * CALCULATE INTERACTIONS *
1075 **************************/
1077 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1080 r10 = _mm256_mul_pd(rsq10,rinv10);
1081 r10 = _mm256_andnot_pd(dummy_mask,r10);
1083 /* EWALD ELECTROSTATICS */
1085 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1086 ewrt = _mm256_mul_pd(r10,ewtabscale);
1087 ewitab = _mm256_cvttpd_epi32(ewrt);
1088 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1089 ewitab = _mm_slli_epi32(ewitab,2);
1090 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1091 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1092 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1093 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1094 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1095 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1096 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1097 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
1098 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1100 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1102 /* Update potential sum for this i atom from the interaction with this j atom. */
1103 velec = _mm256_and_pd(velec,cutoff_mask);
1104 velec = _mm256_andnot_pd(dummy_mask,velec);
1105 velecsum = _mm256_add_pd(velecsum,velec);
1109 fscal = _mm256_and_pd(fscal,cutoff_mask);
1111 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1113 /* Calculate temporary vectorial force */
1114 tx = _mm256_mul_pd(fscal,dx10);
1115 ty = _mm256_mul_pd(fscal,dy10);
1116 tz = _mm256_mul_pd(fscal,dz10);
1118 /* Update vectorial force */
1119 fix1 = _mm256_add_pd(fix1,tx);
1120 fiy1 = _mm256_add_pd(fiy1,ty);
1121 fiz1 = _mm256_add_pd(fiz1,tz);
1123 fjx0 = _mm256_add_pd(fjx0,tx);
1124 fjy0 = _mm256_add_pd(fjy0,ty);
1125 fjz0 = _mm256_add_pd(fjz0,tz);
1129 /**************************
1130 * CALCULATE INTERACTIONS *
1131 **************************/
1133 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1136 r11 = _mm256_mul_pd(rsq11,rinv11);
1137 r11 = _mm256_andnot_pd(dummy_mask,r11);
1139 /* EWALD ELECTROSTATICS */
1141 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1142 ewrt = _mm256_mul_pd(r11,ewtabscale);
1143 ewitab = _mm256_cvttpd_epi32(ewrt);
1144 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1145 ewitab = _mm_slli_epi32(ewitab,2);
1146 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1147 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1148 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1149 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1150 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1151 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1152 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1153 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1154 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1156 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1158 /* Update potential sum for this i atom from the interaction with this j atom. */
1159 velec = _mm256_and_pd(velec,cutoff_mask);
1160 velec = _mm256_andnot_pd(dummy_mask,velec);
1161 velecsum = _mm256_add_pd(velecsum,velec);
1165 fscal = _mm256_and_pd(fscal,cutoff_mask);
1167 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1169 /* Calculate temporary vectorial force */
1170 tx = _mm256_mul_pd(fscal,dx11);
1171 ty = _mm256_mul_pd(fscal,dy11);
1172 tz = _mm256_mul_pd(fscal,dz11);
1174 /* Update vectorial force */
1175 fix1 = _mm256_add_pd(fix1,tx);
1176 fiy1 = _mm256_add_pd(fiy1,ty);
1177 fiz1 = _mm256_add_pd(fiz1,tz);
1179 fjx1 = _mm256_add_pd(fjx1,tx);
1180 fjy1 = _mm256_add_pd(fjy1,ty);
1181 fjz1 = _mm256_add_pd(fjz1,tz);
1185 /**************************
1186 * CALCULATE INTERACTIONS *
1187 **************************/
1189 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1192 r12 = _mm256_mul_pd(rsq12,rinv12);
1193 r12 = _mm256_andnot_pd(dummy_mask,r12);
1195 /* EWALD ELECTROSTATICS */
1197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1198 ewrt = _mm256_mul_pd(r12,ewtabscale);
1199 ewitab = _mm256_cvttpd_epi32(ewrt);
1200 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1201 ewitab = _mm_slli_epi32(ewitab,2);
1202 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1203 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1204 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1205 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1206 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1207 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1208 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1209 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1210 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1212 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1214 /* Update potential sum for this i atom from the interaction with this j atom. */
1215 velec = _mm256_and_pd(velec,cutoff_mask);
1216 velec = _mm256_andnot_pd(dummy_mask,velec);
1217 velecsum = _mm256_add_pd(velecsum,velec);
1221 fscal = _mm256_and_pd(fscal,cutoff_mask);
1223 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1225 /* Calculate temporary vectorial force */
1226 tx = _mm256_mul_pd(fscal,dx12);
1227 ty = _mm256_mul_pd(fscal,dy12);
1228 tz = _mm256_mul_pd(fscal,dz12);
1230 /* Update vectorial force */
1231 fix1 = _mm256_add_pd(fix1,tx);
1232 fiy1 = _mm256_add_pd(fiy1,ty);
1233 fiz1 = _mm256_add_pd(fiz1,tz);
1235 fjx2 = _mm256_add_pd(fjx2,tx);
1236 fjy2 = _mm256_add_pd(fjy2,ty);
1237 fjz2 = _mm256_add_pd(fjz2,tz);
1241 /**************************
1242 * CALCULATE INTERACTIONS *
1243 **************************/
1245 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1248 r20 = _mm256_mul_pd(rsq20,rinv20);
1249 r20 = _mm256_andnot_pd(dummy_mask,r20);
1251 /* EWALD ELECTROSTATICS */
1253 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1254 ewrt = _mm256_mul_pd(r20,ewtabscale);
1255 ewitab = _mm256_cvttpd_epi32(ewrt);
1256 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1257 ewitab = _mm_slli_epi32(ewitab,2);
1258 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1259 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1260 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1261 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1262 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1263 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1264 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1265 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
1266 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1268 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1270 /* Update potential sum for this i atom from the interaction with this j atom. */
1271 velec = _mm256_and_pd(velec,cutoff_mask);
1272 velec = _mm256_andnot_pd(dummy_mask,velec);
1273 velecsum = _mm256_add_pd(velecsum,velec);
1277 fscal = _mm256_and_pd(fscal,cutoff_mask);
1279 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1281 /* Calculate temporary vectorial force */
1282 tx = _mm256_mul_pd(fscal,dx20);
1283 ty = _mm256_mul_pd(fscal,dy20);
1284 tz = _mm256_mul_pd(fscal,dz20);
1286 /* Update vectorial force */
1287 fix2 = _mm256_add_pd(fix2,tx);
1288 fiy2 = _mm256_add_pd(fiy2,ty);
1289 fiz2 = _mm256_add_pd(fiz2,tz);
1291 fjx0 = _mm256_add_pd(fjx0,tx);
1292 fjy0 = _mm256_add_pd(fjy0,ty);
1293 fjz0 = _mm256_add_pd(fjz0,tz);
1297 /**************************
1298 * CALCULATE INTERACTIONS *
1299 **************************/
1301 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1304 r21 = _mm256_mul_pd(rsq21,rinv21);
1305 r21 = _mm256_andnot_pd(dummy_mask,r21);
1307 /* EWALD ELECTROSTATICS */
1309 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1310 ewrt = _mm256_mul_pd(r21,ewtabscale);
1311 ewitab = _mm256_cvttpd_epi32(ewrt);
1312 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1313 ewitab = _mm_slli_epi32(ewitab,2);
1314 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1315 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1316 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1317 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1318 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1319 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1320 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1321 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1322 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1324 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1326 /* Update potential sum for this i atom from the interaction with this j atom. */
1327 velec = _mm256_and_pd(velec,cutoff_mask);
1328 velec = _mm256_andnot_pd(dummy_mask,velec);
1329 velecsum = _mm256_add_pd(velecsum,velec);
1333 fscal = _mm256_and_pd(fscal,cutoff_mask);
1335 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1337 /* Calculate temporary vectorial force */
1338 tx = _mm256_mul_pd(fscal,dx21);
1339 ty = _mm256_mul_pd(fscal,dy21);
1340 tz = _mm256_mul_pd(fscal,dz21);
1342 /* Update vectorial force */
1343 fix2 = _mm256_add_pd(fix2,tx);
1344 fiy2 = _mm256_add_pd(fiy2,ty);
1345 fiz2 = _mm256_add_pd(fiz2,tz);
1347 fjx1 = _mm256_add_pd(fjx1,tx);
1348 fjy1 = _mm256_add_pd(fjy1,ty);
1349 fjz1 = _mm256_add_pd(fjz1,tz);
1353 /**************************
1354 * CALCULATE INTERACTIONS *
1355 **************************/
1357 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1360 r22 = _mm256_mul_pd(rsq22,rinv22);
1361 r22 = _mm256_andnot_pd(dummy_mask,r22);
1363 /* EWALD ELECTROSTATICS */
1365 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1366 ewrt = _mm256_mul_pd(r22,ewtabscale);
1367 ewitab = _mm256_cvttpd_epi32(ewrt);
1368 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1369 ewitab = _mm_slli_epi32(ewitab,2);
1370 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1371 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1372 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1373 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1374 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1375 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1376 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1377 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1378 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1380 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1382 /* Update potential sum for this i atom from the interaction with this j atom. */
1383 velec = _mm256_and_pd(velec,cutoff_mask);
1384 velec = _mm256_andnot_pd(dummy_mask,velec);
1385 velecsum = _mm256_add_pd(velecsum,velec);
1389 fscal = _mm256_and_pd(fscal,cutoff_mask);
1391 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1393 /* Calculate temporary vectorial force */
1394 tx = _mm256_mul_pd(fscal,dx22);
1395 ty = _mm256_mul_pd(fscal,dy22);
1396 tz = _mm256_mul_pd(fscal,dz22);
1398 /* Update vectorial force */
1399 fix2 = _mm256_add_pd(fix2,tx);
1400 fiy2 = _mm256_add_pd(fiy2,ty);
1401 fiz2 = _mm256_add_pd(fiz2,tz);
1403 fjx2 = _mm256_add_pd(fjx2,tx);
1404 fjy2 = _mm256_add_pd(fjy2,ty);
1405 fjz2 = _mm256_add_pd(fjz2,tz);
1409 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1410 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1411 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1412 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1414 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1415 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1417 /* Inner loop uses 441 flops */
1420 /* End of innermost loop */
1422 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1423 f+i_coord_offset,fshift+i_shift_offset);
1426 /* Update potential energies */
1427 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1428 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1430 /* Increment number of inner iterations */
1431 inneriter += j_index_end - j_index_start;
1433 /* Outer loop uses 20 flops */
1436 /* Increment number of outer iterations */
1439 /* Update outer/inner flops */
1441 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1444 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1445 * Electrostatics interaction: Ewald
1446 * VdW interaction: LennardJones
1447 * Geometry: Water3-Water3
1448 * Calculate force/pot: Force
1451 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_double
1452 (t_nblist * gmx_restrict nlist,
1453 rvec * gmx_restrict xx,
1454 rvec * gmx_restrict ff,
1455 t_forcerec * gmx_restrict fr,
1456 t_mdatoms * gmx_restrict mdatoms,
1457 nb_kernel_data_t * gmx_restrict kernel_data,
1458 t_nrnb * gmx_restrict nrnb)
1460 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1461 * just 0 for non-waters.
1462 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1463 * jnr indices corresponding to data put in the four positions in the SIMD register.
1465 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1466 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1467 int jnrA,jnrB,jnrC,jnrD;
1468 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1469 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1470 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1471 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1472 real rcutoff_scalar;
1473 real *shiftvec,*fshift,*x,*f;
1474 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1475 real scratch[4*DIM];
1476 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1477 real * vdwioffsetptr0;
1478 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1479 real * vdwioffsetptr1;
1480 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1481 real * vdwioffsetptr2;
1482 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1483 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1484 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1485 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1486 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1487 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1488 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1489 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1490 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1491 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1492 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1493 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1494 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1495 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1496 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1497 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1498 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1501 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1504 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1505 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1507 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1508 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1510 __m256d dummy_mask,cutoff_mask;
1511 __m128 tmpmask0,tmpmask1;
1512 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1513 __m256d one = _mm256_set1_pd(1.0);
1514 __m256d two = _mm256_set1_pd(2.0);
1520 jindex = nlist->jindex;
1522 shiftidx = nlist->shift;
1524 shiftvec = fr->shift_vec[0];
1525 fshift = fr->fshift[0];
1526 facel = _mm256_set1_pd(fr->epsfac);
1527 charge = mdatoms->chargeA;
1528 nvdwtype = fr->ntype;
1529 vdwparam = fr->nbfp;
1530 vdwtype = mdatoms->typeA;
1532 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1533 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1534 beta2 = _mm256_mul_pd(beta,beta);
1535 beta3 = _mm256_mul_pd(beta,beta2);
1537 ewtab = fr->ic->tabq_coul_F;
1538 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1539 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1541 /* Setup water-specific parameters */
1542 inr = nlist->iinr[0];
1543 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1544 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1545 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1546 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1548 jq0 = _mm256_set1_pd(charge[inr+0]);
1549 jq1 = _mm256_set1_pd(charge[inr+1]);
1550 jq2 = _mm256_set1_pd(charge[inr+2]);
1551 vdwjidx0A = 2*vdwtype[inr+0];
1552 qq00 = _mm256_mul_pd(iq0,jq0);
1553 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1554 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1555 qq01 = _mm256_mul_pd(iq0,jq1);
1556 qq02 = _mm256_mul_pd(iq0,jq2);
1557 qq10 = _mm256_mul_pd(iq1,jq0);
1558 qq11 = _mm256_mul_pd(iq1,jq1);
1559 qq12 = _mm256_mul_pd(iq1,jq2);
1560 qq20 = _mm256_mul_pd(iq2,jq0);
1561 qq21 = _mm256_mul_pd(iq2,jq1);
1562 qq22 = _mm256_mul_pd(iq2,jq2);
1564 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1565 rcutoff_scalar = fr->rcoulomb;
1566 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1567 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1569 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
1570 rvdw = _mm256_set1_pd(fr->rvdw);
1572 /* Avoid stupid compiler warnings */
1573 jnrA = jnrB = jnrC = jnrD = 0;
1574 j_coord_offsetA = 0;
1575 j_coord_offsetB = 0;
1576 j_coord_offsetC = 0;
1577 j_coord_offsetD = 0;
1582 for(iidx=0;iidx<4*DIM;iidx++)
1584 scratch[iidx] = 0.0;
1587 /* Start outer loop over neighborlists */
1588 for(iidx=0; iidx<nri; iidx++)
1590 /* Load shift vector for this list */
1591 i_shift_offset = DIM*shiftidx[iidx];
1593 /* Load limits for loop over neighbors */
1594 j_index_start = jindex[iidx];
1595 j_index_end = jindex[iidx+1];
1597 /* Get outer coordinate index */
1599 i_coord_offset = DIM*inr;
1601 /* Load i particle coords and add shift vector */
1602 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1603 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1605 fix0 = _mm256_setzero_pd();
1606 fiy0 = _mm256_setzero_pd();
1607 fiz0 = _mm256_setzero_pd();
1608 fix1 = _mm256_setzero_pd();
1609 fiy1 = _mm256_setzero_pd();
1610 fiz1 = _mm256_setzero_pd();
1611 fix2 = _mm256_setzero_pd();
1612 fiy2 = _mm256_setzero_pd();
1613 fiz2 = _mm256_setzero_pd();
1615 /* Start inner kernel loop */
1616 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1619 /* Get j neighbor index, and coordinate index */
1621 jnrB = jjnr[jidx+1];
1622 jnrC = jjnr[jidx+2];
1623 jnrD = jjnr[jidx+3];
1624 j_coord_offsetA = DIM*jnrA;
1625 j_coord_offsetB = DIM*jnrB;
1626 j_coord_offsetC = DIM*jnrC;
1627 j_coord_offsetD = DIM*jnrD;
1629 /* load j atom coordinates */
1630 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1631 x+j_coord_offsetC,x+j_coord_offsetD,
1632 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1634 /* Calculate displacement vector */
1635 dx00 = _mm256_sub_pd(ix0,jx0);
1636 dy00 = _mm256_sub_pd(iy0,jy0);
1637 dz00 = _mm256_sub_pd(iz0,jz0);
1638 dx01 = _mm256_sub_pd(ix0,jx1);
1639 dy01 = _mm256_sub_pd(iy0,jy1);
1640 dz01 = _mm256_sub_pd(iz0,jz1);
1641 dx02 = _mm256_sub_pd(ix0,jx2);
1642 dy02 = _mm256_sub_pd(iy0,jy2);
1643 dz02 = _mm256_sub_pd(iz0,jz2);
1644 dx10 = _mm256_sub_pd(ix1,jx0);
1645 dy10 = _mm256_sub_pd(iy1,jy0);
1646 dz10 = _mm256_sub_pd(iz1,jz0);
1647 dx11 = _mm256_sub_pd(ix1,jx1);
1648 dy11 = _mm256_sub_pd(iy1,jy1);
1649 dz11 = _mm256_sub_pd(iz1,jz1);
1650 dx12 = _mm256_sub_pd(ix1,jx2);
1651 dy12 = _mm256_sub_pd(iy1,jy2);
1652 dz12 = _mm256_sub_pd(iz1,jz2);
1653 dx20 = _mm256_sub_pd(ix2,jx0);
1654 dy20 = _mm256_sub_pd(iy2,jy0);
1655 dz20 = _mm256_sub_pd(iz2,jz0);
1656 dx21 = _mm256_sub_pd(ix2,jx1);
1657 dy21 = _mm256_sub_pd(iy2,jy1);
1658 dz21 = _mm256_sub_pd(iz2,jz1);
1659 dx22 = _mm256_sub_pd(ix2,jx2);
1660 dy22 = _mm256_sub_pd(iy2,jy2);
1661 dz22 = _mm256_sub_pd(iz2,jz2);
1663 /* Calculate squared distance and things based on it */
1664 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1665 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1666 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1667 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1668 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1669 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1670 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1671 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1672 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1674 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1675 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1676 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1677 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1678 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1679 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1680 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1681 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1682 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1684 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1685 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1686 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1687 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1688 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1689 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1690 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1691 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1692 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1694 fjx0 = _mm256_setzero_pd();
1695 fjy0 = _mm256_setzero_pd();
1696 fjz0 = _mm256_setzero_pd();
1697 fjx1 = _mm256_setzero_pd();
1698 fjy1 = _mm256_setzero_pd();
1699 fjz1 = _mm256_setzero_pd();
1700 fjx2 = _mm256_setzero_pd();
1701 fjy2 = _mm256_setzero_pd();
1702 fjz2 = _mm256_setzero_pd();
1704 /**************************
1705 * CALCULATE INTERACTIONS *
1706 **************************/
1708 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1711 r00 = _mm256_mul_pd(rsq00,rinv00);
1713 /* EWALD ELECTROSTATICS */
1715 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1716 ewrt = _mm256_mul_pd(r00,ewtabscale);
1717 ewitab = _mm256_cvttpd_epi32(ewrt);
1718 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1719 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1720 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1722 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1723 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1725 /* LENNARD-JONES DISPERSION/REPULSION */
1727 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1728 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1730 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1732 fscal = _mm256_add_pd(felec,fvdw);
1734 fscal = _mm256_and_pd(fscal,cutoff_mask);
1736 /* Calculate temporary vectorial force */
1737 tx = _mm256_mul_pd(fscal,dx00);
1738 ty = _mm256_mul_pd(fscal,dy00);
1739 tz = _mm256_mul_pd(fscal,dz00);
1741 /* Update vectorial force */
1742 fix0 = _mm256_add_pd(fix0,tx);
1743 fiy0 = _mm256_add_pd(fiy0,ty);
1744 fiz0 = _mm256_add_pd(fiz0,tz);
1746 fjx0 = _mm256_add_pd(fjx0,tx);
1747 fjy0 = _mm256_add_pd(fjy0,ty);
1748 fjz0 = _mm256_add_pd(fjz0,tz);
1752 /**************************
1753 * CALCULATE INTERACTIONS *
1754 **************************/
1756 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1759 r01 = _mm256_mul_pd(rsq01,rinv01);
1761 /* EWALD ELECTROSTATICS */
1763 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1764 ewrt = _mm256_mul_pd(r01,ewtabscale);
1765 ewitab = _mm256_cvttpd_epi32(ewrt);
1766 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1767 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1768 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1770 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1771 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1773 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1777 fscal = _mm256_and_pd(fscal,cutoff_mask);
1779 /* Calculate temporary vectorial force */
1780 tx = _mm256_mul_pd(fscal,dx01);
1781 ty = _mm256_mul_pd(fscal,dy01);
1782 tz = _mm256_mul_pd(fscal,dz01);
1784 /* Update vectorial force */
1785 fix0 = _mm256_add_pd(fix0,tx);
1786 fiy0 = _mm256_add_pd(fiy0,ty);
1787 fiz0 = _mm256_add_pd(fiz0,tz);
1789 fjx1 = _mm256_add_pd(fjx1,tx);
1790 fjy1 = _mm256_add_pd(fjy1,ty);
1791 fjz1 = _mm256_add_pd(fjz1,tz);
1795 /**************************
1796 * CALCULATE INTERACTIONS *
1797 **************************/
1799 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1802 r02 = _mm256_mul_pd(rsq02,rinv02);
1804 /* EWALD ELECTROSTATICS */
1806 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1807 ewrt = _mm256_mul_pd(r02,ewtabscale);
1808 ewitab = _mm256_cvttpd_epi32(ewrt);
1809 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1810 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1811 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1813 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1814 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1816 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1820 fscal = _mm256_and_pd(fscal,cutoff_mask);
1822 /* Calculate temporary vectorial force */
1823 tx = _mm256_mul_pd(fscal,dx02);
1824 ty = _mm256_mul_pd(fscal,dy02);
1825 tz = _mm256_mul_pd(fscal,dz02);
1827 /* Update vectorial force */
1828 fix0 = _mm256_add_pd(fix0,tx);
1829 fiy0 = _mm256_add_pd(fiy0,ty);
1830 fiz0 = _mm256_add_pd(fiz0,tz);
1832 fjx2 = _mm256_add_pd(fjx2,tx);
1833 fjy2 = _mm256_add_pd(fjy2,ty);
1834 fjz2 = _mm256_add_pd(fjz2,tz);
1838 /**************************
1839 * CALCULATE INTERACTIONS *
1840 **************************/
1842 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1845 r10 = _mm256_mul_pd(rsq10,rinv10);
1847 /* EWALD ELECTROSTATICS */
1849 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1850 ewrt = _mm256_mul_pd(r10,ewtabscale);
1851 ewitab = _mm256_cvttpd_epi32(ewrt);
1852 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1853 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1854 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1856 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1857 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1859 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1863 fscal = _mm256_and_pd(fscal,cutoff_mask);
1865 /* Calculate temporary vectorial force */
1866 tx = _mm256_mul_pd(fscal,dx10);
1867 ty = _mm256_mul_pd(fscal,dy10);
1868 tz = _mm256_mul_pd(fscal,dz10);
1870 /* Update vectorial force */
1871 fix1 = _mm256_add_pd(fix1,tx);
1872 fiy1 = _mm256_add_pd(fiy1,ty);
1873 fiz1 = _mm256_add_pd(fiz1,tz);
1875 fjx0 = _mm256_add_pd(fjx0,tx);
1876 fjy0 = _mm256_add_pd(fjy0,ty);
1877 fjz0 = _mm256_add_pd(fjz0,tz);
1881 /**************************
1882 * CALCULATE INTERACTIONS *
1883 **************************/
1885 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1888 r11 = _mm256_mul_pd(rsq11,rinv11);
1890 /* EWALD ELECTROSTATICS */
1892 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1893 ewrt = _mm256_mul_pd(r11,ewtabscale);
1894 ewitab = _mm256_cvttpd_epi32(ewrt);
1895 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1896 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1897 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1899 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1900 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1902 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1906 fscal = _mm256_and_pd(fscal,cutoff_mask);
1908 /* Calculate temporary vectorial force */
1909 tx = _mm256_mul_pd(fscal,dx11);
1910 ty = _mm256_mul_pd(fscal,dy11);
1911 tz = _mm256_mul_pd(fscal,dz11);
1913 /* Update vectorial force */
1914 fix1 = _mm256_add_pd(fix1,tx);
1915 fiy1 = _mm256_add_pd(fiy1,ty);
1916 fiz1 = _mm256_add_pd(fiz1,tz);
1918 fjx1 = _mm256_add_pd(fjx1,tx);
1919 fjy1 = _mm256_add_pd(fjy1,ty);
1920 fjz1 = _mm256_add_pd(fjz1,tz);
1924 /**************************
1925 * CALCULATE INTERACTIONS *
1926 **************************/
1928 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1931 r12 = _mm256_mul_pd(rsq12,rinv12);
1933 /* EWALD ELECTROSTATICS */
1935 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1936 ewrt = _mm256_mul_pd(r12,ewtabscale);
1937 ewitab = _mm256_cvttpd_epi32(ewrt);
1938 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1939 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1940 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1942 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1943 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1945 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1949 fscal = _mm256_and_pd(fscal,cutoff_mask);
1951 /* Calculate temporary vectorial force */
1952 tx = _mm256_mul_pd(fscal,dx12);
1953 ty = _mm256_mul_pd(fscal,dy12);
1954 tz = _mm256_mul_pd(fscal,dz12);
1956 /* Update vectorial force */
1957 fix1 = _mm256_add_pd(fix1,tx);
1958 fiy1 = _mm256_add_pd(fiy1,ty);
1959 fiz1 = _mm256_add_pd(fiz1,tz);
1961 fjx2 = _mm256_add_pd(fjx2,tx);
1962 fjy2 = _mm256_add_pd(fjy2,ty);
1963 fjz2 = _mm256_add_pd(fjz2,tz);
1967 /**************************
1968 * CALCULATE INTERACTIONS *
1969 **************************/
1971 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1974 r20 = _mm256_mul_pd(rsq20,rinv20);
1976 /* EWALD ELECTROSTATICS */
1978 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1979 ewrt = _mm256_mul_pd(r20,ewtabscale);
1980 ewitab = _mm256_cvttpd_epi32(ewrt);
1981 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1982 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1983 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1985 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1986 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1988 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1992 fscal = _mm256_and_pd(fscal,cutoff_mask);
1994 /* Calculate temporary vectorial force */
1995 tx = _mm256_mul_pd(fscal,dx20);
1996 ty = _mm256_mul_pd(fscal,dy20);
1997 tz = _mm256_mul_pd(fscal,dz20);
1999 /* Update vectorial force */
2000 fix2 = _mm256_add_pd(fix2,tx);
2001 fiy2 = _mm256_add_pd(fiy2,ty);
2002 fiz2 = _mm256_add_pd(fiz2,tz);
2004 fjx0 = _mm256_add_pd(fjx0,tx);
2005 fjy0 = _mm256_add_pd(fjy0,ty);
2006 fjz0 = _mm256_add_pd(fjz0,tz);
2010 /**************************
2011 * CALCULATE INTERACTIONS *
2012 **************************/
2014 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2017 r21 = _mm256_mul_pd(rsq21,rinv21);
2019 /* EWALD ELECTROSTATICS */
2021 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2022 ewrt = _mm256_mul_pd(r21,ewtabscale);
2023 ewitab = _mm256_cvttpd_epi32(ewrt);
2024 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2025 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2026 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2028 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2029 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2031 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2035 fscal = _mm256_and_pd(fscal,cutoff_mask);
2037 /* Calculate temporary vectorial force */
2038 tx = _mm256_mul_pd(fscal,dx21);
2039 ty = _mm256_mul_pd(fscal,dy21);
2040 tz = _mm256_mul_pd(fscal,dz21);
2042 /* Update vectorial force */
2043 fix2 = _mm256_add_pd(fix2,tx);
2044 fiy2 = _mm256_add_pd(fiy2,ty);
2045 fiz2 = _mm256_add_pd(fiz2,tz);
2047 fjx1 = _mm256_add_pd(fjx1,tx);
2048 fjy1 = _mm256_add_pd(fjy1,ty);
2049 fjz1 = _mm256_add_pd(fjz1,tz);
2053 /**************************
2054 * CALCULATE INTERACTIONS *
2055 **************************/
2057 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2060 r22 = _mm256_mul_pd(rsq22,rinv22);
2062 /* EWALD ELECTROSTATICS */
2064 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2065 ewrt = _mm256_mul_pd(r22,ewtabscale);
2066 ewitab = _mm256_cvttpd_epi32(ewrt);
2067 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2068 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2069 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2071 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2072 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2074 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2078 fscal = _mm256_and_pd(fscal,cutoff_mask);
2080 /* Calculate temporary vectorial force */
2081 tx = _mm256_mul_pd(fscal,dx22);
2082 ty = _mm256_mul_pd(fscal,dy22);
2083 tz = _mm256_mul_pd(fscal,dz22);
2085 /* Update vectorial force */
2086 fix2 = _mm256_add_pd(fix2,tx);
2087 fiy2 = _mm256_add_pd(fiy2,ty);
2088 fiz2 = _mm256_add_pd(fiz2,tz);
2090 fjx2 = _mm256_add_pd(fjx2,tx);
2091 fjy2 = _mm256_add_pd(fjy2,ty);
2092 fjz2 = _mm256_add_pd(fjz2,tz);
2096 fjptrA = f+j_coord_offsetA;
2097 fjptrB = f+j_coord_offsetB;
2098 fjptrC = f+j_coord_offsetC;
2099 fjptrD = f+j_coord_offsetD;
2101 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2102 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2104 /* Inner loop uses 358 flops */
2107 if(jidx<j_index_end)
2110 /* Get j neighbor index, and coordinate index */
2111 jnrlistA = jjnr[jidx];
2112 jnrlistB = jjnr[jidx+1];
2113 jnrlistC = jjnr[jidx+2];
2114 jnrlistD = jjnr[jidx+3];
2115 /* Sign of each element will be negative for non-real atoms.
2116 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2117 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2119 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2121 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2122 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2123 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2125 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2126 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2127 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2128 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2129 j_coord_offsetA = DIM*jnrA;
2130 j_coord_offsetB = DIM*jnrB;
2131 j_coord_offsetC = DIM*jnrC;
2132 j_coord_offsetD = DIM*jnrD;
2134 /* load j atom coordinates */
2135 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2136 x+j_coord_offsetC,x+j_coord_offsetD,
2137 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2139 /* Calculate displacement vector */
2140 dx00 = _mm256_sub_pd(ix0,jx0);
2141 dy00 = _mm256_sub_pd(iy0,jy0);
2142 dz00 = _mm256_sub_pd(iz0,jz0);
2143 dx01 = _mm256_sub_pd(ix0,jx1);
2144 dy01 = _mm256_sub_pd(iy0,jy1);
2145 dz01 = _mm256_sub_pd(iz0,jz1);
2146 dx02 = _mm256_sub_pd(ix0,jx2);
2147 dy02 = _mm256_sub_pd(iy0,jy2);
2148 dz02 = _mm256_sub_pd(iz0,jz2);
2149 dx10 = _mm256_sub_pd(ix1,jx0);
2150 dy10 = _mm256_sub_pd(iy1,jy0);
2151 dz10 = _mm256_sub_pd(iz1,jz0);
2152 dx11 = _mm256_sub_pd(ix1,jx1);
2153 dy11 = _mm256_sub_pd(iy1,jy1);
2154 dz11 = _mm256_sub_pd(iz1,jz1);
2155 dx12 = _mm256_sub_pd(ix1,jx2);
2156 dy12 = _mm256_sub_pd(iy1,jy2);
2157 dz12 = _mm256_sub_pd(iz1,jz2);
2158 dx20 = _mm256_sub_pd(ix2,jx0);
2159 dy20 = _mm256_sub_pd(iy2,jy0);
2160 dz20 = _mm256_sub_pd(iz2,jz0);
2161 dx21 = _mm256_sub_pd(ix2,jx1);
2162 dy21 = _mm256_sub_pd(iy2,jy1);
2163 dz21 = _mm256_sub_pd(iz2,jz1);
2164 dx22 = _mm256_sub_pd(ix2,jx2);
2165 dy22 = _mm256_sub_pd(iy2,jy2);
2166 dz22 = _mm256_sub_pd(iz2,jz2);
2168 /* Calculate squared distance and things based on it */
2169 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2170 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2171 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2172 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2173 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2174 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2175 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2176 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2177 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2179 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2180 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
2181 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
2182 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
2183 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2184 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2185 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
2186 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2187 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2189 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2190 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2191 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2192 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2193 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2194 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2195 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2196 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2197 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2199 fjx0 = _mm256_setzero_pd();
2200 fjy0 = _mm256_setzero_pd();
2201 fjz0 = _mm256_setzero_pd();
2202 fjx1 = _mm256_setzero_pd();
2203 fjy1 = _mm256_setzero_pd();
2204 fjz1 = _mm256_setzero_pd();
2205 fjx2 = _mm256_setzero_pd();
2206 fjy2 = _mm256_setzero_pd();
2207 fjz2 = _mm256_setzero_pd();
2209 /**************************
2210 * CALCULATE INTERACTIONS *
2211 **************************/
2213 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2216 r00 = _mm256_mul_pd(rsq00,rinv00);
2217 r00 = _mm256_andnot_pd(dummy_mask,r00);
2219 /* EWALD ELECTROSTATICS */
2221 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2222 ewrt = _mm256_mul_pd(r00,ewtabscale);
2223 ewitab = _mm256_cvttpd_epi32(ewrt);
2224 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2225 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2226 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2228 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2229 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2231 /* LENNARD-JONES DISPERSION/REPULSION */
2233 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2234 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2236 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2238 fscal = _mm256_add_pd(felec,fvdw);
2240 fscal = _mm256_and_pd(fscal,cutoff_mask);
2242 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2244 /* Calculate temporary vectorial force */
2245 tx = _mm256_mul_pd(fscal,dx00);
2246 ty = _mm256_mul_pd(fscal,dy00);
2247 tz = _mm256_mul_pd(fscal,dz00);
2249 /* Update vectorial force */
2250 fix0 = _mm256_add_pd(fix0,tx);
2251 fiy0 = _mm256_add_pd(fiy0,ty);
2252 fiz0 = _mm256_add_pd(fiz0,tz);
2254 fjx0 = _mm256_add_pd(fjx0,tx);
2255 fjy0 = _mm256_add_pd(fjy0,ty);
2256 fjz0 = _mm256_add_pd(fjz0,tz);
2260 /**************************
2261 * CALCULATE INTERACTIONS *
2262 **************************/
2264 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2267 r01 = _mm256_mul_pd(rsq01,rinv01);
2268 r01 = _mm256_andnot_pd(dummy_mask,r01);
2270 /* EWALD ELECTROSTATICS */
2272 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2273 ewrt = _mm256_mul_pd(r01,ewtabscale);
2274 ewitab = _mm256_cvttpd_epi32(ewrt);
2275 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2276 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2277 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2279 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2280 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2282 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2286 fscal = _mm256_and_pd(fscal,cutoff_mask);
2288 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2290 /* Calculate temporary vectorial force */
2291 tx = _mm256_mul_pd(fscal,dx01);
2292 ty = _mm256_mul_pd(fscal,dy01);
2293 tz = _mm256_mul_pd(fscal,dz01);
2295 /* Update vectorial force */
2296 fix0 = _mm256_add_pd(fix0,tx);
2297 fiy0 = _mm256_add_pd(fiy0,ty);
2298 fiz0 = _mm256_add_pd(fiz0,tz);
2300 fjx1 = _mm256_add_pd(fjx1,tx);
2301 fjy1 = _mm256_add_pd(fjy1,ty);
2302 fjz1 = _mm256_add_pd(fjz1,tz);
2306 /**************************
2307 * CALCULATE INTERACTIONS *
2308 **************************/
2310 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2313 r02 = _mm256_mul_pd(rsq02,rinv02);
2314 r02 = _mm256_andnot_pd(dummy_mask,r02);
2316 /* EWALD ELECTROSTATICS */
2318 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2319 ewrt = _mm256_mul_pd(r02,ewtabscale);
2320 ewitab = _mm256_cvttpd_epi32(ewrt);
2321 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2322 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2323 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2325 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2326 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2328 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2332 fscal = _mm256_and_pd(fscal,cutoff_mask);
2334 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2336 /* Calculate temporary vectorial force */
2337 tx = _mm256_mul_pd(fscal,dx02);
2338 ty = _mm256_mul_pd(fscal,dy02);
2339 tz = _mm256_mul_pd(fscal,dz02);
2341 /* Update vectorial force */
2342 fix0 = _mm256_add_pd(fix0,tx);
2343 fiy0 = _mm256_add_pd(fiy0,ty);
2344 fiz0 = _mm256_add_pd(fiz0,tz);
2346 fjx2 = _mm256_add_pd(fjx2,tx);
2347 fjy2 = _mm256_add_pd(fjy2,ty);
2348 fjz2 = _mm256_add_pd(fjz2,tz);
2352 /**************************
2353 * CALCULATE INTERACTIONS *
2354 **************************/
2356 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2359 r10 = _mm256_mul_pd(rsq10,rinv10);
2360 r10 = _mm256_andnot_pd(dummy_mask,r10);
2362 /* EWALD ELECTROSTATICS */
2364 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2365 ewrt = _mm256_mul_pd(r10,ewtabscale);
2366 ewitab = _mm256_cvttpd_epi32(ewrt);
2367 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2368 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2369 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2371 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2372 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2374 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2378 fscal = _mm256_and_pd(fscal,cutoff_mask);
2380 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2382 /* Calculate temporary vectorial force */
2383 tx = _mm256_mul_pd(fscal,dx10);
2384 ty = _mm256_mul_pd(fscal,dy10);
2385 tz = _mm256_mul_pd(fscal,dz10);
2387 /* Update vectorial force */
2388 fix1 = _mm256_add_pd(fix1,tx);
2389 fiy1 = _mm256_add_pd(fiy1,ty);
2390 fiz1 = _mm256_add_pd(fiz1,tz);
2392 fjx0 = _mm256_add_pd(fjx0,tx);
2393 fjy0 = _mm256_add_pd(fjy0,ty);
2394 fjz0 = _mm256_add_pd(fjz0,tz);
2398 /**************************
2399 * CALCULATE INTERACTIONS *
2400 **************************/
2402 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2405 r11 = _mm256_mul_pd(rsq11,rinv11);
2406 r11 = _mm256_andnot_pd(dummy_mask,r11);
2408 /* EWALD ELECTROSTATICS */
2410 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2411 ewrt = _mm256_mul_pd(r11,ewtabscale);
2412 ewitab = _mm256_cvttpd_epi32(ewrt);
2413 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2414 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2415 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2417 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2418 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2420 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2424 fscal = _mm256_and_pd(fscal,cutoff_mask);
2426 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2428 /* Calculate temporary vectorial force */
2429 tx = _mm256_mul_pd(fscal,dx11);
2430 ty = _mm256_mul_pd(fscal,dy11);
2431 tz = _mm256_mul_pd(fscal,dz11);
2433 /* Update vectorial force */
2434 fix1 = _mm256_add_pd(fix1,tx);
2435 fiy1 = _mm256_add_pd(fiy1,ty);
2436 fiz1 = _mm256_add_pd(fiz1,tz);
2438 fjx1 = _mm256_add_pd(fjx1,tx);
2439 fjy1 = _mm256_add_pd(fjy1,ty);
2440 fjz1 = _mm256_add_pd(fjz1,tz);
2444 /**************************
2445 * CALCULATE INTERACTIONS *
2446 **************************/
2448 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2451 r12 = _mm256_mul_pd(rsq12,rinv12);
2452 r12 = _mm256_andnot_pd(dummy_mask,r12);
2454 /* EWALD ELECTROSTATICS */
2456 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2457 ewrt = _mm256_mul_pd(r12,ewtabscale);
2458 ewitab = _mm256_cvttpd_epi32(ewrt);
2459 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2460 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2461 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2463 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2464 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2466 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2470 fscal = _mm256_and_pd(fscal,cutoff_mask);
2472 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2474 /* Calculate temporary vectorial force */
2475 tx = _mm256_mul_pd(fscal,dx12);
2476 ty = _mm256_mul_pd(fscal,dy12);
2477 tz = _mm256_mul_pd(fscal,dz12);
2479 /* Update vectorial force */
2480 fix1 = _mm256_add_pd(fix1,tx);
2481 fiy1 = _mm256_add_pd(fiy1,ty);
2482 fiz1 = _mm256_add_pd(fiz1,tz);
2484 fjx2 = _mm256_add_pd(fjx2,tx);
2485 fjy2 = _mm256_add_pd(fjy2,ty);
2486 fjz2 = _mm256_add_pd(fjz2,tz);
2490 /**************************
2491 * CALCULATE INTERACTIONS *
2492 **************************/
2494 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2497 r20 = _mm256_mul_pd(rsq20,rinv20);
2498 r20 = _mm256_andnot_pd(dummy_mask,r20);
2500 /* EWALD ELECTROSTATICS */
2502 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2503 ewrt = _mm256_mul_pd(r20,ewtabscale);
2504 ewitab = _mm256_cvttpd_epi32(ewrt);
2505 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2506 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2507 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2509 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2510 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2512 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2516 fscal = _mm256_and_pd(fscal,cutoff_mask);
2518 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2520 /* Calculate temporary vectorial force */
2521 tx = _mm256_mul_pd(fscal,dx20);
2522 ty = _mm256_mul_pd(fscal,dy20);
2523 tz = _mm256_mul_pd(fscal,dz20);
2525 /* Update vectorial force */
2526 fix2 = _mm256_add_pd(fix2,tx);
2527 fiy2 = _mm256_add_pd(fiy2,ty);
2528 fiz2 = _mm256_add_pd(fiz2,tz);
2530 fjx0 = _mm256_add_pd(fjx0,tx);
2531 fjy0 = _mm256_add_pd(fjy0,ty);
2532 fjz0 = _mm256_add_pd(fjz0,tz);
2536 /**************************
2537 * CALCULATE INTERACTIONS *
2538 **************************/
2540 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2543 r21 = _mm256_mul_pd(rsq21,rinv21);
2544 r21 = _mm256_andnot_pd(dummy_mask,r21);
2546 /* EWALD ELECTROSTATICS */
2548 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2549 ewrt = _mm256_mul_pd(r21,ewtabscale);
2550 ewitab = _mm256_cvttpd_epi32(ewrt);
2551 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2552 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2553 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2555 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2556 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2558 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2562 fscal = _mm256_and_pd(fscal,cutoff_mask);
2564 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2566 /* Calculate temporary vectorial force */
2567 tx = _mm256_mul_pd(fscal,dx21);
2568 ty = _mm256_mul_pd(fscal,dy21);
2569 tz = _mm256_mul_pd(fscal,dz21);
2571 /* Update vectorial force */
2572 fix2 = _mm256_add_pd(fix2,tx);
2573 fiy2 = _mm256_add_pd(fiy2,ty);
2574 fiz2 = _mm256_add_pd(fiz2,tz);
2576 fjx1 = _mm256_add_pd(fjx1,tx);
2577 fjy1 = _mm256_add_pd(fjy1,ty);
2578 fjz1 = _mm256_add_pd(fjz1,tz);
2582 /**************************
2583 * CALCULATE INTERACTIONS *
2584 **************************/
2586 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2589 r22 = _mm256_mul_pd(rsq22,rinv22);
2590 r22 = _mm256_andnot_pd(dummy_mask,r22);
2592 /* EWALD ELECTROSTATICS */
2594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2595 ewrt = _mm256_mul_pd(r22,ewtabscale);
2596 ewitab = _mm256_cvttpd_epi32(ewrt);
2597 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2598 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2599 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2601 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2602 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2604 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2608 fscal = _mm256_and_pd(fscal,cutoff_mask);
2610 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2612 /* Calculate temporary vectorial force */
2613 tx = _mm256_mul_pd(fscal,dx22);
2614 ty = _mm256_mul_pd(fscal,dy22);
2615 tz = _mm256_mul_pd(fscal,dz22);
2617 /* Update vectorial force */
2618 fix2 = _mm256_add_pd(fix2,tx);
2619 fiy2 = _mm256_add_pd(fiy2,ty);
2620 fiz2 = _mm256_add_pd(fiz2,tz);
2622 fjx2 = _mm256_add_pd(fjx2,tx);
2623 fjy2 = _mm256_add_pd(fjy2,ty);
2624 fjz2 = _mm256_add_pd(fjz2,tz);
2628 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2629 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2630 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2631 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2633 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2634 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2636 /* Inner loop uses 367 flops */
2639 /* End of innermost loop */
2641 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2642 f+i_coord_offset,fshift+i_shift_offset);
2644 /* Increment number of inner iterations */
2645 inneriter += j_index_end - j_index_start;
2647 /* Outer loop uses 18 flops */
2650 /* Increment number of outer iterations */
2653 /* Update outer/inner flops */
2655 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);