2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
105 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
106 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
108 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
109 real rswitch_scalar,d_scalar;
110 __m256d dummy_mask,cutoff_mask;
111 __m128 tmpmask0,tmpmask1;
112 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
113 __m256d one = _mm256_set1_pd(1.0);
114 __m256d two = _mm256_set1_pd(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm256_set1_pd(fr->epsfac);
127 charge = mdatoms->chargeA;
128 nvdwtype = fr->ntype;
130 vdwtype = mdatoms->typeA;
132 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
133 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
134 beta2 = _mm256_mul_pd(beta,beta);
135 beta3 = _mm256_mul_pd(beta,beta2);
137 ewtab = fr->ic->tabq_coul_FDV0;
138 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
139 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
144 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
145 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
146 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
148 jq1 = _mm256_set1_pd(charge[inr+1]);
149 jq2 = _mm256_set1_pd(charge[inr+2]);
150 jq3 = _mm256_set1_pd(charge[inr+3]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
153 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
154 qq11 = _mm256_mul_pd(iq1,jq1);
155 qq12 = _mm256_mul_pd(iq1,jq2);
156 qq13 = _mm256_mul_pd(iq1,jq3);
157 qq21 = _mm256_mul_pd(iq2,jq1);
158 qq22 = _mm256_mul_pd(iq2,jq2);
159 qq23 = _mm256_mul_pd(iq2,jq3);
160 qq31 = _mm256_mul_pd(iq3,jq1);
161 qq32 = _mm256_mul_pd(iq3,jq2);
162 qq33 = _mm256_mul_pd(iq3,jq3);
164 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
165 rcutoff_scalar = fr->rcoulomb;
166 rcutoff = _mm256_set1_pd(rcutoff_scalar);
167 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
169 rswitch_scalar = fr->rcoulomb_switch;
170 rswitch = _mm256_set1_pd(rswitch_scalar);
171 /* Setup switch parameters */
172 d_scalar = rcutoff_scalar-rswitch_scalar;
173 d = _mm256_set1_pd(d_scalar);
174 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
175 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
176 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
177 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
178 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
179 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181 /* Avoid stupid compiler warnings */
182 jnrA = jnrB = jnrC = jnrD = 0;
191 for(iidx=0;iidx<4*DIM;iidx++)
196 /* Start outer loop over neighborlists */
197 for(iidx=0; iidx<nri; iidx++)
199 /* Load shift vector for this list */
200 i_shift_offset = DIM*shiftidx[iidx];
202 /* Load limits for loop over neighbors */
203 j_index_start = jindex[iidx];
204 j_index_end = jindex[iidx+1];
206 /* Get outer coordinate index */
208 i_coord_offset = DIM*inr;
210 /* Load i particle coords and add shift vector */
211 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
212 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
214 fix0 = _mm256_setzero_pd();
215 fiy0 = _mm256_setzero_pd();
216 fiz0 = _mm256_setzero_pd();
217 fix1 = _mm256_setzero_pd();
218 fiy1 = _mm256_setzero_pd();
219 fiz1 = _mm256_setzero_pd();
220 fix2 = _mm256_setzero_pd();
221 fiy2 = _mm256_setzero_pd();
222 fiz2 = _mm256_setzero_pd();
223 fix3 = _mm256_setzero_pd();
224 fiy3 = _mm256_setzero_pd();
225 fiz3 = _mm256_setzero_pd();
227 /* Reset potential sums */
228 velecsum = _mm256_setzero_pd();
229 vvdwsum = _mm256_setzero_pd();
231 /* Start inner kernel loop */
232 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
235 /* Get j neighbor index, and coordinate index */
240 j_coord_offsetA = DIM*jnrA;
241 j_coord_offsetB = DIM*jnrB;
242 j_coord_offsetC = DIM*jnrC;
243 j_coord_offsetD = DIM*jnrD;
245 /* load j atom coordinates */
246 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
247 x+j_coord_offsetC,x+j_coord_offsetD,
248 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
249 &jy2,&jz2,&jx3,&jy3,&jz3);
251 /* Calculate displacement vector */
252 dx00 = _mm256_sub_pd(ix0,jx0);
253 dy00 = _mm256_sub_pd(iy0,jy0);
254 dz00 = _mm256_sub_pd(iz0,jz0);
255 dx11 = _mm256_sub_pd(ix1,jx1);
256 dy11 = _mm256_sub_pd(iy1,jy1);
257 dz11 = _mm256_sub_pd(iz1,jz1);
258 dx12 = _mm256_sub_pd(ix1,jx2);
259 dy12 = _mm256_sub_pd(iy1,jy2);
260 dz12 = _mm256_sub_pd(iz1,jz2);
261 dx13 = _mm256_sub_pd(ix1,jx3);
262 dy13 = _mm256_sub_pd(iy1,jy3);
263 dz13 = _mm256_sub_pd(iz1,jz3);
264 dx21 = _mm256_sub_pd(ix2,jx1);
265 dy21 = _mm256_sub_pd(iy2,jy1);
266 dz21 = _mm256_sub_pd(iz2,jz1);
267 dx22 = _mm256_sub_pd(ix2,jx2);
268 dy22 = _mm256_sub_pd(iy2,jy2);
269 dz22 = _mm256_sub_pd(iz2,jz2);
270 dx23 = _mm256_sub_pd(ix2,jx3);
271 dy23 = _mm256_sub_pd(iy2,jy3);
272 dz23 = _mm256_sub_pd(iz2,jz3);
273 dx31 = _mm256_sub_pd(ix3,jx1);
274 dy31 = _mm256_sub_pd(iy3,jy1);
275 dz31 = _mm256_sub_pd(iz3,jz1);
276 dx32 = _mm256_sub_pd(ix3,jx2);
277 dy32 = _mm256_sub_pd(iy3,jy2);
278 dz32 = _mm256_sub_pd(iz3,jz2);
279 dx33 = _mm256_sub_pd(ix3,jx3);
280 dy33 = _mm256_sub_pd(iy3,jy3);
281 dz33 = _mm256_sub_pd(iz3,jz3);
283 /* Calculate squared distance and things based on it */
284 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
285 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
286 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
287 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
288 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
289 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
290 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
291 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
292 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
293 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
295 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
296 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
297 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
298 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
299 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
300 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
301 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
302 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
303 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
304 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
306 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
307 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
308 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
309 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
310 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
311 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
312 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
313 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
314 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
315 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
317 fjx0 = _mm256_setzero_pd();
318 fjy0 = _mm256_setzero_pd();
319 fjz0 = _mm256_setzero_pd();
320 fjx1 = _mm256_setzero_pd();
321 fjy1 = _mm256_setzero_pd();
322 fjz1 = _mm256_setzero_pd();
323 fjx2 = _mm256_setzero_pd();
324 fjy2 = _mm256_setzero_pd();
325 fjz2 = _mm256_setzero_pd();
326 fjx3 = _mm256_setzero_pd();
327 fjy3 = _mm256_setzero_pd();
328 fjz3 = _mm256_setzero_pd();
330 /**************************
331 * CALCULATE INTERACTIONS *
332 **************************/
334 if (gmx_mm256_any_lt(rsq00,rcutoff2))
337 r00 = _mm256_mul_pd(rsq00,rinv00);
339 /* LENNARD-JONES DISPERSION/REPULSION */
341 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
342 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
343 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
344 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
345 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
347 d = _mm256_sub_pd(r00,rswitch);
348 d = _mm256_max_pd(d,_mm256_setzero_pd());
349 d2 = _mm256_mul_pd(d,d);
350 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
352 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
354 /* Evaluate switch function */
355 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
356 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
357 vvdw = _mm256_mul_pd(vvdw,sw);
358 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
360 /* Update potential sum for this i atom from the interaction with this j atom. */
361 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
362 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
366 fscal = _mm256_and_pd(fscal,cutoff_mask);
368 /* Calculate temporary vectorial force */
369 tx = _mm256_mul_pd(fscal,dx00);
370 ty = _mm256_mul_pd(fscal,dy00);
371 tz = _mm256_mul_pd(fscal,dz00);
373 /* Update vectorial force */
374 fix0 = _mm256_add_pd(fix0,tx);
375 fiy0 = _mm256_add_pd(fiy0,ty);
376 fiz0 = _mm256_add_pd(fiz0,tz);
378 fjx0 = _mm256_add_pd(fjx0,tx);
379 fjy0 = _mm256_add_pd(fjy0,ty);
380 fjz0 = _mm256_add_pd(fjz0,tz);
384 /**************************
385 * CALCULATE INTERACTIONS *
386 **************************/
388 if (gmx_mm256_any_lt(rsq11,rcutoff2))
391 r11 = _mm256_mul_pd(rsq11,rinv11);
393 /* EWALD ELECTROSTATICS */
395 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
396 ewrt = _mm256_mul_pd(r11,ewtabscale);
397 ewitab = _mm256_cvttpd_epi32(ewrt);
398 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
399 ewitab = _mm_slli_epi32(ewitab,2);
400 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
401 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
402 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
403 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
404 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
405 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
406 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
407 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
408 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
410 d = _mm256_sub_pd(r11,rswitch);
411 d = _mm256_max_pd(d,_mm256_setzero_pd());
412 d2 = _mm256_mul_pd(d,d);
413 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
415 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
417 /* Evaluate switch function */
418 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
419 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
420 velec = _mm256_mul_pd(velec,sw);
421 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velec = _mm256_and_pd(velec,cutoff_mask);
425 velecsum = _mm256_add_pd(velecsum,velec);
429 fscal = _mm256_and_pd(fscal,cutoff_mask);
431 /* Calculate temporary vectorial force */
432 tx = _mm256_mul_pd(fscal,dx11);
433 ty = _mm256_mul_pd(fscal,dy11);
434 tz = _mm256_mul_pd(fscal,dz11);
436 /* Update vectorial force */
437 fix1 = _mm256_add_pd(fix1,tx);
438 fiy1 = _mm256_add_pd(fiy1,ty);
439 fiz1 = _mm256_add_pd(fiz1,tz);
441 fjx1 = _mm256_add_pd(fjx1,tx);
442 fjy1 = _mm256_add_pd(fjy1,ty);
443 fjz1 = _mm256_add_pd(fjz1,tz);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 if (gmx_mm256_any_lt(rsq12,rcutoff2))
454 r12 = _mm256_mul_pd(rsq12,rinv12);
456 /* EWALD ELECTROSTATICS */
458 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
459 ewrt = _mm256_mul_pd(r12,ewtabscale);
460 ewitab = _mm256_cvttpd_epi32(ewrt);
461 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
462 ewitab = _mm_slli_epi32(ewitab,2);
463 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
464 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
465 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
466 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
467 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
468 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
469 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
470 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
471 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
473 d = _mm256_sub_pd(r12,rswitch);
474 d = _mm256_max_pd(d,_mm256_setzero_pd());
475 d2 = _mm256_mul_pd(d,d);
476 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
478 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
480 /* Evaluate switch function */
481 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
482 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
483 velec = _mm256_mul_pd(velec,sw);
484 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
486 /* Update potential sum for this i atom from the interaction with this j atom. */
487 velec = _mm256_and_pd(velec,cutoff_mask);
488 velecsum = _mm256_add_pd(velecsum,velec);
492 fscal = _mm256_and_pd(fscal,cutoff_mask);
494 /* Calculate temporary vectorial force */
495 tx = _mm256_mul_pd(fscal,dx12);
496 ty = _mm256_mul_pd(fscal,dy12);
497 tz = _mm256_mul_pd(fscal,dz12);
499 /* Update vectorial force */
500 fix1 = _mm256_add_pd(fix1,tx);
501 fiy1 = _mm256_add_pd(fiy1,ty);
502 fiz1 = _mm256_add_pd(fiz1,tz);
504 fjx2 = _mm256_add_pd(fjx2,tx);
505 fjy2 = _mm256_add_pd(fjy2,ty);
506 fjz2 = _mm256_add_pd(fjz2,tz);
510 /**************************
511 * CALCULATE INTERACTIONS *
512 **************************/
514 if (gmx_mm256_any_lt(rsq13,rcutoff2))
517 r13 = _mm256_mul_pd(rsq13,rinv13);
519 /* EWALD ELECTROSTATICS */
521 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
522 ewrt = _mm256_mul_pd(r13,ewtabscale);
523 ewitab = _mm256_cvttpd_epi32(ewrt);
524 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
525 ewitab = _mm_slli_epi32(ewitab,2);
526 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
527 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
528 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
529 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
530 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
531 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
532 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
533 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
534 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
536 d = _mm256_sub_pd(r13,rswitch);
537 d = _mm256_max_pd(d,_mm256_setzero_pd());
538 d2 = _mm256_mul_pd(d,d);
539 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
541 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
543 /* Evaluate switch function */
544 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
545 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
546 velec = _mm256_mul_pd(velec,sw);
547 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
549 /* Update potential sum for this i atom from the interaction with this j atom. */
550 velec = _mm256_and_pd(velec,cutoff_mask);
551 velecsum = _mm256_add_pd(velecsum,velec);
555 fscal = _mm256_and_pd(fscal,cutoff_mask);
557 /* Calculate temporary vectorial force */
558 tx = _mm256_mul_pd(fscal,dx13);
559 ty = _mm256_mul_pd(fscal,dy13);
560 tz = _mm256_mul_pd(fscal,dz13);
562 /* Update vectorial force */
563 fix1 = _mm256_add_pd(fix1,tx);
564 fiy1 = _mm256_add_pd(fiy1,ty);
565 fiz1 = _mm256_add_pd(fiz1,tz);
567 fjx3 = _mm256_add_pd(fjx3,tx);
568 fjy3 = _mm256_add_pd(fjy3,ty);
569 fjz3 = _mm256_add_pd(fjz3,tz);
573 /**************************
574 * CALCULATE INTERACTIONS *
575 **************************/
577 if (gmx_mm256_any_lt(rsq21,rcutoff2))
580 r21 = _mm256_mul_pd(rsq21,rinv21);
582 /* EWALD ELECTROSTATICS */
584 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
585 ewrt = _mm256_mul_pd(r21,ewtabscale);
586 ewitab = _mm256_cvttpd_epi32(ewrt);
587 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
588 ewitab = _mm_slli_epi32(ewitab,2);
589 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
590 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
591 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
592 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
593 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
594 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
595 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
596 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
597 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
599 d = _mm256_sub_pd(r21,rswitch);
600 d = _mm256_max_pd(d,_mm256_setzero_pd());
601 d2 = _mm256_mul_pd(d,d);
602 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
604 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
606 /* Evaluate switch function */
607 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
608 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
609 velec = _mm256_mul_pd(velec,sw);
610 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
612 /* Update potential sum for this i atom from the interaction with this j atom. */
613 velec = _mm256_and_pd(velec,cutoff_mask);
614 velecsum = _mm256_add_pd(velecsum,velec);
618 fscal = _mm256_and_pd(fscal,cutoff_mask);
620 /* Calculate temporary vectorial force */
621 tx = _mm256_mul_pd(fscal,dx21);
622 ty = _mm256_mul_pd(fscal,dy21);
623 tz = _mm256_mul_pd(fscal,dz21);
625 /* Update vectorial force */
626 fix2 = _mm256_add_pd(fix2,tx);
627 fiy2 = _mm256_add_pd(fiy2,ty);
628 fiz2 = _mm256_add_pd(fiz2,tz);
630 fjx1 = _mm256_add_pd(fjx1,tx);
631 fjy1 = _mm256_add_pd(fjy1,ty);
632 fjz1 = _mm256_add_pd(fjz1,tz);
636 /**************************
637 * CALCULATE INTERACTIONS *
638 **************************/
640 if (gmx_mm256_any_lt(rsq22,rcutoff2))
643 r22 = _mm256_mul_pd(rsq22,rinv22);
645 /* EWALD ELECTROSTATICS */
647 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
648 ewrt = _mm256_mul_pd(r22,ewtabscale);
649 ewitab = _mm256_cvttpd_epi32(ewrt);
650 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
651 ewitab = _mm_slli_epi32(ewitab,2);
652 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
653 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
654 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
655 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
656 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
657 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
658 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
659 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
660 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
662 d = _mm256_sub_pd(r22,rswitch);
663 d = _mm256_max_pd(d,_mm256_setzero_pd());
664 d2 = _mm256_mul_pd(d,d);
665 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
667 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
669 /* Evaluate switch function */
670 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
671 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
672 velec = _mm256_mul_pd(velec,sw);
673 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
675 /* Update potential sum for this i atom from the interaction with this j atom. */
676 velec = _mm256_and_pd(velec,cutoff_mask);
677 velecsum = _mm256_add_pd(velecsum,velec);
681 fscal = _mm256_and_pd(fscal,cutoff_mask);
683 /* Calculate temporary vectorial force */
684 tx = _mm256_mul_pd(fscal,dx22);
685 ty = _mm256_mul_pd(fscal,dy22);
686 tz = _mm256_mul_pd(fscal,dz22);
688 /* Update vectorial force */
689 fix2 = _mm256_add_pd(fix2,tx);
690 fiy2 = _mm256_add_pd(fiy2,ty);
691 fiz2 = _mm256_add_pd(fiz2,tz);
693 fjx2 = _mm256_add_pd(fjx2,tx);
694 fjy2 = _mm256_add_pd(fjy2,ty);
695 fjz2 = _mm256_add_pd(fjz2,tz);
699 /**************************
700 * CALCULATE INTERACTIONS *
701 **************************/
703 if (gmx_mm256_any_lt(rsq23,rcutoff2))
706 r23 = _mm256_mul_pd(rsq23,rinv23);
708 /* EWALD ELECTROSTATICS */
710 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
711 ewrt = _mm256_mul_pd(r23,ewtabscale);
712 ewitab = _mm256_cvttpd_epi32(ewrt);
713 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
714 ewitab = _mm_slli_epi32(ewitab,2);
715 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
716 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
717 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
718 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
719 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
720 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
721 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
722 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
723 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
725 d = _mm256_sub_pd(r23,rswitch);
726 d = _mm256_max_pd(d,_mm256_setzero_pd());
727 d2 = _mm256_mul_pd(d,d);
728 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
730 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
732 /* Evaluate switch function */
733 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
734 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
735 velec = _mm256_mul_pd(velec,sw);
736 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
738 /* Update potential sum for this i atom from the interaction with this j atom. */
739 velec = _mm256_and_pd(velec,cutoff_mask);
740 velecsum = _mm256_add_pd(velecsum,velec);
744 fscal = _mm256_and_pd(fscal,cutoff_mask);
746 /* Calculate temporary vectorial force */
747 tx = _mm256_mul_pd(fscal,dx23);
748 ty = _mm256_mul_pd(fscal,dy23);
749 tz = _mm256_mul_pd(fscal,dz23);
751 /* Update vectorial force */
752 fix2 = _mm256_add_pd(fix2,tx);
753 fiy2 = _mm256_add_pd(fiy2,ty);
754 fiz2 = _mm256_add_pd(fiz2,tz);
756 fjx3 = _mm256_add_pd(fjx3,tx);
757 fjy3 = _mm256_add_pd(fjy3,ty);
758 fjz3 = _mm256_add_pd(fjz3,tz);
762 /**************************
763 * CALCULATE INTERACTIONS *
764 **************************/
766 if (gmx_mm256_any_lt(rsq31,rcutoff2))
769 r31 = _mm256_mul_pd(rsq31,rinv31);
771 /* EWALD ELECTROSTATICS */
773 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
774 ewrt = _mm256_mul_pd(r31,ewtabscale);
775 ewitab = _mm256_cvttpd_epi32(ewrt);
776 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
777 ewitab = _mm_slli_epi32(ewitab,2);
778 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
779 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
780 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
781 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
782 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
783 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
784 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
785 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
786 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
788 d = _mm256_sub_pd(r31,rswitch);
789 d = _mm256_max_pd(d,_mm256_setzero_pd());
790 d2 = _mm256_mul_pd(d,d);
791 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
793 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
795 /* Evaluate switch function */
796 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
797 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
798 velec = _mm256_mul_pd(velec,sw);
799 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
801 /* Update potential sum for this i atom from the interaction with this j atom. */
802 velec = _mm256_and_pd(velec,cutoff_mask);
803 velecsum = _mm256_add_pd(velecsum,velec);
807 fscal = _mm256_and_pd(fscal,cutoff_mask);
809 /* Calculate temporary vectorial force */
810 tx = _mm256_mul_pd(fscal,dx31);
811 ty = _mm256_mul_pd(fscal,dy31);
812 tz = _mm256_mul_pd(fscal,dz31);
814 /* Update vectorial force */
815 fix3 = _mm256_add_pd(fix3,tx);
816 fiy3 = _mm256_add_pd(fiy3,ty);
817 fiz3 = _mm256_add_pd(fiz3,tz);
819 fjx1 = _mm256_add_pd(fjx1,tx);
820 fjy1 = _mm256_add_pd(fjy1,ty);
821 fjz1 = _mm256_add_pd(fjz1,tz);
825 /**************************
826 * CALCULATE INTERACTIONS *
827 **************************/
829 if (gmx_mm256_any_lt(rsq32,rcutoff2))
832 r32 = _mm256_mul_pd(rsq32,rinv32);
834 /* EWALD ELECTROSTATICS */
836 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
837 ewrt = _mm256_mul_pd(r32,ewtabscale);
838 ewitab = _mm256_cvttpd_epi32(ewrt);
839 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
840 ewitab = _mm_slli_epi32(ewitab,2);
841 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
842 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
843 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
844 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
845 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
846 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
847 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
848 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
849 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
851 d = _mm256_sub_pd(r32,rswitch);
852 d = _mm256_max_pd(d,_mm256_setzero_pd());
853 d2 = _mm256_mul_pd(d,d);
854 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
856 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
858 /* Evaluate switch function */
859 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
860 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
861 velec = _mm256_mul_pd(velec,sw);
862 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
864 /* Update potential sum for this i atom from the interaction with this j atom. */
865 velec = _mm256_and_pd(velec,cutoff_mask);
866 velecsum = _mm256_add_pd(velecsum,velec);
870 fscal = _mm256_and_pd(fscal,cutoff_mask);
872 /* Calculate temporary vectorial force */
873 tx = _mm256_mul_pd(fscal,dx32);
874 ty = _mm256_mul_pd(fscal,dy32);
875 tz = _mm256_mul_pd(fscal,dz32);
877 /* Update vectorial force */
878 fix3 = _mm256_add_pd(fix3,tx);
879 fiy3 = _mm256_add_pd(fiy3,ty);
880 fiz3 = _mm256_add_pd(fiz3,tz);
882 fjx2 = _mm256_add_pd(fjx2,tx);
883 fjy2 = _mm256_add_pd(fjy2,ty);
884 fjz2 = _mm256_add_pd(fjz2,tz);
888 /**************************
889 * CALCULATE INTERACTIONS *
890 **************************/
892 if (gmx_mm256_any_lt(rsq33,rcutoff2))
895 r33 = _mm256_mul_pd(rsq33,rinv33);
897 /* EWALD ELECTROSTATICS */
899 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
900 ewrt = _mm256_mul_pd(r33,ewtabscale);
901 ewitab = _mm256_cvttpd_epi32(ewrt);
902 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
903 ewitab = _mm_slli_epi32(ewitab,2);
904 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
905 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
906 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
907 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
908 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
909 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
910 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
911 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
912 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
914 d = _mm256_sub_pd(r33,rswitch);
915 d = _mm256_max_pd(d,_mm256_setzero_pd());
916 d2 = _mm256_mul_pd(d,d);
917 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
919 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
921 /* Evaluate switch function */
922 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
923 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
924 velec = _mm256_mul_pd(velec,sw);
925 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
927 /* Update potential sum for this i atom from the interaction with this j atom. */
928 velec = _mm256_and_pd(velec,cutoff_mask);
929 velecsum = _mm256_add_pd(velecsum,velec);
933 fscal = _mm256_and_pd(fscal,cutoff_mask);
935 /* Calculate temporary vectorial force */
936 tx = _mm256_mul_pd(fscal,dx33);
937 ty = _mm256_mul_pd(fscal,dy33);
938 tz = _mm256_mul_pd(fscal,dz33);
940 /* Update vectorial force */
941 fix3 = _mm256_add_pd(fix3,tx);
942 fiy3 = _mm256_add_pd(fiy3,ty);
943 fiz3 = _mm256_add_pd(fiz3,tz);
945 fjx3 = _mm256_add_pd(fjx3,tx);
946 fjy3 = _mm256_add_pd(fjy3,ty);
947 fjz3 = _mm256_add_pd(fjz3,tz);
951 fjptrA = f+j_coord_offsetA;
952 fjptrB = f+j_coord_offsetB;
953 fjptrC = f+j_coord_offsetC;
954 fjptrD = f+j_coord_offsetD;
956 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
957 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
958 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
960 /* Inner loop uses 647 flops */
966 /* Get j neighbor index, and coordinate index */
967 jnrlistA = jjnr[jidx];
968 jnrlistB = jjnr[jidx+1];
969 jnrlistC = jjnr[jidx+2];
970 jnrlistD = jjnr[jidx+3];
971 /* Sign of each element will be negative for non-real atoms.
972 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
973      * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
975 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
977 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
978 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
979 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
981 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
982 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
983 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
984 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
985 j_coord_offsetA = DIM*jnrA;
986 j_coord_offsetB = DIM*jnrB;
987 j_coord_offsetC = DIM*jnrC;
988 j_coord_offsetD = DIM*jnrD;
990 /* load j atom coordinates */
991 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
992 x+j_coord_offsetC,x+j_coord_offsetD,
993 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
994 &jy2,&jz2,&jx3,&jy3,&jz3);
996 /* Calculate displacement vector */
997 dx00 = _mm256_sub_pd(ix0,jx0);
998 dy00 = _mm256_sub_pd(iy0,jy0);
999 dz00 = _mm256_sub_pd(iz0,jz0);
1000 dx11 = _mm256_sub_pd(ix1,jx1);
1001 dy11 = _mm256_sub_pd(iy1,jy1);
1002 dz11 = _mm256_sub_pd(iz1,jz1);
1003 dx12 = _mm256_sub_pd(ix1,jx2);
1004 dy12 = _mm256_sub_pd(iy1,jy2);
1005 dz12 = _mm256_sub_pd(iz1,jz2);
1006 dx13 = _mm256_sub_pd(ix1,jx3);
1007 dy13 = _mm256_sub_pd(iy1,jy3);
1008 dz13 = _mm256_sub_pd(iz1,jz3);
1009 dx21 = _mm256_sub_pd(ix2,jx1);
1010 dy21 = _mm256_sub_pd(iy2,jy1);
1011 dz21 = _mm256_sub_pd(iz2,jz1);
1012 dx22 = _mm256_sub_pd(ix2,jx2);
1013 dy22 = _mm256_sub_pd(iy2,jy2);
1014 dz22 = _mm256_sub_pd(iz2,jz2);
1015 dx23 = _mm256_sub_pd(ix2,jx3);
1016 dy23 = _mm256_sub_pd(iy2,jy3);
1017 dz23 = _mm256_sub_pd(iz2,jz3);
1018 dx31 = _mm256_sub_pd(ix3,jx1);
1019 dy31 = _mm256_sub_pd(iy3,jy1);
1020 dz31 = _mm256_sub_pd(iz3,jz1);
1021 dx32 = _mm256_sub_pd(ix3,jx2);
1022 dy32 = _mm256_sub_pd(iy3,jy2);
1023 dz32 = _mm256_sub_pd(iz3,jz2);
1024 dx33 = _mm256_sub_pd(ix3,jx3);
1025 dy33 = _mm256_sub_pd(iy3,jy3);
1026 dz33 = _mm256_sub_pd(iz3,jz3);
1028 /* Calculate squared distance and things based on it */
1029 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1030 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1031 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1032 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1033 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1034 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1035 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1036 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1037 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1038 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1040 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1041 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1042 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1043 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1044 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1045 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1046 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1047 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1048 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1049 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1051 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1052 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1053 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1054 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1055 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1056 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1057 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1058 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1059 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1060 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1062 fjx0 = _mm256_setzero_pd();
1063 fjy0 = _mm256_setzero_pd();
1064 fjz0 = _mm256_setzero_pd();
1065 fjx1 = _mm256_setzero_pd();
1066 fjy1 = _mm256_setzero_pd();
1067 fjz1 = _mm256_setzero_pd();
1068 fjx2 = _mm256_setzero_pd();
1069 fjy2 = _mm256_setzero_pd();
1070 fjz2 = _mm256_setzero_pd();
1071 fjx3 = _mm256_setzero_pd();
1072 fjy3 = _mm256_setzero_pd();
1073 fjz3 = _mm256_setzero_pd();
1075 /**************************
1076 * CALCULATE INTERACTIONS *
1077 **************************/
1079 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1082 r00 = _mm256_mul_pd(rsq00,rinv00);
1083 r00 = _mm256_andnot_pd(dummy_mask,r00);
1085 /* LENNARD-JONES DISPERSION/REPULSION */
1087 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1088 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1089 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1090 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1091 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1093 d = _mm256_sub_pd(r00,rswitch);
1094 d = _mm256_max_pd(d,_mm256_setzero_pd());
1095 d2 = _mm256_mul_pd(d,d);
1096 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1098 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1100 /* Evaluate switch function */
1101 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1102 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1103 vvdw = _mm256_mul_pd(vvdw,sw);
1104 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1106 /* Update potential sum for this i atom from the interaction with this j atom. */
1107 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
1108 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
1109 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
1113 fscal = _mm256_and_pd(fscal,cutoff_mask);
1115 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1117 /* Calculate temporary vectorial force */
1118 tx = _mm256_mul_pd(fscal,dx00);
1119 ty = _mm256_mul_pd(fscal,dy00);
1120 tz = _mm256_mul_pd(fscal,dz00);
1122 /* Update vectorial force */
1123 fix0 = _mm256_add_pd(fix0,tx);
1124 fiy0 = _mm256_add_pd(fiy0,ty);
1125 fiz0 = _mm256_add_pd(fiz0,tz);
1127 fjx0 = _mm256_add_pd(fjx0,tx);
1128 fjy0 = _mm256_add_pd(fjy0,ty);
1129 fjz0 = _mm256_add_pd(fjz0,tz);
1133 /**************************
1134 * CALCULATE INTERACTIONS *
1135 **************************/
1137 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1140 r11 = _mm256_mul_pd(rsq11,rinv11);
1141 r11 = _mm256_andnot_pd(dummy_mask,r11);
1143 /* EWALD ELECTROSTATICS */
1145 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1146 ewrt = _mm256_mul_pd(r11,ewtabscale);
1147 ewitab = _mm256_cvttpd_epi32(ewrt);
1148 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1149 ewitab = _mm_slli_epi32(ewitab,2);
1150 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1151 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1152 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1153 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1154 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1155 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1156 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1157 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1158 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1160 d = _mm256_sub_pd(r11,rswitch);
1161 d = _mm256_max_pd(d,_mm256_setzero_pd());
1162 d2 = _mm256_mul_pd(d,d);
1163 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1165 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1167 /* Evaluate switch function */
1168 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1169 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1170 velec = _mm256_mul_pd(velec,sw);
1171 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1173 /* Update potential sum for this i atom from the interaction with this j atom. */
1174 velec = _mm256_and_pd(velec,cutoff_mask);
1175 velec = _mm256_andnot_pd(dummy_mask,velec);
1176 velecsum = _mm256_add_pd(velecsum,velec);
1180 fscal = _mm256_and_pd(fscal,cutoff_mask);
1182 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1184 /* Calculate temporary vectorial force */
1185 tx = _mm256_mul_pd(fscal,dx11);
1186 ty = _mm256_mul_pd(fscal,dy11);
1187 tz = _mm256_mul_pd(fscal,dz11);
1189 /* Update vectorial force */
1190 fix1 = _mm256_add_pd(fix1,tx);
1191 fiy1 = _mm256_add_pd(fiy1,ty);
1192 fiz1 = _mm256_add_pd(fiz1,tz);
1194 fjx1 = _mm256_add_pd(fjx1,tx);
1195 fjy1 = _mm256_add_pd(fjy1,ty);
1196 fjz1 = _mm256_add_pd(fjz1,tz);
1200 /**************************
1201 * CALCULATE INTERACTIONS *
1202 **************************/
1204 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1207 r12 = _mm256_mul_pd(rsq12,rinv12);
1208 r12 = _mm256_andnot_pd(dummy_mask,r12);
1210 /* EWALD ELECTROSTATICS */
1212 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1213 ewrt = _mm256_mul_pd(r12,ewtabscale);
1214 ewitab = _mm256_cvttpd_epi32(ewrt);
1215 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1216 ewitab = _mm_slli_epi32(ewitab,2);
1217 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1218 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1219 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1220 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1221 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1222 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1223 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1224 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1225 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1227 d = _mm256_sub_pd(r12,rswitch);
1228 d = _mm256_max_pd(d,_mm256_setzero_pd());
1229 d2 = _mm256_mul_pd(d,d);
1230 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1232 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1234 /* Evaluate switch function */
1235 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1236 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1237 velec = _mm256_mul_pd(velec,sw);
1238 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1240 /* Update potential sum for this i atom from the interaction with this j atom. */
1241 velec = _mm256_and_pd(velec,cutoff_mask);
1242 velec = _mm256_andnot_pd(dummy_mask,velec);
1243 velecsum = _mm256_add_pd(velecsum,velec);
1247 fscal = _mm256_and_pd(fscal,cutoff_mask);
1249 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1251 /* Calculate temporary vectorial force */
1252 tx = _mm256_mul_pd(fscal,dx12);
1253 ty = _mm256_mul_pd(fscal,dy12);
1254 tz = _mm256_mul_pd(fscal,dz12);
1256 /* Update vectorial force */
1257 fix1 = _mm256_add_pd(fix1,tx);
1258 fiy1 = _mm256_add_pd(fiy1,ty);
1259 fiz1 = _mm256_add_pd(fiz1,tz);
1261 fjx2 = _mm256_add_pd(fjx2,tx);
1262 fjy2 = _mm256_add_pd(fjy2,ty);
1263 fjz2 = _mm256_add_pd(fjz2,tz);
1267 /**************************
1268 * CALCULATE INTERACTIONS *
1269 **************************/
1271 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1274 r13 = _mm256_mul_pd(rsq13,rinv13);
1275 r13 = _mm256_andnot_pd(dummy_mask,r13);
1277 /* EWALD ELECTROSTATICS */
1279 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1280 ewrt = _mm256_mul_pd(r13,ewtabscale);
1281 ewitab = _mm256_cvttpd_epi32(ewrt);
1282 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1283 ewitab = _mm_slli_epi32(ewitab,2);
1284 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1285 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1286 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1287 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1288 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1289 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1290 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1291 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1292 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1294 d = _mm256_sub_pd(r13,rswitch);
1295 d = _mm256_max_pd(d,_mm256_setzero_pd());
1296 d2 = _mm256_mul_pd(d,d);
1297 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1299 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1301 /* Evaluate switch function */
1302 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1303 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
1304 velec = _mm256_mul_pd(velec,sw);
1305 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1307 /* Update potential sum for this i atom from the interaction with this j atom. */
1308 velec = _mm256_and_pd(velec,cutoff_mask);
1309 velec = _mm256_andnot_pd(dummy_mask,velec);
1310 velecsum = _mm256_add_pd(velecsum,velec);
1314 fscal = _mm256_and_pd(fscal,cutoff_mask);
1316 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1318 /* Calculate temporary vectorial force */
1319 tx = _mm256_mul_pd(fscal,dx13);
1320 ty = _mm256_mul_pd(fscal,dy13);
1321 tz = _mm256_mul_pd(fscal,dz13);
1323 /* Update vectorial force */
1324 fix1 = _mm256_add_pd(fix1,tx);
1325 fiy1 = _mm256_add_pd(fiy1,ty);
1326 fiz1 = _mm256_add_pd(fiz1,tz);
1328 fjx3 = _mm256_add_pd(fjx3,tx);
1329 fjy3 = _mm256_add_pd(fjy3,ty);
1330 fjz3 = _mm256_add_pd(fjz3,tz);
1334 /**************************
1335 * CALCULATE INTERACTIONS *
1336 **************************/
1338 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1341 r21 = _mm256_mul_pd(rsq21,rinv21);
1342 r21 = _mm256_andnot_pd(dummy_mask,r21);
1344 /* EWALD ELECTROSTATICS */
1346 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1347 ewrt = _mm256_mul_pd(r21,ewtabscale);
1348 ewitab = _mm256_cvttpd_epi32(ewrt);
1349 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1350 ewitab = _mm_slli_epi32(ewitab,2);
1351 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1352 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1353 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1354 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1355 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1356 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1357 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1358 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1359 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1361 d = _mm256_sub_pd(r21,rswitch);
1362 d = _mm256_max_pd(d,_mm256_setzero_pd());
1363 d2 = _mm256_mul_pd(d,d);
1364 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1366 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1368 /* Evaluate switch function */
1369 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1370 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1371 velec = _mm256_mul_pd(velec,sw);
1372 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1374 /* Update potential sum for this i atom from the interaction with this j atom. */
1375 velec = _mm256_and_pd(velec,cutoff_mask);
1376 velec = _mm256_andnot_pd(dummy_mask,velec);
1377 velecsum = _mm256_add_pd(velecsum,velec);
1381 fscal = _mm256_and_pd(fscal,cutoff_mask);
1383 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1385 /* Calculate temporary vectorial force */
1386 tx = _mm256_mul_pd(fscal,dx21);
1387 ty = _mm256_mul_pd(fscal,dy21);
1388 tz = _mm256_mul_pd(fscal,dz21);
1390 /* Update vectorial force */
1391 fix2 = _mm256_add_pd(fix2,tx);
1392 fiy2 = _mm256_add_pd(fiy2,ty);
1393 fiz2 = _mm256_add_pd(fiz2,tz);
1395 fjx1 = _mm256_add_pd(fjx1,tx);
1396 fjy1 = _mm256_add_pd(fjy1,ty);
1397 fjz1 = _mm256_add_pd(fjz1,tz);
1401 /**************************
1402 * CALCULATE INTERACTIONS *
1403 **************************/
1405 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1408 r22 = _mm256_mul_pd(rsq22,rinv22);
1409 r22 = _mm256_andnot_pd(dummy_mask,r22);
1411 /* EWALD ELECTROSTATICS */
1413 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1414 ewrt = _mm256_mul_pd(r22,ewtabscale);
1415 ewitab = _mm256_cvttpd_epi32(ewrt);
1416 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1417 ewitab = _mm_slli_epi32(ewitab,2);
1418 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1419 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1420 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1421 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1422 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1423 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1424 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1425 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1426 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1428 d = _mm256_sub_pd(r22,rswitch);
1429 d = _mm256_max_pd(d,_mm256_setzero_pd());
1430 d2 = _mm256_mul_pd(d,d);
1431 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1433 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1435 /* Evaluate switch function */
1436 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1437 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1438 velec = _mm256_mul_pd(velec,sw);
1439 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1441 /* Update potential sum for this i atom from the interaction with this j atom. */
1442 velec = _mm256_and_pd(velec,cutoff_mask);
1443 velec = _mm256_andnot_pd(dummy_mask,velec);
1444 velecsum = _mm256_add_pd(velecsum,velec);
1448 fscal = _mm256_and_pd(fscal,cutoff_mask);
1450 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1452 /* Calculate temporary vectorial force */
1453 tx = _mm256_mul_pd(fscal,dx22);
1454 ty = _mm256_mul_pd(fscal,dy22);
1455 tz = _mm256_mul_pd(fscal,dz22);
1457 /* Update vectorial force */
1458 fix2 = _mm256_add_pd(fix2,tx);
1459 fiy2 = _mm256_add_pd(fiy2,ty);
1460 fiz2 = _mm256_add_pd(fiz2,tz);
1462 fjx2 = _mm256_add_pd(fjx2,tx);
1463 fjy2 = _mm256_add_pd(fjy2,ty);
1464 fjz2 = _mm256_add_pd(fjz2,tz);
1468 /**************************
1469 * CALCULATE INTERACTIONS *
1470 **************************/
1472 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1475 r23 = _mm256_mul_pd(rsq23,rinv23);
1476 r23 = _mm256_andnot_pd(dummy_mask,r23);
1478 /* EWALD ELECTROSTATICS */
1480 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1481 ewrt = _mm256_mul_pd(r23,ewtabscale);
1482 ewitab = _mm256_cvttpd_epi32(ewrt);
1483 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1484 ewitab = _mm_slli_epi32(ewitab,2);
1485 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1486 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1487 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1488 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1489 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1490 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1491 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1492 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1493 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1495 d = _mm256_sub_pd(r23,rswitch);
1496 d = _mm256_max_pd(d,_mm256_setzero_pd());
1497 d2 = _mm256_mul_pd(d,d);
1498 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1500 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1502 /* Evaluate switch function */
1503 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1504 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
1505 velec = _mm256_mul_pd(velec,sw);
1506 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1508 /* Update potential sum for this i atom from the interaction with this j atom. */
1509 velec = _mm256_and_pd(velec,cutoff_mask);
1510 velec = _mm256_andnot_pd(dummy_mask,velec);
1511 velecsum = _mm256_add_pd(velecsum,velec);
1515 fscal = _mm256_and_pd(fscal,cutoff_mask);
1517 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1519 /* Calculate temporary vectorial force */
1520 tx = _mm256_mul_pd(fscal,dx23);
1521 ty = _mm256_mul_pd(fscal,dy23);
1522 tz = _mm256_mul_pd(fscal,dz23);
1524 /* Update vectorial force */
1525 fix2 = _mm256_add_pd(fix2,tx);
1526 fiy2 = _mm256_add_pd(fiy2,ty);
1527 fiz2 = _mm256_add_pd(fiz2,tz);
1529 fjx3 = _mm256_add_pd(fjx3,tx);
1530 fjy3 = _mm256_add_pd(fjy3,ty);
1531 fjz3 = _mm256_add_pd(fjz3,tz);
1535 /**************************
1536 * CALCULATE INTERACTIONS *
1537 **************************/
1539 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1542 r31 = _mm256_mul_pd(rsq31,rinv31);
1543 r31 = _mm256_andnot_pd(dummy_mask,r31);
1545 /* EWALD ELECTROSTATICS */
1547 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1548 ewrt = _mm256_mul_pd(r31,ewtabscale);
1549 ewitab = _mm256_cvttpd_epi32(ewrt);
1550 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1551 ewitab = _mm_slli_epi32(ewitab,2);
1552 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1553 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1554 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1555 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1556 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1557 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1558 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1559 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1560 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1562 d = _mm256_sub_pd(r31,rswitch);
1563 d = _mm256_max_pd(d,_mm256_setzero_pd());
1564 d2 = _mm256_mul_pd(d,d);
1565 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1567 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1569 /* Evaluate switch function */
1570 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1571 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
1572 velec = _mm256_mul_pd(velec,sw);
1573 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1575 /* Update potential sum for this i atom from the interaction with this j atom. */
1576 velec = _mm256_and_pd(velec,cutoff_mask);
1577 velec = _mm256_andnot_pd(dummy_mask,velec);
1578 velecsum = _mm256_add_pd(velecsum,velec);
1582 fscal = _mm256_and_pd(fscal,cutoff_mask);
1584 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1586 /* Calculate temporary vectorial force */
1587 tx = _mm256_mul_pd(fscal,dx31);
1588 ty = _mm256_mul_pd(fscal,dy31);
1589 tz = _mm256_mul_pd(fscal,dz31);
1591 /* Update vectorial force */
1592 fix3 = _mm256_add_pd(fix3,tx);
1593 fiy3 = _mm256_add_pd(fiy3,ty);
1594 fiz3 = _mm256_add_pd(fiz3,tz);
1596 fjx1 = _mm256_add_pd(fjx1,tx);
1597 fjy1 = _mm256_add_pd(fjy1,ty);
1598 fjz1 = _mm256_add_pd(fjz1,tz);
1602 /**************************
1603 * CALCULATE INTERACTIONS *
1604 **************************/
1606 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1609 r32 = _mm256_mul_pd(rsq32,rinv32);
1610 r32 = _mm256_andnot_pd(dummy_mask,r32);
1612 /* EWALD ELECTROSTATICS */
1614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1615 ewrt = _mm256_mul_pd(r32,ewtabscale);
1616 ewitab = _mm256_cvttpd_epi32(ewrt);
1617 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1618 ewitab = _mm_slli_epi32(ewitab,2);
1619 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1620 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1621 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1622 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1623 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1624 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1625 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1626 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1627 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1629 d = _mm256_sub_pd(r32,rswitch);
1630 d = _mm256_max_pd(d,_mm256_setzero_pd());
1631 d2 = _mm256_mul_pd(d,d);
1632 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1634 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1636 /* Evaluate switch function */
1637 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1638 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
1639 velec = _mm256_mul_pd(velec,sw);
1640 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1642 /* Update potential sum for this i atom from the interaction with this j atom. */
1643 velec = _mm256_and_pd(velec,cutoff_mask);
1644 velec = _mm256_andnot_pd(dummy_mask,velec);
1645 velecsum = _mm256_add_pd(velecsum,velec);
1649 fscal = _mm256_and_pd(fscal,cutoff_mask);
1651 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1653 /* Calculate temporary vectorial force */
1654 tx = _mm256_mul_pd(fscal,dx32);
1655 ty = _mm256_mul_pd(fscal,dy32);
1656 tz = _mm256_mul_pd(fscal,dz32);
1658 /* Update vectorial force */
1659 fix3 = _mm256_add_pd(fix3,tx);
1660 fiy3 = _mm256_add_pd(fiy3,ty);
1661 fiz3 = _mm256_add_pd(fiz3,tz);
1663 fjx2 = _mm256_add_pd(fjx2,tx);
1664 fjy2 = _mm256_add_pd(fjy2,ty);
1665 fjz2 = _mm256_add_pd(fjz2,tz);
1669 /**************************
1670 * CALCULATE INTERACTIONS *
1671 **************************/
1673 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1676 r33 = _mm256_mul_pd(rsq33,rinv33);
1677 r33 = _mm256_andnot_pd(dummy_mask,r33);
1679 /* EWALD ELECTROSTATICS */
1681 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1682 ewrt = _mm256_mul_pd(r33,ewtabscale);
1683 ewitab = _mm256_cvttpd_epi32(ewrt);
1684 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1685 ewitab = _mm_slli_epi32(ewitab,2);
1686 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1687 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1688 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1689 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1690 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1691 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1692 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1693 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1694 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1696 d = _mm256_sub_pd(r33,rswitch);
1697 d = _mm256_max_pd(d,_mm256_setzero_pd());
1698 d2 = _mm256_mul_pd(d,d);
1699 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1701 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1703 /* Evaluate switch function */
1704 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1705 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
1706 velec = _mm256_mul_pd(velec,sw);
1707 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1709 /* Update potential sum for this i atom from the interaction with this j atom. */
1710 velec = _mm256_and_pd(velec,cutoff_mask);
1711 velec = _mm256_andnot_pd(dummy_mask,velec);
1712 velecsum = _mm256_add_pd(velecsum,velec);
1716 fscal = _mm256_and_pd(fscal,cutoff_mask);
1718 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1720 /* Calculate temporary vectorial force */
1721 tx = _mm256_mul_pd(fscal,dx33);
1722 ty = _mm256_mul_pd(fscal,dy33);
1723 tz = _mm256_mul_pd(fscal,dz33);
1725 /* Update vectorial force */
1726 fix3 = _mm256_add_pd(fix3,tx);
1727 fiy3 = _mm256_add_pd(fiy3,ty);
1728 fiz3 = _mm256_add_pd(fiz3,tz);
1730 fjx3 = _mm256_add_pd(fjx3,tx);
1731 fjy3 = _mm256_add_pd(fjy3,ty);
1732 fjz3 = _mm256_add_pd(fjz3,tz);
1736 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1737 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1738 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1739 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1741 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1742 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1743 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1745 /* Inner loop uses 657 flops */
1748 /* End of innermost loop */
1750 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1751 f+i_coord_offset,fshift+i_shift_offset);
1754 /* Update potential energies */
1755 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1756 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1758 /* Increment number of inner iterations */
1759 inneriter += j_index_end - j_index_start;
1761 /* Outer loop uses 26 flops */
1764 /* Increment number of outer iterations */
1767 /* Update outer/inner flops */
1769 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1772 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
1773 * Electrostatics interaction: Ewald
1774 * VdW interaction: LennardJones
1775 * Geometry: Water4-Water4
1776 * Calculate force/pot: Force
1779 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
1780 (t_nblist * gmx_restrict nlist,
1781 rvec * gmx_restrict xx,
1782 rvec * gmx_restrict ff,
1783 t_forcerec * gmx_restrict fr,
1784 t_mdatoms * gmx_restrict mdatoms,
1785 nb_kernel_data_t * gmx_restrict kernel_data,
1786 t_nrnb * gmx_restrict nrnb)
1788 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1789 * just 0 for non-waters.
1790 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1791 * jnr indices corresponding to data put in the four positions in the SIMD register.
1793 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1794 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1795 int jnrA,jnrB,jnrC,jnrD;
1796 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1797 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1798 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1799 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1800 real rcutoff_scalar;
1801 real *shiftvec,*fshift,*x,*f;
1802 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1803 real scratch[4*DIM];
1804 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1805 real * vdwioffsetptr0;
1806 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1807 real * vdwioffsetptr1;
1808 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1809 real * vdwioffsetptr2;
1810 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1811 real * vdwioffsetptr3;
1812 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1813 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1814 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1815 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1816 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1817 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1818 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1819 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1820 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1821 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1822 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1823 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1824 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1825 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1826 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1827 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1828 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1829 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1830 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1831 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1834 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1837 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1838 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1840 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1841 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1843 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1844 real rswitch_scalar,d_scalar;
1845 __m256d dummy_mask,cutoff_mask;
1846 __m128 tmpmask0,tmpmask1;
1847 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1848 __m256d one = _mm256_set1_pd(1.0);
1849 __m256d two = _mm256_set1_pd(2.0);
1855 jindex = nlist->jindex;
1857 shiftidx = nlist->shift;
1859 shiftvec = fr->shift_vec[0];
1860 fshift = fr->fshift[0];
1861 facel = _mm256_set1_pd(fr->epsfac);
1862 charge = mdatoms->chargeA;
1863 nvdwtype = fr->ntype;
1864 vdwparam = fr->nbfp;
1865 vdwtype = mdatoms->typeA;
1867 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1868 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1869 beta2 = _mm256_mul_pd(beta,beta);
1870 beta3 = _mm256_mul_pd(beta,beta2);
1872 ewtab = fr->ic->tabq_coul_FDV0;
1873 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1874 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1876 /* Setup water-specific parameters */
1877 inr = nlist->iinr[0];
1878 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1879 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1880 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1881 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1883 jq1 = _mm256_set1_pd(charge[inr+1]);
1884 jq2 = _mm256_set1_pd(charge[inr+2]);
1885 jq3 = _mm256_set1_pd(charge[inr+3]);
1886 vdwjidx0A = 2*vdwtype[inr+0];
1887 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1888 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1889 qq11 = _mm256_mul_pd(iq1,jq1);
1890 qq12 = _mm256_mul_pd(iq1,jq2);
1891 qq13 = _mm256_mul_pd(iq1,jq3);
1892 qq21 = _mm256_mul_pd(iq2,jq1);
1893 qq22 = _mm256_mul_pd(iq2,jq2);
1894 qq23 = _mm256_mul_pd(iq2,jq3);
1895 qq31 = _mm256_mul_pd(iq3,jq1);
1896 qq32 = _mm256_mul_pd(iq3,jq2);
1897 qq33 = _mm256_mul_pd(iq3,jq3);
1899 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1900 rcutoff_scalar = fr->rcoulomb;
1901 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1902 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1904 rswitch_scalar = fr->rcoulomb_switch;
1905 rswitch = _mm256_set1_pd(rswitch_scalar);
1906 /* Setup switch parameters */
1907 d_scalar = rcutoff_scalar-rswitch_scalar;
1908 d = _mm256_set1_pd(d_scalar);
1909 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1910 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1911 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1912 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1913 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1914 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1916 /* Avoid stupid compiler warnings */
1917 jnrA = jnrB = jnrC = jnrD = 0;
1918 j_coord_offsetA = 0;
1919 j_coord_offsetB = 0;
1920 j_coord_offsetC = 0;
1921 j_coord_offsetD = 0;
1926 for(iidx=0;iidx<4*DIM;iidx++)
1928 scratch[iidx] = 0.0;
1931 /* Start outer loop over neighborlists */
1932 for(iidx=0; iidx<nri; iidx++)
1934 /* Load shift vector for this list */
1935 i_shift_offset = DIM*shiftidx[iidx];
1937 /* Load limits for loop over neighbors */
1938 j_index_start = jindex[iidx];
1939 j_index_end = jindex[iidx+1];
1941 /* Get outer coordinate index */
1943 i_coord_offset = DIM*inr;
1945 /* Load i particle coords and add shift vector */
1946 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1947 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1949 fix0 = _mm256_setzero_pd();
1950 fiy0 = _mm256_setzero_pd();
1951 fiz0 = _mm256_setzero_pd();
1952 fix1 = _mm256_setzero_pd();
1953 fiy1 = _mm256_setzero_pd();
1954 fiz1 = _mm256_setzero_pd();
1955 fix2 = _mm256_setzero_pd();
1956 fiy2 = _mm256_setzero_pd();
1957 fiz2 = _mm256_setzero_pd();
1958 fix3 = _mm256_setzero_pd();
1959 fiy3 = _mm256_setzero_pd();
1960 fiz3 = _mm256_setzero_pd();
1962 /* Start inner kernel loop */
1963 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1966 /* Get j neighbor index, and coordinate index */
1968 jnrB = jjnr[jidx+1];
1969 jnrC = jjnr[jidx+2];
1970 jnrD = jjnr[jidx+3];
1971 j_coord_offsetA = DIM*jnrA;
1972 j_coord_offsetB = DIM*jnrB;
1973 j_coord_offsetC = DIM*jnrC;
1974 j_coord_offsetD = DIM*jnrD;
1976 /* load j atom coordinates */
1977 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1978 x+j_coord_offsetC,x+j_coord_offsetD,
1979 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1980 &jy2,&jz2,&jx3,&jy3,&jz3);
1982 /* Calculate displacement vector */
1983 dx00 = _mm256_sub_pd(ix0,jx0);
1984 dy00 = _mm256_sub_pd(iy0,jy0);
1985 dz00 = _mm256_sub_pd(iz0,jz0);
1986 dx11 = _mm256_sub_pd(ix1,jx1);
1987 dy11 = _mm256_sub_pd(iy1,jy1);
1988 dz11 = _mm256_sub_pd(iz1,jz1);
1989 dx12 = _mm256_sub_pd(ix1,jx2);
1990 dy12 = _mm256_sub_pd(iy1,jy2);
1991 dz12 = _mm256_sub_pd(iz1,jz2);
1992 dx13 = _mm256_sub_pd(ix1,jx3);
1993 dy13 = _mm256_sub_pd(iy1,jy3);
1994 dz13 = _mm256_sub_pd(iz1,jz3);
1995 dx21 = _mm256_sub_pd(ix2,jx1);
1996 dy21 = _mm256_sub_pd(iy2,jy1);
1997 dz21 = _mm256_sub_pd(iz2,jz1);
1998 dx22 = _mm256_sub_pd(ix2,jx2);
1999 dy22 = _mm256_sub_pd(iy2,jy2);
2000 dz22 = _mm256_sub_pd(iz2,jz2);
2001 dx23 = _mm256_sub_pd(ix2,jx3);
2002 dy23 = _mm256_sub_pd(iy2,jy3);
2003 dz23 = _mm256_sub_pd(iz2,jz3);
2004 dx31 = _mm256_sub_pd(ix3,jx1);
2005 dy31 = _mm256_sub_pd(iy3,jy1);
2006 dz31 = _mm256_sub_pd(iz3,jz1);
2007 dx32 = _mm256_sub_pd(ix3,jx2);
2008 dy32 = _mm256_sub_pd(iy3,jy2);
2009 dz32 = _mm256_sub_pd(iz3,jz2);
2010 dx33 = _mm256_sub_pd(ix3,jx3);
2011 dy33 = _mm256_sub_pd(iy3,jy3);
2012 dz33 = _mm256_sub_pd(iz3,jz3);
2014 /* Calculate squared distance and things based on it */
2015 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2016 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2017 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2018 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2019 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2020 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2021 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2022 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2023 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2024 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2026 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2027 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2028 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2029 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2030 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2031 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2032 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2033 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2034 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2035 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2037 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2038 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2039 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2040 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2041 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2042 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2043 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2044 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2045 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2046 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2048 fjx0 = _mm256_setzero_pd();
2049 fjy0 = _mm256_setzero_pd();
2050 fjz0 = _mm256_setzero_pd();
2051 fjx1 = _mm256_setzero_pd();
2052 fjy1 = _mm256_setzero_pd();
2053 fjz1 = _mm256_setzero_pd();
2054 fjx2 = _mm256_setzero_pd();
2055 fjy2 = _mm256_setzero_pd();
2056 fjz2 = _mm256_setzero_pd();
2057 fjx3 = _mm256_setzero_pd();
2058 fjy3 = _mm256_setzero_pd();
2059 fjz3 = _mm256_setzero_pd();
2061 /**************************
2062 * CALCULATE INTERACTIONS *
2063 **************************/
2065 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2068 r00 = _mm256_mul_pd(rsq00,rinv00);
2070 /* LENNARD-JONES DISPERSION/REPULSION */
2072 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2073 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
2074 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2075 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2076 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2078 d = _mm256_sub_pd(r00,rswitch);
2079 d = _mm256_max_pd(d,_mm256_setzero_pd());
2080 d2 = _mm256_mul_pd(d,d);
2081 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2083 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2085 /* Evaluate switch function */
2086 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2087 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2088 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2092 fscal = _mm256_and_pd(fscal,cutoff_mask);
2094 /* Calculate temporary vectorial force */
2095 tx = _mm256_mul_pd(fscal,dx00);
2096 ty = _mm256_mul_pd(fscal,dy00);
2097 tz = _mm256_mul_pd(fscal,dz00);
2099 /* Update vectorial force */
2100 fix0 = _mm256_add_pd(fix0,tx);
2101 fiy0 = _mm256_add_pd(fiy0,ty);
2102 fiz0 = _mm256_add_pd(fiz0,tz);
2104 fjx0 = _mm256_add_pd(fjx0,tx);
2105 fjy0 = _mm256_add_pd(fjy0,ty);
2106 fjz0 = _mm256_add_pd(fjz0,tz);
2110 /**************************
2111 * CALCULATE INTERACTIONS *
2112 **************************/
2114 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2117 r11 = _mm256_mul_pd(rsq11,rinv11);
2119 /* EWALD ELECTROSTATICS */
2121 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2122 ewrt = _mm256_mul_pd(r11,ewtabscale);
2123 ewitab = _mm256_cvttpd_epi32(ewrt);
2124 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2125 ewitab = _mm_slli_epi32(ewitab,2);
2126 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2127 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2128 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2129 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2130 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2131 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2132 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2133 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2134 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2136 d = _mm256_sub_pd(r11,rswitch);
2137 d = _mm256_max_pd(d,_mm256_setzero_pd());
2138 d2 = _mm256_mul_pd(d,d);
2139 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2141 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2143 /* Evaluate switch function */
2144 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2145 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2146 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2150 fscal = _mm256_and_pd(fscal,cutoff_mask);
2152 /* Calculate temporary vectorial force */
2153 tx = _mm256_mul_pd(fscal,dx11);
2154 ty = _mm256_mul_pd(fscal,dy11);
2155 tz = _mm256_mul_pd(fscal,dz11);
2157 /* Update vectorial force */
2158 fix1 = _mm256_add_pd(fix1,tx);
2159 fiy1 = _mm256_add_pd(fiy1,ty);
2160 fiz1 = _mm256_add_pd(fiz1,tz);
2162 fjx1 = _mm256_add_pd(fjx1,tx);
2163 fjy1 = _mm256_add_pd(fjy1,ty);
2164 fjz1 = _mm256_add_pd(fjz1,tz);
2168 /**************************
2169 * CALCULATE INTERACTIONS *
2170 **************************/
2172 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2175 r12 = _mm256_mul_pd(rsq12,rinv12);
2177 /* EWALD ELECTROSTATICS */
2179 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2180 ewrt = _mm256_mul_pd(r12,ewtabscale);
2181 ewitab = _mm256_cvttpd_epi32(ewrt);
2182 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2183 ewitab = _mm_slli_epi32(ewitab,2);
2184 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2185 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2186 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2187 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2188 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2189 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2190 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2191 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2192 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2194 d = _mm256_sub_pd(r12,rswitch);
2195 d = _mm256_max_pd(d,_mm256_setzero_pd());
2196 d2 = _mm256_mul_pd(d,d);
2197 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2199 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2201 /* Evaluate switch function */
2202 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2203 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2204 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2208 fscal = _mm256_and_pd(fscal,cutoff_mask);
2210 /* Calculate temporary vectorial force */
2211 tx = _mm256_mul_pd(fscal,dx12);
2212 ty = _mm256_mul_pd(fscal,dy12);
2213 tz = _mm256_mul_pd(fscal,dz12);
2215 /* Update vectorial force */
2216 fix1 = _mm256_add_pd(fix1,tx);
2217 fiy1 = _mm256_add_pd(fiy1,ty);
2218 fiz1 = _mm256_add_pd(fiz1,tz);
2220 fjx2 = _mm256_add_pd(fjx2,tx);
2221 fjy2 = _mm256_add_pd(fjy2,ty);
2222 fjz2 = _mm256_add_pd(fjz2,tz);
2226 /**************************
2227 * CALCULATE INTERACTIONS *
2228 **************************/
2230 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2233 r13 = _mm256_mul_pd(rsq13,rinv13);
2235 /* EWALD ELECTROSTATICS */
2237 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2238 ewrt = _mm256_mul_pd(r13,ewtabscale);
2239 ewitab = _mm256_cvttpd_epi32(ewrt);
2240 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2241 ewitab = _mm_slli_epi32(ewitab,2);
2242 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2243 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2244 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2245 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2246 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2247 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2248 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2249 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2250 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2252 d = _mm256_sub_pd(r13,rswitch);
2253 d = _mm256_max_pd(d,_mm256_setzero_pd());
2254 d2 = _mm256_mul_pd(d,d);
2255 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2257 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2259 /* Evaluate switch function */
2260 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2261 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2262 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2266 fscal = _mm256_and_pd(fscal,cutoff_mask);
2268 /* Calculate temporary vectorial force */
2269 tx = _mm256_mul_pd(fscal,dx13);
2270 ty = _mm256_mul_pd(fscal,dy13);
2271 tz = _mm256_mul_pd(fscal,dz13);
2273 /* Update vectorial force */
2274 fix1 = _mm256_add_pd(fix1,tx);
2275 fiy1 = _mm256_add_pd(fiy1,ty);
2276 fiz1 = _mm256_add_pd(fiz1,tz);
2278 fjx3 = _mm256_add_pd(fjx3,tx);
2279 fjy3 = _mm256_add_pd(fjy3,ty);
2280 fjz3 = _mm256_add_pd(fjz3,tz);
2284 /**************************
2285 * CALCULATE INTERACTIONS *
2286 **************************/
2288 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2291 r21 = _mm256_mul_pd(rsq21,rinv21);
2293 /* EWALD ELECTROSTATICS */
2295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2296 ewrt = _mm256_mul_pd(r21,ewtabscale);
2297 ewitab = _mm256_cvttpd_epi32(ewrt);
2298 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2299 ewitab = _mm_slli_epi32(ewitab,2);
2300 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2301 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2302 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2303 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2304 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2305 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2306 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2307 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2308 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2310 d = _mm256_sub_pd(r21,rswitch);
2311 d = _mm256_max_pd(d,_mm256_setzero_pd());
2312 d2 = _mm256_mul_pd(d,d);
2313 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2315 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2317 /* Evaluate switch function */
2318 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2319 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2320 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2324 fscal = _mm256_and_pd(fscal,cutoff_mask);
2326 /* Calculate temporary vectorial force */
2327 tx = _mm256_mul_pd(fscal,dx21);
2328 ty = _mm256_mul_pd(fscal,dy21);
2329 tz = _mm256_mul_pd(fscal,dz21);
2331 /* Update vectorial force */
2332 fix2 = _mm256_add_pd(fix2,tx);
2333 fiy2 = _mm256_add_pd(fiy2,ty);
2334 fiz2 = _mm256_add_pd(fiz2,tz);
2336 fjx1 = _mm256_add_pd(fjx1,tx);
2337 fjy1 = _mm256_add_pd(fjy1,ty);
2338 fjz1 = _mm256_add_pd(fjz1,tz);
2342 /**************************
2343 * CALCULATE INTERACTIONS *
2344 **************************/
2346 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2349 r22 = _mm256_mul_pd(rsq22,rinv22);
2351 /* EWALD ELECTROSTATICS */
2353 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2354 ewrt = _mm256_mul_pd(r22,ewtabscale);
2355 ewitab = _mm256_cvttpd_epi32(ewrt);
2356 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2357 ewitab = _mm_slli_epi32(ewitab,2);
2358 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2359 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2360 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2361 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2362 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2363 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2364 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2365 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2366 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2368 d = _mm256_sub_pd(r22,rswitch);
2369 d = _mm256_max_pd(d,_mm256_setzero_pd());
2370 d2 = _mm256_mul_pd(d,d);
2371 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2373 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2375 /* Evaluate switch function */
2376 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2377 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2378 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2382 fscal = _mm256_and_pd(fscal,cutoff_mask);
2384 /* Calculate temporary vectorial force */
2385 tx = _mm256_mul_pd(fscal,dx22);
2386 ty = _mm256_mul_pd(fscal,dy22);
2387 tz = _mm256_mul_pd(fscal,dz22);
2389 /* Update vectorial force */
2390 fix2 = _mm256_add_pd(fix2,tx);
2391 fiy2 = _mm256_add_pd(fiy2,ty);
2392 fiz2 = _mm256_add_pd(fiz2,tz);
2394 fjx2 = _mm256_add_pd(fjx2,tx);
2395 fjy2 = _mm256_add_pd(fjy2,ty);
2396 fjz2 = _mm256_add_pd(fjz2,tz);
2400 /**************************
2401 * CALCULATE INTERACTIONS *
2402 **************************/
2404 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2407 r23 = _mm256_mul_pd(rsq23,rinv23);
2409 /* EWALD ELECTROSTATICS */
2411 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2412 ewrt = _mm256_mul_pd(r23,ewtabscale);
2413 ewitab = _mm256_cvttpd_epi32(ewrt);
2414 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2415 ewitab = _mm_slli_epi32(ewitab,2);
2416 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2417 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2418 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2419 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2420 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2421 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2422 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2423 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2424 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2426 d = _mm256_sub_pd(r23,rswitch);
2427 d = _mm256_max_pd(d,_mm256_setzero_pd());
2428 d2 = _mm256_mul_pd(d,d);
2429 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2431 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2433 /* Evaluate switch function */
2434 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2435 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2436 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2440 fscal = _mm256_and_pd(fscal,cutoff_mask);
2442 /* Calculate temporary vectorial force */
2443 tx = _mm256_mul_pd(fscal,dx23);
2444 ty = _mm256_mul_pd(fscal,dy23);
2445 tz = _mm256_mul_pd(fscal,dz23);
2447 /* Update vectorial force */
2448 fix2 = _mm256_add_pd(fix2,tx);
2449 fiy2 = _mm256_add_pd(fiy2,ty);
2450 fiz2 = _mm256_add_pd(fiz2,tz);
2452 fjx3 = _mm256_add_pd(fjx3,tx);
2453 fjy3 = _mm256_add_pd(fjy3,ty);
2454 fjz3 = _mm256_add_pd(fjz3,tz);
2458 /**************************
2459 * CALCULATE INTERACTIONS *
2460 **************************/
2462 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2465 r31 = _mm256_mul_pd(rsq31,rinv31);
2467 /* EWALD ELECTROSTATICS */
2469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2470 ewrt = _mm256_mul_pd(r31,ewtabscale);
2471 ewitab = _mm256_cvttpd_epi32(ewrt);
2472 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2473 ewitab = _mm_slli_epi32(ewitab,2);
2474 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2475 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2476 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2477 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2478 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2479 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2480 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2481 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2482 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2484 d = _mm256_sub_pd(r31,rswitch);
2485 d = _mm256_max_pd(d,_mm256_setzero_pd());
2486 d2 = _mm256_mul_pd(d,d);
2487 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2489 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2491 /* Evaluate switch function */
2492 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2493 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2494 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2498 fscal = _mm256_and_pd(fscal,cutoff_mask);
2500 /* Calculate temporary vectorial force */
2501 tx = _mm256_mul_pd(fscal,dx31);
2502 ty = _mm256_mul_pd(fscal,dy31);
2503 tz = _mm256_mul_pd(fscal,dz31);
2505 /* Update vectorial force */
2506 fix3 = _mm256_add_pd(fix3,tx);
2507 fiy3 = _mm256_add_pd(fiy3,ty);
2508 fiz3 = _mm256_add_pd(fiz3,tz);
2510 fjx1 = _mm256_add_pd(fjx1,tx);
2511 fjy1 = _mm256_add_pd(fjy1,ty);
2512 fjz1 = _mm256_add_pd(fjz1,tz);
2516 /**************************
2517 * CALCULATE INTERACTIONS *
2518 **************************/
2520 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2523 r32 = _mm256_mul_pd(rsq32,rinv32);
2525 /* EWALD ELECTROSTATICS */
2527 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2528 ewrt = _mm256_mul_pd(r32,ewtabscale);
2529 ewitab = _mm256_cvttpd_epi32(ewrt);
2530 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2531 ewitab = _mm_slli_epi32(ewitab,2);
2532 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2533 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2534 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2535 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2536 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2537 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2538 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2539 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2540 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2542 d = _mm256_sub_pd(r32,rswitch);
2543 d = _mm256_max_pd(d,_mm256_setzero_pd());
2544 d2 = _mm256_mul_pd(d,d);
2545 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2547 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2549 /* Evaluate switch function */
2550 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2551 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2552 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2556 fscal = _mm256_and_pd(fscal,cutoff_mask);
2558 /* Calculate temporary vectorial force */
2559 tx = _mm256_mul_pd(fscal,dx32);
2560 ty = _mm256_mul_pd(fscal,dy32);
2561 tz = _mm256_mul_pd(fscal,dz32);
2563 /* Update vectorial force */
2564 fix3 = _mm256_add_pd(fix3,tx);
2565 fiy3 = _mm256_add_pd(fiy3,ty);
2566 fiz3 = _mm256_add_pd(fiz3,tz);
2568 fjx2 = _mm256_add_pd(fjx2,tx);
2569 fjy2 = _mm256_add_pd(fjy2,ty);
2570 fjz2 = _mm256_add_pd(fjz2,tz);
2574 /**************************
2575 * CALCULATE INTERACTIONS *
2576 **************************/
2578 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2581 r33 = _mm256_mul_pd(rsq33,rinv33);
2583 /* EWALD ELECTROSTATICS */
2585 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2586 ewrt = _mm256_mul_pd(r33,ewtabscale);
2587 ewitab = _mm256_cvttpd_epi32(ewrt);
2588 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2589 ewitab = _mm_slli_epi32(ewitab,2);
2590 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2591 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2592 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2593 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2594 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2595 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2596 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2597 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
2598 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2600 d = _mm256_sub_pd(r33,rswitch);
2601 d = _mm256_max_pd(d,_mm256_setzero_pd());
2602 d2 = _mm256_mul_pd(d,d);
2603 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2605 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2607 /* Evaluate switch function */
2608 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2609 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
2610 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2614 fscal = _mm256_and_pd(fscal,cutoff_mask);
2616 /* Calculate temporary vectorial force */
2617 tx = _mm256_mul_pd(fscal,dx33);
2618 ty = _mm256_mul_pd(fscal,dy33);
2619 tz = _mm256_mul_pd(fscal,dz33);
2621 /* Update vectorial force */
2622 fix3 = _mm256_add_pd(fix3,tx);
2623 fiy3 = _mm256_add_pd(fiy3,ty);
2624 fiz3 = _mm256_add_pd(fiz3,tz);
2626 fjx3 = _mm256_add_pd(fjx3,tx);
2627 fjy3 = _mm256_add_pd(fjy3,ty);
2628 fjz3 = _mm256_add_pd(fjz3,tz);
2632 fjptrA = f+j_coord_offsetA;
2633 fjptrB = f+j_coord_offsetB;
2634 fjptrC = f+j_coord_offsetC;
2635 fjptrD = f+j_coord_offsetD;
2637 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2638 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2639 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2641 /* Inner loop uses 617 flops */
2644 if(jidx<j_index_end)
2647 /* Get j neighbor index, and coordinate index */
2648 jnrlistA = jjnr[jidx];
2649 jnrlistB = jjnr[jidx+1];
2650 jnrlistC = jjnr[jidx+2];
2651 jnrlistD = jjnr[jidx+3];
2652 /* Sign of each element will be negative for non-real atoms.
2653 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2654 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2656 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2658 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2659 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2660 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2662 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2663 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2664 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2665 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2666 j_coord_offsetA = DIM*jnrA;
2667 j_coord_offsetB = DIM*jnrB;
2668 j_coord_offsetC = DIM*jnrC;
2669 j_coord_offsetD = DIM*jnrD;
2671 /* load j atom coordinates */
2672 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2673 x+j_coord_offsetC,x+j_coord_offsetD,
2674 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2675 &jy2,&jz2,&jx3,&jy3,&jz3);
2677 /* Calculate displacement vector */
2678 dx00 = _mm256_sub_pd(ix0,jx0);
2679 dy00 = _mm256_sub_pd(iy0,jy0);
2680 dz00 = _mm256_sub_pd(iz0,jz0);
2681 dx11 = _mm256_sub_pd(ix1,jx1);
2682 dy11 = _mm256_sub_pd(iy1,jy1);
2683 dz11 = _mm256_sub_pd(iz1,jz1);
2684 dx12 = _mm256_sub_pd(ix1,jx2);
2685 dy12 = _mm256_sub_pd(iy1,jy2);
2686 dz12 = _mm256_sub_pd(iz1,jz2);
2687 dx13 = _mm256_sub_pd(ix1,jx3);
2688 dy13 = _mm256_sub_pd(iy1,jy3);
2689 dz13 = _mm256_sub_pd(iz1,jz3);
2690 dx21 = _mm256_sub_pd(ix2,jx1);
2691 dy21 = _mm256_sub_pd(iy2,jy1);
2692 dz21 = _mm256_sub_pd(iz2,jz1);
2693 dx22 = _mm256_sub_pd(ix2,jx2);
2694 dy22 = _mm256_sub_pd(iy2,jy2);
2695 dz22 = _mm256_sub_pd(iz2,jz2);
2696 dx23 = _mm256_sub_pd(ix2,jx3);
2697 dy23 = _mm256_sub_pd(iy2,jy3);
2698 dz23 = _mm256_sub_pd(iz2,jz3);
2699 dx31 = _mm256_sub_pd(ix3,jx1);
2700 dy31 = _mm256_sub_pd(iy3,jy1);
2701 dz31 = _mm256_sub_pd(iz3,jz1);
2702 dx32 = _mm256_sub_pd(ix3,jx2);
2703 dy32 = _mm256_sub_pd(iy3,jy2);
2704 dz32 = _mm256_sub_pd(iz3,jz2);
2705 dx33 = _mm256_sub_pd(ix3,jx3);
2706 dy33 = _mm256_sub_pd(iy3,jy3);
2707 dz33 = _mm256_sub_pd(iz3,jz3);
2709 /* Calculate squared distance and things based on it */
2710 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2711 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2712 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2713 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2714 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2715 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2716 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2717 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2718 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2719 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2721 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2722 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2723 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2724 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2725 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2726 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2727 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2728 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2729 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2730 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2732 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2733 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2734 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2735 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2736 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2737 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2738 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2739 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2740 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2741 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2743 fjx0 = _mm256_setzero_pd();
2744 fjy0 = _mm256_setzero_pd();
2745 fjz0 = _mm256_setzero_pd();
2746 fjx1 = _mm256_setzero_pd();
2747 fjy1 = _mm256_setzero_pd();
2748 fjz1 = _mm256_setzero_pd();
2749 fjx2 = _mm256_setzero_pd();
2750 fjy2 = _mm256_setzero_pd();
2751 fjz2 = _mm256_setzero_pd();
2752 fjx3 = _mm256_setzero_pd();
2753 fjy3 = _mm256_setzero_pd();
2754 fjz3 = _mm256_setzero_pd();
2756 /**************************
2757 * CALCULATE INTERACTIONS *
2758 **************************/
2760 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2763 r00 = _mm256_mul_pd(rsq00,rinv00);
2764 r00 = _mm256_andnot_pd(dummy_mask,r00);
2766 /* LENNARD-JONES DISPERSION/REPULSION */
2768 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2769 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
2770 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2771 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2772 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2774 d = _mm256_sub_pd(r00,rswitch);
2775 d = _mm256_max_pd(d,_mm256_setzero_pd());
2776 d2 = _mm256_mul_pd(d,d);
2777 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2779 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2781 /* Evaluate switch function */
2782 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2783 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2784 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2788 fscal = _mm256_and_pd(fscal,cutoff_mask);
2790 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2792 /* Calculate temporary vectorial force */
2793 tx = _mm256_mul_pd(fscal,dx00);
2794 ty = _mm256_mul_pd(fscal,dy00);
2795 tz = _mm256_mul_pd(fscal,dz00);
2797 /* Update vectorial force */
2798 fix0 = _mm256_add_pd(fix0,tx);
2799 fiy0 = _mm256_add_pd(fiy0,ty);
2800 fiz0 = _mm256_add_pd(fiz0,tz);
2802 fjx0 = _mm256_add_pd(fjx0,tx);
2803 fjy0 = _mm256_add_pd(fjy0,ty);
2804 fjz0 = _mm256_add_pd(fjz0,tz);
2808 /**************************
2809 * CALCULATE INTERACTIONS *
2810 **************************/
2812 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2815 r11 = _mm256_mul_pd(rsq11,rinv11);
2816 r11 = _mm256_andnot_pd(dummy_mask,r11);
2818 /* EWALD ELECTROSTATICS */
2820 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2821 ewrt = _mm256_mul_pd(r11,ewtabscale);
2822 ewitab = _mm256_cvttpd_epi32(ewrt);
2823 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2824 ewitab = _mm_slli_epi32(ewitab,2);
2825 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2826 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2827 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2828 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2829 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2830 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2831 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2832 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2833 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2835 d = _mm256_sub_pd(r11,rswitch);
2836 d = _mm256_max_pd(d,_mm256_setzero_pd());
2837 d2 = _mm256_mul_pd(d,d);
2838 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2840 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2842 /* Evaluate switch function */
2843 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2844 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2845 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2849 fscal = _mm256_and_pd(fscal,cutoff_mask);
2851 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2853 /* Calculate temporary vectorial force */
2854 tx = _mm256_mul_pd(fscal,dx11);
2855 ty = _mm256_mul_pd(fscal,dy11);
2856 tz = _mm256_mul_pd(fscal,dz11);
2858 /* Update vectorial force */
2859 fix1 = _mm256_add_pd(fix1,tx);
2860 fiy1 = _mm256_add_pd(fiy1,ty);
2861 fiz1 = _mm256_add_pd(fiz1,tz);
2863 fjx1 = _mm256_add_pd(fjx1,tx);
2864 fjy1 = _mm256_add_pd(fjy1,ty);
2865 fjz1 = _mm256_add_pd(fjz1,tz);
2869 /**************************
2870 * CALCULATE INTERACTIONS *
2871 **************************/
2873 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2876 r12 = _mm256_mul_pd(rsq12,rinv12);
2877 r12 = _mm256_andnot_pd(dummy_mask,r12);
2879 /* EWALD ELECTROSTATICS */
2881 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2882 ewrt = _mm256_mul_pd(r12,ewtabscale);
2883 ewitab = _mm256_cvttpd_epi32(ewrt);
2884 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2885 ewitab = _mm_slli_epi32(ewitab,2);
2886 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2887 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2888 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2889 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2890 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2891 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2892 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2893 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2894 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2896 d = _mm256_sub_pd(r12,rswitch);
2897 d = _mm256_max_pd(d,_mm256_setzero_pd());
2898 d2 = _mm256_mul_pd(d,d);
2899 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2901 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2903 /* Evaluate switch function */
2904 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2905 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2906 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2910 fscal = _mm256_and_pd(fscal,cutoff_mask);
2912 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2914 /* Calculate temporary vectorial force */
2915 tx = _mm256_mul_pd(fscal,dx12);
2916 ty = _mm256_mul_pd(fscal,dy12);
2917 tz = _mm256_mul_pd(fscal,dz12);
2919 /* Update vectorial force */
2920 fix1 = _mm256_add_pd(fix1,tx);
2921 fiy1 = _mm256_add_pd(fiy1,ty);
2922 fiz1 = _mm256_add_pd(fiz1,tz);
2924 fjx2 = _mm256_add_pd(fjx2,tx);
2925 fjy2 = _mm256_add_pd(fjy2,ty);
2926 fjz2 = _mm256_add_pd(fjz2,tz);
2930 /**************************
2931 * CALCULATE INTERACTIONS *
2932 **************************/
2934 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2937 r13 = _mm256_mul_pd(rsq13,rinv13);
2938 r13 = _mm256_andnot_pd(dummy_mask,r13);
2940 /* EWALD ELECTROSTATICS */
2942 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2943 ewrt = _mm256_mul_pd(r13,ewtabscale);
2944 ewitab = _mm256_cvttpd_epi32(ewrt);
2945 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2946 ewitab = _mm_slli_epi32(ewitab,2);
2947 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2948 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2949 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2950 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2951 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2952 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2953 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2954 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2955 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2957 d = _mm256_sub_pd(r13,rswitch);
2958 d = _mm256_max_pd(d,_mm256_setzero_pd());
2959 d2 = _mm256_mul_pd(d,d);
2960 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2962 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2964 /* Evaluate switch function */
2965 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2966 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2967 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2971 fscal = _mm256_and_pd(fscal,cutoff_mask);
2973 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2975 /* Calculate temporary vectorial force */
2976 tx = _mm256_mul_pd(fscal,dx13);
2977 ty = _mm256_mul_pd(fscal,dy13);
2978 tz = _mm256_mul_pd(fscal,dz13);
2980 /* Update vectorial force */
2981 fix1 = _mm256_add_pd(fix1,tx);
2982 fiy1 = _mm256_add_pd(fiy1,ty);
2983 fiz1 = _mm256_add_pd(fiz1,tz);
2985 fjx3 = _mm256_add_pd(fjx3,tx);
2986 fjy3 = _mm256_add_pd(fjy3,ty);
2987 fjz3 = _mm256_add_pd(fjz3,tz);
2991 /**************************
2992 * CALCULATE INTERACTIONS *
2993 **************************/
2995 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2998 r21 = _mm256_mul_pd(rsq21,rinv21);
2999 r21 = _mm256_andnot_pd(dummy_mask,r21);
3001 /* EWALD ELECTROSTATICS */
3003 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3004 ewrt = _mm256_mul_pd(r21,ewtabscale);
3005 ewitab = _mm256_cvttpd_epi32(ewrt);
3006 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3007 ewitab = _mm_slli_epi32(ewitab,2);
3008 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3009 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3010 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3011 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3012 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3013 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3014 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3015 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
3016 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
3018 d = _mm256_sub_pd(r21,rswitch);
3019 d = _mm256_max_pd(d,_mm256_setzero_pd());
3020 d2 = _mm256_mul_pd(d,d);
3021 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3023 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3025 /* Evaluate switch function */
3026 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3027 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
3028 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
3032 fscal = _mm256_and_pd(fscal,cutoff_mask);
3034 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3036 /* Calculate temporary vectorial force */
3037 tx = _mm256_mul_pd(fscal,dx21);
3038 ty = _mm256_mul_pd(fscal,dy21);
3039 tz = _mm256_mul_pd(fscal,dz21);
3041 /* Update vectorial force */
3042 fix2 = _mm256_add_pd(fix2,tx);
3043 fiy2 = _mm256_add_pd(fiy2,ty);
3044 fiz2 = _mm256_add_pd(fiz2,tz);
3046 fjx1 = _mm256_add_pd(fjx1,tx);
3047 fjy1 = _mm256_add_pd(fjy1,ty);
3048 fjz1 = _mm256_add_pd(fjz1,tz);
3052 /**************************
3053 * CALCULATE INTERACTIONS *
3054 **************************/
3056 if (gmx_mm256_any_lt(rsq22,rcutoff2))
3059 r22 = _mm256_mul_pd(rsq22,rinv22);
3060 r22 = _mm256_andnot_pd(dummy_mask,r22);
3062 /* EWALD ELECTROSTATICS */
3064 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3065 ewrt = _mm256_mul_pd(r22,ewtabscale);
3066 ewitab = _mm256_cvttpd_epi32(ewrt);
3067 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3068 ewitab = _mm_slli_epi32(ewitab,2);
3069 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3070 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3071 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3072 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3073 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3074 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3075 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3076 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
3077 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
3079 d = _mm256_sub_pd(r22,rswitch);
3080 d = _mm256_max_pd(d,_mm256_setzero_pd());
3081 d2 = _mm256_mul_pd(d,d);
3082 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3084 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3086 /* Evaluate switch function */
3087 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3088 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
3089 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
3093 fscal = _mm256_and_pd(fscal,cutoff_mask);
3095 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3097 /* Calculate temporary vectorial force */
3098 tx = _mm256_mul_pd(fscal,dx22);
3099 ty = _mm256_mul_pd(fscal,dy22);
3100 tz = _mm256_mul_pd(fscal,dz22);
3102 /* Update vectorial force */
3103 fix2 = _mm256_add_pd(fix2,tx);
3104 fiy2 = _mm256_add_pd(fiy2,ty);
3105 fiz2 = _mm256_add_pd(fiz2,tz);
3107 fjx2 = _mm256_add_pd(fjx2,tx);
3108 fjy2 = _mm256_add_pd(fjy2,ty);
3109 fjz2 = _mm256_add_pd(fjz2,tz);
3113 /**************************
3114 * CALCULATE INTERACTIONS *
3115 **************************/
3117 if (gmx_mm256_any_lt(rsq23,rcutoff2))
3120 r23 = _mm256_mul_pd(rsq23,rinv23);
3121 r23 = _mm256_andnot_pd(dummy_mask,r23);
3123 /* EWALD ELECTROSTATICS */
3125 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3126 ewrt = _mm256_mul_pd(r23,ewtabscale);
3127 ewitab = _mm256_cvttpd_epi32(ewrt);
3128 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3129 ewitab = _mm_slli_epi32(ewitab,2);
3130 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3131 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3132 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3133 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3134 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3135 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3136 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3137 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
3138 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
3140 d = _mm256_sub_pd(r23,rswitch);
3141 d = _mm256_max_pd(d,_mm256_setzero_pd());
3142 d2 = _mm256_mul_pd(d,d);
3143 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3145 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3147 /* Evaluate switch function */
3148 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3149 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
3150 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
3154 fscal = _mm256_and_pd(fscal,cutoff_mask);
3156 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3158 /* Calculate temporary vectorial force */
3159 tx = _mm256_mul_pd(fscal,dx23);
3160 ty = _mm256_mul_pd(fscal,dy23);
3161 tz = _mm256_mul_pd(fscal,dz23);
3163 /* Update vectorial force */
3164 fix2 = _mm256_add_pd(fix2,tx);
3165 fiy2 = _mm256_add_pd(fiy2,ty);
3166 fiz2 = _mm256_add_pd(fiz2,tz);
3168 fjx3 = _mm256_add_pd(fjx3,tx);
3169 fjy3 = _mm256_add_pd(fjy3,ty);
3170 fjz3 = _mm256_add_pd(fjz3,tz);
3174 /**************************
3175 * CALCULATE INTERACTIONS *
3176 **************************/
3178 if (gmx_mm256_any_lt(rsq31,rcutoff2))
3181 r31 = _mm256_mul_pd(rsq31,rinv31);
3182 r31 = _mm256_andnot_pd(dummy_mask,r31);
3184 /* EWALD ELECTROSTATICS */
3186 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3187 ewrt = _mm256_mul_pd(r31,ewtabscale);
3188 ewitab = _mm256_cvttpd_epi32(ewrt);
3189 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3190 ewitab = _mm_slli_epi32(ewitab,2);
3191 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3192 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3193 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3194 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3195 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3196 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3197 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3198 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
3199 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
3201 d = _mm256_sub_pd(r31,rswitch);
3202 d = _mm256_max_pd(d,_mm256_setzero_pd());
3203 d2 = _mm256_mul_pd(d,d);
3204 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3206 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3208 /* Evaluate switch function */
3209 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3210 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
3211 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
3215 fscal = _mm256_and_pd(fscal,cutoff_mask);
3217 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3219 /* Calculate temporary vectorial force */
3220 tx = _mm256_mul_pd(fscal,dx31);
3221 ty = _mm256_mul_pd(fscal,dy31);
3222 tz = _mm256_mul_pd(fscal,dz31);
3224 /* Update vectorial force */
3225 fix3 = _mm256_add_pd(fix3,tx);
3226 fiy3 = _mm256_add_pd(fiy3,ty);
3227 fiz3 = _mm256_add_pd(fiz3,tz);
3229 fjx1 = _mm256_add_pd(fjx1,tx);
3230 fjy1 = _mm256_add_pd(fjy1,ty);
3231 fjz1 = _mm256_add_pd(fjz1,tz);
3235 /**************************
3236 * CALCULATE INTERACTIONS *
3237 **************************/
3239 if (gmx_mm256_any_lt(rsq32,rcutoff2))
3242 r32 = _mm256_mul_pd(rsq32,rinv32);
3243 r32 = _mm256_andnot_pd(dummy_mask,r32);
3245 /* EWALD ELECTROSTATICS */
3247 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3248 ewrt = _mm256_mul_pd(r32,ewtabscale);
3249 ewitab = _mm256_cvttpd_epi32(ewrt);
3250 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3251 ewitab = _mm_slli_epi32(ewitab,2);
3252 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3253 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3254 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3255 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3256 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3257 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3258 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3259 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
3260 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
3262 d = _mm256_sub_pd(r32,rswitch);
3263 d = _mm256_max_pd(d,_mm256_setzero_pd());
3264 d2 = _mm256_mul_pd(d,d);
3265 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3267 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3269 /* Evaluate switch function */
3270 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3271 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
3272 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
3276 fscal = _mm256_and_pd(fscal,cutoff_mask);
3278 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3280 /* Calculate temporary vectorial force */
3281 tx = _mm256_mul_pd(fscal,dx32);
3282 ty = _mm256_mul_pd(fscal,dy32);
3283 tz = _mm256_mul_pd(fscal,dz32);
3285 /* Update vectorial force */
3286 fix3 = _mm256_add_pd(fix3,tx);
3287 fiy3 = _mm256_add_pd(fiy3,ty);
3288 fiz3 = _mm256_add_pd(fiz3,tz);
3290 fjx2 = _mm256_add_pd(fjx2,tx);
3291 fjy2 = _mm256_add_pd(fjy2,ty);
3292 fjz2 = _mm256_add_pd(fjz2,tz);
3296 /**************************
3297 * CALCULATE INTERACTIONS *
3298 **************************/
3300 if (gmx_mm256_any_lt(rsq33,rcutoff2))
3303 r33 = _mm256_mul_pd(rsq33,rinv33);
3304 r33 = _mm256_andnot_pd(dummy_mask,r33);
3306 /* EWALD ELECTROSTATICS */
3308 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3309 ewrt = _mm256_mul_pd(r33,ewtabscale);
3310 ewitab = _mm256_cvttpd_epi32(ewrt);
3311 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3312 ewitab = _mm_slli_epi32(ewitab,2);
3313 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3314 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3315 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3316 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3317 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3318 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3319 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3320 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
3321 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
3323 d = _mm256_sub_pd(r33,rswitch);
3324 d = _mm256_max_pd(d,_mm256_setzero_pd());
3325 d2 = _mm256_mul_pd(d,d);
3326 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3328 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3330 /* Evaluate switch function */
3331 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3332 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
3333 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
3337 fscal = _mm256_and_pd(fscal,cutoff_mask);
3339 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3341 /* Calculate temporary vectorial force */
3342 tx = _mm256_mul_pd(fscal,dx33);
3343 ty = _mm256_mul_pd(fscal,dy33);
3344 tz = _mm256_mul_pd(fscal,dz33);
3346 /* Update vectorial force */
3347 fix3 = _mm256_add_pd(fix3,tx);
3348 fiy3 = _mm256_add_pd(fiy3,ty);
3349 fiz3 = _mm256_add_pd(fiz3,tz);
3351 fjx3 = _mm256_add_pd(fjx3,tx);
3352 fjy3 = _mm256_add_pd(fjy3,ty);
3353 fjz3 = _mm256_add_pd(fjz3,tz);
3357 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3358 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3359 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3360 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3362 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
3363 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3364 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3366 /* Inner loop uses 627 flops */
3369 /* End of innermost loop */
3371 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3372 f+i_coord_offset,fshift+i_shift_offset);
3374 /* Increment number of inner iterations */
3375 inneriter += j_index_end - j_index_start;
3377 /* Outer loop uses 24 flops */
3380 /* Increment number of outer iterations */
3383 /* Update outer/inner flops */
3385 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);