2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 real * vdwioffsetptr3;
93 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
95 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
97 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
99 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
101 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
115 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
118 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
119 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
121 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
122 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
124 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
125 real rswitch_scalar,d_scalar;
126 __m256d dummy_mask,cutoff_mask;
127 __m128 tmpmask0,tmpmask1;
128 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
129 __m256d one = _mm256_set1_pd(1.0);
130 __m256d two = _mm256_set1_pd(2.0);
136 jindex = nlist->jindex;
138 shiftidx = nlist->shift;
140 shiftvec = fr->shift_vec[0];
141 fshift = fr->fshift[0];
142 facel = _mm256_set1_pd(fr->epsfac);
143 charge = mdatoms->chargeA;
144 nvdwtype = fr->ntype;
146 vdwtype = mdatoms->typeA;
148 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
149 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
150 beta2 = _mm256_mul_pd(beta,beta);
151 beta3 = _mm256_mul_pd(beta,beta2);
153 ewtab = fr->ic->tabq_coul_FDV0;
154 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
155 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
157 /* Setup water-specific parameters */
158 inr = nlist->iinr[0];
159 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
160 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
161 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
162 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
164 jq1 = _mm256_set1_pd(charge[inr+1]);
165 jq2 = _mm256_set1_pd(charge[inr+2]);
166 jq3 = _mm256_set1_pd(charge[inr+3]);
167 vdwjidx0A = 2*vdwtype[inr+0];
168 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
169 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
170 qq11 = _mm256_mul_pd(iq1,jq1);
171 qq12 = _mm256_mul_pd(iq1,jq2);
172 qq13 = _mm256_mul_pd(iq1,jq3);
173 qq21 = _mm256_mul_pd(iq2,jq1);
174 qq22 = _mm256_mul_pd(iq2,jq2);
175 qq23 = _mm256_mul_pd(iq2,jq3);
176 qq31 = _mm256_mul_pd(iq3,jq1);
177 qq32 = _mm256_mul_pd(iq3,jq2);
178 qq33 = _mm256_mul_pd(iq3,jq3);
180 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
181 rcutoff_scalar = fr->rcoulomb;
182 rcutoff = _mm256_set1_pd(rcutoff_scalar);
183 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
185 rswitch_scalar = fr->rcoulomb_switch;
186 rswitch = _mm256_set1_pd(rswitch_scalar);
187 /* Setup switch parameters */
188 d_scalar = rcutoff_scalar-rswitch_scalar;
189 d = _mm256_set1_pd(d_scalar);
190 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
191 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
192 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
193 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
194 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
195 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
197 /* Avoid stupid compiler warnings */
198 jnrA = jnrB = jnrC = jnrD = 0;
207 for(iidx=0;iidx<4*DIM;iidx++)
212 /* Start outer loop over neighborlists */
213 for(iidx=0; iidx<nri; iidx++)
215 /* Load shift vector for this list */
216 i_shift_offset = DIM*shiftidx[iidx];
218 /* Load limits for loop over neighbors */
219 j_index_start = jindex[iidx];
220 j_index_end = jindex[iidx+1];
222 /* Get outer coordinate index */
224 i_coord_offset = DIM*inr;
226 /* Load i particle coords and add shift vector */
227 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
228 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
230 fix0 = _mm256_setzero_pd();
231 fiy0 = _mm256_setzero_pd();
232 fiz0 = _mm256_setzero_pd();
233 fix1 = _mm256_setzero_pd();
234 fiy1 = _mm256_setzero_pd();
235 fiz1 = _mm256_setzero_pd();
236 fix2 = _mm256_setzero_pd();
237 fiy2 = _mm256_setzero_pd();
238 fiz2 = _mm256_setzero_pd();
239 fix3 = _mm256_setzero_pd();
240 fiy3 = _mm256_setzero_pd();
241 fiz3 = _mm256_setzero_pd();
243 /* Reset potential sums */
244 velecsum = _mm256_setzero_pd();
245 vvdwsum = _mm256_setzero_pd();
247 /* Start inner kernel loop */
248 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
251 /* Get j neighbor index, and coordinate index */
256 j_coord_offsetA = DIM*jnrA;
257 j_coord_offsetB = DIM*jnrB;
258 j_coord_offsetC = DIM*jnrC;
259 j_coord_offsetD = DIM*jnrD;
261 /* load j atom coordinates */
262 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
263 x+j_coord_offsetC,x+j_coord_offsetD,
264 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
265 &jy2,&jz2,&jx3,&jy3,&jz3);
267 /* Calculate displacement vector */
268 dx00 = _mm256_sub_pd(ix0,jx0);
269 dy00 = _mm256_sub_pd(iy0,jy0);
270 dz00 = _mm256_sub_pd(iz0,jz0);
271 dx11 = _mm256_sub_pd(ix1,jx1);
272 dy11 = _mm256_sub_pd(iy1,jy1);
273 dz11 = _mm256_sub_pd(iz1,jz1);
274 dx12 = _mm256_sub_pd(ix1,jx2);
275 dy12 = _mm256_sub_pd(iy1,jy2);
276 dz12 = _mm256_sub_pd(iz1,jz2);
277 dx13 = _mm256_sub_pd(ix1,jx3);
278 dy13 = _mm256_sub_pd(iy1,jy3);
279 dz13 = _mm256_sub_pd(iz1,jz3);
280 dx21 = _mm256_sub_pd(ix2,jx1);
281 dy21 = _mm256_sub_pd(iy2,jy1);
282 dz21 = _mm256_sub_pd(iz2,jz1);
283 dx22 = _mm256_sub_pd(ix2,jx2);
284 dy22 = _mm256_sub_pd(iy2,jy2);
285 dz22 = _mm256_sub_pd(iz2,jz2);
286 dx23 = _mm256_sub_pd(ix2,jx3);
287 dy23 = _mm256_sub_pd(iy2,jy3);
288 dz23 = _mm256_sub_pd(iz2,jz3);
289 dx31 = _mm256_sub_pd(ix3,jx1);
290 dy31 = _mm256_sub_pd(iy3,jy1);
291 dz31 = _mm256_sub_pd(iz3,jz1);
292 dx32 = _mm256_sub_pd(ix3,jx2);
293 dy32 = _mm256_sub_pd(iy3,jy2);
294 dz32 = _mm256_sub_pd(iz3,jz2);
295 dx33 = _mm256_sub_pd(ix3,jx3);
296 dy33 = _mm256_sub_pd(iy3,jy3);
297 dz33 = _mm256_sub_pd(iz3,jz3);
299 /* Calculate squared distance and things based on it */
300 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
301 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
302 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
303 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
304 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
305 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
306 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
307 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
308 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
309 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
311 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
312 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
313 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
314 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
315 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
316 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
317 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
318 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
319 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
320 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
322 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
323 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
324 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
325 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
326 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
327 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
328 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
329 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
330 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
331 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
333 fjx0 = _mm256_setzero_pd();
334 fjy0 = _mm256_setzero_pd();
335 fjz0 = _mm256_setzero_pd();
336 fjx1 = _mm256_setzero_pd();
337 fjy1 = _mm256_setzero_pd();
338 fjz1 = _mm256_setzero_pd();
339 fjx2 = _mm256_setzero_pd();
340 fjy2 = _mm256_setzero_pd();
341 fjz2 = _mm256_setzero_pd();
342 fjx3 = _mm256_setzero_pd();
343 fjy3 = _mm256_setzero_pd();
344 fjz3 = _mm256_setzero_pd();
346 /**************************
347 * CALCULATE INTERACTIONS *
348 **************************/
350 if (gmx_mm256_any_lt(rsq00,rcutoff2))
353 r00 = _mm256_mul_pd(rsq00,rinv00);
355 /* LENNARD-JONES DISPERSION/REPULSION */
357 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
358 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
359 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
360 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
361 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
363 d = _mm256_sub_pd(r00,rswitch);
364 d = _mm256_max_pd(d,_mm256_setzero_pd());
365 d2 = _mm256_mul_pd(d,d);
366 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
368 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
370 /* Evaluate switch function */
371 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
372 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
373 vvdw = _mm256_mul_pd(vvdw,sw);
374 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
378 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
382 fscal = _mm256_and_pd(fscal,cutoff_mask);
384 /* Calculate temporary vectorial force */
385 tx = _mm256_mul_pd(fscal,dx00);
386 ty = _mm256_mul_pd(fscal,dy00);
387 tz = _mm256_mul_pd(fscal,dz00);
389 /* Update vectorial force */
390 fix0 = _mm256_add_pd(fix0,tx);
391 fiy0 = _mm256_add_pd(fiy0,ty);
392 fiz0 = _mm256_add_pd(fiz0,tz);
394 fjx0 = _mm256_add_pd(fjx0,tx);
395 fjy0 = _mm256_add_pd(fjy0,ty);
396 fjz0 = _mm256_add_pd(fjz0,tz);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 if (gmx_mm256_any_lt(rsq11,rcutoff2))
407 r11 = _mm256_mul_pd(rsq11,rinv11);
409 /* EWALD ELECTROSTATICS */
411 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
412 ewrt = _mm256_mul_pd(r11,ewtabscale);
413 ewitab = _mm256_cvttpd_epi32(ewrt);
414 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
415 ewitab = _mm_slli_epi32(ewitab,2);
416 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
417 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
418 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
419 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
420 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
421 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
422 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
423 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
424 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
426 d = _mm256_sub_pd(r11,rswitch);
427 d = _mm256_max_pd(d,_mm256_setzero_pd());
428 d2 = _mm256_mul_pd(d,d);
429 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
431 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
433 /* Evaluate switch function */
434 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
435 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
436 velec = _mm256_mul_pd(velec,sw);
437 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
439 /* Update potential sum for this i atom from the interaction with this j atom. */
440 velec = _mm256_and_pd(velec,cutoff_mask);
441 velecsum = _mm256_add_pd(velecsum,velec);
445 fscal = _mm256_and_pd(fscal,cutoff_mask);
447 /* Calculate temporary vectorial force */
448 tx = _mm256_mul_pd(fscal,dx11);
449 ty = _mm256_mul_pd(fscal,dy11);
450 tz = _mm256_mul_pd(fscal,dz11);
452 /* Update vectorial force */
453 fix1 = _mm256_add_pd(fix1,tx);
454 fiy1 = _mm256_add_pd(fiy1,ty);
455 fiz1 = _mm256_add_pd(fiz1,tz);
457 fjx1 = _mm256_add_pd(fjx1,tx);
458 fjy1 = _mm256_add_pd(fjy1,ty);
459 fjz1 = _mm256_add_pd(fjz1,tz);
463 /**************************
464 * CALCULATE INTERACTIONS *
465 **************************/
467 if (gmx_mm256_any_lt(rsq12,rcutoff2))
470 r12 = _mm256_mul_pd(rsq12,rinv12);
472 /* EWALD ELECTROSTATICS */
474 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
475 ewrt = _mm256_mul_pd(r12,ewtabscale);
476 ewitab = _mm256_cvttpd_epi32(ewrt);
477 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
478 ewitab = _mm_slli_epi32(ewitab,2);
479 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
480 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
481 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
482 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
483 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
484 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
485 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
486 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
487 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
489 d = _mm256_sub_pd(r12,rswitch);
490 d = _mm256_max_pd(d,_mm256_setzero_pd());
491 d2 = _mm256_mul_pd(d,d);
492 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
494 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
496 /* Evaluate switch function */
497 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
498 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
499 velec = _mm256_mul_pd(velec,sw);
500 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
502 /* Update potential sum for this i atom from the interaction with this j atom. */
503 velec = _mm256_and_pd(velec,cutoff_mask);
504 velecsum = _mm256_add_pd(velecsum,velec);
508 fscal = _mm256_and_pd(fscal,cutoff_mask);
510 /* Calculate temporary vectorial force */
511 tx = _mm256_mul_pd(fscal,dx12);
512 ty = _mm256_mul_pd(fscal,dy12);
513 tz = _mm256_mul_pd(fscal,dz12);
515 /* Update vectorial force */
516 fix1 = _mm256_add_pd(fix1,tx);
517 fiy1 = _mm256_add_pd(fiy1,ty);
518 fiz1 = _mm256_add_pd(fiz1,tz);
520 fjx2 = _mm256_add_pd(fjx2,tx);
521 fjy2 = _mm256_add_pd(fjy2,ty);
522 fjz2 = _mm256_add_pd(fjz2,tz);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 if (gmx_mm256_any_lt(rsq13,rcutoff2))
533 r13 = _mm256_mul_pd(rsq13,rinv13);
535 /* EWALD ELECTROSTATICS */
537 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
538 ewrt = _mm256_mul_pd(r13,ewtabscale);
539 ewitab = _mm256_cvttpd_epi32(ewrt);
540 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
541 ewitab = _mm_slli_epi32(ewitab,2);
542 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
543 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
544 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
545 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
546 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
547 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
548 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
549 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
550 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
552 d = _mm256_sub_pd(r13,rswitch);
553 d = _mm256_max_pd(d,_mm256_setzero_pd());
554 d2 = _mm256_mul_pd(d,d);
555 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
557 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
559 /* Evaluate switch function */
560 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
561 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
562 velec = _mm256_mul_pd(velec,sw);
563 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
565 /* Update potential sum for this i atom from the interaction with this j atom. */
566 velec = _mm256_and_pd(velec,cutoff_mask);
567 velecsum = _mm256_add_pd(velecsum,velec);
571 fscal = _mm256_and_pd(fscal,cutoff_mask);
573 /* Calculate temporary vectorial force */
574 tx = _mm256_mul_pd(fscal,dx13);
575 ty = _mm256_mul_pd(fscal,dy13);
576 tz = _mm256_mul_pd(fscal,dz13);
578 /* Update vectorial force */
579 fix1 = _mm256_add_pd(fix1,tx);
580 fiy1 = _mm256_add_pd(fiy1,ty);
581 fiz1 = _mm256_add_pd(fiz1,tz);
583 fjx3 = _mm256_add_pd(fjx3,tx);
584 fjy3 = _mm256_add_pd(fjy3,ty);
585 fjz3 = _mm256_add_pd(fjz3,tz);
589 /**************************
590 * CALCULATE INTERACTIONS *
591 **************************/
593 if (gmx_mm256_any_lt(rsq21,rcutoff2))
596 r21 = _mm256_mul_pd(rsq21,rinv21);
598 /* EWALD ELECTROSTATICS */
600 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
601 ewrt = _mm256_mul_pd(r21,ewtabscale);
602 ewitab = _mm256_cvttpd_epi32(ewrt);
603 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
604 ewitab = _mm_slli_epi32(ewitab,2);
605 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
606 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
607 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
608 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
609 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
610 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
611 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
612 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
613 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
615 d = _mm256_sub_pd(r21,rswitch);
616 d = _mm256_max_pd(d,_mm256_setzero_pd());
617 d2 = _mm256_mul_pd(d,d);
618 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
620 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
622 /* Evaluate switch function */
623 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
624 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
625 velec = _mm256_mul_pd(velec,sw);
626 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
628 /* Update potential sum for this i atom from the interaction with this j atom. */
629 velec = _mm256_and_pd(velec,cutoff_mask);
630 velecsum = _mm256_add_pd(velecsum,velec);
634 fscal = _mm256_and_pd(fscal,cutoff_mask);
636 /* Calculate temporary vectorial force */
637 tx = _mm256_mul_pd(fscal,dx21);
638 ty = _mm256_mul_pd(fscal,dy21);
639 tz = _mm256_mul_pd(fscal,dz21);
641 /* Update vectorial force */
642 fix2 = _mm256_add_pd(fix2,tx);
643 fiy2 = _mm256_add_pd(fiy2,ty);
644 fiz2 = _mm256_add_pd(fiz2,tz);
646 fjx1 = _mm256_add_pd(fjx1,tx);
647 fjy1 = _mm256_add_pd(fjy1,ty);
648 fjz1 = _mm256_add_pd(fjz1,tz);
652 /**************************
653 * CALCULATE INTERACTIONS *
654 **************************/
656 if (gmx_mm256_any_lt(rsq22,rcutoff2))
659 r22 = _mm256_mul_pd(rsq22,rinv22);
661 /* EWALD ELECTROSTATICS */
663 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
664 ewrt = _mm256_mul_pd(r22,ewtabscale);
665 ewitab = _mm256_cvttpd_epi32(ewrt);
666 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
667 ewitab = _mm_slli_epi32(ewitab,2);
668 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
669 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
670 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
671 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
672 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
673 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
674 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
675 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
676 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
678 d = _mm256_sub_pd(r22,rswitch);
679 d = _mm256_max_pd(d,_mm256_setzero_pd());
680 d2 = _mm256_mul_pd(d,d);
681 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
683 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
685 /* Evaluate switch function */
686 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
687 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
688 velec = _mm256_mul_pd(velec,sw);
689 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
691 /* Update potential sum for this i atom from the interaction with this j atom. */
692 velec = _mm256_and_pd(velec,cutoff_mask);
693 velecsum = _mm256_add_pd(velecsum,velec);
697 fscal = _mm256_and_pd(fscal,cutoff_mask);
699 /* Calculate temporary vectorial force */
700 tx = _mm256_mul_pd(fscal,dx22);
701 ty = _mm256_mul_pd(fscal,dy22);
702 tz = _mm256_mul_pd(fscal,dz22);
704 /* Update vectorial force */
705 fix2 = _mm256_add_pd(fix2,tx);
706 fiy2 = _mm256_add_pd(fiy2,ty);
707 fiz2 = _mm256_add_pd(fiz2,tz);
709 fjx2 = _mm256_add_pd(fjx2,tx);
710 fjy2 = _mm256_add_pd(fjy2,ty);
711 fjz2 = _mm256_add_pd(fjz2,tz);
715 /**************************
716 * CALCULATE INTERACTIONS *
717 **************************/
719 if (gmx_mm256_any_lt(rsq23,rcutoff2))
722 r23 = _mm256_mul_pd(rsq23,rinv23);
724 /* EWALD ELECTROSTATICS */
726 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
727 ewrt = _mm256_mul_pd(r23,ewtabscale);
728 ewitab = _mm256_cvttpd_epi32(ewrt);
729 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
730 ewitab = _mm_slli_epi32(ewitab,2);
731 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
732 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
733 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
734 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
735 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
736 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
737 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
738 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
739 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
741 d = _mm256_sub_pd(r23,rswitch);
742 d = _mm256_max_pd(d,_mm256_setzero_pd());
743 d2 = _mm256_mul_pd(d,d);
744 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
746 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
748 /* Evaluate switch function */
749 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
750 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
751 velec = _mm256_mul_pd(velec,sw);
752 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
754 /* Update potential sum for this i atom from the interaction with this j atom. */
755 velec = _mm256_and_pd(velec,cutoff_mask);
756 velecsum = _mm256_add_pd(velecsum,velec);
760 fscal = _mm256_and_pd(fscal,cutoff_mask);
762 /* Calculate temporary vectorial force */
763 tx = _mm256_mul_pd(fscal,dx23);
764 ty = _mm256_mul_pd(fscal,dy23);
765 tz = _mm256_mul_pd(fscal,dz23);
767 /* Update vectorial force */
768 fix2 = _mm256_add_pd(fix2,tx);
769 fiy2 = _mm256_add_pd(fiy2,ty);
770 fiz2 = _mm256_add_pd(fiz2,tz);
772 fjx3 = _mm256_add_pd(fjx3,tx);
773 fjy3 = _mm256_add_pd(fjy3,ty);
774 fjz3 = _mm256_add_pd(fjz3,tz);
778 /**************************
779 * CALCULATE INTERACTIONS *
780 **************************/
782 if (gmx_mm256_any_lt(rsq31,rcutoff2))
785 r31 = _mm256_mul_pd(rsq31,rinv31);
787 /* EWALD ELECTROSTATICS */
789 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
790 ewrt = _mm256_mul_pd(r31,ewtabscale);
791 ewitab = _mm256_cvttpd_epi32(ewrt);
792 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
793 ewitab = _mm_slli_epi32(ewitab,2);
794 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
795 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
796 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
797 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
798 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
799 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
800 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
801 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
802 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
804 d = _mm256_sub_pd(r31,rswitch);
805 d = _mm256_max_pd(d,_mm256_setzero_pd());
806 d2 = _mm256_mul_pd(d,d);
807 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
809 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
811 /* Evaluate switch function */
812 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
813 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
814 velec = _mm256_mul_pd(velec,sw);
815 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
817 /* Update potential sum for this i atom from the interaction with this j atom. */
818 velec = _mm256_and_pd(velec,cutoff_mask);
819 velecsum = _mm256_add_pd(velecsum,velec);
823 fscal = _mm256_and_pd(fscal,cutoff_mask);
825 /* Calculate temporary vectorial force */
826 tx = _mm256_mul_pd(fscal,dx31);
827 ty = _mm256_mul_pd(fscal,dy31);
828 tz = _mm256_mul_pd(fscal,dz31);
830 /* Update vectorial force */
831 fix3 = _mm256_add_pd(fix3,tx);
832 fiy3 = _mm256_add_pd(fiy3,ty);
833 fiz3 = _mm256_add_pd(fiz3,tz);
835 fjx1 = _mm256_add_pd(fjx1,tx);
836 fjy1 = _mm256_add_pd(fjy1,ty);
837 fjz1 = _mm256_add_pd(fjz1,tz);
841 /**************************
842 * CALCULATE INTERACTIONS *
843 **************************/
845 if (gmx_mm256_any_lt(rsq32,rcutoff2))
848 r32 = _mm256_mul_pd(rsq32,rinv32);
850 /* EWALD ELECTROSTATICS */
852 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
853 ewrt = _mm256_mul_pd(r32,ewtabscale);
854 ewitab = _mm256_cvttpd_epi32(ewrt);
855 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
856 ewitab = _mm_slli_epi32(ewitab,2);
857 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
858 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
859 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
860 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
861 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
862 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
863 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
864 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
865 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
867 d = _mm256_sub_pd(r32,rswitch);
868 d = _mm256_max_pd(d,_mm256_setzero_pd());
869 d2 = _mm256_mul_pd(d,d);
870 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
872 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
874 /* Evaluate switch function */
875 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
876 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
877 velec = _mm256_mul_pd(velec,sw);
878 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
880 /* Update potential sum for this i atom from the interaction with this j atom. */
881 velec = _mm256_and_pd(velec,cutoff_mask);
882 velecsum = _mm256_add_pd(velecsum,velec);
886 fscal = _mm256_and_pd(fscal,cutoff_mask);
888 /* Calculate temporary vectorial force */
889 tx = _mm256_mul_pd(fscal,dx32);
890 ty = _mm256_mul_pd(fscal,dy32);
891 tz = _mm256_mul_pd(fscal,dz32);
893 /* Update vectorial force */
894 fix3 = _mm256_add_pd(fix3,tx);
895 fiy3 = _mm256_add_pd(fiy3,ty);
896 fiz3 = _mm256_add_pd(fiz3,tz);
898 fjx2 = _mm256_add_pd(fjx2,tx);
899 fjy2 = _mm256_add_pd(fjy2,ty);
900 fjz2 = _mm256_add_pd(fjz2,tz);
904 /**************************
905 * CALCULATE INTERACTIONS *
906 **************************/
908 if (gmx_mm256_any_lt(rsq33,rcutoff2))
911 r33 = _mm256_mul_pd(rsq33,rinv33);
913 /* EWALD ELECTROSTATICS */
915 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
916 ewrt = _mm256_mul_pd(r33,ewtabscale);
917 ewitab = _mm256_cvttpd_epi32(ewrt);
918 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
919 ewitab = _mm_slli_epi32(ewitab,2);
920 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
921 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
922 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
923 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
924 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
925 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
926 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
927 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
928 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
930 d = _mm256_sub_pd(r33,rswitch);
931 d = _mm256_max_pd(d,_mm256_setzero_pd());
932 d2 = _mm256_mul_pd(d,d);
933 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
935 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
937 /* Evaluate switch function */
938 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
939 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
940 velec = _mm256_mul_pd(velec,sw);
941 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
943 /* Update potential sum for this i atom from the interaction with this j atom. */
944 velec = _mm256_and_pd(velec,cutoff_mask);
945 velecsum = _mm256_add_pd(velecsum,velec);
949 fscal = _mm256_and_pd(fscal,cutoff_mask);
951 /* Calculate temporary vectorial force */
952 tx = _mm256_mul_pd(fscal,dx33);
953 ty = _mm256_mul_pd(fscal,dy33);
954 tz = _mm256_mul_pd(fscal,dz33);
956 /* Update vectorial force */
957 fix3 = _mm256_add_pd(fix3,tx);
958 fiy3 = _mm256_add_pd(fiy3,ty);
959 fiz3 = _mm256_add_pd(fiz3,tz);
961 fjx3 = _mm256_add_pd(fjx3,tx);
962 fjy3 = _mm256_add_pd(fjy3,ty);
963 fjz3 = _mm256_add_pd(fjz3,tz);
967 fjptrA = f+j_coord_offsetA;
968 fjptrB = f+j_coord_offsetB;
969 fjptrC = f+j_coord_offsetC;
970 fjptrD = f+j_coord_offsetD;
972 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
973 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
974 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
976 /* Inner loop uses 647 flops */
982 /* Get j neighbor index, and coordinate index */
983 jnrlistA = jjnr[jidx];
984 jnrlistB = jjnr[jidx+1];
985 jnrlistC = jjnr[jidx+2];
986 jnrlistD = jjnr[jidx+3];
987 /* Sign of each element will be negative for non-real atoms.
988 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
989 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
991 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
993 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
994 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
995 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
997 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
998 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
999 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1000 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1001 j_coord_offsetA = DIM*jnrA;
1002 j_coord_offsetB = DIM*jnrB;
1003 j_coord_offsetC = DIM*jnrC;
1004 j_coord_offsetD = DIM*jnrD;
1006 /* load j atom coordinates */
1007 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1008 x+j_coord_offsetC,x+j_coord_offsetD,
1009 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1010 &jy2,&jz2,&jx3,&jy3,&jz3);
1012 /* Calculate displacement vector */
1013 dx00 = _mm256_sub_pd(ix0,jx0);
1014 dy00 = _mm256_sub_pd(iy0,jy0);
1015 dz00 = _mm256_sub_pd(iz0,jz0);
1016 dx11 = _mm256_sub_pd(ix1,jx1);
1017 dy11 = _mm256_sub_pd(iy1,jy1);
1018 dz11 = _mm256_sub_pd(iz1,jz1);
1019 dx12 = _mm256_sub_pd(ix1,jx2);
1020 dy12 = _mm256_sub_pd(iy1,jy2);
1021 dz12 = _mm256_sub_pd(iz1,jz2);
1022 dx13 = _mm256_sub_pd(ix1,jx3);
1023 dy13 = _mm256_sub_pd(iy1,jy3);
1024 dz13 = _mm256_sub_pd(iz1,jz3);
1025 dx21 = _mm256_sub_pd(ix2,jx1);
1026 dy21 = _mm256_sub_pd(iy2,jy1);
1027 dz21 = _mm256_sub_pd(iz2,jz1);
1028 dx22 = _mm256_sub_pd(ix2,jx2);
1029 dy22 = _mm256_sub_pd(iy2,jy2);
1030 dz22 = _mm256_sub_pd(iz2,jz2);
1031 dx23 = _mm256_sub_pd(ix2,jx3);
1032 dy23 = _mm256_sub_pd(iy2,jy3);
1033 dz23 = _mm256_sub_pd(iz2,jz3);
1034 dx31 = _mm256_sub_pd(ix3,jx1);
1035 dy31 = _mm256_sub_pd(iy3,jy1);
1036 dz31 = _mm256_sub_pd(iz3,jz1);
1037 dx32 = _mm256_sub_pd(ix3,jx2);
1038 dy32 = _mm256_sub_pd(iy3,jy2);
1039 dz32 = _mm256_sub_pd(iz3,jz2);
1040 dx33 = _mm256_sub_pd(ix3,jx3);
1041 dy33 = _mm256_sub_pd(iy3,jy3);
1042 dz33 = _mm256_sub_pd(iz3,jz3);
1044 /* Calculate squared distance and things based on it */
1045 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1046 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1047 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1048 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1049 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1050 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1051 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1052 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1053 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1054 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1056 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1057 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1058 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1059 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1060 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1061 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1062 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1063 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1064 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1065 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1067 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1068 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1069 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1070 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1071 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1072 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1073 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1074 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1075 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1076 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1078 fjx0 = _mm256_setzero_pd();
1079 fjy0 = _mm256_setzero_pd();
1080 fjz0 = _mm256_setzero_pd();
1081 fjx1 = _mm256_setzero_pd();
1082 fjy1 = _mm256_setzero_pd();
1083 fjz1 = _mm256_setzero_pd();
1084 fjx2 = _mm256_setzero_pd();
1085 fjy2 = _mm256_setzero_pd();
1086 fjz2 = _mm256_setzero_pd();
1087 fjx3 = _mm256_setzero_pd();
1088 fjy3 = _mm256_setzero_pd();
1089 fjz3 = _mm256_setzero_pd();
1091 /**************************
1092 * CALCULATE INTERACTIONS *
1093 **************************/
1095 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1098 r00 = _mm256_mul_pd(rsq00,rinv00);
1099 r00 = _mm256_andnot_pd(dummy_mask,r00);
1101 /* LENNARD-JONES DISPERSION/REPULSION */
1103 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1104 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1105 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1106 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1107 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1109 d = _mm256_sub_pd(r00,rswitch);
1110 d = _mm256_max_pd(d,_mm256_setzero_pd());
1111 d2 = _mm256_mul_pd(d,d);
1112 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1114 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1116 /* Evaluate switch function */
1117 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1118 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1119 vvdw = _mm256_mul_pd(vvdw,sw);
1120 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1122 /* Update potential sum for this i atom from the interaction with this j atom. */
1123 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
1124 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
1125 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
1129 fscal = _mm256_and_pd(fscal,cutoff_mask);
1131 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1133 /* Calculate temporary vectorial force */
1134 tx = _mm256_mul_pd(fscal,dx00);
1135 ty = _mm256_mul_pd(fscal,dy00);
1136 tz = _mm256_mul_pd(fscal,dz00);
1138 /* Update vectorial force */
1139 fix0 = _mm256_add_pd(fix0,tx);
1140 fiy0 = _mm256_add_pd(fiy0,ty);
1141 fiz0 = _mm256_add_pd(fiz0,tz);
1143 fjx0 = _mm256_add_pd(fjx0,tx);
1144 fjy0 = _mm256_add_pd(fjy0,ty);
1145 fjz0 = _mm256_add_pd(fjz0,tz);
1149 /**************************
1150 * CALCULATE INTERACTIONS *
1151 **************************/
1153 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1156 r11 = _mm256_mul_pd(rsq11,rinv11);
1157 r11 = _mm256_andnot_pd(dummy_mask,r11);
1159 /* EWALD ELECTROSTATICS */
1161 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1162 ewrt = _mm256_mul_pd(r11,ewtabscale);
1163 ewitab = _mm256_cvttpd_epi32(ewrt);
1164 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1165 ewitab = _mm_slli_epi32(ewitab,2);
1166 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1167 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1168 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1169 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1170 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1171 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1172 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1173 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1174 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1176 d = _mm256_sub_pd(r11,rswitch);
1177 d = _mm256_max_pd(d,_mm256_setzero_pd());
1178 d2 = _mm256_mul_pd(d,d);
1179 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1181 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1183 /* Evaluate switch function */
1184 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1185 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1186 velec = _mm256_mul_pd(velec,sw);
1187 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1189 /* Update potential sum for this i atom from the interaction with this j atom. */
1190 velec = _mm256_and_pd(velec,cutoff_mask);
1191 velec = _mm256_andnot_pd(dummy_mask,velec);
1192 velecsum = _mm256_add_pd(velecsum,velec);
1196 fscal = _mm256_and_pd(fscal,cutoff_mask);
1198 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1200 /* Calculate temporary vectorial force */
1201 tx = _mm256_mul_pd(fscal,dx11);
1202 ty = _mm256_mul_pd(fscal,dy11);
1203 tz = _mm256_mul_pd(fscal,dz11);
1205 /* Update vectorial force */
1206 fix1 = _mm256_add_pd(fix1,tx);
1207 fiy1 = _mm256_add_pd(fiy1,ty);
1208 fiz1 = _mm256_add_pd(fiz1,tz);
1210 fjx1 = _mm256_add_pd(fjx1,tx);
1211 fjy1 = _mm256_add_pd(fjy1,ty);
1212 fjz1 = _mm256_add_pd(fjz1,tz);
1216 /**************************
1217 * CALCULATE INTERACTIONS *
1218 **************************/
1220 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1223 r12 = _mm256_mul_pd(rsq12,rinv12);
1224 r12 = _mm256_andnot_pd(dummy_mask,r12);
1226 /* EWALD ELECTROSTATICS */
1228 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1229 ewrt = _mm256_mul_pd(r12,ewtabscale);
1230 ewitab = _mm256_cvttpd_epi32(ewrt);
1231 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1232 ewitab = _mm_slli_epi32(ewitab,2);
1233 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1234 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1235 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1236 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1237 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1238 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1239 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1240 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1241 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1243 d = _mm256_sub_pd(r12,rswitch);
1244 d = _mm256_max_pd(d,_mm256_setzero_pd());
1245 d2 = _mm256_mul_pd(d,d);
1246 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1248 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1250 /* Evaluate switch function */
1251 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1252 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1253 velec = _mm256_mul_pd(velec,sw);
1254 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1256 /* Update potential sum for this i atom from the interaction with this j atom. */
1257 velec = _mm256_and_pd(velec,cutoff_mask);
1258 velec = _mm256_andnot_pd(dummy_mask,velec);
1259 velecsum = _mm256_add_pd(velecsum,velec);
1263 fscal = _mm256_and_pd(fscal,cutoff_mask);
1265 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1267 /* Calculate temporary vectorial force */
1268 tx = _mm256_mul_pd(fscal,dx12);
1269 ty = _mm256_mul_pd(fscal,dy12);
1270 tz = _mm256_mul_pd(fscal,dz12);
1272 /* Update vectorial force */
1273 fix1 = _mm256_add_pd(fix1,tx);
1274 fiy1 = _mm256_add_pd(fiy1,ty);
1275 fiz1 = _mm256_add_pd(fiz1,tz);
1277 fjx2 = _mm256_add_pd(fjx2,tx);
1278 fjy2 = _mm256_add_pd(fjy2,ty);
1279 fjz2 = _mm256_add_pd(fjz2,tz);
1283 /**************************
1284 * CALCULATE INTERACTIONS *
1285 **************************/
1287 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1290 r13 = _mm256_mul_pd(rsq13,rinv13);
1291 r13 = _mm256_andnot_pd(dummy_mask,r13);
1293 /* EWALD ELECTROSTATICS */
1295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1296 ewrt = _mm256_mul_pd(r13,ewtabscale);
1297 ewitab = _mm256_cvttpd_epi32(ewrt);
1298 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1299 ewitab = _mm_slli_epi32(ewitab,2);
1300 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1301 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1302 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1303 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1304 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1305 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1306 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1307 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1308 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1310 d = _mm256_sub_pd(r13,rswitch);
1311 d = _mm256_max_pd(d,_mm256_setzero_pd());
1312 d2 = _mm256_mul_pd(d,d);
1313 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1315 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1317 /* Evaluate switch function */
1318 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1319 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
1320 velec = _mm256_mul_pd(velec,sw);
1321 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1323 /* Update potential sum for this i atom from the interaction with this j atom. */
1324 velec = _mm256_and_pd(velec,cutoff_mask);
1325 velec = _mm256_andnot_pd(dummy_mask,velec);
1326 velecsum = _mm256_add_pd(velecsum,velec);
1330 fscal = _mm256_and_pd(fscal,cutoff_mask);
1332 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1334 /* Calculate temporary vectorial force */
1335 tx = _mm256_mul_pd(fscal,dx13);
1336 ty = _mm256_mul_pd(fscal,dy13);
1337 tz = _mm256_mul_pd(fscal,dz13);
1339 /* Update vectorial force */
1340 fix1 = _mm256_add_pd(fix1,tx);
1341 fiy1 = _mm256_add_pd(fiy1,ty);
1342 fiz1 = _mm256_add_pd(fiz1,tz);
1344 fjx3 = _mm256_add_pd(fjx3,tx);
1345 fjy3 = _mm256_add_pd(fjy3,ty);
1346 fjz3 = _mm256_add_pd(fjz3,tz);
1350 /**************************
1351 * CALCULATE INTERACTIONS *
1352 **************************/
1354 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1357 r21 = _mm256_mul_pd(rsq21,rinv21);
1358 r21 = _mm256_andnot_pd(dummy_mask,r21);
1360 /* EWALD ELECTROSTATICS */
1362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1363 ewrt = _mm256_mul_pd(r21,ewtabscale);
1364 ewitab = _mm256_cvttpd_epi32(ewrt);
1365 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1366 ewitab = _mm_slli_epi32(ewitab,2);
1367 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1368 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1369 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1370 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1371 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1372 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1373 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1374 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1375 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1377 d = _mm256_sub_pd(r21,rswitch);
1378 d = _mm256_max_pd(d,_mm256_setzero_pd());
1379 d2 = _mm256_mul_pd(d,d);
1380 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1382 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1384 /* Evaluate switch function */
1385 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1386 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1387 velec = _mm256_mul_pd(velec,sw);
1388 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1390 /* Update potential sum for this i atom from the interaction with this j atom. */
1391 velec = _mm256_and_pd(velec,cutoff_mask);
1392 velec = _mm256_andnot_pd(dummy_mask,velec);
1393 velecsum = _mm256_add_pd(velecsum,velec);
1397 fscal = _mm256_and_pd(fscal,cutoff_mask);
1399 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1401 /* Calculate temporary vectorial force */
1402 tx = _mm256_mul_pd(fscal,dx21);
1403 ty = _mm256_mul_pd(fscal,dy21);
1404 tz = _mm256_mul_pd(fscal,dz21);
1406 /* Update vectorial force */
1407 fix2 = _mm256_add_pd(fix2,tx);
1408 fiy2 = _mm256_add_pd(fiy2,ty);
1409 fiz2 = _mm256_add_pd(fiz2,tz);
1411 fjx1 = _mm256_add_pd(fjx1,tx);
1412 fjy1 = _mm256_add_pd(fjy1,ty);
1413 fjz1 = _mm256_add_pd(fjz1,tz);
1417 /**************************
1418 * CALCULATE INTERACTIONS *
1419 **************************/
1421 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1424 r22 = _mm256_mul_pd(rsq22,rinv22);
1425 r22 = _mm256_andnot_pd(dummy_mask,r22);
1427 /* EWALD ELECTROSTATICS */
1429 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1430 ewrt = _mm256_mul_pd(r22,ewtabscale);
1431 ewitab = _mm256_cvttpd_epi32(ewrt);
1432 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1433 ewitab = _mm_slli_epi32(ewitab,2);
1434 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1435 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1436 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1437 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1438 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1439 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1440 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1441 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1442 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1444 d = _mm256_sub_pd(r22,rswitch);
1445 d = _mm256_max_pd(d,_mm256_setzero_pd());
1446 d2 = _mm256_mul_pd(d,d);
1447 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1449 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1451 /* Evaluate switch function */
1452 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1453 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1454 velec = _mm256_mul_pd(velec,sw);
1455 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1457 /* Update potential sum for this i atom from the interaction with this j atom. */
1458 velec = _mm256_and_pd(velec,cutoff_mask);
1459 velec = _mm256_andnot_pd(dummy_mask,velec);
1460 velecsum = _mm256_add_pd(velecsum,velec);
1464 fscal = _mm256_and_pd(fscal,cutoff_mask);
1466 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1468 /* Calculate temporary vectorial force */
1469 tx = _mm256_mul_pd(fscal,dx22);
1470 ty = _mm256_mul_pd(fscal,dy22);
1471 tz = _mm256_mul_pd(fscal,dz22);
1473 /* Update vectorial force */
1474 fix2 = _mm256_add_pd(fix2,tx);
1475 fiy2 = _mm256_add_pd(fiy2,ty);
1476 fiz2 = _mm256_add_pd(fiz2,tz);
1478 fjx2 = _mm256_add_pd(fjx2,tx);
1479 fjy2 = _mm256_add_pd(fjy2,ty);
1480 fjz2 = _mm256_add_pd(fjz2,tz);
1484 /**************************
1485 * CALCULATE INTERACTIONS *
1486 **************************/
1488 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1491 r23 = _mm256_mul_pd(rsq23,rinv23);
1492 r23 = _mm256_andnot_pd(dummy_mask,r23);
1494 /* EWALD ELECTROSTATICS */
1496 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1497 ewrt = _mm256_mul_pd(r23,ewtabscale);
1498 ewitab = _mm256_cvttpd_epi32(ewrt);
1499 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1500 ewitab = _mm_slli_epi32(ewitab,2);
1501 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1502 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1503 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1504 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1505 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1506 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1507 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1508 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1509 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1511 d = _mm256_sub_pd(r23,rswitch);
1512 d = _mm256_max_pd(d,_mm256_setzero_pd());
1513 d2 = _mm256_mul_pd(d,d);
1514 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1516 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1518 /* Evaluate switch function */
1519 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1520 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
1521 velec = _mm256_mul_pd(velec,sw);
1522 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1524 /* Update potential sum for this i atom from the interaction with this j atom. */
1525 velec = _mm256_and_pd(velec,cutoff_mask);
1526 velec = _mm256_andnot_pd(dummy_mask,velec);
1527 velecsum = _mm256_add_pd(velecsum,velec);
1531 fscal = _mm256_and_pd(fscal,cutoff_mask);
1533 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1535 /* Calculate temporary vectorial force */
1536 tx = _mm256_mul_pd(fscal,dx23);
1537 ty = _mm256_mul_pd(fscal,dy23);
1538 tz = _mm256_mul_pd(fscal,dz23);
1540 /* Update vectorial force */
1541 fix2 = _mm256_add_pd(fix2,tx);
1542 fiy2 = _mm256_add_pd(fiy2,ty);
1543 fiz2 = _mm256_add_pd(fiz2,tz);
1545 fjx3 = _mm256_add_pd(fjx3,tx);
1546 fjy3 = _mm256_add_pd(fjy3,ty);
1547 fjz3 = _mm256_add_pd(fjz3,tz);
1551 /**************************
1552 * CALCULATE INTERACTIONS *
1553 **************************/
1555 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1558 r31 = _mm256_mul_pd(rsq31,rinv31);
1559 r31 = _mm256_andnot_pd(dummy_mask,r31);
1561 /* EWALD ELECTROSTATICS */
1563 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1564 ewrt = _mm256_mul_pd(r31,ewtabscale);
1565 ewitab = _mm256_cvttpd_epi32(ewrt);
1566 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1567 ewitab = _mm_slli_epi32(ewitab,2);
1568 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1569 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1570 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1571 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1572 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1573 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1574 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1575 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1576 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1578 d = _mm256_sub_pd(r31,rswitch);
1579 d = _mm256_max_pd(d,_mm256_setzero_pd());
1580 d2 = _mm256_mul_pd(d,d);
1581 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1583 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1585 /* Evaluate switch function */
1586 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1587 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
1588 velec = _mm256_mul_pd(velec,sw);
1589 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1591 /* Update potential sum for this i atom from the interaction with this j atom. */
1592 velec = _mm256_and_pd(velec,cutoff_mask);
1593 velec = _mm256_andnot_pd(dummy_mask,velec);
1594 velecsum = _mm256_add_pd(velecsum,velec);
1598 fscal = _mm256_and_pd(fscal,cutoff_mask);
1600 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1602 /* Calculate temporary vectorial force */
1603 tx = _mm256_mul_pd(fscal,dx31);
1604 ty = _mm256_mul_pd(fscal,dy31);
1605 tz = _mm256_mul_pd(fscal,dz31);
1607 /* Update vectorial force */
1608 fix3 = _mm256_add_pd(fix3,tx);
1609 fiy3 = _mm256_add_pd(fiy3,ty);
1610 fiz3 = _mm256_add_pd(fiz3,tz);
1612 fjx1 = _mm256_add_pd(fjx1,tx);
1613 fjy1 = _mm256_add_pd(fjy1,ty);
1614 fjz1 = _mm256_add_pd(fjz1,tz);
1618 /**************************
1619 * CALCULATE INTERACTIONS *
1620 **************************/
1622 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1625 r32 = _mm256_mul_pd(rsq32,rinv32);
1626 r32 = _mm256_andnot_pd(dummy_mask,r32);
1628 /* EWALD ELECTROSTATICS */
1630 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1631 ewrt = _mm256_mul_pd(r32,ewtabscale);
1632 ewitab = _mm256_cvttpd_epi32(ewrt);
1633 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1634 ewitab = _mm_slli_epi32(ewitab,2);
1635 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1636 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1637 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1638 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1639 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1640 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1641 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1642 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1643 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1645 d = _mm256_sub_pd(r32,rswitch);
1646 d = _mm256_max_pd(d,_mm256_setzero_pd());
1647 d2 = _mm256_mul_pd(d,d);
1648 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1650 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1652 /* Evaluate switch function */
1653 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1654 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
1655 velec = _mm256_mul_pd(velec,sw);
1656 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1658 /* Update potential sum for this i atom from the interaction with this j atom. */
1659 velec = _mm256_and_pd(velec,cutoff_mask);
1660 velec = _mm256_andnot_pd(dummy_mask,velec);
1661 velecsum = _mm256_add_pd(velecsum,velec);
1665 fscal = _mm256_and_pd(fscal,cutoff_mask);
1667 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1669 /* Calculate temporary vectorial force */
1670 tx = _mm256_mul_pd(fscal,dx32);
1671 ty = _mm256_mul_pd(fscal,dy32);
1672 tz = _mm256_mul_pd(fscal,dz32);
1674 /* Update vectorial force */
1675 fix3 = _mm256_add_pd(fix3,tx);
1676 fiy3 = _mm256_add_pd(fiy3,ty);
1677 fiz3 = _mm256_add_pd(fiz3,tz);
1679 fjx2 = _mm256_add_pd(fjx2,tx);
1680 fjy2 = _mm256_add_pd(fjy2,ty);
1681 fjz2 = _mm256_add_pd(fjz2,tz);
1685 /**************************
1686 * CALCULATE INTERACTIONS *
1687 **************************/
1689 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1692 r33 = _mm256_mul_pd(rsq33,rinv33);
1693 r33 = _mm256_andnot_pd(dummy_mask,r33);
1695 /* EWALD ELECTROSTATICS */
1697 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1698 ewrt = _mm256_mul_pd(r33,ewtabscale);
1699 ewitab = _mm256_cvttpd_epi32(ewrt);
1700 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1701 ewitab = _mm_slli_epi32(ewitab,2);
1702 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1703 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1704 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1705 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1706 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1707 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1708 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1709 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1710 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1712 d = _mm256_sub_pd(r33,rswitch);
1713 d = _mm256_max_pd(d,_mm256_setzero_pd());
1714 d2 = _mm256_mul_pd(d,d);
1715 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1717 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1719 /* Evaluate switch function */
1720 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1721 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
1722 velec = _mm256_mul_pd(velec,sw);
1723 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1725 /* Update potential sum for this i atom from the interaction with this j atom. */
1726 velec = _mm256_and_pd(velec,cutoff_mask);
1727 velec = _mm256_andnot_pd(dummy_mask,velec);
1728 velecsum = _mm256_add_pd(velecsum,velec);
1732 fscal = _mm256_and_pd(fscal,cutoff_mask);
1734 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1736 /* Calculate temporary vectorial force */
1737 tx = _mm256_mul_pd(fscal,dx33);
1738 ty = _mm256_mul_pd(fscal,dy33);
1739 tz = _mm256_mul_pd(fscal,dz33);
1741 /* Update vectorial force */
1742 fix3 = _mm256_add_pd(fix3,tx);
1743 fiy3 = _mm256_add_pd(fiy3,ty);
1744 fiz3 = _mm256_add_pd(fiz3,tz);
1746 fjx3 = _mm256_add_pd(fjx3,tx);
1747 fjy3 = _mm256_add_pd(fjy3,ty);
1748 fjz3 = _mm256_add_pd(fjz3,tz);
1752 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1753 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1754 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1755 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1757 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1758 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1759 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1761 /* Inner loop uses 657 flops */
1764 /* End of innermost loop */
1766 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1767 f+i_coord_offset,fshift+i_shift_offset);
1770 /* Update potential energies */
1771 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1772 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1774 /* Increment number of inner iterations */
1775 inneriter += j_index_end - j_index_start;
1777 /* Outer loop uses 26 flops */
1780 /* Increment number of outer iterations */
1783 /* Update outer/inner flops */
1785 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1788 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
1789 * Electrostatics interaction: Ewald
1790 * VdW interaction: LennardJones
1791 * Geometry: Water4-Water4
1792 * Calculate force/pot: Force
1795 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_256_double
1796 (t_nblist * gmx_restrict nlist,
1797 rvec * gmx_restrict xx,
1798 rvec * gmx_restrict ff,
1799 t_forcerec * gmx_restrict fr,
1800 t_mdatoms * gmx_restrict mdatoms,
1801 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1802 t_nrnb * gmx_restrict nrnb)
1804 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1805 * just 0 for non-waters.
1806 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1807 * jnr indices corresponding to data put in the four positions in the SIMD register.
1809 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1810 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1811 int jnrA,jnrB,jnrC,jnrD;
1812 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1813 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1814 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1815 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1816 real rcutoff_scalar;
1817 real *shiftvec,*fshift,*x,*f;
1818 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1819 real scratch[4*DIM];
1820 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1821 real * vdwioffsetptr0;
1822 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1823 real * vdwioffsetptr1;
1824 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1825 real * vdwioffsetptr2;
1826 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1827 real * vdwioffsetptr3;
1828 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1829 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1830 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1831 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1832 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1833 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1834 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1835 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1836 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1837 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1838 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1839 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1840 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1841 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1842 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1843 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1844 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1845 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1846 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1847 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1850 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1853 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1854 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1856 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1857 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1859 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1860 real rswitch_scalar,d_scalar;
1861 __m256d dummy_mask,cutoff_mask;
1862 __m128 tmpmask0,tmpmask1;
1863 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1864 __m256d one = _mm256_set1_pd(1.0);
1865 __m256d two = _mm256_set1_pd(2.0);
1871 jindex = nlist->jindex;
1873 shiftidx = nlist->shift;
1875 shiftvec = fr->shift_vec[0];
1876 fshift = fr->fshift[0];
1877 facel = _mm256_set1_pd(fr->epsfac);
1878 charge = mdatoms->chargeA;
1879 nvdwtype = fr->ntype;
1880 vdwparam = fr->nbfp;
1881 vdwtype = mdatoms->typeA;
1883 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1884 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1885 beta2 = _mm256_mul_pd(beta,beta);
1886 beta3 = _mm256_mul_pd(beta,beta2);
1888 ewtab = fr->ic->tabq_coul_FDV0;
1889 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1890 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1892 /* Setup water-specific parameters */
1893 inr = nlist->iinr[0];
1894 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1895 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1896 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1897 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1899 jq1 = _mm256_set1_pd(charge[inr+1]);
1900 jq2 = _mm256_set1_pd(charge[inr+2]);
1901 jq3 = _mm256_set1_pd(charge[inr+3]);
1902 vdwjidx0A = 2*vdwtype[inr+0];
1903 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1904 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1905 qq11 = _mm256_mul_pd(iq1,jq1);
1906 qq12 = _mm256_mul_pd(iq1,jq2);
1907 qq13 = _mm256_mul_pd(iq1,jq3);
1908 qq21 = _mm256_mul_pd(iq2,jq1);
1909 qq22 = _mm256_mul_pd(iq2,jq2);
1910 qq23 = _mm256_mul_pd(iq2,jq3);
1911 qq31 = _mm256_mul_pd(iq3,jq1);
1912 qq32 = _mm256_mul_pd(iq3,jq2);
1913 qq33 = _mm256_mul_pd(iq3,jq3);
1915 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1916 rcutoff_scalar = fr->rcoulomb;
1917 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1918 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1920 rswitch_scalar = fr->rcoulomb_switch;
1921 rswitch = _mm256_set1_pd(rswitch_scalar);
1922 /* Setup switch parameters */
1923 d_scalar = rcutoff_scalar-rswitch_scalar;
1924 d = _mm256_set1_pd(d_scalar);
1925 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1926 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1927 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1928 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1929 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1930 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1932 /* Avoid stupid compiler warnings */
1933 jnrA = jnrB = jnrC = jnrD = 0;
1934 j_coord_offsetA = 0;
1935 j_coord_offsetB = 0;
1936 j_coord_offsetC = 0;
1937 j_coord_offsetD = 0;
1942 for(iidx=0;iidx<4*DIM;iidx++)
1944 scratch[iidx] = 0.0;
1947 /* Start outer loop over neighborlists */
1948 for(iidx=0; iidx<nri; iidx++)
1950 /* Load shift vector for this list */
1951 i_shift_offset = DIM*shiftidx[iidx];
1953 /* Load limits for loop over neighbors */
1954 j_index_start = jindex[iidx];
1955 j_index_end = jindex[iidx+1];
1957 /* Get outer coordinate index */
1959 i_coord_offset = DIM*inr;
1961 /* Load i particle coords and add shift vector */
1962 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1963 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1965 fix0 = _mm256_setzero_pd();
1966 fiy0 = _mm256_setzero_pd();
1967 fiz0 = _mm256_setzero_pd();
1968 fix1 = _mm256_setzero_pd();
1969 fiy1 = _mm256_setzero_pd();
1970 fiz1 = _mm256_setzero_pd();
1971 fix2 = _mm256_setzero_pd();
1972 fiy2 = _mm256_setzero_pd();
1973 fiz2 = _mm256_setzero_pd();
1974 fix3 = _mm256_setzero_pd();
1975 fiy3 = _mm256_setzero_pd();
1976 fiz3 = _mm256_setzero_pd();
1978 /* Start inner kernel loop */
1979 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1982 /* Get j neighbor index, and coordinate index */
1984 jnrB = jjnr[jidx+1];
1985 jnrC = jjnr[jidx+2];
1986 jnrD = jjnr[jidx+3];
1987 j_coord_offsetA = DIM*jnrA;
1988 j_coord_offsetB = DIM*jnrB;
1989 j_coord_offsetC = DIM*jnrC;
1990 j_coord_offsetD = DIM*jnrD;
1992 /* load j atom coordinates */
1993 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1994 x+j_coord_offsetC,x+j_coord_offsetD,
1995 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1996 &jy2,&jz2,&jx3,&jy3,&jz3);
1998 /* Calculate displacement vector */
1999 dx00 = _mm256_sub_pd(ix0,jx0);
2000 dy00 = _mm256_sub_pd(iy0,jy0);
2001 dz00 = _mm256_sub_pd(iz0,jz0);
2002 dx11 = _mm256_sub_pd(ix1,jx1);
2003 dy11 = _mm256_sub_pd(iy1,jy1);
2004 dz11 = _mm256_sub_pd(iz1,jz1);
2005 dx12 = _mm256_sub_pd(ix1,jx2);
2006 dy12 = _mm256_sub_pd(iy1,jy2);
2007 dz12 = _mm256_sub_pd(iz1,jz2);
2008 dx13 = _mm256_sub_pd(ix1,jx3);
2009 dy13 = _mm256_sub_pd(iy1,jy3);
2010 dz13 = _mm256_sub_pd(iz1,jz3);
2011 dx21 = _mm256_sub_pd(ix2,jx1);
2012 dy21 = _mm256_sub_pd(iy2,jy1);
2013 dz21 = _mm256_sub_pd(iz2,jz1);
2014 dx22 = _mm256_sub_pd(ix2,jx2);
2015 dy22 = _mm256_sub_pd(iy2,jy2);
2016 dz22 = _mm256_sub_pd(iz2,jz2);
2017 dx23 = _mm256_sub_pd(ix2,jx3);
2018 dy23 = _mm256_sub_pd(iy2,jy3);
2019 dz23 = _mm256_sub_pd(iz2,jz3);
2020 dx31 = _mm256_sub_pd(ix3,jx1);
2021 dy31 = _mm256_sub_pd(iy3,jy1);
2022 dz31 = _mm256_sub_pd(iz3,jz1);
2023 dx32 = _mm256_sub_pd(ix3,jx2);
2024 dy32 = _mm256_sub_pd(iy3,jy2);
2025 dz32 = _mm256_sub_pd(iz3,jz2);
2026 dx33 = _mm256_sub_pd(ix3,jx3);
2027 dy33 = _mm256_sub_pd(iy3,jy3);
2028 dz33 = _mm256_sub_pd(iz3,jz3);
2030 /* Calculate squared distance and things based on it */
2031 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2032 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2033 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2034 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2035 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2036 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2037 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2038 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2039 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2040 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2042 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2043 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2044 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2045 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2046 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2047 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2048 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2049 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2050 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2051 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2053 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2054 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2055 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2056 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2057 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2058 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2059 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2060 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2061 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2062 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2064 fjx0 = _mm256_setzero_pd();
2065 fjy0 = _mm256_setzero_pd();
2066 fjz0 = _mm256_setzero_pd();
2067 fjx1 = _mm256_setzero_pd();
2068 fjy1 = _mm256_setzero_pd();
2069 fjz1 = _mm256_setzero_pd();
2070 fjx2 = _mm256_setzero_pd();
2071 fjy2 = _mm256_setzero_pd();
2072 fjz2 = _mm256_setzero_pd();
2073 fjx3 = _mm256_setzero_pd();
2074 fjy3 = _mm256_setzero_pd();
2075 fjz3 = _mm256_setzero_pd();
2077 /**************************
2078 * CALCULATE INTERACTIONS *
2079 **************************/
2081 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2084 r00 = _mm256_mul_pd(rsq00,rinv00);
2086 /* LENNARD-JONES DISPERSION/REPULSION */
2088 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2089 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
2090 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2091 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2092 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2094 d = _mm256_sub_pd(r00,rswitch);
2095 d = _mm256_max_pd(d,_mm256_setzero_pd());
2096 d2 = _mm256_mul_pd(d,d);
2097 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2099 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2101 /* Evaluate switch function */
2102 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2103 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2104 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2108 fscal = _mm256_and_pd(fscal,cutoff_mask);
2110 /* Calculate temporary vectorial force */
2111 tx = _mm256_mul_pd(fscal,dx00);
2112 ty = _mm256_mul_pd(fscal,dy00);
2113 tz = _mm256_mul_pd(fscal,dz00);
2115 /* Update vectorial force */
2116 fix0 = _mm256_add_pd(fix0,tx);
2117 fiy0 = _mm256_add_pd(fiy0,ty);
2118 fiz0 = _mm256_add_pd(fiz0,tz);
2120 fjx0 = _mm256_add_pd(fjx0,tx);
2121 fjy0 = _mm256_add_pd(fjy0,ty);
2122 fjz0 = _mm256_add_pd(fjz0,tz);
2126 /**************************
2127 * CALCULATE INTERACTIONS *
2128 **************************/
2130 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2133 r11 = _mm256_mul_pd(rsq11,rinv11);
2135 /* EWALD ELECTROSTATICS */
2137 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2138 ewrt = _mm256_mul_pd(r11,ewtabscale);
2139 ewitab = _mm256_cvttpd_epi32(ewrt);
2140 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2141 ewitab = _mm_slli_epi32(ewitab,2);
2142 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2143 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2144 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2145 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2146 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2147 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2148 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2149 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2150 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2152 d = _mm256_sub_pd(r11,rswitch);
2153 d = _mm256_max_pd(d,_mm256_setzero_pd());
2154 d2 = _mm256_mul_pd(d,d);
2155 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2157 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2159 /* Evaluate switch function */
2160 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2161 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2162 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2166 fscal = _mm256_and_pd(fscal,cutoff_mask);
2168 /* Calculate temporary vectorial force */
2169 tx = _mm256_mul_pd(fscal,dx11);
2170 ty = _mm256_mul_pd(fscal,dy11);
2171 tz = _mm256_mul_pd(fscal,dz11);
2173 /* Update vectorial force */
2174 fix1 = _mm256_add_pd(fix1,tx);
2175 fiy1 = _mm256_add_pd(fiy1,ty);
2176 fiz1 = _mm256_add_pd(fiz1,tz);
2178 fjx1 = _mm256_add_pd(fjx1,tx);
2179 fjy1 = _mm256_add_pd(fjy1,ty);
2180 fjz1 = _mm256_add_pd(fjz1,tz);
2184 /**************************
2185 * CALCULATE INTERACTIONS *
2186 **************************/
2188 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2191 r12 = _mm256_mul_pd(rsq12,rinv12);
2193 /* EWALD ELECTROSTATICS */
2195 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2196 ewrt = _mm256_mul_pd(r12,ewtabscale);
2197 ewitab = _mm256_cvttpd_epi32(ewrt);
2198 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2199 ewitab = _mm_slli_epi32(ewitab,2);
2200 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2201 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2202 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2203 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2204 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2205 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2206 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2207 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2208 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2210 d = _mm256_sub_pd(r12,rswitch);
2211 d = _mm256_max_pd(d,_mm256_setzero_pd());
2212 d2 = _mm256_mul_pd(d,d);
2213 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2215 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2217 /* Evaluate switch function */
2218 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2219 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2220 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2224 fscal = _mm256_and_pd(fscal,cutoff_mask);
2226 /* Calculate temporary vectorial force */
2227 tx = _mm256_mul_pd(fscal,dx12);
2228 ty = _mm256_mul_pd(fscal,dy12);
2229 tz = _mm256_mul_pd(fscal,dz12);
2231 /* Update vectorial force */
2232 fix1 = _mm256_add_pd(fix1,tx);
2233 fiy1 = _mm256_add_pd(fiy1,ty);
2234 fiz1 = _mm256_add_pd(fiz1,tz);
2236 fjx2 = _mm256_add_pd(fjx2,tx);
2237 fjy2 = _mm256_add_pd(fjy2,ty);
2238 fjz2 = _mm256_add_pd(fjz2,tz);
2242 /**************************
2243 * CALCULATE INTERACTIONS *
2244 **************************/
2246 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2249 r13 = _mm256_mul_pd(rsq13,rinv13);
2251 /* EWALD ELECTROSTATICS */
2253 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2254 ewrt = _mm256_mul_pd(r13,ewtabscale);
2255 ewitab = _mm256_cvttpd_epi32(ewrt);
2256 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2257 ewitab = _mm_slli_epi32(ewitab,2);
2258 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2259 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2260 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2261 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2262 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2263 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2264 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2265 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2266 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2268 d = _mm256_sub_pd(r13,rswitch);
2269 d = _mm256_max_pd(d,_mm256_setzero_pd());
2270 d2 = _mm256_mul_pd(d,d);
2271 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2273 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2275 /* Evaluate switch function */
2276 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2277 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2278 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2282 fscal = _mm256_and_pd(fscal,cutoff_mask);
2284 /* Calculate temporary vectorial force */
2285 tx = _mm256_mul_pd(fscal,dx13);
2286 ty = _mm256_mul_pd(fscal,dy13);
2287 tz = _mm256_mul_pd(fscal,dz13);
2289 /* Update vectorial force */
2290 fix1 = _mm256_add_pd(fix1,tx);
2291 fiy1 = _mm256_add_pd(fiy1,ty);
2292 fiz1 = _mm256_add_pd(fiz1,tz);
2294 fjx3 = _mm256_add_pd(fjx3,tx);
2295 fjy3 = _mm256_add_pd(fjy3,ty);
2296 fjz3 = _mm256_add_pd(fjz3,tz);
2300 /**************************
2301 * CALCULATE INTERACTIONS *
2302 **************************/
2304 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2307 r21 = _mm256_mul_pd(rsq21,rinv21);
2309 /* EWALD ELECTROSTATICS */
2311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2312 ewrt = _mm256_mul_pd(r21,ewtabscale);
2313 ewitab = _mm256_cvttpd_epi32(ewrt);
2314 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2315 ewitab = _mm_slli_epi32(ewitab,2);
2316 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2317 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2318 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2319 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2320 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2321 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2322 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2323 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2324 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2326 d = _mm256_sub_pd(r21,rswitch);
2327 d = _mm256_max_pd(d,_mm256_setzero_pd());
2328 d2 = _mm256_mul_pd(d,d);
2329 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2331 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2333 /* Evaluate switch function */
2334 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2335 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2336 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2340 fscal = _mm256_and_pd(fscal,cutoff_mask);
2342 /* Calculate temporary vectorial force */
2343 tx = _mm256_mul_pd(fscal,dx21);
2344 ty = _mm256_mul_pd(fscal,dy21);
2345 tz = _mm256_mul_pd(fscal,dz21);
2347 /* Update vectorial force */
2348 fix2 = _mm256_add_pd(fix2,tx);
2349 fiy2 = _mm256_add_pd(fiy2,ty);
2350 fiz2 = _mm256_add_pd(fiz2,tz);
2352 fjx1 = _mm256_add_pd(fjx1,tx);
2353 fjy1 = _mm256_add_pd(fjy1,ty);
2354 fjz1 = _mm256_add_pd(fjz1,tz);
2358 /**************************
2359 * CALCULATE INTERACTIONS *
2360 **************************/
2362 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2365 r22 = _mm256_mul_pd(rsq22,rinv22);
2367 /* EWALD ELECTROSTATICS */
2369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2370 ewrt = _mm256_mul_pd(r22,ewtabscale);
2371 ewitab = _mm256_cvttpd_epi32(ewrt);
2372 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2373 ewitab = _mm_slli_epi32(ewitab,2);
2374 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2375 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2376 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2377 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2378 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2379 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2380 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2381 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2382 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2384 d = _mm256_sub_pd(r22,rswitch);
2385 d = _mm256_max_pd(d,_mm256_setzero_pd());
2386 d2 = _mm256_mul_pd(d,d);
2387 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2389 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2391 /* Evaluate switch function */
2392 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2393 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2394 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2398 fscal = _mm256_and_pd(fscal,cutoff_mask);
2400 /* Calculate temporary vectorial force */
2401 tx = _mm256_mul_pd(fscal,dx22);
2402 ty = _mm256_mul_pd(fscal,dy22);
2403 tz = _mm256_mul_pd(fscal,dz22);
2405 /* Update vectorial force */
2406 fix2 = _mm256_add_pd(fix2,tx);
2407 fiy2 = _mm256_add_pd(fiy2,ty);
2408 fiz2 = _mm256_add_pd(fiz2,tz);
2410 fjx2 = _mm256_add_pd(fjx2,tx);
2411 fjy2 = _mm256_add_pd(fjy2,ty);
2412 fjz2 = _mm256_add_pd(fjz2,tz);
2416 /**************************
2417 * CALCULATE INTERACTIONS *
2418 **************************/
2420 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2423 r23 = _mm256_mul_pd(rsq23,rinv23);
2425 /* EWALD ELECTROSTATICS */
2427 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2428 ewrt = _mm256_mul_pd(r23,ewtabscale);
2429 ewitab = _mm256_cvttpd_epi32(ewrt);
2430 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2431 ewitab = _mm_slli_epi32(ewitab,2);
2432 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2433 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2434 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2435 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2436 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2437 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2438 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2439 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2440 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2442 d = _mm256_sub_pd(r23,rswitch);
2443 d = _mm256_max_pd(d,_mm256_setzero_pd());
2444 d2 = _mm256_mul_pd(d,d);
2445 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2447 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2449 /* Evaluate switch function */
2450 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2451 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2452 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2456 fscal = _mm256_and_pd(fscal,cutoff_mask);
2458 /* Calculate temporary vectorial force */
2459 tx = _mm256_mul_pd(fscal,dx23);
2460 ty = _mm256_mul_pd(fscal,dy23);
2461 tz = _mm256_mul_pd(fscal,dz23);
2463 /* Update vectorial force */
2464 fix2 = _mm256_add_pd(fix2,tx);
2465 fiy2 = _mm256_add_pd(fiy2,ty);
2466 fiz2 = _mm256_add_pd(fiz2,tz);
2468 fjx3 = _mm256_add_pd(fjx3,tx);
2469 fjy3 = _mm256_add_pd(fjy3,ty);
2470 fjz3 = _mm256_add_pd(fjz3,tz);
2474 /**************************
2475 * CALCULATE INTERACTIONS *
2476 **************************/
2478 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2481 r31 = _mm256_mul_pd(rsq31,rinv31);
2483 /* EWALD ELECTROSTATICS */
2485 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2486 ewrt = _mm256_mul_pd(r31,ewtabscale);
2487 ewitab = _mm256_cvttpd_epi32(ewrt);
2488 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2489 ewitab = _mm_slli_epi32(ewitab,2);
2490 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2491 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2492 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2493 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2494 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2495 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2496 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2497 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2498 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2500 d = _mm256_sub_pd(r31,rswitch);
2501 d = _mm256_max_pd(d,_mm256_setzero_pd());
2502 d2 = _mm256_mul_pd(d,d);
2503 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2505 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2507 /* Evaluate switch function */
2508 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2509 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2510 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2514 fscal = _mm256_and_pd(fscal,cutoff_mask);
2516 /* Calculate temporary vectorial force */
2517 tx = _mm256_mul_pd(fscal,dx31);
2518 ty = _mm256_mul_pd(fscal,dy31);
2519 tz = _mm256_mul_pd(fscal,dz31);
2521 /* Update vectorial force */
2522 fix3 = _mm256_add_pd(fix3,tx);
2523 fiy3 = _mm256_add_pd(fiy3,ty);
2524 fiz3 = _mm256_add_pd(fiz3,tz);
2526 fjx1 = _mm256_add_pd(fjx1,tx);
2527 fjy1 = _mm256_add_pd(fjy1,ty);
2528 fjz1 = _mm256_add_pd(fjz1,tz);
2532 /**************************
2533 * CALCULATE INTERACTIONS *
2534 **************************/
2536 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2539 r32 = _mm256_mul_pd(rsq32,rinv32);
2541 /* EWALD ELECTROSTATICS */
2543 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2544 ewrt = _mm256_mul_pd(r32,ewtabscale);
2545 ewitab = _mm256_cvttpd_epi32(ewrt);
2546 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2547 ewitab = _mm_slli_epi32(ewitab,2);
2548 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2549 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2550 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2551 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2552 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2553 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2554 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2555 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2556 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2558 d = _mm256_sub_pd(r32,rswitch);
2559 d = _mm256_max_pd(d,_mm256_setzero_pd());
2560 d2 = _mm256_mul_pd(d,d);
2561 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2563 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2565 /* Evaluate switch function */
2566 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2567 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2568 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2572 fscal = _mm256_and_pd(fscal,cutoff_mask);
2574 /* Calculate temporary vectorial force */
2575 tx = _mm256_mul_pd(fscal,dx32);
2576 ty = _mm256_mul_pd(fscal,dy32);
2577 tz = _mm256_mul_pd(fscal,dz32);
2579 /* Update vectorial force */
2580 fix3 = _mm256_add_pd(fix3,tx);
2581 fiy3 = _mm256_add_pd(fiy3,ty);
2582 fiz3 = _mm256_add_pd(fiz3,tz);
2584 fjx2 = _mm256_add_pd(fjx2,tx);
2585 fjy2 = _mm256_add_pd(fjy2,ty);
2586 fjz2 = _mm256_add_pd(fjz2,tz);
2590 /**************************
2591 * CALCULATE INTERACTIONS *
2592 **************************/
2594 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2597 r33 = _mm256_mul_pd(rsq33,rinv33);
2599 /* EWALD ELECTROSTATICS */
2601 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2602 ewrt = _mm256_mul_pd(r33,ewtabscale);
2603 ewitab = _mm256_cvttpd_epi32(ewrt);
2604 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2605 ewitab = _mm_slli_epi32(ewitab,2);
2606 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2607 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2608 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2609 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2610 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2611 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2612 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2613 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
2614 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2616 d = _mm256_sub_pd(r33,rswitch);
2617 d = _mm256_max_pd(d,_mm256_setzero_pd());
2618 d2 = _mm256_mul_pd(d,d);
2619 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2621 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2623 /* Evaluate switch function */
2624 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2625 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
2626 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2630 fscal = _mm256_and_pd(fscal,cutoff_mask);
2632 /* Calculate temporary vectorial force */
2633 tx = _mm256_mul_pd(fscal,dx33);
2634 ty = _mm256_mul_pd(fscal,dy33);
2635 tz = _mm256_mul_pd(fscal,dz33);
2637 /* Update vectorial force */
2638 fix3 = _mm256_add_pd(fix3,tx);
2639 fiy3 = _mm256_add_pd(fiy3,ty);
2640 fiz3 = _mm256_add_pd(fiz3,tz);
2642 fjx3 = _mm256_add_pd(fjx3,tx);
2643 fjy3 = _mm256_add_pd(fjy3,ty);
2644 fjz3 = _mm256_add_pd(fjz3,tz);
2648 fjptrA = f+j_coord_offsetA;
2649 fjptrB = f+j_coord_offsetB;
2650 fjptrC = f+j_coord_offsetC;
2651 fjptrD = f+j_coord_offsetD;
2653 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2654 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2655 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2657 /* Inner loop uses 617 flops */
2660 if(jidx<j_index_end)
2663 /* Get j neighbor index, and coordinate index */
2664 jnrlistA = jjnr[jidx];
2665 jnrlistB = jjnr[jidx+1];
2666 jnrlistC = jjnr[jidx+2];
2667 jnrlistD = jjnr[jidx+3];
2668 /* Sign of each element will be negative for non-real atoms.
2669 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2670 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2672 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2674 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2675 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2676 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2678 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2679 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2680 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2681 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2682 j_coord_offsetA = DIM*jnrA;
2683 j_coord_offsetB = DIM*jnrB;
2684 j_coord_offsetC = DIM*jnrC;
2685 j_coord_offsetD = DIM*jnrD;
2687 /* load j atom coordinates */
2688 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2689 x+j_coord_offsetC,x+j_coord_offsetD,
2690 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2691 &jy2,&jz2,&jx3,&jy3,&jz3);
2693 /* Calculate displacement vector */
2694 dx00 = _mm256_sub_pd(ix0,jx0);
2695 dy00 = _mm256_sub_pd(iy0,jy0);
2696 dz00 = _mm256_sub_pd(iz0,jz0);
2697 dx11 = _mm256_sub_pd(ix1,jx1);
2698 dy11 = _mm256_sub_pd(iy1,jy1);
2699 dz11 = _mm256_sub_pd(iz1,jz1);
2700 dx12 = _mm256_sub_pd(ix1,jx2);
2701 dy12 = _mm256_sub_pd(iy1,jy2);
2702 dz12 = _mm256_sub_pd(iz1,jz2);
2703 dx13 = _mm256_sub_pd(ix1,jx3);
2704 dy13 = _mm256_sub_pd(iy1,jy3);
2705 dz13 = _mm256_sub_pd(iz1,jz3);
2706 dx21 = _mm256_sub_pd(ix2,jx1);
2707 dy21 = _mm256_sub_pd(iy2,jy1);
2708 dz21 = _mm256_sub_pd(iz2,jz1);
2709 dx22 = _mm256_sub_pd(ix2,jx2);
2710 dy22 = _mm256_sub_pd(iy2,jy2);
2711 dz22 = _mm256_sub_pd(iz2,jz2);
2712 dx23 = _mm256_sub_pd(ix2,jx3);
2713 dy23 = _mm256_sub_pd(iy2,jy3);
2714 dz23 = _mm256_sub_pd(iz2,jz3);
2715 dx31 = _mm256_sub_pd(ix3,jx1);
2716 dy31 = _mm256_sub_pd(iy3,jy1);
2717 dz31 = _mm256_sub_pd(iz3,jz1);
2718 dx32 = _mm256_sub_pd(ix3,jx2);
2719 dy32 = _mm256_sub_pd(iy3,jy2);
2720 dz32 = _mm256_sub_pd(iz3,jz2);
2721 dx33 = _mm256_sub_pd(ix3,jx3);
2722 dy33 = _mm256_sub_pd(iy3,jy3);
2723 dz33 = _mm256_sub_pd(iz3,jz3);
2725 /* Calculate squared distance and things based on it */
2726 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2727 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2728 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2729 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2730 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2731 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2732 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2733 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2734 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2735 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2737 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2738 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2739 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2740 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2741 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2742 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2743 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2744 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2745 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2746 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2748 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2749 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2750 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2751 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2752 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2753 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2754 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2755 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2756 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2757 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2759 fjx0 = _mm256_setzero_pd();
2760 fjy0 = _mm256_setzero_pd();
2761 fjz0 = _mm256_setzero_pd();
2762 fjx1 = _mm256_setzero_pd();
2763 fjy1 = _mm256_setzero_pd();
2764 fjz1 = _mm256_setzero_pd();
2765 fjx2 = _mm256_setzero_pd();
2766 fjy2 = _mm256_setzero_pd();
2767 fjz2 = _mm256_setzero_pd();
2768 fjx3 = _mm256_setzero_pd();
2769 fjy3 = _mm256_setzero_pd();
2770 fjz3 = _mm256_setzero_pd();
2772 /**************************
2773 * CALCULATE INTERACTIONS *
2774 **************************/
2776 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2779 r00 = _mm256_mul_pd(rsq00,rinv00);
2780 r00 = _mm256_andnot_pd(dummy_mask,r00);
2782 /* LENNARD-JONES DISPERSION/REPULSION */
2784 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2785 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
2786 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2787 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2788 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2790 d = _mm256_sub_pd(r00,rswitch);
2791 d = _mm256_max_pd(d,_mm256_setzero_pd());
2792 d2 = _mm256_mul_pd(d,d);
2793 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2795 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2797 /* Evaluate switch function */
2798 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2799 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2800 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2804 fscal = _mm256_and_pd(fscal,cutoff_mask);
2806 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2808 /* Calculate temporary vectorial force */
2809 tx = _mm256_mul_pd(fscal,dx00);
2810 ty = _mm256_mul_pd(fscal,dy00);
2811 tz = _mm256_mul_pd(fscal,dz00);
2813 /* Update vectorial force */
2814 fix0 = _mm256_add_pd(fix0,tx);
2815 fiy0 = _mm256_add_pd(fiy0,ty);
2816 fiz0 = _mm256_add_pd(fiz0,tz);
2818 fjx0 = _mm256_add_pd(fjx0,tx);
2819 fjy0 = _mm256_add_pd(fjy0,ty);
2820 fjz0 = _mm256_add_pd(fjz0,tz);
2824 /**************************
2825 * CALCULATE INTERACTIONS *
2826 **************************/
2828 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2831 r11 = _mm256_mul_pd(rsq11,rinv11);
2832 r11 = _mm256_andnot_pd(dummy_mask,r11);
2834 /* EWALD ELECTROSTATICS */
2836 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2837 ewrt = _mm256_mul_pd(r11,ewtabscale);
2838 ewitab = _mm256_cvttpd_epi32(ewrt);
2839 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2840 ewitab = _mm_slli_epi32(ewitab,2);
2841 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2842 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2843 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2844 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2845 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2846 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2847 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2848 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2849 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2851 d = _mm256_sub_pd(r11,rswitch);
2852 d = _mm256_max_pd(d,_mm256_setzero_pd());
2853 d2 = _mm256_mul_pd(d,d);
2854 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2856 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2858 /* Evaluate switch function */
2859 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2860 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2861 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2865 fscal = _mm256_and_pd(fscal,cutoff_mask);
2867 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2869 /* Calculate temporary vectorial force */
2870 tx = _mm256_mul_pd(fscal,dx11);
2871 ty = _mm256_mul_pd(fscal,dy11);
2872 tz = _mm256_mul_pd(fscal,dz11);
2874 /* Update vectorial force */
2875 fix1 = _mm256_add_pd(fix1,tx);
2876 fiy1 = _mm256_add_pd(fiy1,ty);
2877 fiz1 = _mm256_add_pd(fiz1,tz);
2879 fjx1 = _mm256_add_pd(fjx1,tx);
2880 fjy1 = _mm256_add_pd(fjy1,ty);
2881 fjz1 = _mm256_add_pd(fjz1,tz);
2885 /**************************
2886 * CALCULATE INTERACTIONS *
2887 **************************/
2889 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2892 r12 = _mm256_mul_pd(rsq12,rinv12);
2893 r12 = _mm256_andnot_pd(dummy_mask,r12);
2895 /* EWALD ELECTROSTATICS */
2897 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2898 ewrt = _mm256_mul_pd(r12,ewtabscale);
2899 ewitab = _mm256_cvttpd_epi32(ewrt);
2900 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2901 ewitab = _mm_slli_epi32(ewitab,2);
2902 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2903 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2904 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2905 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2906 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2907 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2908 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2909 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2910 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2912 d = _mm256_sub_pd(r12,rswitch);
2913 d = _mm256_max_pd(d,_mm256_setzero_pd());
2914 d2 = _mm256_mul_pd(d,d);
2915 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2917 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2919 /* Evaluate switch function */
2920 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2921 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2922 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2926 fscal = _mm256_and_pd(fscal,cutoff_mask);
2928 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2930 /* Calculate temporary vectorial force */
2931 tx = _mm256_mul_pd(fscal,dx12);
2932 ty = _mm256_mul_pd(fscal,dy12);
2933 tz = _mm256_mul_pd(fscal,dz12);
2935 /* Update vectorial force */
2936 fix1 = _mm256_add_pd(fix1,tx);
2937 fiy1 = _mm256_add_pd(fiy1,ty);
2938 fiz1 = _mm256_add_pd(fiz1,tz);
2940 fjx2 = _mm256_add_pd(fjx2,tx);
2941 fjy2 = _mm256_add_pd(fjy2,ty);
2942 fjz2 = _mm256_add_pd(fjz2,tz);
2946 /**************************
2947 * CALCULATE INTERACTIONS *
2948 **************************/
2950 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2953 r13 = _mm256_mul_pd(rsq13,rinv13);
2954 r13 = _mm256_andnot_pd(dummy_mask,r13);
2956 /* EWALD ELECTROSTATICS */
2958 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2959 ewrt = _mm256_mul_pd(r13,ewtabscale);
2960 ewitab = _mm256_cvttpd_epi32(ewrt);
2961 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2962 ewitab = _mm_slli_epi32(ewitab,2);
2963 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2964 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2965 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2966 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2967 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2968 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2969 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2970 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2971 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2973 d = _mm256_sub_pd(r13,rswitch);
2974 d = _mm256_max_pd(d,_mm256_setzero_pd());
2975 d2 = _mm256_mul_pd(d,d);
2976 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2978 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2980 /* Evaluate switch function */
2981 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2982 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2983 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2987 fscal = _mm256_and_pd(fscal,cutoff_mask);
2989 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2991 /* Calculate temporary vectorial force */
2992 tx = _mm256_mul_pd(fscal,dx13);
2993 ty = _mm256_mul_pd(fscal,dy13);
2994 tz = _mm256_mul_pd(fscal,dz13);
2996 /* Update vectorial force */
2997 fix1 = _mm256_add_pd(fix1,tx);
2998 fiy1 = _mm256_add_pd(fiy1,ty);
2999 fiz1 = _mm256_add_pd(fiz1,tz);
3001 fjx3 = _mm256_add_pd(fjx3,tx);
3002 fjy3 = _mm256_add_pd(fjy3,ty);
3003 fjz3 = _mm256_add_pd(fjz3,tz);
3007 /**************************
3008 * CALCULATE INTERACTIONS *
3009 **************************/
3011 if (gmx_mm256_any_lt(rsq21,rcutoff2))
3014 r21 = _mm256_mul_pd(rsq21,rinv21);
3015 r21 = _mm256_andnot_pd(dummy_mask,r21);
3017 /* EWALD ELECTROSTATICS */
3019 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3020 ewrt = _mm256_mul_pd(r21,ewtabscale);
3021 ewitab = _mm256_cvttpd_epi32(ewrt);
3022 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3023 ewitab = _mm_slli_epi32(ewitab,2);
3024 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3025 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3026 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3027 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3028 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3029 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3030 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3031 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
3032 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
3034 d = _mm256_sub_pd(r21,rswitch);
3035 d = _mm256_max_pd(d,_mm256_setzero_pd());
3036 d2 = _mm256_mul_pd(d,d);
3037 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3039 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3041 /* Evaluate switch function */
3042 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3043 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
3044 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
3048 fscal = _mm256_and_pd(fscal,cutoff_mask);
3050 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3052 /* Calculate temporary vectorial force */
3053 tx = _mm256_mul_pd(fscal,dx21);
3054 ty = _mm256_mul_pd(fscal,dy21);
3055 tz = _mm256_mul_pd(fscal,dz21);
3057 /* Update vectorial force */
3058 fix2 = _mm256_add_pd(fix2,tx);
3059 fiy2 = _mm256_add_pd(fiy2,ty);
3060 fiz2 = _mm256_add_pd(fiz2,tz);
3062 fjx1 = _mm256_add_pd(fjx1,tx);
3063 fjy1 = _mm256_add_pd(fjy1,ty);
3064 fjz1 = _mm256_add_pd(fjz1,tz);
3068 /**************************
3069 * CALCULATE INTERACTIONS *
3070 **************************/
3072 if (gmx_mm256_any_lt(rsq22,rcutoff2))
3075 r22 = _mm256_mul_pd(rsq22,rinv22);
3076 r22 = _mm256_andnot_pd(dummy_mask,r22);
3078 /* EWALD ELECTROSTATICS */
3080 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3081 ewrt = _mm256_mul_pd(r22,ewtabscale);
3082 ewitab = _mm256_cvttpd_epi32(ewrt);
3083 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3084 ewitab = _mm_slli_epi32(ewitab,2);
3085 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3086 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3087 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3088 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3089 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3090 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3091 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3092 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
3093 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
3095 d = _mm256_sub_pd(r22,rswitch);
3096 d = _mm256_max_pd(d,_mm256_setzero_pd());
3097 d2 = _mm256_mul_pd(d,d);
3098 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3100 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3102 /* Evaluate switch function */
3103 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3104 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
3105 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
3109 fscal = _mm256_and_pd(fscal,cutoff_mask);
3111 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3113 /* Calculate temporary vectorial force */
3114 tx = _mm256_mul_pd(fscal,dx22);
3115 ty = _mm256_mul_pd(fscal,dy22);
3116 tz = _mm256_mul_pd(fscal,dz22);
3118 /* Update vectorial force */
3119 fix2 = _mm256_add_pd(fix2,tx);
3120 fiy2 = _mm256_add_pd(fiy2,ty);
3121 fiz2 = _mm256_add_pd(fiz2,tz);
3123 fjx2 = _mm256_add_pd(fjx2,tx);
3124 fjy2 = _mm256_add_pd(fjy2,ty);
3125 fjz2 = _mm256_add_pd(fjz2,tz);
3129 /**************************
3130 * CALCULATE INTERACTIONS *
3131 **************************/
3133 if (gmx_mm256_any_lt(rsq23,rcutoff2))
3136 r23 = _mm256_mul_pd(rsq23,rinv23);
3137 r23 = _mm256_andnot_pd(dummy_mask,r23);
3139 /* EWALD ELECTROSTATICS */
3141 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3142 ewrt = _mm256_mul_pd(r23,ewtabscale);
3143 ewitab = _mm256_cvttpd_epi32(ewrt);
3144 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3145 ewitab = _mm_slli_epi32(ewitab,2);
3146 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3147 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3148 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3149 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3150 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3151 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3152 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3153 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
3154 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
3156 d = _mm256_sub_pd(r23,rswitch);
3157 d = _mm256_max_pd(d,_mm256_setzero_pd());
3158 d2 = _mm256_mul_pd(d,d);
3159 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3161 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3163 /* Evaluate switch function */
3164 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3165 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
3166 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
3170 fscal = _mm256_and_pd(fscal,cutoff_mask);
3172 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3174 /* Calculate temporary vectorial force */
3175 tx = _mm256_mul_pd(fscal,dx23);
3176 ty = _mm256_mul_pd(fscal,dy23);
3177 tz = _mm256_mul_pd(fscal,dz23);
3179 /* Update vectorial force */
3180 fix2 = _mm256_add_pd(fix2,tx);
3181 fiy2 = _mm256_add_pd(fiy2,ty);
3182 fiz2 = _mm256_add_pd(fiz2,tz);
3184 fjx3 = _mm256_add_pd(fjx3,tx);
3185 fjy3 = _mm256_add_pd(fjy3,ty);
3186 fjz3 = _mm256_add_pd(fjz3,tz);
3190 /**************************
3191 * CALCULATE INTERACTIONS *
3192 **************************/
3194 if (gmx_mm256_any_lt(rsq31,rcutoff2))
3197 r31 = _mm256_mul_pd(rsq31,rinv31);
3198 r31 = _mm256_andnot_pd(dummy_mask,r31);
3200 /* EWALD ELECTROSTATICS */
3202 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3203 ewrt = _mm256_mul_pd(r31,ewtabscale);
3204 ewitab = _mm256_cvttpd_epi32(ewrt);
3205 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3206 ewitab = _mm_slli_epi32(ewitab,2);
3207 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3208 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3209 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3210 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3211 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3212 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3213 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3214 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
3215 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
3217 d = _mm256_sub_pd(r31,rswitch);
3218 d = _mm256_max_pd(d,_mm256_setzero_pd());
3219 d2 = _mm256_mul_pd(d,d);
3220 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3222 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3224 /* Evaluate switch function */
3225 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3226 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
3227 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
3231 fscal = _mm256_and_pd(fscal,cutoff_mask);
3233 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3235 /* Calculate temporary vectorial force */
3236 tx = _mm256_mul_pd(fscal,dx31);
3237 ty = _mm256_mul_pd(fscal,dy31);
3238 tz = _mm256_mul_pd(fscal,dz31);
3240 /* Update vectorial force */
3241 fix3 = _mm256_add_pd(fix3,tx);
3242 fiy3 = _mm256_add_pd(fiy3,ty);
3243 fiz3 = _mm256_add_pd(fiz3,tz);
3245 fjx1 = _mm256_add_pd(fjx1,tx);
3246 fjy1 = _mm256_add_pd(fjy1,ty);
3247 fjz1 = _mm256_add_pd(fjz1,tz);
3251 /**************************
3252 * CALCULATE INTERACTIONS *
3253 **************************/
3255 if (gmx_mm256_any_lt(rsq32,rcutoff2))
3258 r32 = _mm256_mul_pd(rsq32,rinv32);
3259 r32 = _mm256_andnot_pd(dummy_mask,r32);
3261 /* EWALD ELECTROSTATICS */
3263 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3264 ewrt = _mm256_mul_pd(r32,ewtabscale);
3265 ewitab = _mm256_cvttpd_epi32(ewrt);
3266 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3267 ewitab = _mm_slli_epi32(ewitab,2);
3268 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3269 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3270 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3271 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3272 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3273 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3274 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3275 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
3276 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
3278 d = _mm256_sub_pd(r32,rswitch);
3279 d = _mm256_max_pd(d,_mm256_setzero_pd());
3280 d2 = _mm256_mul_pd(d,d);
3281 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3283 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3285 /* Evaluate switch function */
3286 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3287 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
3288 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
3292 fscal = _mm256_and_pd(fscal,cutoff_mask);
3294 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3296 /* Calculate temporary vectorial force */
3297 tx = _mm256_mul_pd(fscal,dx32);
3298 ty = _mm256_mul_pd(fscal,dy32);
3299 tz = _mm256_mul_pd(fscal,dz32);
3301 /* Update vectorial force */
3302 fix3 = _mm256_add_pd(fix3,tx);
3303 fiy3 = _mm256_add_pd(fiy3,ty);
3304 fiz3 = _mm256_add_pd(fiz3,tz);
3306 fjx2 = _mm256_add_pd(fjx2,tx);
3307 fjy2 = _mm256_add_pd(fjy2,ty);
3308 fjz2 = _mm256_add_pd(fjz2,tz);
3312 /**************************
3313 * CALCULATE INTERACTIONS *
3314 **************************/
3316 if (gmx_mm256_any_lt(rsq33,rcutoff2))
3319 r33 = _mm256_mul_pd(rsq33,rinv33);
3320 r33 = _mm256_andnot_pd(dummy_mask,r33);
3322 /* EWALD ELECTROSTATICS */
3324 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3325 ewrt = _mm256_mul_pd(r33,ewtabscale);
3326 ewitab = _mm256_cvttpd_epi32(ewrt);
3327 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3328 ewitab = _mm_slli_epi32(ewitab,2);
3329 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3330 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3331 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3332 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3333 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3334 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3335 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3336 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
3337 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
3339 d = _mm256_sub_pd(r33,rswitch);
3340 d = _mm256_max_pd(d,_mm256_setzero_pd());
3341 d2 = _mm256_mul_pd(d,d);
3342 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3344 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3346 /* Evaluate switch function */
3347 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3348 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
3349 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
3353 fscal = _mm256_and_pd(fscal,cutoff_mask);
3355 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3357 /* Calculate temporary vectorial force */
3358 tx = _mm256_mul_pd(fscal,dx33);
3359 ty = _mm256_mul_pd(fscal,dy33);
3360 tz = _mm256_mul_pd(fscal,dz33);
3362 /* Update vectorial force */
3363 fix3 = _mm256_add_pd(fix3,tx);
3364 fiy3 = _mm256_add_pd(fiy3,ty);
3365 fiz3 = _mm256_add_pd(fiz3,tz);
3367 fjx3 = _mm256_add_pd(fjx3,tx);
3368 fjy3 = _mm256_add_pd(fjy3,ty);
3369 fjz3 = _mm256_add_pd(fjz3,tz);
3373 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3374 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3375 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3376 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3378 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
3379 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3380 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3382 /* Inner loop uses 627 flops */
3385 /* End of innermost loop */
3387 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3388 f+i_coord_offset,fshift+i_shift_offset);
3390 /* Increment number of inner iterations */
3391 inneriter += j_index_end - j_index_start;
3393 /* Outer loop uses 24 flops */
3396 /* Increment number of outer iterations */
3399 /* Update outer/inner flops */
3401 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);