2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr1;
71 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
72 real * vdwioffsetptr2;
73 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
74 real * vdwioffsetptr3;
75 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
85 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
88 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
89 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
90 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
95 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
97 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
98 real rswitch_scalar,d_scalar;
99 __m256d dummy_mask,cutoff_mask;
100 __m128 tmpmask0,tmpmask1;
101 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
102 __m256d one = _mm256_set1_pd(1.0);
103 __m256d two = _mm256_set1_pd(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm256_set1_pd(fr->epsfac);
116 charge = mdatoms->chargeA;
118 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
119 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
120 beta2 = _mm256_mul_pd(beta,beta);
121 beta3 = _mm256_mul_pd(beta,beta2);
123 ewtab = fr->ic->tabq_coul_FDV0;
124 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
125 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
127 /* Setup water-specific parameters */
128 inr = nlist->iinr[0];
129 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
130 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
131 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
133 jq1 = _mm256_set1_pd(charge[inr+1]);
134 jq2 = _mm256_set1_pd(charge[inr+2]);
135 jq3 = _mm256_set1_pd(charge[inr+3]);
136 qq11 = _mm256_mul_pd(iq1,jq1);
137 qq12 = _mm256_mul_pd(iq1,jq2);
138 qq13 = _mm256_mul_pd(iq1,jq3);
139 qq21 = _mm256_mul_pd(iq2,jq1);
140 qq22 = _mm256_mul_pd(iq2,jq2);
141 qq23 = _mm256_mul_pd(iq2,jq3);
142 qq31 = _mm256_mul_pd(iq3,jq1);
143 qq32 = _mm256_mul_pd(iq3,jq2);
144 qq33 = _mm256_mul_pd(iq3,jq3);
146 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
147 rcutoff_scalar = fr->rcoulomb;
148 rcutoff = _mm256_set1_pd(rcutoff_scalar);
149 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
151 rswitch_scalar = fr->rcoulomb_switch;
152 rswitch = _mm256_set1_pd(rswitch_scalar);
153 /* Setup switch parameters */
154 d_scalar = rcutoff_scalar-rswitch_scalar;
155 d = _mm256_set1_pd(d_scalar);
156 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
157 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
158 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
159 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
160 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
163 /* Avoid stupid compiler warnings */
164 jnrA = jnrB = jnrC = jnrD = 0;
173 for(iidx=0;iidx<4*DIM;iidx++)
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
194 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
196 fix1 = _mm256_setzero_pd();
197 fiy1 = _mm256_setzero_pd();
198 fiz1 = _mm256_setzero_pd();
199 fix2 = _mm256_setzero_pd();
200 fiy2 = _mm256_setzero_pd();
201 fiz2 = _mm256_setzero_pd();
202 fix3 = _mm256_setzero_pd();
203 fiy3 = _mm256_setzero_pd();
204 fiz3 = _mm256_setzero_pd();
206 /* Reset potential sums */
207 velecsum = _mm256_setzero_pd();
209 /* Start inner kernel loop */
210 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
213 /* Get j neighbor index, and coordinate index */
218 j_coord_offsetA = DIM*jnrA;
219 j_coord_offsetB = DIM*jnrB;
220 j_coord_offsetC = DIM*jnrC;
221 j_coord_offsetD = DIM*jnrD;
223 /* load j atom coordinates */
224 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
225 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
226 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
228 /* Calculate displacement vector */
229 dx11 = _mm256_sub_pd(ix1,jx1);
230 dy11 = _mm256_sub_pd(iy1,jy1);
231 dz11 = _mm256_sub_pd(iz1,jz1);
232 dx12 = _mm256_sub_pd(ix1,jx2);
233 dy12 = _mm256_sub_pd(iy1,jy2);
234 dz12 = _mm256_sub_pd(iz1,jz2);
235 dx13 = _mm256_sub_pd(ix1,jx3);
236 dy13 = _mm256_sub_pd(iy1,jy3);
237 dz13 = _mm256_sub_pd(iz1,jz3);
238 dx21 = _mm256_sub_pd(ix2,jx1);
239 dy21 = _mm256_sub_pd(iy2,jy1);
240 dz21 = _mm256_sub_pd(iz2,jz1);
241 dx22 = _mm256_sub_pd(ix2,jx2);
242 dy22 = _mm256_sub_pd(iy2,jy2);
243 dz22 = _mm256_sub_pd(iz2,jz2);
244 dx23 = _mm256_sub_pd(ix2,jx3);
245 dy23 = _mm256_sub_pd(iy2,jy3);
246 dz23 = _mm256_sub_pd(iz2,jz3);
247 dx31 = _mm256_sub_pd(ix3,jx1);
248 dy31 = _mm256_sub_pd(iy3,jy1);
249 dz31 = _mm256_sub_pd(iz3,jz1);
250 dx32 = _mm256_sub_pd(ix3,jx2);
251 dy32 = _mm256_sub_pd(iy3,jy2);
252 dz32 = _mm256_sub_pd(iz3,jz2);
253 dx33 = _mm256_sub_pd(ix3,jx3);
254 dy33 = _mm256_sub_pd(iy3,jy3);
255 dz33 = _mm256_sub_pd(iz3,jz3);
257 /* Calculate squared distance and things based on it */
258 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
259 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
260 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
261 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
262 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
263 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
264 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
265 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
266 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
268 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
269 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
270 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
271 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
272 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
273 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
274 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
275 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
276 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
278 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
279 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
280 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
281 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
282 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
283 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
284 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
285 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
286 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
288 fjx1 = _mm256_setzero_pd();
289 fjy1 = _mm256_setzero_pd();
290 fjz1 = _mm256_setzero_pd();
291 fjx2 = _mm256_setzero_pd();
292 fjy2 = _mm256_setzero_pd();
293 fjz2 = _mm256_setzero_pd();
294 fjx3 = _mm256_setzero_pd();
295 fjy3 = _mm256_setzero_pd();
296 fjz3 = _mm256_setzero_pd();
298 /**************************
299 * CALCULATE INTERACTIONS *
300 **************************/
302 if (gmx_mm256_any_lt(rsq11,rcutoff2))
305 r11 = _mm256_mul_pd(rsq11,rinv11);
307 /* EWALD ELECTROSTATICS */
309 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
310 ewrt = _mm256_mul_pd(r11,ewtabscale);
311 ewitab = _mm256_cvttpd_epi32(ewrt);
312 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
313 ewitab = _mm_slli_epi32(ewitab,2);
314 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
315 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
316 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
317 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
318 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
319 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
320 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
321 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
322 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
324 d = _mm256_sub_pd(r11,rswitch);
325 d = _mm256_max_pd(d,_mm256_setzero_pd());
326 d2 = _mm256_mul_pd(d,d);
327 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
329 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
331 /* Evaluate switch function */
332 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
333 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
334 velec = _mm256_mul_pd(velec,sw);
335 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
337 /* Update potential sum for this i atom from the interaction with this j atom. */
338 velec = _mm256_and_pd(velec,cutoff_mask);
339 velecsum = _mm256_add_pd(velecsum,velec);
343 fscal = _mm256_and_pd(fscal,cutoff_mask);
345 /* Calculate temporary vectorial force */
346 tx = _mm256_mul_pd(fscal,dx11);
347 ty = _mm256_mul_pd(fscal,dy11);
348 tz = _mm256_mul_pd(fscal,dz11);
350 /* Update vectorial force */
351 fix1 = _mm256_add_pd(fix1,tx);
352 fiy1 = _mm256_add_pd(fiy1,ty);
353 fiz1 = _mm256_add_pd(fiz1,tz);
355 fjx1 = _mm256_add_pd(fjx1,tx);
356 fjy1 = _mm256_add_pd(fjy1,ty);
357 fjz1 = _mm256_add_pd(fjz1,tz);
361 /**************************
362 * CALCULATE INTERACTIONS *
363 **************************/
365 if (gmx_mm256_any_lt(rsq12,rcutoff2))
368 r12 = _mm256_mul_pd(rsq12,rinv12);
370 /* EWALD ELECTROSTATICS */
372 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
373 ewrt = _mm256_mul_pd(r12,ewtabscale);
374 ewitab = _mm256_cvttpd_epi32(ewrt);
375 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
376 ewitab = _mm_slli_epi32(ewitab,2);
377 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
378 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
379 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
380 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
381 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
382 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
383 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
384 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
385 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
387 d = _mm256_sub_pd(r12,rswitch);
388 d = _mm256_max_pd(d,_mm256_setzero_pd());
389 d2 = _mm256_mul_pd(d,d);
390 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
392 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
394 /* Evaluate switch function */
395 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
396 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
397 velec = _mm256_mul_pd(velec,sw);
398 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velec = _mm256_and_pd(velec,cutoff_mask);
402 velecsum = _mm256_add_pd(velecsum,velec);
406 fscal = _mm256_and_pd(fscal,cutoff_mask);
408 /* Calculate temporary vectorial force */
409 tx = _mm256_mul_pd(fscal,dx12);
410 ty = _mm256_mul_pd(fscal,dy12);
411 tz = _mm256_mul_pd(fscal,dz12);
413 /* Update vectorial force */
414 fix1 = _mm256_add_pd(fix1,tx);
415 fiy1 = _mm256_add_pd(fiy1,ty);
416 fiz1 = _mm256_add_pd(fiz1,tz);
418 fjx2 = _mm256_add_pd(fjx2,tx);
419 fjy2 = _mm256_add_pd(fjy2,ty);
420 fjz2 = _mm256_add_pd(fjz2,tz);
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 if (gmx_mm256_any_lt(rsq13,rcutoff2))
431 r13 = _mm256_mul_pd(rsq13,rinv13);
433 /* EWALD ELECTROSTATICS */
435 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
436 ewrt = _mm256_mul_pd(r13,ewtabscale);
437 ewitab = _mm256_cvttpd_epi32(ewrt);
438 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
439 ewitab = _mm_slli_epi32(ewitab,2);
440 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
441 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
442 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
443 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
444 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
445 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
446 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
447 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
448 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
450 d = _mm256_sub_pd(r13,rswitch);
451 d = _mm256_max_pd(d,_mm256_setzero_pd());
452 d2 = _mm256_mul_pd(d,d);
453 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
455 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
457 /* Evaluate switch function */
458 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
459 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
460 velec = _mm256_mul_pd(velec,sw);
461 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
463 /* Update potential sum for this i atom from the interaction with this j atom. */
464 velec = _mm256_and_pd(velec,cutoff_mask);
465 velecsum = _mm256_add_pd(velecsum,velec);
469 fscal = _mm256_and_pd(fscal,cutoff_mask);
471 /* Calculate temporary vectorial force */
472 tx = _mm256_mul_pd(fscal,dx13);
473 ty = _mm256_mul_pd(fscal,dy13);
474 tz = _mm256_mul_pd(fscal,dz13);
476 /* Update vectorial force */
477 fix1 = _mm256_add_pd(fix1,tx);
478 fiy1 = _mm256_add_pd(fiy1,ty);
479 fiz1 = _mm256_add_pd(fiz1,tz);
481 fjx3 = _mm256_add_pd(fjx3,tx);
482 fjy3 = _mm256_add_pd(fjy3,ty);
483 fjz3 = _mm256_add_pd(fjz3,tz);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 if (gmx_mm256_any_lt(rsq21,rcutoff2))
494 r21 = _mm256_mul_pd(rsq21,rinv21);
496 /* EWALD ELECTROSTATICS */
498 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
499 ewrt = _mm256_mul_pd(r21,ewtabscale);
500 ewitab = _mm256_cvttpd_epi32(ewrt);
501 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
502 ewitab = _mm_slli_epi32(ewitab,2);
503 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
504 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
505 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
506 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
507 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
508 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
509 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
510 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
511 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
513 d = _mm256_sub_pd(r21,rswitch);
514 d = _mm256_max_pd(d,_mm256_setzero_pd());
515 d2 = _mm256_mul_pd(d,d);
516 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
518 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
520 /* Evaluate switch function */
521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
522 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
523 velec = _mm256_mul_pd(velec,sw);
524 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
526 /* Update potential sum for this i atom from the interaction with this j atom. */
527 velec = _mm256_and_pd(velec,cutoff_mask);
528 velecsum = _mm256_add_pd(velecsum,velec);
532 fscal = _mm256_and_pd(fscal,cutoff_mask);
534 /* Calculate temporary vectorial force */
535 tx = _mm256_mul_pd(fscal,dx21);
536 ty = _mm256_mul_pd(fscal,dy21);
537 tz = _mm256_mul_pd(fscal,dz21);
539 /* Update vectorial force */
540 fix2 = _mm256_add_pd(fix2,tx);
541 fiy2 = _mm256_add_pd(fiy2,ty);
542 fiz2 = _mm256_add_pd(fiz2,tz);
544 fjx1 = _mm256_add_pd(fjx1,tx);
545 fjy1 = _mm256_add_pd(fjy1,ty);
546 fjz1 = _mm256_add_pd(fjz1,tz);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 if (gmx_mm256_any_lt(rsq22,rcutoff2))
557 r22 = _mm256_mul_pd(rsq22,rinv22);
559 /* EWALD ELECTROSTATICS */
561 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
562 ewrt = _mm256_mul_pd(r22,ewtabscale);
563 ewitab = _mm256_cvttpd_epi32(ewrt);
564 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
565 ewitab = _mm_slli_epi32(ewitab,2);
566 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
567 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
568 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
569 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
570 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
571 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
572 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
573 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
574 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
576 d = _mm256_sub_pd(r22,rswitch);
577 d = _mm256_max_pd(d,_mm256_setzero_pd());
578 d2 = _mm256_mul_pd(d,d);
579 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
581 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
583 /* Evaluate switch function */
584 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
585 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
586 velec = _mm256_mul_pd(velec,sw);
587 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 velec = _mm256_and_pd(velec,cutoff_mask);
591 velecsum = _mm256_add_pd(velecsum,velec);
595 fscal = _mm256_and_pd(fscal,cutoff_mask);
597 /* Calculate temporary vectorial force */
598 tx = _mm256_mul_pd(fscal,dx22);
599 ty = _mm256_mul_pd(fscal,dy22);
600 tz = _mm256_mul_pd(fscal,dz22);
602 /* Update vectorial force */
603 fix2 = _mm256_add_pd(fix2,tx);
604 fiy2 = _mm256_add_pd(fiy2,ty);
605 fiz2 = _mm256_add_pd(fiz2,tz);
607 fjx2 = _mm256_add_pd(fjx2,tx);
608 fjy2 = _mm256_add_pd(fjy2,ty);
609 fjz2 = _mm256_add_pd(fjz2,tz);
613 /**************************
614 * CALCULATE INTERACTIONS *
615 **************************/
617 if (gmx_mm256_any_lt(rsq23,rcutoff2))
620 r23 = _mm256_mul_pd(rsq23,rinv23);
622 /* EWALD ELECTROSTATICS */
624 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
625 ewrt = _mm256_mul_pd(r23,ewtabscale);
626 ewitab = _mm256_cvttpd_epi32(ewrt);
627 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
628 ewitab = _mm_slli_epi32(ewitab,2);
629 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
630 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
631 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
632 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
633 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
634 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
635 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
636 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
637 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
639 d = _mm256_sub_pd(r23,rswitch);
640 d = _mm256_max_pd(d,_mm256_setzero_pd());
641 d2 = _mm256_mul_pd(d,d);
642 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
644 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
646 /* Evaluate switch function */
647 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
648 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
649 velec = _mm256_mul_pd(velec,sw);
650 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velec = _mm256_and_pd(velec,cutoff_mask);
654 velecsum = _mm256_add_pd(velecsum,velec);
658 fscal = _mm256_and_pd(fscal,cutoff_mask);
660 /* Calculate temporary vectorial force */
661 tx = _mm256_mul_pd(fscal,dx23);
662 ty = _mm256_mul_pd(fscal,dy23);
663 tz = _mm256_mul_pd(fscal,dz23);
665 /* Update vectorial force */
666 fix2 = _mm256_add_pd(fix2,tx);
667 fiy2 = _mm256_add_pd(fiy2,ty);
668 fiz2 = _mm256_add_pd(fiz2,tz);
670 fjx3 = _mm256_add_pd(fjx3,tx);
671 fjy3 = _mm256_add_pd(fjy3,ty);
672 fjz3 = _mm256_add_pd(fjz3,tz);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 if (gmx_mm256_any_lt(rsq31,rcutoff2))
683 r31 = _mm256_mul_pd(rsq31,rinv31);
685 /* EWALD ELECTROSTATICS */
687 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
688 ewrt = _mm256_mul_pd(r31,ewtabscale);
689 ewitab = _mm256_cvttpd_epi32(ewrt);
690 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
691 ewitab = _mm_slli_epi32(ewitab,2);
692 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
693 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
694 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
695 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
696 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
697 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
698 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
699 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
700 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
702 d = _mm256_sub_pd(r31,rswitch);
703 d = _mm256_max_pd(d,_mm256_setzero_pd());
704 d2 = _mm256_mul_pd(d,d);
705 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
707 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
709 /* Evaluate switch function */
710 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
711 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
712 velec = _mm256_mul_pd(velec,sw);
713 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
715 /* Update potential sum for this i atom from the interaction with this j atom. */
716 velec = _mm256_and_pd(velec,cutoff_mask);
717 velecsum = _mm256_add_pd(velecsum,velec);
721 fscal = _mm256_and_pd(fscal,cutoff_mask);
723 /* Calculate temporary vectorial force */
724 tx = _mm256_mul_pd(fscal,dx31);
725 ty = _mm256_mul_pd(fscal,dy31);
726 tz = _mm256_mul_pd(fscal,dz31);
728 /* Update vectorial force */
729 fix3 = _mm256_add_pd(fix3,tx);
730 fiy3 = _mm256_add_pd(fiy3,ty);
731 fiz3 = _mm256_add_pd(fiz3,tz);
733 fjx1 = _mm256_add_pd(fjx1,tx);
734 fjy1 = _mm256_add_pd(fjy1,ty);
735 fjz1 = _mm256_add_pd(fjz1,tz);
739 /**************************
740 * CALCULATE INTERACTIONS *
741 **************************/
743 if (gmx_mm256_any_lt(rsq32,rcutoff2))
746 r32 = _mm256_mul_pd(rsq32,rinv32);
748 /* EWALD ELECTROSTATICS */
750 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
751 ewrt = _mm256_mul_pd(r32,ewtabscale);
752 ewitab = _mm256_cvttpd_epi32(ewrt);
753 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
754 ewitab = _mm_slli_epi32(ewitab,2);
755 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
756 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
757 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
758 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
759 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
760 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
761 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
762 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
763 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
765 d = _mm256_sub_pd(r32,rswitch);
766 d = _mm256_max_pd(d,_mm256_setzero_pd());
767 d2 = _mm256_mul_pd(d,d);
768 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
770 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
772 /* Evaluate switch function */
773 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
774 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
775 velec = _mm256_mul_pd(velec,sw);
776 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
778 /* Update potential sum for this i atom from the interaction with this j atom. */
779 velec = _mm256_and_pd(velec,cutoff_mask);
780 velecsum = _mm256_add_pd(velecsum,velec);
784 fscal = _mm256_and_pd(fscal,cutoff_mask);
786 /* Calculate temporary vectorial force */
787 tx = _mm256_mul_pd(fscal,dx32);
788 ty = _mm256_mul_pd(fscal,dy32);
789 tz = _mm256_mul_pd(fscal,dz32);
791 /* Update vectorial force */
792 fix3 = _mm256_add_pd(fix3,tx);
793 fiy3 = _mm256_add_pd(fiy3,ty);
794 fiz3 = _mm256_add_pd(fiz3,tz);
796 fjx2 = _mm256_add_pd(fjx2,tx);
797 fjy2 = _mm256_add_pd(fjy2,ty);
798 fjz2 = _mm256_add_pd(fjz2,tz);
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 if (gmx_mm256_any_lt(rsq33,rcutoff2))
809 r33 = _mm256_mul_pd(rsq33,rinv33);
811 /* EWALD ELECTROSTATICS */
813 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
814 ewrt = _mm256_mul_pd(r33,ewtabscale);
815 ewitab = _mm256_cvttpd_epi32(ewrt);
816 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
817 ewitab = _mm_slli_epi32(ewitab,2);
818 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
819 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
820 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
821 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
822 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
823 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
824 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
825 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
826 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
828 d = _mm256_sub_pd(r33,rswitch);
829 d = _mm256_max_pd(d,_mm256_setzero_pd());
830 d2 = _mm256_mul_pd(d,d);
831 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
833 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
835 /* Evaluate switch function */
836 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
837 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
838 velec = _mm256_mul_pd(velec,sw);
839 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 velec = _mm256_and_pd(velec,cutoff_mask);
843 velecsum = _mm256_add_pd(velecsum,velec);
847 fscal = _mm256_and_pd(fscal,cutoff_mask);
849 /* Calculate temporary vectorial force */
850 tx = _mm256_mul_pd(fscal,dx33);
851 ty = _mm256_mul_pd(fscal,dy33);
852 tz = _mm256_mul_pd(fscal,dz33);
854 /* Update vectorial force */
855 fix3 = _mm256_add_pd(fix3,tx);
856 fiy3 = _mm256_add_pd(fiy3,ty);
857 fiz3 = _mm256_add_pd(fiz3,tz);
859 fjx3 = _mm256_add_pd(fjx3,tx);
860 fjy3 = _mm256_add_pd(fjy3,ty);
861 fjz3 = _mm256_add_pd(fjz3,tz);
865 fjptrA = f+j_coord_offsetA;
866 fjptrB = f+j_coord_offsetB;
867 fjptrC = f+j_coord_offsetC;
868 fjptrD = f+j_coord_offsetD;
870 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
871 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
873 /* Inner loop uses 585 flops */
879 /* Get j neighbor index, and coordinate index */
880 jnrlistA = jjnr[jidx];
881 jnrlistB = jjnr[jidx+1];
882 jnrlistC = jjnr[jidx+2];
883 jnrlistD = jjnr[jidx+3];
884 /* Sign of each element will be negative for non-real atoms.
885 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
886 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
888 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
890 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
891 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
892 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
894 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
895 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
896 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
897 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
898 j_coord_offsetA = DIM*jnrA;
899 j_coord_offsetB = DIM*jnrB;
900 j_coord_offsetC = DIM*jnrC;
901 j_coord_offsetD = DIM*jnrD;
903 /* load j atom coordinates */
904 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
905 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
906 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
908 /* Calculate displacement vector */
909 dx11 = _mm256_sub_pd(ix1,jx1);
910 dy11 = _mm256_sub_pd(iy1,jy1);
911 dz11 = _mm256_sub_pd(iz1,jz1);
912 dx12 = _mm256_sub_pd(ix1,jx2);
913 dy12 = _mm256_sub_pd(iy1,jy2);
914 dz12 = _mm256_sub_pd(iz1,jz2);
915 dx13 = _mm256_sub_pd(ix1,jx3);
916 dy13 = _mm256_sub_pd(iy1,jy3);
917 dz13 = _mm256_sub_pd(iz1,jz3);
918 dx21 = _mm256_sub_pd(ix2,jx1);
919 dy21 = _mm256_sub_pd(iy2,jy1);
920 dz21 = _mm256_sub_pd(iz2,jz1);
921 dx22 = _mm256_sub_pd(ix2,jx2);
922 dy22 = _mm256_sub_pd(iy2,jy2);
923 dz22 = _mm256_sub_pd(iz2,jz2);
924 dx23 = _mm256_sub_pd(ix2,jx3);
925 dy23 = _mm256_sub_pd(iy2,jy3);
926 dz23 = _mm256_sub_pd(iz2,jz3);
927 dx31 = _mm256_sub_pd(ix3,jx1);
928 dy31 = _mm256_sub_pd(iy3,jy1);
929 dz31 = _mm256_sub_pd(iz3,jz1);
930 dx32 = _mm256_sub_pd(ix3,jx2);
931 dy32 = _mm256_sub_pd(iy3,jy2);
932 dz32 = _mm256_sub_pd(iz3,jz2);
933 dx33 = _mm256_sub_pd(ix3,jx3);
934 dy33 = _mm256_sub_pd(iy3,jy3);
935 dz33 = _mm256_sub_pd(iz3,jz3);
937 /* Calculate squared distance and things based on it */
938 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
939 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
940 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
941 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
942 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
943 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
944 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
945 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
946 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
948 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
949 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
950 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
951 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
952 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
953 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
954 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
955 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
956 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
958 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
959 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
960 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
961 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
962 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
963 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
964 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
965 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
966 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
968 fjx1 = _mm256_setzero_pd();
969 fjy1 = _mm256_setzero_pd();
970 fjz1 = _mm256_setzero_pd();
971 fjx2 = _mm256_setzero_pd();
972 fjy2 = _mm256_setzero_pd();
973 fjz2 = _mm256_setzero_pd();
974 fjx3 = _mm256_setzero_pd();
975 fjy3 = _mm256_setzero_pd();
976 fjz3 = _mm256_setzero_pd();
978 /**************************
979 * CALCULATE INTERACTIONS *
980 **************************/
982 if (gmx_mm256_any_lt(rsq11,rcutoff2))
985 r11 = _mm256_mul_pd(rsq11,rinv11);
986 r11 = _mm256_andnot_pd(dummy_mask,r11);
988 /* EWALD ELECTROSTATICS */
990 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
991 ewrt = _mm256_mul_pd(r11,ewtabscale);
992 ewitab = _mm256_cvttpd_epi32(ewrt);
993 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
994 ewitab = _mm_slli_epi32(ewitab,2);
995 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
996 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
997 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
998 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
999 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1000 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1001 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1002 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1003 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1005 d = _mm256_sub_pd(r11,rswitch);
1006 d = _mm256_max_pd(d,_mm256_setzero_pd());
1007 d2 = _mm256_mul_pd(d,d);
1008 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1010 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1012 /* Evaluate switch function */
1013 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1014 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1015 velec = _mm256_mul_pd(velec,sw);
1016 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1018 /* Update potential sum for this i atom from the interaction with this j atom. */
1019 velec = _mm256_and_pd(velec,cutoff_mask);
1020 velec = _mm256_andnot_pd(dummy_mask,velec);
1021 velecsum = _mm256_add_pd(velecsum,velec);
1025 fscal = _mm256_and_pd(fscal,cutoff_mask);
1027 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1029 /* Calculate temporary vectorial force */
1030 tx = _mm256_mul_pd(fscal,dx11);
1031 ty = _mm256_mul_pd(fscal,dy11);
1032 tz = _mm256_mul_pd(fscal,dz11);
1034 /* Update vectorial force */
1035 fix1 = _mm256_add_pd(fix1,tx);
1036 fiy1 = _mm256_add_pd(fiy1,ty);
1037 fiz1 = _mm256_add_pd(fiz1,tz);
1039 fjx1 = _mm256_add_pd(fjx1,tx);
1040 fjy1 = _mm256_add_pd(fjy1,ty);
1041 fjz1 = _mm256_add_pd(fjz1,tz);
1045 /**************************
1046 * CALCULATE INTERACTIONS *
1047 **************************/
1049 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1052 r12 = _mm256_mul_pd(rsq12,rinv12);
1053 r12 = _mm256_andnot_pd(dummy_mask,r12);
1055 /* EWALD ELECTROSTATICS */
1057 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1058 ewrt = _mm256_mul_pd(r12,ewtabscale);
1059 ewitab = _mm256_cvttpd_epi32(ewrt);
1060 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1061 ewitab = _mm_slli_epi32(ewitab,2);
1062 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1063 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1064 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1065 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1066 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1067 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1068 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1069 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1070 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1072 d = _mm256_sub_pd(r12,rswitch);
1073 d = _mm256_max_pd(d,_mm256_setzero_pd());
1074 d2 = _mm256_mul_pd(d,d);
1075 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1077 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1079 /* Evaluate switch function */
1080 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1081 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1082 velec = _mm256_mul_pd(velec,sw);
1083 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1085 /* Update potential sum for this i atom from the interaction with this j atom. */
1086 velec = _mm256_and_pd(velec,cutoff_mask);
1087 velec = _mm256_andnot_pd(dummy_mask,velec);
1088 velecsum = _mm256_add_pd(velecsum,velec);
1092 fscal = _mm256_and_pd(fscal,cutoff_mask);
1094 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1096 /* Calculate temporary vectorial force */
1097 tx = _mm256_mul_pd(fscal,dx12);
1098 ty = _mm256_mul_pd(fscal,dy12);
1099 tz = _mm256_mul_pd(fscal,dz12);
1101 /* Update vectorial force */
1102 fix1 = _mm256_add_pd(fix1,tx);
1103 fiy1 = _mm256_add_pd(fiy1,ty);
1104 fiz1 = _mm256_add_pd(fiz1,tz);
1106 fjx2 = _mm256_add_pd(fjx2,tx);
1107 fjy2 = _mm256_add_pd(fjy2,ty);
1108 fjz2 = _mm256_add_pd(fjz2,tz);
1112 /**************************
1113 * CALCULATE INTERACTIONS *
1114 **************************/
1116 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1119 r13 = _mm256_mul_pd(rsq13,rinv13);
1120 r13 = _mm256_andnot_pd(dummy_mask,r13);
1122 /* EWALD ELECTROSTATICS */
1124 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1125 ewrt = _mm256_mul_pd(r13,ewtabscale);
1126 ewitab = _mm256_cvttpd_epi32(ewrt);
1127 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1128 ewitab = _mm_slli_epi32(ewitab,2);
1129 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1130 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1131 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1132 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1133 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1134 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1135 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1136 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1137 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1139 d = _mm256_sub_pd(r13,rswitch);
1140 d = _mm256_max_pd(d,_mm256_setzero_pd());
1141 d2 = _mm256_mul_pd(d,d);
1142 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1144 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1146 /* Evaluate switch function */
1147 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1148 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
1149 velec = _mm256_mul_pd(velec,sw);
1150 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec = _mm256_and_pd(velec,cutoff_mask);
1154 velec = _mm256_andnot_pd(dummy_mask,velec);
1155 velecsum = _mm256_add_pd(velecsum,velec);
1159 fscal = _mm256_and_pd(fscal,cutoff_mask);
1161 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1163 /* Calculate temporary vectorial force */
1164 tx = _mm256_mul_pd(fscal,dx13);
1165 ty = _mm256_mul_pd(fscal,dy13);
1166 tz = _mm256_mul_pd(fscal,dz13);
1168 /* Update vectorial force */
1169 fix1 = _mm256_add_pd(fix1,tx);
1170 fiy1 = _mm256_add_pd(fiy1,ty);
1171 fiz1 = _mm256_add_pd(fiz1,tz);
1173 fjx3 = _mm256_add_pd(fjx3,tx);
1174 fjy3 = _mm256_add_pd(fjy3,ty);
1175 fjz3 = _mm256_add_pd(fjz3,tz);
1179 /**************************
1180 * CALCULATE INTERACTIONS *
1181 **************************/
1183 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1186 r21 = _mm256_mul_pd(rsq21,rinv21);
1187 r21 = _mm256_andnot_pd(dummy_mask,r21);
1189 /* EWALD ELECTROSTATICS */
1191 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1192 ewrt = _mm256_mul_pd(r21,ewtabscale);
1193 ewitab = _mm256_cvttpd_epi32(ewrt);
1194 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1195 ewitab = _mm_slli_epi32(ewitab,2);
1196 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1197 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1198 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1199 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1200 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1201 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1202 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1203 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1204 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1206 d = _mm256_sub_pd(r21,rswitch);
1207 d = _mm256_max_pd(d,_mm256_setzero_pd());
1208 d2 = _mm256_mul_pd(d,d);
1209 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1211 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1213 /* Evaluate switch function */
1214 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1215 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1216 velec = _mm256_mul_pd(velec,sw);
1217 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1219 /* Update potential sum for this i atom from the interaction with this j atom. */
1220 velec = _mm256_and_pd(velec,cutoff_mask);
1221 velec = _mm256_andnot_pd(dummy_mask,velec);
1222 velecsum = _mm256_add_pd(velecsum,velec);
1226 fscal = _mm256_and_pd(fscal,cutoff_mask);
1228 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1230 /* Calculate temporary vectorial force */
1231 tx = _mm256_mul_pd(fscal,dx21);
1232 ty = _mm256_mul_pd(fscal,dy21);
1233 tz = _mm256_mul_pd(fscal,dz21);
1235 /* Update vectorial force */
1236 fix2 = _mm256_add_pd(fix2,tx);
1237 fiy2 = _mm256_add_pd(fiy2,ty);
1238 fiz2 = _mm256_add_pd(fiz2,tz);
1240 fjx1 = _mm256_add_pd(fjx1,tx);
1241 fjy1 = _mm256_add_pd(fjy1,ty);
1242 fjz1 = _mm256_add_pd(fjz1,tz);
1246 /**************************
1247 * CALCULATE INTERACTIONS *
1248 **************************/
1250 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1253 r22 = _mm256_mul_pd(rsq22,rinv22);
1254 r22 = _mm256_andnot_pd(dummy_mask,r22);
1256 /* EWALD ELECTROSTATICS */
1258 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1259 ewrt = _mm256_mul_pd(r22,ewtabscale);
1260 ewitab = _mm256_cvttpd_epi32(ewrt);
1261 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1262 ewitab = _mm_slli_epi32(ewitab,2);
1263 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1264 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1265 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1266 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1267 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1268 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1269 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1270 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1271 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1273 d = _mm256_sub_pd(r22,rswitch);
1274 d = _mm256_max_pd(d,_mm256_setzero_pd());
1275 d2 = _mm256_mul_pd(d,d);
1276 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1278 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1280 /* Evaluate switch function */
1281 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1282 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1283 velec = _mm256_mul_pd(velec,sw);
1284 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1286 /* Update potential sum for this i atom from the interaction with this j atom. */
1287 velec = _mm256_and_pd(velec,cutoff_mask);
1288 velec = _mm256_andnot_pd(dummy_mask,velec);
1289 velecsum = _mm256_add_pd(velecsum,velec);
1293 fscal = _mm256_and_pd(fscal,cutoff_mask);
1295 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1297 /* Calculate temporary vectorial force */
1298 tx = _mm256_mul_pd(fscal,dx22);
1299 ty = _mm256_mul_pd(fscal,dy22);
1300 tz = _mm256_mul_pd(fscal,dz22);
1302 /* Update vectorial force */
1303 fix2 = _mm256_add_pd(fix2,tx);
1304 fiy2 = _mm256_add_pd(fiy2,ty);
1305 fiz2 = _mm256_add_pd(fiz2,tz);
1307 fjx2 = _mm256_add_pd(fjx2,tx);
1308 fjy2 = _mm256_add_pd(fjy2,ty);
1309 fjz2 = _mm256_add_pd(fjz2,tz);
1313 /**************************
1314 * CALCULATE INTERACTIONS *
1315 **************************/
1317 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1320 r23 = _mm256_mul_pd(rsq23,rinv23);
1321 r23 = _mm256_andnot_pd(dummy_mask,r23);
1323 /* EWALD ELECTROSTATICS */
1325 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1326 ewrt = _mm256_mul_pd(r23,ewtabscale);
1327 ewitab = _mm256_cvttpd_epi32(ewrt);
1328 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1329 ewitab = _mm_slli_epi32(ewitab,2);
1330 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1331 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1332 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1333 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1334 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1335 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1336 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1337 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1338 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1340 d = _mm256_sub_pd(r23,rswitch);
1341 d = _mm256_max_pd(d,_mm256_setzero_pd());
1342 d2 = _mm256_mul_pd(d,d);
1343 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1345 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1347 /* Evaluate switch function */
1348 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1349 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
1350 velec = _mm256_mul_pd(velec,sw);
1351 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1353 /* Update potential sum for this i atom from the interaction with this j atom. */
1354 velec = _mm256_and_pd(velec,cutoff_mask);
1355 velec = _mm256_andnot_pd(dummy_mask,velec);
1356 velecsum = _mm256_add_pd(velecsum,velec);
1360 fscal = _mm256_and_pd(fscal,cutoff_mask);
1362 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1364 /* Calculate temporary vectorial force */
1365 tx = _mm256_mul_pd(fscal,dx23);
1366 ty = _mm256_mul_pd(fscal,dy23);
1367 tz = _mm256_mul_pd(fscal,dz23);
1369 /* Update vectorial force */
1370 fix2 = _mm256_add_pd(fix2,tx);
1371 fiy2 = _mm256_add_pd(fiy2,ty);
1372 fiz2 = _mm256_add_pd(fiz2,tz);
1374 fjx3 = _mm256_add_pd(fjx3,tx);
1375 fjy3 = _mm256_add_pd(fjy3,ty);
1376 fjz3 = _mm256_add_pd(fjz3,tz);
1380 /**************************
1381 * CALCULATE INTERACTIONS *
1382 **************************/
1384 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1387 r31 = _mm256_mul_pd(rsq31,rinv31);
1388 r31 = _mm256_andnot_pd(dummy_mask,r31);
1390 /* EWALD ELECTROSTATICS */
1392 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1393 ewrt = _mm256_mul_pd(r31,ewtabscale);
1394 ewitab = _mm256_cvttpd_epi32(ewrt);
1395 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1396 ewitab = _mm_slli_epi32(ewitab,2);
1397 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1398 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1399 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1400 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1401 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1402 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1403 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1404 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1405 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1407 d = _mm256_sub_pd(r31,rswitch);
1408 d = _mm256_max_pd(d,_mm256_setzero_pd());
1409 d2 = _mm256_mul_pd(d,d);
1410 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1412 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1414 /* Evaluate switch function */
1415 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1416 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
1417 velec = _mm256_mul_pd(velec,sw);
1418 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1420 /* Update potential sum for this i atom from the interaction with this j atom. */
1421 velec = _mm256_and_pd(velec,cutoff_mask);
1422 velec = _mm256_andnot_pd(dummy_mask,velec);
1423 velecsum = _mm256_add_pd(velecsum,velec);
1427 fscal = _mm256_and_pd(fscal,cutoff_mask);
1429 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1431 /* Calculate temporary vectorial force */
1432 tx = _mm256_mul_pd(fscal,dx31);
1433 ty = _mm256_mul_pd(fscal,dy31);
1434 tz = _mm256_mul_pd(fscal,dz31);
1436 /* Update vectorial force */
1437 fix3 = _mm256_add_pd(fix3,tx);
1438 fiy3 = _mm256_add_pd(fiy3,ty);
1439 fiz3 = _mm256_add_pd(fiz3,tz);
1441 fjx1 = _mm256_add_pd(fjx1,tx);
1442 fjy1 = _mm256_add_pd(fjy1,ty);
1443 fjz1 = _mm256_add_pd(fjz1,tz);
1447 /**************************
1448 * CALCULATE INTERACTIONS *
1449 **************************/
1451 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1454 r32 = _mm256_mul_pd(rsq32,rinv32);
1455 r32 = _mm256_andnot_pd(dummy_mask,r32);
1457 /* EWALD ELECTROSTATICS */
1459 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1460 ewrt = _mm256_mul_pd(r32,ewtabscale);
1461 ewitab = _mm256_cvttpd_epi32(ewrt);
1462 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1463 ewitab = _mm_slli_epi32(ewitab,2);
1464 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1465 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1466 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1467 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1468 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1469 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1470 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1471 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1472 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1474 d = _mm256_sub_pd(r32,rswitch);
1475 d = _mm256_max_pd(d,_mm256_setzero_pd());
1476 d2 = _mm256_mul_pd(d,d);
1477 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1479 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1481 /* Evaluate switch function */
1482 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1483 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
1484 velec = _mm256_mul_pd(velec,sw);
1485 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1487 /* Update potential sum for this i atom from the interaction with this j atom. */
1488 velec = _mm256_and_pd(velec,cutoff_mask);
1489 velec = _mm256_andnot_pd(dummy_mask,velec);
1490 velecsum = _mm256_add_pd(velecsum,velec);
1494 fscal = _mm256_and_pd(fscal,cutoff_mask);
1496 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1498 /* Calculate temporary vectorial force */
1499 tx = _mm256_mul_pd(fscal,dx32);
1500 ty = _mm256_mul_pd(fscal,dy32);
1501 tz = _mm256_mul_pd(fscal,dz32);
1503 /* Update vectorial force */
1504 fix3 = _mm256_add_pd(fix3,tx);
1505 fiy3 = _mm256_add_pd(fiy3,ty);
1506 fiz3 = _mm256_add_pd(fiz3,tz);
1508 fjx2 = _mm256_add_pd(fjx2,tx);
1509 fjy2 = _mm256_add_pd(fjy2,ty);
1510 fjz2 = _mm256_add_pd(fjz2,tz);
1514 /**************************
1515 * CALCULATE INTERACTIONS *
1516 **************************/
1518 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1521 r33 = _mm256_mul_pd(rsq33,rinv33);
1522 r33 = _mm256_andnot_pd(dummy_mask,r33);
1524 /* EWALD ELECTROSTATICS */
1526 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1527 ewrt = _mm256_mul_pd(r33,ewtabscale);
1528 ewitab = _mm256_cvttpd_epi32(ewrt);
1529 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1530 ewitab = _mm_slli_epi32(ewitab,2);
1531 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1532 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1533 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1534 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1535 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1536 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1537 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1538 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1539 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1541 d = _mm256_sub_pd(r33,rswitch);
1542 d = _mm256_max_pd(d,_mm256_setzero_pd());
1543 d2 = _mm256_mul_pd(d,d);
1544 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1546 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1548 /* Evaluate switch function */
1549 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1550 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
1551 velec = _mm256_mul_pd(velec,sw);
1552 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1554 /* Update potential sum for this i atom from the interaction with this j atom. */
1555 velec = _mm256_and_pd(velec,cutoff_mask);
1556 velec = _mm256_andnot_pd(dummy_mask,velec);
1557 velecsum = _mm256_add_pd(velecsum,velec);
1561 fscal = _mm256_and_pd(fscal,cutoff_mask);
1563 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1565 /* Calculate temporary vectorial force */
1566 tx = _mm256_mul_pd(fscal,dx33);
1567 ty = _mm256_mul_pd(fscal,dy33);
1568 tz = _mm256_mul_pd(fscal,dz33);
1570 /* Update vectorial force */
1571 fix3 = _mm256_add_pd(fix3,tx);
1572 fiy3 = _mm256_add_pd(fiy3,ty);
1573 fiz3 = _mm256_add_pd(fiz3,tz);
1575 fjx3 = _mm256_add_pd(fjx3,tx);
1576 fjy3 = _mm256_add_pd(fjy3,ty);
1577 fjz3 = _mm256_add_pd(fjz3,tz);
1581 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1582 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1583 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1584 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1586 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1587 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1589 /* Inner loop uses 594 flops */
1592 /* End of innermost loop */
1594 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1595 f+i_coord_offset+DIM,fshift+i_shift_offset);
1598 /* Update potential energies */
1599 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1601 /* Increment number of inner iterations */
1602 inneriter += j_index_end - j_index_start;
1604 /* Outer loop uses 19 flops */
1607 /* Increment number of outer iterations */
1610 /* Update outer/inner flops */
1612 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594);
1615 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double
1616 * Electrostatics interaction: Ewald
1617 * VdW interaction: None
1618 * Geometry: Water4-Water4
1619 * Calculate force/pot: Force
1622 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double
1623 (t_nblist * gmx_restrict nlist,
1624 rvec * gmx_restrict xx,
1625 rvec * gmx_restrict ff,
1626 t_forcerec * gmx_restrict fr,
1627 t_mdatoms * gmx_restrict mdatoms,
1628 nb_kernel_data_t * gmx_restrict kernel_data,
1629 t_nrnb * gmx_restrict nrnb)
1631 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1632 * just 0 for non-waters.
1633 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1634 * jnr indices corresponding to data put in the four positions in the SIMD register.
1636 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1637 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1638 int jnrA,jnrB,jnrC,jnrD;
1639 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1640 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1641 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1642 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1643 real rcutoff_scalar;
1644 real *shiftvec,*fshift,*x,*f;
1645 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1646 real scratch[4*DIM];
1647 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1648 real * vdwioffsetptr1;
1649 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1650 real * vdwioffsetptr2;
1651 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1652 real * vdwioffsetptr3;
1653 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1654 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1655 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1656 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1657 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1658 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1659 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1660 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1661 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1662 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1663 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1664 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1665 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1666 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1667 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1668 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1669 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1672 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1673 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1675 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1676 real rswitch_scalar,d_scalar;
1677 __m256d dummy_mask,cutoff_mask;
1678 __m128 tmpmask0,tmpmask1;
1679 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1680 __m256d one = _mm256_set1_pd(1.0);
1681 __m256d two = _mm256_set1_pd(2.0);
1687 jindex = nlist->jindex;
1689 shiftidx = nlist->shift;
1691 shiftvec = fr->shift_vec[0];
1692 fshift = fr->fshift[0];
1693 facel = _mm256_set1_pd(fr->epsfac);
1694 charge = mdatoms->chargeA;
1696 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1697 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1698 beta2 = _mm256_mul_pd(beta,beta);
1699 beta3 = _mm256_mul_pd(beta,beta2);
1701 ewtab = fr->ic->tabq_coul_FDV0;
1702 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1703 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1705 /* Setup water-specific parameters */
1706 inr = nlist->iinr[0];
1707 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1708 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1709 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1711 jq1 = _mm256_set1_pd(charge[inr+1]);
1712 jq2 = _mm256_set1_pd(charge[inr+2]);
1713 jq3 = _mm256_set1_pd(charge[inr+3]);
1714 qq11 = _mm256_mul_pd(iq1,jq1);
1715 qq12 = _mm256_mul_pd(iq1,jq2);
1716 qq13 = _mm256_mul_pd(iq1,jq3);
1717 qq21 = _mm256_mul_pd(iq2,jq1);
1718 qq22 = _mm256_mul_pd(iq2,jq2);
1719 qq23 = _mm256_mul_pd(iq2,jq3);
1720 qq31 = _mm256_mul_pd(iq3,jq1);
1721 qq32 = _mm256_mul_pd(iq3,jq2);
1722 qq33 = _mm256_mul_pd(iq3,jq3);
1724 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1725 rcutoff_scalar = fr->rcoulomb;
1726 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1727 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1729 rswitch_scalar = fr->rcoulomb_switch;
1730 rswitch = _mm256_set1_pd(rswitch_scalar);
1731 /* Setup switch parameters */
1732 d_scalar = rcutoff_scalar-rswitch_scalar;
1733 d = _mm256_set1_pd(d_scalar);
1734 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1735 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1736 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1737 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1738 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1739 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1741 /* Avoid stupid compiler warnings */
1742 jnrA = jnrB = jnrC = jnrD = 0;
1743 j_coord_offsetA = 0;
1744 j_coord_offsetB = 0;
1745 j_coord_offsetC = 0;
1746 j_coord_offsetD = 0;
1751 for(iidx=0;iidx<4*DIM;iidx++)
1753 scratch[iidx] = 0.0;
1756 /* Start outer loop over neighborlists */
1757 for(iidx=0; iidx<nri; iidx++)
1759 /* Load shift vector for this list */
1760 i_shift_offset = DIM*shiftidx[iidx];
1762 /* Load limits for loop over neighbors */
1763 j_index_start = jindex[iidx];
1764 j_index_end = jindex[iidx+1];
1766 /* Get outer coordinate index */
1768 i_coord_offset = DIM*inr;
1770 /* Load i particle coords and add shift vector */
1771 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1772 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1774 fix1 = _mm256_setzero_pd();
1775 fiy1 = _mm256_setzero_pd();
1776 fiz1 = _mm256_setzero_pd();
1777 fix2 = _mm256_setzero_pd();
1778 fiy2 = _mm256_setzero_pd();
1779 fiz2 = _mm256_setzero_pd();
1780 fix3 = _mm256_setzero_pd();
1781 fiy3 = _mm256_setzero_pd();
1782 fiz3 = _mm256_setzero_pd();
1784 /* Start inner kernel loop */
1785 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1788 /* Get j neighbor index, and coordinate index */
1790 jnrB = jjnr[jidx+1];
1791 jnrC = jjnr[jidx+2];
1792 jnrD = jjnr[jidx+3];
1793 j_coord_offsetA = DIM*jnrA;
1794 j_coord_offsetB = DIM*jnrB;
1795 j_coord_offsetC = DIM*jnrC;
1796 j_coord_offsetD = DIM*jnrD;
1798 /* load j atom coordinates */
1799 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1800 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1801 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1803 /* Calculate displacement vector */
1804 dx11 = _mm256_sub_pd(ix1,jx1);
1805 dy11 = _mm256_sub_pd(iy1,jy1);
1806 dz11 = _mm256_sub_pd(iz1,jz1);
1807 dx12 = _mm256_sub_pd(ix1,jx2);
1808 dy12 = _mm256_sub_pd(iy1,jy2);
1809 dz12 = _mm256_sub_pd(iz1,jz2);
1810 dx13 = _mm256_sub_pd(ix1,jx3);
1811 dy13 = _mm256_sub_pd(iy1,jy3);
1812 dz13 = _mm256_sub_pd(iz1,jz3);
1813 dx21 = _mm256_sub_pd(ix2,jx1);
1814 dy21 = _mm256_sub_pd(iy2,jy1);
1815 dz21 = _mm256_sub_pd(iz2,jz1);
1816 dx22 = _mm256_sub_pd(ix2,jx2);
1817 dy22 = _mm256_sub_pd(iy2,jy2);
1818 dz22 = _mm256_sub_pd(iz2,jz2);
1819 dx23 = _mm256_sub_pd(ix2,jx3);
1820 dy23 = _mm256_sub_pd(iy2,jy3);
1821 dz23 = _mm256_sub_pd(iz2,jz3);
1822 dx31 = _mm256_sub_pd(ix3,jx1);
1823 dy31 = _mm256_sub_pd(iy3,jy1);
1824 dz31 = _mm256_sub_pd(iz3,jz1);
1825 dx32 = _mm256_sub_pd(ix3,jx2);
1826 dy32 = _mm256_sub_pd(iy3,jy2);
1827 dz32 = _mm256_sub_pd(iz3,jz2);
1828 dx33 = _mm256_sub_pd(ix3,jx3);
1829 dy33 = _mm256_sub_pd(iy3,jy3);
1830 dz33 = _mm256_sub_pd(iz3,jz3);
1832 /* Calculate squared distance and things based on it */
1833 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1834 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1835 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1836 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1837 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1838 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1839 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1840 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1841 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1843 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1844 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1845 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1846 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1847 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1848 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1849 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1850 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1851 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1853 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1854 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1855 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1856 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1857 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1858 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1859 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1860 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1861 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1863 fjx1 = _mm256_setzero_pd();
1864 fjy1 = _mm256_setzero_pd();
1865 fjz1 = _mm256_setzero_pd();
1866 fjx2 = _mm256_setzero_pd();
1867 fjy2 = _mm256_setzero_pd();
1868 fjz2 = _mm256_setzero_pd();
1869 fjx3 = _mm256_setzero_pd();
1870 fjy3 = _mm256_setzero_pd();
1871 fjz3 = _mm256_setzero_pd();
1873 /**************************
1874 * CALCULATE INTERACTIONS *
1875 **************************/
1877 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1880 r11 = _mm256_mul_pd(rsq11,rinv11);
1882 /* EWALD ELECTROSTATICS */
1884 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1885 ewrt = _mm256_mul_pd(r11,ewtabscale);
1886 ewitab = _mm256_cvttpd_epi32(ewrt);
1887 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1888 ewitab = _mm_slli_epi32(ewitab,2);
1889 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1890 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1891 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1892 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1893 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1894 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1895 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1896 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1897 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1899 d = _mm256_sub_pd(r11,rswitch);
1900 d = _mm256_max_pd(d,_mm256_setzero_pd());
1901 d2 = _mm256_mul_pd(d,d);
1902 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1904 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1906 /* Evaluate switch function */
1907 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1908 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1909 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1913 fscal = _mm256_and_pd(fscal,cutoff_mask);
1915 /* Calculate temporary vectorial force */
1916 tx = _mm256_mul_pd(fscal,dx11);
1917 ty = _mm256_mul_pd(fscal,dy11);
1918 tz = _mm256_mul_pd(fscal,dz11);
1920 /* Update vectorial force */
1921 fix1 = _mm256_add_pd(fix1,tx);
1922 fiy1 = _mm256_add_pd(fiy1,ty);
1923 fiz1 = _mm256_add_pd(fiz1,tz);
1925 fjx1 = _mm256_add_pd(fjx1,tx);
1926 fjy1 = _mm256_add_pd(fjy1,ty);
1927 fjz1 = _mm256_add_pd(fjz1,tz);
1931 /**************************
1932 * CALCULATE INTERACTIONS *
1933 **************************/
1935 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1938 r12 = _mm256_mul_pd(rsq12,rinv12);
1940 /* EWALD ELECTROSTATICS */
1942 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1943 ewrt = _mm256_mul_pd(r12,ewtabscale);
1944 ewitab = _mm256_cvttpd_epi32(ewrt);
1945 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1946 ewitab = _mm_slli_epi32(ewitab,2);
1947 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1948 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1949 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1950 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1951 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1952 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1953 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1954 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1955 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1957 d = _mm256_sub_pd(r12,rswitch);
1958 d = _mm256_max_pd(d,_mm256_setzero_pd());
1959 d2 = _mm256_mul_pd(d,d);
1960 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1962 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1964 /* Evaluate switch function */
1965 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1966 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1967 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1971 fscal = _mm256_and_pd(fscal,cutoff_mask);
1973 /* Calculate temporary vectorial force */
1974 tx = _mm256_mul_pd(fscal,dx12);
1975 ty = _mm256_mul_pd(fscal,dy12);
1976 tz = _mm256_mul_pd(fscal,dz12);
1978 /* Update vectorial force */
1979 fix1 = _mm256_add_pd(fix1,tx);
1980 fiy1 = _mm256_add_pd(fiy1,ty);
1981 fiz1 = _mm256_add_pd(fiz1,tz);
1983 fjx2 = _mm256_add_pd(fjx2,tx);
1984 fjy2 = _mm256_add_pd(fjy2,ty);
1985 fjz2 = _mm256_add_pd(fjz2,tz);
1989 /**************************
1990 * CALCULATE INTERACTIONS *
1991 **************************/
1993 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1996 r13 = _mm256_mul_pd(rsq13,rinv13);
1998 /* EWALD ELECTROSTATICS */
2000 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2001 ewrt = _mm256_mul_pd(r13,ewtabscale);
2002 ewitab = _mm256_cvttpd_epi32(ewrt);
2003 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2004 ewitab = _mm_slli_epi32(ewitab,2);
2005 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2006 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2007 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2008 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2009 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2010 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2011 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2012 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2013 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2015 d = _mm256_sub_pd(r13,rswitch);
2016 d = _mm256_max_pd(d,_mm256_setzero_pd());
2017 d2 = _mm256_mul_pd(d,d);
2018 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2020 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2022 /* Evaluate switch function */
2023 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2024 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2025 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2029 fscal = _mm256_and_pd(fscal,cutoff_mask);
2031 /* Calculate temporary vectorial force */
2032 tx = _mm256_mul_pd(fscal,dx13);
2033 ty = _mm256_mul_pd(fscal,dy13);
2034 tz = _mm256_mul_pd(fscal,dz13);
2036 /* Update vectorial force */
2037 fix1 = _mm256_add_pd(fix1,tx);
2038 fiy1 = _mm256_add_pd(fiy1,ty);
2039 fiz1 = _mm256_add_pd(fiz1,tz);
2041 fjx3 = _mm256_add_pd(fjx3,tx);
2042 fjy3 = _mm256_add_pd(fjy3,ty);
2043 fjz3 = _mm256_add_pd(fjz3,tz);
2047 /**************************
2048 * CALCULATE INTERACTIONS *
2049 **************************/
2051 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2054 r21 = _mm256_mul_pd(rsq21,rinv21);
2056 /* EWALD ELECTROSTATICS */
2058 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2059 ewrt = _mm256_mul_pd(r21,ewtabscale);
2060 ewitab = _mm256_cvttpd_epi32(ewrt);
2061 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2062 ewitab = _mm_slli_epi32(ewitab,2);
2063 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2064 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2065 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2066 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2067 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2068 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2069 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2070 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2071 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2073 d = _mm256_sub_pd(r21,rswitch);
2074 d = _mm256_max_pd(d,_mm256_setzero_pd());
2075 d2 = _mm256_mul_pd(d,d);
2076 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2078 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2080 /* Evaluate switch function */
2081 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2082 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2083 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2087 fscal = _mm256_and_pd(fscal,cutoff_mask);
2089 /* Calculate temporary vectorial force */
2090 tx = _mm256_mul_pd(fscal,dx21);
2091 ty = _mm256_mul_pd(fscal,dy21);
2092 tz = _mm256_mul_pd(fscal,dz21);
2094 /* Update vectorial force */
2095 fix2 = _mm256_add_pd(fix2,tx);
2096 fiy2 = _mm256_add_pd(fiy2,ty);
2097 fiz2 = _mm256_add_pd(fiz2,tz);
2099 fjx1 = _mm256_add_pd(fjx1,tx);
2100 fjy1 = _mm256_add_pd(fjy1,ty);
2101 fjz1 = _mm256_add_pd(fjz1,tz);
2105 /**************************
2106 * CALCULATE INTERACTIONS *
2107 **************************/
2109 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2112 r22 = _mm256_mul_pd(rsq22,rinv22);
2114 /* EWALD ELECTROSTATICS */
2116 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2117 ewrt = _mm256_mul_pd(r22,ewtabscale);
2118 ewitab = _mm256_cvttpd_epi32(ewrt);
2119 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2120 ewitab = _mm_slli_epi32(ewitab,2);
2121 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2122 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2123 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2124 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2125 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2126 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2127 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2128 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2129 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2131 d = _mm256_sub_pd(r22,rswitch);
2132 d = _mm256_max_pd(d,_mm256_setzero_pd());
2133 d2 = _mm256_mul_pd(d,d);
2134 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2136 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2138 /* Evaluate switch function */
2139 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2140 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2141 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2145 fscal = _mm256_and_pd(fscal,cutoff_mask);
2147 /* Calculate temporary vectorial force */
2148 tx = _mm256_mul_pd(fscal,dx22);
2149 ty = _mm256_mul_pd(fscal,dy22);
2150 tz = _mm256_mul_pd(fscal,dz22);
2152 /* Update vectorial force */
2153 fix2 = _mm256_add_pd(fix2,tx);
2154 fiy2 = _mm256_add_pd(fiy2,ty);
2155 fiz2 = _mm256_add_pd(fiz2,tz);
2157 fjx2 = _mm256_add_pd(fjx2,tx);
2158 fjy2 = _mm256_add_pd(fjy2,ty);
2159 fjz2 = _mm256_add_pd(fjz2,tz);
2163 /**************************
2164 * CALCULATE INTERACTIONS *
2165 **************************/
2167 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2170 r23 = _mm256_mul_pd(rsq23,rinv23);
2172 /* EWALD ELECTROSTATICS */
2174 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2175 ewrt = _mm256_mul_pd(r23,ewtabscale);
2176 ewitab = _mm256_cvttpd_epi32(ewrt);
2177 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2178 ewitab = _mm_slli_epi32(ewitab,2);
2179 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2180 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2181 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2182 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2183 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2184 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2185 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2186 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2187 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2189 d = _mm256_sub_pd(r23,rswitch);
2190 d = _mm256_max_pd(d,_mm256_setzero_pd());
2191 d2 = _mm256_mul_pd(d,d);
2192 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2194 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2196 /* Evaluate switch function */
2197 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2198 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2199 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2203 fscal = _mm256_and_pd(fscal,cutoff_mask);
2205 /* Calculate temporary vectorial force */
2206 tx = _mm256_mul_pd(fscal,dx23);
2207 ty = _mm256_mul_pd(fscal,dy23);
2208 tz = _mm256_mul_pd(fscal,dz23);
2210 /* Update vectorial force */
2211 fix2 = _mm256_add_pd(fix2,tx);
2212 fiy2 = _mm256_add_pd(fiy2,ty);
2213 fiz2 = _mm256_add_pd(fiz2,tz);
2215 fjx3 = _mm256_add_pd(fjx3,tx);
2216 fjy3 = _mm256_add_pd(fjy3,ty);
2217 fjz3 = _mm256_add_pd(fjz3,tz);
2221 /**************************
2222 * CALCULATE INTERACTIONS *
2223 **************************/
2225 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2228 r31 = _mm256_mul_pd(rsq31,rinv31);
2230 /* EWALD ELECTROSTATICS */
2232 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2233 ewrt = _mm256_mul_pd(r31,ewtabscale);
2234 ewitab = _mm256_cvttpd_epi32(ewrt);
2235 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2236 ewitab = _mm_slli_epi32(ewitab,2);
2237 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2238 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2239 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2240 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2241 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2242 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2243 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2244 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2245 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2247 d = _mm256_sub_pd(r31,rswitch);
2248 d = _mm256_max_pd(d,_mm256_setzero_pd());
2249 d2 = _mm256_mul_pd(d,d);
2250 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2252 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2254 /* Evaluate switch function */
2255 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2256 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2257 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2261 fscal = _mm256_and_pd(fscal,cutoff_mask);
2263 /* Calculate temporary vectorial force */
2264 tx = _mm256_mul_pd(fscal,dx31);
2265 ty = _mm256_mul_pd(fscal,dy31);
2266 tz = _mm256_mul_pd(fscal,dz31);
2268 /* Update vectorial force */
2269 fix3 = _mm256_add_pd(fix3,tx);
2270 fiy3 = _mm256_add_pd(fiy3,ty);
2271 fiz3 = _mm256_add_pd(fiz3,tz);
2273 fjx1 = _mm256_add_pd(fjx1,tx);
2274 fjy1 = _mm256_add_pd(fjy1,ty);
2275 fjz1 = _mm256_add_pd(fjz1,tz);
2279 /**************************
2280 * CALCULATE INTERACTIONS *
2281 **************************/
2283 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2286 r32 = _mm256_mul_pd(rsq32,rinv32);
2288 /* EWALD ELECTROSTATICS */
2290 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2291 ewrt = _mm256_mul_pd(r32,ewtabscale);
2292 ewitab = _mm256_cvttpd_epi32(ewrt);
2293 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2294 ewitab = _mm_slli_epi32(ewitab,2);
2295 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2296 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2297 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2298 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2299 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2300 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2301 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2302 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2303 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2305 d = _mm256_sub_pd(r32,rswitch);
2306 d = _mm256_max_pd(d,_mm256_setzero_pd());
2307 d2 = _mm256_mul_pd(d,d);
2308 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2310 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2312 /* Evaluate switch function */
2313 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2314 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2315 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2319 fscal = _mm256_and_pd(fscal,cutoff_mask);
2321 /* Calculate temporary vectorial force */
2322 tx = _mm256_mul_pd(fscal,dx32);
2323 ty = _mm256_mul_pd(fscal,dy32);
2324 tz = _mm256_mul_pd(fscal,dz32);
2326 /* Update vectorial force */
2327 fix3 = _mm256_add_pd(fix3,tx);
2328 fiy3 = _mm256_add_pd(fiy3,ty);
2329 fiz3 = _mm256_add_pd(fiz3,tz);
2331 fjx2 = _mm256_add_pd(fjx2,tx);
2332 fjy2 = _mm256_add_pd(fjy2,ty);
2333 fjz2 = _mm256_add_pd(fjz2,tz);
2337 /**************************
2338 * CALCULATE INTERACTIONS *
2339 **************************/
2341 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2344 r33 = _mm256_mul_pd(rsq33,rinv33);
2346 /* EWALD ELECTROSTATICS */
2348 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2349 ewrt = _mm256_mul_pd(r33,ewtabscale);
2350 ewitab = _mm256_cvttpd_epi32(ewrt);
2351 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2352 ewitab = _mm_slli_epi32(ewitab,2);
2353 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2354 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2355 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2356 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2357 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2358 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2359 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2360 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
2361 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2363 d = _mm256_sub_pd(r33,rswitch);
2364 d = _mm256_max_pd(d,_mm256_setzero_pd());
2365 d2 = _mm256_mul_pd(d,d);
2366 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2368 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2370 /* Evaluate switch function */
2371 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2372 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
2373 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2377 fscal = _mm256_and_pd(fscal,cutoff_mask);
2379 /* Calculate temporary vectorial force */
2380 tx = _mm256_mul_pd(fscal,dx33);
2381 ty = _mm256_mul_pd(fscal,dy33);
2382 tz = _mm256_mul_pd(fscal,dz33);
2384 /* Update vectorial force */
2385 fix3 = _mm256_add_pd(fix3,tx);
2386 fiy3 = _mm256_add_pd(fiy3,ty);
2387 fiz3 = _mm256_add_pd(fiz3,tz);
2389 fjx3 = _mm256_add_pd(fjx3,tx);
2390 fjy3 = _mm256_add_pd(fjy3,ty);
2391 fjz3 = _mm256_add_pd(fjz3,tz);
2395 fjptrA = f+j_coord_offsetA;
2396 fjptrB = f+j_coord_offsetB;
2397 fjptrC = f+j_coord_offsetC;
2398 fjptrD = f+j_coord_offsetD;
2400 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2401 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2403 /* Inner loop uses 558 flops */
2406 if(jidx<j_index_end)
2409 /* Get j neighbor index, and coordinate index */
2410 jnrlistA = jjnr[jidx];
2411 jnrlistB = jjnr[jidx+1];
2412 jnrlistC = jjnr[jidx+2];
2413 jnrlistD = jjnr[jidx+3];
2414 /* Sign of each element will be negative for non-real atoms.
2415 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2416 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2418 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2420 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2421 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2422 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2424 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2425 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2426 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2427 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2428 j_coord_offsetA = DIM*jnrA;
2429 j_coord_offsetB = DIM*jnrB;
2430 j_coord_offsetC = DIM*jnrC;
2431 j_coord_offsetD = DIM*jnrD;
2433 /* load j atom coordinates */
2434 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2435 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2436 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2438 /* Calculate displacement vector */
2439 dx11 = _mm256_sub_pd(ix1,jx1);
2440 dy11 = _mm256_sub_pd(iy1,jy1);
2441 dz11 = _mm256_sub_pd(iz1,jz1);
2442 dx12 = _mm256_sub_pd(ix1,jx2);
2443 dy12 = _mm256_sub_pd(iy1,jy2);
2444 dz12 = _mm256_sub_pd(iz1,jz2);
2445 dx13 = _mm256_sub_pd(ix1,jx3);
2446 dy13 = _mm256_sub_pd(iy1,jy3);
2447 dz13 = _mm256_sub_pd(iz1,jz3);
2448 dx21 = _mm256_sub_pd(ix2,jx1);
2449 dy21 = _mm256_sub_pd(iy2,jy1);
2450 dz21 = _mm256_sub_pd(iz2,jz1);
2451 dx22 = _mm256_sub_pd(ix2,jx2);
2452 dy22 = _mm256_sub_pd(iy2,jy2);
2453 dz22 = _mm256_sub_pd(iz2,jz2);
2454 dx23 = _mm256_sub_pd(ix2,jx3);
2455 dy23 = _mm256_sub_pd(iy2,jy3);
2456 dz23 = _mm256_sub_pd(iz2,jz3);
2457 dx31 = _mm256_sub_pd(ix3,jx1);
2458 dy31 = _mm256_sub_pd(iy3,jy1);
2459 dz31 = _mm256_sub_pd(iz3,jz1);
2460 dx32 = _mm256_sub_pd(ix3,jx2);
2461 dy32 = _mm256_sub_pd(iy3,jy2);
2462 dz32 = _mm256_sub_pd(iz3,jz2);
2463 dx33 = _mm256_sub_pd(ix3,jx3);
2464 dy33 = _mm256_sub_pd(iy3,jy3);
2465 dz33 = _mm256_sub_pd(iz3,jz3);
2467 /* Calculate squared distance and things based on it */
2468 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2469 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2470 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2471 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2472 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2473 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2474 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2475 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2476 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2478 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2479 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2480 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2481 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2482 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2483 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2484 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2485 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2486 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2488 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2489 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2490 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2491 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2492 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2493 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2494 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2495 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2496 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2498 fjx1 = _mm256_setzero_pd();
2499 fjy1 = _mm256_setzero_pd();
2500 fjz1 = _mm256_setzero_pd();
2501 fjx2 = _mm256_setzero_pd();
2502 fjy2 = _mm256_setzero_pd();
2503 fjz2 = _mm256_setzero_pd();
2504 fjx3 = _mm256_setzero_pd();
2505 fjy3 = _mm256_setzero_pd();
2506 fjz3 = _mm256_setzero_pd();
2508 /**************************
2509 * CALCULATE INTERACTIONS *
2510 **************************/
2512 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2515 r11 = _mm256_mul_pd(rsq11,rinv11);
2516 r11 = _mm256_andnot_pd(dummy_mask,r11);
2518 /* EWALD ELECTROSTATICS */
2520 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2521 ewrt = _mm256_mul_pd(r11,ewtabscale);
2522 ewitab = _mm256_cvttpd_epi32(ewrt);
2523 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2524 ewitab = _mm_slli_epi32(ewitab,2);
2525 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2526 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2527 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2528 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2529 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2530 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2531 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2532 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2533 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2535 d = _mm256_sub_pd(r11,rswitch);
2536 d = _mm256_max_pd(d,_mm256_setzero_pd());
2537 d2 = _mm256_mul_pd(d,d);
2538 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2540 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2542 /* Evaluate switch function */
2543 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2544 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2545 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2549 fscal = _mm256_and_pd(fscal,cutoff_mask);
2551 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2553 /* Calculate temporary vectorial force */
2554 tx = _mm256_mul_pd(fscal,dx11);
2555 ty = _mm256_mul_pd(fscal,dy11);
2556 tz = _mm256_mul_pd(fscal,dz11);
2558 /* Update vectorial force */
2559 fix1 = _mm256_add_pd(fix1,tx);
2560 fiy1 = _mm256_add_pd(fiy1,ty);
2561 fiz1 = _mm256_add_pd(fiz1,tz);
2563 fjx1 = _mm256_add_pd(fjx1,tx);
2564 fjy1 = _mm256_add_pd(fjy1,ty);
2565 fjz1 = _mm256_add_pd(fjz1,tz);
2569 /**************************
2570 * CALCULATE INTERACTIONS *
2571 **************************/
2573 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2576 r12 = _mm256_mul_pd(rsq12,rinv12);
2577 r12 = _mm256_andnot_pd(dummy_mask,r12);
2579 /* EWALD ELECTROSTATICS */
2581 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2582 ewrt = _mm256_mul_pd(r12,ewtabscale);
2583 ewitab = _mm256_cvttpd_epi32(ewrt);
2584 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2585 ewitab = _mm_slli_epi32(ewitab,2);
2586 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2587 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2588 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2589 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2590 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2591 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2592 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2593 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2594 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2596 d = _mm256_sub_pd(r12,rswitch);
2597 d = _mm256_max_pd(d,_mm256_setzero_pd());
2598 d2 = _mm256_mul_pd(d,d);
2599 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2601 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2603 /* Evaluate switch function */
2604 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2605 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2606 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2610 fscal = _mm256_and_pd(fscal,cutoff_mask);
2612 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2614 /* Calculate temporary vectorial force */
2615 tx = _mm256_mul_pd(fscal,dx12);
2616 ty = _mm256_mul_pd(fscal,dy12);
2617 tz = _mm256_mul_pd(fscal,dz12);
2619 /* Update vectorial force */
2620 fix1 = _mm256_add_pd(fix1,tx);
2621 fiy1 = _mm256_add_pd(fiy1,ty);
2622 fiz1 = _mm256_add_pd(fiz1,tz);
2624 fjx2 = _mm256_add_pd(fjx2,tx);
2625 fjy2 = _mm256_add_pd(fjy2,ty);
2626 fjz2 = _mm256_add_pd(fjz2,tz);
2630 /**************************
2631 * CALCULATE INTERACTIONS *
2632 **************************/
2634 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2637 r13 = _mm256_mul_pd(rsq13,rinv13);
2638 r13 = _mm256_andnot_pd(dummy_mask,r13);
2640 /* EWALD ELECTROSTATICS */
2642 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2643 ewrt = _mm256_mul_pd(r13,ewtabscale);
2644 ewitab = _mm256_cvttpd_epi32(ewrt);
2645 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2646 ewitab = _mm_slli_epi32(ewitab,2);
2647 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2648 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2649 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2650 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2651 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2652 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2653 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2654 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2655 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2657 d = _mm256_sub_pd(r13,rswitch);
2658 d = _mm256_max_pd(d,_mm256_setzero_pd());
2659 d2 = _mm256_mul_pd(d,d);
2660 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2662 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2664 /* Evaluate switch function */
2665 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2666 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2667 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2671 fscal = _mm256_and_pd(fscal,cutoff_mask);
2673 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2675 /* Calculate temporary vectorial force */
2676 tx = _mm256_mul_pd(fscal,dx13);
2677 ty = _mm256_mul_pd(fscal,dy13);
2678 tz = _mm256_mul_pd(fscal,dz13);
2680 /* Update vectorial force */
2681 fix1 = _mm256_add_pd(fix1,tx);
2682 fiy1 = _mm256_add_pd(fiy1,ty);
2683 fiz1 = _mm256_add_pd(fiz1,tz);
2685 fjx3 = _mm256_add_pd(fjx3,tx);
2686 fjy3 = _mm256_add_pd(fjy3,ty);
2687 fjz3 = _mm256_add_pd(fjz3,tz);
2691 /**************************
2692 * CALCULATE INTERACTIONS *
2693 **************************/
2695 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2698 r21 = _mm256_mul_pd(rsq21,rinv21);
2699 r21 = _mm256_andnot_pd(dummy_mask,r21);
2701 /* EWALD ELECTROSTATICS */
2703 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2704 ewrt = _mm256_mul_pd(r21,ewtabscale);
2705 ewitab = _mm256_cvttpd_epi32(ewrt);
2706 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2707 ewitab = _mm_slli_epi32(ewitab,2);
2708 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2709 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2710 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2711 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2712 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2713 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2714 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2715 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2716 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2718 d = _mm256_sub_pd(r21,rswitch);
2719 d = _mm256_max_pd(d,_mm256_setzero_pd());
2720 d2 = _mm256_mul_pd(d,d);
2721 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2723 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2725 /* Evaluate switch function */
2726 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2727 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2728 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2732 fscal = _mm256_and_pd(fscal,cutoff_mask);
2734 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2736 /* Calculate temporary vectorial force */
2737 tx = _mm256_mul_pd(fscal,dx21);
2738 ty = _mm256_mul_pd(fscal,dy21);
2739 tz = _mm256_mul_pd(fscal,dz21);
2741 /* Update vectorial force */
2742 fix2 = _mm256_add_pd(fix2,tx);
2743 fiy2 = _mm256_add_pd(fiy2,ty);
2744 fiz2 = _mm256_add_pd(fiz2,tz);
2746 fjx1 = _mm256_add_pd(fjx1,tx);
2747 fjy1 = _mm256_add_pd(fjy1,ty);
2748 fjz1 = _mm256_add_pd(fjz1,tz);
2752 /**************************
2753 * CALCULATE INTERACTIONS *
2754 **************************/
2756 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2759 r22 = _mm256_mul_pd(rsq22,rinv22);
2760 r22 = _mm256_andnot_pd(dummy_mask,r22);
2762 /* EWALD ELECTROSTATICS */
2764 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2765 ewrt = _mm256_mul_pd(r22,ewtabscale);
2766 ewitab = _mm256_cvttpd_epi32(ewrt);
2767 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2768 ewitab = _mm_slli_epi32(ewitab,2);
2769 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2770 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2771 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2772 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2773 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2774 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2775 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2776 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2777 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2779 d = _mm256_sub_pd(r22,rswitch);
2780 d = _mm256_max_pd(d,_mm256_setzero_pd());
2781 d2 = _mm256_mul_pd(d,d);
2782 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2784 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2786 /* Evaluate switch function */
2787 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2788 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2789 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2793 fscal = _mm256_and_pd(fscal,cutoff_mask);
2795 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2797 /* Calculate temporary vectorial force */
2798 tx = _mm256_mul_pd(fscal,dx22);
2799 ty = _mm256_mul_pd(fscal,dy22);
2800 tz = _mm256_mul_pd(fscal,dz22);
2802 /* Update vectorial force */
2803 fix2 = _mm256_add_pd(fix2,tx);
2804 fiy2 = _mm256_add_pd(fiy2,ty);
2805 fiz2 = _mm256_add_pd(fiz2,tz);
2807 fjx2 = _mm256_add_pd(fjx2,tx);
2808 fjy2 = _mm256_add_pd(fjy2,ty);
2809 fjz2 = _mm256_add_pd(fjz2,tz);
2813 /**************************
2814 * CALCULATE INTERACTIONS *
2815 **************************/
2817 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2820 r23 = _mm256_mul_pd(rsq23,rinv23);
2821 r23 = _mm256_andnot_pd(dummy_mask,r23);
2823 /* EWALD ELECTROSTATICS */
2825 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2826 ewrt = _mm256_mul_pd(r23,ewtabscale);
2827 ewitab = _mm256_cvttpd_epi32(ewrt);
2828 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2829 ewitab = _mm_slli_epi32(ewitab,2);
2830 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2831 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2832 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2833 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2834 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2835 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2836 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2837 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2838 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2840 d = _mm256_sub_pd(r23,rswitch);
2841 d = _mm256_max_pd(d,_mm256_setzero_pd());
2842 d2 = _mm256_mul_pd(d,d);
2843 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2845 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2847 /* Evaluate switch function */
2848 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2849 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2850 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2854 fscal = _mm256_and_pd(fscal,cutoff_mask);
2856 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2858 /* Calculate temporary vectorial force */
2859 tx = _mm256_mul_pd(fscal,dx23);
2860 ty = _mm256_mul_pd(fscal,dy23);
2861 tz = _mm256_mul_pd(fscal,dz23);
2863 /* Update vectorial force */
2864 fix2 = _mm256_add_pd(fix2,tx);
2865 fiy2 = _mm256_add_pd(fiy2,ty);
2866 fiz2 = _mm256_add_pd(fiz2,tz);
2868 fjx3 = _mm256_add_pd(fjx3,tx);
2869 fjy3 = _mm256_add_pd(fjy3,ty);
2870 fjz3 = _mm256_add_pd(fjz3,tz);
2874 /**************************
2875 * CALCULATE INTERACTIONS *
2876 **************************/
2878 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2881 r31 = _mm256_mul_pd(rsq31,rinv31);
2882 r31 = _mm256_andnot_pd(dummy_mask,r31);
2884 /* EWALD ELECTROSTATICS */
2886 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2887 ewrt = _mm256_mul_pd(r31,ewtabscale);
2888 ewitab = _mm256_cvttpd_epi32(ewrt);
2889 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2890 ewitab = _mm_slli_epi32(ewitab,2);
2891 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2892 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2893 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2894 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2895 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2896 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2897 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2898 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2899 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2901 d = _mm256_sub_pd(r31,rswitch);
2902 d = _mm256_max_pd(d,_mm256_setzero_pd());
2903 d2 = _mm256_mul_pd(d,d);
2904 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2906 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2908 /* Evaluate switch function */
2909 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2910 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2911 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2915 fscal = _mm256_and_pd(fscal,cutoff_mask);
2917 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2919 /* Calculate temporary vectorial force */
2920 tx = _mm256_mul_pd(fscal,dx31);
2921 ty = _mm256_mul_pd(fscal,dy31);
2922 tz = _mm256_mul_pd(fscal,dz31);
2924 /* Update vectorial force */
2925 fix3 = _mm256_add_pd(fix3,tx);
2926 fiy3 = _mm256_add_pd(fiy3,ty);
2927 fiz3 = _mm256_add_pd(fiz3,tz);
2929 fjx1 = _mm256_add_pd(fjx1,tx);
2930 fjy1 = _mm256_add_pd(fjy1,ty);
2931 fjz1 = _mm256_add_pd(fjz1,tz);
2935 /**************************
2936 * CALCULATE INTERACTIONS *
2937 **************************/
2939 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2942 r32 = _mm256_mul_pd(rsq32,rinv32);
2943 r32 = _mm256_andnot_pd(dummy_mask,r32);
2945 /* EWALD ELECTROSTATICS */
2947 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2948 ewrt = _mm256_mul_pd(r32,ewtabscale);
2949 ewitab = _mm256_cvttpd_epi32(ewrt);
2950 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2951 ewitab = _mm_slli_epi32(ewitab,2);
2952 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2953 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2954 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2955 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2956 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2957 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2958 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2959 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2960 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2962 d = _mm256_sub_pd(r32,rswitch);
2963 d = _mm256_max_pd(d,_mm256_setzero_pd());
2964 d2 = _mm256_mul_pd(d,d);
2965 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2967 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2969 /* Evaluate switch function */
2970 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2971 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2972 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2976 fscal = _mm256_and_pd(fscal,cutoff_mask);
2978 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2980 /* Calculate temporary vectorial force */
2981 tx = _mm256_mul_pd(fscal,dx32);
2982 ty = _mm256_mul_pd(fscal,dy32);
2983 tz = _mm256_mul_pd(fscal,dz32);
2985 /* Update vectorial force */
2986 fix3 = _mm256_add_pd(fix3,tx);
2987 fiy3 = _mm256_add_pd(fiy3,ty);
2988 fiz3 = _mm256_add_pd(fiz3,tz);
2990 fjx2 = _mm256_add_pd(fjx2,tx);
2991 fjy2 = _mm256_add_pd(fjy2,ty);
2992 fjz2 = _mm256_add_pd(fjz2,tz);
2996 /**************************
2997 * CALCULATE INTERACTIONS *
2998 **************************/
3000 if (gmx_mm256_any_lt(rsq33,rcutoff2))
3003 r33 = _mm256_mul_pd(rsq33,rinv33);
3004 r33 = _mm256_andnot_pd(dummy_mask,r33);
3006 /* EWALD ELECTROSTATICS */
3008 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3009 ewrt = _mm256_mul_pd(r33,ewtabscale);
3010 ewitab = _mm256_cvttpd_epi32(ewrt);
3011 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3012 ewitab = _mm_slli_epi32(ewitab,2);
3013 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3014 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3015 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3016 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3017 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3018 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3019 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3020 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
3021 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
3023 d = _mm256_sub_pd(r33,rswitch);
3024 d = _mm256_max_pd(d,_mm256_setzero_pd());
3025 d2 = _mm256_mul_pd(d,d);
3026 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3028 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3030 /* Evaluate switch function */
3031 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3032 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
3033 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
3037 fscal = _mm256_and_pd(fscal,cutoff_mask);
3039 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3041 /* Calculate temporary vectorial force */
3042 tx = _mm256_mul_pd(fscal,dx33);
3043 ty = _mm256_mul_pd(fscal,dy33);
3044 tz = _mm256_mul_pd(fscal,dz33);
3046 /* Update vectorial force */
3047 fix3 = _mm256_add_pd(fix3,tx);
3048 fiy3 = _mm256_add_pd(fiy3,ty);
3049 fiz3 = _mm256_add_pd(fiz3,tz);
3051 fjx3 = _mm256_add_pd(fjx3,tx);
3052 fjy3 = _mm256_add_pd(fjy3,ty);
3053 fjz3 = _mm256_add_pd(fjz3,tz);
3057 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3058 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3059 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3060 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3062 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3063 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3065 /* Inner loop uses 567 flops */
3068 /* End of innermost loop */
3070 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3071 f+i_coord_offset+DIM,fshift+i_shift_offset);
3073 /* Increment number of inner iterations */
3074 inneriter += j_index_end - j_index_start;
3076 /* Outer loop uses 18 flops */
3079 /* Increment number of outer iterations */
3082 /* Update outer/inner flops */
3084 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567);