2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr1;
84 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 real * vdwioffsetptr2;
86 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 real * vdwioffsetptr3;
88 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
90 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
92 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
94 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
98 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
101 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
102 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
103 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
104 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
107 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
110 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
111 real rswitch_scalar,d_scalar;
112 __m256d dummy_mask,cutoff_mask;
113 __m128 tmpmask0,tmpmask1;
114 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
115 __m256d one = _mm256_set1_pd(1.0);
116 __m256d two = _mm256_set1_pd(2.0);
122 jindex = nlist->jindex;
124 shiftidx = nlist->shift;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm256_set1_pd(fr->ic->epsfac);
129 charge = mdatoms->chargeA;
131 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
132 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
133 beta2 = _mm256_mul_pd(beta,beta);
134 beta3 = _mm256_mul_pd(beta,beta2);
136 ewtab = fr->ic->tabq_coul_FDV0;
137 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
138 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
143 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
144 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
146 jq1 = _mm256_set1_pd(charge[inr+1]);
147 jq2 = _mm256_set1_pd(charge[inr+2]);
148 jq3 = _mm256_set1_pd(charge[inr+3]);
149 qq11 = _mm256_mul_pd(iq1,jq1);
150 qq12 = _mm256_mul_pd(iq1,jq2);
151 qq13 = _mm256_mul_pd(iq1,jq3);
152 qq21 = _mm256_mul_pd(iq2,jq1);
153 qq22 = _mm256_mul_pd(iq2,jq2);
154 qq23 = _mm256_mul_pd(iq2,jq3);
155 qq31 = _mm256_mul_pd(iq3,jq1);
156 qq32 = _mm256_mul_pd(iq3,jq2);
157 qq33 = _mm256_mul_pd(iq3,jq3);
159 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
160 rcutoff_scalar = fr->ic->rcoulomb;
161 rcutoff = _mm256_set1_pd(rcutoff_scalar);
162 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
164 rswitch_scalar = fr->ic->rcoulomb_switch;
165 rswitch = _mm256_set1_pd(rswitch_scalar);
166 /* Setup switch parameters */
167 d_scalar = rcutoff_scalar-rswitch_scalar;
168 d = _mm256_set1_pd(d_scalar);
169 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
170 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
171 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
172 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
173 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
176 /* Avoid stupid compiler warnings */
177 jnrA = jnrB = jnrC = jnrD = 0;
186 for(iidx=0;iidx<4*DIM;iidx++)
191 /* Start outer loop over neighborlists */
192 for(iidx=0; iidx<nri; iidx++)
194 /* Load shift vector for this list */
195 i_shift_offset = DIM*shiftidx[iidx];
197 /* Load limits for loop over neighbors */
198 j_index_start = jindex[iidx];
199 j_index_end = jindex[iidx+1];
201 /* Get outer coordinate index */
203 i_coord_offset = DIM*inr;
205 /* Load i particle coords and add shift vector */
206 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
207 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
209 fix1 = _mm256_setzero_pd();
210 fiy1 = _mm256_setzero_pd();
211 fiz1 = _mm256_setzero_pd();
212 fix2 = _mm256_setzero_pd();
213 fiy2 = _mm256_setzero_pd();
214 fiz2 = _mm256_setzero_pd();
215 fix3 = _mm256_setzero_pd();
216 fiy3 = _mm256_setzero_pd();
217 fiz3 = _mm256_setzero_pd();
219 /* Reset potential sums */
220 velecsum = _mm256_setzero_pd();
222 /* Start inner kernel loop */
223 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
226 /* Get j neighbor index, and coordinate index */
231 j_coord_offsetA = DIM*jnrA;
232 j_coord_offsetB = DIM*jnrB;
233 j_coord_offsetC = DIM*jnrC;
234 j_coord_offsetD = DIM*jnrD;
236 /* load j atom coordinates */
237 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
238 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
239 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
241 /* Calculate displacement vector */
242 dx11 = _mm256_sub_pd(ix1,jx1);
243 dy11 = _mm256_sub_pd(iy1,jy1);
244 dz11 = _mm256_sub_pd(iz1,jz1);
245 dx12 = _mm256_sub_pd(ix1,jx2);
246 dy12 = _mm256_sub_pd(iy1,jy2);
247 dz12 = _mm256_sub_pd(iz1,jz2);
248 dx13 = _mm256_sub_pd(ix1,jx3);
249 dy13 = _mm256_sub_pd(iy1,jy3);
250 dz13 = _mm256_sub_pd(iz1,jz3);
251 dx21 = _mm256_sub_pd(ix2,jx1);
252 dy21 = _mm256_sub_pd(iy2,jy1);
253 dz21 = _mm256_sub_pd(iz2,jz1);
254 dx22 = _mm256_sub_pd(ix2,jx2);
255 dy22 = _mm256_sub_pd(iy2,jy2);
256 dz22 = _mm256_sub_pd(iz2,jz2);
257 dx23 = _mm256_sub_pd(ix2,jx3);
258 dy23 = _mm256_sub_pd(iy2,jy3);
259 dz23 = _mm256_sub_pd(iz2,jz3);
260 dx31 = _mm256_sub_pd(ix3,jx1);
261 dy31 = _mm256_sub_pd(iy3,jy1);
262 dz31 = _mm256_sub_pd(iz3,jz1);
263 dx32 = _mm256_sub_pd(ix3,jx2);
264 dy32 = _mm256_sub_pd(iy3,jy2);
265 dz32 = _mm256_sub_pd(iz3,jz2);
266 dx33 = _mm256_sub_pd(ix3,jx3);
267 dy33 = _mm256_sub_pd(iy3,jy3);
268 dz33 = _mm256_sub_pd(iz3,jz3);
270 /* Calculate squared distance and things based on it */
271 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
272 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
273 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
274 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
275 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
276 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
277 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
278 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
279 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
281 rinv11 = avx256_invsqrt_d(rsq11);
282 rinv12 = avx256_invsqrt_d(rsq12);
283 rinv13 = avx256_invsqrt_d(rsq13);
284 rinv21 = avx256_invsqrt_d(rsq21);
285 rinv22 = avx256_invsqrt_d(rsq22);
286 rinv23 = avx256_invsqrt_d(rsq23);
287 rinv31 = avx256_invsqrt_d(rsq31);
288 rinv32 = avx256_invsqrt_d(rsq32);
289 rinv33 = avx256_invsqrt_d(rsq33);
291 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
292 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
293 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
294 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
295 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
296 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
297 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
298 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
299 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
301 fjx1 = _mm256_setzero_pd();
302 fjy1 = _mm256_setzero_pd();
303 fjz1 = _mm256_setzero_pd();
304 fjx2 = _mm256_setzero_pd();
305 fjy2 = _mm256_setzero_pd();
306 fjz2 = _mm256_setzero_pd();
307 fjx3 = _mm256_setzero_pd();
308 fjy3 = _mm256_setzero_pd();
309 fjz3 = _mm256_setzero_pd();
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 if (gmx_mm256_any_lt(rsq11,rcutoff2))
318 r11 = _mm256_mul_pd(rsq11,rinv11);
320 /* EWALD ELECTROSTATICS */
322 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
323 ewrt = _mm256_mul_pd(r11,ewtabscale);
324 ewitab = _mm256_cvttpd_epi32(ewrt);
325 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
326 ewitab = _mm_slli_epi32(ewitab,2);
327 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
328 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
329 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
330 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
331 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
332 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
333 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
334 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
335 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
337 d = _mm256_sub_pd(r11,rswitch);
338 d = _mm256_max_pd(d,_mm256_setzero_pd());
339 d2 = _mm256_mul_pd(d,d);
340 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
342 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
344 /* Evaluate switch function */
345 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
346 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
347 velec = _mm256_mul_pd(velec,sw);
348 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 velec = _mm256_and_pd(velec,cutoff_mask);
352 velecsum = _mm256_add_pd(velecsum,velec);
356 fscal = _mm256_and_pd(fscal,cutoff_mask);
358 /* Calculate temporary vectorial force */
359 tx = _mm256_mul_pd(fscal,dx11);
360 ty = _mm256_mul_pd(fscal,dy11);
361 tz = _mm256_mul_pd(fscal,dz11);
363 /* Update vectorial force */
364 fix1 = _mm256_add_pd(fix1,tx);
365 fiy1 = _mm256_add_pd(fiy1,ty);
366 fiz1 = _mm256_add_pd(fiz1,tz);
368 fjx1 = _mm256_add_pd(fjx1,tx);
369 fjy1 = _mm256_add_pd(fjy1,ty);
370 fjz1 = _mm256_add_pd(fjz1,tz);
374 /**************************
375 * CALCULATE INTERACTIONS *
376 **************************/
378 if (gmx_mm256_any_lt(rsq12,rcutoff2))
381 r12 = _mm256_mul_pd(rsq12,rinv12);
383 /* EWALD ELECTROSTATICS */
385 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
386 ewrt = _mm256_mul_pd(r12,ewtabscale);
387 ewitab = _mm256_cvttpd_epi32(ewrt);
388 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
389 ewitab = _mm_slli_epi32(ewitab,2);
390 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
391 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
392 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
393 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
394 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
395 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
396 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
397 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
398 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
400 d = _mm256_sub_pd(r12,rswitch);
401 d = _mm256_max_pd(d,_mm256_setzero_pd());
402 d2 = _mm256_mul_pd(d,d);
403 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
405 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
407 /* Evaluate switch function */
408 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
409 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
410 velec = _mm256_mul_pd(velec,sw);
411 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velec = _mm256_and_pd(velec,cutoff_mask);
415 velecsum = _mm256_add_pd(velecsum,velec);
419 fscal = _mm256_and_pd(fscal,cutoff_mask);
421 /* Calculate temporary vectorial force */
422 tx = _mm256_mul_pd(fscal,dx12);
423 ty = _mm256_mul_pd(fscal,dy12);
424 tz = _mm256_mul_pd(fscal,dz12);
426 /* Update vectorial force */
427 fix1 = _mm256_add_pd(fix1,tx);
428 fiy1 = _mm256_add_pd(fiy1,ty);
429 fiz1 = _mm256_add_pd(fiz1,tz);
431 fjx2 = _mm256_add_pd(fjx2,tx);
432 fjy2 = _mm256_add_pd(fjy2,ty);
433 fjz2 = _mm256_add_pd(fjz2,tz);
437 /**************************
438 * CALCULATE INTERACTIONS *
439 **************************/
441 if (gmx_mm256_any_lt(rsq13,rcutoff2))
444 r13 = _mm256_mul_pd(rsq13,rinv13);
446 /* EWALD ELECTROSTATICS */
448 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
449 ewrt = _mm256_mul_pd(r13,ewtabscale);
450 ewitab = _mm256_cvttpd_epi32(ewrt);
451 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
452 ewitab = _mm_slli_epi32(ewitab,2);
453 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
454 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
455 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
456 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
457 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
458 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
459 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
460 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
461 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
463 d = _mm256_sub_pd(r13,rswitch);
464 d = _mm256_max_pd(d,_mm256_setzero_pd());
465 d2 = _mm256_mul_pd(d,d);
466 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
468 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
470 /* Evaluate switch function */
471 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
472 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
473 velec = _mm256_mul_pd(velec,sw);
474 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
476 /* Update potential sum for this i atom from the interaction with this j atom. */
477 velec = _mm256_and_pd(velec,cutoff_mask);
478 velecsum = _mm256_add_pd(velecsum,velec);
482 fscal = _mm256_and_pd(fscal,cutoff_mask);
484 /* Calculate temporary vectorial force */
485 tx = _mm256_mul_pd(fscal,dx13);
486 ty = _mm256_mul_pd(fscal,dy13);
487 tz = _mm256_mul_pd(fscal,dz13);
489 /* Update vectorial force */
490 fix1 = _mm256_add_pd(fix1,tx);
491 fiy1 = _mm256_add_pd(fiy1,ty);
492 fiz1 = _mm256_add_pd(fiz1,tz);
494 fjx3 = _mm256_add_pd(fjx3,tx);
495 fjy3 = _mm256_add_pd(fjy3,ty);
496 fjz3 = _mm256_add_pd(fjz3,tz);
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 if (gmx_mm256_any_lt(rsq21,rcutoff2))
507 r21 = _mm256_mul_pd(rsq21,rinv21);
509 /* EWALD ELECTROSTATICS */
511 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
512 ewrt = _mm256_mul_pd(r21,ewtabscale);
513 ewitab = _mm256_cvttpd_epi32(ewrt);
514 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
515 ewitab = _mm_slli_epi32(ewitab,2);
516 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
517 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
518 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
519 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
520 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
521 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
522 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
523 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
524 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
526 d = _mm256_sub_pd(r21,rswitch);
527 d = _mm256_max_pd(d,_mm256_setzero_pd());
528 d2 = _mm256_mul_pd(d,d);
529 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
531 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
533 /* Evaluate switch function */
534 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
535 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
536 velec = _mm256_mul_pd(velec,sw);
537 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velec = _mm256_and_pd(velec,cutoff_mask);
541 velecsum = _mm256_add_pd(velecsum,velec);
545 fscal = _mm256_and_pd(fscal,cutoff_mask);
547 /* Calculate temporary vectorial force */
548 tx = _mm256_mul_pd(fscal,dx21);
549 ty = _mm256_mul_pd(fscal,dy21);
550 tz = _mm256_mul_pd(fscal,dz21);
552 /* Update vectorial force */
553 fix2 = _mm256_add_pd(fix2,tx);
554 fiy2 = _mm256_add_pd(fiy2,ty);
555 fiz2 = _mm256_add_pd(fiz2,tz);
557 fjx1 = _mm256_add_pd(fjx1,tx);
558 fjy1 = _mm256_add_pd(fjy1,ty);
559 fjz1 = _mm256_add_pd(fjz1,tz);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 if (gmx_mm256_any_lt(rsq22,rcutoff2))
570 r22 = _mm256_mul_pd(rsq22,rinv22);
572 /* EWALD ELECTROSTATICS */
574 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
575 ewrt = _mm256_mul_pd(r22,ewtabscale);
576 ewitab = _mm256_cvttpd_epi32(ewrt);
577 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
578 ewitab = _mm_slli_epi32(ewitab,2);
579 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
580 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
581 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
582 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
583 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
584 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
585 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
586 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
587 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
589 d = _mm256_sub_pd(r22,rswitch);
590 d = _mm256_max_pd(d,_mm256_setzero_pd());
591 d2 = _mm256_mul_pd(d,d);
592 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
594 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
596 /* Evaluate switch function */
597 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
598 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
599 velec = _mm256_mul_pd(velec,sw);
600 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
602 /* Update potential sum for this i atom from the interaction with this j atom. */
603 velec = _mm256_and_pd(velec,cutoff_mask);
604 velecsum = _mm256_add_pd(velecsum,velec);
608 fscal = _mm256_and_pd(fscal,cutoff_mask);
610 /* Calculate temporary vectorial force */
611 tx = _mm256_mul_pd(fscal,dx22);
612 ty = _mm256_mul_pd(fscal,dy22);
613 tz = _mm256_mul_pd(fscal,dz22);
615 /* Update vectorial force */
616 fix2 = _mm256_add_pd(fix2,tx);
617 fiy2 = _mm256_add_pd(fiy2,ty);
618 fiz2 = _mm256_add_pd(fiz2,tz);
620 fjx2 = _mm256_add_pd(fjx2,tx);
621 fjy2 = _mm256_add_pd(fjy2,ty);
622 fjz2 = _mm256_add_pd(fjz2,tz);
626 /**************************
627 * CALCULATE INTERACTIONS *
628 **************************/
630 if (gmx_mm256_any_lt(rsq23,rcutoff2))
633 r23 = _mm256_mul_pd(rsq23,rinv23);
635 /* EWALD ELECTROSTATICS */
637 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
638 ewrt = _mm256_mul_pd(r23,ewtabscale);
639 ewitab = _mm256_cvttpd_epi32(ewrt);
640 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
641 ewitab = _mm_slli_epi32(ewitab,2);
642 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
643 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
644 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
645 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
646 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
647 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
648 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
649 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
650 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
652 d = _mm256_sub_pd(r23,rswitch);
653 d = _mm256_max_pd(d,_mm256_setzero_pd());
654 d2 = _mm256_mul_pd(d,d);
655 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
657 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
659 /* Evaluate switch function */
660 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
661 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
662 velec = _mm256_mul_pd(velec,sw);
663 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
665 /* Update potential sum for this i atom from the interaction with this j atom. */
666 velec = _mm256_and_pd(velec,cutoff_mask);
667 velecsum = _mm256_add_pd(velecsum,velec);
671 fscal = _mm256_and_pd(fscal,cutoff_mask);
673 /* Calculate temporary vectorial force */
674 tx = _mm256_mul_pd(fscal,dx23);
675 ty = _mm256_mul_pd(fscal,dy23);
676 tz = _mm256_mul_pd(fscal,dz23);
678 /* Update vectorial force */
679 fix2 = _mm256_add_pd(fix2,tx);
680 fiy2 = _mm256_add_pd(fiy2,ty);
681 fiz2 = _mm256_add_pd(fiz2,tz);
683 fjx3 = _mm256_add_pd(fjx3,tx);
684 fjy3 = _mm256_add_pd(fjy3,ty);
685 fjz3 = _mm256_add_pd(fjz3,tz);
689 /**************************
690 * CALCULATE INTERACTIONS *
691 **************************/
693 if (gmx_mm256_any_lt(rsq31,rcutoff2))
696 r31 = _mm256_mul_pd(rsq31,rinv31);
698 /* EWALD ELECTROSTATICS */
700 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
701 ewrt = _mm256_mul_pd(r31,ewtabscale);
702 ewitab = _mm256_cvttpd_epi32(ewrt);
703 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
704 ewitab = _mm_slli_epi32(ewitab,2);
705 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
706 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
707 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
708 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
709 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
710 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
711 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
712 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
713 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
715 d = _mm256_sub_pd(r31,rswitch);
716 d = _mm256_max_pd(d,_mm256_setzero_pd());
717 d2 = _mm256_mul_pd(d,d);
718 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
720 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
722 /* Evaluate switch function */
723 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
724 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
725 velec = _mm256_mul_pd(velec,sw);
726 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
728 /* Update potential sum for this i atom from the interaction with this j atom. */
729 velec = _mm256_and_pd(velec,cutoff_mask);
730 velecsum = _mm256_add_pd(velecsum,velec);
734 fscal = _mm256_and_pd(fscal,cutoff_mask);
736 /* Calculate temporary vectorial force */
737 tx = _mm256_mul_pd(fscal,dx31);
738 ty = _mm256_mul_pd(fscal,dy31);
739 tz = _mm256_mul_pd(fscal,dz31);
741 /* Update vectorial force */
742 fix3 = _mm256_add_pd(fix3,tx);
743 fiy3 = _mm256_add_pd(fiy3,ty);
744 fiz3 = _mm256_add_pd(fiz3,tz);
746 fjx1 = _mm256_add_pd(fjx1,tx);
747 fjy1 = _mm256_add_pd(fjy1,ty);
748 fjz1 = _mm256_add_pd(fjz1,tz);
752 /**************************
753 * CALCULATE INTERACTIONS *
754 **************************/
756 if (gmx_mm256_any_lt(rsq32,rcutoff2))
759 r32 = _mm256_mul_pd(rsq32,rinv32);
761 /* EWALD ELECTROSTATICS */
763 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
764 ewrt = _mm256_mul_pd(r32,ewtabscale);
765 ewitab = _mm256_cvttpd_epi32(ewrt);
766 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
767 ewitab = _mm_slli_epi32(ewitab,2);
768 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
769 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
770 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
771 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
772 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
773 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
774 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
775 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
776 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
778 d = _mm256_sub_pd(r32,rswitch);
779 d = _mm256_max_pd(d,_mm256_setzero_pd());
780 d2 = _mm256_mul_pd(d,d);
781 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
783 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
785 /* Evaluate switch function */
786 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
787 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
788 velec = _mm256_mul_pd(velec,sw);
789 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
791 /* Update potential sum for this i atom from the interaction with this j atom. */
792 velec = _mm256_and_pd(velec,cutoff_mask);
793 velecsum = _mm256_add_pd(velecsum,velec);
797 fscal = _mm256_and_pd(fscal,cutoff_mask);
799 /* Calculate temporary vectorial force */
800 tx = _mm256_mul_pd(fscal,dx32);
801 ty = _mm256_mul_pd(fscal,dy32);
802 tz = _mm256_mul_pd(fscal,dz32);
804 /* Update vectorial force */
805 fix3 = _mm256_add_pd(fix3,tx);
806 fiy3 = _mm256_add_pd(fiy3,ty);
807 fiz3 = _mm256_add_pd(fiz3,tz);
809 fjx2 = _mm256_add_pd(fjx2,tx);
810 fjy2 = _mm256_add_pd(fjy2,ty);
811 fjz2 = _mm256_add_pd(fjz2,tz);
815 /**************************
816 * CALCULATE INTERACTIONS *
817 **************************/
819 if (gmx_mm256_any_lt(rsq33,rcutoff2))
822 r33 = _mm256_mul_pd(rsq33,rinv33);
824 /* EWALD ELECTROSTATICS */
826 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
827 ewrt = _mm256_mul_pd(r33,ewtabscale);
828 ewitab = _mm256_cvttpd_epi32(ewrt);
829 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
830 ewitab = _mm_slli_epi32(ewitab,2);
831 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
832 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
833 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
834 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
835 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
836 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
837 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
838 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
839 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
841 d = _mm256_sub_pd(r33,rswitch);
842 d = _mm256_max_pd(d,_mm256_setzero_pd());
843 d2 = _mm256_mul_pd(d,d);
844 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
846 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
848 /* Evaluate switch function */
849 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
850 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
851 velec = _mm256_mul_pd(velec,sw);
852 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
854 /* Update potential sum for this i atom from the interaction with this j atom. */
855 velec = _mm256_and_pd(velec,cutoff_mask);
856 velecsum = _mm256_add_pd(velecsum,velec);
860 fscal = _mm256_and_pd(fscal,cutoff_mask);
862 /* Calculate temporary vectorial force */
863 tx = _mm256_mul_pd(fscal,dx33);
864 ty = _mm256_mul_pd(fscal,dy33);
865 tz = _mm256_mul_pd(fscal,dz33);
867 /* Update vectorial force */
868 fix3 = _mm256_add_pd(fix3,tx);
869 fiy3 = _mm256_add_pd(fiy3,ty);
870 fiz3 = _mm256_add_pd(fiz3,tz);
872 fjx3 = _mm256_add_pd(fjx3,tx);
873 fjy3 = _mm256_add_pd(fjy3,ty);
874 fjz3 = _mm256_add_pd(fjz3,tz);
878 fjptrA = f+j_coord_offsetA;
879 fjptrB = f+j_coord_offsetB;
880 fjptrC = f+j_coord_offsetC;
881 fjptrD = f+j_coord_offsetD;
883 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
884 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
886 /* Inner loop uses 585 flops */
892 /* Get j neighbor index, and coordinate index */
893 jnrlistA = jjnr[jidx];
894 jnrlistB = jjnr[jidx+1];
895 jnrlistC = jjnr[jidx+2];
896 jnrlistD = jjnr[jidx+3];
897 /* Sign of each element will be negative for non-real atoms.
898 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
899 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
901 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
903 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
904 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
905 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
907 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
908 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
909 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
910 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
911 j_coord_offsetA = DIM*jnrA;
912 j_coord_offsetB = DIM*jnrB;
913 j_coord_offsetC = DIM*jnrC;
914 j_coord_offsetD = DIM*jnrD;
916 /* load j atom coordinates */
917 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
918 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
919 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
921 /* Calculate displacement vector */
922 dx11 = _mm256_sub_pd(ix1,jx1);
923 dy11 = _mm256_sub_pd(iy1,jy1);
924 dz11 = _mm256_sub_pd(iz1,jz1);
925 dx12 = _mm256_sub_pd(ix1,jx2);
926 dy12 = _mm256_sub_pd(iy1,jy2);
927 dz12 = _mm256_sub_pd(iz1,jz2);
928 dx13 = _mm256_sub_pd(ix1,jx3);
929 dy13 = _mm256_sub_pd(iy1,jy3);
930 dz13 = _mm256_sub_pd(iz1,jz3);
931 dx21 = _mm256_sub_pd(ix2,jx1);
932 dy21 = _mm256_sub_pd(iy2,jy1);
933 dz21 = _mm256_sub_pd(iz2,jz1);
934 dx22 = _mm256_sub_pd(ix2,jx2);
935 dy22 = _mm256_sub_pd(iy2,jy2);
936 dz22 = _mm256_sub_pd(iz2,jz2);
937 dx23 = _mm256_sub_pd(ix2,jx3);
938 dy23 = _mm256_sub_pd(iy2,jy3);
939 dz23 = _mm256_sub_pd(iz2,jz3);
940 dx31 = _mm256_sub_pd(ix3,jx1);
941 dy31 = _mm256_sub_pd(iy3,jy1);
942 dz31 = _mm256_sub_pd(iz3,jz1);
943 dx32 = _mm256_sub_pd(ix3,jx2);
944 dy32 = _mm256_sub_pd(iy3,jy2);
945 dz32 = _mm256_sub_pd(iz3,jz2);
946 dx33 = _mm256_sub_pd(ix3,jx3);
947 dy33 = _mm256_sub_pd(iy3,jy3);
948 dz33 = _mm256_sub_pd(iz3,jz3);
950 /* Calculate squared distance and things based on it */
951 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
952 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
953 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
954 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
955 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
956 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
957 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
958 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
959 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
961 rinv11 = avx256_invsqrt_d(rsq11);
962 rinv12 = avx256_invsqrt_d(rsq12);
963 rinv13 = avx256_invsqrt_d(rsq13);
964 rinv21 = avx256_invsqrt_d(rsq21);
965 rinv22 = avx256_invsqrt_d(rsq22);
966 rinv23 = avx256_invsqrt_d(rsq23);
967 rinv31 = avx256_invsqrt_d(rsq31);
968 rinv32 = avx256_invsqrt_d(rsq32);
969 rinv33 = avx256_invsqrt_d(rsq33);
971 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
972 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
973 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
974 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
975 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
976 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
977 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
978 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
979 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
981 fjx1 = _mm256_setzero_pd();
982 fjy1 = _mm256_setzero_pd();
983 fjz1 = _mm256_setzero_pd();
984 fjx2 = _mm256_setzero_pd();
985 fjy2 = _mm256_setzero_pd();
986 fjz2 = _mm256_setzero_pd();
987 fjx3 = _mm256_setzero_pd();
988 fjy3 = _mm256_setzero_pd();
989 fjz3 = _mm256_setzero_pd();
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 if (gmx_mm256_any_lt(rsq11,rcutoff2))
998 r11 = _mm256_mul_pd(rsq11,rinv11);
999 r11 = _mm256_andnot_pd(dummy_mask,r11);
1001 /* EWALD ELECTROSTATICS */
1003 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1004 ewrt = _mm256_mul_pd(r11,ewtabscale);
1005 ewitab = _mm256_cvttpd_epi32(ewrt);
1006 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1007 ewitab = _mm_slli_epi32(ewitab,2);
1008 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1009 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1010 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1011 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1012 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1013 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1014 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1015 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1016 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1018 d = _mm256_sub_pd(r11,rswitch);
1019 d = _mm256_max_pd(d,_mm256_setzero_pd());
1020 d2 = _mm256_mul_pd(d,d);
1021 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1023 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1025 /* Evaluate switch function */
1026 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1027 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1028 velec = _mm256_mul_pd(velec,sw);
1029 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1031 /* Update potential sum for this i atom from the interaction with this j atom. */
1032 velec = _mm256_and_pd(velec,cutoff_mask);
1033 velec = _mm256_andnot_pd(dummy_mask,velec);
1034 velecsum = _mm256_add_pd(velecsum,velec);
1038 fscal = _mm256_and_pd(fscal,cutoff_mask);
1040 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1042 /* Calculate temporary vectorial force */
1043 tx = _mm256_mul_pd(fscal,dx11);
1044 ty = _mm256_mul_pd(fscal,dy11);
1045 tz = _mm256_mul_pd(fscal,dz11);
1047 /* Update vectorial force */
1048 fix1 = _mm256_add_pd(fix1,tx);
1049 fiy1 = _mm256_add_pd(fiy1,ty);
1050 fiz1 = _mm256_add_pd(fiz1,tz);
1052 fjx1 = _mm256_add_pd(fjx1,tx);
1053 fjy1 = _mm256_add_pd(fjy1,ty);
1054 fjz1 = _mm256_add_pd(fjz1,tz);
1058 /**************************
1059 * CALCULATE INTERACTIONS *
1060 **************************/
1062 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1065 r12 = _mm256_mul_pd(rsq12,rinv12);
1066 r12 = _mm256_andnot_pd(dummy_mask,r12);
1068 /* EWALD ELECTROSTATICS */
1070 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1071 ewrt = _mm256_mul_pd(r12,ewtabscale);
1072 ewitab = _mm256_cvttpd_epi32(ewrt);
1073 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1074 ewitab = _mm_slli_epi32(ewitab,2);
1075 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1076 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1077 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1078 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1079 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1080 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1081 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1082 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1083 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1085 d = _mm256_sub_pd(r12,rswitch);
1086 d = _mm256_max_pd(d,_mm256_setzero_pd());
1087 d2 = _mm256_mul_pd(d,d);
1088 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1090 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1092 /* Evaluate switch function */
1093 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1094 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1095 velec = _mm256_mul_pd(velec,sw);
1096 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1098 /* Update potential sum for this i atom from the interaction with this j atom. */
1099 velec = _mm256_and_pd(velec,cutoff_mask);
1100 velec = _mm256_andnot_pd(dummy_mask,velec);
1101 velecsum = _mm256_add_pd(velecsum,velec);
1105 fscal = _mm256_and_pd(fscal,cutoff_mask);
1107 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1109 /* Calculate temporary vectorial force */
1110 tx = _mm256_mul_pd(fscal,dx12);
1111 ty = _mm256_mul_pd(fscal,dy12);
1112 tz = _mm256_mul_pd(fscal,dz12);
1114 /* Update vectorial force */
1115 fix1 = _mm256_add_pd(fix1,tx);
1116 fiy1 = _mm256_add_pd(fiy1,ty);
1117 fiz1 = _mm256_add_pd(fiz1,tz);
1119 fjx2 = _mm256_add_pd(fjx2,tx);
1120 fjy2 = _mm256_add_pd(fjy2,ty);
1121 fjz2 = _mm256_add_pd(fjz2,tz);
1125 /**************************
1126 * CALCULATE INTERACTIONS *
1127 **************************/
1129 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1132 r13 = _mm256_mul_pd(rsq13,rinv13);
1133 r13 = _mm256_andnot_pd(dummy_mask,r13);
1135 /* EWALD ELECTROSTATICS */
1137 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1138 ewrt = _mm256_mul_pd(r13,ewtabscale);
1139 ewitab = _mm256_cvttpd_epi32(ewrt);
1140 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1141 ewitab = _mm_slli_epi32(ewitab,2);
1142 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1143 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1144 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1145 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1146 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1147 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1148 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1149 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1150 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1152 d = _mm256_sub_pd(r13,rswitch);
1153 d = _mm256_max_pd(d,_mm256_setzero_pd());
1154 d2 = _mm256_mul_pd(d,d);
1155 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1157 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1159 /* Evaluate switch function */
1160 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1161 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
1162 velec = _mm256_mul_pd(velec,sw);
1163 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1165 /* Update potential sum for this i atom from the interaction with this j atom. */
1166 velec = _mm256_and_pd(velec,cutoff_mask);
1167 velec = _mm256_andnot_pd(dummy_mask,velec);
1168 velecsum = _mm256_add_pd(velecsum,velec);
1172 fscal = _mm256_and_pd(fscal,cutoff_mask);
1174 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1176 /* Calculate temporary vectorial force */
1177 tx = _mm256_mul_pd(fscal,dx13);
1178 ty = _mm256_mul_pd(fscal,dy13);
1179 tz = _mm256_mul_pd(fscal,dz13);
1181 /* Update vectorial force */
1182 fix1 = _mm256_add_pd(fix1,tx);
1183 fiy1 = _mm256_add_pd(fiy1,ty);
1184 fiz1 = _mm256_add_pd(fiz1,tz);
1186 fjx3 = _mm256_add_pd(fjx3,tx);
1187 fjy3 = _mm256_add_pd(fjy3,ty);
1188 fjz3 = _mm256_add_pd(fjz3,tz);
1192 /**************************
1193 * CALCULATE INTERACTIONS *
1194 **************************/
1196 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1199 r21 = _mm256_mul_pd(rsq21,rinv21);
1200 r21 = _mm256_andnot_pd(dummy_mask,r21);
1202 /* EWALD ELECTROSTATICS */
1204 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1205 ewrt = _mm256_mul_pd(r21,ewtabscale);
1206 ewitab = _mm256_cvttpd_epi32(ewrt);
1207 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1208 ewitab = _mm_slli_epi32(ewitab,2);
1209 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1210 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1211 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1212 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1213 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1214 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1215 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1216 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1217 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1219 d = _mm256_sub_pd(r21,rswitch);
1220 d = _mm256_max_pd(d,_mm256_setzero_pd());
1221 d2 = _mm256_mul_pd(d,d);
1222 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1224 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1226 /* Evaluate switch function */
1227 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1228 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1229 velec = _mm256_mul_pd(velec,sw);
1230 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1232 /* Update potential sum for this i atom from the interaction with this j atom. */
1233 velec = _mm256_and_pd(velec,cutoff_mask);
1234 velec = _mm256_andnot_pd(dummy_mask,velec);
1235 velecsum = _mm256_add_pd(velecsum,velec);
1239 fscal = _mm256_and_pd(fscal,cutoff_mask);
1241 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1243 /* Calculate temporary vectorial force */
1244 tx = _mm256_mul_pd(fscal,dx21);
1245 ty = _mm256_mul_pd(fscal,dy21);
1246 tz = _mm256_mul_pd(fscal,dz21);
1248 /* Update vectorial force */
1249 fix2 = _mm256_add_pd(fix2,tx);
1250 fiy2 = _mm256_add_pd(fiy2,ty);
1251 fiz2 = _mm256_add_pd(fiz2,tz);
1253 fjx1 = _mm256_add_pd(fjx1,tx);
1254 fjy1 = _mm256_add_pd(fjy1,ty);
1255 fjz1 = _mm256_add_pd(fjz1,tz);
1259 /**************************
1260 * CALCULATE INTERACTIONS *
1261 **************************/
1263 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1266 r22 = _mm256_mul_pd(rsq22,rinv22);
1267 r22 = _mm256_andnot_pd(dummy_mask,r22);
1269 /* EWALD ELECTROSTATICS */
1271 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1272 ewrt = _mm256_mul_pd(r22,ewtabscale);
1273 ewitab = _mm256_cvttpd_epi32(ewrt);
1274 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1275 ewitab = _mm_slli_epi32(ewitab,2);
1276 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1277 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1278 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1279 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1280 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1281 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1282 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1283 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1284 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1286 d = _mm256_sub_pd(r22,rswitch);
1287 d = _mm256_max_pd(d,_mm256_setzero_pd());
1288 d2 = _mm256_mul_pd(d,d);
1289 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1291 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1293 /* Evaluate switch function */
1294 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1295 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1296 velec = _mm256_mul_pd(velec,sw);
1297 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1299 /* Update potential sum for this i atom from the interaction with this j atom. */
1300 velec = _mm256_and_pd(velec,cutoff_mask);
1301 velec = _mm256_andnot_pd(dummy_mask,velec);
1302 velecsum = _mm256_add_pd(velecsum,velec);
1306 fscal = _mm256_and_pd(fscal,cutoff_mask);
1308 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1310 /* Calculate temporary vectorial force */
1311 tx = _mm256_mul_pd(fscal,dx22);
1312 ty = _mm256_mul_pd(fscal,dy22);
1313 tz = _mm256_mul_pd(fscal,dz22);
1315 /* Update vectorial force */
1316 fix2 = _mm256_add_pd(fix2,tx);
1317 fiy2 = _mm256_add_pd(fiy2,ty);
1318 fiz2 = _mm256_add_pd(fiz2,tz);
1320 fjx2 = _mm256_add_pd(fjx2,tx);
1321 fjy2 = _mm256_add_pd(fjy2,ty);
1322 fjz2 = _mm256_add_pd(fjz2,tz);
1326 /**************************
1327 * CALCULATE INTERACTIONS *
1328 **************************/
1330 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1333 r23 = _mm256_mul_pd(rsq23,rinv23);
1334 r23 = _mm256_andnot_pd(dummy_mask,r23);
1336 /* EWALD ELECTROSTATICS */
1338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1339 ewrt = _mm256_mul_pd(r23,ewtabscale);
1340 ewitab = _mm256_cvttpd_epi32(ewrt);
1341 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1342 ewitab = _mm_slli_epi32(ewitab,2);
1343 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1344 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1345 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1346 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1347 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1348 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1349 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1350 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1351 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1353 d = _mm256_sub_pd(r23,rswitch);
1354 d = _mm256_max_pd(d,_mm256_setzero_pd());
1355 d2 = _mm256_mul_pd(d,d);
1356 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1358 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1360 /* Evaluate switch function */
1361 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1362 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
1363 velec = _mm256_mul_pd(velec,sw);
1364 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1366 /* Update potential sum for this i atom from the interaction with this j atom. */
1367 velec = _mm256_and_pd(velec,cutoff_mask);
1368 velec = _mm256_andnot_pd(dummy_mask,velec);
1369 velecsum = _mm256_add_pd(velecsum,velec);
1373 fscal = _mm256_and_pd(fscal,cutoff_mask);
1375 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1377 /* Calculate temporary vectorial force */
1378 tx = _mm256_mul_pd(fscal,dx23);
1379 ty = _mm256_mul_pd(fscal,dy23);
1380 tz = _mm256_mul_pd(fscal,dz23);
1382 /* Update vectorial force */
1383 fix2 = _mm256_add_pd(fix2,tx);
1384 fiy2 = _mm256_add_pd(fiy2,ty);
1385 fiz2 = _mm256_add_pd(fiz2,tz);
1387 fjx3 = _mm256_add_pd(fjx3,tx);
1388 fjy3 = _mm256_add_pd(fjy3,ty);
1389 fjz3 = _mm256_add_pd(fjz3,tz);
1393 /**************************
1394 * CALCULATE INTERACTIONS *
1395 **************************/
1397 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1400 r31 = _mm256_mul_pd(rsq31,rinv31);
1401 r31 = _mm256_andnot_pd(dummy_mask,r31);
1403 /* EWALD ELECTROSTATICS */
1405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1406 ewrt = _mm256_mul_pd(r31,ewtabscale);
1407 ewitab = _mm256_cvttpd_epi32(ewrt);
1408 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1409 ewitab = _mm_slli_epi32(ewitab,2);
1410 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1411 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1412 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1413 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1414 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1415 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1416 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1417 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1418 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1420 d = _mm256_sub_pd(r31,rswitch);
1421 d = _mm256_max_pd(d,_mm256_setzero_pd());
1422 d2 = _mm256_mul_pd(d,d);
1423 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1425 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1427 /* Evaluate switch function */
1428 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1429 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
1430 velec = _mm256_mul_pd(velec,sw);
1431 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1433 /* Update potential sum for this i atom from the interaction with this j atom. */
1434 velec = _mm256_and_pd(velec,cutoff_mask);
1435 velec = _mm256_andnot_pd(dummy_mask,velec);
1436 velecsum = _mm256_add_pd(velecsum,velec);
1440 fscal = _mm256_and_pd(fscal,cutoff_mask);
1442 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1444 /* Calculate temporary vectorial force */
1445 tx = _mm256_mul_pd(fscal,dx31);
1446 ty = _mm256_mul_pd(fscal,dy31);
1447 tz = _mm256_mul_pd(fscal,dz31);
1449 /* Update vectorial force */
1450 fix3 = _mm256_add_pd(fix3,tx);
1451 fiy3 = _mm256_add_pd(fiy3,ty);
1452 fiz3 = _mm256_add_pd(fiz3,tz);
1454 fjx1 = _mm256_add_pd(fjx1,tx);
1455 fjy1 = _mm256_add_pd(fjy1,ty);
1456 fjz1 = _mm256_add_pd(fjz1,tz);
1460 /**************************
1461 * CALCULATE INTERACTIONS *
1462 **************************/
1464 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1467 r32 = _mm256_mul_pd(rsq32,rinv32);
1468 r32 = _mm256_andnot_pd(dummy_mask,r32);
1470 /* EWALD ELECTROSTATICS */
1472 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1473 ewrt = _mm256_mul_pd(r32,ewtabscale);
1474 ewitab = _mm256_cvttpd_epi32(ewrt);
1475 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1476 ewitab = _mm_slli_epi32(ewitab,2);
1477 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1478 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1479 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1480 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1481 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1482 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1483 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1484 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1485 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1487 d = _mm256_sub_pd(r32,rswitch);
1488 d = _mm256_max_pd(d,_mm256_setzero_pd());
1489 d2 = _mm256_mul_pd(d,d);
1490 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1492 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1494 /* Evaluate switch function */
1495 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1496 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
1497 velec = _mm256_mul_pd(velec,sw);
1498 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1500 /* Update potential sum for this i atom from the interaction with this j atom. */
1501 velec = _mm256_and_pd(velec,cutoff_mask);
1502 velec = _mm256_andnot_pd(dummy_mask,velec);
1503 velecsum = _mm256_add_pd(velecsum,velec);
1507 fscal = _mm256_and_pd(fscal,cutoff_mask);
1509 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1511 /* Calculate temporary vectorial force */
1512 tx = _mm256_mul_pd(fscal,dx32);
1513 ty = _mm256_mul_pd(fscal,dy32);
1514 tz = _mm256_mul_pd(fscal,dz32);
1516 /* Update vectorial force */
1517 fix3 = _mm256_add_pd(fix3,tx);
1518 fiy3 = _mm256_add_pd(fiy3,ty);
1519 fiz3 = _mm256_add_pd(fiz3,tz);
1521 fjx2 = _mm256_add_pd(fjx2,tx);
1522 fjy2 = _mm256_add_pd(fjy2,ty);
1523 fjz2 = _mm256_add_pd(fjz2,tz);
1527 /**************************
1528 * CALCULATE INTERACTIONS *
1529 **************************/
1531 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1534 r33 = _mm256_mul_pd(rsq33,rinv33);
1535 r33 = _mm256_andnot_pd(dummy_mask,r33);
1537 /* EWALD ELECTROSTATICS */
1539 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1540 ewrt = _mm256_mul_pd(r33,ewtabscale);
1541 ewitab = _mm256_cvttpd_epi32(ewrt);
1542 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1543 ewitab = _mm_slli_epi32(ewitab,2);
1544 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1545 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1546 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1547 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1548 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1549 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1550 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1551 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1552 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1554 d = _mm256_sub_pd(r33,rswitch);
1555 d = _mm256_max_pd(d,_mm256_setzero_pd());
1556 d2 = _mm256_mul_pd(d,d);
1557 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1559 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1561 /* Evaluate switch function */
1562 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1563 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
1564 velec = _mm256_mul_pd(velec,sw);
1565 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1567 /* Update potential sum for this i atom from the interaction with this j atom. */
1568 velec = _mm256_and_pd(velec,cutoff_mask);
1569 velec = _mm256_andnot_pd(dummy_mask,velec);
1570 velecsum = _mm256_add_pd(velecsum,velec);
1574 fscal = _mm256_and_pd(fscal,cutoff_mask);
1576 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1578 /* Calculate temporary vectorial force */
1579 tx = _mm256_mul_pd(fscal,dx33);
1580 ty = _mm256_mul_pd(fscal,dy33);
1581 tz = _mm256_mul_pd(fscal,dz33);
1583 /* Update vectorial force */
1584 fix3 = _mm256_add_pd(fix3,tx);
1585 fiy3 = _mm256_add_pd(fiy3,ty);
1586 fiz3 = _mm256_add_pd(fiz3,tz);
1588 fjx3 = _mm256_add_pd(fjx3,tx);
1589 fjy3 = _mm256_add_pd(fjy3,ty);
1590 fjz3 = _mm256_add_pd(fjz3,tz);
1594 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1595 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1596 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1597 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1599 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1600 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1602 /* Inner loop uses 594 flops */
1605 /* End of innermost loop */
1607 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1608 f+i_coord_offset+DIM,fshift+i_shift_offset);
1611 /* Update potential energies */
1612 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1614 /* Increment number of inner iterations */
1615 inneriter += j_index_end - j_index_start;
1617 /* Outer loop uses 19 flops */
1620 /* Increment number of outer iterations */
1623 /* Update outer/inner flops */
1625 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594);
1628 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double
1629 * Electrostatics interaction: Ewald
1630 * VdW interaction: None
1631 * Geometry: Water4-Water4
1632 * Calculate force/pot: Force
1635 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double
1636 (t_nblist * gmx_restrict nlist,
1637 rvec * gmx_restrict xx,
1638 rvec * gmx_restrict ff,
1639 struct t_forcerec * gmx_restrict fr,
1640 t_mdatoms * gmx_restrict mdatoms,
1641 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1642 t_nrnb * gmx_restrict nrnb)
1644 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1645 * just 0 for non-waters.
1646 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1647 * jnr indices corresponding to data put in the four positions in the SIMD register.
1649 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1650 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1651 int jnrA,jnrB,jnrC,jnrD;
1652 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1653 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1654 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1655 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1656 real rcutoff_scalar;
1657 real *shiftvec,*fshift,*x,*f;
1658 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1659 real scratch[4*DIM];
1660 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1661 real * vdwioffsetptr1;
1662 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1663 real * vdwioffsetptr2;
1664 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1665 real * vdwioffsetptr3;
1666 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1667 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1668 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1669 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1670 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1671 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1672 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1673 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1674 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1675 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1676 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1677 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1678 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1679 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1680 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1681 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1682 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1685 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1686 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1688 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1689 real rswitch_scalar,d_scalar;
1690 __m256d dummy_mask,cutoff_mask;
1691 __m128 tmpmask0,tmpmask1;
1692 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1693 __m256d one = _mm256_set1_pd(1.0);
1694 __m256d two = _mm256_set1_pd(2.0);
1700 jindex = nlist->jindex;
1702 shiftidx = nlist->shift;
1704 shiftvec = fr->shift_vec[0];
1705 fshift = fr->fshift[0];
1706 facel = _mm256_set1_pd(fr->ic->epsfac);
1707 charge = mdatoms->chargeA;
1709 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1710 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1711 beta2 = _mm256_mul_pd(beta,beta);
1712 beta3 = _mm256_mul_pd(beta,beta2);
1714 ewtab = fr->ic->tabq_coul_FDV0;
1715 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1716 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1718 /* Setup water-specific parameters */
1719 inr = nlist->iinr[0];
1720 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1721 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1722 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1724 jq1 = _mm256_set1_pd(charge[inr+1]);
1725 jq2 = _mm256_set1_pd(charge[inr+2]);
1726 jq3 = _mm256_set1_pd(charge[inr+3]);
1727 qq11 = _mm256_mul_pd(iq1,jq1);
1728 qq12 = _mm256_mul_pd(iq1,jq2);
1729 qq13 = _mm256_mul_pd(iq1,jq3);
1730 qq21 = _mm256_mul_pd(iq2,jq1);
1731 qq22 = _mm256_mul_pd(iq2,jq2);
1732 qq23 = _mm256_mul_pd(iq2,jq3);
1733 qq31 = _mm256_mul_pd(iq3,jq1);
1734 qq32 = _mm256_mul_pd(iq3,jq2);
1735 qq33 = _mm256_mul_pd(iq3,jq3);
1737 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1738 rcutoff_scalar = fr->ic->rcoulomb;
1739 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1740 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1742 rswitch_scalar = fr->ic->rcoulomb_switch;
1743 rswitch = _mm256_set1_pd(rswitch_scalar);
1744 /* Setup switch parameters */
1745 d_scalar = rcutoff_scalar-rswitch_scalar;
1746 d = _mm256_set1_pd(d_scalar);
1747 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1748 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1749 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1750 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1751 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1752 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1754 /* Avoid stupid compiler warnings */
1755 jnrA = jnrB = jnrC = jnrD = 0;
1756 j_coord_offsetA = 0;
1757 j_coord_offsetB = 0;
1758 j_coord_offsetC = 0;
1759 j_coord_offsetD = 0;
1764 for(iidx=0;iidx<4*DIM;iidx++)
1766 scratch[iidx] = 0.0;
1769 /* Start outer loop over neighborlists */
1770 for(iidx=0; iidx<nri; iidx++)
1772 /* Load shift vector for this list */
1773 i_shift_offset = DIM*shiftidx[iidx];
1775 /* Load limits for loop over neighbors */
1776 j_index_start = jindex[iidx];
1777 j_index_end = jindex[iidx+1];
1779 /* Get outer coordinate index */
1781 i_coord_offset = DIM*inr;
1783 /* Load i particle coords and add shift vector */
1784 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1785 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1787 fix1 = _mm256_setzero_pd();
1788 fiy1 = _mm256_setzero_pd();
1789 fiz1 = _mm256_setzero_pd();
1790 fix2 = _mm256_setzero_pd();
1791 fiy2 = _mm256_setzero_pd();
1792 fiz2 = _mm256_setzero_pd();
1793 fix3 = _mm256_setzero_pd();
1794 fiy3 = _mm256_setzero_pd();
1795 fiz3 = _mm256_setzero_pd();
1797 /* Start inner kernel loop */
1798 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1801 /* Get j neighbor index, and coordinate index */
1803 jnrB = jjnr[jidx+1];
1804 jnrC = jjnr[jidx+2];
1805 jnrD = jjnr[jidx+3];
1806 j_coord_offsetA = DIM*jnrA;
1807 j_coord_offsetB = DIM*jnrB;
1808 j_coord_offsetC = DIM*jnrC;
1809 j_coord_offsetD = DIM*jnrD;
1811 /* load j atom coordinates */
1812 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1813 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1814 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1816 /* Calculate displacement vector */
1817 dx11 = _mm256_sub_pd(ix1,jx1);
1818 dy11 = _mm256_sub_pd(iy1,jy1);
1819 dz11 = _mm256_sub_pd(iz1,jz1);
1820 dx12 = _mm256_sub_pd(ix1,jx2);
1821 dy12 = _mm256_sub_pd(iy1,jy2);
1822 dz12 = _mm256_sub_pd(iz1,jz2);
1823 dx13 = _mm256_sub_pd(ix1,jx3);
1824 dy13 = _mm256_sub_pd(iy1,jy3);
1825 dz13 = _mm256_sub_pd(iz1,jz3);
1826 dx21 = _mm256_sub_pd(ix2,jx1);
1827 dy21 = _mm256_sub_pd(iy2,jy1);
1828 dz21 = _mm256_sub_pd(iz2,jz1);
1829 dx22 = _mm256_sub_pd(ix2,jx2);
1830 dy22 = _mm256_sub_pd(iy2,jy2);
1831 dz22 = _mm256_sub_pd(iz2,jz2);
1832 dx23 = _mm256_sub_pd(ix2,jx3);
1833 dy23 = _mm256_sub_pd(iy2,jy3);
1834 dz23 = _mm256_sub_pd(iz2,jz3);
1835 dx31 = _mm256_sub_pd(ix3,jx1);
1836 dy31 = _mm256_sub_pd(iy3,jy1);
1837 dz31 = _mm256_sub_pd(iz3,jz1);
1838 dx32 = _mm256_sub_pd(ix3,jx2);
1839 dy32 = _mm256_sub_pd(iy3,jy2);
1840 dz32 = _mm256_sub_pd(iz3,jz2);
1841 dx33 = _mm256_sub_pd(ix3,jx3);
1842 dy33 = _mm256_sub_pd(iy3,jy3);
1843 dz33 = _mm256_sub_pd(iz3,jz3);
1845 /* Calculate squared distance and things based on it */
1846 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1847 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1848 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1849 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1850 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1851 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1852 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1853 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1854 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1856 rinv11 = avx256_invsqrt_d(rsq11);
1857 rinv12 = avx256_invsqrt_d(rsq12);
1858 rinv13 = avx256_invsqrt_d(rsq13);
1859 rinv21 = avx256_invsqrt_d(rsq21);
1860 rinv22 = avx256_invsqrt_d(rsq22);
1861 rinv23 = avx256_invsqrt_d(rsq23);
1862 rinv31 = avx256_invsqrt_d(rsq31);
1863 rinv32 = avx256_invsqrt_d(rsq32);
1864 rinv33 = avx256_invsqrt_d(rsq33);
1866 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1867 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1868 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1869 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1870 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1871 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1872 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1873 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1874 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1876 fjx1 = _mm256_setzero_pd();
1877 fjy1 = _mm256_setzero_pd();
1878 fjz1 = _mm256_setzero_pd();
1879 fjx2 = _mm256_setzero_pd();
1880 fjy2 = _mm256_setzero_pd();
1881 fjz2 = _mm256_setzero_pd();
1882 fjx3 = _mm256_setzero_pd();
1883 fjy3 = _mm256_setzero_pd();
1884 fjz3 = _mm256_setzero_pd();
1886 /**************************
1887 * CALCULATE INTERACTIONS *
1888 **************************/
1890 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1893 r11 = _mm256_mul_pd(rsq11,rinv11);
1895 /* EWALD ELECTROSTATICS */
1897 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1898 ewrt = _mm256_mul_pd(r11,ewtabscale);
1899 ewitab = _mm256_cvttpd_epi32(ewrt);
1900 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1901 ewitab = _mm_slli_epi32(ewitab,2);
1902 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1903 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1904 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1905 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1906 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1907 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1908 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1909 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1910 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1912 d = _mm256_sub_pd(r11,rswitch);
1913 d = _mm256_max_pd(d,_mm256_setzero_pd());
1914 d2 = _mm256_mul_pd(d,d);
1915 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1917 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1919 /* Evaluate switch function */
1920 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1921 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1922 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1926 fscal = _mm256_and_pd(fscal,cutoff_mask);
1928 /* Calculate temporary vectorial force */
1929 tx = _mm256_mul_pd(fscal,dx11);
1930 ty = _mm256_mul_pd(fscal,dy11);
1931 tz = _mm256_mul_pd(fscal,dz11);
1933 /* Update vectorial force */
1934 fix1 = _mm256_add_pd(fix1,tx);
1935 fiy1 = _mm256_add_pd(fiy1,ty);
1936 fiz1 = _mm256_add_pd(fiz1,tz);
1938 fjx1 = _mm256_add_pd(fjx1,tx);
1939 fjy1 = _mm256_add_pd(fjy1,ty);
1940 fjz1 = _mm256_add_pd(fjz1,tz);
1944 /**************************
1945 * CALCULATE INTERACTIONS *
1946 **************************/
1948 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1951 r12 = _mm256_mul_pd(rsq12,rinv12);
1953 /* EWALD ELECTROSTATICS */
1955 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1956 ewrt = _mm256_mul_pd(r12,ewtabscale);
1957 ewitab = _mm256_cvttpd_epi32(ewrt);
1958 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1959 ewitab = _mm_slli_epi32(ewitab,2);
1960 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1961 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1962 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1963 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1964 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1965 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1966 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1967 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1968 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1970 d = _mm256_sub_pd(r12,rswitch);
1971 d = _mm256_max_pd(d,_mm256_setzero_pd());
1972 d2 = _mm256_mul_pd(d,d);
1973 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1975 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1977 /* Evaluate switch function */
1978 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1979 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1980 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1984 fscal = _mm256_and_pd(fscal,cutoff_mask);
1986 /* Calculate temporary vectorial force */
1987 tx = _mm256_mul_pd(fscal,dx12);
1988 ty = _mm256_mul_pd(fscal,dy12);
1989 tz = _mm256_mul_pd(fscal,dz12);
1991 /* Update vectorial force */
1992 fix1 = _mm256_add_pd(fix1,tx);
1993 fiy1 = _mm256_add_pd(fiy1,ty);
1994 fiz1 = _mm256_add_pd(fiz1,tz);
1996 fjx2 = _mm256_add_pd(fjx2,tx);
1997 fjy2 = _mm256_add_pd(fjy2,ty);
1998 fjz2 = _mm256_add_pd(fjz2,tz);
2002 /**************************
2003 * CALCULATE INTERACTIONS *
2004 **************************/
2006 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2009 r13 = _mm256_mul_pd(rsq13,rinv13);
2011 /* EWALD ELECTROSTATICS */
2013 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2014 ewrt = _mm256_mul_pd(r13,ewtabscale);
2015 ewitab = _mm256_cvttpd_epi32(ewrt);
2016 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2017 ewitab = _mm_slli_epi32(ewitab,2);
2018 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2019 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2020 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2021 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2022 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2023 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2024 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2025 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2026 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2028 d = _mm256_sub_pd(r13,rswitch);
2029 d = _mm256_max_pd(d,_mm256_setzero_pd());
2030 d2 = _mm256_mul_pd(d,d);
2031 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2033 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2035 /* Evaluate switch function */
2036 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2037 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2038 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2042 fscal = _mm256_and_pd(fscal,cutoff_mask);
2044 /* Calculate temporary vectorial force */
2045 tx = _mm256_mul_pd(fscal,dx13);
2046 ty = _mm256_mul_pd(fscal,dy13);
2047 tz = _mm256_mul_pd(fscal,dz13);
2049 /* Update vectorial force */
2050 fix1 = _mm256_add_pd(fix1,tx);
2051 fiy1 = _mm256_add_pd(fiy1,ty);
2052 fiz1 = _mm256_add_pd(fiz1,tz);
2054 fjx3 = _mm256_add_pd(fjx3,tx);
2055 fjy3 = _mm256_add_pd(fjy3,ty);
2056 fjz3 = _mm256_add_pd(fjz3,tz);
2060 /**************************
2061 * CALCULATE INTERACTIONS *
2062 **************************/
2064 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2067 r21 = _mm256_mul_pd(rsq21,rinv21);
2069 /* EWALD ELECTROSTATICS */
2071 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2072 ewrt = _mm256_mul_pd(r21,ewtabscale);
2073 ewitab = _mm256_cvttpd_epi32(ewrt);
2074 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2075 ewitab = _mm_slli_epi32(ewitab,2);
2076 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2077 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2078 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2079 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2080 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2081 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2082 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2083 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2084 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2086 d = _mm256_sub_pd(r21,rswitch);
2087 d = _mm256_max_pd(d,_mm256_setzero_pd());
2088 d2 = _mm256_mul_pd(d,d);
2089 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2091 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2093 /* Evaluate switch function */
2094 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2095 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2096 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2100 fscal = _mm256_and_pd(fscal,cutoff_mask);
2102 /* Calculate temporary vectorial force */
2103 tx = _mm256_mul_pd(fscal,dx21);
2104 ty = _mm256_mul_pd(fscal,dy21);
2105 tz = _mm256_mul_pd(fscal,dz21);
2107 /* Update vectorial force */
2108 fix2 = _mm256_add_pd(fix2,tx);
2109 fiy2 = _mm256_add_pd(fiy2,ty);
2110 fiz2 = _mm256_add_pd(fiz2,tz);
2112 fjx1 = _mm256_add_pd(fjx1,tx);
2113 fjy1 = _mm256_add_pd(fjy1,ty);
2114 fjz1 = _mm256_add_pd(fjz1,tz);
2118 /**************************
2119 * CALCULATE INTERACTIONS *
2120 **************************/
2122 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2125 r22 = _mm256_mul_pd(rsq22,rinv22);
2127 /* EWALD ELECTROSTATICS */
2129 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2130 ewrt = _mm256_mul_pd(r22,ewtabscale);
2131 ewitab = _mm256_cvttpd_epi32(ewrt);
2132 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2133 ewitab = _mm_slli_epi32(ewitab,2);
2134 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2135 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2136 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2137 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2138 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2139 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2140 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2141 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2142 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2144 d = _mm256_sub_pd(r22,rswitch);
2145 d = _mm256_max_pd(d,_mm256_setzero_pd());
2146 d2 = _mm256_mul_pd(d,d);
2147 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2149 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2151 /* Evaluate switch function */
2152 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2153 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2154 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2158 fscal = _mm256_and_pd(fscal,cutoff_mask);
2160 /* Calculate temporary vectorial force */
2161 tx = _mm256_mul_pd(fscal,dx22);
2162 ty = _mm256_mul_pd(fscal,dy22);
2163 tz = _mm256_mul_pd(fscal,dz22);
2165 /* Update vectorial force */
2166 fix2 = _mm256_add_pd(fix2,tx);
2167 fiy2 = _mm256_add_pd(fiy2,ty);
2168 fiz2 = _mm256_add_pd(fiz2,tz);
2170 fjx2 = _mm256_add_pd(fjx2,tx);
2171 fjy2 = _mm256_add_pd(fjy2,ty);
2172 fjz2 = _mm256_add_pd(fjz2,tz);
2176 /**************************
2177 * CALCULATE INTERACTIONS *
2178 **************************/
2180 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2183 r23 = _mm256_mul_pd(rsq23,rinv23);
2185 /* EWALD ELECTROSTATICS */
2187 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2188 ewrt = _mm256_mul_pd(r23,ewtabscale);
2189 ewitab = _mm256_cvttpd_epi32(ewrt);
2190 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2191 ewitab = _mm_slli_epi32(ewitab,2);
2192 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2193 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2194 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2195 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2196 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2197 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2198 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2199 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2200 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2202 d = _mm256_sub_pd(r23,rswitch);
2203 d = _mm256_max_pd(d,_mm256_setzero_pd());
2204 d2 = _mm256_mul_pd(d,d);
2205 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2207 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2209 /* Evaluate switch function */
2210 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2211 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2212 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2216 fscal = _mm256_and_pd(fscal,cutoff_mask);
2218 /* Calculate temporary vectorial force */
2219 tx = _mm256_mul_pd(fscal,dx23);
2220 ty = _mm256_mul_pd(fscal,dy23);
2221 tz = _mm256_mul_pd(fscal,dz23);
2223 /* Update vectorial force */
2224 fix2 = _mm256_add_pd(fix2,tx);
2225 fiy2 = _mm256_add_pd(fiy2,ty);
2226 fiz2 = _mm256_add_pd(fiz2,tz);
2228 fjx3 = _mm256_add_pd(fjx3,tx);
2229 fjy3 = _mm256_add_pd(fjy3,ty);
2230 fjz3 = _mm256_add_pd(fjz3,tz);
2234 /**************************
2235 * CALCULATE INTERACTIONS *
2236 **************************/
2238 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2241 r31 = _mm256_mul_pd(rsq31,rinv31);
2243 /* EWALD ELECTROSTATICS */
2245 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2246 ewrt = _mm256_mul_pd(r31,ewtabscale);
2247 ewitab = _mm256_cvttpd_epi32(ewrt);
2248 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2249 ewitab = _mm_slli_epi32(ewitab,2);
2250 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2251 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2252 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2253 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2254 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2255 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2256 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2257 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2258 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2260 d = _mm256_sub_pd(r31,rswitch);
2261 d = _mm256_max_pd(d,_mm256_setzero_pd());
2262 d2 = _mm256_mul_pd(d,d);
2263 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2265 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2267 /* Evaluate switch function */
2268 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2269 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2270 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2274 fscal = _mm256_and_pd(fscal,cutoff_mask);
2276 /* Calculate temporary vectorial force */
2277 tx = _mm256_mul_pd(fscal,dx31);
2278 ty = _mm256_mul_pd(fscal,dy31);
2279 tz = _mm256_mul_pd(fscal,dz31);
2281 /* Update vectorial force */
2282 fix3 = _mm256_add_pd(fix3,tx);
2283 fiy3 = _mm256_add_pd(fiy3,ty);
2284 fiz3 = _mm256_add_pd(fiz3,tz);
2286 fjx1 = _mm256_add_pd(fjx1,tx);
2287 fjy1 = _mm256_add_pd(fjy1,ty);
2288 fjz1 = _mm256_add_pd(fjz1,tz);
2292 /**************************
2293 * CALCULATE INTERACTIONS *
2294 **************************/
2296 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2299 r32 = _mm256_mul_pd(rsq32,rinv32);
2301 /* EWALD ELECTROSTATICS */
2303 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2304 ewrt = _mm256_mul_pd(r32,ewtabscale);
2305 ewitab = _mm256_cvttpd_epi32(ewrt);
2306 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2307 ewitab = _mm_slli_epi32(ewitab,2);
2308 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2309 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2310 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2311 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2312 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2313 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2314 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2315 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2316 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2318 d = _mm256_sub_pd(r32,rswitch);
2319 d = _mm256_max_pd(d,_mm256_setzero_pd());
2320 d2 = _mm256_mul_pd(d,d);
2321 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2323 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2325 /* Evaluate switch function */
2326 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2327 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2328 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2332 fscal = _mm256_and_pd(fscal,cutoff_mask);
2334 /* Calculate temporary vectorial force */
2335 tx = _mm256_mul_pd(fscal,dx32);
2336 ty = _mm256_mul_pd(fscal,dy32);
2337 tz = _mm256_mul_pd(fscal,dz32);
2339 /* Update vectorial force */
2340 fix3 = _mm256_add_pd(fix3,tx);
2341 fiy3 = _mm256_add_pd(fiy3,ty);
2342 fiz3 = _mm256_add_pd(fiz3,tz);
2344 fjx2 = _mm256_add_pd(fjx2,tx);
2345 fjy2 = _mm256_add_pd(fjy2,ty);
2346 fjz2 = _mm256_add_pd(fjz2,tz);
2350 /**************************
2351 * CALCULATE INTERACTIONS *
2352 **************************/
2354 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2357 r33 = _mm256_mul_pd(rsq33,rinv33);
2359 /* EWALD ELECTROSTATICS */
2361 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2362 ewrt = _mm256_mul_pd(r33,ewtabscale);
2363 ewitab = _mm256_cvttpd_epi32(ewrt);
2364 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2365 ewitab = _mm_slli_epi32(ewitab,2);
2366 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2367 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2368 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2369 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2370 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2371 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2372 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2373 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
2374 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2376 d = _mm256_sub_pd(r33,rswitch);
2377 d = _mm256_max_pd(d,_mm256_setzero_pd());
2378 d2 = _mm256_mul_pd(d,d);
2379 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2381 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2383 /* Evaluate switch function */
2384 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2385 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
2386 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2390 fscal = _mm256_and_pd(fscal,cutoff_mask);
2392 /* Calculate temporary vectorial force */
2393 tx = _mm256_mul_pd(fscal,dx33);
2394 ty = _mm256_mul_pd(fscal,dy33);
2395 tz = _mm256_mul_pd(fscal,dz33);
2397 /* Update vectorial force */
2398 fix3 = _mm256_add_pd(fix3,tx);
2399 fiy3 = _mm256_add_pd(fiy3,ty);
2400 fiz3 = _mm256_add_pd(fiz3,tz);
2402 fjx3 = _mm256_add_pd(fjx3,tx);
2403 fjy3 = _mm256_add_pd(fjy3,ty);
2404 fjz3 = _mm256_add_pd(fjz3,tz);
2408 fjptrA = f+j_coord_offsetA;
2409 fjptrB = f+j_coord_offsetB;
2410 fjptrC = f+j_coord_offsetC;
2411 fjptrD = f+j_coord_offsetD;
2413 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2414 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2416 /* Inner loop uses 558 flops */
2419 if(jidx<j_index_end)
2422 /* Get j neighbor index, and coordinate index */
2423 jnrlistA = jjnr[jidx];
2424 jnrlistB = jjnr[jidx+1];
2425 jnrlistC = jjnr[jidx+2];
2426 jnrlistD = jjnr[jidx+3];
2427 /* Sign of each element will be negative for non-real atoms.
2428 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2429 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2431 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2433 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2434 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2435 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2437 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2438 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2439 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2440 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2441 j_coord_offsetA = DIM*jnrA;
2442 j_coord_offsetB = DIM*jnrB;
2443 j_coord_offsetC = DIM*jnrC;
2444 j_coord_offsetD = DIM*jnrD;
2446 /* load j atom coordinates */
2447 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2448 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2449 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2451 /* Calculate displacement vector */
2452 dx11 = _mm256_sub_pd(ix1,jx1);
2453 dy11 = _mm256_sub_pd(iy1,jy1);
2454 dz11 = _mm256_sub_pd(iz1,jz1);
2455 dx12 = _mm256_sub_pd(ix1,jx2);
2456 dy12 = _mm256_sub_pd(iy1,jy2);
2457 dz12 = _mm256_sub_pd(iz1,jz2);
2458 dx13 = _mm256_sub_pd(ix1,jx3);
2459 dy13 = _mm256_sub_pd(iy1,jy3);
2460 dz13 = _mm256_sub_pd(iz1,jz3);
2461 dx21 = _mm256_sub_pd(ix2,jx1);
2462 dy21 = _mm256_sub_pd(iy2,jy1);
2463 dz21 = _mm256_sub_pd(iz2,jz1);
2464 dx22 = _mm256_sub_pd(ix2,jx2);
2465 dy22 = _mm256_sub_pd(iy2,jy2);
2466 dz22 = _mm256_sub_pd(iz2,jz2);
2467 dx23 = _mm256_sub_pd(ix2,jx3);
2468 dy23 = _mm256_sub_pd(iy2,jy3);
2469 dz23 = _mm256_sub_pd(iz2,jz3);
2470 dx31 = _mm256_sub_pd(ix3,jx1);
2471 dy31 = _mm256_sub_pd(iy3,jy1);
2472 dz31 = _mm256_sub_pd(iz3,jz1);
2473 dx32 = _mm256_sub_pd(ix3,jx2);
2474 dy32 = _mm256_sub_pd(iy3,jy2);
2475 dz32 = _mm256_sub_pd(iz3,jz2);
2476 dx33 = _mm256_sub_pd(ix3,jx3);
2477 dy33 = _mm256_sub_pd(iy3,jy3);
2478 dz33 = _mm256_sub_pd(iz3,jz3);
2480 /* Calculate squared distance and things based on it */
2481 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2482 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2483 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2484 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2485 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2486 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2487 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2488 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2489 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2491 rinv11 = avx256_invsqrt_d(rsq11);
2492 rinv12 = avx256_invsqrt_d(rsq12);
2493 rinv13 = avx256_invsqrt_d(rsq13);
2494 rinv21 = avx256_invsqrt_d(rsq21);
2495 rinv22 = avx256_invsqrt_d(rsq22);
2496 rinv23 = avx256_invsqrt_d(rsq23);
2497 rinv31 = avx256_invsqrt_d(rsq31);
2498 rinv32 = avx256_invsqrt_d(rsq32);
2499 rinv33 = avx256_invsqrt_d(rsq33);
2501 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2502 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2503 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2504 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2505 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2506 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2507 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2508 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2509 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2511 fjx1 = _mm256_setzero_pd();
2512 fjy1 = _mm256_setzero_pd();
2513 fjz1 = _mm256_setzero_pd();
2514 fjx2 = _mm256_setzero_pd();
2515 fjy2 = _mm256_setzero_pd();
2516 fjz2 = _mm256_setzero_pd();
2517 fjx3 = _mm256_setzero_pd();
2518 fjy3 = _mm256_setzero_pd();
2519 fjz3 = _mm256_setzero_pd();
2521 /**************************
2522 * CALCULATE INTERACTIONS *
2523 **************************/
2525 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2528 r11 = _mm256_mul_pd(rsq11,rinv11);
2529 r11 = _mm256_andnot_pd(dummy_mask,r11);
2531 /* EWALD ELECTROSTATICS */
2533 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2534 ewrt = _mm256_mul_pd(r11,ewtabscale);
2535 ewitab = _mm256_cvttpd_epi32(ewrt);
2536 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2537 ewitab = _mm_slli_epi32(ewitab,2);
2538 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2539 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2540 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2541 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2542 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2543 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2544 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2545 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2546 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2548 d = _mm256_sub_pd(r11,rswitch);
2549 d = _mm256_max_pd(d,_mm256_setzero_pd());
2550 d2 = _mm256_mul_pd(d,d);
2551 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2553 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2555 /* Evaluate switch function */
2556 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2557 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2558 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2562 fscal = _mm256_and_pd(fscal,cutoff_mask);
2564 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2566 /* Calculate temporary vectorial force */
2567 tx = _mm256_mul_pd(fscal,dx11);
2568 ty = _mm256_mul_pd(fscal,dy11);
2569 tz = _mm256_mul_pd(fscal,dz11);
2571 /* Update vectorial force */
2572 fix1 = _mm256_add_pd(fix1,tx);
2573 fiy1 = _mm256_add_pd(fiy1,ty);
2574 fiz1 = _mm256_add_pd(fiz1,tz);
2576 fjx1 = _mm256_add_pd(fjx1,tx);
2577 fjy1 = _mm256_add_pd(fjy1,ty);
2578 fjz1 = _mm256_add_pd(fjz1,tz);
2582 /**************************
2583 * CALCULATE INTERACTIONS *
2584 **************************/
2586 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2589 r12 = _mm256_mul_pd(rsq12,rinv12);
2590 r12 = _mm256_andnot_pd(dummy_mask,r12);
2592 /* EWALD ELECTROSTATICS */
2594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2595 ewrt = _mm256_mul_pd(r12,ewtabscale);
2596 ewitab = _mm256_cvttpd_epi32(ewrt);
2597 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2598 ewitab = _mm_slli_epi32(ewitab,2);
2599 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2600 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2601 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2602 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2603 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2604 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2605 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2606 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2607 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2609 d = _mm256_sub_pd(r12,rswitch);
2610 d = _mm256_max_pd(d,_mm256_setzero_pd());
2611 d2 = _mm256_mul_pd(d,d);
2612 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2614 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2616 /* Evaluate switch function */
2617 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2618 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2619 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2623 fscal = _mm256_and_pd(fscal,cutoff_mask);
2625 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2627 /* Calculate temporary vectorial force */
2628 tx = _mm256_mul_pd(fscal,dx12);
2629 ty = _mm256_mul_pd(fscal,dy12);
2630 tz = _mm256_mul_pd(fscal,dz12);
2632 /* Update vectorial force */
2633 fix1 = _mm256_add_pd(fix1,tx);
2634 fiy1 = _mm256_add_pd(fiy1,ty);
2635 fiz1 = _mm256_add_pd(fiz1,tz);
2637 fjx2 = _mm256_add_pd(fjx2,tx);
2638 fjy2 = _mm256_add_pd(fjy2,ty);
2639 fjz2 = _mm256_add_pd(fjz2,tz);
2643 /**************************
2644 * CALCULATE INTERACTIONS *
2645 **************************/
2647 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2650 r13 = _mm256_mul_pd(rsq13,rinv13);
2651 r13 = _mm256_andnot_pd(dummy_mask,r13);
2653 /* EWALD ELECTROSTATICS */
2655 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2656 ewrt = _mm256_mul_pd(r13,ewtabscale);
2657 ewitab = _mm256_cvttpd_epi32(ewrt);
2658 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2659 ewitab = _mm_slli_epi32(ewitab,2);
2660 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2661 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2662 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2663 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2664 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2665 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2666 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2667 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2668 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2670 d = _mm256_sub_pd(r13,rswitch);
2671 d = _mm256_max_pd(d,_mm256_setzero_pd());
2672 d2 = _mm256_mul_pd(d,d);
2673 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2675 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2677 /* Evaluate switch function */
2678 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2679 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2680 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2684 fscal = _mm256_and_pd(fscal,cutoff_mask);
2686 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2688 /* Calculate temporary vectorial force */
2689 tx = _mm256_mul_pd(fscal,dx13);
2690 ty = _mm256_mul_pd(fscal,dy13);
2691 tz = _mm256_mul_pd(fscal,dz13);
2693 /* Update vectorial force */
2694 fix1 = _mm256_add_pd(fix1,tx);
2695 fiy1 = _mm256_add_pd(fiy1,ty);
2696 fiz1 = _mm256_add_pd(fiz1,tz);
2698 fjx3 = _mm256_add_pd(fjx3,tx);
2699 fjy3 = _mm256_add_pd(fjy3,ty);
2700 fjz3 = _mm256_add_pd(fjz3,tz);
2704 /**************************
2705 * CALCULATE INTERACTIONS *
2706 **************************/
2708 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2711 r21 = _mm256_mul_pd(rsq21,rinv21);
2712 r21 = _mm256_andnot_pd(dummy_mask,r21);
2714 /* EWALD ELECTROSTATICS */
2716 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2717 ewrt = _mm256_mul_pd(r21,ewtabscale);
2718 ewitab = _mm256_cvttpd_epi32(ewrt);
2719 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2720 ewitab = _mm_slli_epi32(ewitab,2);
2721 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2722 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2723 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2724 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2725 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2726 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2727 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2728 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2729 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2731 d = _mm256_sub_pd(r21,rswitch);
2732 d = _mm256_max_pd(d,_mm256_setzero_pd());
2733 d2 = _mm256_mul_pd(d,d);
2734 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2736 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2738 /* Evaluate switch function */
2739 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2740 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2741 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2745 fscal = _mm256_and_pd(fscal,cutoff_mask);
2747 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2749 /* Calculate temporary vectorial force */
2750 tx = _mm256_mul_pd(fscal,dx21);
2751 ty = _mm256_mul_pd(fscal,dy21);
2752 tz = _mm256_mul_pd(fscal,dz21);
2754 /* Update vectorial force */
2755 fix2 = _mm256_add_pd(fix2,tx);
2756 fiy2 = _mm256_add_pd(fiy2,ty);
2757 fiz2 = _mm256_add_pd(fiz2,tz);
2759 fjx1 = _mm256_add_pd(fjx1,tx);
2760 fjy1 = _mm256_add_pd(fjy1,ty);
2761 fjz1 = _mm256_add_pd(fjz1,tz);
2765 /**************************
2766 * CALCULATE INTERACTIONS *
2767 **************************/
2769 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2772 r22 = _mm256_mul_pd(rsq22,rinv22);
2773 r22 = _mm256_andnot_pd(dummy_mask,r22);
2775 /* EWALD ELECTROSTATICS */
2777 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2778 ewrt = _mm256_mul_pd(r22,ewtabscale);
2779 ewitab = _mm256_cvttpd_epi32(ewrt);
2780 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2781 ewitab = _mm_slli_epi32(ewitab,2);
2782 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2783 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2784 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2785 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2786 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2787 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2788 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2789 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2790 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2792 d = _mm256_sub_pd(r22,rswitch);
2793 d = _mm256_max_pd(d,_mm256_setzero_pd());
2794 d2 = _mm256_mul_pd(d,d);
2795 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2797 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2799 /* Evaluate switch function */
2800 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2801 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2802 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2806 fscal = _mm256_and_pd(fscal,cutoff_mask);
2808 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2810 /* Calculate temporary vectorial force */
2811 tx = _mm256_mul_pd(fscal,dx22);
2812 ty = _mm256_mul_pd(fscal,dy22);
2813 tz = _mm256_mul_pd(fscal,dz22);
2815 /* Update vectorial force */
2816 fix2 = _mm256_add_pd(fix2,tx);
2817 fiy2 = _mm256_add_pd(fiy2,ty);
2818 fiz2 = _mm256_add_pd(fiz2,tz);
2820 fjx2 = _mm256_add_pd(fjx2,tx);
2821 fjy2 = _mm256_add_pd(fjy2,ty);
2822 fjz2 = _mm256_add_pd(fjz2,tz);
2826 /**************************
2827 * CALCULATE INTERACTIONS *
2828 **************************/
2830 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2833 r23 = _mm256_mul_pd(rsq23,rinv23);
2834 r23 = _mm256_andnot_pd(dummy_mask,r23);
2836 /* EWALD ELECTROSTATICS */
2838 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2839 ewrt = _mm256_mul_pd(r23,ewtabscale);
2840 ewitab = _mm256_cvttpd_epi32(ewrt);
2841 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2842 ewitab = _mm_slli_epi32(ewitab,2);
2843 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2844 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2845 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2846 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2847 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2848 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2849 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2850 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2851 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2853 d = _mm256_sub_pd(r23,rswitch);
2854 d = _mm256_max_pd(d,_mm256_setzero_pd());
2855 d2 = _mm256_mul_pd(d,d);
2856 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2858 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2860 /* Evaluate switch function */
2861 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2862 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2863 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2867 fscal = _mm256_and_pd(fscal,cutoff_mask);
2869 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2871 /* Calculate temporary vectorial force */
2872 tx = _mm256_mul_pd(fscal,dx23);
2873 ty = _mm256_mul_pd(fscal,dy23);
2874 tz = _mm256_mul_pd(fscal,dz23);
2876 /* Update vectorial force */
2877 fix2 = _mm256_add_pd(fix2,tx);
2878 fiy2 = _mm256_add_pd(fiy2,ty);
2879 fiz2 = _mm256_add_pd(fiz2,tz);
2881 fjx3 = _mm256_add_pd(fjx3,tx);
2882 fjy3 = _mm256_add_pd(fjy3,ty);
2883 fjz3 = _mm256_add_pd(fjz3,tz);
2887 /**************************
2888 * CALCULATE INTERACTIONS *
2889 **************************/
2891 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2894 r31 = _mm256_mul_pd(rsq31,rinv31);
2895 r31 = _mm256_andnot_pd(dummy_mask,r31);
2897 /* EWALD ELECTROSTATICS */
2899 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2900 ewrt = _mm256_mul_pd(r31,ewtabscale);
2901 ewitab = _mm256_cvttpd_epi32(ewrt);
2902 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2903 ewitab = _mm_slli_epi32(ewitab,2);
2904 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2905 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2906 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2907 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2908 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2909 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2910 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2911 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2912 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2914 d = _mm256_sub_pd(r31,rswitch);
2915 d = _mm256_max_pd(d,_mm256_setzero_pd());
2916 d2 = _mm256_mul_pd(d,d);
2917 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2919 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2921 /* Evaluate switch function */
2922 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2923 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2924 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2928 fscal = _mm256_and_pd(fscal,cutoff_mask);
2930 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2932 /* Calculate temporary vectorial force */
2933 tx = _mm256_mul_pd(fscal,dx31);
2934 ty = _mm256_mul_pd(fscal,dy31);
2935 tz = _mm256_mul_pd(fscal,dz31);
2937 /* Update vectorial force */
2938 fix3 = _mm256_add_pd(fix3,tx);
2939 fiy3 = _mm256_add_pd(fiy3,ty);
2940 fiz3 = _mm256_add_pd(fiz3,tz);
2942 fjx1 = _mm256_add_pd(fjx1,tx);
2943 fjy1 = _mm256_add_pd(fjy1,ty);
2944 fjz1 = _mm256_add_pd(fjz1,tz);
2948 /**************************
2949 * CALCULATE INTERACTIONS *
2950 **************************/
2952 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2955 r32 = _mm256_mul_pd(rsq32,rinv32);
2956 r32 = _mm256_andnot_pd(dummy_mask,r32);
2958 /* EWALD ELECTROSTATICS */
2960 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2961 ewrt = _mm256_mul_pd(r32,ewtabscale);
2962 ewitab = _mm256_cvttpd_epi32(ewrt);
2963 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2964 ewitab = _mm_slli_epi32(ewitab,2);
2965 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2966 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2967 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2968 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2969 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2970 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2971 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2972 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2973 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2975 d = _mm256_sub_pd(r32,rswitch);
2976 d = _mm256_max_pd(d,_mm256_setzero_pd());
2977 d2 = _mm256_mul_pd(d,d);
2978 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2980 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2982 /* Evaluate switch function */
2983 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2984 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2985 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2989 fscal = _mm256_and_pd(fscal,cutoff_mask);
2991 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2993 /* Calculate temporary vectorial force */
2994 tx = _mm256_mul_pd(fscal,dx32);
2995 ty = _mm256_mul_pd(fscal,dy32);
2996 tz = _mm256_mul_pd(fscal,dz32);
2998 /* Update vectorial force */
2999 fix3 = _mm256_add_pd(fix3,tx);
3000 fiy3 = _mm256_add_pd(fiy3,ty);
3001 fiz3 = _mm256_add_pd(fiz3,tz);
3003 fjx2 = _mm256_add_pd(fjx2,tx);
3004 fjy2 = _mm256_add_pd(fjy2,ty);
3005 fjz2 = _mm256_add_pd(fjz2,tz);
3009 /**************************
3010 * CALCULATE INTERACTIONS *
3011 **************************/
3013 if (gmx_mm256_any_lt(rsq33,rcutoff2))
3016 r33 = _mm256_mul_pd(rsq33,rinv33);
3017 r33 = _mm256_andnot_pd(dummy_mask,r33);
3019 /* EWALD ELECTROSTATICS */
3021 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3022 ewrt = _mm256_mul_pd(r33,ewtabscale);
3023 ewitab = _mm256_cvttpd_epi32(ewrt);
3024 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3025 ewitab = _mm_slli_epi32(ewitab,2);
3026 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3027 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3028 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3029 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3030 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3031 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3032 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3033 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
3034 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
3036 d = _mm256_sub_pd(r33,rswitch);
3037 d = _mm256_max_pd(d,_mm256_setzero_pd());
3038 d2 = _mm256_mul_pd(d,d);
3039 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3041 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3043 /* Evaluate switch function */
3044 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3045 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
3046 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
3050 fscal = _mm256_and_pd(fscal,cutoff_mask);
3052 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3054 /* Calculate temporary vectorial force */
3055 tx = _mm256_mul_pd(fscal,dx33);
3056 ty = _mm256_mul_pd(fscal,dy33);
3057 tz = _mm256_mul_pd(fscal,dz33);
3059 /* Update vectorial force */
3060 fix3 = _mm256_add_pd(fix3,tx);
3061 fiy3 = _mm256_add_pd(fiy3,ty);
3062 fiz3 = _mm256_add_pd(fiz3,tz);
3064 fjx3 = _mm256_add_pd(fjx3,tx);
3065 fjy3 = _mm256_add_pd(fjy3,ty);
3066 fjz3 = _mm256_add_pd(fjz3,tz);
3070 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3071 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3072 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3073 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3075 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3076 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3078 /* Inner loop uses 567 flops */
3081 /* End of innermost loop */
3083 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3084 f+i_coord_offset+DIM,fshift+i_shift_offset);
3086 /* Increment number of inner iterations */
3087 inneriter += j_index_end - j_index_start;
3089 /* Outer loop uses 18 flops */
3092 /* Increment number of outer iterations */
3095 /* Update outer/inner flops */
3097 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567);