2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: None
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr1;
85 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86 real * vdwioffsetptr2;
87 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 real * vdwioffsetptr3;
89 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
95 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
109 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
111 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
112 real rswitch_scalar,d_scalar;
113 __m256d dummy_mask,cutoff_mask;
114 __m128 tmpmask0,tmpmask1;
115 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
116 __m256d one = _mm256_set1_pd(1.0);
117 __m256d two = _mm256_set1_pd(2.0);
123 jindex = nlist->jindex;
125 shiftidx = nlist->shift;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm256_set1_pd(fr->epsfac);
130 charge = mdatoms->chargeA;
132 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
133 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
134 beta2 = _mm256_mul_pd(beta,beta);
135 beta3 = _mm256_mul_pd(beta,beta2);
137 ewtab = fr->ic->tabq_coul_FDV0;
138 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
139 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
144 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
145 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
147 jq1 = _mm256_set1_pd(charge[inr+1]);
148 jq2 = _mm256_set1_pd(charge[inr+2]);
149 jq3 = _mm256_set1_pd(charge[inr+3]);
150 qq11 = _mm256_mul_pd(iq1,jq1);
151 qq12 = _mm256_mul_pd(iq1,jq2);
152 qq13 = _mm256_mul_pd(iq1,jq3);
153 qq21 = _mm256_mul_pd(iq2,jq1);
154 qq22 = _mm256_mul_pd(iq2,jq2);
155 qq23 = _mm256_mul_pd(iq2,jq3);
156 qq31 = _mm256_mul_pd(iq3,jq1);
157 qq32 = _mm256_mul_pd(iq3,jq2);
158 qq33 = _mm256_mul_pd(iq3,jq3);
160 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
161 rcutoff_scalar = fr->rcoulomb;
162 rcutoff = _mm256_set1_pd(rcutoff_scalar);
163 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
165 rswitch_scalar = fr->rcoulomb_switch;
166 rswitch = _mm256_set1_pd(rswitch_scalar);
167 /* Setup switch parameters */
168 d_scalar = rcutoff_scalar-rswitch_scalar;
169 d = _mm256_set1_pd(d_scalar);
170 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
171 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
172 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
173 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
174 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
175 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
177 /* Avoid stupid compiler warnings */
178 jnrA = jnrB = jnrC = jnrD = 0;
187 for(iidx=0;iidx<4*DIM;iidx++)
192 /* Start outer loop over neighborlists */
193 for(iidx=0; iidx<nri; iidx++)
195 /* Load shift vector for this list */
196 i_shift_offset = DIM*shiftidx[iidx];
198 /* Load limits for loop over neighbors */
199 j_index_start = jindex[iidx];
200 j_index_end = jindex[iidx+1];
202 /* Get outer coordinate index */
204 i_coord_offset = DIM*inr;
206 /* Load i particle coords and add shift vector */
207 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
208 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
210 fix1 = _mm256_setzero_pd();
211 fiy1 = _mm256_setzero_pd();
212 fiz1 = _mm256_setzero_pd();
213 fix2 = _mm256_setzero_pd();
214 fiy2 = _mm256_setzero_pd();
215 fiz2 = _mm256_setzero_pd();
216 fix3 = _mm256_setzero_pd();
217 fiy3 = _mm256_setzero_pd();
218 fiz3 = _mm256_setzero_pd();
220 /* Reset potential sums */
221 velecsum = _mm256_setzero_pd();
223 /* Start inner kernel loop */
224 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
227 /* Get j neighbor index, and coordinate index */
232 j_coord_offsetA = DIM*jnrA;
233 j_coord_offsetB = DIM*jnrB;
234 j_coord_offsetC = DIM*jnrC;
235 j_coord_offsetD = DIM*jnrD;
237 /* load j atom coordinates */
238 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
239 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
240 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
242 /* Calculate displacement vector */
243 dx11 = _mm256_sub_pd(ix1,jx1);
244 dy11 = _mm256_sub_pd(iy1,jy1);
245 dz11 = _mm256_sub_pd(iz1,jz1);
246 dx12 = _mm256_sub_pd(ix1,jx2);
247 dy12 = _mm256_sub_pd(iy1,jy2);
248 dz12 = _mm256_sub_pd(iz1,jz2);
249 dx13 = _mm256_sub_pd(ix1,jx3);
250 dy13 = _mm256_sub_pd(iy1,jy3);
251 dz13 = _mm256_sub_pd(iz1,jz3);
252 dx21 = _mm256_sub_pd(ix2,jx1);
253 dy21 = _mm256_sub_pd(iy2,jy1);
254 dz21 = _mm256_sub_pd(iz2,jz1);
255 dx22 = _mm256_sub_pd(ix2,jx2);
256 dy22 = _mm256_sub_pd(iy2,jy2);
257 dz22 = _mm256_sub_pd(iz2,jz2);
258 dx23 = _mm256_sub_pd(ix2,jx3);
259 dy23 = _mm256_sub_pd(iy2,jy3);
260 dz23 = _mm256_sub_pd(iz2,jz3);
261 dx31 = _mm256_sub_pd(ix3,jx1);
262 dy31 = _mm256_sub_pd(iy3,jy1);
263 dz31 = _mm256_sub_pd(iz3,jz1);
264 dx32 = _mm256_sub_pd(ix3,jx2);
265 dy32 = _mm256_sub_pd(iy3,jy2);
266 dz32 = _mm256_sub_pd(iz3,jz2);
267 dx33 = _mm256_sub_pd(ix3,jx3);
268 dy33 = _mm256_sub_pd(iy3,jy3);
269 dz33 = _mm256_sub_pd(iz3,jz3);
271 /* Calculate squared distance and things based on it */
272 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
273 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
274 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
275 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
276 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
277 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
278 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
279 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
280 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
282 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
283 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
284 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
285 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
286 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
287 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
288 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
289 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
290 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
292 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
293 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
294 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
295 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
296 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
297 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
298 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
299 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
300 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
302 fjx1 = _mm256_setzero_pd();
303 fjy1 = _mm256_setzero_pd();
304 fjz1 = _mm256_setzero_pd();
305 fjx2 = _mm256_setzero_pd();
306 fjy2 = _mm256_setzero_pd();
307 fjz2 = _mm256_setzero_pd();
308 fjx3 = _mm256_setzero_pd();
309 fjy3 = _mm256_setzero_pd();
310 fjz3 = _mm256_setzero_pd();
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm256_any_lt(rsq11,rcutoff2))
319 r11 = _mm256_mul_pd(rsq11,rinv11);
321 /* EWALD ELECTROSTATICS */
323 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
324 ewrt = _mm256_mul_pd(r11,ewtabscale);
325 ewitab = _mm256_cvttpd_epi32(ewrt);
326 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
327 ewitab = _mm_slli_epi32(ewitab,2);
328 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
329 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
330 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
331 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
332 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
333 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
334 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
335 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
336 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
338 d = _mm256_sub_pd(r11,rswitch);
339 d = _mm256_max_pd(d,_mm256_setzero_pd());
340 d2 = _mm256_mul_pd(d,d);
341 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
343 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
345 /* Evaluate switch function */
346 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
347 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
348 velec = _mm256_mul_pd(velec,sw);
349 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velec = _mm256_and_pd(velec,cutoff_mask);
353 velecsum = _mm256_add_pd(velecsum,velec);
357 fscal = _mm256_and_pd(fscal,cutoff_mask);
359 /* Calculate temporary vectorial force */
360 tx = _mm256_mul_pd(fscal,dx11);
361 ty = _mm256_mul_pd(fscal,dy11);
362 tz = _mm256_mul_pd(fscal,dz11);
364 /* Update vectorial force */
365 fix1 = _mm256_add_pd(fix1,tx);
366 fiy1 = _mm256_add_pd(fiy1,ty);
367 fiz1 = _mm256_add_pd(fiz1,tz);
369 fjx1 = _mm256_add_pd(fjx1,tx);
370 fjy1 = _mm256_add_pd(fjy1,ty);
371 fjz1 = _mm256_add_pd(fjz1,tz);
375 /**************************
376 * CALCULATE INTERACTIONS *
377 **************************/
379 if (gmx_mm256_any_lt(rsq12,rcutoff2))
382 r12 = _mm256_mul_pd(rsq12,rinv12);
384 /* EWALD ELECTROSTATICS */
386 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
387 ewrt = _mm256_mul_pd(r12,ewtabscale);
388 ewitab = _mm256_cvttpd_epi32(ewrt);
389 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
390 ewitab = _mm_slli_epi32(ewitab,2);
391 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
392 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
393 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
394 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
395 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
396 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
397 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
398 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
399 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
401 d = _mm256_sub_pd(r12,rswitch);
402 d = _mm256_max_pd(d,_mm256_setzero_pd());
403 d2 = _mm256_mul_pd(d,d);
404 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
406 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
408 /* Evaluate switch function */
409 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
410 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
411 velec = _mm256_mul_pd(velec,sw);
412 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
414 /* Update potential sum for this i atom from the interaction with this j atom. */
415 velec = _mm256_and_pd(velec,cutoff_mask);
416 velecsum = _mm256_add_pd(velecsum,velec);
420 fscal = _mm256_and_pd(fscal,cutoff_mask);
422 /* Calculate temporary vectorial force */
423 tx = _mm256_mul_pd(fscal,dx12);
424 ty = _mm256_mul_pd(fscal,dy12);
425 tz = _mm256_mul_pd(fscal,dz12);
427 /* Update vectorial force */
428 fix1 = _mm256_add_pd(fix1,tx);
429 fiy1 = _mm256_add_pd(fiy1,ty);
430 fiz1 = _mm256_add_pd(fiz1,tz);
432 fjx2 = _mm256_add_pd(fjx2,tx);
433 fjy2 = _mm256_add_pd(fjy2,ty);
434 fjz2 = _mm256_add_pd(fjz2,tz);
438 /**************************
439 * CALCULATE INTERACTIONS *
440 **************************/
442 if (gmx_mm256_any_lt(rsq13,rcutoff2))
445 r13 = _mm256_mul_pd(rsq13,rinv13);
447 /* EWALD ELECTROSTATICS */
449 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
450 ewrt = _mm256_mul_pd(r13,ewtabscale);
451 ewitab = _mm256_cvttpd_epi32(ewrt);
452 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
453 ewitab = _mm_slli_epi32(ewitab,2);
454 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
455 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
456 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
457 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
458 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
459 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
460 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
461 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
462 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
464 d = _mm256_sub_pd(r13,rswitch);
465 d = _mm256_max_pd(d,_mm256_setzero_pd());
466 d2 = _mm256_mul_pd(d,d);
467 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
469 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
471 /* Evaluate switch function */
472 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
473 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
474 velec = _mm256_mul_pd(velec,sw);
475 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velec = _mm256_and_pd(velec,cutoff_mask);
479 velecsum = _mm256_add_pd(velecsum,velec);
483 fscal = _mm256_and_pd(fscal,cutoff_mask);
485 /* Calculate temporary vectorial force */
486 tx = _mm256_mul_pd(fscal,dx13);
487 ty = _mm256_mul_pd(fscal,dy13);
488 tz = _mm256_mul_pd(fscal,dz13);
490 /* Update vectorial force */
491 fix1 = _mm256_add_pd(fix1,tx);
492 fiy1 = _mm256_add_pd(fiy1,ty);
493 fiz1 = _mm256_add_pd(fiz1,tz);
495 fjx3 = _mm256_add_pd(fjx3,tx);
496 fjy3 = _mm256_add_pd(fjy3,ty);
497 fjz3 = _mm256_add_pd(fjz3,tz);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 if (gmx_mm256_any_lt(rsq21,rcutoff2))
508 r21 = _mm256_mul_pd(rsq21,rinv21);
510 /* EWALD ELECTROSTATICS */
512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
513 ewrt = _mm256_mul_pd(r21,ewtabscale);
514 ewitab = _mm256_cvttpd_epi32(ewrt);
515 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
516 ewitab = _mm_slli_epi32(ewitab,2);
517 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
518 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
519 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
520 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
521 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
522 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
523 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
524 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
525 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
527 d = _mm256_sub_pd(r21,rswitch);
528 d = _mm256_max_pd(d,_mm256_setzero_pd());
529 d2 = _mm256_mul_pd(d,d);
530 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
532 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
534 /* Evaluate switch function */
535 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
536 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
537 velec = _mm256_mul_pd(velec,sw);
538 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
540 /* Update potential sum for this i atom from the interaction with this j atom. */
541 velec = _mm256_and_pd(velec,cutoff_mask);
542 velecsum = _mm256_add_pd(velecsum,velec);
546 fscal = _mm256_and_pd(fscal,cutoff_mask);
548 /* Calculate temporary vectorial force */
549 tx = _mm256_mul_pd(fscal,dx21);
550 ty = _mm256_mul_pd(fscal,dy21);
551 tz = _mm256_mul_pd(fscal,dz21);
553 /* Update vectorial force */
554 fix2 = _mm256_add_pd(fix2,tx);
555 fiy2 = _mm256_add_pd(fiy2,ty);
556 fiz2 = _mm256_add_pd(fiz2,tz);
558 fjx1 = _mm256_add_pd(fjx1,tx);
559 fjy1 = _mm256_add_pd(fjy1,ty);
560 fjz1 = _mm256_add_pd(fjz1,tz);
564 /**************************
565 * CALCULATE INTERACTIONS *
566 **************************/
568 if (gmx_mm256_any_lt(rsq22,rcutoff2))
571 r22 = _mm256_mul_pd(rsq22,rinv22);
573 /* EWALD ELECTROSTATICS */
575 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
576 ewrt = _mm256_mul_pd(r22,ewtabscale);
577 ewitab = _mm256_cvttpd_epi32(ewrt);
578 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
579 ewitab = _mm_slli_epi32(ewitab,2);
580 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
581 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
582 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
583 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
584 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
585 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
586 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
587 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
588 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
590 d = _mm256_sub_pd(r22,rswitch);
591 d = _mm256_max_pd(d,_mm256_setzero_pd());
592 d2 = _mm256_mul_pd(d,d);
593 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
595 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
597 /* Evaluate switch function */
598 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
599 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
600 velec = _mm256_mul_pd(velec,sw);
601 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
603 /* Update potential sum for this i atom from the interaction with this j atom. */
604 velec = _mm256_and_pd(velec,cutoff_mask);
605 velecsum = _mm256_add_pd(velecsum,velec);
609 fscal = _mm256_and_pd(fscal,cutoff_mask);
611 /* Calculate temporary vectorial force */
612 tx = _mm256_mul_pd(fscal,dx22);
613 ty = _mm256_mul_pd(fscal,dy22);
614 tz = _mm256_mul_pd(fscal,dz22);
616 /* Update vectorial force */
617 fix2 = _mm256_add_pd(fix2,tx);
618 fiy2 = _mm256_add_pd(fiy2,ty);
619 fiz2 = _mm256_add_pd(fiz2,tz);
621 fjx2 = _mm256_add_pd(fjx2,tx);
622 fjy2 = _mm256_add_pd(fjy2,ty);
623 fjz2 = _mm256_add_pd(fjz2,tz);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 if (gmx_mm256_any_lt(rsq23,rcutoff2))
634 r23 = _mm256_mul_pd(rsq23,rinv23);
636 /* EWALD ELECTROSTATICS */
638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
639 ewrt = _mm256_mul_pd(r23,ewtabscale);
640 ewitab = _mm256_cvttpd_epi32(ewrt);
641 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
642 ewitab = _mm_slli_epi32(ewitab,2);
643 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
644 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
645 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
646 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
647 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
648 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
649 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
650 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
651 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
653 d = _mm256_sub_pd(r23,rswitch);
654 d = _mm256_max_pd(d,_mm256_setzero_pd());
655 d2 = _mm256_mul_pd(d,d);
656 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
658 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
660 /* Evaluate switch function */
661 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
662 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
663 velec = _mm256_mul_pd(velec,sw);
664 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
666 /* Update potential sum for this i atom from the interaction with this j atom. */
667 velec = _mm256_and_pd(velec,cutoff_mask);
668 velecsum = _mm256_add_pd(velecsum,velec);
672 fscal = _mm256_and_pd(fscal,cutoff_mask);
674 /* Calculate temporary vectorial force */
675 tx = _mm256_mul_pd(fscal,dx23);
676 ty = _mm256_mul_pd(fscal,dy23);
677 tz = _mm256_mul_pd(fscal,dz23);
679 /* Update vectorial force */
680 fix2 = _mm256_add_pd(fix2,tx);
681 fiy2 = _mm256_add_pd(fiy2,ty);
682 fiz2 = _mm256_add_pd(fiz2,tz);
684 fjx3 = _mm256_add_pd(fjx3,tx);
685 fjy3 = _mm256_add_pd(fjy3,ty);
686 fjz3 = _mm256_add_pd(fjz3,tz);
690 /**************************
691 * CALCULATE INTERACTIONS *
692 **************************/
694 if (gmx_mm256_any_lt(rsq31,rcutoff2))
697 r31 = _mm256_mul_pd(rsq31,rinv31);
699 /* EWALD ELECTROSTATICS */
701 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
702 ewrt = _mm256_mul_pd(r31,ewtabscale);
703 ewitab = _mm256_cvttpd_epi32(ewrt);
704 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
705 ewitab = _mm_slli_epi32(ewitab,2);
706 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
707 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
708 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
709 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
710 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
711 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
712 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
713 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
714 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
716 d = _mm256_sub_pd(r31,rswitch);
717 d = _mm256_max_pd(d,_mm256_setzero_pd());
718 d2 = _mm256_mul_pd(d,d);
719 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
721 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
723 /* Evaluate switch function */
724 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
725 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
726 velec = _mm256_mul_pd(velec,sw);
727 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
729 /* Update potential sum for this i atom from the interaction with this j atom. */
730 velec = _mm256_and_pd(velec,cutoff_mask);
731 velecsum = _mm256_add_pd(velecsum,velec);
735 fscal = _mm256_and_pd(fscal,cutoff_mask);
737 /* Calculate temporary vectorial force */
738 tx = _mm256_mul_pd(fscal,dx31);
739 ty = _mm256_mul_pd(fscal,dy31);
740 tz = _mm256_mul_pd(fscal,dz31);
742 /* Update vectorial force */
743 fix3 = _mm256_add_pd(fix3,tx);
744 fiy3 = _mm256_add_pd(fiy3,ty);
745 fiz3 = _mm256_add_pd(fiz3,tz);
747 fjx1 = _mm256_add_pd(fjx1,tx);
748 fjy1 = _mm256_add_pd(fjy1,ty);
749 fjz1 = _mm256_add_pd(fjz1,tz);
753 /**************************
754 * CALCULATE INTERACTIONS *
755 **************************/
757 if (gmx_mm256_any_lt(rsq32,rcutoff2))
760 r32 = _mm256_mul_pd(rsq32,rinv32);
762 /* EWALD ELECTROSTATICS */
764 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
765 ewrt = _mm256_mul_pd(r32,ewtabscale);
766 ewitab = _mm256_cvttpd_epi32(ewrt);
767 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
768 ewitab = _mm_slli_epi32(ewitab,2);
769 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
770 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
771 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
772 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
773 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
774 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
775 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
776 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
777 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
779 d = _mm256_sub_pd(r32,rswitch);
780 d = _mm256_max_pd(d,_mm256_setzero_pd());
781 d2 = _mm256_mul_pd(d,d);
782 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
784 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
786 /* Evaluate switch function */
787 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
788 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
789 velec = _mm256_mul_pd(velec,sw);
790 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
792 /* Update potential sum for this i atom from the interaction with this j atom. */
793 velec = _mm256_and_pd(velec,cutoff_mask);
794 velecsum = _mm256_add_pd(velecsum,velec);
798 fscal = _mm256_and_pd(fscal,cutoff_mask);
800 /* Calculate temporary vectorial force */
801 tx = _mm256_mul_pd(fscal,dx32);
802 ty = _mm256_mul_pd(fscal,dy32);
803 tz = _mm256_mul_pd(fscal,dz32);
805 /* Update vectorial force */
806 fix3 = _mm256_add_pd(fix3,tx);
807 fiy3 = _mm256_add_pd(fiy3,ty);
808 fiz3 = _mm256_add_pd(fiz3,tz);
810 fjx2 = _mm256_add_pd(fjx2,tx);
811 fjy2 = _mm256_add_pd(fjy2,ty);
812 fjz2 = _mm256_add_pd(fjz2,tz);
816 /**************************
817 * CALCULATE INTERACTIONS *
818 **************************/
820 if (gmx_mm256_any_lt(rsq33,rcutoff2))
823 r33 = _mm256_mul_pd(rsq33,rinv33);
825 /* EWALD ELECTROSTATICS */
827 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
828 ewrt = _mm256_mul_pd(r33,ewtabscale);
829 ewitab = _mm256_cvttpd_epi32(ewrt);
830 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
831 ewitab = _mm_slli_epi32(ewitab,2);
832 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
833 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
834 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
835 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
836 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
837 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
838 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
839 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
840 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
842 d = _mm256_sub_pd(r33,rswitch);
843 d = _mm256_max_pd(d,_mm256_setzero_pd());
844 d2 = _mm256_mul_pd(d,d);
845 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
847 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
849 /* Evaluate switch function */
850 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
851 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
852 velec = _mm256_mul_pd(velec,sw);
853 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
855 /* Update potential sum for this i atom from the interaction with this j atom. */
856 velec = _mm256_and_pd(velec,cutoff_mask);
857 velecsum = _mm256_add_pd(velecsum,velec);
861 fscal = _mm256_and_pd(fscal,cutoff_mask);
863 /* Calculate temporary vectorial force */
864 tx = _mm256_mul_pd(fscal,dx33);
865 ty = _mm256_mul_pd(fscal,dy33);
866 tz = _mm256_mul_pd(fscal,dz33);
868 /* Update vectorial force */
869 fix3 = _mm256_add_pd(fix3,tx);
870 fiy3 = _mm256_add_pd(fiy3,ty);
871 fiz3 = _mm256_add_pd(fiz3,tz);
873 fjx3 = _mm256_add_pd(fjx3,tx);
874 fjy3 = _mm256_add_pd(fjy3,ty);
875 fjz3 = _mm256_add_pd(fjz3,tz);
879 fjptrA = f+j_coord_offsetA;
880 fjptrB = f+j_coord_offsetB;
881 fjptrC = f+j_coord_offsetC;
882 fjptrD = f+j_coord_offsetD;
884 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
885 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
887 /* Inner loop uses 585 flops */
893 /* Get j neighbor index, and coordinate index */
894 jnrlistA = jjnr[jidx];
895 jnrlistB = jjnr[jidx+1];
896 jnrlistC = jjnr[jidx+2];
897 jnrlistD = jjnr[jidx+3];
898 /* Sign of each element will be negative for non-real atoms.
899 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
900 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
902 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
904 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
905 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
906 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
908 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
909 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
910 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
911 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
912 j_coord_offsetA = DIM*jnrA;
913 j_coord_offsetB = DIM*jnrB;
914 j_coord_offsetC = DIM*jnrC;
915 j_coord_offsetD = DIM*jnrD;
917 /* load j atom coordinates */
918 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
919 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
920 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
922 /* Calculate displacement vector */
923 dx11 = _mm256_sub_pd(ix1,jx1);
924 dy11 = _mm256_sub_pd(iy1,jy1);
925 dz11 = _mm256_sub_pd(iz1,jz1);
926 dx12 = _mm256_sub_pd(ix1,jx2);
927 dy12 = _mm256_sub_pd(iy1,jy2);
928 dz12 = _mm256_sub_pd(iz1,jz2);
929 dx13 = _mm256_sub_pd(ix1,jx3);
930 dy13 = _mm256_sub_pd(iy1,jy3);
931 dz13 = _mm256_sub_pd(iz1,jz3);
932 dx21 = _mm256_sub_pd(ix2,jx1);
933 dy21 = _mm256_sub_pd(iy2,jy1);
934 dz21 = _mm256_sub_pd(iz2,jz1);
935 dx22 = _mm256_sub_pd(ix2,jx2);
936 dy22 = _mm256_sub_pd(iy2,jy2);
937 dz22 = _mm256_sub_pd(iz2,jz2);
938 dx23 = _mm256_sub_pd(ix2,jx3);
939 dy23 = _mm256_sub_pd(iy2,jy3);
940 dz23 = _mm256_sub_pd(iz2,jz3);
941 dx31 = _mm256_sub_pd(ix3,jx1);
942 dy31 = _mm256_sub_pd(iy3,jy1);
943 dz31 = _mm256_sub_pd(iz3,jz1);
944 dx32 = _mm256_sub_pd(ix3,jx2);
945 dy32 = _mm256_sub_pd(iy3,jy2);
946 dz32 = _mm256_sub_pd(iz3,jz2);
947 dx33 = _mm256_sub_pd(ix3,jx3);
948 dy33 = _mm256_sub_pd(iy3,jy3);
949 dz33 = _mm256_sub_pd(iz3,jz3);
951 /* Calculate squared distance and things based on it */
952 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
953 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
954 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
955 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
956 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
957 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
958 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
959 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
960 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
962 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
963 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
964 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
965 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
966 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
967 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
968 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
969 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
970 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
972 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
973 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
974 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
975 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
976 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
977 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
978 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
979 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
980 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
982 fjx1 = _mm256_setzero_pd();
983 fjy1 = _mm256_setzero_pd();
984 fjz1 = _mm256_setzero_pd();
985 fjx2 = _mm256_setzero_pd();
986 fjy2 = _mm256_setzero_pd();
987 fjz2 = _mm256_setzero_pd();
988 fjx3 = _mm256_setzero_pd();
989 fjy3 = _mm256_setzero_pd();
990 fjz3 = _mm256_setzero_pd();
992 /**************************
993 * CALCULATE INTERACTIONS *
994 **************************/
996 if (gmx_mm256_any_lt(rsq11,rcutoff2))
999 r11 = _mm256_mul_pd(rsq11,rinv11);
1000 r11 = _mm256_andnot_pd(dummy_mask,r11);
1002 /* EWALD ELECTROSTATICS */
1004 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1005 ewrt = _mm256_mul_pd(r11,ewtabscale);
1006 ewitab = _mm256_cvttpd_epi32(ewrt);
1007 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1008 ewitab = _mm_slli_epi32(ewitab,2);
1009 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1010 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1011 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1012 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1013 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1014 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1015 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1016 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1017 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1019 d = _mm256_sub_pd(r11,rswitch);
1020 d = _mm256_max_pd(d,_mm256_setzero_pd());
1021 d2 = _mm256_mul_pd(d,d);
1022 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1024 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1026 /* Evaluate switch function */
1027 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1028 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1029 velec = _mm256_mul_pd(velec,sw);
1030 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1032 /* Update potential sum for this i atom from the interaction with this j atom. */
1033 velec = _mm256_and_pd(velec,cutoff_mask);
1034 velec = _mm256_andnot_pd(dummy_mask,velec);
1035 velecsum = _mm256_add_pd(velecsum,velec);
1039 fscal = _mm256_and_pd(fscal,cutoff_mask);
1041 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1043 /* Calculate temporary vectorial force */
1044 tx = _mm256_mul_pd(fscal,dx11);
1045 ty = _mm256_mul_pd(fscal,dy11);
1046 tz = _mm256_mul_pd(fscal,dz11);
1048 /* Update vectorial force */
1049 fix1 = _mm256_add_pd(fix1,tx);
1050 fiy1 = _mm256_add_pd(fiy1,ty);
1051 fiz1 = _mm256_add_pd(fiz1,tz);
1053 fjx1 = _mm256_add_pd(fjx1,tx);
1054 fjy1 = _mm256_add_pd(fjy1,ty);
1055 fjz1 = _mm256_add_pd(fjz1,tz);
1059 /**************************
1060 * CALCULATE INTERACTIONS *
1061 **************************/
1063 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1066 r12 = _mm256_mul_pd(rsq12,rinv12);
1067 r12 = _mm256_andnot_pd(dummy_mask,r12);
1069 /* EWALD ELECTROSTATICS */
1071 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1072 ewrt = _mm256_mul_pd(r12,ewtabscale);
1073 ewitab = _mm256_cvttpd_epi32(ewrt);
1074 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1075 ewitab = _mm_slli_epi32(ewitab,2);
1076 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1077 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1078 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1079 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1080 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1081 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1082 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1083 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1084 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1086 d = _mm256_sub_pd(r12,rswitch);
1087 d = _mm256_max_pd(d,_mm256_setzero_pd());
1088 d2 = _mm256_mul_pd(d,d);
1089 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1091 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1093 /* Evaluate switch function */
1094 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1095 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1096 velec = _mm256_mul_pd(velec,sw);
1097 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1099 /* Update potential sum for this i atom from the interaction with this j atom. */
1100 velec = _mm256_and_pd(velec,cutoff_mask);
1101 velec = _mm256_andnot_pd(dummy_mask,velec);
1102 velecsum = _mm256_add_pd(velecsum,velec);
1106 fscal = _mm256_and_pd(fscal,cutoff_mask);
1108 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1110 /* Calculate temporary vectorial force */
1111 tx = _mm256_mul_pd(fscal,dx12);
1112 ty = _mm256_mul_pd(fscal,dy12);
1113 tz = _mm256_mul_pd(fscal,dz12);
1115 /* Update vectorial force */
1116 fix1 = _mm256_add_pd(fix1,tx);
1117 fiy1 = _mm256_add_pd(fiy1,ty);
1118 fiz1 = _mm256_add_pd(fiz1,tz);
1120 fjx2 = _mm256_add_pd(fjx2,tx);
1121 fjy2 = _mm256_add_pd(fjy2,ty);
1122 fjz2 = _mm256_add_pd(fjz2,tz);
1126 /**************************
1127 * CALCULATE INTERACTIONS *
1128 **************************/
1130 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1133 r13 = _mm256_mul_pd(rsq13,rinv13);
1134 r13 = _mm256_andnot_pd(dummy_mask,r13);
1136 /* EWALD ELECTROSTATICS */
1138 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1139 ewrt = _mm256_mul_pd(r13,ewtabscale);
1140 ewitab = _mm256_cvttpd_epi32(ewrt);
1141 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1142 ewitab = _mm_slli_epi32(ewitab,2);
1143 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1144 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1145 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1146 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1147 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1148 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1149 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1150 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1151 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1153 d = _mm256_sub_pd(r13,rswitch);
1154 d = _mm256_max_pd(d,_mm256_setzero_pd());
1155 d2 = _mm256_mul_pd(d,d);
1156 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1158 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1160 /* Evaluate switch function */
1161 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1162 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
1163 velec = _mm256_mul_pd(velec,sw);
1164 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1166 /* Update potential sum for this i atom from the interaction with this j atom. */
1167 velec = _mm256_and_pd(velec,cutoff_mask);
1168 velec = _mm256_andnot_pd(dummy_mask,velec);
1169 velecsum = _mm256_add_pd(velecsum,velec);
1173 fscal = _mm256_and_pd(fscal,cutoff_mask);
1175 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1177 /* Calculate temporary vectorial force */
1178 tx = _mm256_mul_pd(fscal,dx13);
1179 ty = _mm256_mul_pd(fscal,dy13);
1180 tz = _mm256_mul_pd(fscal,dz13);
1182 /* Update vectorial force */
1183 fix1 = _mm256_add_pd(fix1,tx);
1184 fiy1 = _mm256_add_pd(fiy1,ty);
1185 fiz1 = _mm256_add_pd(fiz1,tz);
1187 fjx3 = _mm256_add_pd(fjx3,tx);
1188 fjy3 = _mm256_add_pd(fjy3,ty);
1189 fjz3 = _mm256_add_pd(fjz3,tz);
1193 /**************************
1194 * CALCULATE INTERACTIONS *
1195 **************************/
1197 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1200 r21 = _mm256_mul_pd(rsq21,rinv21);
1201 r21 = _mm256_andnot_pd(dummy_mask,r21);
1203 /* EWALD ELECTROSTATICS */
1205 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1206 ewrt = _mm256_mul_pd(r21,ewtabscale);
1207 ewitab = _mm256_cvttpd_epi32(ewrt);
1208 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1209 ewitab = _mm_slli_epi32(ewitab,2);
1210 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1211 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1212 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1213 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1214 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1215 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1216 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1217 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1218 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1220 d = _mm256_sub_pd(r21,rswitch);
1221 d = _mm256_max_pd(d,_mm256_setzero_pd());
1222 d2 = _mm256_mul_pd(d,d);
1223 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1225 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1227 /* Evaluate switch function */
1228 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1229 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1230 velec = _mm256_mul_pd(velec,sw);
1231 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1233 /* Update potential sum for this i atom from the interaction with this j atom. */
1234 velec = _mm256_and_pd(velec,cutoff_mask);
1235 velec = _mm256_andnot_pd(dummy_mask,velec);
1236 velecsum = _mm256_add_pd(velecsum,velec);
1240 fscal = _mm256_and_pd(fscal,cutoff_mask);
1242 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1244 /* Calculate temporary vectorial force */
1245 tx = _mm256_mul_pd(fscal,dx21);
1246 ty = _mm256_mul_pd(fscal,dy21);
1247 tz = _mm256_mul_pd(fscal,dz21);
1249 /* Update vectorial force */
1250 fix2 = _mm256_add_pd(fix2,tx);
1251 fiy2 = _mm256_add_pd(fiy2,ty);
1252 fiz2 = _mm256_add_pd(fiz2,tz);
1254 fjx1 = _mm256_add_pd(fjx1,tx);
1255 fjy1 = _mm256_add_pd(fjy1,ty);
1256 fjz1 = _mm256_add_pd(fjz1,tz);
1260 /**************************
1261 * CALCULATE INTERACTIONS *
1262 **************************/
1264 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1267 r22 = _mm256_mul_pd(rsq22,rinv22);
1268 r22 = _mm256_andnot_pd(dummy_mask,r22);
1270 /* EWALD ELECTROSTATICS */
1272 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1273 ewrt = _mm256_mul_pd(r22,ewtabscale);
1274 ewitab = _mm256_cvttpd_epi32(ewrt);
1275 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1276 ewitab = _mm_slli_epi32(ewitab,2);
1277 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1278 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1279 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1280 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1281 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1282 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1283 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1284 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1285 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1287 d = _mm256_sub_pd(r22,rswitch);
1288 d = _mm256_max_pd(d,_mm256_setzero_pd());
1289 d2 = _mm256_mul_pd(d,d);
1290 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1292 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1294 /* Evaluate switch function */
1295 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1296 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1297 velec = _mm256_mul_pd(velec,sw);
1298 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1300 /* Update potential sum for this i atom from the interaction with this j atom. */
1301 velec = _mm256_and_pd(velec,cutoff_mask);
1302 velec = _mm256_andnot_pd(dummy_mask,velec);
1303 velecsum = _mm256_add_pd(velecsum,velec);
1307 fscal = _mm256_and_pd(fscal,cutoff_mask);
1309 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1311 /* Calculate temporary vectorial force */
1312 tx = _mm256_mul_pd(fscal,dx22);
1313 ty = _mm256_mul_pd(fscal,dy22);
1314 tz = _mm256_mul_pd(fscal,dz22);
1316 /* Update vectorial force */
1317 fix2 = _mm256_add_pd(fix2,tx);
1318 fiy2 = _mm256_add_pd(fiy2,ty);
1319 fiz2 = _mm256_add_pd(fiz2,tz);
1321 fjx2 = _mm256_add_pd(fjx2,tx);
1322 fjy2 = _mm256_add_pd(fjy2,ty);
1323 fjz2 = _mm256_add_pd(fjz2,tz);
1327 /**************************
1328 * CALCULATE INTERACTIONS *
1329 **************************/
1331 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1334 r23 = _mm256_mul_pd(rsq23,rinv23);
1335 r23 = _mm256_andnot_pd(dummy_mask,r23);
1337 /* EWALD ELECTROSTATICS */
1339 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1340 ewrt = _mm256_mul_pd(r23,ewtabscale);
1341 ewitab = _mm256_cvttpd_epi32(ewrt);
1342 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1343 ewitab = _mm_slli_epi32(ewitab,2);
1344 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1345 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1346 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1347 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1348 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1349 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1350 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1351 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1352 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1354 d = _mm256_sub_pd(r23,rswitch);
1355 d = _mm256_max_pd(d,_mm256_setzero_pd());
1356 d2 = _mm256_mul_pd(d,d);
1357 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1359 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1361 /* Evaluate switch function */
1362 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1363 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
1364 velec = _mm256_mul_pd(velec,sw);
1365 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1367 /* Update potential sum for this i atom from the interaction with this j atom. */
1368 velec = _mm256_and_pd(velec,cutoff_mask);
1369 velec = _mm256_andnot_pd(dummy_mask,velec);
1370 velecsum = _mm256_add_pd(velecsum,velec);
1374 fscal = _mm256_and_pd(fscal,cutoff_mask);
1376 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1378 /* Calculate temporary vectorial force */
1379 tx = _mm256_mul_pd(fscal,dx23);
1380 ty = _mm256_mul_pd(fscal,dy23);
1381 tz = _mm256_mul_pd(fscal,dz23);
1383 /* Update vectorial force */
1384 fix2 = _mm256_add_pd(fix2,tx);
1385 fiy2 = _mm256_add_pd(fiy2,ty);
1386 fiz2 = _mm256_add_pd(fiz2,tz);
1388 fjx3 = _mm256_add_pd(fjx3,tx);
1389 fjy3 = _mm256_add_pd(fjy3,ty);
1390 fjz3 = _mm256_add_pd(fjz3,tz);
1394 /**************************
1395 * CALCULATE INTERACTIONS *
1396 **************************/
1398 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1401 r31 = _mm256_mul_pd(rsq31,rinv31);
1402 r31 = _mm256_andnot_pd(dummy_mask,r31);
1404 /* EWALD ELECTROSTATICS */
1406 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1407 ewrt = _mm256_mul_pd(r31,ewtabscale);
1408 ewitab = _mm256_cvttpd_epi32(ewrt);
1409 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1410 ewitab = _mm_slli_epi32(ewitab,2);
1411 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1412 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1413 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1414 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1415 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1416 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1417 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1418 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1419 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1421 d = _mm256_sub_pd(r31,rswitch);
1422 d = _mm256_max_pd(d,_mm256_setzero_pd());
1423 d2 = _mm256_mul_pd(d,d);
1424 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1426 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1428 /* Evaluate switch function */
1429 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1430 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
1431 velec = _mm256_mul_pd(velec,sw);
1432 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1434 /* Update potential sum for this i atom from the interaction with this j atom. */
1435 velec = _mm256_and_pd(velec,cutoff_mask);
1436 velec = _mm256_andnot_pd(dummy_mask,velec);
1437 velecsum = _mm256_add_pd(velecsum,velec);
1441 fscal = _mm256_and_pd(fscal,cutoff_mask);
1443 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1445 /* Calculate temporary vectorial force */
1446 tx = _mm256_mul_pd(fscal,dx31);
1447 ty = _mm256_mul_pd(fscal,dy31);
1448 tz = _mm256_mul_pd(fscal,dz31);
1450 /* Update vectorial force */
1451 fix3 = _mm256_add_pd(fix3,tx);
1452 fiy3 = _mm256_add_pd(fiy3,ty);
1453 fiz3 = _mm256_add_pd(fiz3,tz);
1455 fjx1 = _mm256_add_pd(fjx1,tx);
1456 fjy1 = _mm256_add_pd(fjy1,ty);
1457 fjz1 = _mm256_add_pd(fjz1,tz);
1461 /**************************
1462 * CALCULATE INTERACTIONS *
1463 **************************/
1465 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1468 r32 = _mm256_mul_pd(rsq32,rinv32);
1469 r32 = _mm256_andnot_pd(dummy_mask,r32);
1471 /* EWALD ELECTROSTATICS */
1473 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1474 ewrt = _mm256_mul_pd(r32,ewtabscale);
1475 ewitab = _mm256_cvttpd_epi32(ewrt);
1476 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1477 ewitab = _mm_slli_epi32(ewitab,2);
1478 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1479 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1480 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1481 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1482 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1483 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1484 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1485 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1486 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1488 d = _mm256_sub_pd(r32,rswitch);
1489 d = _mm256_max_pd(d,_mm256_setzero_pd());
1490 d2 = _mm256_mul_pd(d,d);
1491 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1493 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1495 /* Evaluate switch function */
1496 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1497 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
1498 velec = _mm256_mul_pd(velec,sw);
1499 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1501 /* Update potential sum for this i atom from the interaction with this j atom. */
1502 velec = _mm256_and_pd(velec,cutoff_mask);
1503 velec = _mm256_andnot_pd(dummy_mask,velec);
1504 velecsum = _mm256_add_pd(velecsum,velec);
1508 fscal = _mm256_and_pd(fscal,cutoff_mask);
1510 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1512 /* Calculate temporary vectorial force */
1513 tx = _mm256_mul_pd(fscal,dx32);
1514 ty = _mm256_mul_pd(fscal,dy32);
1515 tz = _mm256_mul_pd(fscal,dz32);
1517 /* Update vectorial force */
1518 fix3 = _mm256_add_pd(fix3,tx);
1519 fiy3 = _mm256_add_pd(fiy3,ty);
1520 fiz3 = _mm256_add_pd(fiz3,tz);
1522 fjx2 = _mm256_add_pd(fjx2,tx);
1523 fjy2 = _mm256_add_pd(fjy2,ty);
1524 fjz2 = _mm256_add_pd(fjz2,tz);
1528 /**************************
1529 * CALCULATE INTERACTIONS *
1530 **************************/
1532 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1535 r33 = _mm256_mul_pd(rsq33,rinv33);
1536 r33 = _mm256_andnot_pd(dummy_mask,r33);
1538 /* EWALD ELECTROSTATICS */
1540 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1541 ewrt = _mm256_mul_pd(r33,ewtabscale);
1542 ewitab = _mm256_cvttpd_epi32(ewrt);
1543 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1544 ewitab = _mm_slli_epi32(ewitab,2);
1545 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1546 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1547 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1548 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1549 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1550 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1551 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1552 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1553 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1555 d = _mm256_sub_pd(r33,rswitch);
1556 d = _mm256_max_pd(d,_mm256_setzero_pd());
1557 d2 = _mm256_mul_pd(d,d);
1558 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1560 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1562 /* Evaluate switch function */
1563 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1564 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
1565 velec = _mm256_mul_pd(velec,sw);
1566 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1568 /* Update potential sum for this i atom from the interaction with this j atom. */
1569 velec = _mm256_and_pd(velec,cutoff_mask);
1570 velec = _mm256_andnot_pd(dummy_mask,velec);
1571 velecsum = _mm256_add_pd(velecsum,velec);
1575 fscal = _mm256_and_pd(fscal,cutoff_mask);
1577 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1579 /* Calculate temporary vectorial force */
1580 tx = _mm256_mul_pd(fscal,dx33);
1581 ty = _mm256_mul_pd(fscal,dy33);
1582 tz = _mm256_mul_pd(fscal,dz33);
1584 /* Update vectorial force */
1585 fix3 = _mm256_add_pd(fix3,tx);
1586 fiy3 = _mm256_add_pd(fiy3,ty);
1587 fiz3 = _mm256_add_pd(fiz3,tz);
1589 fjx3 = _mm256_add_pd(fjx3,tx);
1590 fjy3 = _mm256_add_pd(fjy3,ty);
1591 fjz3 = _mm256_add_pd(fjz3,tz);
1595 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1596 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1597 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1598 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1600 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1601 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1603 /* Inner loop uses 594 flops */
1606 /* End of innermost loop */
1608 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1609 f+i_coord_offset+DIM,fshift+i_shift_offset);
1612 /* Update potential energies */
1613 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1615 /* Increment number of inner iterations */
1616 inneriter += j_index_end - j_index_start;
1618 /* Outer loop uses 19 flops */
1621 /* Increment number of outer iterations */
1624 /* Update outer/inner flops */
1626 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594);
1629 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double
1630 * Electrostatics interaction: Ewald
1631 * VdW interaction: None
1632 * Geometry: Water4-Water4
1633 * Calculate force/pot: Force
1636 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_double
1637 (t_nblist * gmx_restrict nlist,
1638 rvec * gmx_restrict xx,
1639 rvec * gmx_restrict ff,
1640 t_forcerec * gmx_restrict fr,
1641 t_mdatoms * gmx_restrict mdatoms,
1642 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1643 t_nrnb * gmx_restrict nrnb)
1645 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1646 * just 0 for non-waters.
1647 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1648 * jnr indices corresponding to data put in the four positions in the SIMD register.
1650 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1651 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1652 int jnrA,jnrB,jnrC,jnrD;
1653 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1654 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1655 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1656 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1657 real rcutoff_scalar;
1658 real *shiftvec,*fshift,*x,*f;
1659 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1660 real scratch[4*DIM];
1661 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1662 real * vdwioffsetptr1;
1663 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1664 real * vdwioffsetptr2;
1665 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1666 real * vdwioffsetptr3;
1667 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1668 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1669 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1670 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1671 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1672 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1673 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1674 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1675 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1676 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1677 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1678 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1679 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1680 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1681 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1682 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1683 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1686 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1687 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1689 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1690 real rswitch_scalar,d_scalar;
1691 __m256d dummy_mask,cutoff_mask;
1692 __m128 tmpmask0,tmpmask1;
1693 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1694 __m256d one = _mm256_set1_pd(1.0);
1695 __m256d two = _mm256_set1_pd(2.0);
1701 jindex = nlist->jindex;
1703 shiftidx = nlist->shift;
1705 shiftvec = fr->shift_vec[0];
1706 fshift = fr->fshift[0];
1707 facel = _mm256_set1_pd(fr->epsfac);
1708 charge = mdatoms->chargeA;
1710 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1711 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1712 beta2 = _mm256_mul_pd(beta,beta);
1713 beta3 = _mm256_mul_pd(beta,beta2);
1715 ewtab = fr->ic->tabq_coul_FDV0;
1716 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1717 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1719 /* Setup water-specific parameters */
1720 inr = nlist->iinr[0];
1721 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1722 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1723 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1725 jq1 = _mm256_set1_pd(charge[inr+1]);
1726 jq2 = _mm256_set1_pd(charge[inr+2]);
1727 jq3 = _mm256_set1_pd(charge[inr+3]);
1728 qq11 = _mm256_mul_pd(iq1,jq1);
1729 qq12 = _mm256_mul_pd(iq1,jq2);
1730 qq13 = _mm256_mul_pd(iq1,jq3);
1731 qq21 = _mm256_mul_pd(iq2,jq1);
1732 qq22 = _mm256_mul_pd(iq2,jq2);
1733 qq23 = _mm256_mul_pd(iq2,jq3);
1734 qq31 = _mm256_mul_pd(iq3,jq1);
1735 qq32 = _mm256_mul_pd(iq3,jq2);
1736 qq33 = _mm256_mul_pd(iq3,jq3);
1738 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1739 rcutoff_scalar = fr->rcoulomb;
1740 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1741 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1743 rswitch_scalar = fr->rcoulomb_switch;
1744 rswitch = _mm256_set1_pd(rswitch_scalar);
1745 /* Setup switch parameters */
1746 d_scalar = rcutoff_scalar-rswitch_scalar;
1747 d = _mm256_set1_pd(d_scalar);
1748 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1749 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1750 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1751 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1752 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1753 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1755 /* Avoid stupid compiler warnings */
1756 jnrA = jnrB = jnrC = jnrD = 0;
1757 j_coord_offsetA = 0;
1758 j_coord_offsetB = 0;
1759 j_coord_offsetC = 0;
1760 j_coord_offsetD = 0;
1765 for(iidx=0;iidx<4*DIM;iidx++)
1767 scratch[iidx] = 0.0;
1770 /* Start outer loop over neighborlists */
1771 for(iidx=0; iidx<nri; iidx++)
1773 /* Load shift vector for this list */
1774 i_shift_offset = DIM*shiftidx[iidx];
1776 /* Load limits for loop over neighbors */
1777 j_index_start = jindex[iidx];
1778 j_index_end = jindex[iidx+1];
1780 /* Get outer coordinate index */
1782 i_coord_offset = DIM*inr;
1784 /* Load i particle coords and add shift vector */
1785 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1786 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1788 fix1 = _mm256_setzero_pd();
1789 fiy1 = _mm256_setzero_pd();
1790 fiz1 = _mm256_setzero_pd();
1791 fix2 = _mm256_setzero_pd();
1792 fiy2 = _mm256_setzero_pd();
1793 fiz2 = _mm256_setzero_pd();
1794 fix3 = _mm256_setzero_pd();
1795 fiy3 = _mm256_setzero_pd();
1796 fiz3 = _mm256_setzero_pd();
1798 /* Start inner kernel loop */
1799 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1802 /* Get j neighbor index, and coordinate index */
1804 jnrB = jjnr[jidx+1];
1805 jnrC = jjnr[jidx+2];
1806 jnrD = jjnr[jidx+3];
1807 j_coord_offsetA = DIM*jnrA;
1808 j_coord_offsetB = DIM*jnrB;
1809 j_coord_offsetC = DIM*jnrC;
1810 j_coord_offsetD = DIM*jnrD;
1812 /* load j atom coordinates */
1813 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1814 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1815 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1817 /* Calculate displacement vector */
1818 dx11 = _mm256_sub_pd(ix1,jx1);
1819 dy11 = _mm256_sub_pd(iy1,jy1);
1820 dz11 = _mm256_sub_pd(iz1,jz1);
1821 dx12 = _mm256_sub_pd(ix1,jx2);
1822 dy12 = _mm256_sub_pd(iy1,jy2);
1823 dz12 = _mm256_sub_pd(iz1,jz2);
1824 dx13 = _mm256_sub_pd(ix1,jx3);
1825 dy13 = _mm256_sub_pd(iy1,jy3);
1826 dz13 = _mm256_sub_pd(iz1,jz3);
1827 dx21 = _mm256_sub_pd(ix2,jx1);
1828 dy21 = _mm256_sub_pd(iy2,jy1);
1829 dz21 = _mm256_sub_pd(iz2,jz1);
1830 dx22 = _mm256_sub_pd(ix2,jx2);
1831 dy22 = _mm256_sub_pd(iy2,jy2);
1832 dz22 = _mm256_sub_pd(iz2,jz2);
1833 dx23 = _mm256_sub_pd(ix2,jx3);
1834 dy23 = _mm256_sub_pd(iy2,jy3);
1835 dz23 = _mm256_sub_pd(iz2,jz3);
1836 dx31 = _mm256_sub_pd(ix3,jx1);
1837 dy31 = _mm256_sub_pd(iy3,jy1);
1838 dz31 = _mm256_sub_pd(iz3,jz1);
1839 dx32 = _mm256_sub_pd(ix3,jx2);
1840 dy32 = _mm256_sub_pd(iy3,jy2);
1841 dz32 = _mm256_sub_pd(iz3,jz2);
1842 dx33 = _mm256_sub_pd(ix3,jx3);
1843 dy33 = _mm256_sub_pd(iy3,jy3);
1844 dz33 = _mm256_sub_pd(iz3,jz3);
1846 /* Calculate squared distance and things based on it */
1847 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1848 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1849 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1850 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1851 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1852 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1853 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1854 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1855 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1857 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1858 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1859 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1860 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1861 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1862 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1863 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1864 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1865 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1867 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1868 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1869 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1870 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1871 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1872 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1873 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1874 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1875 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1877 fjx1 = _mm256_setzero_pd();
1878 fjy1 = _mm256_setzero_pd();
1879 fjz1 = _mm256_setzero_pd();
1880 fjx2 = _mm256_setzero_pd();
1881 fjy2 = _mm256_setzero_pd();
1882 fjz2 = _mm256_setzero_pd();
1883 fjx3 = _mm256_setzero_pd();
1884 fjy3 = _mm256_setzero_pd();
1885 fjz3 = _mm256_setzero_pd();
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1894 r11 = _mm256_mul_pd(rsq11,rinv11);
1896 /* EWALD ELECTROSTATICS */
1898 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1899 ewrt = _mm256_mul_pd(r11,ewtabscale);
1900 ewitab = _mm256_cvttpd_epi32(ewrt);
1901 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1902 ewitab = _mm_slli_epi32(ewitab,2);
1903 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1904 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1905 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1906 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1907 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1908 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1909 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1910 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1911 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1913 d = _mm256_sub_pd(r11,rswitch);
1914 d = _mm256_max_pd(d,_mm256_setzero_pd());
1915 d2 = _mm256_mul_pd(d,d);
1916 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1918 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1920 /* Evaluate switch function */
1921 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1922 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1923 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1927 fscal = _mm256_and_pd(fscal,cutoff_mask);
1929 /* Calculate temporary vectorial force */
1930 tx = _mm256_mul_pd(fscal,dx11);
1931 ty = _mm256_mul_pd(fscal,dy11);
1932 tz = _mm256_mul_pd(fscal,dz11);
1934 /* Update vectorial force */
1935 fix1 = _mm256_add_pd(fix1,tx);
1936 fiy1 = _mm256_add_pd(fiy1,ty);
1937 fiz1 = _mm256_add_pd(fiz1,tz);
1939 fjx1 = _mm256_add_pd(fjx1,tx);
1940 fjy1 = _mm256_add_pd(fjy1,ty);
1941 fjz1 = _mm256_add_pd(fjz1,tz);
1945 /**************************
1946 * CALCULATE INTERACTIONS *
1947 **************************/
1949 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1952 r12 = _mm256_mul_pd(rsq12,rinv12);
1954 /* EWALD ELECTROSTATICS */
1956 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1957 ewrt = _mm256_mul_pd(r12,ewtabscale);
1958 ewitab = _mm256_cvttpd_epi32(ewrt);
1959 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1960 ewitab = _mm_slli_epi32(ewitab,2);
1961 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1962 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1963 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1964 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1965 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1966 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1967 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1968 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1969 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1971 d = _mm256_sub_pd(r12,rswitch);
1972 d = _mm256_max_pd(d,_mm256_setzero_pd());
1973 d2 = _mm256_mul_pd(d,d);
1974 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1976 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1978 /* Evaluate switch function */
1979 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1980 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1981 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1985 fscal = _mm256_and_pd(fscal,cutoff_mask);
1987 /* Calculate temporary vectorial force */
1988 tx = _mm256_mul_pd(fscal,dx12);
1989 ty = _mm256_mul_pd(fscal,dy12);
1990 tz = _mm256_mul_pd(fscal,dz12);
1992 /* Update vectorial force */
1993 fix1 = _mm256_add_pd(fix1,tx);
1994 fiy1 = _mm256_add_pd(fiy1,ty);
1995 fiz1 = _mm256_add_pd(fiz1,tz);
1997 fjx2 = _mm256_add_pd(fjx2,tx);
1998 fjy2 = _mm256_add_pd(fjy2,ty);
1999 fjz2 = _mm256_add_pd(fjz2,tz);
2003 /**************************
2004 * CALCULATE INTERACTIONS *
2005 **************************/
2007 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2010 r13 = _mm256_mul_pd(rsq13,rinv13);
2012 /* EWALD ELECTROSTATICS */
2014 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2015 ewrt = _mm256_mul_pd(r13,ewtabscale);
2016 ewitab = _mm256_cvttpd_epi32(ewrt);
2017 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2018 ewitab = _mm_slli_epi32(ewitab,2);
2019 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2020 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2021 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2022 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2023 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2024 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2025 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2026 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2027 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2029 d = _mm256_sub_pd(r13,rswitch);
2030 d = _mm256_max_pd(d,_mm256_setzero_pd());
2031 d2 = _mm256_mul_pd(d,d);
2032 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2034 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2036 /* Evaluate switch function */
2037 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2038 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2039 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2043 fscal = _mm256_and_pd(fscal,cutoff_mask);
2045 /* Calculate temporary vectorial force */
2046 tx = _mm256_mul_pd(fscal,dx13);
2047 ty = _mm256_mul_pd(fscal,dy13);
2048 tz = _mm256_mul_pd(fscal,dz13);
2050 /* Update vectorial force */
2051 fix1 = _mm256_add_pd(fix1,tx);
2052 fiy1 = _mm256_add_pd(fiy1,ty);
2053 fiz1 = _mm256_add_pd(fiz1,tz);
2055 fjx3 = _mm256_add_pd(fjx3,tx);
2056 fjy3 = _mm256_add_pd(fjy3,ty);
2057 fjz3 = _mm256_add_pd(fjz3,tz);
2061 /**************************
2062 * CALCULATE INTERACTIONS *
2063 **************************/
2065 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2068 r21 = _mm256_mul_pd(rsq21,rinv21);
2070 /* EWALD ELECTROSTATICS */
2072 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2073 ewrt = _mm256_mul_pd(r21,ewtabscale);
2074 ewitab = _mm256_cvttpd_epi32(ewrt);
2075 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2076 ewitab = _mm_slli_epi32(ewitab,2);
2077 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2078 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2079 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2080 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2081 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2082 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2083 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2084 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2085 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2087 d = _mm256_sub_pd(r21,rswitch);
2088 d = _mm256_max_pd(d,_mm256_setzero_pd());
2089 d2 = _mm256_mul_pd(d,d);
2090 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2092 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2094 /* Evaluate switch function */
2095 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2096 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2097 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2101 fscal = _mm256_and_pd(fscal,cutoff_mask);
2103 /* Calculate temporary vectorial force */
2104 tx = _mm256_mul_pd(fscal,dx21);
2105 ty = _mm256_mul_pd(fscal,dy21);
2106 tz = _mm256_mul_pd(fscal,dz21);
2108 /* Update vectorial force */
2109 fix2 = _mm256_add_pd(fix2,tx);
2110 fiy2 = _mm256_add_pd(fiy2,ty);
2111 fiz2 = _mm256_add_pd(fiz2,tz);
2113 fjx1 = _mm256_add_pd(fjx1,tx);
2114 fjy1 = _mm256_add_pd(fjy1,ty);
2115 fjz1 = _mm256_add_pd(fjz1,tz);
2119 /**************************
2120 * CALCULATE INTERACTIONS *
2121 **************************/
2123 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2126 r22 = _mm256_mul_pd(rsq22,rinv22);
2128 /* EWALD ELECTROSTATICS */
2130 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2131 ewrt = _mm256_mul_pd(r22,ewtabscale);
2132 ewitab = _mm256_cvttpd_epi32(ewrt);
2133 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2134 ewitab = _mm_slli_epi32(ewitab,2);
2135 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2136 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2137 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2138 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2139 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2140 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2141 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2142 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2143 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2145 d = _mm256_sub_pd(r22,rswitch);
2146 d = _mm256_max_pd(d,_mm256_setzero_pd());
2147 d2 = _mm256_mul_pd(d,d);
2148 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2150 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2152 /* Evaluate switch function */
2153 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2154 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2155 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2159 fscal = _mm256_and_pd(fscal,cutoff_mask);
2161 /* Calculate temporary vectorial force */
2162 tx = _mm256_mul_pd(fscal,dx22);
2163 ty = _mm256_mul_pd(fscal,dy22);
2164 tz = _mm256_mul_pd(fscal,dz22);
2166 /* Update vectorial force */
2167 fix2 = _mm256_add_pd(fix2,tx);
2168 fiy2 = _mm256_add_pd(fiy2,ty);
2169 fiz2 = _mm256_add_pd(fiz2,tz);
2171 fjx2 = _mm256_add_pd(fjx2,tx);
2172 fjy2 = _mm256_add_pd(fjy2,ty);
2173 fjz2 = _mm256_add_pd(fjz2,tz);
2177 /**************************
2178 * CALCULATE INTERACTIONS *
2179 **************************/
2181 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2184 r23 = _mm256_mul_pd(rsq23,rinv23);
2186 /* EWALD ELECTROSTATICS */
2188 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2189 ewrt = _mm256_mul_pd(r23,ewtabscale);
2190 ewitab = _mm256_cvttpd_epi32(ewrt);
2191 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2192 ewitab = _mm_slli_epi32(ewitab,2);
2193 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2194 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2195 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2196 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2197 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2198 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2199 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2200 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2201 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2203 d = _mm256_sub_pd(r23,rswitch);
2204 d = _mm256_max_pd(d,_mm256_setzero_pd());
2205 d2 = _mm256_mul_pd(d,d);
2206 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2208 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2210 /* Evaluate switch function */
2211 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2212 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2213 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2217 fscal = _mm256_and_pd(fscal,cutoff_mask);
2219 /* Calculate temporary vectorial force */
2220 tx = _mm256_mul_pd(fscal,dx23);
2221 ty = _mm256_mul_pd(fscal,dy23);
2222 tz = _mm256_mul_pd(fscal,dz23);
2224 /* Update vectorial force */
2225 fix2 = _mm256_add_pd(fix2,tx);
2226 fiy2 = _mm256_add_pd(fiy2,ty);
2227 fiz2 = _mm256_add_pd(fiz2,tz);
2229 fjx3 = _mm256_add_pd(fjx3,tx);
2230 fjy3 = _mm256_add_pd(fjy3,ty);
2231 fjz3 = _mm256_add_pd(fjz3,tz);
2235 /**************************
2236 * CALCULATE INTERACTIONS *
2237 **************************/
2239 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2242 r31 = _mm256_mul_pd(rsq31,rinv31);
2244 /* EWALD ELECTROSTATICS */
2246 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2247 ewrt = _mm256_mul_pd(r31,ewtabscale);
2248 ewitab = _mm256_cvttpd_epi32(ewrt);
2249 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2250 ewitab = _mm_slli_epi32(ewitab,2);
2251 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2252 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2253 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2254 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2255 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2256 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2257 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2258 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2259 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2261 d = _mm256_sub_pd(r31,rswitch);
2262 d = _mm256_max_pd(d,_mm256_setzero_pd());
2263 d2 = _mm256_mul_pd(d,d);
2264 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2266 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2268 /* Evaluate switch function */
2269 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2270 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2271 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2275 fscal = _mm256_and_pd(fscal,cutoff_mask);
2277 /* Calculate temporary vectorial force */
2278 tx = _mm256_mul_pd(fscal,dx31);
2279 ty = _mm256_mul_pd(fscal,dy31);
2280 tz = _mm256_mul_pd(fscal,dz31);
2282 /* Update vectorial force */
2283 fix3 = _mm256_add_pd(fix3,tx);
2284 fiy3 = _mm256_add_pd(fiy3,ty);
2285 fiz3 = _mm256_add_pd(fiz3,tz);
2287 fjx1 = _mm256_add_pd(fjx1,tx);
2288 fjy1 = _mm256_add_pd(fjy1,ty);
2289 fjz1 = _mm256_add_pd(fjz1,tz);
2293 /**************************
2294 * CALCULATE INTERACTIONS *
2295 **************************/
2297 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2300 r32 = _mm256_mul_pd(rsq32,rinv32);
2302 /* EWALD ELECTROSTATICS */
2304 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2305 ewrt = _mm256_mul_pd(r32,ewtabscale);
2306 ewitab = _mm256_cvttpd_epi32(ewrt);
2307 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2308 ewitab = _mm_slli_epi32(ewitab,2);
2309 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2310 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2311 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2312 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2313 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2314 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2315 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2316 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2317 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2319 d = _mm256_sub_pd(r32,rswitch);
2320 d = _mm256_max_pd(d,_mm256_setzero_pd());
2321 d2 = _mm256_mul_pd(d,d);
2322 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2324 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2326 /* Evaluate switch function */
2327 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2328 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2329 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2333 fscal = _mm256_and_pd(fscal,cutoff_mask);
2335 /* Calculate temporary vectorial force */
2336 tx = _mm256_mul_pd(fscal,dx32);
2337 ty = _mm256_mul_pd(fscal,dy32);
2338 tz = _mm256_mul_pd(fscal,dz32);
2340 /* Update vectorial force */
2341 fix3 = _mm256_add_pd(fix3,tx);
2342 fiy3 = _mm256_add_pd(fiy3,ty);
2343 fiz3 = _mm256_add_pd(fiz3,tz);
2345 fjx2 = _mm256_add_pd(fjx2,tx);
2346 fjy2 = _mm256_add_pd(fjy2,ty);
2347 fjz2 = _mm256_add_pd(fjz2,tz);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2358 r33 = _mm256_mul_pd(rsq33,rinv33);
2360 /* EWALD ELECTROSTATICS */
2362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2363 ewrt = _mm256_mul_pd(r33,ewtabscale);
2364 ewitab = _mm256_cvttpd_epi32(ewrt);
2365 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2366 ewitab = _mm_slli_epi32(ewitab,2);
2367 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2368 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2369 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2370 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2371 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2372 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2373 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2374 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
2375 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2377 d = _mm256_sub_pd(r33,rswitch);
2378 d = _mm256_max_pd(d,_mm256_setzero_pd());
2379 d2 = _mm256_mul_pd(d,d);
2380 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2382 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2384 /* Evaluate switch function */
2385 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2386 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
2387 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2391 fscal = _mm256_and_pd(fscal,cutoff_mask);
2393 /* Calculate temporary vectorial force */
2394 tx = _mm256_mul_pd(fscal,dx33);
2395 ty = _mm256_mul_pd(fscal,dy33);
2396 tz = _mm256_mul_pd(fscal,dz33);
2398 /* Update vectorial force */
2399 fix3 = _mm256_add_pd(fix3,tx);
2400 fiy3 = _mm256_add_pd(fiy3,ty);
2401 fiz3 = _mm256_add_pd(fiz3,tz);
2403 fjx3 = _mm256_add_pd(fjx3,tx);
2404 fjy3 = _mm256_add_pd(fjy3,ty);
2405 fjz3 = _mm256_add_pd(fjz3,tz);
2409 fjptrA = f+j_coord_offsetA;
2410 fjptrB = f+j_coord_offsetB;
2411 fjptrC = f+j_coord_offsetC;
2412 fjptrD = f+j_coord_offsetD;
2414 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2415 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2417 /* Inner loop uses 558 flops */
2420 if(jidx<j_index_end)
2423 /* Get j neighbor index, and coordinate index */
2424 jnrlistA = jjnr[jidx];
2425 jnrlistB = jjnr[jidx+1];
2426 jnrlistC = jjnr[jidx+2];
2427 jnrlistD = jjnr[jidx+3];
2428 /* Sign of each element will be negative for non-real atoms.
2429 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2430 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2432 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2434 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2435 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2436 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2438 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2439 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2440 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2441 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2442 j_coord_offsetA = DIM*jnrA;
2443 j_coord_offsetB = DIM*jnrB;
2444 j_coord_offsetC = DIM*jnrC;
2445 j_coord_offsetD = DIM*jnrD;
2447 /* load j atom coordinates */
2448 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2449 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2450 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2452 /* Calculate displacement vector */
2453 dx11 = _mm256_sub_pd(ix1,jx1);
2454 dy11 = _mm256_sub_pd(iy1,jy1);
2455 dz11 = _mm256_sub_pd(iz1,jz1);
2456 dx12 = _mm256_sub_pd(ix1,jx2);
2457 dy12 = _mm256_sub_pd(iy1,jy2);
2458 dz12 = _mm256_sub_pd(iz1,jz2);
2459 dx13 = _mm256_sub_pd(ix1,jx3);
2460 dy13 = _mm256_sub_pd(iy1,jy3);
2461 dz13 = _mm256_sub_pd(iz1,jz3);
2462 dx21 = _mm256_sub_pd(ix2,jx1);
2463 dy21 = _mm256_sub_pd(iy2,jy1);
2464 dz21 = _mm256_sub_pd(iz2,jz1);
2465 dx22 = _mm256_sub_pd(ix2,jx2);
2466 dy22 = _mm256_sub_pd(iy2,jy2);
2467 dz22 = _mm256_sub_pd(iz2,jz2);
2468 dx23 = _mm256_sub_pd(ix2,jx3);
2469 dy23 = _mm256_sub_pd(iy2,jy3);
2470 dz23 = _mm256_sub_pd(iz2,jz3);
2471 dx31 = _mm256_sub_pd(ix3,jx1);
2472 dy31 = _mm256_sub_pd(iy3,jy1);
2473 dz31 = _mm256_sub_pd(iz3,jz1);
2474 dx32 = _mm256_sub_pd(ix3,jx2);
2475 dy32 = _mm256_sub_pd(iy3,jy2);
2476 dz32 = _mm256_sub_pd(iz3,jz2);
2477 dx33 = _mm256_sub_pd(ix3,jx3);
2478 dy33 = _mm256_sub_pd(iy3,jy3);
2479 dz33 = _mm256_sub_pd(iz3,jz3);
2481 /* Calculate squared distance and things based on it */
2482 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2483 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2484 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2485 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2486 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2487 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2488 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2489 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2490 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2492 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2493 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2494 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2495 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2496 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2497 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2498 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2499 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2500 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2502 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2503 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2504 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2505 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2506 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2507 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2508 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2509 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2510 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2512 fjx1 = _mm256_setzero_pd();
2513 fjy1 = _mm256_setzero_pd();
2514 fjz1 = _mm256_setzero_pd();
2515 fjx2 = _mm256_setzero_pd();
2516 fjy2 = _mm256_setzero_pd();
2517 fjz2 = _mm256_setzero_pd();
2518 fjx3 = _mm256_setzero_pd();
2519 fjy3 = _mm256_setzero_pd();
2520 fjz3 = _mm256_setzero_pd();
2522 /**************************
2523 * CALCULATE INTERACTIONS *
2524 **************************/
2526 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2529 r11 = _mm256_mul_pd(rsq11,rinv11);
2530 r11 = _mm256_andnot_pd(dummy_mask,r11);
2532 /* EWALD ELECTROSTATICS */
2534 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2535 ewrt = _mm256_mul_pd(r11,ewtabscale);
2536 ewitab = _mm256_cvttpd_epi32(ewrt);
2537 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2538 ewitab = _mm_slli_epi32(ewitab,2);
2539 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2540 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2541 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2542 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2543 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2544 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2545 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2546 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2547 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2549 d = _mm256_sub_pd(r11,rswitch);
2550 d = _mm256_max_pd(d,_mm256_setzero_pd());
2551 d2 = _mm256_mul_pd(d,d);
2552 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2554 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2556 /* Evaluate switch function */
2557 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2558 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2559 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2563 fscal = _mm256_and_pd(fscal,cutoff_mask);
2565 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2567 /* Calculate temporary vectorial force */
2568 tx = _mm256_mul_pd(fscal,dx11);
2569 ty = _mm256_mul_pd(fscal,dy11);
2570 tz = _mm256_mul_pd(fscal,dz11);
2572 /* Update vectorial force */
2573 fix1 = _mm256_add_pd(fix1,tx);
2574 fiy1 = _mm256_add_pd(fiy1,ty);
2575 fiz1 = _mm256_add_pd(fiz1,tz);
2577 fjx1 = _mm256_add_pd(fjx1,tx);
2578 fjy1 = _mm256_add_pd(fjy1,ty);
2579 fjz1 = _mm256_add_pd(fjz1,tz);
2583 /**************************
2584 * CALCULATE INTERACTIONS *
2585 **************************/
2587 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2590 r12 = _mm256_mul_pd(rsq12,rinv12);
2591 r12 = _mm256_andnot_pd(dummy_mask,r12);
2593 /* EWALD ELECTROSTATICS */
2595 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2596 ewrt = _mm256_mul_pd(r12,ewtabscale);
2597 ewitab = _mm256_cvttpd_epi32(ewrt);
2598 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2599 ewitab = _mm_slli_epi32(ewitab,2);
2600 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2601 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2602 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2603 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2604 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2605 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2606 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2607 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2608 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2610 d = _mm256_sub_pd(r12,rswitch);
2611 d = _mm256_max_pd(d,_mm256_setzero_pd());
2612 d2 = _mm256_mul_pd(d,d);
2613 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2615 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2617 /* Evaluate switch function */
2618 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2619 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2620 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2624 fscal = _mm256_and_pd(fscal,cutoff_mask);
2626 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2628 /* Calculate temporary vectorial force */
2629 tx = _mm256_mul_pd(fscal,dx12);
2630 ty = _mm256_mul_pd(fscal,dy12);
2631 tz = _mm256_mul_pd(fscal,dz12);
2633 /* Update vectorial force */
2634 fix1 = _mm256_add_pd(fix1,tx);
2635 fiy1 = _mm256_add_pd(fiy1,ty);
2636 fiz1 = _mm256_add_pd(fiz1,tz);
2638 fjx2 = _mm256_add_pd(fjx2,tx);
2639 fjy2 = _mm256_add_pd(fjy2,ty);
2640 fjz2 = _mm256_add_pd(fjz2,tz);
2644 /**************************
2645 * CALCULATE INTERACTIONS *
2646 **************************/
2648 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2651 r13 = _mm256_mul_pd(rsq13,rinv13);
2652 r13 = _mm256_andnot_pd(dummy_mask,r13);
2654 /* EWALD ELECTROSTATICS */
2656 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2657 ewrt = _mm256_mul_pd(r13,ewtabscale);
2658 ewitab = _mm256_cvttpd_epi32(ewrt);
2659 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2660 ewitab = _mm_slli_epi32(ewitab,2);
2661 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2662 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2663 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2664 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2665 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2666 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2667 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2668 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
2669 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2671 d = _mm256_sub_pd(r13,rswitch);
2672 d = _mm256_max_pd(d,_mm256_setzero_pd());
2673 d2 = _mm256_mul_pd(d,d);
2674 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2676 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2678 /* Evaluate switch function */
2679 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2680 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv13,_mm256_mul_pd(velec,dsw)) );
2681 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2685 fscal = _mm256_and_pd(fscal,cutoff_mask);
2687 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2689 /* Calculate temporary vectorial force */
2690 tx = _mm256_mul_pd(fscal,dx13);
2691 ty = _mm256_mul_pd(fscal,dy13);
2692 tz = _mm256_mul_pd(fscal,dz13);
2694 /* Update vectorial force */
2695 fix1 = _mm256_add_pd(fix1,tx);
2696 fiy1 = _mm256_add_pd(fiy1,ty);
2697 fiz1 = _mm256_add_pd(fiz1,tz);
2699 fjx3 = _mm256_add_pd(fjx3,tx);
2700 fjy3 = _mm256_add_pd(fjy3,ty);
2701 fjz3 = _mm256_add_pd(fjz3,tz);
2705 /**************************
2706 * CALCULATE INTERACTIONS *
2707 **************************/
2709 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2712 r21 = _mm256_mul_pd(rsq21,rinv21);
2713 r21 = _mm256_andnot_pd(dummy_mask,r21);
2715 /* EWALD ELECTROSTATICS */
2717 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2718 ewrt = _mm256_mul_pd(r21,ewtabscale);
2719 ewitab = _mm256_cvttpd_epi32(ewrt);
2720 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2721 ewitab = _mm_slli_epi32(ewitab,2);
2722 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2723 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2724 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2725 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2726 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2727 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2728 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2729 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2730 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2732 d = _mm256_sub_pd(r21,rswitch);
2733 d = _mm256_max_pd(d,_mm256_setzero_pd());
2734 d2 = _mm256_mul_pd(d,d);
2735 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2737 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2739 /* Evaluate switch function */
2740 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2741 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2742 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2746 fscal = _mm256_and_pd(fscal,cutoff_mask);
2748 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2750 /* Calculate temporary vectorial force */
2751 tx = _mm256_mul_pd(fscal,dx21);
2752 ty = _mm256_mul_pd(fscal,dy21);
2753 tz = _mm256_mul_pd(fscal,dz21);
2755 /* Update vectorial force */
2756 fix2 = _mm256_add_pd(fix2,tx);
2757 fiy2 = _mm256_add_pd(fiy2,ty);
2758 fiz2 = _mm256_add_pd(fiz2,tz);
2760 fjx1 = _mm256_add_pd(fjx1,tx);
2761 fjy1 = _mm256_add_pd(fjy1,ty);
2762 fjz1 = _mm256_add_pd(fjz1,tz);
2766 /**************************
2767 * CALCULATE INTERACTIONS *
2768 **************************/
2770 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2773 r22 = _mm256_mul_pd(rsq22,rinv22);
2774 r22 = _mm256_andnot_pd(dummy_mask,r22);
2776 /* EWALD ELECTROSTATICS */
2778 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2779 ewrt = _mm256_mul_pd(r22,ewtabscale);
2780 ewitab = _mm256_cvttpd_epi32(ewrt);
2781 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2782 ewitab = _mm_slli_epi32(ewitab,2);
2783 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2784 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2785 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2786 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2787 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2788 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2789 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2790 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2791 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2793 d = _mm256_sub_pd(r22,rswitch);
2794 d = _mm256_max_pd(d,_mm256_setzero_pd());
2795 d2 = _mm256_mul_pd(d,d);
2796 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2798 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2800 /* Evaluate switch function */
2801 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2802 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2803 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2807 fscal = _mm256_and_pd(fscal,cutoff_mask);
2809 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2811 /* Calculate temporary vectorial force */
2812 tx = _mm256_mul_pd(fscal,dx22);
2813 ty = _mm256_mul_pd(fscal,dy22);
2814 tz = _mm256_mul_pd(fscal,dz22);
2816 /* Update vectorial force */
2817 fix2 = _mm256_add_pd(fix2,tx);
2818 fiy2 = _mm256_add_pd(fiy2,ty);
2819 fiz2 = _mm256_add_pd(fiz2,tz);
2821 fjx2 = _mm256_add_pd(fjx2,tx);
2822 fjy2 = _mm256_add_pd(fjy2,ty);
2823 fjz2 = _mm256_add_pd(fjz2,tz);
2827 /**************************
2828 * CALCULATE INTERACTIONS *
2829 **************************/
2831 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2834 r23 = _mm256_mul_pd(rsq23,rinv23);
2835 r23 = _mm256_andnot_pd(dummy_mask,r23);
2837 /* EWALD ELECTROSTATICS */
2839 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2840 ewrt = _mm256_mul_pd(r23,ewtabscale);
2841 ewitab = _mm256_cvttpd_epi32(ewrt);
2842 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2843 ewitab = _mm_slli_epi32(ewitab,2);
2844 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2845 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2846 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2847 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2848 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2849 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2850 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2851 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
2852 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2854 d = _mm256_sub_pd(r23,rswitch);
2855 d = _mm256_max_pd(d,_mm256_setzero_pd());
2856 d2 = _mm256_mul_pd(d,d);
2857 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2859 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2861 /* Evaluate switch function */
2862 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2863 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv23,_mm256_mul_pd(velec,dsw)) );
2864 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2868 fscal = _mm256_and_pd(fscal,cutoff_mask);
2870 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2872 /* Calculate temporary vectorial force */
2873 tx = _mm256_mul_pd(fscal,dx23);
2874 ty = _mm256_mul_pd(fscal,dy23);
2875 tz = _mm256_mul_pd(fscal,dz23);
2877 /* Update vectorial force */
2878 fix2 = _mm256_add_pd(fix2,tx);
2879 fiy2 = _mm256_add_pd(fiy2,ty);
2880 fiz2 = _mm256_add_pd(fiz2,tz);
2882 fjx3 = _mm256_add_pd(fjx3,tx);
2883 fjy3 = _mm256_add_pd(fjy3,ty);
2884 fjz3 = _mm256_add_pd(fjz3,tz);
2888 /**************************
2889 * CALCULATE INTERACTIONS *
2890 **************************/
2892 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2895 r31 = _mm256_mul_pd(rsq31,rinv31);
2896 r31 = _mm256_andnot_pd(dummy_mask,r31);
2898 /* EWALD ELECTROSTATICS */
2900 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2901 ewrt = _mm256_mul_pd(r31,ewtabscale);
2902 ewitab = _mm256_cvttpd_epi32(ewrt);
2903 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2904 ewitab = _mm_slli_epi32(ewitab,2);
2905 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2906 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2907 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2908 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2909 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2910 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2911 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2912 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
2913 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2915 d = _mm256_sub_pd(r31,rswitch);
2916 d = _mm256_max_pd(d,_mm256_setzero_pd());
2917 d2 = _mm256_mul_pd(d,d);
2918 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2920 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2922 /* Evaluate switch function */
2923 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2924 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv31,_mm256_mul_pd(velec,dsw)) );
2925 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2929 fscal = _mm256_and_pd(fscal,cutoff_mask);
2931 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2933 /* Calculate temporary vectorial force */
2934 tx = _mm256_mul_pd(fscal,dx31);
2935 ty = _mm256_mul_pd(fscal,dy31);
2936 tz = _mm256_mul_pd(fscal,dz31);
2938 /* Update vectorial force */
2939 fix3 = _mm256_add_pd(fix3,tx);
2940 fiy3 = _mm256_add_pd(fiy3,ty);
2941 fiz3 = _mm256_add_pd(fiz3,tz);
2943 fjx1 = _mm256_add_pd(fjx1,tx);
2944 fjy1 = _mm256_add_pd(fjy1,ty);
2945 fjz1 = _mm256_add_pd(fjz1,tz);
2949 /**************************
2950 * CALCULATE INTERACTIONS *
2951 **************************/
2953 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2956 r32 = _mm256_mul_pd(rsq32,rinv32);
2957 r32 = _mm256_andnot_pd(dummy_mask,r32);
2959 /* EWALD ELECTROSTATICS */
2961 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2962 ewrt = _mm256_mul_pd(r32,ewtabscale);
2963 ewitab = _mm256_cvttpd_epi32(ewrt);
2964 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2965 ewitab = _mm_slli_epi32(ewitab,2);
2966 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2967 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2968 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2969 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2970 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2971 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2972 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2973 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
2974 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2976 d = _mm256_sub_pd(r32,rswitch);
2977 d = _mm256_max_pd(d,_mm256_setzero_pd());
2978 d2 = _mm256_mul_pd(d,d);
2979 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2981 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2983 /* Evaluate switch function */
2984 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2985 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv32,_mm256_mul_pd(velec,dsw)) );
2986 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2990 fscal = _mm256_and_pd(fscal,cutoff_mask);
2992 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2994 /* Calculate temporary vectorial force */
2995 tx = _mm256_mul_pd(fscal,dx32);
2996 ty = _mm256_mul_pd(fscal,dy32);
2997 tz = _mm256_mul_pd(fscal,dz32);
2999 /* Update vectorial force */
3000 fix3 = _mm256_add_pd(fix3,tx);
3001 fiy3 = _mm256_add_pd(fiy3,ty);
3002 fiz3 = _mm256_add_pd(fiz3,tz);
3004 fjx2 = _mm256_add_pd(fjx2,tx);
3005 fjy2 = _mm256_add_pd(fjy2,ty);
3006 fjz2 = _mm256_add_pd(fjz2,tz);
3010 /**************************
3011 * CALCULATE INTERACTIONS *
3012 **************************/
3014 if (gmx_mm256_any_lt(rsq33,rcutoff2))
3017 r33 = _mm256_mul_pd(rsq33,rinv33);
3018 r33 = _mm256_andnot_pd(dummy_mask,r33);
3020 /* EWALD ELECTROSTATICS */
3022 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3023 ewrt = _mm256_mul_pd(r33,ewtabscale);
3024 ewitab = _mm256_cvttpd_epi32(ewrt);
3025 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3026 ewitab = _mm_slli_epi32(ewitab,2);
3027 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3028 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3029 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3030 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3031 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3032 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3033 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3034 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
3035 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
3037 d = _mm256_sub_pd(r33,rswitch);
3038 d = _mm256_max_pd(d,_mm256_setzero_pd());
3039 d2 = _mm256_mul_pd(d,d);
3040 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3042 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3044 /* Evaluate switch function */
3045 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3046 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv33,_mm256_mul_pd(velec,dsw)) );
3047 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
3051 fscal = _mm256_and_pd(fscal,cutoff_mask);
3053 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3055 /* Calculate temporary vectorial force */
3056 tx = _mm256_mul_pd(fscal,dx33);
3057 ty = _mm256_mul_pd(fscal,dy33);
3058 tz = _mm256_mul_pd(fscal,dz33);
3060 /* Update vectorial force */
3061 fix3 = _mm256_add_pd(fix3,tx);
3062 fiy3 = _mm256_add_pd(fiy3,ty);
3063 fiz3 = _mm256_add_pd(fiz3,tz);
3065 fjx3 = _mm256_add_pd(fjx3,tx);
3066 fjy3 = _mm256_add_pd(fjy3,ty);
3067 fjz3 = _mm256_add_pd(fjz3,tz);
3071 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3072 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3073 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3074 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3076 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3077 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3079 /* Inner loop uses 567 flops */
3082 /* End of innermost loop */
3084 gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3085 f+i_coord_offset+DIM,fshift+i_shift_offset);
3087 /* Increment number of inner iterations */
3088 inneriter += j_index_end - j_index_start;
3090 /* Outer loop uses 18 flops */
3093 /* Increment number of outer iterations */
3096 /* Update outer/inner flops */
3098 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567);