2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
98 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
100 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
101 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
103 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
104 real rswitch_scalar,d_scalar;
105 __m256d dummy_mask,cutoff_mask;
106 __m128 tmpmask0,tmpmask1;
107 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
108 __m256d one = _mm256_set1_pd(1.0);
109 __m256d two = _mm256_set1_pd(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm256_set1_pd(fr->epsfac);
122 charge = mdatoms->chargeA;
123 nvdwtype = fr->ntype;
125 vdwtype = mdatoms->typeA;
127 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
128 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
129 beta2 = _mm256_mul_pd(beta,beta);
130 beta3 = _mm256_mul_pd(beta,beta2);
132 ewtab = fr->ic->tabq_coul_FDV0;
133 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
134 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
136 /* Setup water-specific parameters */
137 inr = nlist->iinr[0];
138 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
139 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
140 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
141 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
143 jq0 = _mm256_set1_pd(charge[inr+0]);
144 jq1 = _mm256_set1_pd(charge[inr+1]);
145 jq2 = _mm256_set1_pd(charge[inr+2]);
146 vdwjidx0A = 2*vdwtype[inr+0];
147 qq00 = _mm256_mul_pd(iq0,jq0);
148 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
149 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
150 qq01 = _mm256_mul_pd(iq0,jq1);
151 qq02 = _mm256_mul_pd(iq0,jq2);
152 qq10 = _mm256_mul_pd(iq1,jq0);
153 qq11 = _mm256_mul_pd(iq1,jq1);
154 qq12 = _mm256_mul_pd(iq1,jq2);
155 qq20 = _mm256_mul_pd(iq2,jq0);
156 qq21 = _mm256_mul_pd(iq2,jq1);
157 qq22 = _mm256_mul_pd(iq2,jq2);
159 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
160 rcutoff_scalar = fr->rcoulomb;
161 rcutoff = _mm256_set1_pd(rcutoff_scalar);
162 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
164 rswitch_scalar = fr->rcoulomb_switch;
165 rswitch = _mm256_set1_pd(rswitch_scalar);
166 /* Setup switch parameters */
167 d_scalar = rcutoff_scalar-rswitch_scalar;
168 d = _mm256_set1_pd(d_scalar);
169 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
170 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
171 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
172 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
173 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
176 /* Avoid stupid compiler warnings */
177 jnrA = jnrB = jnrC = jnrD = 0;
186 for(iidx=0;iidx<4*DIM;iidx++)
191 /* Start outer loop over neighborlists */
192 for(iidx=0; iidx<nri; iidx++)
194 /* Load shift vector for this list */
195 i_shift_offset = DIM*shiftidx[iidx];
197 /* Load limits for loop over neighbors */
198 j_index_start = jindex[iidx];
199 j_index_end = jindex[iidx+1];
201 /* Get outer coordinate index */
203 i_coord_offset = DIM*inr;
205 /* Load i particle coords and add shift vector */
206 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
207 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
209 fix0 = _mm256_setzero_pd();
210 fiy0 = _mm256_setzero_pd();
211 fiz0 = _mm256_setzero_pd();
212 fix1 = _mm256_setzero_pd();
213 fiy1 = _mm256_setzero_pd();
214 fiz1 = _mm256_setzero_pd();
215 fix2 = _mm256_setzero_pd();
216 fiy2 = _mm256_setzero_pd();
217 fiz2 = _mm256_setzero_pd();
219 /* Reset potential sums */
220 velecsum = _mm256_setzero_pd();
221 vvdwsum = _mm256_setzero_pd();
223 /* Start inner kernel loop */
224 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
227 /* Get j neighbor index, and coordinate index */
232 j_coord_offsetA = DIM*jnrA;
233 j_coord_offsetB = DIM*jnrB;
234 j_coord_offsetC = DIM*jnrC;
235 j_coord_offsetD = DIM*jnrD;
237 /* load j atom coordinates */
238 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
239 x+j_coord_offsetC,x+j_coord_offsetD,
240 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
242 /* Calculate displacement vector */
243 dx00 = _mm256_sub_pd(ix0,jx0);
244 dy00 = _mm256_sub_pd(iy0,jy0);
245 dz00 = _mm256_sub_pd(iz0,jz0);
246 dx01 = _mm256_sub_pd(ix0,jx1);
247 dy01 = _mm256_sub_pd(iy0,jy1);
248 dz01 = _mm256_sub_pd(iz0,jz1);
249 dx02 = _mm256_sub_pd(ix0,jx2);
250 dy02 = _mm256_sub_pd(iy0,jy2);
251 dz02 = _mm256_sub_pd(iz0,jz2);
252 dx10 = _mm256_sub_pd(ix1,jx0);
253 dy10 = _mm256_sub_pd(iy1,jy0);
254 dz10 = _mm256_sub_pd(iz1,jz0);
255 dx11 = _mm256_sub_pd(ix1,jx1);
256 dy11 = _mm256_sub_pd(iy1,jy1);
257 dz11 = _mm256_sub_pd(iz1,jz1);
258 dx12 = _mm256_sub_pd(ix1,jx2);
259 dy12 = _mm256_sub_pd(iy1,jy2);
260 dz12 = _mm256_sub_pd(iz1,jz2);
261 dx20 = _mm256_sub_pd(ix2,jx0);
262 dy20 = _mm256_sub_pd(iy2,jy0);
263 dz20 = _mm256_sub_pd(iz2,jz0);
264 dx21 = _mm256_sub_pd(ix2,jx1);
265 dy21 = _mm256_sub_pd(iy2,jy1);
266 dz21 = _mm256_sub_pd(iz2,jz1);
267 dx22 = _mm256_sub_pd(ix2,jx2);
268 dy22 = _mm256_sub_pd(iy2,jy2);
269 dz22 = _mm256_sub_pd(iz2,jz2);
271 /* Calculate squared distance and things based on it */
272 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
273 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
274 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
275 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
276 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
277 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
278 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
279 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
280 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
282 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
283 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
284 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
285 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
286 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
287 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
288 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
289 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
290 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
292 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
293 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
294 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
295 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
296 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
297 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
298 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
299 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
300 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
302 fjx0 = _mm256_setzero_pd();
303 fjy0 = _mm256_setzero_pd();
304 fjz0 = _mm256_setzero_pd();
305 fjx1 = _mm256_setzero_pd();
306 fjy1 = _mm256_setzero_pd();
307 fjz1 = _mm256_setzero_pd();
308 fjx2 = _mm256_setzero_pd();
309 fjy2 = _mm256_setzero_pd();
310 fjz2 = _mm256_setzero_pd();
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm256_any_lt(rsq00,rcutoff2))
319 r00 = _mm256_mul_pd(rsq00,rinv00);
321 /* EWALD ELECTROSTATICS */
323 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
324 ewrt = _mm256_mul_pd(r00,ewtabscale);
325 ewitab = _mm256_cvttpd_epi32(ewrt);
326 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
327 ewitab = _mm_slli_epi32(ewitab,2);
328 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
329 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
330 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
331 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
332 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
333 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
334 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
335 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
336 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
338 /* LENNARD-JONES DISPERSION/REPULSION */
340 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
341 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
342 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
343 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
344 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
346 d = _mm256_sub_pd(r00,rswitch);
347 d = _mm256_max_pd(d,_mm256_setzero_pd());
348 d2 = _mm256_mul_pd(d,d);
349 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
351 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
353 /* Evaluate switch function */
354 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
355 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
356 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
357 velec = _mm256_mul_pd(velec,sw);
358 vvdw = _mm256_mul_pd(vvdw,sw);
359 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
361 /* Update potential sum for this i atom from the interaction with this j atom. */
362 velec = _mm256_and_pd(velec,cutoff_mask);
363 velecsum = _mm256_add_pd(velecsum,velec);
364 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
365 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
367 fscal = _mm256_add_pd(felec,fvdw);
369 fscal = _mm256_and_pd(fscal,cutoff_mask);
371 /* Calculate temporary vectorial force */
372 tx = _mm256_mul_pd(fscal,dx00);
373 ty = _mm256_mul_pd(fscal,dy00);
374 tz = _mm256_mul_pd(fscal,dz00);
376 /* Update vectorial force */
377 fix0 = _mm256_add_pd(fix0,tx);
378 fiy0 = _mm256_add_pd(fiy0,ty);
379 fiz0 = _mm256_add_pd(fiz0,tz);
381 fjx0 = _mm256_add_pd(fjx0,tx);
382 fjy0 = _mm256_add_pd(fjy0,ty);
383 fjz0 = _mm256_add_pd(fjz0,tz);
387 /**************************
388 * CALCULATE INTERACTIONS *
389 **************************/
391 if (gmx_mm256_any_lt(rsq01,rcutoff2))
394 r01 = _mm256_mul_pd(rsq01,rinv01);
396 /* EWALD ELECTROSTATICS */
398 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
399 ewrt = _mm256_mul_pd(r01,ewtabscale);
400 ewitab = _mm256_cvttpd_epi32(ewrt);
401 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
402 ewitab = _mm_slli_epi32(ewitab,2);
403 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
404 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
405 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
406 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
407 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
408 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
409 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
410 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
411 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
413 d = _mm256_sub_pd(r01,rswitch);
414 d = _mm256_max_pd(d,_mm256_setzero_pd());
415 d2 = _mm256_mul_pd(d,d);
416 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
418 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
420 /* Evaluate switch function */
421 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
422 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
423 velec = _mm256_mul_pd(velec,sw);
424 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velec = _mm256_and_pd(velec,cutoff_mask);
428 velecsum = _mm256_add_pd(velecsum,velec);
432 fscal = _mm256_and_pd(fscal,cutoff_mask);
434 /* Calculate temporary vectorial force */
435 tx = _mm256_mul_pd(fscal,dx01);
436 ty = _mm256_mul_pd(fscal,dy01);
437 tz = _mm256_mul_pd(fscal,dz01);
439 /* Update vectorial force */
440 fix0 = _mm256_add_pd(fix0,tx);
441 fiy0 = _mm256_add_pd(fiy0,ty);
442 fiz0 = _mm256_add_pd(fiz0,tz);
444 fjx1 = _mm256_add_pd(fjx1,tx);
445 fjy1 = _mm256_add_pd(fjy1,ty);
446 fjz1 = _mm256_add_pd(fjz1,tz);
450 /**************************
451 * CALCULATE INTERACTIONS *
452 **************************/
454 if (gmx_mm256_any_lt(rsq02,rcutoff2))
457 r02 = _mm256_mul_pd(rsq02,rinv02);
459 /* EWALD ELECTROSTATICS */
461 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
462 ewrt = _mm256_mul_pd(r02,ewtabscale);
463 ewitab = _mm256_cvttpd_epi32(ewrt);
464 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
465 ewitab = _mm_slli_epi32(ewitab,2);
466 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
467 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
468 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
469 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
470 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
471 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
472 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
473 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
474 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
476 d = _mm256_sub_pd(r02,rswitch);
477 d = _mm256_max_pd(d,_mm256_setzero_pd());
478 d2 = _mm256_mul_pd(d,d);
479 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
481 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
483 /* Evaluate switch function */
484 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
485 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
486 velec = _mm256_mul_pd(velec,sw);
487 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
489 /* Update potential sum for this i atom from the interaction with this j atom. */
490 velec = _mm256_and_pd(velec,cutoff_mask);
491 velecsum = _mm256_add_pd(velecsum,velec);
495 fscal = _mm256_and_pd(fscal,cutoff_mask);
497 /* Calculate temporary vectorial force */
498 tx = _mm256_mul_pd(fscal,dx02);
499 ty = _mm256_mul_pd(fscal,dy02);
500 tz = _mm256_mul_pd(fscal,dz02);
502 /* Update vectorial force */
503 fix0 = _mm256_add_pd(fix0,tx);
504 fiy0 = _mm256_add_pd(fiy0,ty);
505 fiz0 = _mm256_add_pd(fiz0,tz);
507 fjx2 = _mm256_add_pd(fjx2,tx);
508 fjy2 = _mm256_add_pd(fjy2,ty);
509 fjz2 = _mm256_add_pd(fjz2,tz);
513 /**************************
514 * CALCULATE INTERACTIONS *
515 **************************/
517 if (gmx_mm256_any_lt(rsq10,rcutoff2))
520 r10 = _mm256_mul_pd(rsq10,rinv10);
522 /* EWALD ELECTROSTATICS */
524 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
525 ewrt = _mm256_mul_pd(r10,ewtabscale);
526 ewitab = _mm256_cvttpd_epi32(ewrt);
527 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
528 ewitab = _mm_slli_epi32(ewitab,2);
529 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
530 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
531 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
532 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
533 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
534 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
535 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
536 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
537 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
539 d = _mm256_sub_pd(r10,rswitch);
540 d = _mm256_max_pd(d,_mm256_setzero_pd());
541 d2 = _mm256_mul_pd(d,d);
542 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
544 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
546 /* Evaluate switch function */
547 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
548 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
549 velec = _mm256_mul_pd(velec,sw);
550 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
552 /* Update potential sum for this i atom from the interaction with this j atom. */
553 velec = _mm256_and_pd(velec,cutoff_mask);
554 velecsum = _mm256_add_pd(velecsum,velec);
558 fscal = _mm256_and_pd(fscal,cutoff_mask);
560 /* Calculate temporary vectorial force */
561 tx = _mm256_mul_pd(fscal,dx10);
562 ty = _mm256_mul_pd(fscal,dy10);
563 tz = _mm256_mul_pd(fscal,dz10);
565 /* Update vectorial force */
566 fix1 = _mm256_add_pd(fix1,tx);
567 fiy1 = _mm256_add_pd(fiy1,ty);
568 fiz1 = _mm256_add_pd(fiz1,tz);
570 fjx0 = _mm256_add_pd(fjx0,tx);
571 fjy0 = _mm256_add_pd(fjy0,ty);
572 fjz0 = _mm256_add_pd(fjz0,tz);
576 /**************************
577 * CALCULATE INTERACTIONS *
578 **************************/
580 if (gmx_mm256_any_lt(rsq11,rcutoff2))
583 r11 = _mm256_mul_pd(rsq11,rinv11);
585 /* EWALD ELECTROSTATICS */
587 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
588 ewrt = _mm256_mul_pd(r11,ewtabscale);
589 ewitab = _mm256_cvttpd_epi32(ewrt);
590 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
591 ewitab = _mm_slli_epi32(ewitab,2);
592 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
593 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
594 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
595 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
596 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
597 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
598 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
599 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
600 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
602 d = _mm256_sub_pd(r11,rswitch);
603 d = _mm256_max_pd(d,_mm256_setzero_pd());
604 d2 = _mm256_mul_pd(d,d);
605 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
607 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
609 /* Evaluate switch function */
610 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
611 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
612 velec = _mm256_mul_pd(velec,sw);
613 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
615 /* Update potential sum for this i atom from the interaction with this j atom. */
616 velec = _mm256_and_pd(velec,cutoff_mask);
617 velecsum = _mm256_add_pd(velecsum,velec);
621 fscal = _mm256_and_pd(fscal,cutoff_mask);
623 /* Calculate temporary vectorial force */
624 tx = _mm256_mul_pd(fscal,dx11);
625 ty = _mm256_mul_pd(fscal,dy11);
626 tz = _mm256_mul_pd(fscal,dz11);
628 /* Update vectorial force */
629 fix1 = _mm256_add_pd(fix1,tx);
630 fiy1 = _mm256_add_pd(fiy1,ty);
631 fiz1 = _mm256_add_pd(fiz1,tz);
633 fjx1 = _mm256_add_pd(fjx1,tx);
634 fjy1 = _mm256_add_pd(fjy1,ty);
635 fjz1 = _mm256_add_pd(fjz1,tz);
639 /**************************
640 * CALCULATE INTERACTIONS *
641 **************************/
643 if (gmx_mm256_any_lt(rsq12,rcutoff2))
646 r12 = _mm256_mul_pd(rsq12,rinv12);
648 /* EWALD ELECTROSTATICS */
650 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
651 ewrt = _mm256_mul_pd(r12,ewtabscale);
652 ewitab = _mm256_cvttpd_epi32(ewrt);
653 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
654 ewitab = _mm_slli_epi32(ewitab,2);
655 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
656 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
657 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
658 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
659 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
660 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
661 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
662 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
663 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
665 d = _mm256_sub_pd(r12,rswitch);
666 d = _mm256_max_pd(d,_mm256_setzero_pd());
667 d2 = _mm256_mul_pd(d,d);
668 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
670 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
672 /* Evaluate switch function */
673 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
674 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
675 velec = _mm256_mul_pd(velec,sw);
676 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
678 /* Update potential sum for this i atom from the interaction with this j atom. */
679 velec = _mm256_and_pd(velec,cutoff_mask);
680 velecsum = _mm256_add_pd(velecsum,velec);
684 fscal = _mm256_and_pd(fscal,cutoff_mask);
686 /* Calculate temporary vectorial force */
687 tx = _mm256_mul_pd(fscal,dx12);
688 ty = _mm256_mul_pd(fscal,dy12);
689 tz = _mm256_mul_pd(fscal,dz12);
691 /* Update vectorial force */
692 fix1 = _mm256_add_pd(fix1,tx);
693 fiy1 = _mm256_add_pd(fiy1,ty);
694 fiz1 = _mm256_add_pd(fiz1,tz);
696 fjx2 = _mm256_add_pd(fjx2,tx);
697 fjy2 = _mm256_add_pd(fjy2,ty);
698 fjz2 = _mm256_add_pd(fjz2,tz);
702 /**************************
703 * CALCULATE INTERACTIONS *
704 **************************/
706 if (gmx_mm256_any_lt(rsq20,rcutoff2))
709 r20 = _mm256_mul_pd(rsq20,rinv20);
711 /* EWALD ELECTROSTATICS */
713 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
714 ewrt = _mm256_mul_pd(r20,ewtabscale);
715 ewitab = _mm256_cvttpd_epi32(ewrt);
716 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
717 ewitab = _mm_slli_epi32(ewitab,2);
718 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
719 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
720 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
721 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
722 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
723 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
724 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
725 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
726 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
728 d = _mm256_sub_pd(r20,rswitch);
729 d = _mm256_max_pd(d,_mm256_setzero_pd());
730 d2 = _mm256_mul_pd(d,d);
731 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
733 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
735 /* Evaluate switch function */
736 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
737 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
738 velec = _mm256_mul_pd(velec,sw);
739 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
741 /* Update potential sum for this i atom from the interaction with this j atom. */
742 velec = _mm256_and_pd(velec,cutoff_mask);
743 velecsum = _mm256_add_pd(velecsum,velec);
747 fscal = _mm256_and_pd(fscal,cutoff_mask);
749 /* Calculate temporary vectorial force */
750 tx = _mm256_mul_pd(fscal,dx20);
751 ty = _mm256_mul_pd(fscal,dy20);
752 tz = _mm256_mul_pd(fscal,dz20);
754 /* Update vectorial force */
755 fix2 = _mm256_add_pd(fix2,tx);
756 fiy2 = _mm256_add_pd(fiy2,ty);
757 fiz2 = _mm256_add_pd(fiz2,tz);
759 fjx0 = _mm256_add_pd(fjx0,tx);
760 fjy0 = _mm256_add_pd(fjy0,ty);
761 fjz0 = _mm256_add_pd(fjz0,tz);
765 /**************************
766 * CALCULATE INTERACTIONS *
767 **************************/
769 if (gmx_mm256_any_lt(rsq21,rcutoff2))
772 r21 = _mm256_mul_pd(rsq21,rinv21);
774 /* EWALD ELECTROSTATICS */
776 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
777 ewrt = _mm256_mul_pd(r21,ewtabscale);
778 ewitab = _mm256_cvttpd_epi32(ewrt);
779 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
780 ewitab = _mm_slli_epi32(ewitab,2);
781 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
782 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
783 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
784 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
785 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
786 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
787 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
788 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
789 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
791 d = _mm256_sub_pd(r21,rswitch);
792 d = _mm256_max_pd(d,_mm256_setzero_pd());
793 d2 = _mm256_mul_pd(d,d);
794 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
796 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
798 /* Evaluate switch function */
799 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
800 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
801 velec = _mm256_mul_pd(velec,sw);
802 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
804 /* Update potential sum for this i atom from the interaction with this j atom. */
805 velec = _mm256_and_pd(velec,cutoff_mask);
806 velecsum = _mm256_add_pd(velecsum,velec);
810 fscal = _mm256_and_pd(fscal,cutoff_mask);
812 /* Calculate temporary vectorial force */
813 tx = _mm256_mul_pd(fscal,dx21);
814 ty = _mm256_mul_pd(fscal,dy21);
815 tz = _mm256_mul_pd(fscal,dz21);
817 /* Update vectorial force */
818 fix2 = _mm256_add_pd(fix2,tx);
819 fiy2 = _mm256_add_pd(fiy2,ty);
820 fiz2 = _mm256_add_pd(fiz2,tz);
822 fjx1 = _mm256_add_pd(fjx1,tx);
823 fjy1 = _mm256_add_pd(fjy1,ty);
824 fjz1 = _mm256_add_pd(fjz1,tz);
828 /**************************
829 * CALCULATE INTERACTIONS *
830 **************************/
832 if (gmx_mm256_any_lt(rsq22,rcutoff2))
835 r22 = _mm256_mul_pd(rsq22,rinv22);
837 /* EWALD ELECTROSTATICS */
839 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
840 ewrt = _mm256_mul_pd(r22,ewtabscale);
841 ewitab = _mm256_cvttpd_epi32(ewrt);
842 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
843 ewitab = _mm_slli_epi32(ewitab,2);
844 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
845 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
846 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
847 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
848 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
849 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
850 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
851 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
852 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
854 d = _mm256_sub_pd(r22,rswitch);
855 d = _mm256_max_pd(d,_mm256_setzero_pd());
856 d2 = _mm256_mul_pd(d,d);
857 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
859 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
861 /* Evaluate switch function */
862 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
863 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
864 velec = _mm256_mul_pd(velec,sw);
865 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
867 /* Update potential sum for this i atom from the interaction with this j atom. */
868 velec = _mm256_and_pd(velec,cutoff_mask);
869 velecsum = _mm256_add_pd(velecsum,velec);
873 fscal = _mm256_and_pd(fscal,cutoff_mask);
875 /* Calculate temporary vectorial force */
876 tx = _mm256_mul_pd(fscal,dx22);
877 ty = _mm256_mul_pd(fscal,dy22);
878 tz = _mm256_mul_pd(fscal,dz22);
880 /* Update vectorial force */
881 fix2 = _mm256_add_pd(fix2,tx);
882 fiy2 = _mm256_add_pd(fiy2,ty);
883 fiz2 = _mm256_add_pd(fiz2,tz);
885 fjx2 = _mm256_add_pd(fjx2,tx);
886 fjy2 = _mm256_add_pd(fjy2,ty);
887 fjz2 = _mm256_add_pd(fjz2,tz);
891 fjptrA = f+j_coord_offsetA;
892 fjptrB = f+j_coord_offsetB;
893 fjptrC = f+j_coord_offsetC;
894 fjptrD = f+j_coord_offsetD;
896 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
897 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
899 /* Inner loop uses 603 flops */
905 /* Get j neighbor index, and coordinate index */
906 jnrlistA = jjnr[jidx];
907 jnrlistB = jjnr[jidx+1];
908 jnrlistC = jjnr[jidx+2];
909 jnrlistD = jjnr[jidx+3];
910 /* Sign of each element will be negative for non-real atoms.
911 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
912 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
914 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
916 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
917 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
918 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
920 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
921 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
922 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
923 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
924 j_coord_offsetA = DIM*jnrA;
925 j_coord_offsetB = DIM*jnrB;
926 j_coord_offsetC = DIM*jnrC;
927 j_coord_offsetD = DIM*jnrD;
929 /* load j atom coordinates */
930 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
931 x+j_coord_offsetC,x+j_coord_offsetD,
932 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
934 /* Calculate displacement vector */
935 dx00 = _mm256_sub_pd(ix0,jx0);
936 dy00 = _mm256_sub_pd(iy0,jy0);
937 dz00 = _mm256_sub_pd(iz0,jz0);
938 dx01 = _mm256_sub_pd(ix0,jx1);
939 dy01 = _mm256_sub_pd(iy0,jy1);
940 dz01 = _mm256_sub_pd(iz0,jz1);
941 dx02 = _mm256_sub_pd(ix0,jx2);
942 dy02 = _mm256_sub_pd(iy0,jy2);
943 dz02 = _mm256_sub_pd(iz0,jz2);
944 dx10 = _mm256_sub_pd(ix1,jx0);
945 dy10 = _mm256_sub_pd(iy1,jy0);
946 dz10 = _mm256_sub_pd(iz1,jz0);
947 dx11 = _mm256_sub_pd(ix1,jx1);
948 dy11 = _mm256_sub_pd(iy1,jy1);
949 dz11 = _mm256_sub_pd(iz1,jz1);
950 dx12 = _mm256_sub_pd(ix1,jx2);
951 dy12 = _mm256_sub_pd(iy1,jy2);
952 dz12 = _mm256_sub_pd(iz1,jz2);
953 dx20 = _mm256_sub_pd(ix2,jx0);
954 dy20 = _mm256_sub_pd(iy2,jy0);
955 dz20 = _mm256_sub_pd(iz2,jz0);
956 dx21 = _mm256_sub_pd(ix2,jx1);
957 dy21 = _mm256_sub_pd(iy2,jy1);
958 dz21 = _mm256_sub_pd(iz2,jz1);
959 dx22 = _mm256_sub_pd(ix2,jx2);
960 dy22 = _mm256_sub_pd(iy2,jy2);
961 dz22 = _mm256_sub_pd(iz2,jz2);
963 /* Calculate squared distance and things based on it */
964 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
965 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
966 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
967 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
968 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
969 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
970 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
971 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
972 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
974 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
975 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
976 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
977 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
978 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
979 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
980 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
981 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
982 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
984 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
985 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
986 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
987 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
988 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
989 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
990 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
991 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
992 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
994 fjx0 = _mm256_setzero_pd();
995 fjy0 = _mm256_setzero_pd();
996 fjz0 = _mm256_setzero_pd();
997 fjx1 = _mm256_setzero_pd();
998 fjy1 = _mm256_setzero_pd();
999 fjz1 = _mm256_setzero_pd();
1000 fjx2 = _mm256_setzero_pd();
1001 fjy2 = _mm256_setzero_pd();
1002 fjz2 = _mm256_setzero_pd();
1004 /**************************
1005 * CALCULATE INTERACTIONS *
1006 **************************/
1008 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1011 r00 = _mm256_mul_pd(rsq00,rinv00);
1012 r00 = _mm256_andnot_pd(dummy_mask,r00);
1014 /* EWALD ELECTROSTATICS */
1016 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1017 ewrt = _mm256_mul_pd(r00,ewtabscale);
1018 ewitab = _mm256_cvttpd_epi32(ewrt);
1019 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1020 ewitab = _mm_slli_epi32(ewitab,2);
1021 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1022 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1023 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1024 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1025 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1026 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1027 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1028 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
1029 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1031 /* LENNARD-JONES DISPERSION/REPULSION */
1033 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1034 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1035 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1036 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1037 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1039 d = _mm256_sub_pd(r00,rswitch);
1040 d = _mm256_max_pd(d,_mm256_setzero_pd());
1041 d2 = _mm256_mul_pd(d,d);
1042 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1044 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1046 /* Evaluate switch function */
1047 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1048 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
1049 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1050 velec = _mm256_mul_pd(velec,sw);
1051 vvdw = _mm256_mul_pd(vvdw,sw);
1052 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1054 /* Update potential sum for this i atom from the interaction with this j atom. */
1055 velec = _mm256_and_pd(velec,cutoff_mask);
1056 velec = _mm256_andnot_pd(dummy_mask,velec);
1057 velecsum = _mm256_add_pd(velecsum,velec);
1058 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
1059 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
1060 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
1062 fscal = _mm256_add_pd(felec,fvdw);
1064 fscal = _mm256_and_pd(fscal,cutoff_mask);
1066 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1068 /* Calculate temporary vectorial force */
1069 tx = _mm256_mul_pd(fscal,dx00);
1070 ty = _mm256_mul_pd(fscal,dy00);
1071 tz = _mm256_mul_pd(fscal,dz00);
1073 /* Update vectorial force */
1074 fix0 = _mm256_add_pd(fix0,tx);
1075 fiy0 = _mm256_add_pd(fiy0,ty);
1076 fiz0 = _mm256_add_pd(fiz0,tz);
1078 fjx0 = _mm256_add_pd(fjx0,tx);
1079 fjy0 = _mm256_add_pd(fjy0,ty);
1080 fjz0 = _mm256_add_pd(fjz0,tz);
1084 /**************************
1085 * CALCULATE INTERACTIONS *
1086 **************************/
1088 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1091 r01 = _mm256_mul_pd(rsq01,rinv01);
1092 r01 = _mm256_andnot_pd(dummy_mask,r01);
1094 /* EWALD ELECTROSTATICS */
1096 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1097 ewrt = _mm256_mul_pd(r01,ewtabscale);
1098 ewitab = _mm256_cvttpd_epi32(ewrt);
1099 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1100 ewitab = _mm_slli_epi32(ewitab,2);
1101 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1102 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1103 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1104 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1105 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1106 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1107 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1108 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
1109 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1111 d = _mm256_sub_pd(r01,rswitch);
1112 d = _mm256_max_pd(d,_mm256_setzero_pd());
1113 d2 = _mm256_mul_pd(d,d);
1114 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1116 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1118 /* Evaluate switch function */
1119 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1120 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
1121 velec = _mm256_mul_pd(velec,sw);
1122 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1124 /* Update potential sum for this i atom from the interaction with this j atom. */
1125 velec = _mm256_and_pd(velec,cutoff_mask);
1126 velec = _mm256_andnot_pd(dummy_mask,velec);
1127 velecsum = _mm256_add_pd(velecsum,velec);
1131 fscal = _mm256_and_pd(fscal,cutoff_mask);
1133 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1135 /* Calculate temporary vectorial force */
1136 tx = _mm256_mul_pd(fscal,dx01);
1137 ty = _mm256_mul_pd(fscal,dy01);
1138 tz = _mm256_mul_pd(fscal,dz01);
1140 /* Update vectorial force */
1141 fix0 = _mm256_add_pd(fix0,tx);
1142 fiy0 = _mm256_add_pd(fiy0,ty);
1143 fiz0 = _mm256_add_pd(fiz0,tz);
1145 fjx1 = _mm256_add_pd(fjx1,tx);
1146 fjy1 = _mm256_add_pd(fjy1,ty);
1147 fjz1 = _mm256_add_pd(fjz1,tz);
1151 /**************************
1152 * CALCULATE INTERACTIONS *
1153 **************************/
1155 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1158 r02 = _mm256_mul_pd(rsq02,rinv02);
1159 r02 = _mm256_andnot_pd(dummy_mask,r02);
1161 /* EWALD ELECTROSTATICS */
1163 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1164 ewrt = _mm256_mul_pd(r02,ewtabscale);
1165 ewitab = _mm256_cvttpd_epi32(ewrt);
1166 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1167 ewitab = _mm_slli_epi32(ewitab,2);
1168 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1169 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1170 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1171 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1172 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1173 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1174 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1175 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
1176 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1178 d = _mm256_sub_pd(r02,rswitch);
1179 d = _mm256_max_pd(d,_mm256_setzero_pd());
1180 d2 = _mm256_mul_pd(d,d);
1181 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1183 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1185 /* Evaluate switch function */
1186 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1187 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
1188 velec = _mm256_mul_pd(velec,sw);
1189 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1191 /* Update potential sum for this i atom from the interaction with this j atom. */
1192 velec = _mm256_and_pd(velec,cutoff_mask);
1193 velec = _mm256_andnot_pd(dummy_mask,velec);
1194 velecsum = _mm256_add_pd(velecsum,velec);
1198 fscal = _mm256_and_pd(fscal,cutoff_mask);
1200 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1202 /* Calculate temporary vectorial force */
1203 tx = _mm256_mul_pd(fscal,dx02);
1204 ty = _mm256_mul_pd(fscal,dy02);
1205 tz = _mm256_mul_pd(fscal,dz02);
1207 /* Update vectorial force */
1208 fix0 = _mm256_add_pd(fix0,tx);
1209 fiy0 = _mm256_add_pd(fiy0,ty);
1210 fiz0 = _mm256_add_pd(fiz0,tz);
1212 fjx2 = _mm256_add_pd(fjx2,tx);
1213 fjy2 = _mm256_add_pd(fjy2,ty);
1214 fjz2 = _mm256_add_pd(fjz2,tz);
1218 /**************************
1219 * CALCULATE INTERACTIONS *
1220 **************************/
1222 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1225 r10 = _mm256_mul_pd(rsq10,rinv10);
1226 r10 = _mm256_andnot_pd(dummy_mask,r10);
1228 /* EWALD ELECTROSTATICS */
1230 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1231 ewrt = _mm256_mul_pd(r10,ewtabscale);
1232 ewitab = _mm256_cvttpd_epi32(ewrt);
1233 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1234 ewitab = _mm_slli_epi32(ewitab,2);
1235 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1236 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1237 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1238 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1239 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1240 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1241 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1242 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1243 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1245 d = _mm256_sub_pd(r10,rswitch);
1246 d = _mm256_max_pd(d,_mm256_setzero_pd());
1247 d2 = _mm256_mul_pd(d,d);
1248 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1250 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1252 /* Evaluate switch function */
1253 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1254 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
1255 velec = _mm256_mul_pd(velec,sw);
1256 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1258 /* Update potential sum for this i atom from the interaction with this j atom. */
1259 velec = _mm256_and_pd(velec,cutoff_mask);
1260 velec = _mm256_andnot_pd(dummy_mask,velec);
1261 velecsum = _mm256_add_pd(velecsum,velec);
1265 fscal = _mm256_and_pd(fscal,cutoff_mask);
1267 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1269 /* Calculate temporary vectorial force */
1270 tx = _mm256_mul_pd(fscal,dx10);
1271 ty = _mm256_mul_pd(fscal,dy10);
1272 tz = _mm256_mul_pd(fscal,dz10);
1274 /* Update vectorial force */
1275 fix1 = _mm256_add_pd(fix1,tx);
1276 fiy1 = _mm256_add_pd(fiy1,ty);
1277 fiz1 = _mm256_add_pd(fiz1,tz);
1279 fjx0 = _mm256_add_pd(fjx0,tx);
1280 fjy0 = _mm256_add_pd(fjy0,ty);
1281 fjz0 = _mm256_add_pd(fjz0,tz);
1285 /**************************
1286 * CALCULATE INTERACTIONS *
1287 **************************/
1289 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1292 r11 = _mm256_mul_pd(rsq11,rinv11);
1293 r11 = _mm256_andnot_pd(dummy_mask,r11);
1295 /* EWALD ELECTROSTATICS */
1297 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1298 ewrt = _mm256_mul_pd(r11,ewtabscale);
1299 ewitab = _mm256_cvttpd_epi32(ewrt);
1300 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1301 ewitab = _mm_slli_epi32(ewitab,2);
1302 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1303 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1304 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1305 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1306 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1307 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1308 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1309 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1310 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1312 d = _mm256_sub_pd(r11,rswitch);
1313 d = _mm256_max_pd(d,_mm256_setzero_pd());
1314 d2 = _mm256_mul_pd(d,d);
1315 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1317 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1319 /* Evaluate switch function */
1320 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1321 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
1322 velec = _mm256_mul_pd(velec,sw);
1323 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1325 /* Update potential sum for this i atom from the interaction with this j atom. */
1326 velec = _mm256_and_pd(velec,cutoff_mask);
1327 velec = _mm256_andnot_pd(dummy_mask,velec);
1328 velecsum = _mm256_add_pd(velecsum,velec);
1332 fscal = _mm256_and_pd(fscal,cutoff_mask);
1334 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1336 /* Calculate temporary vectorial force */
1337 tx = _mm256_mul_pd(fscal,dx11);
1338 ty = _mm256_mul_pd(fscal,dy11);
1339 tz = _mm256_mul_pd(fscal,dz11);
1341 /* Update vectorial force */
1342 fix1 = _mm256_add_pd(fix1,tx);
1343 fiy1 = _mm256_add_pd(fiy1,ty);
1344 fiz1 = _mm256_add_pd(fiz1,tz);
1346 fjx1 = _mm256_add_pd(fjx1,tx);
1347 fjy1 = _mm256_add_pd(fjy1,ty);
1348 fjz1 = _mm256_add_pd(fjz1,tz);
1352 /**************************
1353 * CALCULATE INTERACTIONS *
1354 **************************/
1356 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1359 r12 = _mm256_mul_pd(rsq12,rinv12);
1360 r12 = _mm256_andnot_pd(dummy_mask,r12);
1362 /* EWALD ELECTROSTATICS */
1364 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1365 ewrt = _mm256_mul_pd(r12,ewtabscale);
1366 ewitab = _mm256_cvttpd_epi32(ewrt);
1367 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1368 ewitab = _mm_slli_epi32(ewitab,2);
1369 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1370 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1371 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1372 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1373 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1374 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1375 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1376 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1377 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1379 d = _mm256_sub_pd(r12,rswitch);
1380 d = _mm256_max_pd(d,_mm256_setzero_pd());
1381 d2 = _mm256_mul_pd(d,d);
1382 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1384 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1386 /* Evaluate switch function */
1387 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1388 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
1389 velec = _mm256_mul_pd(velec,sw);
1390 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1392 /* Update potential sum for this i atom from the interaction with this j atom. */
1393 velec = _mm256_and_pd(velec,cutoff_mask);
1394 velec = _mm256_andnot_pd(dummy_mask,velec);
1395 velecsum = _mm256_add_pd(velecsum,velec);
1399 fscal = _mm256_and_pd(fscal,cutoff_mask);
1401 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1403 /* Calculate temporary vectorial force */
1404 tx = _mm256_mul_pd(fscal,dx12);
1405 ty = _mm256_mul_pd(fscal,dy12);
1406 tz = _mm256_mul_pd(fscal,dz12);
1408 /* Update vectorial force */
1409 fix1 = _mm256_add_pd(fix1,tx);
1410 fiy1 = _mm256_add_pd(fiy1,ty);
1411 fiz1 = _mm256_add_pd(fiz1,tz);
1413 fjx2 = _mm256_add_pd(fjx2,tx);
1414 fjy2 = _mm256_add_pd(fjy2,ty);
1415 fjz2 = _mm256_add_pd(fjz2,tz);
1419 /**************************
1420 * CALCULATE INTERACTIONS *
1421 **************************/
1423 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1426 r20 = _mm256_mul_pd(rsq20,rinv20);
1427 r20 = _mm256_andnot_pd(dummy_mask,r20);
1429 /* EWALD ELECTROSTATICS */
1431 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1432 ewrt = _mm256_mul_pd(r20,ewtabscale);
1433 ewitab = _mm256_cvttpd_epi32(ewrt);
1434 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1435 ewitab = _mm_slli_epi32(ewitab,2);
1436 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1437 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1438 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1439 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1440 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1441 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1442 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1443 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1444 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1446 d = _mm256_sub_pd(r20,rswitch);
1447 d = _mm256_max_pd(d,_mm256_setzero_pd());
1448 d2 = _mm256_mul_pd(d,d);
1449 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1451 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1453 /* Evaluate switch function */
1454 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1455 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
1456 velec = _mm256_mul_pd(velec,sw);
1457 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1459 /* Update potential sum for this i atom from the interaction with this j atom. */
1460 velec = _mm256_and_pd(velec,cutoff_mask);
1461 velec = _mm256_andnot_pd(dummy_mask,velec);
1462 velecsum = _mm256_add_pd(velecsum,velec);
1466 fscal = _mm256_and_pd(fscal,cutoff_mask);
1468 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1470 /* Calculate temporary vectorial force */
1471 tx = _mm256_mul_pd(fscal,dx20);
1472 ty = _mm256_mul_pd(fscal,dy20);
1473 tz = _mm256_mul_pd(fscal,dz20);
1475 /* Update vectorial force */
1476 fix2 = _mm256_add_pd(fix2,tx);
1477 fiy2 = _mm256_add_pd(fiy2,ty);
1478 fiz2 = _mm256_add_pd(fiz2,tz);
1480 fjx0 = _mm256_add_pd(fjx0,tx);
1481 fjy0 = _mm256_add_pd(fjy0,ty);
1482 fjz0 = _mm256_add_pd(fjz0,tz);
1486 /**************************
1487 * CALCULATE INTERACTIONS *
1488 **************************/
1490 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1493 r21 = _mm256_mul_pd(rsq21,rinv21);
1494 r21 = _mm256_andnot_pd(dummy_mask,r21);
1496 /* EWALD ELECTROSTATICS */
1498 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1499 ewrt = _mm256_mul_pd(r21,ewtabscale);
1500 ewitab = _mm256_cvttpd_epi32(ewrt);
1501 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1502 ewitab = _mm_slli_epi32(ewitab,2);
1503 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1504 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1505 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1506 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1507 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1508 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1509 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1510 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1511 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1513 d = _mm256_sub_pd(r21,rswitch);
1514 d = _mm256_max_pd(d,_mm256_setzero_pd());
1515 d2 = _mm256_mul_pd(d,d);
1516 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1518 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1520 /* Evaluate switch function */
1521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1522 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
1523 velec = _mm256_mul_pd(velec,sw);
1524 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1526 /* Update potential sum for this i atom from the interaction with this j atom. */
1527 velec = _mm256_and_pd(velec,cutoff_mask);
1528 velec = _mm256_andnot_pd(dummy_mask,velec);
1529 velecsum = _mm256_add_pd(velecsum,velec);
1533 fscal = _mm256_and_pd(fscal,cutoff_mask);
1535 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1537 /* Calculate temporary vectorial force */
1538 tx = _mm256_mul_pd(fscal,dx21);
1539 ty = _mm256_mul_pd(fscal,dy21);
1540 tz = _mm256_mul_pd(fscal,dz21);
1542 /* Update vectorial force */
1543 fix2 = _mm256_add_pd(fix2,tx);
1544 fiy2 = _mm256_add_pd(fiy2,ty);
1545 fiz2 = _mm256_add_pd(fiz2,tz);
1547 fjx1 = _mm256_add_pd(fjx1,tx);
1548 fjy1 = _mm256_add_pd(fjy1,ty);
1549 fjz1 = _mm256_add_pd(fjz1,tz);
1553 /**************************
1554 * CALCULATE INTERACTIONS *
1555 **************************/
1557 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1560 r22 = _mm256_mul_pd(rsq22,rinv22);
1561 r22 = _mm256_andnot_pd(dummy_mask,r22);
1563 /* EWALD ELECTROSTATICS */
1565 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1566 ewrt = _mm256_mul_pd(r22,ewtabscale);
1567 ewitab = _mm256_cvttpd_epi32(ewrt);
1568 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1569 ewitab = _mm_slli_epi32(ewitab,2);
1570 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1571 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1572 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1573 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1574 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1575 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1576 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1577 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1578 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1580 d = _mm256_sub_pd(r22,rswitch);
1581 d = _mm256_max_pd(d,_mm256_setzero_pd());
1582 d2 = _mm256_mul_pd(d,d);
1583 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1585 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1587 /* Evaluate switch function */
1588 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1589 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
1590 velec = _mm256_mul_pd(velec,sw);
1591 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1593 /* Update potential sum for this i atom from the interaction with this j atom. */
1594 velec = _mm256_and_pd(velec,cutoff_mask);
1595 velec = _mm256_andnot_pd(dummy_mask,velec);
1596 velecsum = _mm256_add_pd(velecsum,velec);
1600 fscal = _mm256_and_pd(fscal,cutoff_mask);
1602 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1604 /* Calculate temporary vectorial force */
1605 tx = _mm256_mul_pd(fscal,dx22);
1606 ty = _mm256_mul_pd(fscal,dy22);
1607 tz = _mm256_mul_pd(fscal,dz22);
1609 /* Update vectorial force */
1610 fix2 = _mm256_add_pd(fix2,tx);
1611 fiy2 = _mm256_add_pd(fiy2,ty);
1612 fiz2 = _mm256_add_pd(fiz2,tz);
1614 fjx2 = _mm256_add_pd(fjx2,tx);
1615 fjy2 = _mm256_add_pd(fjy2,ty);
1616 fjz2 = _mm256_add_pd(fjz2,tz);
1620 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1621 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1622 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1623 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1625 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1626 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1628 /* Inner loop uses 612 flops */
1631 /* End of innermost loop */
1633 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1634 f+i_coord_offset,fshift+i_shift_offset);
1637 /* Update potential energies */
1638 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1639 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1641 /* Increment number of inner iterations */
1642 inneriter += j_index_end - j_index_start;
1644 /* Outer loop uses 20 flops */
1647 /* Increment number of outer iterations */
1650 /* Update outer/inner flops */
1652 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
1655 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double
1656 * Electrostatics interaction: Ewald
1657 * VdW interaction: LennardJones
1658 * Geometry: Water3-Water3
1659 * Calculate force/pot: Force
1662 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_double
1663 (t_nblist * gmx_restrict nlist,
1664 rvec * gmx_restrict xx,
1665 rvec * gmx_restrict ff,
1666 t_forcerec * gmx_restrict fr,
1667 t_mdatoms * gmx_restrict mdatoms,
1668 nb_kernel_data_t * gmx_restrict kernel_data,
1669 t_nrnb * gmx_restrict nrnb)
1671 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1672 * just 0 for non-waters.
1673 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1674 * jnr indices corresponding to data put in the four positions in the SIMD register.
1676 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1677 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1678 int jnrA,jnrB,jnrC,jnrD;
1679 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1680 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1681 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1682 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1683 real rcutoff_scalar;
1684 real *shiftvec,*fshift,*x,*f;
1685 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1686 real scratch[4*DIM];
1687 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1688 real * vdwioffsetptr0;
1689 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1690 real * vdwioffsetptr1;
1691 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1692 real * vdwioffsetptr2;
1693 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1694 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1695 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1696 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1697 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1698 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1699 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1700 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1701 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1702 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1703 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1704 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1705 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1706 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1707 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1708 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1709 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1712 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1715 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1716 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1718 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1719 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1721 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1722 real rswitch_scalar,d_scalar;
1723 __m256d dummy_mask,cutoff_mask;
1724 __m128 tmpmask0,tmpmask1;
1725 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1726 __m256d one = _mm256_set1_pd(1.0);
1727 __m256d two = _mm256_set1_pd(2.0);
1733 jindex = nlist->jindex;
1735 shiftidx = nlist->shift;
1737 shiftvec = fr->shift_vec[0];
1738 fshift = fr->fshift[0];
1739 facel = _mm256_set1_pd(fr->epsfac);
1740 charge = mdatoms->chargeA;
1741 nvdwtype = fr->ntype;
1742 vdwparam = fr->nbfp;
1743 vdwtype = mdatoms->typeA;
1745 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1746 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1747 beta2 = _mm256_mul_pd(beta,beta);
1748 beta3 = _mm256_mul_pd(beta,beta2);
1750 ewtab = fr->ic->tabq_coul_FDV0;
1751 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1752 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1754 /* Setup water-specific parameters */
1755 inr = nlist->iinr[0];
1756 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1757 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1758 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1759 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1761 jq0 = _mm256_set1_pd(charge[inr+0]);
1762 jq1 = _mm256_set1_pd(charge[inr+1]);
1763 jq2 = _mm256_set1_pd(charge[inr+2]);
1764 vdwjidx0A = 2*vdwtype[inr+0];
1765 qq00 = _mm256_mul_pd(iq0,jq0);
1766 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1767 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1768 qq01 = _mm256_mul_pd(iq0,jq1);
1769 qq02 = _mm256_mul_pd(iq0,jq2);
1770 qq10 = _mm256_mul_pd(iq1,jq0);
1771 qq11 = _mm256_mul_pd(iq1,jq1);
1772 qq12 = _mm256_mul_pd(iq1,jq2);
1773 qq20 = _mm256_mul_pd(iq2,jq0);
1774 qq21 = _mm256_mul_pd(iq2,jq1);
1775 qq22 = _mm256_mul_pd(iq2,jq2);
1777 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1778 rcutoff_scalar = fr->rcoulomb;
1779 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1780 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1782 rswitch_scalar = fr->rcoulomb_switch;
1783 rswitch = _mm256_set1_pd(rswitch_scalar);
1784 /* Setup switch parameters */
1785 d_scalar = rcutoff_scalar-rswitch_scalar;
1786 d = _mm256_set1_pd(d_scalar);
1787 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1788 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1789 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1790 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1791 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1792 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1794 /* Avoid stupid compiler warnings */
1795 jnrA = jnrB = jnrC = jnrD = 0;
1796 j_coord_offsetA = 0;
1797 j_coord_offsetB = 0;
1798 j_coord_offsetC = 0;
1799 j_coord_offsetD = 0;
1804 for(iidx=0;iidx<4*DIM;iidx++)
1806 scratch[iidx] = 0.0;
1809 /* Start outer loop over neighborlists */
1810 for(iidx=0; iidx<nri; iidx++)
1812 /* Load shift vector for this list */
1813 i_shift_offset = DIM*shiftidx[iidx];
1815 /* Load limits for loop over neighbors */
1816 j_index_start = jindex[iidx];
1817 j_index_end = jindex[iidx+1];
1819 /* Get outer coordinate index */
1821 i_coord_offset = DIM*inr;
1823 /* Load i particle coords and add shift vector */
1824 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1825 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1827 fix0 = _mm256_setzero_pd();
1828 fiy0 = _mm256_setzero_pd();
1829 fiz0 = _mm256_setzero_pd();
1830 fix1 = _mm256_setzero_pd();
1831 fiy1 = _mm256_setzero_pd();
1832 fiz1 = _mm256_setzero_pd();
1833 fix2 = _mm256_setzero_pd();
1834 fiy2 = _mm256_setzero_pd();
1835 fiz2 = _mm256_setzero_pd();
1837 /* Start inner kernel loop */
1838 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1841 /* Get j neighbor index, and coordinate index */
1843 jnrB = jjnr[jidx+1];
1844 jnrC = jjnr[jidx+2];
1845 jnrD = jjnr[jidx+3];
1846 j_coord_offsetA = DIM*jnrA;
1847 j_coord_offsetB = DIM*jnrB;
1848 j_coord_offsetC = DIM*jnrC;
1849 j_coord_offsetD = DIM*jnrD;
1851 /* load j atom coordinates */
1852 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1853 x+j_coord_offsetC,x+j_coord_offsetD,
1854 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1856 /* Calculate displacement vector */
1857 dx00 = _mm256_sub_pd(ix0,jx0);
1858 dy00 = _mm256_sub_pd(iy0,jy0);
1859 dz00 = _mm256_sub_pd(iz0,jz0);
1860 dx01 = _mm256_sub_pd(ix0,jx1);
1861 dy01 = _mm256_sub_pd(iy0,jy1);
1862 dz01 = _mm256_sub_pd(iz0,jz1);
1863 dx02 = _mm256_sub_pd(ix0,jx2);
1864 dy02 = _mm256_sub_pd(iy0,jy2);
1865 dz02 = _mm256_sub_pd(iz0,jz2);
1866 dx10 = _mm256_sub_pd(ix1,jx0);
1867 dy10 = _mm256_sub_pd(iy1,jy0);
1868 dz10 = _mm256_sub_pd(iz1,jz0);
1869 dx11 = _mm256_sub_pd(ix1,jx1);
1870 dy11 = _mm256_sub_pd(iy1,jy1);
1871 dz11 = _mm256_sub_pd(iz1,jz1);
1872 dx12 = _mm256_sub_pd(ix1,jx2);
1873 dy12 = _mm256_sub_pd(iy1,jy2);
1874 dz12 = _mm256_sub_pd(iz1,jz2);
1875 dx20 = _mm256_sub_pd(ix2,jx0);
1876 dy20 = _mm256_sub_pd(iy2,jy0);
1877 dz20 = _mm256_sub_pd(iz2,jz0);
1878 dx21 = _mm256_sub_pd(ix2,jx1);
1879 dy21 = _mm256_sub_pd(iy2,jy1);
1880 dz21 = _mm256_sub_pd(iz2,jz1);
1881 dx22 = _mm256_sub_pd(ix2,jx2);
1882 dy22 = _mm256_sub_pd(iy2,jy2);
1883 dz22 = _mm256_sub_pd(iz2,jz2);
1885 /* Calculate squared distance and things based on it */
1886 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1887 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1888 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1889 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1890 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1891 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1892 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1893 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1894 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1896 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1897 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1898 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1899 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1900 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1901 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1902 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1903 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1904 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1906 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1907 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1908 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1909 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1910 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1911 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1912 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1913 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1914 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1916 fjx0 = _mm256_setzero_pd();
1917 fjy0 = _mm256_setzero_pd();
1918 fjz0 = _mm256_setzero_pd();
1919 fjx1 = _mm256_setzero_pd();
1920 fjy1 = _mm256_setzero_pd();
1921 fjz1 = _mm256_setzero_pd();
1922 fjx2 = _mm256_setzero_pd();
1923 fjy2 = _mm256_setzero_pd();
1924 fjz2 = _mm256_setzero_pd();
1926 /**************************
1927 * CALCULATE INTERACTIONS *
1928 **************************/
1930 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1933 r00 = _mm256_mul_pd(rsq00,rinv00);
1935 /* EWALD ELECTROSTATICS */
1937 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1938 ewrt = _mm256_mul_pd(r00,ewtabscale);
1939 ewitab = _mm256_cvttpd_epi32(ewrt);
1940 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1941 ewitab = _mm_slli_epi32(ewitab,2);
1942 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1943 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1944 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1945 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1946 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1947 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1948 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1949 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
1950 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1952 /* LENNARD-JONES DISPERSION/REPULSION */
1954 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1955 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1956 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1957 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1958 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1960 d = _mm256_sub_pd(r00,rswitch);
1961 d = _mm256_max_pd(d,_mm256_setzero_pd());
1962 d2 = _mm256_mul_pd(d,d);
1963 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1965 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1967 /* Evaluate switch function */
1968 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1969 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
1970 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1971 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1973 fscal = _mm256_add_pd(felec,fvdw);
1975 fscal = _mm256_and_pd(fscal,cutoff_mask);
1977 /* Calculate temporary vectorial force */
1978 tx = _mm256_mul_pd(fscal,dx00);
1979 ty = _mm256_mul_pd(fscal,dy00);
1980 tz = _mm256_mul_pd(fscal,dz00);
1982 /* Update vectorial force */
1983 fix0 = _mm256_add_pd(fix0,tx);
1984 fiy0 = _mm256_add_pd(fiy0,ty);
1985 fiz0 = _mm256_add_pd(fiz0,tz);
1987 fjx0 = _mm256_add_pd(fjx0,tx);
1988 fjy0 = _mm256_add_pd(fjy0,ty);
1989 fjz0 = _mm256_add_pd(fjz0,tz);
1993 /**************************
1994 * CALCULATE INTERACTIONS *
1995 **************************/
1997 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2000 r01 = _mm256_mul_pd(rsq01,rinv01);
2002 /* EWALD ELECTROSTATICS */
2004 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2005 ewrt = _mm256_mul_pd(r01,ewtabscale);
2006 ewitab = _mm256_cvttpd_epi32(ewrt);
2007 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2008 ewitab = _mm_slli_epi32(ewitab,2);
2009 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2010 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2011 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2012 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2013 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2014 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2015 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2016 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
2017 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2019 d = _mm256_sub_pd(r01,rswitch);
2020 d = _mm256_max_pd(d,_mm256_setzero_pd());
2021 d2 = _mm256_mul_pd(d,d);
2022 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2024 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2026 /* Evaluate switch function */
2027 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2028 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
2029 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2033 fscal = _mm256_and_pd(fscal,cutoff_mask);
2035 /* Calculate temporary vectorial force */
2036 tx = _mm256_mul_pd(fscal,dx01);
2037 ty = _mm256_mul_pd(fscal,dy01);
2038 tz = _mm256_mul_pd(fscal,dz01);
2040 /* Update vectorial force */
2041 fix0 = _mm256_add_pd(fix0,tx);
2042 fiy0 = _mm256_add_pd(fiy0,ty);
2043 fiz0 = _mm256_add_pd(fiz0,tz);
2045 fjx1 = _mm256_add_pd(fjx1,tx);
2046 fjy1 = _mm256_add_pd(fjy1,ty);
2047 fjz1 = _mm256_add_pd(fjz1,tz);
2051 /**************************
2052 * CALCULATE INTERACTIONS *
2053 **************************/
2055 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2058 r02 = _mm256_mul_pd(rsq02,rinv02);
2060 /* EWALD ELECTROSTATICS */
2062 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2063 ewrt = _mm256_mul_pd(r02,ewtabscale);
2064 ewitab = _mm256_cvttpd_epi32(ewrt);
2065 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2066 ewitab = _mm_slli_epi32(ewitab,2);
2067 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2068 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2069 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2070 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2071 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2072 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2073 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2074 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
2075 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2077 d = _mm256_sub_pd(r02,rswitch);
2078 d = _mm256_max_pd(d,_mm256_setzero_pd());
2079 d2 = _mm256_mul_pd(d,d);
2080 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2082 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2084 /* Evaluate switch function */
2085 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2086 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
2087 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2091 fscal = _mm256_and_pd(fscal,cutoff_mask);
2093 /* Calculate temporary vectorial force */
2094 tx = _mm256_mul_pd(fscal,dx02);
2095 ty = _mm256_mul_pd(fscal,dy02);
2096 tz = _mm256_mul_pd(fscal,dz02);
2098 /* Update vectorial force */
2099 fix0 = _mm256_add_pd(fix0,tx);
2100 fiy0 = _mm256_add_pd(fiy0,ty);
2101 fiz0 = _mm256_add_pd(fiz0,tz);
2103 fjx2 = _mm256_add_pd(fjx2,tx);
2104 fjy2 = _mm256_add_pd(fjy2,ty);
2105 fjz2 = _mm256_add_pd(fjz2,tz);
2109 /**************************
2110 * CALCULATE INTERACTIONS *
2111 **************************/
2113 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2116 r10 = _mm256_mul_pd(rsq10,rinv10);
2118 /* EWALD ELECTROSTATICS */
2120 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2121 ewrt = _mm256_mul_pd(r10,ewtabscale);
2122 ewitab = _mm256_cvttpd_epi32(ewrt);
2123 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2124 ewitab = _mm_slli_epi32(ewitab,2);
2125 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2126 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2127 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2128 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2129 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2130 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2131 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2132 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
2133 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2135 d = _mm256_sub_pd(r10,rswitch);
2136 d = _mm256_max_pd(d,_mm256_setzero_pd());
2137 d2 = _mm256_mul_pd(d,d);
2138 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2140 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2142 /* Evaluate switch function */
2143 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2144 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
2145 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2149 fscal = _mm256_and_pd(fscal,cutoff_mask);
2151 /* Calculate temporary vectorial force */
2152 tx = _mm256_mul_pd(fscal,dx10);
2153 ty = _mm256_mul_pd(fscal,dy10);
2154 tz = _mm256_mul_pd(fscal,dz10);
2156 /* Update vectorial force */
2157 fix1 = _mm256_add_pd(fix1,tx);
2158 fiy1 = _mm256_add_pd(fiy1,ty);
2159 fiz1 = _mm256_add_pd(fiz1,tz);
2161 fjx0 = _mm256_add_pd(fjx0,tx);
2162 fjy0 = _mm256_add_pd(fjy0,ty);
2163 fjz0 = _mm256_add_pd(fjz0,tz);
2167 /**************************
2168 * CALCULATE INTERACTIONS *
2169 **************************/
2171 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2174 r11 = _mm256_mul_pd(rsq11,rinv11);
2176 /* EWALD ELECTROSTATICS */
2178 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2179 ewrt = _mm256_mul_pd(r11,ewtabscale);
2180 ewitab = _mm256_cvttpd_epi32(ewrt);
2181 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2182 ewitab = _mm_slli_epi32(ewitab,2);
2183 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2184 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2185 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2186 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2187 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2188 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2189 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2190 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2191 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2193 d = _mm256_sub_pd(r11,rswitch);
2194 d = _mm256_max_pd(d,_mm256_setzero_pd());
2195 d2 = _mm256_mul_pd(d,d);
2196 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2198 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2200 /* Evaluate switch function */
2201 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2202 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2203 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2207 fscal = _mm256_and_pd(fscal,cutoff_mask);
2209 /* Calculate temporary vectorial force */
2210 tx = _mm256_mul_pd(fscal,dx11);
2211 ty = _mm256_mul_pd(fscal,dy11);
2212 tz = _mm256_mul_pd(fscal,dz11);
2214 /* Update vectorial force */
2215 fix1 = _mm256_add_pd(fix1,tx);
2216 fiy1 = _mm256_add_pd(fiy1,ty);
2217 fiz1 = _mm256_add_pd(fiz1,tz);
2219 fjx1 = _mm256_add_pd(fjx1,tx);
2220 fjy1 = _mm256_add_pd(fjy1,ty);
2221 fjz1 = _mm256_add_pd(fjz1,tz);
2225 /**************************
2226 * CALCULATE INTERACTIONS *
2227 **************************/
2229 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2232 r12 = _mm256_mul_pd(rsq12,rinv12);
2234 /* EWALD ELECTROSTATICS */
2236 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2237 ewrt = _mm256_mul_pd(r12,ewtabscale);
2238 ewitab = _mm256_cvttpd_epi32(ewrt);
2239 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2240 ewitab = _mm_slli_epi32(ewitab,2);
2241 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2242 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2243 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2244 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2245 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2246 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2247 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2248 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2249 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2251 d = _mm256_sub_pd(r12,rswitch);
2252 d = _mm256_max_pd(d,_mm256_setzero_pd());
2253 d2 = _mm256_mul_pd(d,d);
2254 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2256 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2258 /* Evaluate switch function */
2259 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2260 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2261 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2265 fscal = _mm256_and_pd(fscal,cutoff_mask);
2267 /* Calculate temporary vectorial force */
2268 tx = _mm256_mul_pd(fscal,dx12);
2269 ty = _mm256_mul_pd(fscal,dy12);
2270 tz = _mm256_mul_pd(fscal,dz12);
2272 /* Update vectorial force */
2273 fix1 = _mm256_add_pd(fix1,tx);
2274 fiy1 = _mm256_add_pd(fiy1,ty);
2275 fiz1 = _mm256_add_pd(fiz1,tz);
2277 fjx2 = _mm256_add_pd(fjx2,tx);
2278 fjy2 = _mm256_add_pd(fjy2,ty);
2279 fjz2 = _mm256_add_pd(fjz2,tz);
2283 /**************************
2284 * CALCULATE INTERACTIONS *
2285 **************************/
2287 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2290 r20 = _mm256_mul_pd(rsq20,rinv20);
2292 /* EWALD ELECTROSTATICS */
2294 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2295 ewrt = _mm256_mul_pd(r20,ewtabscale);
2296 ewitab = _mm256_cvttpd_epi32(ewrt);
2297 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2298 ewitab = _mm_slli_epi32(ewitab,2);
2299 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2300 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2301 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2302 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2303 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2304 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2305 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2306 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
2307 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2309 d = _mm256_sub_pd(r20,rswitch);
2310 d = _mm256_max_pd(d,_mm256_setzero_pd());
2311 d2 = _mm256_mul_pd(d,d);
2312 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2314 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2316 /* Evaluate switch function */
2317 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2318 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
2319 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2323 fscal = _mm256_and_pd(fscal,cutoff_mask);
2325 /* Calculate temporary vectorial force */
2326 tx = _mm256_mul_pd(fscal,dx20);
2327 ty = _mm256_mul_pd(fscal,dy20);
2328 tz = _mm256_mul_pd(fscal,dz20);
2330 /* Update vectorial force */
2331 fix2 = _mm256_add_pd(fix2,tx);
2332 fiy2 = _mm256_add_pd(fiy2,ty);
2333 fiz2 = _mm256_add_pd(fiz2,tz);
2335 fjx0 = _mm256_add_pd(fjx0,tx);
2336 fjy0 = _mm256_add_pd(fjy0,ty);
2337 fjz0 = _mm256_add_pd(fjz0,tz);
2341 /**************************
2342 * CALCULATE INTERACTIONS *
2343 **************************/
2345 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2348 r21 = _mm256_mul_pd(rsq21,rinv21);
2350 /* EWALD ELECTROSTATICS */
2352 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2353 ewrt = _mm256_mul_pd(r21,ewtabscale);
2354 ewitab = _mm256_cvttpd_epi32(ewrt);
2355 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2356 ewitab = _mm_slli_epi32(ewitab,2);
2357 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2358 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2359 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2360 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2361 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2362 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2363 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2364 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
2365 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2367 d = _mm256_sub_pd(r21,rswitch);
2368 d = _mm256_max_pd(d,_mm256_setzero_pd());
2369 d2 = _mm256_mul_pd(d,d);
2370 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2372 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2374 /* Evaluate switch function */
2375 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2376 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
2377 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2381 fscal = _mm256_and_pd(fscal,cutoff_mask);
2383 /* Calculate temporary vectorial force */
2384 tx = _mm256_mul_pd(fscal,dx21);
2385 ty = _mm256_mul_pd(fscal,dy21);
2386 tz = _mm256_mul_pd(fscal,dz21);
2388 /* Update vectorial force */
2389 fix2 = _mm256_add_pd(fix2,tx);
2390 fiy2 = _mm256_add_pd(fiy2,ty);
2391 fiz2 = _mm256_add_pd(fiz2,tz);
2393 fjx1 = _mm256_add_pd(fjx1,tx);
2394 fjy1 = _mm256_add_pd(fjy1,ty);
2395 fjz1 = _mm256_add_pd(fjz1,tz);
2399 /**************************
2400 * CALCULATE INTERACTIONS *
2401 **************************/
2403 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2406 r22 = _mm256_mul_pd(rsq22,rinv22);
2408 /* EWALD ELECTROSTATICS */
2410 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2411 ewrt = _mm256_mul_pd(r22,ewtabscale);
2412 ewitab = _mm256_cvttpd_epi32(ewrt);
2413 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2414 ewitab = _mm_slli_epi32(ewitab,2);
2415 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2416 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2417 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2418 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2419 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2420 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2421 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2422 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
2423 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2425 d = _mm256_sub_pd(r22,rswitch);
2426 d = _mm256_max_pd(d,_mm256_setzero_pd());
2427 d2 = _mm256_mul_pd(d,d);
2428 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2430 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2432 /* Evaluate switch function */
2433 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2434 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
2435 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2439 fscal = _mm256_and_pd(fscal,cutoff_mask);
2441 /* Calculate temporary vectorial force */
2442 tx = _mm256_mul_pd(fscal,dx22);
2443 ty = _mm256_mul_pd(fscal,dy22);
2444 tz = _mm256_mul_pd(fscal,dz22);
2446 /* Update vectorial force */
2447 fix2 = _mm256_add_pd(fix2,tx);
2448 fiy2 = _mm256_add_pd(fiy2,ty);
2449 fiz2 = _mm256_add_pd(fiz2,tz);
2451 fjx2 = _mm256_add_pd(fjx2,tx);
2452 fjy2 = _mm256_add_pd(fjy2,ty);
2453 fjz2 = _mm256_add_pd(fjz2,tz);
2457 fjptrA = f+j_coord_offsetA;
2458 fjptrB = f+j_coord_offsetB;
2459 fjptrC = f+j_coord_offsetC;
2460 fjptrD = f+j_coord_offsetD;
2462 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2463 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2465 /* Inner loop uses 573 flops */
2468 if(jidx<j_index_end)
2471 /* Get j neighbor index, and coordinate index */
2472 jnrlistA = jjnr[jidx];
2473 jnrlistB = jjnr[jidx+1];
2474 jnrlistC = jjnr[jidx+2];
2475 jnrlistD = jjnr[jidx+3];
2476 /* Sign of each element will be negative for non-real atoms.
2477 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2478 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2480 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2482 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2483 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2484 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2486 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2487 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2488 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2489 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2490 j_coord_offsetA = DIM*jnrA;
2491 j_coord_offsetB = DIM*jnrB;
2492 j_coord_offsetC = DIM*jnrC;
2493 j_coord_offsetD = DIM*jnrD;
2495 /* load j atom coordinates */
2496 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2497 x+j_coord_offsetC,x+j_coord_offsetD,
2498 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2500 /* Calculate displacement vector */
2501 dx00 = _mm256_sub_pd(ix0,jx0);
2502 dy00 = _mm256_sub_pd(iy0,jy0);
2503 dz00 = _mm256_sub_pd(iz0,jz0);
2504 dx01 = _mm256_sub_pd(ix0,jx1);
2505 dy01 = _mm256_sub_pd(iy0,jy1);
2506 dz01 = _mm256_sub_pd(iz0,jz1);
2507 dx02 = _mm256_sub_pd(ix0,jx2);
2508 dy02 = _mm256_sub_pd(iy0,jy2);
2509 dz02 = _mm256_sub_pd(iz0,jz2);
2510 dx10 = _mm256_sub_pd(ix1,jx0);
2511 dy10 = _mm256_sub_pd(iy1,jy0);
2512 dz10 = _mm256_sub_pd(iz1,jz0);
2513 dx11 = _mm256_sub_pd(ix1,jx1);
2514 dy11 = _mm256_sub_pd(iy1,jy1);
2515 dz11 = _mm256_sub_pd(iz1,jz1);
2516 dx12 = _mm256_sub_pd(ix1,jx2);
2517 dy12 = _mm256_sub_pd(iy1,jy2);
2518 dz12 = _mm256_sub_pd(iz1,jz2);
2519 dx20 = _mm256_sub_pd(ix2,jx0);
2520 dy20 = _mm256_sub_pd(iy2,jy0);
2521 dz20 = _mm256_sub_pd(iz2,jz0);
2522 dx21 = _mm256_sub_pd(ix2,jx1);
2523 dy21 = _mm256_sub_pd(iy2,jy1);
2524 dz21 = _mm256_sub_pd(iz2,jz1);
2525 dx22 = _mm256_sub_pd(ix2,jx2);
2526 dy22 = _mm256_sub_pd(iy2,jy2);
2527 dz22 = _mm256_sub_pd(iz2,jz2);
2529 /* Calculate squared distance and things based on it */
2530 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2531 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2532 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2533 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2534 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2535 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2536 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2537 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2538 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2540 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2541 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
2542 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
2543 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
2544 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2545 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2546 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
2547 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2548 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2550 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2551 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2552 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2553 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2554 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2555 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2556 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2557 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2558 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2560 fjx0 = _mm256_setzero_pd();
2561 fjy0 = _mm256_setzero_pd();
2562 fjz0 = _mm256_setzero_pd();
2563 fjx1 = _mm256_setzero_pd();
2564 fjy1 = _mm256_setzero_pd();
2565 fjz1 = _mm256_setzero_pd();
2566 fjx2 = _mm256_setzero_pd();
2567 fjy2 = _mm256_setzero_pd();
2568 fjz2 = _mm256_setzero_pd();
2570 /**************************
2571 * CALCULATE INTERACTIONS *
2572 **************************/
2574 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2577 r00 = _mm256_mul_pd(rsq00,rinv00);
2578 r00 = _mm256_andnot_pd(dummy_mask,r00);
2580 /* EWALD ELECTROSTATICS */
2582 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2583 ewrt = _mm256_mul_pd(r00,ewtabscale);
2584 ewitab = _mm256_cvttpd_epi32(ewrt);
2585 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2586 ewitab = _mm_slli_epi32(ewitab,2);
2587 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2588 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2589 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2590 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2591 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2592 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2593 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2594 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
2595 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2597 /* LENNARD-JONES DISPERSION/REPULSION */
2599 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2600 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
2601 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2602 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2603 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2605 d = _mm256_sub_pd(r00,rswitch);
2606 d = _mm256_max_pd(d,_mm256_setzero_pd());
2607 d2 = _mm256_mul_pd(d,d);
2608 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2610 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2612 /* Evaluate switch function */
2613 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2614 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
2615 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2616 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2618 fscal = _mm256_add_pd(felec,fvdw);
2620 fscal = _mm256_and_pd(fscal,cutoff_mask);
2622 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2624 /* Calculate temporary vectorial force */
2625 tx = _mm256_mul_pd(fscal,dx00);
2626 ty = _mm256_mul_pd(fscal,dy00);
2627 tz = _mm256_mul_pd(fscal,dz00);
2629 /* Update vectorial force */
2630 fix0 = _mm256_add_pd(fix0,tx);
2631 fiy0 = _mm256_add_pd(fiy0,ty);
2632 fiz0 = _mm256_add_pd(fiz0,tz);
2634 fjx0 = _mm256_add_pd(fjx0,tx);
2635 fjy0 = _mm256_add_pd(fjy0,ty);
2636 fjz0 = _mm256_add_pd(fjz0,tz);
2640 /**************************
2641 * CALCULATE INTERACTIONS *
2642 **************************/
2644 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2647 r01 = _mm256_mul_pd(rsq01,rinv01);
2648 r01 = _mm256_andnot_pd(dummy_mask,r01);
2650 /* EWALD ELECTROSTATICS */
2652 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2653 ewrt = _mm256_mul_pd(r01,ewtabscale);
2654 ewitab = _mm256_cvttpd_epi32(ewrt);
2655 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2656 ewitab = _mm_slli_epi32(ewitab,2);
2657 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2658 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2659 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2660 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2661 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2662 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2663 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2664 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
2665 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2667 d = _mm256_sub_pd(r01,rswitch);
2668 d = _mm256_max_pd(d,_mm256_setzero_pd());
2669 d2 = _mm256_mul_pd(d,d);
2670 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2672 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2674 /* Evaluate switch function */
2675 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2676 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv01,_mm256_mul_pd(velec,dsw)) );
2677 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2681 fscal = _mm256_and_pd(fscal,cutoff_mask);
2683 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2685 /* Calculate temporary vectorial force */
2686 tx = _mm256_mul_pd(fscal,dx01);
2687 ty = _mm256_mul_pd(fscal,dy01);
2688 tz = _mm256_mul_pd(fscal,dz01);
2690 /* Update vectorial force */
2691 fix0 = _mm256_add_pd(fix0,tx);
2692 fiy0 = _mm256_add_pd(fiy0,ty);
2693 fiz0 = _mm256_add_pd(fiz0,tz);
2695 fjx1 = _mm256_add_pd(fjx1,tx);
2696 fjy1 = _mm256_add_pd(fjy1,ty);
2697 fjz1 = _mm256_add_pd(fjz1,tz);
2701 /**************************
2702 * CALCULATE INTERACTIONS *
2703 **************************/
2705 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2708 r02 = _mm256_mul_pd(rsq02,rinv02);
2709 r02 = _mm256_andnot_pd(dummy_mask,r02);
2711 /* EWALD ELECTROSTATICS */
2713 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2714 ewrt = _mm256_mul_pd(r02,ewtabscale);
2715 ewitab = _mm256_cvttpd_epi32(ewrt);
2716 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2717 ewitab = _mm_slli_epi32(ewitab,2);
2718 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2719 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2720 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2721 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2722 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2723 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2724 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2725 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
2726 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2728 d = _mm256_sub_pd(r02,rswitch);
2729 d = _mm256_max_pd(d,_mm256_setzero_pd());
2730 d2 = _mm256_mul_pd(d,d);
2731 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2733 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2735 /* Evaluate switch function */
2736 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2737 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv02,_mm256_mul_pd(velec,dsw)) );
2738 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2742 fscal = _mm256_and_pd(fscal,cutoff_mask);
2744 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2746 /* Calculate temporary vectorial force */
2747 tx = _mm256_mul_pd(fscal,dx02);
2748 ty = _mm256_mul_pd(fscal,dy02);
2749 tz = _mm256_mul_pd(fscal,dz02);
2751 /* Update vectorial force */
2752 fix0 = _mm256_add_pd(fix0,tx);
2753 fiy0 = _mm256_add_pd(fiy0,ty);
2754 fiz0 = _mm256_add_pd(fiz0,tz);
2756 fjx2 = _mm256_add_pd(fjx2,tx);
2757 fjy2 = _mm256_add_pd(fjy2,ty);
2758 fjz2 = _mm256_add_pd(fjz2,tz);
2762 /**************************
2763 * CALCULATE INTERACTIONS *
2764 **************************/
2766 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2769 r10 = _mm256_mul_pd(rsq10,rinv10);
2770 r10 = _mm256_andnot_pd(dummy_mask,r10);
2772 /* EWALD ELECTROSTATICS */
2774 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2775 ewrt = _mm256_mul_pd(r10,ewtabscale);
2776 ewitab = _mm256_cvttpd_epi32(ewrt);
2777 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2778 ewitab = _mm_slli_epi32(ewitab,2);
2779 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2780 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2781 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2782 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2783 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2784 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2785 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2786 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
2787 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2789 d = _mm256_sub_pd(r10,rswitch);
2790 d = _mm256_max_pd(d,_mm256_setzero_pd());
2791 d2 = _mm256_mul_pd(d,d);
2792 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2794 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2796 /* Evaluate switch function */
2797 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2798 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
2799 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2803 fscal = _mm256_and_pd(fscal,cutoff_mask);
2805 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2807 /* Calculate temporary vectorial force */
2808 tx = _mm256_mul_pd(fscal,dx10);
2809 ty = _mm256_mul_pd(fscal,dy10);
2810 tz = _mm256_mul_pd(fscal,dz10);
2812 /* Update vectorial force */
2813 fix1 = _mm256_add_pd(fix1,tx);
2814 fiy1 = _mm256_add_pd(fiy1,ty);
2815 fiz1 = _mm256_add_pd(fiz1,tz);
2817 fjx0 = _mm256_add_pd(fjx0,tx);
2818 fjy0 = _mm256_add_pd(fjy0,ty);
2819 fjz0 = _mm256_add_pd(fjz0,tz);
2823 /**************************
2824 * CALCULATE INTERACTIONS *
2825 **************************/
2827 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2830 r11 = _mm256_mul_pd(rsq11,rinv11);
2831 r11 = _mm256_andnot_pd(dummy_mask,r11);
2833 /* EWALD ELECTROSTATICS */
2835 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2836 ewrt = _mm256_mul_pd(r11,ewtabscale);
2837 ewitab = _mm256_cvttpd_epi32(ewrt);
2838 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2839 ewitab = _mm_slli_epi32(ewitab,2);
2840 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2841 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2842 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2843 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2844 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2845 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2846 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2847 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
2848 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2850 d = _mm256_sub_pd(r11,rswitch);
2851 d = _mm256_max_pd(d,_mm256_setzero_pd());
2852 d2 = _mm256_mul_pd(d,d);
2853 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2855 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2857 /* Evaluate switch function */
2858 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2859 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv11,_mm256_mul_pd(velec,dsw)) );
2860 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2864 fscal = _mm256_and_pd(fscal,cutoff_mask);
2866 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2868 /* Calculate temporary vectorial force */
2869 tx = _mm256_mul_pd(fscal,dx11);
2870 ty = _mm256_mul_pd(fscal,dy11);
2871 tz = _mm256_mul_pd(fscal,dz11);
2873 /* Update vectorial force */
2874 fix1 = _mm256_add_pd(fix1,tx);
2875 fiy1 = _mm256_add_pd(fiy1,ty);
2876 fiz1 = _mm256_add_pd(fiz1,tz);
2878 fjx1 = _mm256_add_pd(fjx1,tx);
2879 fjy1 = _mm256_add_pd(fjy1,ty);
2880 fjz1 = _mm256_add_pd(fjz1,tz);
2884 /**************************
2885 * CALCULATE INTERACTIONS *
2886 **************************/
2888 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2891 r12 = _mm256_mul_pd(rsq12,rinv12);
2892 r12 = _mm256_andnot_pd(dummy_mask,r12);
2894 /* EWALD ELECTROSTATICS */
2896 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2897 ewrt = _mm256_mul_pd(r12,ewtabscale);
2898 ewitab = _mm256_cvttpd_epi32(ewrt);
2899 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2900 ewitab = _mm_slli_epi32(ewitab,2);
2901 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2902 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2903 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2904 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2905 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2906 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2907 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2908 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
2909 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2911 d = _mm256_sub_pd(r12,rswitch);
2912 d = _mm256_max_pd(d,_mm256_setzero_pd());
2913 d2 = _mm256_mul_pd(d,d);
2914 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2916 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2918 /* Evaluate switch function */
2919 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2920 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv12,_mm256_mul_pd(velec,dsw)) );
2921 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2925 fscal = _mm256_and_pd(fscal,cutoff_mask);
2927 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2929 /* Calculate temporary vectorial force */
2930 tx = _mm256_mul_pd(fscal,dx12);
2931 ty = _mm256_mul_pd(fscal,dy12);
2932 tz = _mm256_mul_pd(fscal,dz12);
2934 /* Update vectorial force */
2935 fix1 = _mm256_add_pd(fix1,tx);
2936 fiy1 = _mm256_add_pd(fiy1,ty);
2937 fiz1 = _mm256_add_pd(fiz1,tz);
2939 fjx2 = _mm256_add_pd(fjx2,tx);
2940 fjy2 = _mm256_add_pd(fjy2,ty);
2941 fjz2 = _mm256_add_pd(fjz2,tz);
2945 /**************************
2946 * CALCULATE INTERACTIONS *
2947 **************************/
2949 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2952 r20 = _mm256_mul_pd(rsq20,rinv20);
2953 r20 = _mm256_andnot_pd(dummy_mask,r20);
2955 /* EWALD ELECTROSTATICS */
2957 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2958 ewrt = _mm256_mul_pd(r20,ewtabscale);
2959 ewitab = _mm256_cvttpd_epi32(ewrt);
2960 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2961 ewitab = _mm_slli_epi32(ewitab,2);
2962 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2963 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2964 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
2965 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
2966 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
2967 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
2968 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
2969 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
2970 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2972 d = _mm256_sub_pd(r20,rswitch);
2973 d = _mm256_max_pd(d,_mm256_setzero_pd());
2974 d2 = _mm256_mul_pd(d,d);
2975 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2977 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2979 /* Evaluate switch function */
2980 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2981 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
2982 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2986 fscal = _mm256_and_pd(fscal,cutoff_mask);
2988 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2990 /* Calculate temporary vectorial force */
2991 tx = _mm256_mul_pd(fscal,dx20);
2992 ty = _mm256_mul_pd(fscal,dy20);
2993 tz = _mm256_mul_pd(fscal,dz20);
2995 /* Update vectorial force */
2996 fix2 = _mm256_add_pd(fix2,tx);
2997 fiy2 = _mm256_add_pd(fiy2,ty);
2998 fiz2 = _mm256_add_pd(fiz2,tz);
3000 fjx0 = _mm256_add_pd(fjx0,tx);
3001 fjy0 = _mm256_add_pd(fjy0,ty);
3002 fjz0 = _mm256_add_pd(fjz0,tz);
3006 /**************************
3007 * CALCULATE INTERACTIONS *
3008 **************************/
3010 if (gmx_mm256_any_lt(rsq21,rcutoff2))
3013 r21 = _mm256_mul_pd(rsq21,rinv21);
3014 r21 = _mm256_andnot_pd(dummy_mask,r21);
3016 /* EWALD ELECTROSTATICS */
3018 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3019 ewrt = _mm256_mul_pd(r21,ewtabscale);
3020 ewitab = _mm256_cvttpd_epi32(ewrt);
3021 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3022 ewitab = _mm_slli_epi32(ewitab,2);
3023 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3024 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3025 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3026 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3027 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3028 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3029 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3030 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
3031 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
3033 d = _mm256_sub_pd(r21,rswitch);
3034 d = _mm256_max_pd(d,_mm256_setzero_pd());
3035 d2 = _mm256_mul_pd(d,d);
3036 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3038 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3040 /* Evaluate switch function */
3041 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3042 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv21,_mm256_mul_pd(velec,dsw)) );
3043 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
3047 fscal = _mm256_and_pd(fscal,cutoff_mask);
3049 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3051 /* Calculate temporary vectorial force */
3052 tx = _mm256_mul_pd(fscal,dx21);
3053 ty = _mm256_mul_pd(fscal,dy21);
3054 tz = _mm256_mul_pd(fscal,dz21);
3056 /* Update vectorial force */
3057 fix2 = _mm256_add_pd(fix2,tx);
3058 fiy2 = _mm256_add_pd(fiy2,ty);
3059 fiz2 = _mm256_add_pd(fiz2,tz);
3061 fjx1 = _mm256_add_pd(fjx1,tx);
3062 fjy1 = _mm256_add_pd(fjy1,ty);
3063 fjz1 = _mm256_add_pd(fjz1,tz);
3067 /**************************
3068 * CALCULATE INTERACTIONS *
3069 **************************/
3071 if (gmx_mm256_any_lt(rsq22,rcutoff2))
3074 r22 = _mm256_mul_pd(rsq22,rinv22);
3075 r22 = _mm256_andnot_pd(dummy_mask,r22);
3077 /* EWALD ELECTROSTATICS */
3079 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3080 ewrt = _mm256_mul_pd(r22,ewtabscale);
3081 ewitab = _mm256_cvttpd_epi32(ewrt);
3082 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
3083 ewitab = _mm_slli_epi32(ewitab,2);
3084 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3085 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
3086 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
3087 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
3088 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
3089 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
3090 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
3091 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
3092 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
3094 d = _mm256_sub_pd(r22,rswitch);
3095 d = _mm256_max_pd(d,_mm256_setzero_pd());
3096 d2 = _mm256_mul_pd(d,d);
3097 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
3099 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
3101 /* Evaluate switch function */
3102 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3103 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv22,_mm256_mul_pd(velec,dsw)) );
3104 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
3108 fscal = _mm256_and_pd(fscal,cutoff_mask);
3110 fscal = _mm256_andnot_pd(dummy_mask,fscal);
3112 /* Calculate temporary vectorial force */
3113 tx = _mm256_mul_pd(fscal,dx22);
3114 ty = _mm256_mul_pd(fscal,dy22);
3115 tz = _mm256_mul_pd(fscal,dz22);
3117 /* Update vectorial force */
3118 fix2 = _mm256_add_pd(fix2,tx);
3119 fiy2 = _mm256_add_pd(fiy2,ty);
3120 fiz2 = _mm256_add_pd(fiz2,tz);
3122 fjx2 = _mm256_add_pd(fjx2,tx);
3123 fjy2 = _mm256_add_pd(fjy2,ty);
3124 fjz2 = _mm256_add_pd(fjz2,tz);
3128 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3129 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3130 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3131 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3133 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
3134 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3136 /* Inner loop uses 582 flops */
3139 /* End of innermost loop */
3141 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3142 f+i_coord_offset,fshift+i_shift_offset);
3144 /* Increment number of inner iterations */
3145 inneriter += j_index_end - j_index_start;
3147 /* Outer loop uses 18 flops */
3150 /* Increment number of outer iterations */
3153 /* Update outer/inner flops */
3155 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);