2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
112 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
114 __m128i ifour = _mm_set1_epi32(4);
115 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
118 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
121 __m256d dummy_mask,cutoff_mask;
122 __m128 tmpmask0,tmpmask1;
123 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
124 __m256d one = _mm256_set1_pd(1.0);
125 __m256d two = _mm256_set1_pd(2.0);
131 jindex = nlist->jindex;
133 shiftidx = nlist->shift;
135 shiftvec = fr->shift_vec[0];
136 fshift = fr->fshift[0];
137 facel = _mm256_set1_pd(fr->epsfac);
138 charge = mdatoms->chargeA;
139 nvdwtype = fr->ntype;
141 vdwtype = mdatoms->typeA;
143 vftab = kernel_data->table_vdw->data;
144 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
146 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
147 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
148 beta2 = _mm256_mul_pd(beta,beta);
149 beta3 = _mm256_mul_pd(beta,beta2);
151 ewtab = fr->ic->tabq_coul_FDV0;
152 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
153 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
155 /* Setup water-specific parameters */
156 inr = nlist->iinr[0];
157 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
158 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
159 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
160 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
162 jq0 = _mm256_set1_pd(charge[inr+0]);
163 jq1 = _mm256_set1_pd(charge[inr+1]);
164 jq2 = _mm256_set1_pd(charge[inr+2]);
165 vdwjidx0A = 2*vdwtype[inr+0];
166 qq00 = _mm256_mul_pd(iq0,jq0);
167 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
168 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
169 qq01 = _mm256_mul_pd(iq0,jq1);
170 qq02 = _mm256_mul_pd(iq0,jq2);
171 qq10 = _mm256_mul_pd(iq1,jq0);
172 qq11 = _mm256_mul_pd(iq1,jq1);
173 qq12 = _mm256_mul_pd(iq1,jq2);
174 qq20 = _mm256_mul_pd(iq2,jq0);
175 qq21 = _mm256_mul_pd(iq2,jq1);
176 qq22 = _mm256_mul_pd(iq2,jq2);
178 /* Avoid stupid compiler warnings */
179 jnrA = jnrB = jnrC = jnrD = 0;
188 for(iidx=0;iidx<4*DIM;iidx++)
193 /* Start outer loop over neighborlists */
194 for(iidx=0; iidx<nri; iidx++)
196 /* Load shift vector for this list */
197 i_shift_offset = DIM*shiftidx[iidx];
199 /* Load limits for loop over neighbors */
200 j_index_start = jindex[iidx];
201 j_index_end = jindex[iidx+1];
203 /* Get outer coordinate index */
205 i_coord_offset = DIM*inr;
207 /* Load i particle coords and add shift vector */
208 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
209 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
211 fix0 = _mm256_setzero_pd();
212 fiy0 = _mm256_setzero_pd();
213 fiz0 = _mm256_setzero_pd();
214 fix1 = _mm256_setzero_pd();
215 fiy1 = _mm256_setzero_pd();
216 fiz1 = _mm256_setzero_pd();
217 fix2 = _mm256_setzero_pd();
218 fiy2 = _mm256_setzero_pd();
219 fiz2 = _mm256_setzero_pd();
221 /* Reset potential sums */
222 velecsum = _mm256_setzero_pd();
223 vvdwsum = _mm256_setzero_pd();
225 /* Start inner kernel loop */
226 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
229 /* Get j neighbor index, and coordinate index */
234 j_coord_offsetA = DIM*jnrA;
235 j_coord_offsetB = DIM*jnrB;
236 j_coord_offsetC = DIM*jnrC;
237 j_coord_offsetD = DIM*jnrD;
239 /* load j atom coordinates */
240 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
241 x+j_coord_offsetC,x+j_coord_offsetD,
242 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
244 /* Calculate displacement vector */
245 dx00 = _mm256_sub_pd(ix0,jx0);
246 dy00 = _mm256_sub_pd(iy0,jy0);
247 dz00 = _mm256_sub_pd(iz0,jz0);
248 dx01 = _mm256_sub_pd(ix0,jx1);
249 dy01 = _mm256_sub_pd(iy0,jy1);
250 dz01 = _mm256_sub_pd(iz0,jz1);
251 dx02 = _mm256_sub_pd(ix0,jx2);
252 dy02 = _mm256_sub_pd(iy0,jy2);
253 dz02 = _mm256_sub_pd(iz0,jz2);
254 dx10 = _mm256_sub_pd(ix1,jx0);
255 dy10 = _mm256_sub_pd(iy1,jy0);
256 dz10 = _mm256_sub_pd(iz1,jz0);
257 dx11 = _mm256_sub_pd(ix1,jx1);
258 dy11 = _mm256_sub_pd(iy1,jy1);
259 dz11 = _mm256_sub_pd(iz1,jz1);
260 dx12 = _mm256_sub_pd(ix1,jx2);
261 dy12 = _mm256_sub_pd(iy1,jy2);
262 dz12 = _mm256_sub_pd(iz1,jz2);
263 dx20 = _mm256_sub_pd(ix2,jx0);
264 dy20 = _mm256_sub_pd(iy2,jy0);
265 dz20 = _mm256_sub_pd(iz2,jz0);
266 dx21 = _mm256_sub_pd(ix2,jx1);
267 dy21 = _mm256_sub_pd(iy2,jy1);
268 dz21 = _mm256_sub_pd(iz2,jz1);
269 dx22 = _mm256_sub_pd(ix2,jx2);
270 dy22 = _mm256_sub_pd(iy2,jy2);
271 dz22 = _mm256_sub_pd(iz2,jz2);
273 /* Calculate squared distance and things based on it */
274 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
275 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
276 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
277 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
278 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
279 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
280 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
281 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
282 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
284 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
285 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
286 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
287 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
288 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
289 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
290 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
291 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
292 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
294 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
295 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
296 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
297 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
298 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
299 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
300 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
301 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
302 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
304 fjx0 = _mm256_setzero_pd();
305 fjy0 = _mm256_setzero_pd();
306 fjz0 = _mm256_setzero_pd();
307 fjx1 = _mm256_setzero_pd();
308 fjy1 = _mm256_setzero_pd();
309 fjz1 = _mm256_setzero_pd();
310 fjx2 = _mm256_setzero_pd();
311 fjy2 = _mm256_setzero_pd();
312 fjz2 = _mm256_setzero_pd();
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 r00 = _mm256_mul_pd(rsq00,rinv00);
320 /* Calculate table index by multiplying r with table scale and truncate to integer */
321 rt = _mm256_mul_pd(r00,vftabscale);
322 vfitab = _mm256_cvttpd_epi32(rt);
323 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
324 vfitab = _mm_slli_epi32(vfitab,3);
326 /* EWALD ELECTROSTATICS */
328 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
329 ewrt = _mm256_mul_pd(r00,ewtabscale);
330 ewitab = _mm256_cvttpd_epi32(ewrt);
331 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
332 ewitab = _mm_slli_epi32(ewitab,2);
333 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
334 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
335 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
336 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
337 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
338 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
339 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
340 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
341 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
343 /* CUBIC SPLINE TABLE DISPERSION */
344 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
345 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
346 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
347 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
348 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
349 Heps = _mm256_mul_pd(vfeps,H);
350 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
351 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
352 vvdw6 = _mm256_mul_pd(c6_00,VV);
353 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
354 fvdw6 = _mm256_mul_pd(c6_00,FF);
356 /* CUBIC SPLINE TABLE REPULSION */
357 vfitab = _mm_add_epi32(vfitab,ifour);
358 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
359 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
360 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
361 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
362 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
363 Heps = _mm256_mul_pd(vfeps,H);
364 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
365 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
366 vvdw12 = _mm256_mul_pd(c12_00,VV);
367 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
368 fvdw12 = _mm256_mul_pd(c12_00,FF);
369 vvdw = _mm256_add_pd(vvdw12,vvdw6);
370 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 velecsum = _mm256_add_pd(velecsum,velec);
374 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
376 fscal = _mm256_add_pd(felec,fvdw);
378 /* Calculate temporary vectorial force */
379 tx = _mm256_mul_pd(fscal,dx00);
380 ty = _mm256_mul_pd(fscal,dy00);
381 tz = _mm256_mul_pd(fscal,dz00);
383 /* Update vectorial force */
384 fix0 = _mm256_add_pd(fix0,tx);
385 fiy0 = _mm256_add_pd(fiy0,ty);
386 fiz0 = _mm256_add_pd(fiz0,tz);
388 fjx0 = _mm256_add_pd(fjx0,tx);
389 fjy0 = _mm256_add_pd(fjy0,ty);
390 fjz0 = _mm256_add_pd(fjz0,tz);
392 /**************************
393 * CALCULATE INTERACTIONS *
394 **************************/
396 r01 = _mm256_mul_pd(rsq01,rinv01);
398 /* EWALD ELECTROSTATICS */
400 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
401 ewrt = _mm256_mul_pd(r01,ewtabscale);
402 ewitab = _mm256_cvttpd_epi32(ewrt);
403 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
404 ewitab = _mm_slli_epi32(ewitab,2);
405 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
406 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
407 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
408 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
409 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
410 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
411 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
412 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
413 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
415 /* Update potential sum for this i atom from the interaction with this j atom. */
416 velecsum = _mm256_add_pd(velecsum,velec);
420 /* Calculate temporary vectorial force */
421 tx = _mm256_mul_pd(fscal,dx01);
422 ty = _mm256_mul_pd(fscal,dy01);
423 tz = _mm256_mul_pd(fscal,dz01);
425 /* Update vectorial force */
426 fix0 = _mm256_add_pd(fix0,tx);
427 fiy0 = _mm256_add_pd(fiy0,ty);
428 fiz0 = _mm256_add_pd(fiz0,tz);
430 fjx1 = _mm256_add_pd(fjx1,tx);
431 fjy1 = _mm256_add_pd(fjy1,ty);
432 fjz1 = _mm256_add_pd(fjz1,tz);
434 /**************************
435 * CALCULATE INTERACTIONS *
436 **************************/
438 r02 = _mm256_mul_pd(rsq02,rinv02);
440 /* EWALD ELECTROSTATICS */
442 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
443 ewrt = _mm256_mul_pd(r02,ewtabscale);
444 ewitab = _mm256_cvttpd_epi32(ewrt);
445 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
446 ewitab = _mm_slli_epi32(ewitab,2);
447 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
448 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
449 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
450 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
451 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
452 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
453 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
454 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
455 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velecsum = _mm256_add_pd(velecsum,velec);
462 /* Calculate temporary vectorial force */
463 tx = _mm256_mul_pd(fscal,dx02);
464 ty = _mm256_mul_pd(fscal,dy02);
465 tz = _mm256_mul_pd(fscal,dz02);
467 /* Update vectorial force */
468 fix0 = _mm256_add_pd(fix0,tx);
469 fiy0 = _mm256_add_pd(fiy0,ty);
470 fiz0 = _mm256_add_pd(fiz0,tz);
472 fjx2 = _mm256_add_pd(fjx2,tx);
473 fjy2 = _mm256_add_pd(fjy2,ty);
474 fjz2 = _mm256_add_pd(fjz2,tz);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 r10 = _mm256_mul_pd(rsq10,rinv10);
482 /* EWALD ELECTROSTATICS */
484 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
485 ewrt = _mm256_mul_pd(r10,ewtabscale);
486 ewitab = _mm256_cvttpd_epi32(ewrt);
487 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
488 ewitab = _mm_slli_epi32(ewitab,2);
489 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
490 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
491 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
492 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
493 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
494 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
495 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
496 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
497 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velecsum = _mm256_add_pd(velecsum,velec);
504 /* Calculate temporary vectorial force */
505 tx = _mm256_mul_pd(fscal,dx10);
506 ty = _mm256_mul_pd(fscal,dy10);
507 tz = _mm256_mul_pd(fscal,dz10);
509 /* Update vectorial force */
510 fix1 = _mm256_add_pd(fix1,tx);
511 fiy1 = _mm256_add_pd(fiy1,ty);
512 fiz1 = _mm256_add_pd(fiz1,tz);
514 fjx0 = _mm256_add_pd(fjx0,tx);
515 fjy0 = _mm256_add_pd(fjy0,ty);
516 fjz0 = _mm256_add_pd(fjz0,tz);
518 /**************************
519 * CALCULATE INTERACTIONS *
520 **************************/
522 r11 = _mm256_mul_pd(rsq11,rinv11);
524 /* EWALD ELECTROSTATICS */
526 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
527 ewrt = _mm256_mul_pd(r11,ewtabscale);
528 ewitab = _mm256_cvttpd_epi32(ewrt);
529 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
530 ewitab = _mm_slli_epi32(ewitab,2);
531 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
532 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
533 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
534 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
535 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
536 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
537 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
538 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
539 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
541 /* Update potential sum for this i atom from the interaction with this j atom. */
542 velecsum = _mm256_add_pd(velecsum,velec);
546 /* Calculate temporary vectorial force */
547 tx = _mm256_mul_pd(fscal,dx11);
548 ty = _mm256_mul_pd(fscal,dy11);
549 tz = _mm256_mul_pd(fscal,dz11);
551 /* Update vectorial force */
552 fix1 = _mm256_add_pd(fix1,tx);
553 fiy1 = _mm256_add_pd(fiy1,ty);
554 fiz1 = _mm256_add_pd(fiz1,tz);
556 fjx1 = _mm256_add_pd(fjx1,tx);
557 fjy1 = _mm256_add_pd(fjy1,ty);
558 fjz1 = _mm256_add_pd(fjz1,tz);
560 /**************************
561 * CALCULATE INTERACTIONS *
562 **************************/
564 r12 = _mm256_mul_pd(rsq12,rinv12);
566 /* EWALD ELECTROSTATICS */
568 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
569 ewrt = _mm256_mul_pd(r12,ewtabscale);
570 ewitab = _mm256_cvttpd_epi32(ewrt);
571 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
572 ewitab = _mm_slli_epi32(ewitab,2);
573 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
574 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
575 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
576 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
577 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
578 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
579 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
580 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
581 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
583 /* Update potential sum for this i atom from the interaction with this j atom. */
584 velecsum = _mm256_add_pd(velecsum,velec);
588 /* Calculate temporary vectorial force */
589 tx = _mm256_mul_pd(fscal,dx12);
590 ty = _mm256_mul_pd(fscal,dy12);
591 tz = _mm256_mul_pd(fscal,dz12);
593 /* Update vectorial force */
594 fix1 = _mm256_add_pd(fix1,tx);
595 fiy1 = _mm256_add_pd(fiy1,ty);
596 fiz1 = _mm256_add_pd(fiz1,tz);
598 fjx2 = _mm256_add_pd(fjx2,tx);
599 fjy2 = _mm256_add_pd(fjy2,ty);
600 fjz2 = _mm256_add_pd(fjz2,tz);
602 /**************************
603 * CALCULATE INTERACTIONS *
604 **************************/
606 r20 = _mm256_mul_pd(rsq20,rinv20);
608 /* EWALD ELECTROSTATICS */
610 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
611 ewrt = _mm256_mul_pd(r20,ewtabscale);
612 ewitab = _mm256_cvttpd_epi32(ewrt);
613 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
614 ewitab = _mm_slli_epi32(ewitab,2);
615 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
616 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
617 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
618 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
619 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
620 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
621 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
622 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
623 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
625 /* Update potential sum for this i atom from the interaction with this j atom. */
626 velecsum = _mm256_add_pd(velecsum,velec);
630 /* Calculate temporary vectorial force */
631 tx = _mm256_mul_pd(fscal,dx20);
632 ty = _mm256_mul_pd(fscal,dy20);
633 tz = _mm256_mul_pd(fscal,dz20);
635 /* Update vectorial force */
636 fix2 = _mm256_add_pd(fix2,tx);
637 fiy2 = _mm256_add_pd(fiy2,ty);
638 fiz2 = _mm256_add_pd(fiz2,tz);
640 fjx0 = _mm256_add_pd(fjx0,tx);
641 fjy0 = _mm256_add_pd(fjy0,ty);
642 fjz0 = _mm256_add_pd(fjz0,tz);
644 /**************************
645 * CALCULATE INTERACTIONS *
646 **************************/
648 r21 = _mm256_mul_pd(rsq21,rinv21);
650 /* EWALD ELECTROSTATICS */
652 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
653 ewrt = _mm256_mul_pd(r21,ewtabscale);
654 ewitab = _mm256_cvttpd_epi32(ewrt);
655 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
656 ewitab = _mm_slli_epi32(ewitab,2);
657 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
658 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
659 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
660 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
661 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
662 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
663 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
664 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
665 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
667 /* Update potential sum for this i atom from the interaction with this j atom. */
668 velecsum = _mm256_add_pd(velecsum,velec);
672 /* Calculate temporary vectorial force */
673 tx = _mm256_mul_pd(fscal,dx21);
674 ty = _mm256_mul_pd(fscal,dy21);
675 tz = _mm256_mul_pd(fscal,dz21);
677 /* Update vectorial force */
678 fix2 = _mm256_add_pd(fix2,tx);
679 fiy2 = _mm256_add_pd(fiy2,ty);
680 fiz2 = _mm256_add_pd(fiz2,tz);
682 fjx1 = _mm256_add_pd(fjx1,tx);
683 fjy1 = _mm256_add_pd(fjy1,ty);
684 fjz1 = _mm256_add_pd(fjz1,tz);
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 r22 = _mm256_mul_pd(rsq22,rinv22);
692 /* EWALD ELECTROSTATICS */
694 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
695 ewrt = _mm256_mul_pd(r22,ewtabscale);
696 ewitab = _mm256_cvttpd_epi32(ewrt);
697 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
698 ewitab = _mm_slli_epi32(ewitab,2);
699 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
700 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
701 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
702 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
703 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
704 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
705 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
706 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
707 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
709 /* Update potential sum for this i atom from the interaction with this j atom. */
710 velecsum = _mm256_add_pd(velecsum,velec);
714 /* Calculate temporary vectorial force */
715 tx = _mm256_mul_pd(fscal,dx22);
716 ty = _mm256_mul_pd(fscal,dy22);
717 tz = _mm256_mul_pd(fscal,dz22);
719 /* Update vectorial force */
720 fix2 = _mm256_add_pd(fix2,tx);
721 fiy2 = _mm256_add_pd(fiy2,ty);
722 fiz2 = _mm256_add_pd(fiz2,tz);
724 fjx2 = _mm256_add_pd(fjx2,tx);
725 fjy2 = _mm256_add_pd(fjy2,ty);
726 fjz2 = _mm256_add_pd(fjz2,tz);
728 fjptrA = f+j_coord_offsetA;
729 fjptrB = f+j_coord_offsetB;
730 fjptrC = f+j_coord_offsetC;
731 fjptrD = f+j_coord_offsetD;
733 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
734 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
736 /* Inner loop uses 403 flops */
742 /* Get j neighbor index, and coordinate index */
743 jnrlistA = jjnr[jidx];
744 jnrlistB = jjnr[jidx+1];
745 jnrlistC = jjnr[jidx+2];
746 jnrlistD = jjnr[jidx+3];
747 /* Sign of each element will be negative for non-real atoms.
748 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
749 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
751 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
753 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
754 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
755 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
757 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
758 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
759 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
760 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
761 j_coord_offsetA = DIM*jnrA;
762 j_coord_offsetB = DIM*jnrB;
763 j_coord_offsetC = DIM*jnrC;
764 j_coord_offsetD = DIM*jnrD;
766 /* load j atom coordinates */
767 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
768 x+j_coord_offsetC,x+j_coord_offsetD,
769 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
771 /* Calculate displacement vector */
772 dx00 = _mm256_sub_pd(ix0,jx0);
773 dy00 = _mm256_sub_pd(iy0,jy0);
774 dz00 = _mm256_sub_pd(iz0,jz0);
775 dx01 = _mm256_sub_pd(ix0,jx1);
776 dy01 = _mm256_sub_pd(iy0,jy1);
777 dz01 = _mm256_sub_pd(iz0,jz1);
778 dx02 = _mm256_sub_pd(ix0,jx2);
779 dy02 = _mm256_sub_pd(iy0,jy2);
780 dz02 = _mm256_sub_pd(iz0,jz2);
781 dx10 = _mm256_sub_pd(ix1,jx0);
782 dy10 = _mm256_sub_pd(iy1,jy0);
783 dz10 = _mm256_sub_pd(iz1,jz0);
784 dx11 = _mm256_sub_pd(ix1,jx1);
785 dy11 = _mm256_sub_pd(iy1,jy1);
786 dz11 = _mm256_sub_pd(iz1,jz1);
787 dx12 = _mm256_sub_pd(ix1,jx2);
788 dy12 = _mm256_sub_pd(iy1,jy2);
789 dz12 = _mm256_sub_pd(iz1,jz2);
790 dx20 = _mm256_sub_pd(ix2,jx0);
791 dy20 = _mm256_sub_pd(iy2,jy0);
792 dz20 = _mm256_sub_pd(iz2,jz0);
793 dx21 = _mm256_sub_pd(ix2,jx1);
794 dy21 = _mm256_sub_pd(iy2,jy1);
795 dz21 = _mm256_sub_pd(iz2,jz1);
796 dx22 = _mm256_sub_pd(ix2,jx2);
797 dy22 = _mm256_sub_pd(iy2,jy2);
798 dz22 = _mm256_sub_pd(iz2,jz2);
800 /* Calculate squared distance and things based on it */
801 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
802 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
803 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
804 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
805 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
806 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
807 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
808 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
809 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
811 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
812 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
813 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
814 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
815 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
816 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
817 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
818 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
819 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
821 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
822 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
823 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
824 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
825 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
826 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
827 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
828 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
829 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
831 fjx0 = _mm256_setzero_pd();
832 fjy0 = _mm256_setzero_pd();
833 fjz0 = _mm256_setzero_pd();
834 fjx1 = _mm256_setzero_pd();
835 fjy1 = _mm256_setzero_pd();
836 fjz1 = _mm256_setzero_pd();
837 fjx2 = _mm256_setzero_pd();
838 fjy2 = _mm256_setzero_pd();
839 fjz2 = _mm256_setzero_pd();
841 /**************************
842 * CALCULATE INTERACTIONS *
843 **************************/
845 r00 = _mm256_mul_pd(rsq00,rinv00);
846 r00 = _mm256_andnot_pd(dummy_mask,r00);
848 /* Calculate table index by multiplying r with table scale and truncate to integer */
849 rt = _mm256_mul_pd(r00,vftabscale);
850 vfitab = _mm256_cvttpd_epi32(rt);
851 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
852 vfitab = _mm_slli_epi32(vfitab,3);
854 /* EWALD ELECTROSTATICS */
856 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
857 ewrt = _mm256_mul_pd(r00,ewtabscale);
858 ewitab = _mm256_cvttpd_epi32(ewrt);
859 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
860 ewitab = _mm_slli_epi32(ewitab,2);
861 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
862 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
863 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
864 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
865 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
866 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
867 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
868 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
869 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
871 /* CUBIC SPLINE TABLE DISPERSION */
872 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
873 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
874 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
875 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
876 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
877 Heps = _mm256_mul_pd(vfeps,H);
878 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
879 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
880 vvdw6 = _mm256_mul_pd(c6_00,VV);
881 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
882 fvdw6 = _mm256_mul_pd(c6_00,FF);
884 /* CUBIC SPLINE TABLE REPULSION */
885 vfitab = _mm_add_epi32(vfitab,ifour);
886 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
887 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
888 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
889 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
890 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
891 Heps = _mm256_mul_pd(vfeps,H);
892 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
893 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
894 vvdw12 = _mm256_mul_pd(c12_00,VV);
895 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
896 fvdw12 = _mm256_mul_pd(c12_00,FF);
897 vvdw = _mm256_add_pd(vvdw12,vvdw6);
898 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec = _mm256_andnot_pd(dummy_mask,velec);
902 velecsum = _mm256_add_pd(velecsum,velec);
903 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
904 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
906 fscal = _mm256_add_pd(felec,fvdw);
908 fscal = _mm256_andnot_pd(dummy_mask,fscal);
910 /* Calculate temporary vectorial force */
911 tx = _mm256_mul_pd(fscal,dx00);
912 ty = _mm256_mul_pd(fscal,dy00);
913 tz = _mm256_mul_pd(fscal,dz00);
915 /* Update vectorial force */
916 fix0 = _mm256_add_pd(fix0,tx);
917 fiy0 = _mm256_add_pd(fiy0,ty);
918 fiz0 = _mm256_add_pd(fiz0,tz);
920 fjx0 = _mm256_add_pd(fjx0,tx);
921 fjy0 = _mm256_add_pd(fjy0,ty);
922 fjz0 = _mm256_add_pd(fjz0,tz);
924 /**************************
925 * CALCULATE INTERACTIONS *
926 **************************/
928 r01 = _mm256_mul_pd(rsq01,rinv01);
929 r01 = _mm256_andnot_pd(dummy_mask,r01);
931 /* EWALD ELECTROSTATICS */
933 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
934 ewrt = _mm256_mul_pd(r01,ewtabscale);
935 ewitab = _mm256_cvttpd_epi32(ewrt);
936 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
937 ewitab = _mm_slli_epi32(ewitab,2);
938 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
939 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
940 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
941 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
942 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
943 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
944 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
945 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
946 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
948 /* Update potential sum for this i atom from the interaction with this j atom. */
949 velec = _mm256_andnot_pd(dummy_mask,velec);
950 velecsum = _mm256_add_pd(velecsum,velec);
954 fscal = _mm256_andnot_pd(dummy_mask,fscal);
956 /* Calculate temporary vectorial force */
957 tx = _mm256_mul_pd(fscal,dx01);
958 ty = _mm256_mul_pd(fscal,dy01);
959 tz = _mm256_mul_pd(fscal,dz01);
961 /* Update vectorial force */
962 fix0 = _mm256_add_pd(fix0,tx);
963 fiy0 = _mm256_add_pd(fiy0,ty);
964 fiz0 = _mm256_add_pd(fiz0,tz);
966 fjx1 = _mm256_add_pd(fjx1,tx);
967 fjy1 = _mm256_add_pd(fjy1,ty);
968 fjz1 = _mm256_add_pd(fjz1,tz);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 r02 = _mm256_mul_pd(rsq02,rinv02);
975 r02 = _mm256_andnot_pd(dummy_mask,r02);
977 /* EWALD ELECTROSTATICS */
979 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
980 ewrt = _mm256_mul_pd(r02,ewtabscale);
981 ewitab = _mm256_cvttpd_epi32(ewrt);
982 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
983 ewitab = _mm_slli_epi32(ewitab,2);
984 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
985 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
986 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
987 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
988 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
989 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
990 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
991 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
992 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
994 /* Update potential sum for this i atom from the interaction with this j atom. */
995 velec = _mm256_andnot_pd(dummy_mask,velec);
996 velecsum = _mm256_add_pd(velecsum,velec);
1000 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1002 /* Calculate temporary vectorial force */
1003 tx = _mm256_mul_pd(fscal,dx02);
1004 ty = _mm256_mul_pd(fscal,dy02);
1005 tz = _mm256_mul_pd(fscal,dz02);
1007 /* Update vectorial force */
1008 fix0 = _mm256_add_pd(fix0,tx);
1009 fiy0 = _mm256_add_pd(fiy0,ty);
1010 fiz0 = _mm256_add_pd(fiz0,tz);
1012 fjx2 = _mm256_add_pd(fjx2,tx);
1013 fjy2 = _mm256_add_pd(fjy2,ty);
1014 fjz2 = _mm256_add_pd(fjz2,tz);
1016 /**************************
1017 * CALCULATE INTERACTIONS *
1018 **************************/
1020 r10 = _mm256_mul_pd(rsq10,rinv10);
1021 r10 = _mm256_andnot_pd(dummy_mask,r10);
1023 /* EWALD ELECTROSTATICS */
1025 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1026 ewrt = _mm256_mul_pd(r10,ewtabscale);
1027 ewitab = _mm256_cvttpd_epi32(ewrt);
1028 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1029 ewitab = _mm_slli_epi32(ewitab,2);
1030 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1031 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1032 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1033 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1034 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1035 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1036 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1037 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1038 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1040 /* Update potential sum for this i atom from the interaction with this j atom. */
1041 velec = _mm256_andnot_pd(dummy_mask,velec);
1042 velecsum = _mm256_add_pd(velecsum,velec);
1046 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1048 /* Calculate temporary vectorial force */
1049 tx = _mm256_mul_pd(fscal,dx10);
1050 ty = _mm256_mul_pd(fscal,dy10);
1051 tz = _mm256_mul_pd(fscal,dz10);
1053 /* Update vectorial force */
1054 fix1 = _mm256_add_pd(fix1,tx);
1055 fiy1 = _mm256_add_pd(fiy1,ty);
1056 fiz1 = _mm256_add_pd(fiz1,tz);
1058 fjx0 = _mm256_add_pd(fjx0,tx);
1059 fjy0 = _mm256_add_pd(fjy0,ty);
1060 fjz0 = _mm256_add_pd(fjz0,tz);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 r11 = _mm256_mul_pd(rsq11,rinv11);
1067 r11 = _mm256_andnot_pd(dummy_mask,r11);
1069 /* EWALD ELECTROSTATICS */
1071 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1072 ewrt = _mm256_mul_pd(r11,ewtabscale);
1073 ewitab = _mm256_cvttpd_epi32(ewrt);
1074 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1075 ewitab = _mm_slli_epi32(ewitab,2);
1076 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1077 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1078 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1079 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1080 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1081 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1082 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1083 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1084 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1086 /* Update potential sum for this i atom from the interaction with this j atom. */
1087 velec = _mm256_andnot_pd(dummy_mask,velec);
1088 velecsum = _mm256_add_pd(velecsum,velec);
1092 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1094 /* Calculate temporary vectorial force */
1095 tx = _mm256_mul_pd(fscal,dx11);
1096 ty = _mm256_mul_pd(fscal,dy11);
1097 tz = _mm256_mul_pd(fscal,dz11);
1099 /* Update vectorial force */
1100 fix1 = _mm256_add_pd(fix1,tx);
1101 fiy1 = _mm256_add_pd(fiy1,ty);
1102 fiz1 = _mm256_add_pd(fiz1,tz);
1104 fjx1 = _mm256_add_pd(fjx1,tx);
1105 fjy1 = _mm256_add_pd(fjy1,ty);
1106 fjz1 = _mm256_add_pd(fjz1,tz);
1108 /**************************
1109 * CALCULATE INTERACTIONS *
1110 **************************/
1112 r12 = _mm256_mul_pd(rsq12,rinv12);
1113 r12 = _mm256_andnot_pd(dummy_mask,r12);
1115 /* EWALD ELECTROSTATICS */
1117 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1118 ewrt = _mm256_mul_pd(r12,ewtabscale);
1119 ewitab = _mm256_cvttpd_epi32(ewrt);
1120 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1121 ewitab = _mm_slli_epi32(ewitab,2);
1122 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1123 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1124 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1125 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1126 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1127 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1128 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1129 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1130 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1132 /* Update potential sum for this i atom from the interaction with this j atom. */
1133 velec = _mm256_andnot_pd(dummy_mask,velec);
1134 velecsum = _mm256_add_pd(velecsum,velec);
1138 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1140 /* Calculate temporary vectorial force */
1141 tx = _mm256_mul_pd(fscal,dx12);
1142 ty = _mm256_mul_pd(fscal,dy12);
1143 tz = _mm256_mul_pd(fscal,dz12);
1145 /* Update vectorial force */
1146 fix1 = _mm256_add_pd(fix1,tx);
1147 fiy1 = _mm256_add_pd(fiy1,ty);
1148 fiz1 = _mm256_add_pd(fiz1,tz);
1150 fjx2 = _mm256_add_pd(fjx2,tx);
1151 fjy2 = _mm256_add_pd(fjy2,ty);
1152 fjz2 = _mm256_add_pd(fjz2,tz);
1154 /**************************
1155 * CALCULATE INTERACTIONS *
1156 **************************/
1158 r20 = _mm256_mul_pd(rsq20,rinv20);
1159 r20 = _mm256_andnot_pd(dummy_mask,r20);
1161 /* EWALD ELECTROSTATICS */
1163 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1164 ewrt = _mm256_mul_pd(r20,ewtabscale);
1165 ewitab = _mm256_cvttpd_epi32(ewrt);
1166 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1167 ewitab = _mm_slli_epi32(ewitab,2);
1168 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1169 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1170 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1171 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1172 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1173 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1174 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1175 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1176 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1178 /* Update potential sum for this i atom from the interaction with this j atom. */
1179 velec = _mm256_andnot_pd(dummy_mask,velec);
1180 velecsum = _mm256_add_pd(velecsum,velec);
1184 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1186 /* Calculate temporary vectorial force */
1187 tx = _mm256_mul_pd(fscal,dx20);
1188 ty = _mm256_mul_pd(fscal,dy20);
1189 tz = _mm256_mul_pd(fscal,dz20);
1191 /* Update vectorial force */
1192 fix2 = _mm256_add_pd(fix2,tx);
1193 fiy2 = _mm256_add_pd(fiy2,ty);
1194 fiz2 = _mm256_add_pd(fiz2,tz);
1196 fjx0 = _mm256_add_pd(fjx0,tx);
1197 fjy0 = _mm256_add_pd(fjy0,ty);
1198 fjz0 = _mm256_add_pd(fjz0,tz);
1200 /**************************
1201 * CALCULATE INTERACTIONS *
1202 **************************/
1204 r21 = _mm256_mul_pd(rsq21,rinv21);
1205 r21 = _mm256_andnot_pd(dummy_mask,r21);
1207 /* EWALD ELECTROSTATICS */
1209 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1210 ewrt = _mm256_mul_pd(r21,ewtabscale);
1211 ewitab = _mm256_cvttpd_epi32(ewrt);
1212 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1213 ewitab = _mm_slli_epi32(ewitab,2);
1214 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1215 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1216 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1217 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1218 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1219 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1220 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1221 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1222 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1224 /* Update potential sum for this i atom from the interaction with this j atom. */
1225 velec = _mm256_andnot_pd(dummy_mask,velec);
1226 velecsum = _mm256_add_pd(velecsum,velec);
1230 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1232 /* Calculate temporary vectorial force */
1233 tx = _mm256_mul_pd(fscal,dx21);
1234 ty = _mm256_mul_pd(fscal,dy21);
1235 tz = _mm256_mul_pd(fscal,dz21);
1237 /* Update vectorial force */
1238 fix2 = _mm256_add_pd(fix2,tx);
1239 fiy2 = _mm256_add_pd(fiy2,ty);
1240 fiz2 = _mm256_add_pd(fiz2,tz);
1242 fjx1 = _mm256_add_pd(fjx1,tx);
1243 fjy1 = _mm256_add_pd(fjy1,ty);
1244 fjz1 = _mm256_add_pd(fjz1,tz);
1246 /**************************
1247 * CALCULATE INTERACTIONS *
1248 **************************/
1250 r22 = _mm256_mul_pd(rsq22,rinv22);
1251 r22 = _mm256_andnot_pd(dummy_mask,r22);
1253 /* EWALD ELECTROSTATICS */
1255 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1256 ewrt = _mm256_mul_pd(r22,ewtabscale);
1257 ewitab = _mm256_cvttpd_epi32(ewrt);
1258 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1259 ewitab = _mm_slli_epi32(ewitab,2);
1260 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1261 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1262 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1263 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1264 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1265 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1266 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1267 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1268 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1270 /* Update potential sum for this i atom from the interaction with this j atom. */
1271 velec = _mm256_andnot_pd(dummy_mask,velec);
1272 velecsum = _mm256_add_pd(velecsum,velec);
1276 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1278 /* Calculate temporary vectorial force */
1279 tx = _mm256_mul_pd(fscal,dx22);
1280 ty = _mm256_mul_pd(fscal,dy22);
1281 tz = _mm256_mul_pd(fscal,dz22);
1283 /* Update vectorial force */
1284 fix2 = _mm256_add_pd(fix2,tx);
1285 fiy2 = _mm256_add_pd(fiy2,ty);
1286 fiz2 = _mm256_add_pd(fiz2,tz);
1288 fjx2 = _mm256_add_pd(fjx2,tx);
1289 fjy2 = _mm256_add_pd(fjy2,ty);
1290 fjz2 = _mm256_add_pd(fjz2,tz);
1292 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1293 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1294 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1295 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1297 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1298 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1300 /* Inner loop uses 412 flops */
1303 /* End of innermost loop */
1305 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1306 f+i_coord_offset,fshift+i_shift_offset);
1309 /* Update potential energies */
1310 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1311 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1313 /* Increment number of inner iterations */
1314 inneriter += j_index_end - j_index_start;
1316 /* Outer loop uses 20 flops */
1319 /* Increment number of outer iterations */
1322 /* Update outer/inner flops */
1324 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*412);
1327 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
1328 * Electrostatics interaction: Ewald
1329 * VdW interaction: CubicSplineTable
1330 * Geometry: Water3-Water3
1331 * Calculate force/pot: Force
1334 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
1335 (t_nblist * gmx_restrict nlist,
1336 rvec * gmx_restrict xx,
1337 rvec * gmx_restrict ff,
1338 t_forcerec * gmx_restrict fr,
1339 t_mdatoms * gmx_restrict mdatoms,
1340 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1341 t_nrnb * gmx_restrict nrnb)
1343 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1344 * just 0 for non-waters.
1345 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1346 * jnr indices corresponding to data put in the four positions in the SIMD register.
1348 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1349 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1350 int jnrA,jnrB,jnrC,jnrD;
1351 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1352 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1353 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1354 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1355 real rcutoff_scalar;
1356 real *shiftvec,*fshift,*x,*f;
1357 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1358 real scratch[4*DIM];
1359 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1360 real * vdwioffsetptr0;
1361 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1362 real * vdwioffsetptr1;
1363 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1364 real * vdwioffsetptr2;
1365 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1366 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1367 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1368 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1369 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1370 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1371 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1372 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1373 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1374 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1375 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1376 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1377 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1378 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1379 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1380 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1381 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1384 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1387 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1388 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1390 __m128i ifour = _mm_set1_epi32(4);
1391 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1394 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1395 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1397 __m256d dummy_mask,cutoff_mask;
1398 __m128 tmpmask0,tmpmask1;
1399 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1400 __m256d one = _mm256_set1_pd(1.0);
1401 __m256d two = _mm256_set1_pd(2.0);
1407 jindex = nlist->jindex;
1409 shiftidx = nlist->shift;
1411 shiftvec = fr->shift_vec[0];
1412 fshift = fr->fshift[0];
1413 facel = _mm256_set1_pd(fr->epsfac);
1414 charge = mdatoms->chargeA;
1415 nvdwtype = fr->ntype;
1416 vdwparam = fr->nbfp;
1417 vdwtype = mdatoms->typeA;
1419 vftab = kernel_data->table_vdw->data;
1420 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
1422 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1423 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1424 beta2 = _mm256_mul_pd(beta,beta);
1425 beta3 = _mm256_mul_pd(beta,beta2);
1427 ewtab = fr->ic->tabq_coul_F;
1428 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1429 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1431 /* Setup water-specific parameters */
1432 inr = nlist->iinr[0];
1433 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1434 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1435 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1436 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1438 jq0 = _mm256_set1_pd(charge[inr+0]);
1439 jq1 = _mm256_set1_pd(charge[inr+1]);
1440 jq2 = _mm256_set1_pd(charge[inr+2]);
1441 vdwjidx0A = 2*vdwtype[inr+0];
1442 qq00 = _mm256_mul_pd(iq0,jq0);
1443 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1444 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1445 qq01 = _mm256_mul_pd(iq0,jq1);
1446 qq02 = _mm256_mul_pd(iq0,jq2);
1447 qq10 = _mm256_mul_pd(iq1,jq0);
1448 qq11 = _mm256_mul_pd(iq1,jq1);
1449 qq12 = _mm256_mul_pd(iq1,jq2);
1450 qq20 = _mm256_mul_pd(iq2,jq0);
1451 qq21 = _mm256_mul_pd(iq2,jq1);
1452 qq22 = _mm256_mul_pd(iq2,jq2);
1454 /* Avoid stupid compiler warnings */
1455 jnrA = jnrB = jnrC = jnrD = 0;
1456 j_coord_offsetA = 0;
1457 j_coord_offsetB = 0;
1458 j_coord_offsetC = 0;
1459 j_coord_offsetD = 0;
1464 for(iidx=0;iidx<4*DIM;iidx++)
1466 scratch[iidx] = 0.0;
1469 /* Start outer loop over neighborlists */
1470 for(iidx=0; iidx<nri; iidx++)
1472 /* Load shift vector for this list */
1473 i_shift_offset = DIM*shiftidx[iidx];
1475 /* Load limits for loop over neighbors */
1476 j_index_start = jindex[iidx];
1477 j_index_end = jindex[iidx+1];
1479 /* Get outer coordinate index */
1481 i_coord_offset = DIM*inr;
1483 /* Load i particle coords and add shift vector */
1484 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1485 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1487 fix0 = _mm256_setzero_pd();
1488 fiy0 = _mm256_setzero_pd();
1489 fiz0 = _mm256_setzero_pd();
1490 fix1 = _mm256_setzero_pd();
1491 fiy1 = _mm256_setzero_pd();
1492 fiz1 = _mm256_setzero_pd();
1493 fix2 = _mm256_setzero_pd();
1494 fiy2 = _mm256_setzero_pd();
1495 fiz2 = _mm256_setzero_pd();
1497 /* Start inner kernel loop */
1498 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1501 /* Get j neighbor index, and coordinate index */
1503 jnrB = jjnr[jidx+1];
1504 jnrC = jjnr[jidx+2];
1505 jnrD = jjnr[jidx+3];
1506 j_coord_offsetA = DIM*jnrA;
1507 j_coord_offsetB = DIM*jnrB;
1508 j_coord_offsetC = DIM*jnrC;
1509 j_coord_offsetD = DIM*jnrD;
1511 /* load j atom coordinates */
1512 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1513 x+j_coord_offsetC,x+j_coord_offsetD,
1514 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1516 /* Calculate displacement vector */
1517 dx00 = _mm256_sub_pd(ix0,jx0);
1518 dy00 = _mm256_sub_pd(iy0,jy0);
1519 dz00 = _mm256_sub_pd(iz0,jz0);
1520 dx01 = _mm256_sub_pd(ix0,jx1);
1521 dy01 = _mm256_sub_pd(iy0,jy1);
1522 dz01 = _mm256_sub_pd(iz0,jz1);
1523 dx02 = _mm256_sub_pd(ix0,jx2);
1524 dy02 = _mm256_sub_pd(iy0,jy2);
1525 dz02 = _mm256_sub_pd(iz0,jz2);
1526 dx10 = _mm256_sub_pd(ix1,jx0);
1527 dy10 = _mm256_sub_pd(iy1,jy0);
1528 dz10 = _mm256_sub_pd(iz1,jz0);
1529 dx11 = _mm256_sub_pd(ix1,jx1);
1530 dy11 = _mm256_sub_pd(iy1,jy1);
1531 dz11 = _mm256_sub_pd(iz1,jz1);
1532 dx12 = _mm256_sub_pd(ix1,jx2);
1533 dy12 = _mm256_sub_pd(iy1,jy2);
1534 dz12 = _mm256_sub_pd(iz1,jz2);
1535 dx20 = _mm256_sub_pd(ix2,jx0);
1536 dy20 = _mm256_sub_pd(iy2,jy0);
1537 dz20 = _mm256_sub_pd(iz2,jz0);
1538 dx21 = _mm256_sub_pd(ix2,jx1);
1539 dy21 = _mm256_sub_pd(iy2,jy1);
1540 dz21 = _mm256_sub_pd(iz2,jz1);
1541 dx22 = _mm256_sub_pd(ix2,jx2);
1542 dy22 = _mm256_sub_pd(iy2,jy2);
1543 dz22 = _mm256_sub_pd(iz2,jz2);
1545 /* Calculate squared distance and things based on it */
1546 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1547 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1548 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1549 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1550 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1551 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1552 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1553 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1554 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1556 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1557 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1558 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1559 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1560 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1561 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1562 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1563 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1564 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1566 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1567 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1568 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1569 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1570 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1571 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1572 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1573 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1574 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1576 fjx0 = _mm256_setzero_pd();
1577 fjy0 = _mm256_setzero_pd();
1578 fjz0 = _mm256_setzero_pd();
1579 fjx1 = _mm256_setzero_pd();
1580 fjy1 = _mm256_setzero_pd();
1581 fjz1 = _mm256_setzero_pd();
1582 fjx2 = _mm256_setzero_pd();
1583 fjy2 = _mm256_setzero_pd();
1584 fjz2 = _mm256_setzero_pd();
1586 /**************************
1587 * CALCULATE INTERACTIONS *
1588 **************************/
1590 r00 = _mm256_mul_pd(rsq00,rinv00);
1592 /* Calculate table index by multiplying r with table scale and truncate to integer */
1593 rt = _mm256_mul_pd(r00,vftabscale);
1594 vfitab = _mm256_cvttpd_epi32(rt);
1595 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1596 vfitab = _mm_slli_epi32(vfitab,3);
1598 /* EWALD ELECTROSTATICS */
1600 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1601 ewrt = _mm256_mul_pd(r00,ewtabscale);
1602 ewitab = _mm256_cvttpd_epi32(ewrt);
1603 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1604 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1605 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1607 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1608 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1610 /* CUBIC SPLINE TABLE DISPERSION */
1611 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1612 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1613 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1614 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1615 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1616 Heps = _mm256_mul_pd(vfeps,H);
1617 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1618 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1619 fvdw6 = _mm256_mul_pd(c6_00,FF);
1621 /* CUBIC SPLINE TABLE REPULSION */
1622 vfitab = _mm_add_epi32(vfitab,ifour);
1623 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1624 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1625 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1626 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1627 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1628 Heps = _mm256_mul_pd(vfeps,H);
1629 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1630 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1631 fvdw12 = _mm256_mul_pd(c12_00,FF);
1632 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1634 fscal = _mm256_add_pd(felec,fvdw);
1636 /* Calculate temporary vectorial force */
1637 tx = _mm256_mul_pd(fscal,dx00);
1638 ty = _mm256_mul_pd(fscal,dy00);
1639 tz = _mm256_mul_pd(fscal,dz00);
1641 /* Update vectorial force */
1642 fix0 = _mm256_add_pd(fix0,tx);
1643 fiy0 = _mm256_add_pd(fiy0,ty);
1644 fiz0 = _mm256_add_pd(fiz0,tz);
1646 fjx0 = _mm256_add_pd(fjx0,tx);
1647 fjy0 = _mm256_add_pd(fjy0,ty);
1648 fjz0 = _mm256_add_pd(fjz0,tz);
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1654 r01 = _mm256_mul_pd(rsq01,rinv01);
1656 /* EWALD ELECTROSTATICS */
1658 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1659 ewrt = _mm256_mul_pd(r01,ewtabscale);
1660 ewitab = _mm256_cvttpd_epi32(ewrt);
1661 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1662 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1663 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1665 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1666 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1670 /* Calculate temporary vectorial force */
1671 tx = _mm256_mul_pd(fscal,dx01);
1672 ty = _mm256_mul_pd(fscal,dy01);
1673 tz = _mm256_mul_pd(fscal,dz01);
1675 /* Update vectorial force */
1676 fix0 = _mm256_add_pd(fix0,tx);
1677 fiy0 = _mm256_add_pd(fiy0,ty);
1678 fiz0 = _mm256_add_pd(fiz0,tz);
1680 fjx1 = _mm256_add_pd(fjx1,tx);
1681 fjy1 = _mm256_add_pd(fjy1,ty);
1682 fjz1 = _mm256_add_pd(fjz1,tz);
1684 /**************************
1685 * CALCULATE INTERACTIONS *
1686 **************************/
1688 r02 = _mm256_mul_pd(rsq02,rinv02);
1690 /* EWALD ELECTROSTATICS */
1692 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1693 ewrt = _mm256_mul_pd(r02,ewtabscale);
1694 ewitab = _mm256_cvttpd_epi32(ewrt);
1695 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1696 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1697 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1699 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1700 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1704 /* Calculate temporary vectorial force */
1705 tx = _mm256_mul_pd(fscal,dx02);
1706 ty = _mm256_mul_pd(fscal,dy02);
1707 tz = _mm256_mul_pd(fscal,dz02);
1709 /* Update vectorial force */
1710 fix0 = _mm256_add_pd(fix0,tx);
1711 fiy0 = _mm256_add_pd(fiy0,ty);
1712 fiz0 = _mm256_add_pd(fiz0,tz);
1714 fjx2 = _mm256_add_pd(fjx2,tx);
1715 fjy2 = _mm256_add_pd(fjy2,ty);
1716 fjz2 = _mm256_add_pd(fjz2,tz);
1718 /**************************
1719 * CALCULATE INTERACTIONS *
1720 **************************/
1722 r10 = _mm256_mul_pd(rsq10,rinv10);
1724 /* EWALD ELECTROSTATICS */
1726 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1727 ewrt = _mm256_mul_pd(r10,ewtabscale);
1728 ewitab = _mm256_cvttpd_epi32(ewrt);
1729 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1730 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1731 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1733 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1734 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1738 /* Calculate temporary vectorial force */
1739 tx = _mm256_mul_pd(fscal,dx10);
1740 ty = _mm256_mul_pd(fscal,dy10);
1741 tz = _mm256_mul_pd(fscal,dz10);
1743 /* Update vectorial force */
1744 fix1 = _mm256_add_pd(fix1,tx);
1745 fiy1 = _mm256_add_pd(fiy1,ty);
1746 fiz1 = _mm256_add_pd(fiz1,tz);
1748 fjx0 = _mm256_add_pd(fjx0,tx);
1749 fjy0 = _mm256_add_pd(fjy0,ty);
1750 fjz0 = _mm256_add_pd(fjz0,tz);
1752 /**************************
1753 * CALCULATE INTERACTIONS *
1754 **************************/
1756 r11 = _mm256_mul_pd(rsq11,rinv11);
1758 /* EWALD ELECTROSTATICS */
1760 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1761 ewrt = _mm256_mul_pd(r11,ewtabscale);
1762 ewitab = _mm256_cvttpd_epi32(ewrt);
1763 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1764 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1765 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1767 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1768 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1772 /* Calculate temporary vectorial force */
1773 tx = _mm256_mul_pd(fscal,dx11);
1774 ty = _mm256_mul_pd(fscal,dy11);
1775 tz = _mm256_mul_pd(fscal,dz11);
1777 /* Update vectorial force */
1778 fix1 = _mm256_add_pd(fix1,tx);
1779 fiy1 = _mm256_add_pd(fiy1,ty);
1780 fiz1 = _mm256_add_pd(fiz1,tz);
1782 fjx1 = _mm256_add_pd(fjx1,tx);
1783 fjy1 = _mm256_add_pd(fjy1,ty);
1784 fjz1 = _mm256_add_pd(fjz1,tz);
1786 /**************************
1787 * CALCULATE INTERACTIONS *
1788 **************************/
1790 r12 = _mm256_mul_pd(rsq12,rinv12);
1792 /* EWALD ELECTROSTATICS */
1794 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1795 ewrt = _mm256_mul_pd(r12,ewtabscale);
1796 ewitab = _mm256_cvttpd_epi32(ewrt);
1797 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1798 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1799 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1801 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1802 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1806 /* Calculate temporary vectorial force */
1807 tx = _mm256_mul_pd(fscal,dx12);
1808 ty = _mm256_mul_pd(fscal,dy12);
1809 tz = _mm256_mul_pd(fscal,dz12);
1811 /* Update vectorial force */
1812 fix1 = _mm256_add_pd(fix1,tx);
1813 fiy1 = _mm256_add_pd(fiy1,ty);
1814 fiz1 = _mm256_add_pd(fiz1,tz);
1816 fjx2 = _mm256_add_pd(fjx2,tx);
1817 fjy2 = _mm256_add_pd(fjy2,ty);
1818 fjz2 = _mm256_add_pd(fjz2,tz);
1820 /**************************
1821 * CALCULATE INTERACTIONS *
1822 **************************/
1824 r20 = _mm256_mul_pd(rsq20,rinv20);
1826 /* EWALD ELECTROSTATICS */
1828 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1829 ewrt = _mm256_mul_pd(r20,ewtabscale);
1830 ewitab = _mm256_cvttpd_epi32(ewrt);
1831 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1832 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1833 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1835 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1836 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1840 /* Calculate temporary vectorial force */
1841 tx = _mm256_mul_pd(fscal,dx20);
1842 ty = _mm256_mul_pd(fscal,dy20);
1843 tz = _mm256_mul_pd(fscal,dz20);
1845 /* Update vectorial force */
1846 fix2 = _mm256_add_pd(fix2,tx);
1847 fiy2 = _mm256_add_pd(fiy2,ty);
1848 fiz2 = _mm256_add_pd(fiz2,tz);
1850 fjx0 = _mm256_add_pd(fjx0,tx);
1851 fjy0 = _mm256_add_pd(fjy0,ty);
1852 fjz0 = _mm256_add_pd(fjz0,tz);
1854 /**************************
1855 * CALCULATE INTERACTIONS *
1856 **************************/
1858 r21 = _mm256_mul_pd(rsq21,rinv21);
1860 /* EWALD ELECTROSTATICS */
1862 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1863 ewrt = _mm256_mul_pd(r21,ewtabscale);
1864 ewitab = _mm256_cvttpd_epi32(ewrt);
1865 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1866 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1867 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1869 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1870 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1874 /* Calculate temporary vectorial force */
1875 tx = _mm256_mul_pd(fscal,dx21);
1876 ty = _mm256_mul_pd(fscal,dy21);
1877 tz = _mm256_mul_pd(fscal,dz21);
1879 /* Update vectorial force */
1880 fix2 = _mm256_add_pd(fix2,tx);
1881 fiy2 = _mm256_add_pd(fiy2,ty);
1882 fiz2 = _mm256_add_pd(fiz2,tz);
1884 fjx1 = _mm256_add_pd(fjx1,tx);
1885 fjy1 = _mm256_add_pd(fjy1,ty);
1886 fjz1 = _mm256_add_pd(fjz1,tz);
1888 /**************************
1889 * CALCULATE INTERACTIONS *
1890 **************************/
1892 r22 = _mm256_mul_pd(rsq22,rinv22);
1894 /* EWALD ELECTROSTATICS */
1896 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1897 ewrt = _mm256_mul_pd(r22,ewtabscale);
1898 ewitab = _mm256_cvttpd_epi32(ewrt);
1899 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1900 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1901 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1903 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1904 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1908 /* Calculate temporary vectorial force */
1909 tx = _mm256_mul_pd(fscal,dx22);
1910 ty = _mm256_mul_pd(fscal,dy22);
1911 tz = _mm256_mul_pd(fscal,dz22);
1913 /* Update vectorial force */
1914 fix2 = _mm256_add_pd(fix2,tx);
1915 fiy2 = _mm256_add_pd(fiy2,ty);
1916 fiz2 = _mm256_add_pd(fiz2,tz);
1918 fjx2 = _mm256_add_pd(fjx2,tx);
1919 fjy2 = _mm256_add_pd(fjy2,ty);
1920 fjz2 = _mm256_add_pd(fjz2,tz);
1922 fjptrA = f+j_coord_offsetA;
1923 fjptrB = f+j_coord_offsetB;
1924 fjptrC = f+j_coord_offsetC;
1925 fjptrD = f+j_coord_offsetD;
1927 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1928 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1930 /* Inner loop uses 350 flops */
1933 if(jidx<j_index_end)
1936 /* Get j neighbor index, and coordinate index */
1937 jnrlistA = jjnr[jidx];
1938 jnrlistB = jjnr[jidx+1];
1939 jnrlistC = jjnr[jidx+2];
1940 jnrlistD = jjnr[jidx+3];
1941 /* Sign of each element will be negative for non-real atoms.
1942 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1943 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1945 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1947 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1948 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1949 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1951 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1952 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1953 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1954 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1955 j_coord_offsetA = DIM*jnrA;
1956 j_coord_offsetB = DIM*jnrB;
1957 j_coord_offsetC = DIM*jnrC;
1958 j_coord_offsetD = DIM*jnrD;
1960 /* load j atom coordinates */
1961 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1962 x+j_coord_offsetC,x+j_coord_offsetD,
1963 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1965 /* Calculate displacement vector */
1966 dx00 = _mm256_sub_pd(ix0,jx0);
1967 dy00 = _mm256_sub_pd(iy0,jy0);
1968 dz00 = _mm256_sub_pd(iz0,jz0);
1969 dx01 = _mm256_sub_pd(ix0,jx1);
1970 dy01 = _mm256_sub_pd(iy0,jy1);
1971 dz01 = _mm256_sub_pd(iz0,jz1);
1972 dx02 = _mm256_sub_pd(ix0,jx2);
1973 dy02 = _mm256_sub_pd(iy0,jy2);
1974 dz02 = _mm256_sub_pd(iz0,jz2);
1975 dx10 = _mm256_sub_pd(ix1,jx0);
1976 dy10 = _mm256_sub_pd(iy1,jy0);
1977 dz10 = _mm256_sub_pd(iz1,jz0);
1978 dx11 = _mm256_sub_pd(ix1,jx1);
1979 dy11 = _mm256_sub_pd(iy1,jy1);
1980 dz11 = _mm256_sub_pd(iz1,jz1);
1981 dx12 = _mm256_sub_pd(ix1,jx2);
1982 dy12 = _mm256_sub_pd(iy1,jy2);
1983 dz12 = _mm256_sub_pd(iz1,jz2);
1984 dx20 = _mm256_sub_pd(ix2,jx0);
1985 dy20 = _mm256_sub_pd(iy2,jy0);
1986 dz20 = _mm256_sub_pd(iz2,jz0);
1987 dx21 = _mm256_sub_pd(ix2,jx1);
1988 dy21 = _mm256_sub_pd(iy2,jy1);
1989 dz21 = _mm256_sub_pd(iz2,jz1);
1990 dx22 = _mm256_sub_pd(ix2,jx2);
1991 dy22 = _mm256_sub_pd(iy2,jy2);
1992 dz22 = _mm256_sub_pd(iz2,jz2);
1994 /* Calculate squared distance and things based on it */
1995 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1996 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1997 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1998 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1999 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2000 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2001 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2002 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2003 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2005 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2006 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
2007 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
2008 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
2009 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2010 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2011 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
2012 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2013 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2015 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2016 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2017 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2018 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2019 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2020 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2021 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2022 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2023 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2025 fjx0 = _mm256_setzero_pd();
2026 fjy0 = _mm256_setzero_pd();
2027 fjz0 = _mm256_setzero_pd();
2028 fjx1 = _mm256_setzero_pd();
2029 fjy1 = _mm256_setzero_pd();
2030 fjz1 = _mm256_setzero_pd();
2031 fjx2 = _mm256_setzero_pd();
2032 fjy2 = _mm256_setzero_pd();
2033 fjz2 = _mm256_setzero_pd();
2035 /**************************
2036 * CALCULATE INTERACTIONS *
2037 **************************/
2039 r00 = _mm256_mul_pd(rsq00,rinv00);
2040 r00 = _mm256_andnot_pd(dummy_mask,r00);
2042 /* Calculate table index by multiplying r with table scale and truncate to integer */
2043 rt = _mm256_mul_pd(r00,vftabscale);
2044 vfitab = _mm256_cvttpd_epi32(rt);
2045 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2046 vfitab = _mm_slli_epi32(vfitab,3);
2048 /* EWALD ELECTROSTATICS */
2050 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2051 ewrt = _mm256_mul_pd(r00,ewtabscale);
2052 ewitab = _mm256_cvttpd_epi32(ewrt);
2053 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2054 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2055 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2057 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2058 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2060 /* CUBIC SPLINE TABLE DISPERSION */
2061 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2062 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2063 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2064 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2065 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2066 Heps = _mm256_mul_pd(vfeps,H);
2067 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2068 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2069 fvdw6 = _mm256_mul_pd(c6_00,FF);
2071 /* CUBIC SPLINE TABLE REPULSION */
2072 vfitab = _mm_add_epi32(vfitab,ifour);
2073 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2074 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2075 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2076 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2077 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2078 Heps = _mm256_mul_pd(vfeps,H);
2079 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2080 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2081 fvdw12 = _mm256_mul_pd(c12_00,FF);
2082 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2084 fscal = _mm256_add_pd(felec,fvdw);
2086 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2088 /* Calculate temporary vectorial force */
2089 tx = _mm256_mul_pd(fscal,dx00);
2090 ty = _mm256_mul_pd(fscal,dy00);
2091 tz = _mm256_mul_pd(fscal,dz00);
2093 /* Update vectorial force */
2094 fix0 = _mm256_add_pd(fix0,tx);
2095 fiy0 = _mm256_add_pd(fiy0,ty);
2096 fiz0 = _mm256_add_pd(fiz0,tz);
2098 fjx0 = _mm256_add_pd(fjx0,tx);
2099 fjy0 = _mm256_add_pd(fjy0,ty);
2100 fjz0 = _mm256_add_pd(fjz0,tz);
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2106 r01 = _mm256_mul_pd(rsq01,rinv01);
2107 r01 = _mm256_andnot_pd(dummy_mask,r01);
2109 /* EWALD ELECTROSTATICS */
2111 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112 ewrt = _mm256_mul_pd(r01,ewtabscale);
2113 ewitab = _mm256_cvttpd_epi32(ewrt);
2114 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2115 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2116 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2118 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2119 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2123 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2125 /* Calculate temporary vectorial force */
2126 tx = _mm256_mul_pd(fscal,dx01);
2127 ty = _mm256_mul_pd(fscal,dy01);
2128 tz = _mm256_mul_pd(fscal,dz01);
2130 /* Update vectorial force */
2131 fix0 = _mm256_add_pd(fix0,tx);
2132 fiy0 = _mm256_add_pd(fiy0,ty);
2133 fiz0 = _mm256_add_pd(fiz0,tz);
2135 fjx1 = _mm256_add_pd(fjx1,tx);
2136 fjy1 = _mm256_add_pd(fjy1,ty);
2137 fjz1 = _mm256_add_pd(fjz1,tz);
2139 /**************************
2140 * CALCULATE INTERACTIONS *
2141 **************************/
2143 r02 = _mm256_mul_pd(rsq02,rinv02);
2144 r02 = _mm256_andnot_pd(dummy_mask,r02);
2146 /* EWALD ELECTROSTATICS */
2148 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2149 ewrt = _mm256_mul_pd(r02,ewtabscale);
2150 ewitab = _mm256_cvttpd_epi32(ewrt);
2151 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2152 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2153 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2155 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2156 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2160 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2162 /* Calculate temporary vectorial force */
2163 tx = _mm256_mul_pd(fscal,dx02);
2164 ty = _mm256_mul_pd(fscal,dy02);
2165 tz = _mm256_mul_pd(fscal,dz02);
2167 /* Update vectorial force */
2168 fix0 = _mm256_add_pd(fix0,tx);
2169 fiy0 = _mm256_add_pd(fiy0,ty);
2170 fiz0 = _mm256_add_pd(fiz0,tz);
2172 fjx2 = _mm256_add_pd(fjx2,tx);
2173 fjy2 = _mm256_add_pd(fjy2,ty);
2174 fjz2 = _mm256_add_pd(fjz2,tz);
2176 /**************************
2177 * CALCULATE INTERACTIONS *
2178 **************************/
2180 r10 = _mm256_mul_pd(rsq10,rinv10);
2181 r10 = _mm256_andnot_pd(dummy_mask,r10);
2183 /* EWALD ELECTROSTATICS */
2185 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2186 ewrt = _mm256_mul_pd(r10,ewtabscale);
2187 ewitab = _mm256_cvttpd_epi32(ewrt);
2188 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2189 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2190 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2192 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2193 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2197 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2199 /* Calculate temporary vectorial force */
2200 tx = _mm256_mul_pd(fscal,dx10);
2201 ty = _mm256_mul_pd(fscal,dy10);
2202 tz = _mm256_mul_pd(fscal,dz10);
2204 /* Update vectorial force */
2205 fix1 = _mm256_add_pd(fix1,tx);
2206 fiy1 = _mm256_add_pd(fiy1,ty);
2207 fiz1 = _mm256_add_pd(fiz1,tz);
2209 fjx0 = _mm256_add_pd(fjx0,tx);
2210 fjy0 = _mm256_add_pd(fjy0,ty);
2211 fjz0 = _mm256_add_pd(fjz0,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 r11 = _mm256_mul_pd(rsq11,rinv11);
2218 r11 = _mm256_andnot_pd(dummy_mask,r11);
2220 /* EWALD ELECTROSTATICS */
2222 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2223 ewrt = _mm256_mul_pd(r11,ewtabscale);
2224 ewitab = _mm256_cvttpd_epi32(ewrt);
2225 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2226 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2227 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2229 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2230 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2234 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2236 /* Calculate temporary vectorial force */
2237 tx = _mm256_mul_pd(fscal,dx11);
2238 ty = _mm256_mul_pd(fscal,dy11);
2239 tz = _mm256_mul_pd(fscal,dz11);
2241 /* Update vectorial force */
2242 fix1 = _mm256_add_pd(fix1,tx);
2243 fiy1 = _mm256_add_pd(fiy1,ty);
2244 fiz1 = _mm256_add_pd(fiz1,tz);
2246 fjx1 = _mm256_add_pd(fjx1,tx);
2247 fjy1 = _mm256_add_pd(fjy1,ty);
2248 fjz1 = _mm256_add_pd(fjz1,tz);
2250 /**************************
2251 * CALCULATE INTERACTIONS *
2252 **************************/
2254 r12 = _mm256_mul_pd(rsq12,rinv12);
2255 r12 = _mm256_andnot_pd(dummy_mask,r12);
2257 /* EWALD ELECTROSTATICS */
2259 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2260 ewrt = _mm256_mul_pd(r12,ewtabscale);
2261 ewitab = _mm256_cvttpd_epi32(ewrt);
2262 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2263 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2264 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2266 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2267 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2271 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2273 /* Calculate temporary vectorial force */
2274 tx = _mm256_mul_pd(fscal,dx12);
2275 ty = _mm256_mul_pd(fscal,dy12);
2276 tz = _mm256_mul_pd(fscal,dz12);
2278 /* Update vectorial force */
2279 fix1 = _mm256_add_pd(fix1,tx);
2280 fiy1 = _mm256_add_pd(fiy1,ty);
2281 fiz1 = _mm256_add_pd(fiz1,tz);
2283 fjx2 = _mm256_add_pd(fjx2,tx);
2284 fjy2 = _mm256_add_pd(fjy2,ty);
2285 fjz2 = _mm256_add_pd(fjz2,tz);
2287 /**************************
2288 * CALCULATE INTERACTIONS *
2289 **************************/
2291 r20 = _mm256_mul_pd(rsq20,rinv20);
2292 r20 = _mm256_andnot_pd(dummy_mask,r20);
2294 /* EWALD ELECTROSTATICS */
2296 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2297 ewrt = _mm256_mul_pd(r20,ewtabscale);
2298 ewitab = _mm256_cvttpd_epi32(ewrt);
2299 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2300 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2301 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2303 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2304 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2308 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2310 /* Calculate temporary vectorial force */
2311 tx = _mm256_mul_pd(fscal,dx20);
2312 ty = _mm256_mul_pd(fscal,dy20);
2313 tz = _mm256_mul_pd(fscal,dz20);
2315 /* Update vectorial force */
2316 fix2 = _mm256_add_pd(fix2,tx);
2317 fiy2 = _mm256_add_pd(fiy2,ty);
2318 fiz2 = _mm256_add_pd(fiz2,tz);
2320 fjx0 = _mm256_add_pd(fjx0,tx);
2321 fjy0 = _mm256_add_pd(fjy0,ty);
2322 fjz0 = _mm256_add_pd(fjz0,tz);
2324 /**************************
2325 * CALCULATE INTERACTIONS *
2326 **************************/
2328 r21 = _mm256_mul_pd(rsq21,rinv21);
2329 r21 = _mm256_andnot_pd(dummy_mask,r21);
2331 /* EWALD ELECTROSTATICS */
2333 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2334 ewrt = _mm256_mul_pd(r21,ewtabscale);
2335 ewitab = _mm256_cvttpd_epi32(ewrt);
2336 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2337 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2338 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2340 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2341 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2345 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2347 /* Calculate temporary vectorial force */
2348 tx = _mm256_mul_pd(fscal,dx21);
2349 ty = _mm256_mul_pd(fscal,dy21);
2350 tz = _mm256_mul_pd(fscal,dz21);
2352 /* Update vectorial force */
2353 fix2 = _mm256_add_pd(fix2,tx);
2354 fiy2 = _mm256_add_pd(fiy2,ty);
2355 fiz2 = _mm256_add_pd(fiz2,tz);
2357 fjx1 = _mm256_add_pd(fjx1,tx);
2358 fjy1 = _mm256_add_pd(fjy1,ty);
2359 fjz1 = _mm256_add_pd(fjz1,tz);
2361 /**************************
2362 * CALCULATE INTERACTIONS *
2363 **************************/
2365 r22 = _mm256_mul_pd(rsq22,rinv22);
2366 r22 = _mm256_andnot_pd(dummy_mask,r22);
2368 /* EWALD ELECTROSTATICS */
2370 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2371 ewrt = _mm256_mul_pd(r22,ewtabscale);
2372 ewitab = _mm256_cvttpd_epi32(ewrt);
2373 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2374 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2375 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2377 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2378 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2382 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2384 /* Calculate temporary vectorial force */
2385 tx = _mm256_mul_pd(fscal,dx22);
2386 ty = _mm256_mul_pd(fscal,dy22);
2387 tz = _mm256_mul_pd(fscal,dz22);
2389 /* Update vectorial force */
2390 fix2 = _mm256_add_pd(fix2,tx);
2391 fiy2 = _mm256_add_pd(fiy2,ty);
2392 fiz2 = _mm256_add_pd(fiz2,tz);
2394 fjx2 = _mm256_add_pd(fjx2,tx);
2395 fjy2 = _mm256_add_pd(fjy2,ty);
2396 fjz2 = _mm256_add_pd(fjz2,tz);
2398 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2399 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2400 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2401 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2403 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2404 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2406 /* Inner loop uses 359 flops */
2409 /* End of innermost loop */
2411 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2412 f+i_coord_offset,fshift+i_shift_offset);
2414 /* Increment number of inner iterations */
2415 inneriter += j_index_end - j_index_start;
2417 /* Outer loop uses 18 flops */
2420 /* Increment number of outer iterations */
2423 /* Update outer/inner flops */
2425 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);