2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
112 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
114 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
115 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
117 __m256d dummy_mask,cutoff_mask;
118 __m128 tmpmask0,tmpmask1;
119 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
120 __m256d one = _mm256_set1_pd(1.0);
121 __m256d two = _mm256_set1_pd(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm256_set1_pd(fr->epsfac);
134 charge = mdatoms->chargeA;
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
140 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
141 beta2 = _mm256_mul_pd(beta,beta);
142 beta3 = _mm256_mul_pd(beta,beta2);
144 ewtab = fr->ic->tabq_coul_FDV0;
145 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
146 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
148 /* Setup water-specific parameters */
149 inr = nlist->iinr[0];
150 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
151 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
152 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
153 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
155 jq0 = _mm256_set1_pd(charge[inr+0]);
156 jq1 = _mm256_set1_pd(charge[inr+1]);
157 jq2 = _mm256_set1_pd(charge[inr+2]);
158 vdwjidx0A = 2*vdwtype[inr+0];
159 qq00 = _mm256_mul_pd(iq0,jq0);
160 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
161 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
162 qq01 = _mm256_mul_pd(iq0,jq1);
163 qq02 = _mm256_mul_pd(iq0,jq2);
164 qq10 = _mm256_mul_pd(iq1,jq0);
165 qq11 = _mm256_mul_pd(iq1,jq1);
166 qq12 = _mm256_mul_pd(iq1,jq2);
167 qq20 = _mm256_mul_pd(iq2,jq0);
168 qq21 = _mm256_mul_pd(iq2,jq1);
169 qq22 = _mm256_mul_pd(iq2,jq2);
171 /* Avoid stupid compiler warnings */
172 jnrA = jnrB = jnrC = jnrD = 0;
181 for(iidx=0;iidx<4*DIM;iidx++)
186 /* Start outer loop over neighborlists */
187 for(iidx=0; iidx<nri; iidx++)
189 /* Load shift vector for this list */
190 i_shift_offset = DIM*shiftidx[iidx];
192 /* Load limits for loop over neighbors */
193 j_index_start = jindex[iidx];
194 j_index_end = jindex[iidx+1];
196 /* Get outer coordinate index */
198 i_coord_offset = DIM*inr;
200 /* Load i particle coords and add shift vector */
201 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
202 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
204 fix0 = _mm256_setzero_pd();
205 fiy0 = _mm256_setzero_pd();
206 fiz0 = _mm256_setzero_pd();
207 fix1 = _mm256_setzero_pd();
208 fiy1 = _mm256_setzero_pd();
209 fiz1 = _mm256_setzero_pd();
210 fix2 = _mm256_setzero_pd();
211 fiy2 = _mm256_setzero_pd();
212 fiz2 = _mm256_setzero_pd();
214 /* Reset potential sums */
215 velecsum = _mm256_setzero_pd();
216 vvdwsum = _mm256_setzero_pd();
218 /* Start inner kernel loop */
219 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
222 /* Get j neighbor index, and coordinate index */
227 j_coord_offsetA = DIM*jnrA;
228 j_coord_offsetB = DIM*jnrB;
229 j_coord_offsetC = DIM*jnrC;
230 j_coord_offsetD = DIM*jnrD;
232 /* load j atom coordinates */
233 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
234 x+j_coord_offsetC,x+j_coord_offsetD,
235 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
237 /* Calculate displacement vector */
238 dx00 = _mm256_sub_pd(ix0,jx0);
239 dy00 = _mm256_sub_pd(iy0,jy0);
240 dz00 = _mm256_sub_pd(iz0,jz0);
241 dx01 = _mm256_sub_pd(ix0,jx1);
242 dy01 = _mm256_sub_pd(iy0,jy1);
243 dz01 = _mm256_sub_pd(iz0,jz1);
244 dx02 = _mm256_sub_pd(ix0,jx2);
245 dy02 = _mm256_sub_pd(iy0,jy2);
246 dz02 = _mm256_sub_pd(iz0,jz2);
247 dx10 = _mm256_sub_pd(ix1,jx0);
248 dy10 = _mm256_sub_pd(iy1,jy0);
249 dz10 = _mm256_sub_pd(iz1,jz0);
250 dx11 = _mm256_sub_pd(ix1,jx1);
251 dy11 = _mm256_sub_pd(iy1,jy1);
252 dz11 = _mm256_sub_pd(iz1,jz1);
253 dx12 = _mm256_sub_pd(ix1,jx2);
254 dy12 = _mm256_sub_pd(iy1,jy2);
255 dz12 = _mm256_sub_pd(iz1,jz2);
256 dx20 = _mm256_sub_pd(ix2,jx0);
257 dy20 = _mm256_sub_pd(iy2,jy0);
258 dz20 = _mm256_sub_pd(iz2,jz0);
259 dx21 = _mm256_sub_pd(ix2,jx1);
260 dy21 = _mm256_sub_pd(iy2,jy1);
261 dz21 = _mm256_sub_pd(iz2,jz1);
262 dx22 = _mm256_sub_pd(ix2,jx2);
263 dy22 = _mm256_sub_pd(iy2,jy2);
264 dz22 = _mm256_sub_pd(iz2,jz2);
266 /* Calculate squared distance and things based on it */
267 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
268 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
269 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
270 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
271 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
272 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
273 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
274 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
275 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
277 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
278 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
279 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
280 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
281 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
282 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
283 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
284 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
285 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
287 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
288 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
289 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
290 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
291 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
292 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
293 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
294 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
295 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
297 fjx0 = _mm256_setzero_pd();
298 fjy0 = _mm256_setzero_pd();
299 fjz0 = _mm256_setzero_pd();
300 fjx1 = _mm256_setzero_pd();
301 fjy1 = _mm256_setzero_pd();
302 fjz1 = _mm256_setzero_pd();
303 fjx2 = _mm256_setzero_pd();
304 fjy2 = _mm256_setzero_pd();
305 fjz2 = _mm256_setzero_pd();
307 /**************************
308 * CALCULATE INTERACTIONS *
309 **************************/
311 r00 = _mm256_mul_pd(rsq00,rinv00);
313 /* EWALD ELECTROSTATICS */
315 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
316 ewrt = _mm256_mul_pd(r00,ewtabscale);
317 ewitab = _mm256_cvttpd_epi32(ewrt);
318 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
319 ewitab = _mm_slli_epi32(ewitab,2);
320 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
321 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
322 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
323 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
324 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
325 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
326 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
327 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
328 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
330 /* LENNARD-JONES DISPERSION/REPULSION */
332 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
333 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
334 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
335 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
336 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
338 /* Update potential sum for this i atom from the interaction with this j atom. */
339 velecsum = _mm256_add_pd(velecsum,velec);
340 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
342 fscal = _mm256_add_pd(felec,fvdw);
344 /* Calculate temporary vectorial force */
345 tx = _mm256_mul_pd(fscal,dx00);
346 ty = _mm256_mul_pd(fscal,dy00);
347 tz = _mm256_mul_pd(fscal,dz00);
349 /* Update vectorial force */
350 fix0 = _mm256_add_pd(fix0,tx);
351 fiy0 = _mm256_add_pd(fiy0,ty);
352 fiz0 = _mm256_add_pd(fiz0,tz);
354 fjx0 = _mm256_add_pd(fjx0,tx);
355 fjy0 = _mm256_add_pd(fjy0,ty);
356 fjz0 = _mm256_add_pd(fjz0,tz);
358 /**************************
359 * CALCULATE INTERACTIONS *
360 **************************/
362 r01 = _mm256_mul_pd(rsq01,rinv01);
364 /* EWALD ELECTROSTATICS */
366 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
367 ewrt = _mm256_mul_pd(r01,ewtabscale);
368 ewitab = _mm256_cvttpd_epi32(ewrt);
369 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
370 ewitab = _mm_slli_epi32(ewitab,2);
371 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
372 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
373 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
374 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
375 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
376 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
377 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
378 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
379 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
381 /* Update potential sum for this i atom from the interaction with this j atom. */
382 velecsum = _mm256_add_pd(velecsum,velec);
386 /* Calculate temporary vectorial force */
387 tx = _mm256_mul_pd(fscal,dx01);
388 ty = _mm256_mul_pd(fscal,dy01);
389 tz = _mm256_mul_pd(fscal,dz01);
391 /* Update vectorial force */
392 fix0 = _mm256_add_pd(fix0,tx);
393 fiy0 = _mm256_add_pd(fiy0,ty);
394 fiz0 = _mm256_add_pd(fiz0,tz);
396 fjx1 = _mm256_add_pd(fjx1,tx);
397 fjy1 = _mm256_add_pd(fjy1,ty);
398 fjz1 = _mm256_add_pd(fjz1,tz);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 r02 = _mm256_mul_pd(rsq02,rinv02);
406 /* EWALD ELECTROSTATICS */
408 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
409 ewrt = _mm256_mul_pd(r02,ewtabscale);
410 ewitab = _mm256_cvttpd_epi32(ewrt);
411 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
412 ewitab = _mm_slli_epi32(ewitab,2);
413 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
414 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
415 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
416 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
417 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
418 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
419 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
420 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
421 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velecsum = _mm256_add_pd(velecsum,velec);
428 /* Calculate temporary vectorial force */
429 tx = _mm256_mul_pd(fscal,dx02);
430 ty = _mm256_mul_pd(fscal,dy02);
431 tz = _mm256_mul_pd(fscal,dz02);
433 /* Update vectorial force */
434 fix0 = _mm256_add_pd(fix0,tx);
435 fiy0 = _mm256_add_pd(fiy0,ty);
436 fiz0 = _mm256_add_pd(fiz0,tz);
438 fjx2 = _mm256_add_pd(fjx2,tx);
439 fjy2 = _mm256_add_pd(fjy2,ty);
440 fjz2 = _mm256_add_pd(fjz2,tz);
442 /**************************
443 * CALCULATE INTERACTIONS *
444 **************************/
446 r10 = _mm256_mul_pd(rsq10,rinv10);
448 /* EWALD ELECTROSTATICS */
450 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
451 ewrt = _mm256_mul_pd(r10,ewtabscale);
452 ewitab = _mm256_cvttpd_epi32(ewrt);
453 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
454 ewitab = _mm_slli_epi32(ewitab,2);
455 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
456 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
457 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
458 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
459 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
460 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
461 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
462 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
463 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
465 /* Update potential sum for this i atom from the interaction with this j atom. */
466 velecsum = _mm256_add_pd(velecsum,velec);
470 /* Calculate temporary vectorial force */
471 tx = _mm256_mul_pd(fscal,dx10);
472 ty = _mm256_mul_pd(fscal,dy10);
473 tz = _mm256_mul_pd(fscal,dz10);
475 /* Update vectorial force */
476 fix1 = _mm256_add_pd(fix1,tx);
477 fiy1 = _mm256_add_pd(fiy1,ty);
478 fiz1 = _mm256_add_pd(fiz1,tz);
480 fjx0 = _mm256_add_pd(fjx0,tx);
481 fjy0 = _mm256_add_pd(fjy0,ty);
482 fjz0 = _mm256_add_pd(fjz0,tz);
484 /**************************
485 * CALCULATE INTERACTIONS *
486 **************************/
488 r11 = _mm256_mul_pd(rsq11,rinv11);
490 /* EWALD ELECTROSTATICS */
492 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
493 ewrt = _mm256_mul_pd(r11,ewtabscale);
494 ewitab = _mm256_cvttpd_epi32(ewrt);
495 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
496 ewitab = _mm_slli_epi32(ewitab,2);
497 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
498 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
499 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
500 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
501 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
502 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
503 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
504 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
505 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
507 /* Update potential sum for this i atom from the interaction with this j atom. */
508 velecsum = _mm256_add_pd(velecsum,velec);
512 /* Calculate temporary vectorial force */
513 tx = _mm256_mul_pd(fscal,dx11);
514 ty = _mm256_mul_pd(fscal,dy11);
515 tz = _mm256_mul_pd(fscal,dz11);
517 /* Update vectorial force */
518 fix1 = _mm256_add_pd(fix1,tx);
519 fiy1 = _mm256_add_pd(fiy1,ty);
520 fiz1 = _mm256_add_pd(fiz1,tz);
522 fjx1 = _mm256_add_pd(fjx1,tx);
523 fjy1 = _mm256_add_pd(fjy1,ty);
524 fjz1 = _mm256_add_pd(fjz1,tz);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 r12 = _mm256_mul_pd(rsq12,rinv12);
532 /* EWALD ELECTROSTATICS */
534 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
535 ewrt = _mm256_mul_pd(r12,ewtabscale);
536 ewitab = _mm256_cvttpd_epi32(ewrt);
537 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
538 ewitab = _mm_slli_epi32(ewitab,2);
539 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
540 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
541 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
542 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
543 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
544 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
545 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
546 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
547 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
549 /* Update potential sum for this i atom from the interaction with this j atom. */
550 velecsum = _mm256_add_pd(velecsum,velec);
554 /* Calculate temporary vectorial force */
555 tx = _mm256_mul_pd(fscal,dx12);
556 ty = _mm256_mul_pd(fscal,dy12);
557 tz = _mm256_mul_pd(fscal,dz12);
559 /* Update vectorial force */
560 fix1 = _mm256_add_pd(fix1,tx);
561 fiy1 = _mm256_add_pd(fiy1,ty);
562 fiz1 = _mm256_add_pd(fiz1,tz);
564 fjx2 = _mm256_add_pd(fjx2,tx);
565 fjy2 = _mm256_add_pd(fjy2,ty);
566 fjz2 = _mm256_add_pd(fjz2,tz);
568 /**************************
569 * CALCULATE INTERACTIONS *
570 **************************/
572 r20 = _mm256_mul_pd(rsq20,rinv20);
574 /* EWALD ELECTROSTATICS */
576 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
577 ewrt = _mm256_mul_pd(r20,ewtabscale);
578 ewitab = _mm256_cvttpd_epi32(ewrt);
579 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
580 ewitab = _mm_slli_epi32(ewitab,2);
581 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
582 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
583 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
584 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
585 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
586 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
587 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
588 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
589 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velecsum = _mm256_add_pd(velecsum,velec);
596 /* Calculate temporary vectorial force */
597 tx = _mm256_mul_pd(fscal,dx20);
598 ty = _mm256_mul_pd(fscal,dy20);
599 tz = _mm256_mul_pd(fscal,dz20);
601 /* Update vectorial force */
602 fix2 = _mm256_add_pd(fix2,tx);
603 fiy2 = _mm256_add_pd(fiy2,ty);
604 fiz2 = _mm256_add_pd(fiz2,tz);
606 fjx0 = _mm256_add_pd(fjx0,tx);
607 fjy0 = _mm256_add_pd(fjy0,ty);
608 fjz0 = _mm256_add_pd(fjz0,tz);
610 /**************************
611 * CALCULATE INTERACTIONS *
612 **************************/
614 r21 = _mm256_mul_pd(rsq21,rinv21);
616 /* EWALD ELECTROSTATICS */
618 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
619 ewrt = _mm256_mul_pd(r21,ewtabscale);
620 ewitab = _mm256_cvttpd_epi32(ewrt);
621 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
622 ewitab = _mm_slli_epi32(ewitab,2);
623 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
624 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
625 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
626 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
627 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
628 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
629 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
630 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
631 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
633 /* Update potential sum for this i atom from the interaction with this j atom. */
634 velecsum = _mm256_add_pd(velecsum,velec);
638 /* Calculate temporary vectorial force */
639 tx = _mm256_mul_pd(fscal,dx21);
640 ty = _mm256_mul_pd(fscal,dy21);
641 tz = _mm256_mul_pd(fscal,dz21);
643 /* Update vectorial force */
644 fix2 = _mm256_add_pd(fix2,tx);
645 fiy2 = _mm256_add_pd(fiy2,ty);
646 fiz2 = _mm256_add_pd(fiz2,tz);
648 fjx1 = _mm256_add_pd(fjx1,tx);
649 fjy1 = _mm256_add_pd(fjy1,ty);
650 fjz1 = _mm256_add_pd(fjz1,tz);
652 /**************************
653 * CALCULATE INTERACTIONS *
654 **************************/
656 r22 = _mm256_mul_pd(rsq22,rinv22);
658 /* EWALD ELECTROSTATICS */
660 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
661 ewrt = _mm256_mul_pd(r22,ewtabscale);
662 ewitab = _mm256_cvttpd_epi32(ewrt);
663 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
664 ewitab = _mm_slli_epi32(ewitab,2);
665 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
666 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
667 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
668 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
669 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
670 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
671 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
672 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
673 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
675 /* Update potential sum for this i atom from the interaction with this j atom. */
676 velecsum = _mm256_add_pd(velecsum,velec);
680 /* Calculate temporary vectorial force */
681 tx = _mm256_mul_pd(fscal,dx22);
682 ty = _mm256_mul_pd(fscal,dy22);
683 tz = _mm256_mul_pd(fscal,dz22);
685 /* Update vectorial force */
686 fix2 = _mm256_add_pd(fix2,tx);
687 fiy2 = _mm256_add_pd(fiy2,ty);
688 fiz2 = _mm256_add_pd(fiz2,tz);
690 fjx2 = _mm256_add_pd(fjx2,tx);
691 fjy2 = _mm256_add_pd(fjy2,ty);
692 fjz2 = _mm256_add_pd(fjz2,tz);
694 fjptrA = f+j_coord_offsetA;
695 fjptrB = f+j_coord_offsetB;
696 fjptrC = f+j_coord_offsetC;
697 fjptrD = f+j_coord_offsetD;
699 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
700 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
702 /* Inner loop uses 381 flops */
708 /* Get j neighbor index, and coordinate index */
709 jnrlistA = jjnr[jidx];
710 jnrlistB = jjnr[jidx+1];
711 jnrlistC = jjnr[jidx+2];
712 jnrlistD = jjnr[jidx+3];
713 /* Sign of each element will be negative for non-real atoms.
714 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
715 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
717 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
719 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
720 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
721 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
723 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
724 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
725 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
726 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
727 j_coord_offsetA = DIM*jnrA;
728 j_coord_offsetB = DIM*jnrB;
729 j_coord_offsetC = DIM*jnrC;
730 j_coord_offsetD = DIM*jnrD;
732 /* load j atom coordinates */
733 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
734 x+j_coord_offsetC,x+j_coord_offsetD,
735 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
737 /* Calculate displacement vector */
738 dx00 = _mm256_sub_pd(ix0,jx0);
739 dy00 = _mm256_sub_pd(iy0,jy0);
740 dz00 = _mm256_sub_pd(iz0,jz0);
741 dx01 = _mm256_sub_pd(ix0,jx1);
742 dy01 = _mm256_sub_pd(iy0,jy1);
743 dz01 = _mm256_sub_pd(iz0,jz1);
744 dx02 = _mm256_sub_pd(ix0,jx2);
745 dy02 = _mm256_sub_pd(iy0,jy2);
746 dz02 = _mm256_sub_pd(iz0,jz2);
747 dx10 = _mm256_sub_pd(ix1,jx0);
748 dy10 = _mm256_sub_pd(iy1,jy0);
749 dz10 = _mm256_sub_pd(iz1,jz0);
750 dx11 = _mm256_sub_pd(ix1,jx1);
751 dy11 = _mm256_sub_pd(iy1,jy1);
752 dz11 = _mm256_sub_pd(iz1,jz1);
753 dx12 = _mm256_sub_pd(ix1,jx2);
754 dy12 = _mm256_sub_pd(iy1,jy2);
755 dz12 = _mm256_sub_pd(iz1,jz2);
756 dx20 = _mm256_sub_pd(ix2,jx0);
757 dy20 = _mm256_sub_pd(iy2,jy0);
758 dz20 = _mm256_sub_pd(iz2,jz0);
759 dx21 = _mm256_sub_pd(ix2,jx1);
760 dy21 = _mm256_sub_pd(iy2,jy1);
761 dz21 = _mm256_sub_pd(iz2,jz1);
762 dx22 = _mm256_sub_pd(ix2,jx2);
763 dy22 = _mm256_sub_pd(iy2,jy2);
764 dz22 = _mm256_sub_pd(iz2,jz2);
766 /* Calculate squared distance and things based on it */
767 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
768 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
769 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
770 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
771 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
772 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
773 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
774 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
775 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
777 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
778 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
779 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
780 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
781 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
782 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
783 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
784 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
785 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
787 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
788 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
789 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
790 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
791 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
792 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
793 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
794 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
795 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
797 fjx0 = _mm256_setzero_pd();
798 fjy0 = _mm256_setzero_pd();
799 fjz0 = _mm256_setzero_pd();
800 fjx1 = _mm256_setzero_pd();
801 fjy1 = _mm256_setzero_pd();
802 fjz1 = _mm256_setzero_pd();
803 fjx2 = _mm256_setzero_pd();
804 fjy2 = _mm256_setzero_pd();
805 fjz2 = _mm256_setzero_pd();
807 /**************************
808 * CALCULATE INTERACTIONS *
809 **************************/
811 r00 = _mm256_mul_pd(rsq00,rinv00);
812 r00 = _mm256_andnot_pd(dummy_mask,r00);
814 /* EWALD ELECTROSTATICS */
816 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
817 ewrt = _mm256_mul_pd(r00,ewtabscale);
818 ewitab = _mm256_cvttpd_epi32(ewrt);
819 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
820 ewitab = _mm_slli_epi32(ewitab,2);
821 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
822 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
823 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
824 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
825 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
826 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
827 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
828 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
829 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
831 /* LENNARD-JONES DISPERSION/REPULSION */
833 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
834 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
835 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
836 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
837 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
839 /* Update potential sum for this i atom from the interaction with this j atom. */
840 velec = _mm256_andnot_pd(dummy_mask,velec);
841 velecsum = _mm256_add_pd(velecsum,velec);
842 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
843 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
845 fscal = _mm256_add_pd(felec,fvdw);
847 fscal = _mm256_andnot_pd(dummy_mask,fscal);
849 /* Calculate temporary vectorial force */
850 tx = _mm256_mul_pd(fscal,dx00);
851 ty = _mm256_mul_pd(fscal,dy00);
852 tz = _mm256_mul_pd(fscal,dz00);
854 /* Update vectorial force */
855 fix0 = _mm256_add_pd(fix0,tx);
856 fiy0 = _mm256_add_pd(fiy0,ty);
857 fiz0 = _mm256_add_pd(fiz0,tz);
859 fjx0 = _mm256_add_pd(fjx0,tx);
860 fjy0 = _mm256_add_pd(fjy0,ty);
861 fjz0 = _mm256_add_pd(fjz0,tz);
863 /**************************
864 * CALCULATE INTERACTIONS *
865 **************************/
867 r01 = _mm256_mul_pd(rsq01,rinv01);
868 r01 = _mm256_andnot_pd(dummy_mask,r01);
870 /* EWALD ELECTROSTATICS */
872 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
873 ewrt = _mm256_mul_pd(r01,ewtabscale);
874 ewitab = _mm256_cvttpd_epi32(ewrt);
875 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
876 ewitab = _mm_slli_epi32(ewitab,2);
877 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
878 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
879 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
880 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
881 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
882 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
883 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
884 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
885 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
887 /* Update potential sum for this i atom from the interaction with this j atom. */
888 velec = _mm256_andnot_pd(dummy_mask,velec);
889 velecsum = _mm256_add_pd(velecsum,velec);
893 fscal = _mm256_andnot_pd(dummy_mask,fscal);
895 /* Calculate temporary vectorial force */
896 tx = _mm256_mul_pd(fscal,dx01);
897 ty = _mm256_mul_pd(fscal,dy01);
898 tz = _mm256_mul_pd(fscal,dz01);
900 /* Update vectorial force */
901 fix0 = _mm256_add_pd(fix0,tx);
902 fiy0 = _mm256_add_pd(fiy0,ty);
903 fiz0 = _mm256_add_pd(fiz0,tz);
905 fjx1 = _mm256_add_pd(fjx1,tx);
906 fjy1 = _mm256_add_pd(fjy1,ty);
907 fjz1 = _mm256_add_pd(fjz1,tz);
909 /**************************
910 * CALCULATE INTERACTIONS *
911 **************************/
913 r02 = _mm256_mul_pd(rsq02,rinv02);
914 r02 = _mm256_andnot_pd(dummy_mask,r02);
916 /* EWALD ELECTROSTATICS */
918 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
919 ewrt = _mm256_mul_pd(r02,ewtabscale);
920 ewitab = _mm256_cvttpd_epi32(ewrt);
921 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
922 ewitab = _mm_slli_epi32(ewitab,2);
923 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
924 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
925 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
926 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
927 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
928 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
929 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
930 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
931 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
933 /* Update potential sum for this i atom from the interaction with this j atom. */
934 velec = _mm256_andnot_pd(dummy_mask,velec);
935 velecsum = _mm256_add_pd(velecsum,velec);
939 fscal = _mm256_andnot_pd(dummy_mask,fscal);
941 /* Calculate temporary vectorial force */
942 tx = _mm256_mul_pd(fscal,dx02);
943 ty = _mm256_mul_pd(fscal,dy02);
944 tz = _mm256_mul_pd(fscal,dz02);
946 /* Update vectorial force */
947 fix0 = _mm256_add_pd(fix0,tx);
948 fiy0 = _mm256_add_pd(fiy0,ty);
949 fiz0 = _mm256_add_pd(fiz0,tz);
951 fjx2 = _mm256_add_pd(fjx2,tx);
952 fjy2 = _mm256_add_pd(fjy2,ty);
953 fjz2 = _mm256_add_pd(fjz2,tz);
955 /**************************
956 * CALCULATE INTERACTIONS *
957 **************************/
959 r10 = _mm256_mul_pd(rsq10,rinv10);
960 r10 = _mm256_andnot_pd(dummy_mask,r10);
962 /* EWALD ELECTROSTATICS */
964 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
965 ewrt = _mm256_mul_pd(r10,ewtabscale);
966 ewitab = _mm256_cvttpd_epi32(ewrt);
967 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
968 ewitab = _mm_slli_epi32(ewitab,2);
969 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
970 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
971 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
972 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
973 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
974 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
975 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
976 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
977 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 velec = _mm256_andnot_pd(dummy_mask,velec);
981 velecsum = _mm256_add_pd(velecsum,velec);
985 fscal = _mm256_andnot_pd(dummy_mask,fscal);
987 /* Calculate temporary vectorial force */
988 tx = _mm256_mul_pd(fscal,dx10);
989 ty = _mm256_mul_pd(fscal,dy10);
990 tz = _mm256_mul_pd(fscal,dz10);
992 /* Update vectorial force */
993 fix1 = _mm256_add_pd(fix1,tx);
994 fiy1 = _mm256_add_pd(fiy1,ty);
995 fiz1 = _mm256_add_pd(fiz1,tz);
997 fjx0 = _mm256_add_pd(fjx0,tx);
998 fjy0 = _mm256_add_pd(fjy0,ty);
999 fjz0 = _mm256_add_pd(fjz0,tz);
1001 /**************************
1002 * CALCULATE INTERACTIONS *
1003 **************************/
1005 r11 = _mm256_mul_pd(rsq11,rinv11);
1006 r11 = _mm256_andnot_pd(dummy_mask,r11);
1008 /* EWALD ELECTROSTATICS */
1010 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1011 ewrt = _mm256_mul_pd(r11,ewtabscale);
1012 ewitab = _mm256_cvttpd_epi32(ewrt);
1013 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1014 ewitab = _mm_slli_epi32(ewitab,2);
1015 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1016 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1017 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1018 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1019 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1020 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1021 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1022 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1023 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec = _mm256_andnot_pd(dummy_mask,velec);
1027 velecsum = _mm256_add_pd(velecsum,velec);
1031 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1033 /* Calculate temporary vectorial force */
1034 tx = _mm256_mul_pd(fscal,dx11);
1035 ty = _mm256_mul_pd(fscal,dy11);
1036 tz = _mm256_mul_pd(fscal,dz11);
1038 /* Update vectorial force */
1039 fix1 = _mm256_add_pd(fix1,tx);
1040 fiy1 = _mm256_add_pd(fiy1,ty);
1041 fiz1 = _mm256_add_pd(fiz1,tz);
1043 fjx1 = _mm256_add_pd(fjx1,tx);
1044 fjy1 = _mm256_add_pd(fjy1,ty);
1045 fjz1 = _mm256_add_pd(fjz1,tz);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 r12 = _mm256_mul_pd(rsq12,rinv12);
1052 r12 = _mm256_andnot_pd(dummy_mask,r12);
1054 /* EWALD ELECTROSTATICS */
1056 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1057 ewrt = _mm256_mul_pd(r12,ewtabscale);
1058 ewitab = _mm256_cvttpd_epi32(ewrt);
1059 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1060 ewitab = _mm_slli_epi32(ewitab,2);
1061 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1062 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1063 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1064 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1065 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1066 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1067 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1068 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1069 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1071 /* Update potential sum for this i atom from the interaction with this j atom. */
1072 velec = _mm256_andnot_pd(dummy_mask,velec);
1073 velecsum = _mm256_add_pd(velecsum,velec);
1077 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1079 /* Calculate temporary vectorial force */
1080 tx = _mm256_mul_pd(fscal,dx12);
1081 ty = _mm256_mul_pd(fscal,dy12);
1082 tz = _mm256_mul_pd(fscal,dz12);
1084 /* Update vectorial force */
1085 fix1 = _mm256_add_pd(fix1,tx);
1086 fiy1 = _mm256_add_pd(fiy1,ty);
1087 fiz1 = _mm256_add_pd(fiz1,tz);
1089 fjx2 = _mm256_add_pd(fjx2,tx);
1090 fjy2 = _mm256_add_pd(fjy2,ty);
1091 fjz2 = _mm256_add_pd(fjz2,tz);
1093 /**************************
1094 * CALCULATE INTERACTIONS *
1095 **************************/
1097 r20 = _mm256_mul_pd(rsq20,rinv20);
1098 r20 = _mm256_andnot_pd(dummy_mask,r20);
1100 /* EWALD ELECTROSTATICS */
1102 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1103 ewrt = _mm256_mul_pd(r20,ewtabscale);
1104 ewitab = _mm256_cvttpd_epi32(ewrt);
1105 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1106 ewitab = _mm_slli_epi32(ewitab,2);
1107 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1108 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1109 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1110 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1111 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1112 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1113 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1114 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1115 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1117 /* Update potential sum for this i atom from the interaction with this j atom. */
1118 velec = _mm256_andnot_pd(dummy_mask,velec);
1119 velecsum = _mm256_add_pd(velecsum,velec);
1123 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1125 /* Calculate temporary vectorial force */
1126 tx = _mm256_mul_pd(fscal,dx20);
1127 ty = _mm256_mul_pd(fscal,dy20);
1128 tz = _mm256_mul_pd(fscal,dz20);
1130 /* Update vectorial force */
1131 fix2 = _mm256_add_pd(fix2,tx);
1132 fiy2 = _mm256_add_pd(fiy2,ty);
1133 fiz2 = _mm256_add_pd(fiz2,tz);
1135 fjx0 = _mm256_add_pd(fjx0,tx);
1136 fjy0 = _mm256_add_pd(fjy0,ty);
1137 fjz0 = _mm256_add_pd(fjz0,tz);
1139 /**************************
1140 * CALCULATE INTERACTIONS *
1141 **************************/
1143 r21 = _mm256_mul_pd(rsq21,rinv21);
1144 r21 = _mm256_andnot_pd(dummy_mask,r21);
1146 /* EWALD ELECTROSTATICS */
1148 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1149 ewrt = _mm256_mul_pd(r21,ewtabscale);
1150 ewitab = _mm256_cvttpd_epi32(ewrt);
1151 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1152 ewitab = _mm_slli_epi32(ewitab,2);
1153 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1154 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1155 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1156 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1157 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1158 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1159 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1160 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1161 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1163 /* Update potential sum for this i atom from the interaction with this j atom. */
1164 velec = _mm256_andnot_pd(dummy_mask,velec);
1165 velecsum = _mm256_add_pd(velecsum,velec);
1169 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1171 /* Calculate temporary vectorial force */
1172 tx = _mm256_mul_pd(fscal,dx21);
1173 ty = _mm256_mul_pd(fscal,dy21);
1174 tz = _mm256_mul_pd(fscal,dz21);
1176 /* Update vectorial force */
1177 fix2 = _mm256_add_pd(fix2,tx);
1178 fiy2 = _mm256_add_pd(fiy2,ty);
1179 fiz2 = _mm256_add_pd(fiz2,tz);
1181 fjx1 = _mm256_add_pd(fjx1,tx);
1182 fjy1 = _mm256_add_pd(fjy1,ty);
1183 fjz1 = _mm256_add_pd(fjz1,tz);
1185 /**************************
1186 * CALCULATE INTERACTIONS *
1187 **************************/
1189 r22 = _mm256_mul_pd(rsq22,rinv22);
1190 r22 = _mm256_andnot_pd(dummy_mask,r22);
1192 /* EWALD ELECTROSTATICS */
1194 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1195 ewrt = _mm256_mul_pd(r22,ewtabscale);
1196 ewitab = _mm256_cvttpd_epi32(ewrt);
1197 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1198 ewitab = _mm_slli_epi32(ewitab,2);
1199 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1200 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1201 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1202 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1203 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1204 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1205 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1206 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1207 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1209 /* Update potential sum for this i atom from the interaction with this j atom. */
1210 velec = _mm256_andnot_pd(dummy_mask,velec);
1211 velecsum = _mm256_add_pd(velecsum,velec);
1215 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1217 /* Calculate temporary vectorial force */
1218 tx = _mm256_mul_pd(fscal,dx22);
1219 ty = _mm256_mul_pd(fscal,dy22);
1220 tz = _mm256_mul_pd(fscal,dz22);
1222 /* Update vectorial force */
1223 fix2 = _mm256_add_pd(fix2,tx);
1224 fiy2 = _mm256_add_pd(fiy2,ty);
1225 fiz2 = _mm256_add_pd(fiz2,tz);
1227 fjx2 = _mm256_add_pd(fjx2,tx);
1228 fjy2 = _mm256_add_pd(fjy2,ty);
1229 fjz2 = _mm256_add_pd(fjz2,tz);
1231 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1232 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1233 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1234 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1236 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1237 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1239 /* Inner loop uses 390 flops */
1242 /* End of innermost loop */
1244 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1245 f+i_coord_offset,fshift+i_shift_offset);
1248 /* Update potential energies */
1249 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1250 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1252 /* Increment number of inner iterations */
1253 inneriter += j_index_end - j_index_start;
1255 /* Outer loop uses 20 flops */
1258 /* Increment number of outer iterations */
1261 /* Update outer/inner flops */
1263 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
1266 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double
1267 * Electrostatics interaction: Ewald
1268 * VdW interaction: LennardJones
1269 * Geometry: Water3-Water3
1270 * Calculate force/pot: Force
1273 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_double
1274 (t_nblist * gmx_restrict nlist,
1275 rvec * gmx_restrict xx,
1276 rvec * gmx_restrict ff,
1277 t_forcerec * gmx_restrict fr,
1278 t_mdatoms * gmx_restrict mdatoms,
1279 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1280 t_nrnb * gmx_restrict nrnb)
1282 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1283 * just 0 for non-waters.
1284 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1285 * jnr indices corresponding to data put in the four positions in the SIMD register.
1287 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1288 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1289 int jnrA,jnrB,jnrC,jnrD;
1290 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1291 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1292 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1293 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1294 real rcutoff_scalar;
1295 real *shiftvec,*fshift,*x,*f;
1296 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1297 real scratch[4*DIM];
1298 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1299 real * vdwioffsetptr0;
1300 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1301 real * vdwioffsetptr1;
1302 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1303 real * vdwioffsetptr2;
1304 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1305 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1306 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1307 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1308 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1309 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1310 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1311 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1312 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1313 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1314 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1315 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1316 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1317 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1318 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1319 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1320 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1323 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1326 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1327 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1329 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1330 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1332 __m256d dummy_mask,cutoff_mask;
1333 __m128 tmpmask0,tmpmask1;
1334 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1335 __m256d one = _mm256_set1_pd(1.0);
1336 __m256d two = _mm256_set1_pd(2.0);
1342 jindex = nlist->jindex;
1344 shiftidx = nlist->shift;
1346 shiftvec = fr->shift_vec[0];
1347 fshift = fr->fshift[0];
1348 facel = _mm256_set1_pd(fr->epsfac);
1349 charge = mdatoms->chargeA;
1350 nvdwtype = fr->ntype;
1351 vdwparam = fr->nbfp;
1352 vdwtype = mdatoms->typeA;
1354 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1355 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1356 beta2 = _mm256_mul_pd(beta,beta);
1357 beta3 = _mm256_mul_pd(beta,beta2);
1359 ewtab = fr->ic->tabq_coul_F;
1360 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1361 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1363 /* Setup water-specific parameters */
1364 inr = nlist->iinr[0];
1365 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1366 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1367 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1368 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1370 jq0 = _mm256_set1_pd(charge[inr+0]);
1371 jq1 = _mm256_set1_pd(charge[inr+1]);
1372 jq2 = _mm256_set1_pd(charge[inr+2]);
1373 vdwjidx0A = 2*vdwtype[inr+0];
1374 qq00 = _mm256_mul_pd(iq0,jq0);
1375 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1376 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1377 qq01 = _mm256_mul_pd(iq0,jq1);
1378 qq02 = _mm256_mul_pd(iq0,jq2);
1379 qq10 = _mm256_mul_pd(iq1,jq0);
1380 qq11 = _mm256_mul_pd(iq1,jq1);
1381 qq12 = _mm256_mul_pd(iq1,jq2);
1382 qq20 = _mm256_mul_pd(iq2,jq0);
1383 qq21 = _mm256_mul_pd(iq2,jq1);
1384 qq22 = _mm256_mul_pd(iq2,jq2);
1386 /* Avoid stupid compiler warnings */
1387 jnrA = jnrB = jnrC = jnrD = 0;
1388 j_coord_offsetA = 0;
1389 j_coord_offsetB = 0;
1390 j_coord_offsetC = 0;
1391 j_coord_offsetD = 0;
1396 for(iidx=0;iidx<4*DIM;iidx++)
1398 scratch[iidx] = 0.0;
1401 /* Start outer loop over neighborlists */
1402 for(iidx=0; iidx<nri; iidx++)
1404 /* Load shift vector for this list */
1405 i_shift_offset = DIM*shiftidx[iidx];
1407 /* Load limits for loop over neighbors */
1408 j_index_start = jindex[iidx];
1409 j_index_end = jindex[iidx+1];
1411 /* Get outer coordinate index */
1413 i_coord_offset = DIM*inr;
1415 /* Load i particle coords and add shift vector */
1416 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1417 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1419 fix0 = _mm256_setzero_pd();
1420 fiy0 = _mm256_setzero_pd();
1421 fiz0 = _mm256_setzero_pd();
1422 fix1 = _mm256_setzero_pd();
1423 fiy1 = _mm256_setzero_pd();
1424 fiz1 = _mm256_setzero_pd();
1425 fix2 = _mm256_setzero_pd();
1426 fiy2 = _mm256_setzero_pd();
1427 fiz2 = _mm256_setzero_pd();
1429 /* Start inner kernel loop */
1430 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1433 /* Get j neighbor index, and coordinate index */
1435 jnrB = jjnr[jidx+1];
1436 jnrC = jjnr[jidx+2];
1437 jnrD = jjnr[jidx+3];
1438 j_coord_offsetA = DIM*jnrA;
1439 j_coord_offsetB = DIM*jnrB;
1440 j_coord_offsetC = DIM*jnrC;
1441 j_coord_offsetD = DIM*jnrD;
1443 /* load j atom coordinates */
1444 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1445 x+j_coord_offsetC,x+j_coord_offsetD,
1446 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1448 /* Calculate displacement vector */
1449 dx00 = _mm256_sub_pd(ix0,jx0);
1450 dy00 = _mm256_sub_pd(iy0,jy0);
1451 dz00 = _mm256_sub_pd(iz0,jz0);
1452 dx01 = _mm256_sub_pd(ix0,jx1);
1453 dy01 = _mm256_sub_pd(iy0,jy1);
1454 dz01 = _mm256_sub_pd(iz0,jz1);
1455 dx02 = _mm256_sub_pd(ix0,jx2);
1456 dy02 = _mm256_sub_pd(iy0,jy2);
1457 dz02 = _mm256_sub_pd(iz0,jz2);
1458 dx10 = _mm256_sub_pd(ix1,jx0);
1459 dy10 = _mm256_sub_pd(iy1,jy0);
1460 dz10 = _mm256_sub_pd(iz1,jz0);
1461 dx11 = _mm256_sub_pd(ix1,jx1);
1462 dy11 = _mm256_sub_pd(iy1,jy1);
1463 dz11 = _mm256_sub_pd(iz1,jz1);
1464 dx12 = _mm256_sub_pd(ix1,jx2);
1465 dy12 = _mm256_sub_pd(iy1,jy2);
1466 dz12 = _mm256_sub_pd(iz1,jz2);
1467 dx20 = _mm256_sub_pd(ix2,jx0);
1468 dy20 = _mm256_sub_pd(iy2,jy0);
1469 dz20 = _mm256_sub_pd(iz2,jz0);
1470 dx21 = _mm256_sub_pd(ix2,jx1);
1471 dy21 = _mm256_sub_pd(iy2,jy1);
1472 dz21 = _mm256_sub_pd(iz2,jz1);
1473 dx22 = _mm256_sub_pd(ix2,jx2);
1474 dy22 = _mm256_sub_pd(iy2,jy2);
1475 dz22 = _mm256_sub_pd(iz2,jz2);
1477 /* Calculate squared distance and things based on it */
1478 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1479 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1480 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1481 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1482 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1483 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1484 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1485 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1486 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1488 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1489 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1490 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1491 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1492 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1493 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1494 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1495 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1496 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1498 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1499 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1500 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1501 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1502 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1503 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1504 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1505 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1506 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1508 fjx0 = _mm256_setzero_pd();
1509 fjy0 = _mm256_setzero_pd();
1510 fjz0 = _mm256_setzero_pd();
1511 fjx1 = _mm256_setzero_pd();
1512 fjy1 = _mm256_setzero_pd();
1513 fjz1 = _mm256_setzero_pd();
1514 fjx2 = _mm256_setzero_pd();
1515 fjy2 = _mm256_setzero_pd();
1516 fjz2 = _mm256_setzero_pd();
1518 /**************************
1519 * CALCULATE INTERACTIONS *
1520 **************************/
1522 r00 = _mm256_mul_pd(rsq00,rinv00);
1524 /* EWALD ELECTROSTATICS */
1526 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1527 ewrt = _mm256_mul_pd(r00,ewtabscale);
1528 ewitab = _mm256_cvttpd_epi32(ewrt);
1529 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1530 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1531 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1533 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1534 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1536 /* LENNARD-JONES DISPERSION/REPULSION */
1538 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1539 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1541 fscal = _mm256_add_pd(felec,fvdw);
1543 /* Calculate temporary vectorial force */
1544 tx = _mm256_mul_pd(fscal,dx00);
1545 ty = _mm256_mul_pd(fscal,dy00);
1546 tz = _mm256_mul_pd(fscal,dz00);
1548 /* Update vectorial force */
1549 fix0 = _mm256_add_pd(fix0,tx);
1550 fiy0 = _mm256_add_pd(fiy0,ty);
1551 fiz0 = _mm256_add_pd(fiz0,tz);
1553 fjx0 = _mm256_add_pd(fjx0,tx);
1554 fjy0 = _mm256_add_pd(fjy0,ty);
1555 fjz0 = _mm256_add_pd(fjz0,tz);
1557 /**************************
1558 * CALCULATE INTERACTIONS *
1559 **************************/
1561 r01 = _mm256_mul_pd(rsq01,rinv01);
1563 /* EWALD ELECTROSTATICS */
1565 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1566 ewrt = _mm256_mul_pd(r01,ewtabscale);
1567 ewitab = _mm256_cvttpd_epi32(ewrt);
1568 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1569 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1570 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1572 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1573 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1577 /* Calculate temporary vectorial force */
1578 tx = _mm256_mul_pd(fscal,dx01);
1579 ty = _mm256_mul_pd(fscal,dy01);
1580 tz = _mm256_mul_pd(fscal,dz01);
1582 /* Update vectorial force */
1583 fix0 = _mm256_add_pd(fix0,tx);
1584 fiy0 = _mm256_add_pd(fiy0,ty);
1585 fiz0 = _mm256_add_pd(fiz0,tz);
1587 fjx1 = _mm256_add_pd(fjx1,tx);
1588 fjy1 = _mm256_add_pd(fjy1,ty);
1589 fjz1 = _mm256_add_pd(fjz1,tz);
1591 /**************************
1592 * CALCULATE INTERACTIONS *
1593 **************************/
1595 r02 = _mm256_mul_pd(rsq02,rinv02);
1597 /* EWALD ELECTROSTATICS */
1599 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1600 ewrt = _mm256_mul_pd(r02,ewtabscale);
1601 ewitab = _mm256_cvttpd_epi32(ewrt);
1602 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1603 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1604 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1606 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1607 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1611 /* Calculate temporary vectorial force */
1612 tx = _mm256_mul_pd(fscal,dx02);
1613 ty = _mm256_mul_pd(fscal,dy02);
1614 tz = _mm256_mul_pd(fscal,dz02);
1616 /* Update vectorial force */
1617 fix0 = _mm256_add_pd(fix0,tx);
1618 fiy0 = _mm256_add_pd(fiy0,ty);
1619 fiz0 = _mm256_add_pd(fiz0,tz);
1621 fjx2 = _mm256_add_pd(fjx2,tx);
1622 fjy2 = _mm256_add_pd(fjy2,ty);
1623 fjz2 = _mm256_add_pd(fjz2,tz);
1625 /**************************
1626 * CALCULATE INTERACTIONS *
1627 **************************/
1629 r10 = _mm256_mul_pd(rsq10,rinv10);
1631 /* EWALD ELECTROSTATICS */
1633 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1634 ewrt = _mm256_mul_pd(r10,ewtabscale);
1635 ewitab = _mm256_cvttpd_epi32(ewrt);
1636 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1637 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1638 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1640 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1641 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1645 /* Calculate temporary vectorial force */
1646 tx = _mm256_mul_pd(fscal,dx10);
1647 ty = _mm256_mul_pd(fscal,dy10);
1648 tz = _mm256_mul_pd(fscal,dz10);
1650 /* Update vectorial force */
1651 fix1 = _mm256_add_pd(fix1,tx);
1652 fiy1 = _mm256_add_pd(fiy1,ty);
1653 fiz1 = _mm256_add_pd(fiz1,tz);
1655 fjx0 = _mm256_add_pd(fjx0,tx);
1656 fjy0 = _mm256_add_pd(fjy0,ty);
1657 fjz0 = _mm256_add_pd(fjz0,tz);
1659 /**************************
1660 * CALCULATE INTERACTIONS *
1661 **************************/
1663 r11 = _mm256_mul_pd(rsq11,rinv11);
1665 /* EWALD ELECTROSTATICS */
1667 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1668 ewrt = _mm256_mul_pd(r11,ewtabscale);
1669 ewitab = _mm256_cvttpd_epi32(ewrt);
1670 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1671 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1672 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1674 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1675 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1679 /* Calculate temporary vectorial force */
1680 tx = _mm256_mul_pd(fscal,dx11);
1681 ty = _mm256_mul_pd(fscal,dy11);
1682 tz = _mm256_mul_pd(fscal,dz11);
1684 /* Update vectorial force */
1685 fix1 = _mm256_add_pd(fix1,tx);
1686 fiy1 = _mm256_add_pd(fiy1,ty);
1687 fiz1 = _mm256_add_pd(fiz1,tz);
1689 fjx1 = _mm256_add_pd(fjx1,tx);
1690 fjy1 = _mm256_add_pd(fjy1,ty);
1691 fjz1 = _mm256_add_pd(fjz1,tz);
1693 /**************************
1694 * CALCULATE INTERACTIONS *
1695 **************************/
1697 r12 = _mm256_mul_pd(rsq12,rinv12);
1699 /* EWALD ELECTROSTATICS */
1701 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1702 ewrt = _mm256_mul_pd(r12,ewtabscale);
1703 ewitab = _mm256_cvttpd_epi32(ewrt);
1704 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1705 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1706 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1708 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1709 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1713 /* Calculate temporary vectorial force */
1714 tx = _mm256_mul_pd(fscal,dx12);
1715 ty = _mm256_mul_pd(fscal,dy12);
1716 tz = _mm256_mul_pd(fscal,dz12);
1718 /* Update vectorial force */
1719 fix1 = _mm256_add_pd(fix1,tx);
1720 fiy1 = _mm256_add_pd(fiy1,ty);
1721 fiz1 = _mm256_add_pd(fiz1,tz);
1723 fjx2 = _mm256_add_pd(fjx2,tx);
1724 fjy2 = _mm256_add_pd(fjy2,ty);
1725 fjz2 = _mm256_add_pd(fjz2,tz);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 r20 = _mm256_mul_pd(rsq20,rinv20);
1733 /* EWALD ELECTROSTATICS */
1735 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1736 ewrt = _mm256_mul_pd(r20,ewtabscale);
1737 ewitab = _mm256_cvttpd_epi32(ewrt);
1738 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1739 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1740 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1742 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1743 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1747 /* Calculate temporary vectorial force */
1748 tx = _mm256_mul_pd(fscal,dx20);
1749 ty = _mm256_mul_pd(fscal,dy20);
1750 tz = _mm256_mul_pd(fscal,dz20);
1752 /* Update vectorial force */
1753 fix2 = _mm256_add_pd(fix2,tx);
1754 fiy2 = _mm256_add_pd(fiy2,ty);
1755 fiz2 = _mm256_add_pd(fiz2,tz);
1757 fjx0 = _mm256_add_pd(fjx0,tx);
1758 fjy0 = _mm256_add_pd(fjy0,ty);
1759 fjz0 = _mm256_add_pd(fjz0,tz);
1761 /**************************
1762 * CALCULATE INTERACTIONS *
1763 **************************/
1765 r21 = _mm256_mul_pd(rsq21,rinv21);
1767 /* EWALD ELECTROSTATICS */
1769 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1770 ewrt = _mm256_mul_pd(r21,ewtabscale);
1771 ewitab = _mm256_cvttpd_epi32(ewrt);
1772 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1773 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1774 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1776 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1777 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1781 /* Calculate temporary vectorial force */
1782 tx = _mm256_mul_pd(fscal,dx21);
1783 ty = _mm256_mul_pd(fscal,dy21);
1784 tz = _mm256_mul_pd(fscal,dz21);
1786 /* Update vectorial force */
1787 fix2 = _mm256_add_pd(fix2,tx);
1788 fiy2 = _mm256_add_pd(fiy2,ty);
1789 fiz2 = _mm256_add_pd(fiz2,tz);
1791 fjx1 = _mm256_add_pd(fjx1,tx);
1792 fjy1 = _mm256_add_pd(fjy1,ty);
1793 fjz1 = _mm256_add_pd(fjz1,tz);
1795 /**************************
1796 * CALCULATE INTERACTIONS *
1797 **************************/
1799 r22 = _mm256_mul_pd(rsq22,rinv22);
1801 /* EWALD ELECTROSTATICS */
1803 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1804 ewrt = _mm256_mul_pd(r22,ewtabscale);
1805 ewitab = _mm256_cvttpd_epi32(ewrt);
1806 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1807 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1808 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1810 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1811 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1815 /* Calculate temporary vectorial force */
1816 tx = _mm256_mul_pd(fscal,dx22);
1817 ty = _mm256_mul_pd(fscal,dy22);
1818 tz = _mm256_mul_pd(fscal,dz22);
1820 /* Update vectorial force */
1821 fix2 = _mm256_add_pd(fix2,tx);
1822 fiy2 = _mm256_add_pd(fiy2,ty);
1823 fiz2 = _mm256_add_pd(fiz2,tz);
1825 fjx2 = _mm256_add_pd(fjx2,tx);
1826 fjy2 = _mm256_add_pd(fjy2,ty);
1827 fjz2 = _mm256_add_pd(fjz2,tz);
1829 fjptrA = f+j_coord_offsetA;
1830 fjptrB = f+j_coord_offsetB;
1831 fjptrC = f+j_coord_offsetC;
1832 fjptrD = f+j_coord_offsetD;
1834 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1835 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1837 /* Inner loop uses 331 flops */
1840 if(jidx<j_index_end)
1843 /* Get j neighbor index, and coordinate index */
1844 jnrlistA = jjnr[jidx];
1845 jnrlistB = jjnr[jidx+1];
1846 jnrlistC = jjnr[jidx+2];
1847 jnrlistD = jjnr[jidx+3];
1848 /* Sign of each element will be negative for non-real atoms.
1849 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1850 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1852 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1854 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1855 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1856 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1858 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1859 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1860 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1861 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1862 j_coord_offsetA = DIM*jnrA;
1863 j_coord_offsetB = DIM*jnrB;
1864 j_coord_offsetC = DIM*jnrC;
1865 j_coord_offsetD = DIM*jnrD;
1867 /* load j atom coordinates */
1868 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1869 x+j_coord_offsetC,x+j_coord_offsetD,
1870 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1872 /* Calculate displacement vector */
1873 dx00 = _mm256_sub_pd(ix0,jx0);
1874 dy00 = _mm256_sub_pd(iy0,jy0);
1875 dz00 = _mm256_sub_pd(iz0,jz0);
1876 dx01 = _mm256_sub_pd(ix0,jx1);
1877 dy01 = _mm256_sub_pd(iy0,jy1);
1878 dz01 = _mm256_sub_pd(iz0,jz1);
1879 dx02 = _mm256_sub_pd(ix0,jx2);
1880 dy02 = _mm256_sub_pd(iy0,jy2);
1881 dz02 = _mm256_sub_pd(iz0,jz2);
1882 dx10 = _mm256_sub_pd(ix1,jx0);
1883 dy10 = _mm256_sub_pd(iy1,jy0);
1884 dz10 = _mm256_sub_pd(iz1,jz0);
1885 dx11 = _mm256_sub_pd(ix1,jx1);
1886 dy11 = _mm256_sub_pd(iy1,jy1);
1887 dz11 = _mm256_sub_pd(iz1,jz1);
1888 dx12 = _mm256_sub_pd(ix1,jx2);
1889 dy12 = _mm256_sub_pd(iy1,jy2);
1890 dz12 = _mm256_sub_pd(iz1,jz2);
1891 dx20 = _mm256_sub_pd(ix2,jx0);
1892 dy20 = _mm256_sub_pd(iy2,jy0);
1893 dz20 = _mm256_sub_pd(iz2,jz0);
1894 dx21 = _mm256_sub_pd(ix2,jx1);
1895 dy21 = _mm256_sub_pd(iy2,jy1);
1896 dz21 = _mm256_sub_pd(iz2,jz1);
1897 dx22 = _mm256_sub_pd(ix2,jx2);
1898 dy22 = _mm256_sub_pd(iy2,jy2);
1899 dz22 = _mm256_sub_pd(iz2,jz2);
1901 /* Calculate squared distance and things based on it */
1902 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1903 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1904 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1905 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1906 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1907 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1908 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1909 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1910 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1912 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1913 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1914 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1915 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1916 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1917 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1918 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1919 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1920 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1922 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1923 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1924 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1925 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1926 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1927 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1928 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1929 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1930 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1932 fjx0 = _mm256_setzero_pd();
1933 fjy0 = _mm256_setzero_pd();
1934 fjz0 = _mm256_setzero_pd();
1935 fjx1 = _mm256_setzero_pd();
1936 fjy1 = _mm256_setzero_pd();
1937 fjz1 = _mm256_setzero_pd();
1938 fjx2 = _mm256_setzero_pd();
1939 fjy2 = _mm256_setzero_pd();
1940 fjz2 = _mm256_setzero_pd();
1942 /**************************
1943 * CALCULATE INTERACTIONS *
1944 **************************/
1946 r00 = _mm256_mul_pd(rsq00,rinv00);
1947 r00 = _mm256_andnot_pd(dummy_mask,r00);
1949 /* EWALD ELECTROSTATICS */
1951 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1952 ewrt = _mm256_mul_pd(r00,ewtabscale);
1953 ewitab = _mm256_cvttpd_epi32(ewrt);
1954 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1955 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1956 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1958 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1959 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1961 /* LENNARD-JONES DISPERSION/REPULSION */
1963 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1964 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1966 fscal = _mm256_add_pd(felec,fvdw);
1968 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1970 /* Calculate temporary vectorial force */
1971 tx = _mm256_mul_pd(fscal,dx00);
1972 ty = _mm256_mul_pd(fscal,dy00);
1973 tz = _mm256_mul_pd(fscal,dz00);
1975 /* Update vectorial force */
1976 fix0 = _mm256_add_pd(fix0,tx);
1977 fiy0 = _mm256_add_pd(fiy0,ty);
1978 fiz0 = _mm256_add_pd(fiz0,tz);
1980 fjx0 = _mm256_add_pd(fjx0,tx);
1981 fjy0 = _mm256_add_pd(fjy0,ty);
1982 fjz0 = _mm256_add_pd(fjz0,tz);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 r01 = _mm256_mul_pd(rsq01,rinv01);
1989 r01 = _mm256_andnot_pd(dummy_mask,r01);
1991 /* EWALD ELECTROSTATICS */
1993 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1994 ewrt = _mm256_mul_pd(r01,ewtabscale);
1995 ewitab = _mm256_cvttpd_epi32(ewrt);
1996 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1997 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1998 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2000 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2001 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2005 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2007 /* Calculate temporary vectorial force */
2008 tx = _mm256_mul_pd(fscal,dx01);
2009 ty = _mm256_mul_pd(fscal,dy01);
2010 tz = _mm256_mul_pd(fscal,dz01);
2012 /* Update vectorial force */
2013 fix0 = _mm256_add_pd(fix0,tx);
2014 fiy0 = _mm256_add_pd(fiy0,ty);
2015 fiz0 = _mm256_add_pd(fiz0,tz);
2017 fjx1 = _mm256_add_pd(fjx1,tx);
2018 fjy1 = _mm256_add_pd(fjy1,ty);
2019 fjz1 = _mm256_add_pd(fjz1,tz);
2021 /**************************
2022 * CALCULATE INTERACTIONS *
2023 **************************/
2025 r02 = _mm256_mul_pd(rsq02,rinv02);
2026 r02 = _mm256_andnot_pd(dummy_mask,r02);
2028 /* EWALD ELECTROSTATICS */
2030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2031 ewrt = _mm256_mul_pd(r02,ewtabscale);
2032 ewitab = _mm256_cvttpd_epi32(ewrt);
2033 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2034 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2035 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2037 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2038 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2042 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2044 /* Calculate temporary vectorial force */
2045 tx = _mm256_mul_pd(fscal,dx02);
2046 ty = _mm256_mul_pd(fscal,dy02);
2047 tz = _mm256_mul_pd(fscal,dz02);
2049 /* Update vectorial force */
2050 fix0 = _mm256_add_pd(fix0,tx);
2051 fiy0 = _mm256_add_pd(fiy0,ty);
2052 fiz0 = _mm256_add_pd(fiz0,tz);
2054 fjx2 = _mm256_add_pd(fjx2,tx);
2055 fjy2 = _mm256_add_pd(fjy2,ty);
2056 fjz2 = _mm256_add_pd(fjz2,tz);
2058 /**************************
2059 * CALCULATE INTERACTIONS *
2060 **************************/
2062 r10 = _mm256_mul_pd(rsq10,rinv10);
2063 r10 = _mm256_andnot_pd(dummy_mask,r10);
2065 /* EWALD ELECTROSTATICS */
2067 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2068 ewrt = _mm256_mul_pd(r10,ewtabscale);
2069 ewitab = _mm256_cvttpd_epi32(ewrt);
2070 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2071 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2072 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2074 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2075 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2079 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2081 /* Calculate temporary vectorial force */
2082 tx = _mm256_mul_pd(fscal,dx10);
2083 ty = _mm256_mul_pd(fscal,dy10);
2084 tz = _mm256_mul_pd(fscal,dz10);
2086 /* Update vectorial force */
2087 fix1 = _mm256_add_pd(fix1,tx);
2088 fiy1 = _mm256_add_pd(fiy1,ty);
2089 fiz1 = _mm256_add_pd(fiz1,tz);
2091 fjx0 = _mm256_add_pd(fjx0,tx);
2092 fjy0 = _mm256_add_pd(fjy0,ty);
2093 fjz0 = _mm256_add_pd(fjz0,tz);
2095 /**************************
2096 * CALCULATE INTERACTIONS *
2097 **************************/
2099 r11 = _mm256_mul_pd(rsq11,rinv11);
2100 r11 = _mm256_andnot_pd(dummy_mask,r11);
2102 /* EWALD ELECTROSTATICS */
2104 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2105 ewrt = _mm256_mul_pd(r11,ewtabscale);
2106 ewitab = _mm256_cvttpd_epi32(ewrt);
2107 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2108 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2109 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2111 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2112 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2116 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2118 /* Calculate temporary vectorial force */
2119 tx = _mm256_mul_pd(fscal,dx11);
2120 ty = _mm256_mul_pd(fscal,dy11);
2121 tz = _mm256_mul_pd(fscal,dz11);
2123 /* Update vectorial force */
2124 fix1 = _mm256_add_pd(fix1,tx);
2125 fiy1 = _mm256_add_pd(fiy1,ty);
2126 fiz1 = _mm256_add_pd(fiz1,tz);
2128 fjx1 = _mm256_add_pd(fjx1,tx);
2129 fjy1 = _mm256_add_pd(fjy1,ty);
2130 fjz1 = _mm256_add_pd(fjz1,tz);
2132 /**************************
2133 * CALCULATE INTERACTIONS *
2134 **************************/
2136 r12 = _mm256_mul_pd(rsq12,rinv12);
2137 r12 = _mm256_andnot_pd(dummy_mask,r12);
2139 /* EWALD ELECTROSTATICS */
2141 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2142 ewrt = _mm256_mul_pd(r12,ewtabscale);
2143 ewitab = _mm256_cvttpd_epi32(ewrt);
2144 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2145 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2146 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2148 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2149 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2153 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2155 /* Calculate temporary vectorial force */
2156 tx = _mm256_mul_pd(fscal,dx12);
2157 ty = _mm256_mul_pd(fscal,dy12);
2158 tz = _mm256_mul_pd(fscal,dz12);
2160 /* Update vectorial force */
2161 fix1 = _mm256_add_pd(fix1,tx);
2162 fiy1 = _mm256_add_pd(fiy1,ty);
2163 fiz1 = _mm256_add_pd(fiz1,tz);
2165 fjx2 = _mm256_add_pd(fjx2,tx);
2166 fjy2 = _mm256_add_pd(fjy2,ty);
2167 fjz2 = _mm256_add_pd(fjz2,tz);
2169 /**************************
2170 * CALCULATE INTERACTIONS *
2171 **************************/
2173 r20 = _mm256_mul_pd(rsq20,rinv20);
2174 r20 = _mm256_andnot_pd(dummy_mask,r20);
2176 /* EWALD ELECTROSTATICS */
2178 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2179 ewrt = _mm256_mul_pd(r20,ewtabscale);
2180 ewitab = _mm256_cvttpd_epi32(ewrt);
2181 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2182 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2183 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2185 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2186 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2190 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2192 /* Calculate temporary vectorial force */
2193 tx = _mm256_mul_pd(fscal,dx20);
2194 ty = _mm256_mul_pd(fscal,dy20);
2195 tz = _mm256_mul_pd(fscal,dz20);
2197 /* Update vectorial force */
2198 fix2 = _mm256_add_pd(fix2,tx);
2199 fiy2 = _mm256_add_pd(fiy2,ty);
2200 fiz2 = _mm256_add_pd(fiz2,tz);
2202 fjx0 = _mm256_add_pd(fjx0,tx);
2203 fjy0 = _mm256_add_pd(fjy0,ty);
2204 fjz0 = _mm256_add_pd(fjz0,tz);
2206 /**************************
2207 * CALCULATE INTERACTIONS *
2208 **************************/
2210 r21 = _mm256_mul_pd(rsq21,rinv21);
2211 r21 = _mm256_andnot_pd(dummy_mask,r21);
2213 /* EWALD ELECTROSTATICS */
2215 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2216 ewrt = _mm256_mul_pd(r21,ewtabscale);
2217 ewitab = _mm256_cvttpd_epi32(ewrt);
2218 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2219 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2220 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2222 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2223 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2227 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2229 /* Calculate temporary vectorial force */
2230 tx = _mm256_mul_pd(fscal,dx21);
2231 ty = _mm256_mul_pd(fscal,dy21);
2232 tz = _mm256_mul_pd(fscal,dz21);
2234 /* Update vectorial force */
2235 fix2 = _mm256_add_pd(fix2,tx);
2236 fiy2 = _mm256_add_pd(fiy2,ty);
2237 fiz2 = _mm256_add_pd(fiz2,tz);
2239 fjx1 = _mm256_add_pd(fjx1,tx);
2240 fjy1 = _mm256_add_pd(fjy1,ty);
2241 fjz1 = _mm256_add_pd(fjz1,tz);
2243 /**************************
2244 * CALCULATE INTERACTIONS *
2245 **************************/
2247 r22 = _mm256_mul_pd(rsq22,rinv22);
2248 r22 = _mm256_andnot_pd(dummy_mask,r22);
2250 /* EWALD ELECTROSTATICS */
2252 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2253 ewrt = _mm256_mul_pd(r22,ewtabscale);
2254 ewitab = _mm256_cvttpd_epi32(ewrt);
2255 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2256 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2257 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2259 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2260 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2264 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2266 /* Calculate temporary vectorial force */
2267 tx = _mm256_mul_pd(fscal,dx22);
2268 ty = _mm256_mul_pd(fscal,dy22);
2269 tz = _mm256_mul_pd(fscal,dz22);
2271 /* Update vectorial force */
2272 fix2 = _mm256_add_pd(fix2,tx);
2273 fiy2 = _mm256_add_pd(fiy2,ty);
2274 fiz2 = _mm256_add_pd(fiz2,tz);
2276 fjx2 = _mm256_add_pd(fjx2,tx);
2277 fjy2 = _mm256_add_pd(fjy2,ty);
2278 fjz2 = _mm256_add_pd(fjz2,tz);
2280 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2281 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2282 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2283 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2285 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2286 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2288 /* Inner loop uses 340 flops */
2291 /* End of innermost loop */
2293 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2294 f+i_coord_offset,fshift+i_shift_offset);
2296 /* Increment number of inner iterations */
2297 inneriter += j_index_end - j_index_start;
2299 /* Outer loop uses 18 flops */
2302 /* Increment number of outer iterations */
2305 /* Update outer/inner flops */
2307 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);