2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: None
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
111 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
113 __m256d dummy_mask,cutoff_mask;
114 __m128 tmpmask0,tmpmask1;
115 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
116 __m256d one = _mm256_set1_pd(1.0);
117 __m256d two = _mm256_set1_pd(2.0);
123 jindex = nlist->jindex;
125 shiftidx = nlist->shift;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm256_set1_pd(fr->epsfac);
130 charge = mdatoms->chargeA;
132 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
133 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
134 beta2 = _mm256_mul_pd(beta,beta);
135 beta3 = _mm256_mul_pd(beta,beta2);
137 ewtab = fr->ic->tabq_coul_FDV0;
138 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
139 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
144 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
145 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
147 jq0 = _mm256_set1_pd(charge[inr+0]);
148 jq1 = _mm256_set1_pd(charge[inr+1]);
149 jq2 = _mm256_set1_pd(charge[inr+2]);
150 qq00 = _mm256_mul_pd(iq0,jq0);
151 qq01 = _mm256_mul_pd(iq0,jq1);
152 qq02 = _mm256_mul_pd(iq0,jq2);
153 qq10 = _mm256_mul_pd(iq1,jq0);
154 qq11 = _mm256_mul_pd(iq1,jq1);
155 qq12 = _mm256_mul_pd(iq1,jq2);
156 qq20 = _mm256_mul_pd(iq2,jq0);
157 qq21 = _mm256_mul_pd(iq2,jq1);
158 qq22 = _mm256_mul_pd(iq2,jq2);
160 /* Avoid stupid compiler warnings */
161 jnrA = jnrB = jnrC = jnrD = 0;
170 for(iidx=0;iidx<4*DIM;iidx++)
175 /* Start outer loop over neighborlists */
176 for(iidx=0; iidx<nri; iidx++)
178 /* Load shift vector for this list */
179 i_shift_offset = DIM*shiftidx[iidx];
181 /* Load limits for loop over neighbors */
182 j_index_start = jindex[iidx];
183 j_index_end = jindex[iidx+1];
185 /* Get outer coordinate index */
187 i_coord_offset = DIM*inr;
189 /* Load i particle coords and add shift vector */
190 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
191 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
193 fix0 = _mm256_setzero_pd();
194 fiy0 = _mm256_setzero_pd();
195 fiz0 = _mm256_setzero_pd();
196 fix1 = _mm256_setzero_pd();
197 fiy1 = _mm256_setzero_pd();
198 fiz1 = _mm256_setzero_pd();
199 fix2 = _mm256_setzero_pd();
200 fiy2 = _mm256_setzero_pd();
201 fiz2 = _mm256_setzero_pd();
203 /* Reset potential sums */
204 velecsum = _mm256_setzero_pd();
206 /* Start inner kernel loop */
207 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
210 /* Get j neighbor index, and coordinate index */
215 j_coord_offsetA = DIM*jnrA;
216 j_coord_offsetB = DIM*jnrB;
217 j_coord_offsetC = DIM*jnrC;
218 j_coord_offsetD = DIM*jnrD;
220 /* load j atom coordinates */
221 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
222 x+j_coord_offsetC,x+j_coord_offsetD,
223 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
225 /* Calculate displacement vector */
226 dx00 = _mm256_sub_pd(ix0,jx0);
227 dy00 = _mm256_sub_pd(iy0,jy0);
228 dz00 = _mm256_sub_pd(iz0,jz0);
229 dx01 = _mm256_sub_pd(ix0,jx1);
230 dy01 = _mm256_sub_pd(iy0,jy1);
231 dz01 = _mm256_sub_pd(iz0,jz1);
232 dx02 = _mm256_sub_pd(ix0,jx2);
233 dy02 = _mm256_sub_pd(iy0,jy2);
234 dz02 = _mm256_sub_pd(iz0,jz2);
235 dx10 = _mm256_sub_pd(ix1,jx0);
236 dy10 = _mm256_sub_pd(iy1,jy0);
237 dz10 = _mm256_sub_pd(iz1,jz0);
238 dx11 = _mm256_sub_pd(ix1,jx1);
239 dy11 = _mm256_sub_pd(iy1,jy1);
240 dz11 = _mm256_sub_pd(iz1,jz1);
241 dx12 = _mm256_sub_pd(ix1,jx2);
242 dy12 = _mm256_sub_pd(iy1,jy2);
243 dz12 = _mm256_sub_pd(iz1,jz2);
244 dx20 = _mm256_sub_pd(ix2,jx0);
245 dy20 = _mm256_sub_pd(iy2,jy0);
246 dz20 = _mm256_sub_pd(iz2,jz0);
247 dx21 = _mm256_sub_pd(ix2,jx1);
248 dy21 = _mm256_sub_pd(iy2,jy1);
249 dz21 = _mm256_sub_pd(iz2,jz1);
250 dx22 = _mm256_sub_pd(ix2,jx2);
251 dy22 = _mm256_sub_pd(iy2,jy2);
252 dz22 = _mm256_sub_pd(iz2,jz2);
254 /* Calculate squared distance and things based on it */
255 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
256 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
257 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
258 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
259 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
260 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
261 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
262 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
263 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
265 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
266 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
267 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
268 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
269 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
270 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
271 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
272 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
273 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
275 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
276 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
277 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
278 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
279 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
280 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
281 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
282 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
283 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
285 fjx0 = _mm256_setzero_pd();
286 fjy0 = _mm256_setzero_pd();
287 fjz0 = _mm256_setzero_pd();
288 fjx1 = _mm256_setzero_pd();
289 fjy1 = _mm256_setzero_pd();
290 fjz1 = _mm256_setzero_pd();
291 fjx2 = _mm256_setzero_pd();
292 fjy2 = _mm256_setzero_pd();
293 fjz2 = _mm256_setzero_pd();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 r00 = _mm256_mul_pd(rsq00,rinv00);
301 /* EWALD ELECTROSTATICS */
303 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
304 ewrt = _mm256_mul_pd(r00,ewtabscale);
305 ewitab = _mm256_cvttpd_epi32(ewrt);
306 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
307 ewitab = _mm_slli_epi32(ewitab,2);
308 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
309 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
310 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
311 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
312 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
313 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
314 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
315 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
316 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
318 /* Update potential sum for this i atom from the interaction with this j atom. */
319 velecsum = _mm256_add_pd(velecsum,velec);
323 /* Calculate temporary vectorial force */
324 tx = _mm256_mul_pd(fscal,dx00);
325 ty = _mm256_mul_pd(fscal,dy00);
326 tz = _mm256_mul_pd(fscal,dz00);
328 /* Update vectorial force */
329 fix0 = _mm256_add_pd(fix0,tx);
330 fiy0 = _mm256_add_pd(fiy0,ty);
331 fiz0 = _mm256_add_pd(fiz0,tz);
333 fjx0 = _mm256_add_pd(fjx0,tx);
334 fjy0 = _mm256_add_pd(fjy0,ty);
335 fjz0 = _mm256_add_pd(fjz0,tz);
337 /**************************
338 * CALCULATE INTERACTIONS *
339 **************************/
341 r01 = _mm256_mul_pd(rsq01,rinv01);
343 /* EWALD ELECTROSTATICS */
345 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
346 ewrt = _mm256_mul_pd(r01,ewtabscale);
347 ewitab = _mm256_cvttpd_epi32(ewrt);
348 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
349 ewitab = _mm_slli_epi32(ewitab,2);
350 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
351 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
352 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
353 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
354 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
355 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
356 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
357 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
358 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
360 /* Update potential sum for this i atom from the interaction with this j atom. */
361 velecsum = _mm256_add_pd(velecsum,velec);
365 /* Calculate temporary vectorial force */
366 tx = _mm256_mul_pd(fscal,dx01);
367 ty = _mm256_mul_pd(fscal,dy01);
368 tz = _mm256_mul_pd(fscal,dz01);
370 /* Update vectorial force */
371 fix0 = _mm256_add_pd(fix0,tx);
372 fiy0 = _mm256_add_pd(fiy0,ty);
373 fiz0 = _mm256_add_pd(fiz0,tz);
375 fjx1 = _mm256_add_pd(fjx1,tx);
376 fjy1 = _mm256_add_pd(fjy1,ty);
377 fjz1 = _mm256_add_pd(fjz1,tz);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 r02 = _mm256_mul_pd(rsq02,rinv02);
385 /* EWALD ELECTROSTATICS */
387 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
388 ewrt = _mm256_mul_pd(r02,ewtabscale);
389 ewitab = _mm256_cvttpd_epi32(ewrt);
390 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
391 ewitab = _mm_slli_epi32(ewitab,2);
392 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
393 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
394 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
395 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
396 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
397 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
398 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
399 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
400 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
402 /* Update potential sum for this i atom from the interaction with this j atom. */
403 velecsum = _mm256_add_pd(velecsum,velec);
407 /* Calculate temporary vectorial force */
408 tx = _mm256_mul_pd(fscal,dx02);
409 ty = _mm256_mul_pd(fscal,dy02);
410 tz = _mm256_mul_pd(fscal,dz02);
412 /* Update vectorial force */
413 fix0 = _mm256_add_pd(fix0,tx);
414 fiy0 = _mm256_add_pd(fiy0,ty);
415 fiz0 = _mm256_add_pd(fiz0,tz);
417 fjx2 = _mm256_add_pd(fjx2,tx);
418 fjy2 = _mm256_add_pd(fjy2,ty);
419 fjz2 = _mm256_add_pd(fjz2,tz);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 r10 = _mm256_mul_pd(rsq10,rinv10);
427 /* EWALD ELECTROSTATICS */
429 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
430 ewrt = _mm256_mul_pd(r10,ewtabscale);
431 ewitab = _mm256_cvttpd_epi32(ewrt);
432 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
433 ewitab = _mm_slli_epi32(ewitab,2);
434 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
435 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
436 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
437 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
438 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
439 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
440 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
441 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
442 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
444 /* Update potential sum for this i atom from the interaction with this j atom. */
445 velecsum = _mm256_add_pd(velecsum,velec);
449 /* Calculate temporary vectorial force */
450 tx = _mm256_mul_pd(fscal,dx10);
451 ty = _mm256_mul_pd(fscal,dy10);
452 tz = _mm256_mul_pd(fscal,dz10);
454 /* Update vectorial force */
455 fix1 = _mm256_add_pd(fix1,tx);
456 fiy1 = _mm256_add_pd(fiy1,ty);
457 fiz1 = _mm256_add_pd(fiz1,tz);
459 fjx0 = _mm256_add_pd(fjx0,tx);
460 fjy0 = _mm256_add_pd(fjy0,ty);
461 fjz0 = _mm256_add_pd(fjz0,tz);
463 /**************************
464 * CALCULATE INTERACTIONS *
465 **************************/
467 r11 = _mm256_mul_pd(rsq11,rinv11);
469 /* EWALD ELECTROSTATICS */
471 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
472 ewrt = _mm256_mul_pd(r11,ewtabscale);
473 ewitab = _mm256_cvttpd_epi32(ewrt);
474 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
475 ewitab = _mm_slli_epi32(ewitab,2);
476 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
477 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
478 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
479 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
480 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
481 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
482 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
483 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
484 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
486 /* Update potential sum for this i atom from the interaction with this j atom. */
487 velecsum = _mm256_add_pd(velecsum,velec);
491 /* Calculate temporary vectorial force */
492 tx = _mm256_mul_pd(fscal,dx11);
493 ty = _mm256_mul_pd(fscal,dy11);
494 tz = _mm256_mul_pd(fscal,dz11);
496 /* Update vectorial force */
497 fix1 = _mm256_add_pd(fix1,tx);
498 fiy1 = _mm256_add_pd(fiy1,ty);
499 fiz1 = _mm256_add_pd(fiz1,tz);
501 fjx1 = _mm256_add_pd(fjx1,tx);
502 fjy1 = _mm256_add_pd(fjy1,ty);
503 fjz1 = _mm256_add_pd(fjz1,tz);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 r12 = _mm256_mul_pd(rsq12,rinv12);
511 /* EWALD ELECTROSTATICS */
513 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
514 ewrt = _mm256_mul_pd(r12,ewtabscale);
515 ewitab = _mm256_cvttpd_epi32(ewrt);
516 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
517 ewitab = _mm_slli_epi32(ewitab,2);
518 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
519 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
520 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
521 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
522 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
523 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
524 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
525 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
526 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velecsum = _mm256_add_pd(velecsum,velec);
533 /* Calculate temporary vectorial force */
534 tx = _mm256_mul_pd(fscal,dx12);
535 ty = _mm256_mul_pd(fscal,dy12);
536 tz = _mm256_mul_pd(fscal,dz12);
538 /* Update vectorial force */
539 fix1 = _mm256_add_pd(fix1,tx);
540 fiy1 = _mm256_add_pd(fiy1,ty);
541 fiz1 = _mm256_add_pd(fiz1,tz);
543 fjx2 = _mm256_add_pd(fjx2,tx);
544 fjy2 = _mm256_add_pd(fjy2,ty);
545 fjz2 = _mm256_add_pd(fjz2,tz);
547 /**************************
548 * CALCULATE INTERACTIONS *
549 **************************/
551 r20 = _mm256_mul_pd(rsq20,rinv20);
553 /* EWALD ELECTROSTATICS */
555 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
556 ewrt = _mm256_mul_pd(r20,ewtabscale);
557 ewitab = _mm256_cvttpd_epi32(ewrt);
558 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
559 ewitab = _mm_slli_epi32(ewitab,2);
560 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
561 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
562 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
563 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
564 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
565 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
566 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
567 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
568 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
570 /* Update potential sum for this i atom from the interaction with this j atom. */
571 velecsum = _mm256_add_pd(velecsum,velec);
575 /* Calculate temporary vectorial force */
576 tx = _mm256_mul_pd(fscal,dx20);
577 ty = _mm256_mul_pd(fscal,dy20);
578 tz = _mm256_mul_pd(fscal,dz20);
580 /* Update vectorial force */
581 fix2 = _mm256_add_pd(fix2,tx);
582 fiy2 = _mm256_add_pd(fiy2,ty);
583 fiz2 = _mm256_add_pd(fiz2,tz);
585 fjx0 = _mm256_add_pd(fjx0,tx);
586 fjy0 = _mm256_add_pd(fjy0,ty);
587 fjz0 = _mm256_add_pd(fjz0,tz);
589 /**************************
590 * CALCULATE INTERACTIONS *
591 **************************/
593 r21 = _mm256_mul_pd(rsq21,rinv21);
595 /* EWALD ELECTROSTATICS */
597 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
598 ewrt = _mm256_mul_pd(r21,ewtabscale);
599 ewitab = _mm256_cvttpd_epi32(ewrt);
600 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
601 ewitab = _mm_slli_epi32(ewitab,2);
602 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
603 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
604 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
605 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
606 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
607 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
608 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
609 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
610 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
612 /* Update potential sum for this i atom from the interaction with this j atom. */
613 velecsum = _mm256_add_pd(velecsum,velec);
617 /* Calculate temporary vectorial force */
618 tx = _mm256_mul_pd(fscal,dx21);
619 ty = _mm256_mul_pd(fscal,dy21);
620 tz = _mm256_mul_pd(fscal,dz21);
622 /* Update vectorial force */
623 fix2 = _mm256_add_pd(fix2,tx);
624 fiy2 = _mm256_add_pd(fiy2,ty);
625 fiz2 = _mm256_add_pd(fiz2,tz);
627 fjx1 = _mm256_add_pd(fjx1,tx);
628 fjy1 = _mm256_add_pd(fjy1,ty);
629 fjz1 = _mm256_add_pd(fjz1,tz);
631 /**************************
632 * CALCULATE INTERACTIONS *
633 **************************/
635 r22 = _mm256_mul_pd(rsq22,rinv22);
637 /* EWALD ELECTROSTATICS */
639 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
640 ewrt = _mm256_mul_pd(r22,ewtabscale);
641 ewitab = _mm256_cvttpd_epi32(ewrt);
642 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
643 ewitab = _mm_slli_epi32(ewitab,2);
644 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
645 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
646 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
647 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
648 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
649 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
650 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
651 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
652 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
654 /* Update potential sum for this i atom from the interaction with this j atom. */
655 velecsum = _mm256_add_pd(velecsum,velec);
659 /* Calculate temporary vectorial force */
660 tx = _mm256_mul_pd(fscal,dx22);
661 ty = _mm256_mul_pd(fscal,dy22);
662 tz = _mm256_mul_pd(fscal,dz22);
664 /* Update vectorial force */
665 fix2 = _mm256_add_pd(fix2,tx);
666 fiy2 = _mm256_add_pd(fiy2,ty);
667 fiz2 = _mm256_add_pd(fiz2,tz);
669 fjx2 = _mm256_add_pd(fjx2,tx);
670 fjy2 = _mm256_add_pd(fjy2,ty);
671 fjz2 = _mm256_add_pd(fjz2,tz);
673 fjptrA = f+j_coord_offsetA;
674 fjptrB = f+j_coord_offsetB;
675 fjptrC = f+j_coord_offsetC;
676 fjptrD = f+j_coord_offsetD;
678 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
679 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
681 /* Inner loop uses 369 flops */
687 /* Get j neighbor index, and coordinate index */
688 jnrlistA = jjnr[jidx];
689 jnrlistB = jjnr[jidx+1];
690 jnrlistC = jjnr[jidx+2];
691 jnrlistD = jjnr[jidx+3];
692 /* Sign of each element will be negative for non-real atoms.
693 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
694 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
696 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
698 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
699 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
700 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
702 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
703 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
704 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
705 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
706 j_coord_offsetA = DIM*jnrA;
707 j_coord_offsetB = DIM*jnrB;
708 j_coord_offsetC = DIM*jnrC;
709 j_coord_offsetD = DIM*jnrD;
711 /* load j atom coordinates */
712 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
713 x+j_coord_offsetC,x+j_coord_offsetD,
714 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
716 /* Calculate displacement vector */
717 dx00 = _mm256_sub_pd(ix0,jx0);
718 dy00 = _mm256_sub_pd(iy0,jy0);
719 dz00 = _mm256_sub_pd(iz0,jz0);
720 dx01 = _mm256_sub_pd(ix0,jx1);
721 dy01 = _mm256_sub_pd(iy0,jy1);
722 dz01 = _mm256_sub_pd(iz0,jz1);
723 dx02 = _mm256_sub_pd(ix0,jx2);
724 dy02 = _mm256_sub_pd(iy0,jy2);
725 dz02 = _mm256_sub_pd(iz0,jz2);
726 dx10 = _mm256_sub_pd(ix1,jx0);
727 dy10 = _mm256_sub_pd(iy1,jy0);
728 dz10 = _mm256_sub_pd(iz1,jz0);
729 dx11 = _mm256_sub_pd(ix1,jx1);
730 dy11 = _mm256_sub_pd(iy1,jy1);
731 dz11 = _mm256_sub_pd(iz1,jz1);
732 dx12 = _mm256_sub_pd(ix1,jx2);
733 dy12 = _mm256_sub_pd(iy1,jy2);
734 dz12 = _mm256_sub_pd(iz1,jz2);
735 dx20 = _mm256_sub_pd(ix2,jx0);
736 dy20 = _mm256_sub_pd(iy2,jy0);
737 dz20 = _mm256_sub_pd(iz2,jz0);
738 dx21 = _mm256_sub_pd(ix2,jx1);
739 dy21 = _mm256_sub_pd(iy2,jy1);
740 dz21 = _mm256_sub_pd(iz2,jz1);
741 dx22 = _mm256_sub_pd(ix2,jx2);
742 dy22 = _mm256_sub_pd(iy2,jy2);
743 dz22 = _mm256_sub_pd(iz2,jz2);
745 /* Calculate squared distance and things based on it */
746 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
747 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
748 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
749 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
750 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
751 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
752 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
753 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
754 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
756 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
757 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
758 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
759 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
760 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
761 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
762 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
763 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
764 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
766 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
767 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
768 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
769 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
770 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
771 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
772 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
773 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
774 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
776 fjx0 = _mm256_setzero_pd();
777 fjy0 = _mm256_setzero_pd();
778 fjz0 = _mm256_setzero_pd();
779 fjx1 = _mm256_setzero_pd();
780 fjy1 = _mm256_setzero_pd();
781 fjz1 = _mm256_setzero_pd();
782 fjx2 = _mm256_setzero_pd();
783 fjy2 = _mm256_setzero_pd();
784 fjz2 = _mm256_setzero_pd();
786 /**************************
787 * CALCULATE INTERACTIONS *
788 **************************/
790 r00 = _mm256_mul_pd(rsq00,rinv00);
791 r00 = _mm256_andnot_pd(dummy_mask,r00);
793 /* EWALD ELECTROSTATICS */
795 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
796 ewrt = _mm256_mul_pd(r00,ewtabscale);
797 ewitab = _mm256_cvttpd_epi32(ewrt);
798 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
799 ewitab = _mm_slli_epi32(ewitab,2);
800 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
801 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
802 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
803 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
804 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
805 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
806 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
807 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
808 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec = _mm256_andnot_pd(dummy_mask,velec);
812 velecsum = _mm256_add_pd(velecsum,velec);
816 fscal = _mm256_andnot_pd(dummy_mask,fscal);
818 /* Calculate temporary vectorial force */
819 tx = _mm256_mul_pd(fscal,dx00);
820 ty = _mm256_mul_pd(fscal,dy00);
821 tz = _mm256_mul_pd(fscal,dz00);
823 /* Update vectorial force */
824 fix0 = _mm256_add_pd(fix0,tx);
825 fiy0 = _mm256_add_pd(fiy0,ty);
826 fiz0 = _mm256_add_pd(fiz0,tz);
828 fjx0 = _mm256_add_pd(fjx0,tx);
829 fjy0 = _mm256_add_pd(fjy0,ty);
830 fjz0 = _mm256_add_pd(fjz0,tz);
832 /**************************
833 * CALCULATE INTERACTIONS *
834 **************************/
836 r01 = _mm256_mul_pd(rsq01,rinv01);
837 r01 = _mm256_andnot_pd(dummy_mask,r01);
839 /* EWALD ELECTROSTATICS */
841 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
842 ewrt = _mm256_mul_pd(r01,ewtabscale);
843 ewitab = _mm256_cvttpd_epi32(ewrt);
844 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
845 ewitab = _mm_slli_epi32(ewitab,2);
846 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
847 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
848 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
849 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
850 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
851 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
852 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
853 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
854 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
856 /* Update potential sum for this i atom from the interaction with this j atom. */
857 velec = _mm256_andnot_pd(dummy_mask,velec);
858 velecsum = _mm256_add_pd(velecsum,velec);
862 fscal = _mm256_andnot_pd(dummy_mask,fscal);
864 /* Calculate temporary vectorial force */
865 tx = _mm256_mul_pd(fscal,dx01);
866 ty = _mm256_mul_pd(fscal,dy01);
867 tz = _mm256_mul_pd(fscal,dz01);
869 /* Update vectorial force */
870 fix0 = _mm256_add_pd(fix0,tx);
871 fiy0 = _mm256_add_pd(fiy0,ty);
872 fiz0 = _mm256_add_pd(fiz0,tz);
874 fjx1 = _mm256_add_pd(fjx1,tx);
875 fjy1 = _mm256_add_pd(fjy1,ty);
876 fjz1 = _mm256_add_pd(fjz1,tz);
878 /**************************
879 * CALCULATE INTERACTIONS *
880 **************************/
882 r02 = _mm256_mul_pd(rsq02,rinv02);
883 r02 = _mm256_andnot_pd(dummy_mask,r02);
885 /* EWALD ELECTROSTATICS */
887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
888 ewrt = _mm256_mul_pd(r02,ewtabscale);
889 ewitab = _mm256_cvttpd_epi32(ewrt);
890 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
891 ewitab = _mm_slli_epi32(ewitab,2);
892 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
893 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
894 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
895 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
896 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
897 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
898 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
899 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
900 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
902 /* Update potential sum for this i atom from the interaction with this j atom. */
903 velec = _mm256_andnot_pd(dummy_mask,velec);
904 velecsum = _mm256_add_pd(velecsum,velec);
908 fscal = _mm256_andnot_pd(dummy_mask,fscal);
910 /* Calculate temporary vectorial force */
911 tx = _mm256_mul_pd(fscal,dx02);
912 ty = _mm256_mul_pd(fscal,dy02);
913 tz = _mm256_mul_pd(fscal,dz02);
915 /* Update vectorial force */
916 fix0 = _mm256_add_pd(fix0,tx);
917 fiy0 = _mm256_add_pd(fiy0,ty);
918 fiz0 = _mm256_add_pd(fiz0,tz);
920 fjx2 = _mm256_add_pd(fjx2,tx);
921 fjy2 = _mm256_add_pd(fjy2,ty);
922 fjz2 = _mm256_add_pd(fjz2,tz);
924 /**************************
925 * CALCULATE INTERACTIONS *
926 **************************/
928 r10 = _mm256_mul_pd(rsq10,rinv10);
929 r10 = _mm256_andnot_pd(dummy_mask,r10);
931 /* EWALD ELECTROSTATICS */
933 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
934 ewrt = _mm256_mul_pd(r10,ewtabscale);
935 ewitab = _mm256_cvttpd_epi32(ewrt);
936 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
937 ewitab = _mm_slli_epi32(ewitab,2);
938 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
939 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
940 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
941 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
942 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
943 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
944 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
945 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
946 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
948 /* Update potential sum for this i atom from the interaction with this j atom. */
949 velec = _mm256_andnot_pd(dummy_mask,velec);
950 velecsum = _mm256_add_pd(velecsum,velec);
954 fscal = _mm256_andnot_pd(dummy_mask,fscal);
956 /* Calculate temporary vectorial force */
957 tx = _mm256_mul_pd(fscal,dx10);
958 ty = _mm256_mul_pd(fscal,dy10);
959 tz = _mm256_mul_pd(fscal,dz10);
961 /* Update vectorial force */
962 fix1 = _mm256_add_pd(fix1,tx);
963 fiy1 = _mm256_add_pd(fiy1,ty);
964 fiz1 = _mm256_add_pd(fiz1,tz);
966 fjx0 = _mm256_add_pd(fjx0,tx);
967 fjy0 = _mm256_add_pd(fjy0,ty);
968 fjz0 = _mm256_add_pd(fjz0,tz);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 r11 = _mm256_mul_pd(rsq11,rinv11);
975 r11 = _mm256_andnot_pd(dummy_mask,r11);
977 /* EWALD ELECTROSTATICS */
979 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
980 ewrt = _mm256_mul_pd(r11,ewtabscale);
981 ewitab = _mm256_cvttpd_epi32(ewrt);
982 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
983 ewitab = _mm_slli_epi32(ewitab,2);
984 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
985 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
986 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
987 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
988 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
989 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
990 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
991 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
992 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
994 /* Update potential sum for this i atom from the interaction with this j atom. */
995 velec = _mm256_andnot_pd(dummy_mask,velec);
996 velecsum = _mm256_add_pd(velecsum,velec);
1000 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1002 /* Calculate temporary vectorial force */
1003 tx = _mm256_mul_pd(fscal,dx11);
1004 ty = _mm256_mul_pd(fscal,dy11);
1005 tz = _mm256_mul_pd(fscal,dz11);
1007 /* Update vectorial force */
1008 fix1 = _mm256_add_pd(fix1,tx);
1009 fiy1 = _mm256_add_pd(fiy1,ty);
1010 fiz1 = _mm256_add_pd(fiz1,tz);
1012 fjx1 = _mm256_add_pd(fjx1,tx);
1013 fjy1 = _mm256_add_pd(fjy1,ty);
1014 fjz1 = _mm256_add_pd(fjz1,tz);
1016 /**************************
1017 * CALCULATE INTERACTIONS *
1018 **************************/
1020 r12 = _mm256_mul_pd(rsq12,rinv12);
1021 r12 = _mm256_andnot_pd(dummy_mask,r12);
1023 /* EWALD ELECTROSTATICS */
1025 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1026 ewrt = _mm256_mul_pd(r12,ewtabscale);
1027 ewitab = _mm256_cvttpd_epi32(ewrt);
1028 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1029 ewitab = _mm_slli_epi32(ewitab,2);
1030 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1031 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1032 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1033 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1034 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1035 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1036 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1037 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1038 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1040 /* Update potential sum for this i atom from the interaction with this j atom. */
1041 velec = _mm256_andnot_pd(dummy_mask,velec);
1042 velecsum = _mm256_add_pd(velecsum,velec);
1046 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1048 /* Calculate temporary vectorial force */
1049 tx = _mm256_mul_pd(fscal,dx12);
1050 ty = _mm256_mul_pd(fscal,dy12);
1051 tz = _mm256_mul_pd(fscal,dz12);
1053 /* Update vectorial force */
1054 fix1 = _mm256_add_pd(fix1,tx);
1055 fiy1 = _mm256_add_pd(fiy1,ty);
1056 fiz1 = _mm256_add_pd(fiz1,tz);
1058 fjx2 = _mm256_add_pd(fjx2,tx);
1059 fjy2 = _mm256_add_pd(fjy2,ty);
1060 fjz2 = _mm256_add_pd(fjz2,tz);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 r20 = _mm256_mul_pd(rsq20,rinv20);
1067 r20 = _mm256_andnot_pd(dummy_mask,r20);
1069 /* EWALD ELECTROSTATICS */
1071 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1072 ewrt = _mm256_mul_pd(r20,ewtabscale);
1073 ewitab = _mm256_cvttpd_epi32(ewrt);
1074 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1075 ewitab = _mm_slli_epi32(ewitab,2);
1076 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1077 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1078 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1079 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1080 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1081 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1082 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1083 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1084 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1086 /* Update potential sum for this i atom from the interaction with this j atom. */
1087 velec = _mm256_andnot_pd(dummy_mask,velec);
1088 velecsum = _mm256_add_pd(velecsum,velec);
1092 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1094 /* Calculate temporary vectorial force */
1095 tx = _mm256_mul_pd(fscal,dx20);
1096 ty = _mm256_mul_pd(fscal,dy20);
1097 tz = _mm256_mul_pd(fscal,dz20);
1099 /* Update vectorial force */
1100 fix2 = _mm256_add_pd(fix2,tx);
1101 fiy2 = _mm256_add_pd(fiy2,ty);
1102 fiz2 = _mm256_add_pd(fiz2,tz);
1104 fjx0 = _mm256_add_pd(fjx0,tx);
1105 fjy0 = _mm256_add_pd(fjy0,ty);
1106 fjz0 = _mm256_add_pd(fjz0,tz);
1108 /**************************
1109 * CALCULATE INTERACTIONS *
1110 **************************/
1112 r21 = _mm256_mul_pd(rsq21,rinv21);
1113 r21 = _mm256_andnot_pd(dummy_mask,r21);
1115 /* EWALD ELECTROSTATICS */
1117 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1118 ewrt = _mm256_mul_pd(r21,ewtabscale);
1119 ewitab = _mm256_cvttpd_epi32(ewrt);
1120 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1121 ewitab = _mm_slli_epi32(ewitab,2);
1122 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1123 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1124 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1125 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1126 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1127 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1128 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1129 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1130 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1132 /* Update potential sum for this i atom from the interaction with this j atom. */
1133 velec = _mm256_andnot_pd(dummy_mask,velec);
1134 velecsum = _mm256_add_pd(velecsum,velec);
1138 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1140 /* Calculate temporary vectorial force */
1141 tx = _mm256_mul_pd(fscal,dx21);
1142 ty = _mm256_mul_pd(fscal,dy21);
1143 tz = _mm256_mul_pd(fscal,dz21);
1145 /* Update vectorial force */
1146 fix2 = _mm256_add_pd(fix2,tx);
1147 fiy2 = _mm256_add_pd(fiy2,ty);
1148 fiz2 = _mm256_add_pd(fiz2,tz);
1150 fjx1 = _mm256_add_pd(fjx1,tx);
1151 fjy1 = _mm256_add_pd(fjy1,ty);
1152 fjz1 = _mm256_add_pd(fjz1,tz);
1154 /**************************
1155 * CALCULATE INTERACTIONS *
1156 **************************/
1158 r22 = _mm256_mul_pd(rsq22,rinv22);
1159 r22 = _mm256_andnot_pd(dummy_mask,r22);
1161 /* EWALD ELECTROSTATICS */
1163 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1164 ewrt = _mm256_mul_pd(r22,ewtabscale);
1165 ewitab = _mm256_cvttpd_epi32(ewrt);
1166 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1167 ewitab = _mm_slli_epi32(ewitab,2);
1168 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1169 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1170 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1171 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1172 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1173 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1174 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1175 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1176 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1178 /* Update potential sum for this i atom from the interaction with this j atom. */
1179 velec = _mm256_andnot_pd(dummy_mask,velec);
1180 velecsum = _mm256_add_pd(velecsum,velec);
1184 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1186 /* Calculate temporary vectorial force */
1187 tx = _mm256_mul_pd(fscal,dx22);
1188 ty = _mm256_mul_pd(fscal,dy22);
1189 tz = _mm256_mul_pd(fscal,dz22);
1191 /* Update vectorial force */
1192 fix2 = _mm256_add_pd(fix2,tx);
1193 fiy2 = _mm256_add_pd(fiy2,ty);
1194 fiz2 = _mm256_add_pd(fiz2,tz);
1196 fjx2 = _mm256_add_pd(fjx2,tx);
1197 fjy2 = _mm256_add_pd(fjy2,ty);
1198 fjz2 = _mm256_add_pd(fjz2,tz);
1200 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1201 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1202 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1203 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1205 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1206 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1208 /* Inner loop uses 378 flops */
1211 /* End of innermost loop */
1213 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1214 f+i_coord_offset,fshift+i_shift_offset);
1217 /* Update potential energies */
1218 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1220 /* Increment number of inner iterations */
1221 inneriter += j_index_end - j_index_start;
1223 /* Outer loop uses 19 flops */
1226 /* Increment number of outer iterations */
1229 /* Update outer/inner flops */
1231 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*378);
1234 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double
1235 * Electrostatics interaction: Ewald
1236 * VdW interaction: None
1237 * Geometry: Water3-Water3
1238 * Calculate force/pot: Force
1241 nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double
1242 (t_nblist * gmx_restrict nlist,
1243 rvec * gmx_restrict xx,
1244 rvec * gmx_restrict ff,
1245 t_forcerec * gmx_restrict fr,
1246 t_mdatoms * gmx_restrict mdatoms,
1247 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1248 t_nrnb * gmx_restrict nrnb)
1250 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1251 * just 0 for non-waters.
1252 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1253 * jnr indices corresponding to data put in the four positions in the SIMD register.
1255 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1256 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1257 int jnrA,jnrB,jnrC,jnrD;
1258 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1259 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1260 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1261 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1262 real rcutoff_scalar;
1263 real *shiftvec,*fshift,*x,*f;
1264 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1265 real scratch[4*DIM];
1266 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1267 real * vdwioffsetptr0;
1268 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1269 real * vdwioffsetptr1;
1270 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1271 real * vdwioffsetptr2;
1272 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1273 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1274 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1275 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1276 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1277 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1278 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1279 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1280 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1281 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1282 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1283 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1284 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1285 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1286 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1287 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1288 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1291 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1292 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1294 __m256d dummy_mask,cutoff_mask;
1295 __m128 tmpmask0,tmpmask1;
1296 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1297 __m256d one = _mm256_set1_pd(1.0);
1298 __m256d two = _mm256_set1_pd(2.0);
1304 jindex = nlist->jindex;
1306 shiftidx = nlist->shift;
1308 shiftvec = fr->shift_vec[0];
1309 fshift = fr->fshift[0];
1310 facel = _mm256_set1_pd(fr->epsfac);
1311 charge = mdatoms->chargeA;
1313 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1314 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1315 beta2 = _mm256_mul_pd(beta,beta);
1316 beta3 = _mm256_mul_pd(beta,beta2);
1318 ewtab = fr->ic->tabq_coul_F;
1319 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1320 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1322 /* Setup water-specific parameters */
1323 inr = nlist->iinr[0];
1324 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1325 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1326 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1328 jq0 = _mm256_set1_pd(charge[inr+0]);
1329 jq1 = _mm256_set1_pd(charge[inr+1]);
1330 jq2 = _mm256_set1_pd(charge[inr+2]);
1331 qq00 = _mm256_mul_pd(iq0,jq0);
1332 qq01 = _mm256_mul_pd(iq0,jq1);
1333 qq02 = _mm256_mul_pd(iq0,jq2);
1334 qq10 = _mm256_mul_pd(iq1,jq0);
1335 qq11 = _mm256_mul_pd(iq1,jq1);
1336 qq12 = _mm256_mul_pd(iq1,jq2);
1337 qq20 = _mm256_mul_pd(iq2,jq0);
1338 qq21 = _mm256_mul_pd(iq2,jq1);
1339 qq22 = _mm256_mul_pd(iq2,jq2);
1341 /* Avoid stupid compiler warnings */
1342 jnrA = jnrB = jnrC = jnrD = 0;
1343 j_coord_offsetA = 0;
1344 j_coord_offsetB = 0;
1345 j_coord_offsetC = 0;
1346 j_coord_offsetD = 0;
1351 for(iidx=0;iidx<4*DIM;iidx++)
1353 scratch[iidx] = 0.0;
1356 /* Start outer loop over neighborlists */
1357 for(iidx=0; iidx<nri; iidx++)
1359 /* Load shift vector for this list */
1360 i_shift_offset = DIM*shiftidx[iidx];
1362 /* Load limits for loop over neighbors */
1363 j_index_start = jindex[iidx];
1364 j_index_end = jindex[iidx+1];
1366 /* Get outer coordinate index */
1368 i_coord_offset = DIM*inr;
1370 /* Load i particle coords and add shift vector */
1371 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1372 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1374 fix0 = _mm256_setzero_pd();
1375 fiy0 = _mm256_setzero_pd();
1376 fiz0 = _mm256_setzero_pd();
1377 fix1 = _mm256_setzero_pd();
1378 fiy1 = _mm256_setzero_pd();
1379 fiz1 = _mm256_setzero_pd();
1380 fix2 = _mm256_setzero_pd();
1381 fiy2 = _mm256_setzero_pd();
1382 fiz2 = _mm256_setzero_pd();
1384 /* Start inner kernel loop */
1385 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1388 /* Get j neighbor index, and coordinate index */
1390 jnrB = jjnr[jidx+1];
1391 jnrC = jjnr[jidx+2];
1392 jnrD = jjnr[jidx+3];
1393 j_coord_offsetA = DIM*jnrA;
1394 j_coord_offsetB = DIM*jnrB;
1395 j_coord_offsetC = DIM*jnrC;
1396 j_coord_offsetD = DIM*jnrD;
1398 /* load j atom coordinates */
1399 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1400 x+j_coord_offsetC,x+j_coord_offsetD,
1401 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1403 /* Calculate displacement vector */
1404 dx00 = _mm256_sub_pd(ix0,jx0);
1405 dy00 = _mm256_sub_pd(iy0,jy0);
1406 dz00 = _mm256_sub_pd(iz0,jz0);
1407 dx01 = _mm256_sub_pd(ix0,jx1);
1408 dy01 = _mm256_sub_pd(iy0,jy1);
1409 dz01 = _mm256_sub_pd(iz0,jz1);
1410 dx02 = _mm256_sub_pd(ix0,jx2);
1411 dy02 = _mm256_sub_pd(iy0,jy2);
1412 dz02 = _mm256_sub_pd(iz0,jz2);
1413 dx10 = _mm256_sub_pd(ix1,jx0);
1414 dy10 = _mm256_sub_pd(iy1,jy0);
1415 dz10 = _mm256_sub_pd(iz1,jz0);
1416 dx11 = _mm256_sub_pd(ix1,jx1);
1417 dy11 = _mm256_sub_pd(iy1,jy1);
1418 dz11 = _mm256_sub_pd(iz1,jz1);
1419 dx12 = _mm256_sub_pd(ix1,jx2);
1420 dy12 = _mm256_sub_pd(iy1,jy2);
1421 dz12 = _mm256_sub_pd(iz1,jz2);
1422 dx20 = _mm256_sub_pd(ix2,jx0);
1423 dy20 = _mm256_sub_pd(iy2,jy0);
1424 dz20 = _mm256_sub_pd(iz2,jz0);
1425 dx21 = _mm256_sub_pd(ix2,jx1);
1426 dy21 = _mm256_sub_pd(iy2,jy1);
1427 dz21 = _mm256_sub_pd(iz2,jz1);
1428 dx22 = _mm256_sub_pd(ix2,jx2);
1429 dy22 = _mm256_sub_pd(iy2,jy2);
1430 dz22 = _mm256_sub_pd(iz2,jz2);
1432 /* Calculate squared distance and things based on it */
1433 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1434 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1435 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1436 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1437 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1438 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1439 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1440 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1441 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1443 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1444 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1445 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1446 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1447 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1448 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1449 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1450 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1451 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1453 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1454 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1455 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1456 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1457 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1458 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1459 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1460 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1461 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1463 fjx0 = _mm256_setzero_pd();
1464 fjy0 = _mm256_setzero_pd();
1465 fjz0 = _mm256_setzero_pd();
1466 fjx1 = _mm256_setzero_pd();
1467 fjy1 = _mm256_setzero_pd();
1468 fjz1 = _mm256_setzero_pd();
1469 fjx2 = _mm256_setzero_pd();
1470 fjy2 = _mm256_setzero_pd();
1471 fjz2 = _mm256_setzero_pd();
1473 /**************************
1474 * CALCULATE INTERACTIONS *
1475 **************************/
1477 r00 = _mm256_mul_pd(rsq00,rinv00);
1479 /* EWALD ELECTROSTATICS */
1481 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1482 ewrt = _mm256_mul_pd(r00,ewtabscale);
1483 ewitab = _mm256_cvttpd_epi32(ewrt);
1484 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1485 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1486 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1488 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1489 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1493 /* Calculate temporary vectorial force */
1494 tx = _mm256_mul_pd(fscal,dx00);
1495 ty = _mm256_mul_pd(fscal,dy00);
1496 tz = _mm256_mul_pd(fscal,dz00);
1498 /* Update vectorial force */
1499 fix0 = _mm256_add_pd(fix0,tx);
1500 fiy0 = _mm256_add_pd(fiy0,ty);
1501 fiz0 = _mm256_add_pd(fiz0,tz);
1503 fjx0 = _mm256_add_pd(fjx0,tx);
1504 fjy0 = _mm256_add_pd(fjy0,ty);
1505 fjz0 = _mm256_add_pd(fjz0,tz);
1507 /**************************
1508 * CALCULATE INTERACTIONS *
1509 **************************/
1511 r01 = _mm256_mul_pd(rsq01,rinv01);
1513 /* EWALD ELECTROSTATICS */
1515 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1516 ewrt = _mm256_mul_pd(r01,ewtabscale);
1517 ewitab = _mm256_cvttpd_epi32(ewrt);
1518 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1519 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1520 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1522 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1523 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1527 /* Calculate temporary vectorial force */
1528 tx = _mm256_mul_pd(fscal,dx01);
1529 ty = _mm256_mul_pd(fscal,dy01);
1530 tz = _mm256_mul_pd(fscal,dz01);
1532 /* Update vectorial force */
1533 fix0 = _mm256_add_pd(fix0,tx);
1534 fiy0 = _mm256_add_pd(fiy0,ty);
1535 fiz0 = _mm256_add_pd(fiz0,tz);
1537 fjx1 = _mm256_add_pd(fjx1,tx);
1538 fjy1 = _mm256_add_pd(fjy1,ty);
1539 fjz1 = _mm256_add_pd(fjz1,tz);
1541 /**************************
1542 * CALCULATE INTERACTIONS *
1543 **************************/
1545 r02 = _mm256_mul_pd(rsq02,rinv02);
1547 /* EWALD ELECTROSTATICS */
1549 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1550 ewrt = _mm256_mul_pd(r02,ewtabscale);
1551 ewitab = _mm256_cvttpd_epi32(ewrt);
1552 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1553 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1554 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1556 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1557 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1561 /* Calculate temporary vectorial force */
1562 tx = _mm256_mul_pd(fscal,dx02);
1563 ty = _mm256_mul_pd(fscal,dy02);
1564 tz = _mm256_mul_pd(fscal,dz02);
1566 /* Update vectorial force */
1567 fix0 = _mm256_add_pd(fix0,tx);
1568 fiy0 = _mm256_add_pd(fiy0,ty);
1569 fiz0 = _mm256_add_pd(fiz0,tz);
1571 fjx2 = _mm256_add_pd(fjx2,tx);
1572 fjy2 = _mm256_add_pd(fjy2,ty);
1573 fjz2 = _mm256_add_pd(fjz2,tz);
1575 /**************************
1576 * CALCULATE INTERACTIONS *
1577 **************************/
1579 r10 = _mm256_mul_pd(rsq10,rinv10);
1581 /* EWALD ELECTROSTATICS */
1583 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1584 ewrt = _mm256_mul_pd(r10,ewtabscale);
1585 ewitab = _mm256_cvttpd_epi32(ewrt);
1586 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1587 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1588 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1590 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1591 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1595 /* Calculate temporary vectorial force */
1596 tx = _mm256_mul_pd(fscal,dx10);
1597 ty = _mm256_mul_pd(fscal,dy10);
1598 tz = _mm256_mul_pd(fscal,dz10);
1600 /* Update vectorial force */
1601 fix1 = _mm256_add_pd(fix1,tx);
1602 fiy1 = _mm256_add_pd(fiy1,ty);
1603 fiz1 = _mm256_add_pd(fiz1,tz);
1605 fjx0 = _mm256_add_pd(fjx0,tx);
1606 fjy0 = _mm256_add_pd(fjy0,ty);
1607 fjz0 = _mm256_add_pd(fjz0,tz);
1609 /**************************
1610 * CALCULATE INTERACTIONS *
1611 **************************/
1613 r11 = _mm256_mul_pd(rsq11,rinv11);
1615 /* EWALD ELECTROSTATICS */
1617 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1618 ewrt = _mm256_mul_pd(r11,ewtabscale);
1619 ewitab = _mm256_cvttpd_epi32(ewrt);
1620 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1621 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1622 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1624 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1625 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1629 /* Calculate temporary vectorial force */
1630 tx = _mm256_mul_pd(fscal,dx11);
1631 ty = _mm256_mul_pd(fscal,dy11);
1632 tz = _mm256_mul_pd(fscal,dz11);
1634 /* Update vectorial force */
1635 fix1 = _mm256_add_pd(fix1,tx);
1636 fiy1 = _mm256_add_pd(fiy1,ty);
1637 fiz1 = _mm256_add_pd(fiz1,tz);
1639 fjx1 = _mm256_add_pd(fjx1,tx);
1640 fjy1 = _mm256_add_pd(fjy1,ty);
1641 fjz1 = _mm256_add_pd(fjz1,tz);
1643 /**************************
1644 * CALCULATE INTERACTIONS *
1645 **************************/
1647 r12 = _mm256_mul_pd(rsq12,rinv12);
1649 /* EWALD ELECTROSTATICS */
1651 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1652 ewrt = _mm256_mul_pd(r12,ewtabscale);
1653 ewitab = _mm256_cvttpd_epi32(ewrt);
1654 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1655 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1656 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1658 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1659 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1663 /* Calculate temporary vectorial force */
1664 tx = _mm256_mul_pd(fscal,dx12);
1665 ty = _mm256_mul_pd(fscal,dy12);
1666 tz = _mm256_mul_pd(fscal,dz12);
1668 /* Update vectorial force */
1669 fix1 = _mm256_add_pd(fix1,tx);
1670 fiy1 = _mm256_add_pd(fiy1,ty);
1671 fiz1 = _mm256_add_pd(fiz1,tz);
1673 fjx2 = _mm256_add_pd(fjx2,tx);
1674 fjy2 = _mm256_add_pd(fjy2,ty);
1675 fjz2 = _mm256_add_pd(fjz2,tz);
1677 /**************************
1678 * CALCULATE INTERACTIONS *
1679 **************************/
1681 r20 = _mm256_mul_pd(rsq20,rinv20);
1683 /* EWALD ELECTROSTATICS */
1685 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1686 ewrt = _mm256_mul_pd(r20,ewtabscale);
1687 ewitab = _mm256_cvttpd_epi32(ewrt);
1688 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1689 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1690 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1692 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1693 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1697 /* Calculate temporary vectorial force */
1698 tx = _mm256_mul_pd(fscal,dx20);
1699 ty = _mm256_mul_pd(fscal,dy20);
1700 tz = _mm256_mul_pd(fscal,dz20);
1702 /* Update vectorial force */
1703 fix2 = _mm256_add_pd(fix2,tx);
1704 fiy2 = _mm256_add_pd(fiy2,ty);
1705 fiz2 = _mm256_add_pd(fiz2,tz);
1707 fjx0 = _mm256_add_pd(fjx0,tx);
1708 fjy0 = _mm256_add_pd(fjy0,ty);
1709 fjz0 = _mm256_add_pd(fjz0,tz);
1711 /**************************
1712 * CALCULATE INTERACTIONS *
1713 **************************/
1715 r21 = _mm256_mul_pd(rsq21,rinv21);
1717 /* EWALD ELECTROSTATICS */
1719 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1720 ewrt = _mm256_mul_pd(r21,ewtabscale);
1721 ewitab = _mm256_cvttpd_epi32(ewrt);
1722 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1723 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1724 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1726 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1727 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1731 /* Calculate temporary vectorial force */
1732 tx = _mm256_mul_pd(fscal,dx21);
1733 ty = _mm256_mul_pd(fscal,dy21);
1734 tz = _mm256_mul_pd(fscal,dz21);
1736 /* Update vectorial force */
1737 fix2 = _mm256_add_pd(fix2,tx);
1738 fiy2 = _mm256_add_pd(fiy2,ty);
1739 fiz2 = _mm256_add_pd(fiz2,tz);
1741 fjx1 = _mm256_add_pd(fjx1,tx);
1742 fjy1 = _mm256_add_pd(fjy1,ty);
1743 fjz1 = _mm256_add_pd(fjz1,tz);
1745 /**************************
1746 * CALCULATE INTERACTIONS *
1747 **************************/
1749 r22 = _mm256_mul_pd(rsq22,rinv22);
1751 /* EWALD ELECTROSTATICS */
1753 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1754 ewrt = _mm256_mul_pd(r22,ewtabscale);
1755 ewitab = _mm256_cvttpd_epi32(ewrt);
1756 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1757 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1758 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1760 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1761 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1765 /* Calculate temporary vectorial force */
1766 tx = _mm256_mul_pd(fscal,dx22);
1767 ty = _mm256_mul_pd(fscal,dy22);
1768 tz = _mm256_mul_pd(fscal,dz22);
1770 /* Update vectorial force */
1771 fix2 = _mm256_add_pd(fix2,tx);
1772 fiy2 = _mm256_add_pd(fiy2,ty);
1773 fiz2 = _mm256_add_pd(fiz2,tz);
1775 fjx2 = _mm256_add_pd(fjx2,tx);
1776 fjy2 = _mm256_add_pd(fjy2,ty);
1777 fjz2 = _mm256_add_pd(fjz2,tz);
1779 fjptrA = f+j_coord_offsetA;
1780 fjptrB = f+j_coord_offsetB;
1781 fjptrC = f+j_coord_offsetC;
1782 fjptrD = f+j_coord_offsetD;
1784 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1785 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1787 /* Inner loop uses 324 flops */
1790 if(jidx<j_index_end)
1793 /* Get j neighbor index, and coordinate index */
1794 jnrlistA = jjnr[jidx];
1795 jnrlistB = jjnr[jidx+1];
1796 jnrlistC = jjnr[jidx+2];
1797 jnrlistD = jjnr[jidx+3];
1798 /* Sign of each element will be negative for non-real atoms.
1799 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1800 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1802 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1804 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1805 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1806 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1808 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1809 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1810 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1811 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1812 j_coord_offsetA = DIM*jnrA;
1813 j_coord_offsetB = DIM*jnrB;
1814 j_coord_offsetC = DIM*jnrC;
1815 j_coord_offsetD = DIM*jnrD;
1817 /* load j atom coordinates */
1818 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1819 x+j_coord_offsetC,x+j_coord_offsetD,
1820 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1822 /* Calculate displacement vector */
1823 dx00 = _mm256_sub_pd(ix0,jx0);
1824 dy00 = _mm256_sub_pd(iy0,jy0);
1825 dz00 = _mm256_sub_pd(iz0,jz0);
1826 dx01 = _mm256_sub_pd(ix0,jx1);
1827 dy01 = _mm256_sub_pd(iy0,jy1);
1828 dz01 = _mm256_sub_pd(iz0,jz1);
1829 dx02 = _mm256_sub_pd(ix0,jx2);
1830 dy02 = _mm256_sub_pd(iy0,jy2);
1831 dz02 = _mm256_sub_pd(iz0,jz2);
1832 dx10 = _mm256_sub_pd(ix1,jx0);
1833 dy10 = _mm256_sub_pd(iy1,jy0);
1834 dz10 = _mm256_sub_pd(iz1,jz0);
1835 dx11 = _mm256_sub_pd(ix1,jx1);
1836 dy11 = _mm256_sub_pd(iy1,jy1);
1837 dz11 = _mm256_sub_pd(iz1,jz1);
1838 dx12 = _mm256_sub_pd(ix1,jx2);
1839 dy12 = _mm256_sub_pd(iy1,jy2);
1840 dz12 = _mm256_sub_pd(iz1,jz2);
1841 dx20 = _mm256_sub_pd(ix2,jx0);
1842 dy20 = _mm256_sub_pd(iy2,jy0);
1843 dz20 = _mm256_sub_pd(iz2,jz0);
1844 dx21 = _mm256_sub_pd(ix2,jx1);
1845 dy21 = _mm256_sub_pd(iy2,jy1);
1846 dz21 = _mm256_sub_pd(iz2,jz1);
1847 dx22 = _mm256_sub_pd(ix2,jx2);
1848 dy22 = _mm256_sub_pd(iy2,jy2);
1849 dz22 = _mm256_sub_pd(iz2,jz2);
1851 /* Calculate squared distance and things based on it */
1852 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1853 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1854 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1855 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1856 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1857 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1858 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1859 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1860 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1862 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1863 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1864 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1865 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1866 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1867 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1868 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1869 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1870 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1872 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1873 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1874 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1875 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1876 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1877 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1878 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1879 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1880 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1882 fjx0 = _mm256_setzero_pd();
1883 fjy0 = _mm256_setzero_pd();
1884 fjz0 = _mm256_setzero_pd();
1885 fjx1 = _mm256_setzero_pd();
1886 fjy1 = _mm256_setzero_pd();
1887 fjz1 = _mm256_setzero_pd();
1888 fjx2 = _mm256_setzero_pd();
1889 fjy2 = _mm256_setzero_pd();
1890 fjz2 = _mm256_setzero_pd();
1892 /**************************
1893 * CALCULATE INTERACTIONS *
1894 **************************/
1896 r00 = _mm256_mul_pd(rsq00,rinv00);
1897 r00 = _mm256_andnot_pd(dummy_mask,r00);
1899 /* EWALD ELECTROSTATICS */
1901 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1902 ewrt = _mm256_mul_pd(r00,ewtabscale);
1903 ewitab = _mm256_cvttpd_epi32(ewrt);
1904 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1905 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1906 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1908 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1909 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1913 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1915 /* Calculate temporary vectorial force */
1916 tx = _mm256_mul_pd(fscal,dx00);
1917 ty = _mm256_mul_pd(fscal,dy00);
1918 tz = _mm256_mul_pd(fscal,dz00);
1920 /* Update vectorial force */
1921 fix0 = _mm256_add_pd(fix0,tx);
1922 fiy0 = _mm256_add_pd(fiy0,ty);
1923 fiz0 = _mm256_add_pd(fiz0,tz);
1925 fjx0 = _mm256_add_pd(fjx0,tx);
1926 fjy0 = _mm256_add_pd(fjy0,ty);
1927 fjz0 = _mm256_add_pd(fjz0,tz);
1929 /**************************
1930 * CALCULATE INTERACTIONS *
1931 **************************/
1933 r01 = _mm256_mul_pd(rsq01,rinv01);
1934 r01 = _mm256_andnot_pd(dummy_mask,r01);
1936 /* EWALD ELECTROSTATICS */
1938 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1939 ewrt = _mm256_mul_pd(r01,ewtabscale);
1940 ewitab = _mm256_cvttpd_epi32(ewrt);
1941 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1942 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1943 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1945 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1946 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1950 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1952 /* Calculate temporary vectorial force */
1953 tx = _mm256_mul_pd(fscal,dx01);
1954 ty = _mm256_mul_pd(fscal,dy01);
1955 tz = _mm256_mul_pd(fscal,dz01);
1957 /* Update vectorial force */
1958 fix0 = _mm256_add_pd(fix0,tx);
1959 fiy0 = _mm256_add_pd(fiy0,ty);
1960 fiz0 = _mm256_add_pd(fiz0,tz);
1962 fjx1 = _mm256_add_pd(fjx1,tx);
1963 fjy1 = _mm256_add_pd(fjy1,ty);
1964 fjz1 = _mm256_add_pd(fjz1,tz);
1966 /**************************
1967 * CALCULATE INTERACTIONS *
1968 **************************/
1970 r02 = _mm256_mul_pd(rsq02,rinv02);
1971 r02 = _mm256_andnot_pd(dummy_mask,r02);
1973 /* EWALD ELECTROSTATICS */
1975 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1976 ewrt = _mm256_mul_pd(r02,ewtabscale);
1977 ewitab = _mm256_cvttpd_epi32(ewrt);
1978 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1979 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1980 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1982 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1983 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1987 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1989 /* Calculate temporary vectorial force */
1990 tx = _mm256_mul_pd(fscal,dx02);
1991 ty = _mm256_mul_pd(fscal,dy02);
1992 tz = _mm256_mul_pd(fscal,dz02);
1994 /* Update vectorial force */
1995 fix0 = _mm256_add_pd(fix0,tx);
1996 fiy0 = _mm256_add_pd(fiy0,ty);
1997 fiz0 = _mm256_add_pd(fiz0,tz);
1999 fjx2 = _mm256_add_pd(fjx2,tx);
2000 fjy2 = _mm256_add_pd(fjy2,ty);
2001 fjz2 = _mm256_add_pd(fjz2,tz);
2003 /**************************
2004 * CALCULATE INTERACTIONS *
2005 **************************/
2007 r10 = _mm256_mul_pd(rsq10,rinv10);
2008 r10 = _mm256_andnot_pd(dummy_mask,r10);
2010 /* EWALD ELECTROSTATICS */
2012 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2013 ewrt = _mm256_mul_pd(r10,ewtabscale);
2014 ewitab = _mm256_cvttpd_epi32(ewrt);
2015 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2016 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2017 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2019 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2020 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2024 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2026 /* Calculate temporary vectorial force */
2027 tx = _mm256_mul_pd(fscal,dx10);
2028 ty = _mm256_mul_pd(fscal,dy10);
2029 tz = _mm256_mul_pd(fscal,dz10);
2031 /* Update vectorial force */
2032 fix1 = _mm256_add_pd(fix1,tx);
2033 fiy1 = _mm256_add_pd(fiy1,ty);
2034 fiz1 = _mm256_add_pd(fiz1,tz);
2036 fjx0 = _mm256_add_pd(fjx0,tx);
2037 fjy0 = _mm256_add_pd(fjy0,ty);
2038 fjz0 = _mm256_add_pd(fjz0,tz);
2040 /**************************
2041 * CALCULATE INTERACTIONS *
2042 **************************/
2044 r11 = _mm256_mul_pd(rsq11,rinv11);
2045 r11 = _mm256_andnot_pd(dummy_mask,r11);
2047 /* EWALD ELECTROSTATICS */
2049 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2050 ewrt = _mm256_mul_pd(r11,ewtabscale);
2051 ewitab = _mm256_cvttpd_epi32(ewrt);
2052 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2053 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2054 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2056 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2057 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2061 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2063 /* Calculate temporary vectorial force */
2064 tx = _mm256_mul_pd(fscal,dx11);
2065 ty = _mm256_mul_pd(fscal,dy11);
2066 tz = _mm256_mul_pd(fscal,dz11);
2068 /* Update vectorial force */
2069 fix1 = _mm256_add_pd(fix1,tx);
2070 fiy1 = _mm256_add_pd(fiy1,ty);
2071 fiz1 = _mm256_add_pd(fiz1,tz);
2073 fjx1 = _mm256_add_pd(fjx1,tx);
2074 fjy1 = _mm256_add_pd(fjy1,ty);
2075 fjz1 = _mm256_add_pd(fjz1,tz);
2077 /**************************
2078 * CALCULATE INTERACTIONS *
2079 **************************/
2081 r12 = _mm256_mul_pd(rsq12,rinv12);
2082 r12 = _mm256_andnot_pd(dummy_mask,r12);
2084 /* EWALD ELECTROSTATICS */
2086 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2087 ewrt = _mm256_mul_pd(r12,ewtabscale);
2088 ewitab = _mm256_cvttpd_epi32(ewrt);
2089 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2090 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2091 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2093 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2094 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2098 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2100 /* Calculate temporary vectorial force */
2101 tx = _mm256_mul_pd(fscal,dx12);
2102 ty = _mm256_mul_pd(fscal,dy12);
2103 tz = _mm256_mul_pd(fscal,dz12);
2105 /* Update vectorial force */
2106 fix1 = _mm256_add_pd(fix1,tx);
2107 fiy1 = _mm256_add_pd(fiy1,ty);
2108 fiz1 = _mm256_add_pd(fiz1,tz);
2110 fjx2 = _mm256_add_pd(fjx2,tx);
2111 fjy2 = _mm256_add_pd(fjy2,ty);
2112 fjz2 = _mm256_add_pd(fjz2,tz);
2114 /**************************
2115 * CALCULATE INTERACTIONS *
2116 **************************/
2118 r20 = _mm256_mul_pd(rsq20,rinv20);
2119 r20 = _mm256_andnot_pd(dummy_mask,r20);
2121 /* EWALD ELECTROSTATICS */
2123 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2124 ewrt = _mm256_mul_pd(r20,ewtabscale);
2125 ewitab = _mm256_cvttpd_epi32(ewrt);
2126 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2127 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2128 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2130 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2131 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2135 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2137 /* Calculate temporary vectorial force */
2138 tx = _mm256_mul_pd(fscal,dx20);
2139 ty = _mm256_mul_pd(fscal,dy20);
2140 tz = _mm256_mul_pd(fscal,dz20);
2142 /* Update vectorial force */
2143 fix2 = _mm256_add_pd(fix2,tx);
2144 fiy2 = _mm256_add_pd(fiy2,ty);
2145 fiz2 = _mm256_add_pd(fiz2,tz);
2147 fjx0 = _mm256_add_pd(fjx0,tx);
2148 fjy0 = _mm256_add_pd(fjy0,ty);
2149 fjz0 = _mm256_add_pd(fjz0,tz);
2151 /**************************
2152 * CALCULATE INTERACTIONS *
2153 **************************/
2155 r21 = _mm256_mul_pd(rsq21,rinv21);
2156 r21 = _mm256_andnot_pd(dummy_mask,r21);
2158 /* EWALD ELECTROSTATICS */
2160 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2161 ewrt = _mm256_mul_pd(r21,ewtabscale);
2162 ewitab = _mm256_cvttpd_epi32(ewrt);
2163 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2164 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2165 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2167 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2168 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2172 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2174 /* Calculate temporary vectorial force */
2175 tx = _mm256_mul_pd(fscal,dx21);
2176 ty = _mm256_mul_pd(fscal,dy21);
2177 tz = _mm256_mul_pd(fscal,dz21);
2179 /* Update vectorial force */
2180 fix2 = _mm256_add_pd(fix2,tx);
2181 fiy2 = _mm256_add_pd(fiy2,ty);
2182 fiz2 = _mm256_add_pd(fiz2,tz);
2184 fjx1 = _mm256_add_pd(fjx1,tx);
2185 fjy1 = _mm256_add_pd(fjy1,ty);
2186 fjz1 = _mm256_add_pd(fjz1,tz);
2188 /**************************
2189 * CALCULATE INTERACTIONS *
2190 **************************/
2192 r22 = _mm256_mul_pd(rsq22,rinv22);
2193 r22 = _mm256_andnot_pd(dummy_mask,r22);
2195 /* EWALD ELECTROSTATICS */
2197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2198 ewrt = _mm256_mul_pd(r22,ewtabscale);
2199 ewitab = _mm256_cvttpd_epi32(ewrt);
2200 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2201 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2202 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2204 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2205 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2209 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2211 /* Calculate temporary vectorial force */
2212 tx = _mm256_mul_pd(fscal,dx22);
2213 ty = _mm256_mul_pd(fscal,dy22);
2214 tz = _mm256_mul_pd(fscal,dz22);
2216 /* Update vectorial force */
2217 fix2 = _mm256_add_pd(fix2,tx);
2218 fiy2 = _mm256_add_pd(fiy2,ty);
2219 fiz2 = _mm256_add_pd(fiz2,tz);
2221 fjx2 = _mm256_add_pd(fjx2,tx);
2222 fjy2 = _mm256_add_pd(fjy2,ty);
2223 fjz2 = _mm256_add_pd(fjz2,tz);
2225 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2226 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2227 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2228 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2230 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2231 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2233 /* Inner loop uses 333 flops */
2236 /* End of innermost loop */
2238 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2239 f+i_coord_offset,fshift+i_shift_offset);
2241 /* Increment number of inner iterations */
2242 inneriter += j_index_end - j_index_start;
2244 /* Outer loop uses 18 flops */
2247 /* Increment number of outer iterations */
2250 /* Update outer/inner flops */
2252 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*333);